74 files changed, 30664 insertions, 0 deletions
diff --git a/lib/Makefile.am b/lib/Makefile.am
new file mode 100644
index 0000000..89ce261
--- /dev/null
+++ b/lib/Makefile.am
@@ -0,0 +1,173 @@
+# Preprocessor search path and package-supplied compile flags for all of lib/.
+AM_CPPFLAGS = -I$(top_srcdir)/include
+AM_CFLAGS = $(OGG_CFLAGS) $(CAIRO_CFLAGS)
+# Files shipped in the tarball whether or not this configuration compiles them.
+EXTRA_DIST = \
+	cpu.c \
+	encoder_disabled.c \
+	x86/mmxencfrag.c \
+	x86/mmxfdct.c \
+	x86/mmxfrag.c \
+	x86/mmxfrag.h \
+	x86/mmxidct.c \
+	x86/mmxloop.h \
+	x86/mmxstate.c \
+	x86/sse2fdct.c \
+	x86/x86enc.c \
+	x86/x86enc.h \
+	x86/x86int.h \
+	x86/x86state.c \
+	x86_vc
+lib_LTLIBRARIES = libtheoradec.la libtheoraenc.la libtheora.la
+# Encoder source lists: just encoder_disabled.c when THEORA_DISABLE_ENCODE is set.
+if THEORA_DISABLE_ENCODE
+encoder_uniq_sources = \
+	encoder_disabled.c
+# libtheoraenc is built from that same single file in this configuration.
+encoder_sources = \
+	$(encoder_uniq_sources)
+else
+encoder_uniq_x86_sources = \
+	x86/mmxencfrag.c \
+	x86/mmxfdct.c \
+	x86/x86enc.c
+# Extra encoder-only source that is added for CPU_x86_64 builds only.
+encoder_uniq_x86_64_sources = \
+	x86/sse2fdct.c
+# x86 files the encoder has in common with the decoder (same set as decoder_x86_sources).
+encoder_shared_x86_sources = \
+	x86/mmxfrag.c \
+	x86/mmxidct.c \
+	x86/mmxstate.c \
+	x86/x86state.c
+# Empty list: no x86-64-specific files are shared with the decoder here.
+encoder_shared_x86_64_sources =
+# Pick the per-architecture lists: x86-64 adds to the x86 set; non-x86 CPUs get nothing.
+if CPU_x86_64
+encoder_uniq_arch_sources = \
+ $(encoder_uniq_x86_sources) \
+ $(encoder_uniq_x86_64_sources)
+encoder_shared_arch_sources = \
+ $(encoder_shared_x86_sources) \
+ $(encoder_shared_x86_64_sources)
+else
+if CPU_x86_32
+encoder_uniq_arch_sources = $(encoder_uniq_x86_sources)
+encoder_shared_arch_sources = $(encoder_shared_x86_sources)
+else
+encoder_uniq_arch_sources =
+encoder_shared_arch_sources =
+endif
+endif
+# Files compiled only for the encoder; libtheora_la_SOURCES also pulls this list in.
+encoder_uniq_sources = \
+	analyze.c \
+	fdct.c \
+	encfrag.c \
+	encapiwrapper.c \
+	encinfo.c \
+	encode.c \
+	enquant.c \
+	huffenc.c \
+	mathops.c \
+	mcenc.c \
+	rate.c \
+	tokenize.c \
+	$(encoder_uniq_arch_sources)
+# Complete libtheoraenc list: files that also appear in decoder_sources, then the encoder-only ones.
+encoder_sources = \
+	apiwrapper.c \
+	fragment.c \
+	idct.c \
+	internal.c \
+	state.c \
+	quant.c \
+	$(encoder_shared_arch_sources) \
+	$(encoder_uniq_sources)
+
+endif
+# x86 sources for the decoder; the same list serves both CPU_x86_64 and CPU_x86_32 builds.
+decoder_x86_sources = \
+	x86/mmxidct.c \
+	x86/mmxfrag.c \
+	x86/mmxstate.c \
+	x86/x86state.c
+if CPU_x86_64
+decoder_arch_sources = $(decoder_x86_sources)
+else
+if CPU_x86_32
+decoder_arch_sources = $(decoder_x86_sources)
+else
+decoder_arch_sources =
+endif
+endif
+# Sources of libtheoradec; libtheora_la_SOURCES reuses this list and adds the encoder-only files.
+decoder_sources = \
+	apiwrapper.c \
+	bitpack.c \
+	decapiwrapper.c \
+	decinfo.c \
+	decode.c \
+	dequant.c \
+	fragment.c \
+	huffdec.c \
+	idct.c \
+	info.c \
+	internal.c \
+	quant.c \
+	state.c \
+	$(decoder_arch_sources)
+# Private headers: distributed with the sources but never installed.
+noinst_HEADERS = \
+	apiwrapper.h \
+	bitpack.h \
+	cpu.h \
+	dct.h \
+	decint.h \
+	dequant.h \
+	encint.h \
+	enquant.h \
+	huffdec.h \
+	huffenc.h \
+	huffman.h \
+	internal.h \
+	mathops.h \
+	modedec.h \
+	ocintrin.h \
+	quant.h \
+	x86/mmxfrag.h \
+	x86/mmxloop.h \
+	x86/x86enc.h \
+	x86/x86int.h
+
+# Decoder library.  Link libraries go in _LIBADD so they follow the objects on the link line.
+libtheoradec_la_SOURCES = $(decoder_sources) Version_script-dec theoradec.exp
+libtheoradec_la_LDFLAGS = \
+  -version-info @THDEC_LIB_CURRENT@:@THDEC_LIB_REVISION@:@THDEC_LIB_AGE@ \
+  @THEORADEC_LDFLAGS@
+libtheoradec_la_LIBADD = @CAIRO_LIBS@
+
+# Encoder library.  $(OGG_LIBS) goes in _LIBADD so it follows the objects on the link line.
+libtheoraenc_la_SOURCES = $(encoder_sources) Version_script-enc theoraenc.exp
+libtheoraenc_la_LDFLAGS = \
+  -version-info @THENC_LIB_CURRENT@:@THENC_LIB_REVISION@:@THENC_LIB_AGE@ \
+  @THEORAENC_LDFLAGS@
+libtheoraenc_la_LIBADD = $(OGG_LIBS)
+
+# Combined decoder+encoder library.  Link libraries live in _LIBADD, after the objects.
+libtheora_la_SOURCES = $(decoder_sources) $(encoder_uniq_sources) \
+	Version_script theora.exp
+libtheora_la_LDFLAGS = \
+  -version-info @TH_LIB_CURRENT@:@TH_LIB_REVISION@:@TH_LIB_AGE@ \
+  @THEORA_LDFLAGS@
+libtheora_la_LIBADD = @CAIRO_LIBS@ $(OGG_LIBS)
+# Convenience targets: rebuild "all" with the DEBUG or PROFILE flag set substituted by configure.
+.PHONY: debug profile
+debug:
+	$(MAKE) all CFLAGS="@DEBUG@"
+profile:
+	$(MAKE) all CFLAGS="@PROFILE@"
+
+# Construct the symbol export list (.exp) files from .def files; a suffix rule must not list prerequisites.
+.def.exp:
+	$(AWK) -f $(srcdir)/defexp.awk $< > $@
diff --git a/lib/Makefile.in b/lib/Makefile.in
new file mode 100644
index 0000000..f26ccdc
--- /dev/null
+++ b/lib/Makefile.in
@@ -0,0 +1,845 @@
+# Makefile.in generated by automake 1.6.3 from Makefile.am.
+# @configure_input@
+
+# Copyright 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002
+# Free Software Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+SHELL = @SHELL@
+
+srcdir = @srcdir@
+top_srcdir = @top_srcdir@
+VPATH = @srcdir@
+prefix = @prefix@
+exec_prefix = @exec_prefix@
+
+bindir = @bindir@
+sbindir = @sbindir@
+libexecdir = @libexecdir@
+datadir = @datadir@
+sysconfdir = @sysconfdir@
+sharedstatedir = @sharedstatedir@
+localstatedir = @localstatedir@
+libdir = @libdir@
+infodir = @infodir@
+mandir = @mandir@
+includedir = @includedir@
+oldincludedir = /usr/include
+pkgdatadir = $(datadir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+top_builddir = ..
+
+ACLOCAL = @ACLOCAL@
+AUTOCONF = @AUTOCONF@
+AUTOMAKE = @AUTOMAKE@
+AUTOHEADER = @AUTOHEADER@
+
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+INSTALL = @INSTALL@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_DATA = @INSTALL_DATA@
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = @program_transform_name@
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+host_alias = @host_alias@
+host_triplet = @host@
+
+EXEEXT = @EXEEXT@
+OBJEXT = @OBJEXT@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+ACLOCAL_AMFLAGS = @ACLOCAL_AMFLAGS@
+AMTAR = @AMTAR@
+AR = @AR@
+ARGZ_H = @ARGZ_H@
+AS = @AS@
+AWK = @AWK@
+BUILDABLE_EXAMPLES = @BUILDABLE_EXAMPLES@
+CAIRO_CFLAGS = @CAIRO_CFLAGS@
+CAIRO_LIBS = @CAIRO_LIBS@
+CC = @CC@
+CPP = @CPP@
+CXX = @CXX@
+CXXCPP = @CXXCPP@
+DEBUG = @DEBUG@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+F77 = @F77@
+GCJ = @GCJ@
+GCJFLAGS = @GCJFLAGS@
+GETOPT_OBJS = @GETOPT_OBJS@
+GREP = @GREP@
+HAVE_BIBTEX = @HAVE_BIBTEX@
+HAVE_DOXYGEN = @HAVE_DOXYGEN@
+HAVE_PDFLATEX = @HAVE_PDFLATEX@
+HAVE_PKG_CONFIG = @HAVE_PKG_CONFIG@
+HAVE_TRANSFIG = @HAVE_TRANSFIG@
+HAVE_VALGRIND = @HAVE_VALGRIND@
+INCLTDL = @INCLTDL@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LIBADD_DL = @LIBADD_DL@
+LIBADD_DLD_LINK = @LIBADD_DLD_LINK@
+LIBADD_DLOPEN = @LIBADD_DLOPEN@
+LIBADD_SHL_LOAD = @LIBADD_SHL_LOAD@
+LIBLTDL = @LIBLTDL@
+LIBM = @LIBM@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTDLDEPS = @LTDLDEPS@
+LTDLINCL = @LTDLINCL@
+LTDLOPEN = @LTDLOPEN@
+LT_CONFIG_H = @LT_CONFIG_H@
+LT_DLLOADERS = @LT_DLLOADERS@
+LT_DLPREOPEN = @LT_DLPREOPEN@
+MAINT = @MAINT@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OGG_CFLAGS = @OGG_CFLAGS@
+OGG_LIBS = @OGG_LIBS@
+OSS_LIBS = @OSS_LIBS@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PKG_CONFIG = @PKG_CONFIG@
+PNG_CFLAGS = @PNG_CFLAGS@
+PNG_LIBS = @PNG_LIBS@
+PROFILE = @PROFILE@
+RANLIB = @RANLIB@
+RC = @RC@
+SDL_CFLAGS = @SDL_CFLAGS@
+SDL_CONFIG = @SDL_CONFIG@
+SDL_LIBS = @SDL_LIBS@
+SED = @SED@
+STRIP = @STRIP@
+THDEC_LIB_AGE = @THDEC_LIB_AGE@
+THDEC_LIB_CURRENT = @THDEC_LIB_CURRENT@
+THDEC_LIB_REVISION = @THDEC_LIB_REVISION@
+THENC_LIB_AGE = @THENC_LIB_AGE@
+THENC_LIB_CURRENT = @THENC_LIB_CURRENT@
+THENC_LIB_REVISION = @THENC_LIB_REVISION@
+THEORADEC_LDFLAGS = @THEORADEC_LDFLAGS@
+THEORAENC_LDFLAGS = @THEORAENC_LDFLAGS@
+THEORA_LDFLAGS = @THEORA_LDFLAGS@
+TH_LIB_AGE = @TH_LIB_AGE@
+TH_LIB_CURRENT = @TH_LIB_CURRENT@
+TH_LIB_REVISION = @TH_LIB_REVISION@
+VALGRIND_ENVIRONMENT = @VALGRIND_ENVIRONMENT@
+VERSION = @VERSION@
+VORBISENC_LIBS = @VORBISENC_LIBS@
+VORBISFILE_LIBS = @VORBISFILE_LIBS@
+VORBIS_CFLAGS = @VORBIS_CFLAGS@
+VORBIS_LIBS = @VORBIS_LIBS@
+am__include = @am__include@
+am__quote = @am__quote@
+install_sh = @install_sh@
+lt_ECHO = @lt_ECHO@
+ltdl_LIBOBJS = @ltdl_LIBOBJS@
+ltdl_LTLIBOBJS = @ltdl_LTLIBOBJS@
+sys_symbol_underscore = @sys_symbol_underscore@
+INCLUDES = -I$(top_srcdir)/include
+AM_CFLAGS = $(OGG_CFLAGS) $(CAIRO_CFLAGS)
+
+EXTRA_DIST = \
+	cpu.c \
+	encoder_disabled.c \
+	x86/mmxencfrag.c \
+	x86/mmxfdct.c \
+	x86/sse2fdct.c \
+	x86/x86enc.c \
+	x86/x86enc.h \
+	x86/mmxfrag.c \
+	x86/mmxfrag.h \
+	x86/mmxidct.c \
+	x86/mmxloop.h \
+	x86/mmxstate.c \
+	x86/x86int.h \
+	x86/x86state.c \
+	x86_vc
+
+
+lib_LTLIBRARIES = libtheoradec.la libtheoraenc.la libtheora.la
+
+@THEORA_DISABLE_ENCODE_TRUE@encoder_uniq_sources = \
+@THEORA_DISABLE_ENCODE_TRUE@	encoder_disabled.c
+
+@THEORA_DISABLE_ENCODE_FALSE@encoder_uniq_sources = \
+@THEORA_DISABLE_ENCODE_FALSE@	analyze.c \
+@THEORA_DISABLE_ENCODE_FALSE@	fdct.c \
+@THEORA_DISABLE_ENCODE_FALSE@	encfrag.c \
+@THEORA_DISABLE_ENCODE_FALSE@	encapiwrapper.c \
+@THEORA_DISABLE_ENCODE_FALSE@	encinfo.c \
+@THEORA_DISABLE_ENCODE_FALSE@	encode.c \
+@THEORA_DISABLE_ENCODE_FALSE@	enquant.c \
+@THEORA_DISABLE_ENCODE_FALSE@	huffenc.c \
+@THEORA_DISABLE_ENCODE_FALSE@	mathops.c \
+@THEORA_DISABLE_ENCODE_FALSE@	mcenc.c \
+@THEORA_DISABLE_ENCODE_FALSE@	rate.c \
+@THEORA_DISABLE_ENCODE_FALSE@	tokenize.c \
+@THEORA_DISABLE_ENCODE_FALSE@	$(encoder_uniq_arch_sources)
+
+
+@THEORA_DISABLE_ENCODE_TRUE@encoder_sources = \
+@THEORA_DISABLE_ENCODE_TRUE@	$(encoder_uniq_sources)
+
+@THEORA_DISABLE_ENCODE_FALSE@encoder_sources = \
+@THEORA_DISABLE_ENCODE_FALSE@	apiwrapper.c \
+@THEORA_DISABLE_ENCODE_FALSE@	fragment.c \
+@THEORA_DISABLE_ENCODE_FALSE@	idct.c \
+@THEORA_DISABLE_ENCODE_FALSE@	internal.c \
+@THEORA_DISABLE_ENCODE_FALSE@	state.c \
+@THEORA_DISABLE_ENCODE_FALSE@	quant.c \
+@THEORA_DISABLE_ENCODE_FALSE@	$(encoder_shared_arch_sources) \
+@THEORA_DISABLE_ENCODE_FALSE@	$(encoder_uniq_sources)
+
+@THEORA_DISABLE_ENCODE_FALSE@encoder_uniq_x86_sources = \
+@THEORA_DISABLE_ENCODE_FALSE@	x86/mmxencfrag.c \
+@THEORA_DISABLE_ENCODE_FALSE@	x86/mmxfdct.c \
+@THEORA_DISABLE_ENCODE_FALSE@	x86/x86enc.c
+
+
+@THEORA_DISABLE_ENCODE_FALSE@encoder_uniq_x86_64_sources = \
+@THEORA_DISABLE_ENCODE_FALSE@	x86/sse2fdct.c
+
+
+@THEORA_DISABLE_ENCODE_FALSE@encoder_shared_x86_sources = \
+@THEORA_DISABLE_ENCODE_FALSE@	x86/mmxfrag.c \
+@THEORA_DISABLE_ENCODE_FALSE@	x86/mmxidct.c \
+@THEORA_DISABLE_ENCODE_FALSE@	x86/mmxstate.c \
+@THEORA_DISABLE_ENCODE_FALSE@	x86/x86state.c
+
+
+@THEORA_DISABLE_ENCODE_FALSE@encoder_shared_x86_64_sources = 
+
+@CPU_x86_32_FALSE@@CPU_x86_64_FALSE@@THEORA_DISABLE_ENCODE_FALSE@encoder_uniq_arch_sources = 
+@CPU_x86_32_TRUE@@CPU_x86_64_FALSE@@THEORA_DISABLE_ENCODE_FALSE@encoder_uniq_arch_sources = $(encoder_uniq_x86_sources)
+@CPU_x86_64_TRUE@@THEORA_DISABLE_ENCODE_FALSE@encoder_uniq_arch_sources = \
+@CPU_x86_64_TRUE@@THEORA_DISABLE_ENCODE_FALSE@ $(encoder_uniq_x86_sources) \
+@CPU_x86_64_TRUE@@THEORA_DISABLE_ENCODE_FALSE@ $(encoder_uniq_x86_64_sources)
+
+@CPU_x86_32_FALSE@@CPU_x86_64_FALSE@@THEORA_DISABLE_ENCODE_FALSE@encoder_shared_arch_sources = 
+@CPU_x86_32_TRUE@@CPU_x86_64_FALSE@@THEORA_DISABLE_ENCODE_FALSE@encoder_shared_arch_sources = $(encoder_shared_x86_sources)
+@CPU_x86_64_TRUE@@THEORA_DISABLE_ENCODE_FALSE@encoder_shared_arch_sources = \
+@CPU_x86_64_TRUE@@THEORA_DISABLE_ENCODE_FALSE@ $(encoder_shared_x86_sources) \
+@CPU_x86_64_TRUE@@THEORA_DISABLE_ENCODE_FALSE@ $(encoder_shared_x86_64_sources)
+
+
+decoder_x86_sources = \
+	x86/mmxidct.c \
+	x86/mmxfrag.c \
+	x86/mmxstate.c \
+	x86/x86state.c
+
+@CPU_x86_32_FALSE@@CPU_x86_64_FALSE@decoder_arch_sources = 
+@CPU_x86_32_TRUE@@CPU_x86_64_FALSE@decoder_arch_sources = $(decoder_x86_sources)
+@CPU_x86_64_TRUE@decoder_arch_sources = $(decoder_x86_sources)
+
+decoder_sources = \
+	apiwrapper.c \
+	bitpack.c \
+	decapiwrapper.c \
+	decinfo.c \
+	decode.c \
+	dequant.c \
+	fragment.c \
+	huffdec.c \
+	idct.c \
+	info.c \
+	internal.c \
+	quant.c \
+	state.c \
+	$(decoder_arch_sources)
+
+
+noinst_HEADERS = \
+	cpu.h \
+	internal.h \
+	encint.h \
+	enquant.h \
+	huffenc.h \
+	mathops.h \
+	modedec.h \
+	x86/x86enc.h \
+	apiwrapper.h \
+	bitpack.h \
+	dct.h \
+	decint.h \
+	dequant.h \
+	huffdec.h \
+	huffman.h \
+	ocintrin.h \
+	quant.h \
+	x86/mmxfrag.h \
+	x86/mmxloop.h \
+	x86/x86int.h
+
+
+libtheoradec_la_SOURCES = \
+	$(decoder_sources) \
+	Version_script-dec theoradec.exp
+
+libtheoradec_la_LDFLAGS = \
+  -version-info @THDEC_LIB_CURRENT@:@THDEC_LIB_REVISION@:@THDEC_LIB_AGE@ \
+  @THEORADEC_LDFLAGS@ @CAIRO_LIBS@
+
+
+libtheoraenc_la_SOURCES = \
+	$(encoder_sources) \
+	Version_script-enc theoraenc.exp
+
+libtheoraenc_la_LDFLAGS = \
+  -version-info @THENC_LIB_CURRENT@:@THENC_LIB_REVISION@:@THENC_LIB_AGE@ \
+  @THEORAENC_LDFLAGS@ $(OGG_LIBS)
+
+
+libtheora_la_SOURCES = \
+	$(decoder_sources) \
+	$(encoder_uniq_sources) \
+	Version_script theora.exp
+
+libtheora_la_LDFLAGS = \
+  -version-info @TH_LIB_CURRENT@:@TH_LIB_REVISION@:@TH_LIB_AGE@ \
+  @THEORA_LDFLAGS@ @CAIRO_LIBS@ $(OGG_LIBS)
+
+subdir = lib
+mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+LTLIBRARIES = $(lib_LTLIBRARIES)
+
+libtheora_la_LIBADD =
+am__objects_1 = mmxidct.lo mmxfrag.lo mmxstate.lo x86state.lo
+@CPU_x86_32_FALSE@@CPU_x86_64_FALSE@am__objects_2 =
+@CPU_x86_32_TRUE@@CPU_x86_64_FALSE@am__objects_2 = $(am__objects_1)
+@CPU_x86_64_TRUE@am__objects_2 = $(am__objects_1)
+am__objects_3 = apiwrapper.lo bitpack.lo decapiwrapper.lo decinfo.lo \
+	decode.lo dequant.lo fragment.lo huffdec.lo idct.lo info.lo \
+	internal.lo quant.lo state.lo $(am__objects_2)
+@THEORA_DISABLE_ENCODE_FALSE@am__objects_4 = mmxencfrag.lo mmxfdct.lo \
+@THEORA_DISABLE_ENCODE_FALSE@	x86enc.lo
+@THEORA_DISABLE_ENCODE_FALSE@am__objects_5 = sse2fdct.lo
+@CPU_x86_32_FALSE@@CPU_x86_64_FALSE@@THEORA_DISABLE_ENCODE_FALSE@am__objects_6 =
+@CPU_x86_32_TRUE@@CPU_x86_64_FALSE@@THEORA_DISABLE_ENCODE_FALSE@am__objects_6 = \
+@CPU_x86_32_TRUE@@CPU_x86_64_FALSE@@THEORA_DISABLE_ENCODE_FALSE@	$(am__objects_4)
+@CPU_x86_64_TRUE@@THEORA_DISABLE_ENCODE_FALSE@am__objects_6 = \
+@CPU_x86_64_TRUE@@THEORA_DISABLE_ENCODE_FALSE@	$(am__objects_4) \
+@CPU_x86_64_TRUE@@THEORA_DISABLE_ENCODE_FALSE@	$(am__objects_5)
+@THEORA_DISABLE_ENCODE_TRUE@am__objects_7 = encoder_disabled.lo
+@THEORA_DISABLE_ENCODE_FALSE@am__objects_7 = analyze.lo fdct.lo \
+@THEORA_DISABLE_ENCODE_FALSE@	encfrag.lo encapiwrapper.lo \
+@THEORA_DISABLE_ENCODE_FALSE@	encinfo.lo encode.lo enquant.lo \
+@THEORA_DISABLE_ENCODE_FALSE@	huffenc.lo mathops.lo mcenc.lo \
+@THEORA_DISABLE_ENCODE_FALSE@	rate.lo tokenize.lo \
+@THEORA_DISABLE_ENCODE_FALSE@	$(am__objects_6)
+am_libtheora_la_OBJECTS = $(am__objects_3) $(am__objects_7)
+libtheora_la_OBJECTS = $(am_libtheora_la_OBJECTS)
+libtheoradec_la_LIBADD =
+am_libtheoradec_la_OBJECTS = $(am__objects_3)
+libtheoradec_la_OBJECTS = $(am_libtheoradec_la_OBJECTS)
+libtheoraenc_la_LIBADD =
+@THEORA_DISABLE_ENCODE_FALSE@am__objects_8 = mmxfrag.lo mmxidct.lo \
+@THEORA_DISABLE_ENCODE_FALSE@	mmxstate.lo x86state.lo
+@THEORA_DISABLE_ENCODE_FALSE@am__objects_9 =
+@CPU_x86_32_FALSE@@CPU_x86_64_FALSE@@THEORA_DISABLE_ENCODE_FALSE@am__objects_10 =
+@CPU_x86_32_TRUE@@CPU_x86_64_FALSE@@THEORA_DISABLE_ENCODE_FALSE@am__objects_10 = \
+@CPU_x86_32_TRUE@@CPU_x86_64_FALSE@@THEORA_DISABLE_ENCODE_FALSE@	$(am__objects_8)
+@CPU_x86_64_TRUE@@THEORA_DISABLE_ENCODE_FALSE@am__objects_10 = \
+@CPU_x86_64_TRUE@@THEORA_DISABLE_ENCODE_FALSE@	$(am__objects_8) \
+@CPU_x86_64_TRUE@@THEORA_DISABLE_ENCODE_FALSE@	$(am__objects_9)
+@THEORA_DISABLE_ENCODE_TRUE@am__objects_11 = $(am__objects_7)
+@THEORA_DISABLE_ENCODE_FALSE@am__objects_11 = apiwrapper.lo fragment.lo \
+@THEORA_DISABLE_ENCODE_FALSE@	idct.lo internal.lo state.lo \
+@THEORA_DISABLE_ENCODE_FALSE@	quant.lo $(am__objects_10) \
+@THEORA_DISABLE_ENCODE_FALSE@	$(am__objects_7)
+am_libtheoraenc_la_OBJECTS = $(am__objects_11)
+libtheoraenc_la_OBJECTS = $(am_libtheoraenc_la_OBJECTS)
+
+DEFS = @DEFS@
+DEFAULT_INCLUDES =  -I. -I$(srcdir) -I$(top_builddir)
+CPPFLAGS = @CPPFLAGS@
+LDFLAGS = @LDFLAGS@
+LIBS = @LIBS@
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+@AMDEP_TRUE@DEP_FILES = ./$(DEPDIR)/analyze.Plo \
+@AMDEP_TRUE@	./$(DEPDIR)/apiwrapper.Plo ./$(DEPDIR)/bitpack.Plo \
+@AMDEP_TRUE@	./$(DEPDIR)/decapiwrapper.Plo \
+@AMDEP_TRUE@	./$(DEPDIR)/decinfo.Plo ./$(DEPDIR)/decode.Plo \
+@AMDEP_TRUE@	./$(DEPDIR)/dequant.Plo \
+@AMDEP_TRUE@	./$(DEPDIR)/encapiwrapper.Plo \
+@AMDEP_TRUE@	./$(DEPDIR)/encfrag.Plo ./$(DEPDIR)/encinfo.Plo \
+@AMDEP_TRUE@	./$(DEPDIR)/encode.Plo \
+@AMDEP_TRUE@	./$(DEPDIR)/encoder_disabled.Plo \
+@AMDEP_TRUE@	./$(DEPDIR)/enquant.Plo ./$(DEPDIR)/fdct.Plo \
+@AMDEP_TRUE@	./$(DEPDIR)/fragment.Plo ./$(DEPDIR)/huffdec.Plo \
+@AMDEP_TRUE@	./$(DEPDIR)/huffenc.Plo ./$(DEPDIR)/idct.Plo \
+@AMDEP_TRUE@	./$(DEPDIR)/info.Plo ./$(DEPDIR)/internal.Plo \
+@AMDEP_TRUE@	./$(DEPDIR)/mathops.Plo ./$(DEPDIR)/mcenc.Plo \
+@AMDEP_TRUE@	./$(DEPDIR)/mmxencfrag.Plo ./$(DEPDIR)/mmxfdct.Plo \
+@AMDEP_TRUE@	./$(DEPDIR)/mmxfrag.Plo ./$(DEPDIR)/mmxidct.Plo \
+@AMDEP_TRUE@	./$(DEPDIR)/mmxstate.Plo ./$(DEPDIR)/quant.Plo \
+@AMDEP_TRUE@	./$(DEPDIR)/rate.Plo ./$(DEPDIR)/sse2fdct.Plo \
+@AMDEP_TRUE@	./$(DEPDIR)/state.Plo ./$(DEPDIR)/tokenize.Plo \
+@AMDEP_TRUE@	./$(DEPDIR)/x86enc.Plo ./$(DEPDIR)/x86state.Plo
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) \
+	$(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+	$(AM_LDFLAGS) $(LDFLAGS) -o $@
+CFLAGS = @CFLAGS@
+DIST_SOURCES = $(libtheora_la_SOURCES) $(libtheoradec_la_SOURCES) \
+	$(libtheoraenc_la_SOURCES)
+HEADERS = $(noinst_HEADERS)
+
+DIST_COMMON = $(noinst_HEADERS) Makefile.am Makefile.in
+SOURCES = $(libtheora_la_SOURCES) $(libtheoradec_la_SOURCES) $(libtheoraenc_la_SOURCES)
+
+all: all-am
+
+.SUFFIXES:
+.SUFFIXES: .c .def .exp .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am  $(top_srcdir)/configure.ac $(ACLOCAL_M4)
+	cd $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu  lib/Makefile
+Makefile: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.in  $(top_builddir)/config.status
+	cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)
+libLTLIBRARIES_INSTALL = $(INSTALL)
+install-libLTLIBRARIES: $(lib_LTLIBRARIES)
+	@$(NORMAL_INSTALL)
+	$(mkinstalldirs) $(DESTDIR)$(libdir)
+	@list='$(lib_LTLIBRARIES)'; for p in $$list; do \
+	  if test -f $$p; then \
+	    f="`echo $$p | sed -e 's|^.*/||'`"; \
+	    echo " $(LIBTOOL) --mode=install $(libLTLIBRARIES_INSTALL) $(INSTALL_STRIP_FLAG) $$p $(DESTDIR)$(libdir)/$$f"; \
+	    $(LIBTOOL) --mode=install $(libLTLIBRARIES_INSTALL) $(INSTALL_STRIP_FLAG) $$p $(DESTDIR)$(libdir)/$$f; \
+	  else :; fi; \
+	done
+
+uninstall-libLTLIBRARIES:
+	@$(NORMAL_UNINSTALL)
+	@list='$(lib_LTLIBRARIES)'; for p in $$list; do \
+	    p="`echo $$p | sed -e 's|^.*/||'`"; \
+	  echo " $(LIBTOOL) --mode=uninstall rm -f $(DESTDIR)$(libdir)/$$p"; \
+	  $(LIBTOOL) --mode=uninstall rm -f $(DESTDIR)$(libdir)/$$p; \
+	done
+
+# Remove the built libraries, then any so_locations file in each library's directory ("." if none).
+clean-libLTLIBRARIES:
+	-test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES)
+	@list='$(lib_LTLIBRARIES)'; for p in $$list; do \
+	  dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \
+	  test "$$dir" != "$$p" || dir=.; \
+	  echo "rm -f \"$${dir}/so_locations\""; rm -f "$${dir}/so_locations"; \
+	done
+mmxidct.lo: x86/mmxidct.c
+mmxfrag.lo: x86/mmxfrag.c
+mmxstate.lo: x86/mmxstate.c
+x86state.lo: x86/x86state.c
+mmxencfrag.lo: x86/mmxencfrag.c
+mmxfdct.lo: x86/mmxfdct.c
+x86enc.lo: x86/x86enc.c
+sse2fdct.lo: x86/sse2fdct.c
+libtheora.la: $(libtheora_la_OBJECTS) $(libtheora_la_DEPENDENCIES) 
+	$(LINK) -rpath $(libdir) $(libtheora_la_LDFLAGS) $(libtheora_la_OBJECTS) $(libtheora_la_LIBADD) $(LIBS)
+libtheoradec.la: $(libtheoradec_la_OBJECTS) $(libtheoradec_la_DEPENDENCIES) 
+	$(LINK) -rpath $(libdir) $(libtheoradec_la_LDFLAGS) $(libtheoradec_la_OBJECTS) $(libtheoradec_la_LIBADD) $(LIBS)
+libtheoraenc.la: $(libtheoraenc_la_OBJECTS) $(libtheoraenc_la_DEPENDENCIES) 
+	$(LINK) -rpath $(libdir) $(libtheoraenc_la_LDFLAGS) $(libtheoraenc_la_OBJECTS) $(libtheoraenc_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT) core *.core
+
+distclean-compile:
+	-rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/analyze.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/apiwrapper.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bitpack.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/decapiwrapper.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/decinfo.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/decode.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dequant.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/encapiwrapper.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/encfrag.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/encinfo.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/encode.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/encoder_disabled.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/enquant.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fdct.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fragment.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/huffdec.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/huffenc.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/idct.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/info.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/internal.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mathops.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mcenc.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mmxencfrag.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mmxfdct.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mmxfrag.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mmxidct.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mmxstate.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/quant.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rate.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sse2fdct.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/state.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tokenize.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/x86enc.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/x86state.Plo@am__quote@
+
+distclean-depend:
+	-rm -rf ./$(DEPDIR)
+
+.c.o:
+@AMDEP_TRUE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/$*.Po' tmpdepfile='$(DEPDIR)/$*.TPo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(COMPILE) -c `test -f '$<' || echo '$(srcdir)/'`$<
+
+.c.obj:
+@AMDEP_TRUE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/$*.Po' tmpdepfile='$(DEPDIR)/$*.TPo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(COMPILE) -c `cygpath -w $<`
+
+.c.lo:
+@AMDEP_TRUE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/$*.Plo' tmpdepfile='$(DEPDIR)/$*.TPlo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(LTCOMPILE) -c -o $@ `test -f '$<' || echo '$(srcdir)/'`$<
+
+mmxidct.o: x86/mmxidct.c
+@AMDEP_TRUE@	source='x86/mmxidct.c' object='mmxidct.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/mmxidct.Po' tmpdepfile='$(DEPDIR)/mmxidct.TPo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mmxidct.o `test -f 'x86/mmxidct.c' || echo '$(srcdir)/'`x86/mmxidct.c
+
+mmxidct.obj: x86/mmxidct.c
+@AMDEP_TRUE@	source='x86/mmxidct.c' object='mmxidct.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/mmxidct.Po' tmpdepfile='$(DEPDIR)/mmxidct.TPo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mmxidct.obj `cygpath -w x86/mmxidct.c`
+
+mmxidct.lo: x86/mmxidct.c
+@AMDEP_TRUE@	source='x86/mmxidct.c' object='mmxidct.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/mmxidct.Plo' tmpdepfile='$(DEPDIR)/mmxidct.TPlo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mmxidct.lo `test -f 'x86/mmxidct.c' || echo '$(srcdir)/'`x86/mmxidct.c
+
+mmxfrag.o: x86/mmxfrag.c
+@AMDEP_TRUE@	source='x86/mmxfrag.c' object='mmxfrag.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/mmxfrag.Po' tmpdepfile='$(DEPDIR)/mmxfrag.TPo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mmxfrag.o `test -f 'x86/mmxfrag.c' || echo '$(srcdir)/'`x86/mmxfrag.c
+
+mmxfrag.obj: x86/mmxfrag.c
+@AMDEP_TRUE@	source='x86/mmxfrag.c' object='mmxfrag.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/mmxfrag.Po' tmpdepfile='$(DEPDIR)/mmxfrag.TPo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mmxfrag.obj `cygpath -w x86/mmxfrag.c`
+
+mmxfrag.lo: x86/mmxfrag.c
+@AMDEP_TRUE@	source='x86/mmxfrag.c' object='mmxfrag.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/mmxfrag.Plo' tmpdepfile='$(DEPDIR)/mmxfrag.TPlo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mmxfrag.lo `test -f 'x86/mmxfrag.c' || echo '$(srcdir)/'`x86/mmxfrag.c
+
+mmxstate.o: x86/mmxstate.c
+@AMDEP_TRUE@	source='x86/mmxstate.c' object='mmxstate.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/mmxstate.Po' tmpdepfile='$(DEPDIR)/mmxstate.TPo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mmxstate.o `test -f 'x86/mmxstate.c' || echo '$(srcdir)/'`x86/mmxstate.c
+
+mmxstate.obj: x86/mmxstate.c
+@AMDEP_TRUE@	source='x86/mmxstate.c' object='mmxstate.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/mmxstate.Po' tmpdepfile='$(DEPDIR)/mmxstate.TPo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mmxstate.obj `cygpath -w x86/mmxstate.c`
+
+mmxstate.lo: x86/mmxstate.c
+@AMDEP_TRUE@	source='x86/mmxstate.c' object='mmxstate.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/mmxstate.Plo' tmpdepfile='$(DEPDIR)/mmxstate.TPlo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mmxstate.lo `test -f 'x86/mmxstate.c' || echo '$(srcdir)/'`x86/mmxstate.c
+
+x86state.o: x86/x86state.c
+@AMDEP_TRUE@	source='x86/x86state.c' object='x86state.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/x86state.Po' tmpdepfile='$(DEPDIR)/x86state.TPo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o x86state.o `test -f 'x86/x86state.c' || echo '$(srcdir)/'`x86/x86state.c
+
+x86state.obj: x86/x86state.c
+@AMDEP_TRUE@	source='x86/x86state.c' object='x86state.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/x86state.Po' tmpdepfile='$(DEPDIR)/x86state.TPo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o x86state.obj `cygpath -w x86/x86state.c`
+
+x86state.lo: x86/x86state.c
+@AMDEP_TRUE@	source='x86/x86state.c' object='x86state.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/x86state.Plo' tmpdepfile='$(DEPDIR)/x86state.TPlo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o x86state.lo `test -f 'x86/x86state.c' || echo '$(srcdir)/'`x86/x86state.c
+
+mmxencfrag.o: x86/mmxencfrag.c
+@AMDEP_TRUE@	source='x86/mmxencfrag.c' object='mmxencfrag.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/mmxencfrag.Po' tmpdepfile='$(DEPDIR)/mmxencfrag.TPo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mmxencfrag.o `test -f 'x86/mmxencfrag.c' || echo '$(srcdir)/'`x86/mmxencfrag.c
+
+mmxencfrag.obj: x86/mmxencfrag.c
+@AMDEP_TRUE@	source='x86/mmxencfrag.c' object='mmxencfrag.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/mmxencfrag.Po' tmpdepfile='$(DEPDIR)/mmxencfrag.TPo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mmxencfrag.obj `cygpath -w x86/mmxencfrag.c`
+
+mmxencfrag.lo: x86/mmxencfrag.c
+@AMDEP_TRUE@	source='x86/mmxencfrag.c' object='mmxencfrag.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/mmxencfrag.Plo' tmpdepfile='$(DEPDIR)/mmxencfrag.TPlo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mmxencfrag.lo `test -f 'x86/mmxencfrag.c' || echo '$(srcdir)/'`x86/mmxencfrag.c
+
+mmxfdct.o: x86/mmxfdct.c
+@AMDEP_TRUE@	source='x86/mmxfdct.c' object='mmxfdct.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/mmxfdct.Po' tmpdepfile='$(DEPDIR)/mmxfdct.TPo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mmxfdct.o `test -f 'x86/mmxfdct.c' || echo '$(srcdir)/'`x86/mmxfdct.c
+
+mmxfdct.obj: x86/mmxfdct.c
+@AMDEP_TRUE@	source='x86/mmxfdct.c' object='mmxfdct.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/mmxfdct.Po' tmpdepfile='$(DEPDIR)/mmxfdct.TPo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mmxfdct.obj `cygpath -w x86/mmxfdct.c`
+
+mmxfdct.lo: x86/mmxfdct.c
+@AMDEP_TRUE@	source='x86/mmxfdct.c' object='mmxfdct.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/mmxfdct.Plo' tmpdepfile='$(DEPDIR)/mmxfdct.TPlo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o mmxfdct.lo `test -f 'x86/mmxfdct.c' || echo '$(srcdir)/'`x86/mmxfdct.c
+
+# Build rules for x86/x86enc.c: plain object (.o), .obj-suffixed object
+# (source path passed through `cygpath -w`) and libtool object (.lo).
+# The .o and .lo recipes look for the source in the current directory first
+# and fall back to $(srcdir).
+x86enc.o: x86/x86enc.c
+@AMDEP_TRUE@	source='x86/x86enc.c' object='x86enc.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/x86enc.Po' tmpdepfile='$(DEPDIR)/x86enc.TPo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o x86enc.o `test -f 'x86/x86enc.c' || echo '$(srcdir)/'`x86/x86enc.c
+
+# NOTE(review): no $(srcdir) fallback for the source path in this rule.
+x86enc.obj: x86/x86enc.c
+@AMDEP_TRUE@	source='x86/x86enc.c' object='x86enc.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/x86enc.Po' tmpdepfile='$(DEPDIR)/x86enc.TPo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o x86enc.obj `cygpath -w x86/x86enc.c`
+
+# Same compile line as the .o rule, run through $(LIBTOOL) --mode=compile.
+x86enc.lo: x86/x86enc.c
+@AMDEP_TRUE@	source='x86/x86enc.c' object='x86enc.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/x86enc.Plo' tmpdepfile='$(DEPDIR)/x86enc.TPlo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o x86enc.lo `test -f 'x86/x86enc.c' || echo '$(srcdir)/'`x86/x86enc.c
+
+# Build rules for x86/sse2fdct.c: plain object (.o), .obj-suffixed object
+# (source path passed through `cygpath -w`) and libtool object (.lo).
+# The .o and .lo recipes look for the source in the current directory first
+# and fall back to $(srcdir).
+sse2fdct.o: x86/sse2fdct.c
+@AMDEP_TRUE@	source='x86/sse2fdct.c' object='sse2fdct.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/sse2fdct.Po' tmpdepfile='$(DEPDIR)/sse2fdct.TPo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o sse2fdct.o `test -f 'x86/sse2fdct.c' || echo '$(srcdir)/'`x86/sse2fdct.c
+
+# NOTE(review): no $(srcdir) fallback for the source path in this rule.
+sse2fdct.obj: x86/sse2fdct.c
+@AMDEP_TRUE@	source='x86/sse2fdct.c' object='sse2fdct.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/sse2fdct.Po' tmpdepfile='$(DEPDIR)/sse2fdct.TPo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o sse2fdct.obj `cygpath -w x86/sse2fdct.c`
+
+# Same compile line as the .o rule, run through $(LIBTOOL) --mode=compile.
+sse2fdct.lo: x86/sse2fdct.c
+@AMDEP_TRUE@	source='x86/sse2fdct.c' object='sse2fdct.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@	depfile='$(DEPDIR)/sse2fdct.Plo' tmpdepfile='$(DEPDIR)/sse2fdct.TPlo' @AMDEPBACKSLASH@
+@AMDEP_TRUE@	$(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+	$(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o sse2fdct.lo `test -f 'x86/sse2fdct.c' || echo '$(srcdir)/'`x86/sse2fdct.c
+# Value placed in front of $(depcomp) by the dependency-tracking recipe
+# lines; substituted by configure.
+CCDEPMODE = @CCDEPMODE@
+
+# Remove libtool objects from this directory.
+mostlyclean-libtool:
+	-rm -f *.lo
+
+# Remove the .libs and _libs directories.
+clean-libtool:
+	-rm -rf .libs _libs
+
+# Remove a file named libtool from this directory.
+distclean-libtool:
+	-rm -f libtool
+# Empty rule: this directory has no recipe for uninstalling info files.
+uninstall-info-am:
+
+# Program and flags used by the TAGS rule.
+ETAGS = etags
+ETAGSFLAGS =
+
+tags: TAGS
+
+# Build an ID database with mkid from the sources, headers, LISP and
+# TAGS_FILES lists.
+# Each entry is used as-is when it exists in the current directory and is
+# prefixed with $(srcdir) otherwise; the awk script drops duplicate paths.
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '    { files[$$0] = 1; } \
+	       END { for (i in files) print i; }'`; \
+	mkid -fID $$unique
+
+# Build an etags TAGS file from the same de-duplicated file list.
+# $(ETAGS) is skipped entirely when there is nothing to index.
+TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
+		$(TAGS_FILES) $(LISP)
+	tags=; \
+	here=`pwd`; \
+	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
+	unique=`for i in $$list; do \
+	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+	  done | \
+	  $(AWK) '    { files[$$0] = 1; } \
+	       END { for (i in files) print i; }'`; \
+	test -z "$(ETAGS_ARGS)$$tags$$unique" \
+	  || $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	     $$tags $$unique
+
+# Run gtags from $(top_srcdir), passing it the absolute path of
+# $(top_builddir).
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && cd $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) $$here
+
+# Remove every tag database the rules above can produce.
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH
+# Everything from this directory that goes into the distribution.
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+
+top_distdir = ..
+distdir = $(top_distdir)/$(PACKAGE)-$(VERSION)
+
+# Copy every entry of $(DISTFILES) into $(distdir).
+# Each entry is taken from the current directory when it exists there and
+# from $(srcdir) otherwise, and any leading directory component is created
+# under $(distdir) first.
+# Directories are copied recursively; when a directory exists both here and
+# in $(srcdir), the $(srcdir) copy is made first and the local one is copied
+# over it.
+# A regular file is only copied when it is not already present in
+# $(distdir).
+distdir: $(DISTFILES)
+	$(mkinstalldirs) $(distdir)/x86
+	@list='$(DISTFILES)'; for file in $$list; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \
+	  if test "$$dir" != "$$file" && test "$$dir" != "."; then \
+	    dir="/$$dir"; \
+	    $(mkinstalldirs) "$(distdir)$$dir"; \
+	  else \
+	    dir=''; \
+	  fi; \
+	  if test -d $$d/$$file; then \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \
+	    fi; \
+	    cp -pR $$d/$$file $(distdir)$$dir || exit 1; \
+	  else \
+	    test -f $(distdir)/$$file \
+	    || cp -p $$d/$$file $(distdir)/$$file \
+	    || exit 1; \
+	  fi; \
+	done
+# Generic entry points; each public target forwards to its -am counterpart.
+check-am: all-am
+check: check-am
+all-am: Makefile $(LTLIBRARIES) $(HEADERS)
+
+# Create the library installation directory, honouring $(DESTDIR).
+installdirs:
+	$(mkinstalldirs) $(DESTDIR)$(libdir)
+
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+# Build everything, then run the exec and data install steps in a sub-make.
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+# Re-run `install` with INSTALL_PROGRAM replaced by the stripping variant;
+# STRIPPROG is only passed along when $(STRIP) is non-empty.
+install-strip:
+	$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	  INSTALL_STRIP_FLAG=-s \
+	  `test -z '$(STRIP)' || \
+	    echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install
+# Cleaning targets, from least to most thorough: mostlyclean, clean,
+# distclean, maintainer-clean.
+# Each level depends on the one before it, as the prerequisite lists below
+# show.
+mostlyclean-generic:
+
+clean-generic:
+
+# Remove the generated Makefile and any files listed in CONFIG_CLEAN_FILES.
+distclean-generic:
+	-rm -f Makefile $(CONFIG_CLEAN_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libLTLIBRARIES clean-libtool \
+	mostlyclean-am
+
+distclean: distclean-am
+
+distclean-am: clean-am distclean-compile distclean-depend \
+	distclean-generic distclean-libtool distclean-tags
+
+# The documentation, data-install, man-install and installcheck targets
+# below have no recipe in this directory.
+dvi: dvi-am
+
+dvi-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+# Installing executables/libraries here means installing the libtool
+# libraries.
+install-exec-am: install-libLTLIBRARIES
+
+install-info: install-info-am
+
+install-man:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+uninstall-am: uninstall-info-am uninstall-libLTLIBRARIES
+
+# Targets that name actions rather than files.
+.PHONY: GTAGS all all-am check check-am clean clean-generic \
+	clean-libLTLIBRARIES clean-libtool distclean distclean-compile \
+	distclean-depend distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am info info-am install \
+	install-am install-data install-data-am install-exec \
+	install-exec-am install-info install-info-am \
+	install-libLTLIBRARIES install-man install-strip installcheck \
+	installcheck-am installdirs maintainer-clean \
+	maintainer-clean-generic mostlyclean mostlyclean-compile \
+	mostlyclean-generic mostlyclean-libtool tags uninstall \
+	uninstall-am uninstall-info-am uninstall-libLTLIBRARIES
+
+
+# Convenience targets: rebuild `all` with CFLAGS replaced by the
+# configure-substituted debug or profiling flags.
+# They name actions, not files, so they are declared phony; otherwise a file
+# called "debug" or "profile" in this directory would make them no-ops.
+.PHONY: debug profile
+
+debug:
+	$(MAKE) all CFLAGS="@DEBUG@"
+
+profile:
+	$(MAKE) all CFLAGS="@PROFILE@"
+
+# Construct various symbol export list files.
+# A suffix rule cannot carry prerequisites: written as
+# ".def.exp : defexp.awk", make treats the line as an ordinary rule for a
+# file literally named ".def.exp", so no .exp file was ever derived from a
+# .def file.
+# The script is therefore referenced from the recipe only (edits to it no
+# longer trigger a rebuild), and the two suffixes are registered explicitly.
+# NOTE(review): this file is generated; mirror the change in Makefile.am.
+.SUFFIXES: .def .exp
+.def.exp:
+	$(AWK) -f $(srcdir)/defexp.awk $< > $@
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/lib/Version_script b/lib/Version_script
new file mode 100644
index 0000000..2ecb5e4
--- /dev/null
+++ b/lib/Version_script
@@ -0,0 +1,53 @@
+#
+# Export file for libtheora
+#
+# Only the symbols listed in the global section will be callable from
+# applications linking to the libraries.
+#
+
+# We use something that looks like a versioned so filename here 
+# to define the old API because of a historical confusion. This
+# label must be kept to maintain ABI compatibility.
+
+libtheora.so.1.0
+{
+	global:
+		theora_version_string;
+		theora_version_number;
+
+		theora_encode_init;
+		theora_encode_YUVin;
+		theora_encode_packetout;
+		theora_encode_header;
+		theora_encode_comment;
+		theora_encode_tables;
+
+		theora_decode_header;
+		theora_decode_init;
+		theora_decode_packetin;
+		theora_decode_YUVout;
+
+		theora_control;
+
+		theora_packet_isheader;
+		theora_packet_iskeyframe;
+
+		theora_granule_shift;
+		theora_granule_frame;
+		theora_granule_time;
+
+		theora_info_init;
+		theora_info_clear;
+
+		theora_clear;
+
+		theora_comment_init;
+		theora_comment_add;
+		theora_comment_add_tag;
+		theora_comment_query;
+		theora_comment_query_count;
+		theora_comment_clear;
+
+	local:
+		*;
+};
diff --git a/lib/Version_script-dec b/lib/Version_script-dec
new file mode 100644
index 0000000..cab3683
--- /dev/null
+++ b/lib/Version_script-dec
@@ -0,0 +1,82 @@
+#
+# Export file for libtheoradec
+#
+# Only the symbols listed in the global section will be callable from
+# applications linking to the libraries.
+#
+
+# The 1.x API
+libtheoradec_1.0
+{
+	global:
+		th_version_string;
+		th_version_number;
+
+		th_decode_headerin;
+		th_decode_alloc;
+		th_setup_free;
+		th_decode_ctl;
+		th_decode_packetin;
+		th_decode_ycbcr_out;
+		th_decode_free;
+
+		th_packet_isheader;
+		th_packet_iskeyframe;
+
+		th_granule_frame;
+		th_granule_time;
+
+		th_info_init;
+		th_info_clear;
+
+		th_comment_init;
+		th_comment_add;
+		th_comment_add_tag;
+		th_comment_query;
+		th_comment_query_count;
+		th_comment_clear;
+
+	local:
+		*;
+};
+
+# The deprecated legacy api from the libtheora alpha releases.
+# We use something that looks like a versioned so filename here 
+# to define the old API because of a historical confusion. This
+# label must be kept to maintain ABI compatibility.
+
+libtheora.so.1.0
+{
+	global:
+		theora_version_string;
+		theora_version_number;
+
+		theora_decode_header;
+		theora_decode_init;
+		theora_decode_packetin;
+		theora_decode_YUVout;
+
+		theora_control;
+
+		theora_packet_isheader;
+		theora_packet_iskeyframe;
+
+		theora_granule_shift;
+		theora_granule_frame;
+		theora_granule_time;
+
+		theora_info_init;
+		theora_info_clear;
+
+		theora_clear;
+
+		theora_comment_init;
+		theora_comment_add;
+		theora_comment_add_tag;
+		theora_comment_query;
+		theora_comment_query_count;
+		theora_comment_clear;
+
+	local:
+		*;
+};
diff --git a/lib/Version_script-enc b/lib/Version_script-enc
new file mode 100644
index 0000000..37699ed
--- /dev/null
+++ b/lib/Version_script-enc
@@ -0,0 +1,43 @@
+#
+# Export file for libtheoraenc
+#
+# Only the symbols listed in the global section will be callable from
+# applications linking to the libraries.
+#
+
+# The 1.x encoder API
+libtheoraenc_1.0
+{
+	global:
+		th_encode_alloc;
+		th_encode_ctl;
+		th_encode_flushheader;
+		th_encode_ycbcr_in;
+		th_encode_packetout;
+		th_encode_free;
+
+		TH_VP31_QUANT_INFO;
+		TH_VP31_HUFF_CODES;
+
+	local:
+		*;
+};
+
+# The encoder portion of the deprecated alpha release api.
+# We use something that looks like a versioned so filename here 
+# to define the old API because of a historical confusion. This
+# label must be kept to maintain ABI compatibility.
+
+libtheora.so.1.0
+{
+	global:
+		theora_encode_init;
+		theora_encode_YUVin;
+		theora_encode_packetout;
+		theora_encode_header;
+		theora_encode_comment;
+		theora_encode_tables;
+
+	local:
+		*;
+};
diff --git a/lib/analyze.c b/lib/analyze.c
new file mode 100644
index 0000000..af01b60
--- /dev/null
+++ b/lib/analyze.c
@@ -0,0 +1,2709 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function: mode selection code
+  last mod: $Id$
+
+ ********************************************************************/
+#include <limits.h>
+#include <string.h>
+#include "encint.h"
+#include "modedec.h"
+
+
+
+typedef struct oc_fr_state           oc_fr_state;
+typedef struct oc_qii_state          oc_qii_state;
+typedef struct oc_enc_pipeline_state oc_enc_pipeline_state;
+typedef struct oc_rd_metric          oc_rd_metric;
+typedef struct oc_mode_choice        oc_mode_choice;
+
+
+
+/*There are 8 possible schemes used to encode macro block modes.
+  Schemes 0-6 use a maximally-skewed Huffman code to code each of the modes.
+  The same set of Huffman codes is used for each of these 7 schemes, but the
+   mode assigned to each codeword varies.
+  Scheme 0 writes a custom mapping from codeword to MB mode to the bitstream,
+   while schemes 1-6 have a fixed mapping.
+  Scheme 7 just encodes each mode directly in 3 bits.*/
+
+/*The mode orderings for the various mode coding schemes.
+  Scheme 0 uses a custom alphabet, which is not stored in this table.
+  This is the inverse of the equivalent table OC_MODE_ALPHABETS in the
+   decoder.
+  Row si-1 belongs to scheme si and is indexed by macro block mode; the
+   entry is that mode's rank (codeword index) within the scheme.
+  The comment above each row lists the modes in rank order, i.e., the
+   alphabet the row is the inverse of.*/
+static const unsigned char OC_MODE_RANKS[7][OC_NMODES]={
+  /*Last MV dominates.*/
+  /*L P M N I G GM 4*/
+  {3,4,2,0,1,5,6,7},
+  /*L P N M I G GM 4*/
+  {2,4,3,0,1,5,6,7},
+  /*L M P N I G GM 4*/
+  {3,4,1,0,2,5,6,7},
+  /*L M N P I G GM 4*/
+  {2,4,1,0,3,5,6,7},
+  /*No MV dominates.*/
+  /*N L P M I G GM 4*/
+  {0,4,3,1,2,5,6,7},
+  /*N G L P M I GM 4*/
+  {0,5,4,2,3,1,6,7},
+  /*Default ordering.*/
+  /*N I M L P G GM 4*/
+  {0,1,2,3,4,5,6,7}
+};
+
+
+
+/*Set up the per-scheme rank tables of the mode scheme chooser.
+  Schemes 1 through 7 point at the fixed rows of OC_MODE_RANKS, while scheme
+   0 points at the chooser's own adaptive rank table.
+  This only has to be done once per encoder.*/
+void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser){
+  int ri;
+  /*Row ri of the fixed table belongs to scheme ri+1.*/
+  for(ri=0;ri<7;ri++){
+    _chooser->mode_ranks[ri+1]=OC_MODE_RANKS[ri];
+  }
+  _chooser->mode_ranks[0]=_chooser->scheme0_ranks;
+}
+
+/*Return the mode scheme chooser to its start-of-frame state.
+  This has to be called before every frame, the first one included.*/
+static void oc_mode_scheme_chooser_reset(oc_mode_scheme_chooser *_chooser){
+  int i;
+  for(i=0;i<8;i++){
+    /*The custom alphabet starts out as the identity mapping.*/
+    _chooser->scheme0_ranks[i]=i;
+    _chooser->scheme0_list[i]=i;
+    /*Scheme 7 should always start first, and scheme 0 should always start
+       last.*/
+    _chooser->scheme_list[i]=7-i;
+  }
+  /*Schemes 1-7 start at zero bits; scheme 0 starts with the 24 bits needed
+     to store its mode list.*/
+  memset(_chooser->scheme_bits+1,0,7*sizeof(*_chooser->scheme_bits));
+  _chooser->scheme_bits[0]=24;
+  memset(_chooser->mode_counts,0,OC_NMODES*sizeof(*_chooser->mode_counts));
+}
+
+
+/*This is the real purpose of this data structure: not actually selecting a
+   mode scheme, but estimating the cost of coding a given mode given all the
+   modes selected so far.
+  This is done via opportunity cost: the cost is defined as the number of bits
+   required to encode all the modes selected so far including the current one
+   using the best possible scheme, minus the number of bits required to encode
+   all the modes selected so far not including the current one using the best
+   possible scheme.
+  The computational expense of doing this probably makes it overkill.
+  Just be happy we take a greedy approach instead of trying to solve the
+   global mode-selection problem (which is NP-hard).
+  The chooser itself is not modified by this function.
+  _chooser: The mode scheme chooser holding the statistics so far.
+  _mb_mode: The mode to determine the cost of.
+  Return: The number of bits required to code this mode.*/
+static int oc_mode_scheme_chooser_cost(oc_mode_scheme_chooser *_chooser,
+ int _mb_mode){
+  int scheme0;
+  int scheme1;
+  int best_bits;
+  int mode_bits;
+  int si;
+  int scheme_bits;
+  /*scheme_list is kept sorted by bit count, so entry 0 is the current best
+     scheme and entry 1 the runner-up.*/
+  scheme0=_chooser->scheme_list[0];
+  scheme1=_chooser->scheme_list[1];
+  best_bits=_chooser->scheme_bits[scheme0];
+  /*scheme+1>>3 is 1 only for scheme 7, so it selects the second row of
+     OC_MODE_BITS for that scheme and the first row for all others.*/
+  mode_bits=OC_MODE_BITS[scheme0+1>>3][_chooser->mode_ranks[scheme0][_mb_mode]];
+  /*Typical case: If the difference between the best scheme and the next best
+     is greater than 6 bits, then adding just one mode cannot change which
+     scheme we use.*/
+  if(_chooser->scheme_bits[scheme1]-best_bits>6)return mode_bits;
+  /*Otherwise, check to see if adding this mode selects a different scheme as
+     the best.*/
+  si=1;
+  best_bits+=mode_bits;
+  do{
+    /*For any scheme except 0, we can just use the bit cost of the mode's rank
+       in that scheme.*/
+    if(scheme1!=0){
+      scheme_bits=_chooser->scheme_bits[scheme1]+
+       OC_MODE_BITS[scheme1+1>>3][_chooser->mode_ranks[scheme1][_mb_mode]];
+    }
+    else{
+      int ri;
+      /*For scheme 0, incrementing the mode count could potentially change the
+         mode's rank.
+        Find the index where the mode would be moved to in the optimal list,
+         and use its bit cost instead of the one for the mode's current
+         position in the list.*/
+      /*We don't recompute scheme bits; this is computing opportunity cost, not
+         an update.*/
+      for(ri=_chooser->scheme0_ranks[_mb_mode];ri>0&&
+       _chooser->mode_counts[_mb_mode]>=
+       _chooser->mode_counts[_chooser->scheme0_list[ri-1]];ri--);
+      scheme_bits=_chooser->scheme_bits[0]+OC_MODE_BITS[0][ri];
+    }
+    if(scheme_bits<best_bits)best_bits=scheme_bits;
+    if(++si>=8)break;
+    scheme1=_chooser->scheme_list[si];
+  }
+  /*Stop as soon as the next candidate is more than 6 bits behind the current
+     best scheme.*/
+  while(_chooser->scheme_bits[scheme1]-_chooser->scheme_bits[scheme0]<=6);
+  return best_bits-_chooser->scheme_bits[scheme0];
+}
+
+/*Incrementally update the mode counts and per-scheme bit counts and re-order
+   the scheme lists once a mode has been selected.
+  _chooser: The mode scheme chooser to update.
+  _mb_mode: The mode that was chosen.*/
+static void oc_mode_scheme_chooser_update(oc_mode_scheme_chooser *_chooser,
+ int _mb_mode){
+  int ri;
+  int si;
+  _chooser->mode_counts[_mb_mode]++;
+  /*Re-order the scheme0 mode list if necessary.
+    Modes with a strictly smaller count than the chosen mode are shifted one
+     rank down until the chosen mode's new position is found.*/
+  for(ri=_chooser->scheme0_ranks[_mb_mode];ri>0;ri--){
+    int pmode;
+    pmode=_chooser->scheme0_list[ri-1];
+    if(_chooser->mode_counts[pmode]>=_chooser->mode_counts[_mb_mode])break;
+    /*Reorder the mode ranking.*/
+    _chooser->scheme0_ranks[pmode]++;
+    _chooser->scheme0_list[ri]=pmode;
+  }
+  _chooser->scheme0_ranks[_mb_mode]=ri;
+  _chooser->scheme0_list[ri]=_mb_mode;
+  /*Now add the bit cost for the mode to each scheme.
+    This must follow the re-ordering above: scheme 0 is charged according to
+     the mode's new rank.*/
+  for(si=0;si<8;si++){
+    _chooser->scheme_bits[si]+=
+     OC_MODE_BITS[si+1>>3][_chooser->mode_ranks[si][_mb_mode]];
+  }
+  /*Finally, re-order the list of schemes.
+    This is an insertion sort by ascending bit count; schemes with equal bit
+     counts keep their relative order.*/
+  for(si=1;si<8;si++){
+    int sj;
+    int scheme0;
+    int bits0;
+    sj=si;
+    scheme0=_chooser->scheme_list[si];
+    bits0=_chooser->scheme_bits[scheme0];
+    do{
+      int scheme1;
+      scheme1=_chooser->scheme_list[sj-1];
+      if(bits0>=_chooser->scheme_bits[scheme1])break;
+      _chooser->scheme_list[sj]=scheme1;
+    }
+    while(--sj>0);
+    _chooser->scheme_list[sj]=scheme0;
+  }
+}
+
+
+
+/*Look up the bit cost of coding a super block run of a given length.
+  _run_count: The desired run count; must be positive and less than 4130.
+  Return: The number of bits the run code occupies.*/
+static int oc_sb_run_bits(int _run_count){
+  int ci;
+  ci=0;
+  /*Move on to the next code class while the run is at least as long as that
+     class's minimum value.*/
+  while(OC_SB_RUN_VAL_MIN[ci+1]<=_run_count)ci++;
+  return OC_SB_RUN_CODE_NBITS[ci];
+}
+
+/*Look up the bit cost of coding a block run of a given length.
+  _run_count: The desired run count; must be positive and less than 30.
+  Return: The number of bits the run code occupies.*/
+static int oc_block_run_bits(int _run_count){
+  int ri;
+  /*The table is indexed by run length minus one.*/
+  ri=_run_count-1;
+  return OC_BLOCK_RUN_CODE_NBITS[ri];
+}
+
+
+
+/*State to track coded block flags and their bit cost.
+  The flag fields hold -1 until the corresponding run has been started.*/
+struct oc_fr_state{
+  /*The running bit cost estimate.*/
+  ptrdiff_t  bits;
+  /*The length of the current run of super block "partially coded" flags.*/
+  unsigned   sb_partial_count:16;
+  /*The length of the current run of super block "fully coded" flags.*/
+  unsigned   sb_full_count:16;
+  /*The block run length saved at the end of the last completed super block;
+     restored when a fully coded/uncoded super block rolls back its block
+     flags.*/
+  unsigned   b_coded_count_prev:8;
+  /*The length of the current run of block coded flags.*/
+  unsigned   b_coded_count:8;
+  /*The number of blocks seen so far in the current super block.*/
+  unsigned   b_count:8;
+  /*The flag value of the current super block partial run.*/
+  signed int sb_partial:2;
+  /*The flag value of the current super block full run.*/
+  signed int sb_full:2;
+  /*The block flag value saved together with b_coded_count_prev.*/
+  signed int b_coded_prev:2;
+  /*The flag value of the current block run.*/
+  signed int b_coded:2;
+};
+
+
+
+/*Put a coded block flag tracker into its initial state: no bits spent, all
+   run lengths zero, and no run started (every flag is -1).*/
+static void oc_fr_state_init(oc_fr_state *_fr){
+  /*No flag value has been seen yet for any of the runs.*/
+  _fr->b_coded=-1;
+  _fr->b_coded_prev=-1;
+  _fr->sb_full=-1;
+  _fr->sb_partial=-1;
+  /*All counters and the accumulated cost start at zero.*/
+  _fr->b_count=0;
+  _fr->b_coded_count=0;
+  _fr->b_coded_count_prev=0;
+  _fr->sb_full_count=0;
+  _fr->sb_partial_count=0;
+  _fr->bits=0;
+}
+
+
+/*Advance the flag cost tracker by one super block.
+  For each affected run, the cost of the run being extended is removed from
+   the running total and the cost of the lengthened (or newly started) run is
+   added back.
+  A run that has already reached a length of 4129 is not extended; one bit
+   is added and a new run is started.
+  _fr:         The tracker to update.
+  _sb_partial: Whether this super block is partially coded.
+  _sb_full:    For a super block that is not partially coded, the value of
+                its "fully coded" flag; ignored when _sb_partial is
+                non-zero.*/
+static void oc_fr_state_advance_sb(oc_fr_state *_fr,
+ int _sb_partial,int _sb_full){
+  ptrdiff_t bits;
+  int       sb_partial_count;
+  int       sb_full_count;
+  bits=_fr->bits;
+  /*Extend the sb_partial run, or start a new one.
+    The run length has to come from sb_partial_count, not from the
+     sb_partial flag itself, or the length of the existing run is lost.*/
+  sb_partial_count=_fr->sb_partial_count;
+  if(_fr->sb_partial==_sb_partial){
+    if(sb_partial_count>=4129){
+      bits++;
+      sb_partial_count=0;
+    }
+    else bits-=oc_sb_run_bits(sb_partial_count);
+  }
+  else sb_partial_count=0;
+  sb_partial_count++;
+  bits+=oc_sb_run_bits(sb_partial_count);
+  if(!_sb_partial){
+    /*Extend the sb_full run, or start a new one.*/
+    sb_full_count=_fr->sb_full_count;
+    if(_fr->sb_full==_sb_full){
+      if(sb_full_count>=4129){
+        bits++;
+        sb_full_count=0;
+      }
+      else bits-=oc_sb_run_bits(sb_full_count);
+    }
+    else sb_full_count=0;
+    sb_full_count++;
+    bits+=oc_sb_run_bits(sb_full_count);
+    _fr->sb_full=_sb_full;
+    _fr->sb_full_count=sb_full_count;
+  }
+  _fr->bits=bits;
+  _fr->sb_partial=_sb_partial;
+  _fr->sb_partial_count=sb_partial_count;
+}
+
+/*Flush any outstanding block flags for a SB (e.g., one with fewer than 16
+   blocks).
+  Nothing is done when no blocks have been seen since the last completed
+   super block.
+  _fr: The tracker to update.*/
+static void oc_fr_state_flush_sb(oc_fr_state *_fr){
+  ptrdiff_t bits;
+  int       sb_partial;
+  /*Only meaningful for a SB that is not partially coded; given a defined
+     value so that no indeterminate value is ever passed on.*/
+  int       sb_full=0;
+  int       b_coded_count;
+  int       b_coded;
+  int       b_count;
+  b_count=_fr->b_count;
+  if(b_count>0){
+    bits=_fr->bits;
+    b_coded=_fr->b_coded;
+    b_coded_count=_fr->b_coded_count;
+    if(b_coded_count>=b_count){
+      /*This SB was fully coded/uncoded; roll back the partial block flags.*/
+      bits-=oc_block_run_bits(b_coded_count);
+      /*If the run started before this SB, re-add the cost of the part that
+         belongs to earlier SBs.*/
+      if(b_coded_count>b_count)bits+=oc_block_run_bits(b_coded_count-b_count);
+      sb_partial=0;
+      sb_full=b_coded;
+      b_coded=_fr->b_coded_prev;
+      b_coded_count=_fr->b_coded_count_prev;
+    }
+    else{
+      /*It was partially coded.*/
+      sb_partial=1;
+      /*sb_full is unused.*/
+    }
+    _fr->bits=bits;
+    _fr->b_coded_count=b_coded_count;
+    _fr->b_coded_count_prev=b_coded_count;
+    _fr->b_count=0;
+    _fr->b_coded=b_coded;
+    _fr->b_coded_prev=b_coded;
+    oc_fr_state_advance_sb(_fr,sb_partial,sb_full);
+  }
+}
+
+/*Advance the flag cost tracker by one block.
+  Once 16 blocks have been seen, the super block is complete and the super
+   block level flags are updated as well.
+  _fr:      The tracker to update.
+  _b_coded: Whether the block is coded.*/
+static void oc_fr_state_advance_block(oc_fr_state *_fr,int _b_coded){
+  ptrdiff_t bits;
+  int       b_coded_count;
+  int       b_count;
+  int       sb_partial;
+  /*Only meaningful for a SB that is not partially coded; given a defined
+     value so that no indeterminate value is ever passed on.*/
+  int       sb_full=0;
+  bits=_fr->bits;
+  /*Extend the b_coded run, or start a new one.*/
+  b_coded_count=_fr->b_coded_count;
+  if(_fr->b_coded==_b_coded)bits-=oc_block_run_bits(b_coded_count);
+  else b_coded_count=0;
+  b_coded_count++;
+  b_count=_fr->b_count+1;
+  if(b_count>=16){
+    /*We finished a superblock.*/
+    if(b_coded_count>=16){
+      /*It was fully coded/uncoded; roll back the partial block flags.
+        The cost of the lengthened run has not been added yet, so only the
+         part of the run that belongs to earlier SBs has to be re-added.*/
+      if(b_coded_count>16)bits+=oc_block_run_bits(b_coded_count-16);
+      sb_partial=0;
+      sb_full=_b_coded;
+      _b_coded=_fr->b_coded_prev;
+      b_coded_count=_fr->b_coded_count_prev;
+    }
+    else{
+      bits+=oc_block_run_bits(b_coded_count);
+      /*It was partially coded.*/
+      sb_partial=1;
+      /*sb_full is unused.*/
+    }
+    _fr->bits=bits;
+    _fr->b_coded_count=b_coded_count;
+    _fr->b_coded_count_prev=b_coded_count;
+    _fr->b_count=0;
+    _fr->b_coded=_b_coded;
+    _fr->b_coded_prev=_b_coded;
+    oc_fr_state_advance_sb(_fr,sb_partial,sb_full);
+  }
+  else{
+    bits+=oc_block_run_bits(b_coded_count);
+    _fr->bits=bits;
+    _fr->b_coded_count=b_coded_count;
+    _fr->b_count=b_count;
+    _fr->b_coded=_b_coded;
+  }
+}
+
+/*Advance the tracker by one block that is not coded.*/
+static void oc_fr_skip_block(oc_fr_state *_fr){
+  const int b_coded=0;
+  oc_fr_state_advance_block(_fr,b_coded);
+}
+
+/*Advance the tracker by one block that is coded.*/
+static void oc_fr_code_block(oc_fr_state *_fr){
+  const int b_coded=1;
+  oc_fr_state_advance_block(_fr,b_coded);
+}
+
+/*Estimate the flag cost of coding the next block rather than skipping it.
+  _fr: The current tracker state; it is not modified.
+  Return: The bit cost after coding one block minus the bit cost after
+   skipping one block.*/
+static int oc_fr_cost1(const oc_fr_state *_fr){
+  oc_fr_state skipped;
+  oc_fr_state coded;
+  /*Try both outcomes on private copies of the state.*/
+  skipped=*_fr;
+  oc_fr_skip_block(&skipped);
+  coded=*_fr;
+  oc_fr_code_block(&coded);
+  return (int)(coded.bits-skipped.bits);
+}
+
+/*Estimate the flag cost of a group of four blocks relative to skipping all
+   four of them.
+  _pre:  The tracker state before the four blocks; it is not modified.
+  _post: The tracker state after the four blocks were processed.
+  Return: The bit cost in _post minus the bit cost reached by skipping four
+   blocks starting from _pre.*/
+static int oc_fr_cost4(const oc_fr_state *_pre,const oc_fr_state *_post){
+  oc_fr_state skipped;
+  int         bi;
+  skipped=*_pre;
+  for(bi=0;bi<4;bi++)oc_fr_skip_block(&skipped);
+  return (int)(_post->bits-skipped.bits);
+}
+
+
+
+/*State to track the bit cost of the per-block quantizer index (qii) flags.
+  Two flag runs are tracked; the flag fields hold -1 until the corresponding
+   run has been started.*/
+struct oc_qii_state{
+  /*The running bit cost estimate.*/
+  ptrdiff_t  bits;
+  /*The length of the current qi01 run.*/
+  unsigned   qi01_count:14;
+  /*The current qi01 flag: 0 for qii 0, 1 for qii 1 or 2.*/
+  signed int qi01:2;
+  /*The length of the current qi12 run.*/
+  unsigned   qi12_count:14;
+  /*The current qi12 flag: 0 for qii 1, 1 for qii 2; not updated by blocks
+     with qii 0.*/
+  signed int qi12:2;
+};
+
+
+
+/*Put a qii flag tracker into its initial state: no bits spent, both run
+   lengths zero, and neither run started (both flags are -1).*/
+static void oc_qii_state_init(oc_qii_state *_qs){
+  /*No flag value has been seen yet for either run.*/
+  _qs->qi12=-1;
+  _qs->qi01=-1;
+  /*Both counters and the accumulated cost start at zero.*/
+  _qs->qi12_count=0;
+  _qs->qi01_count=0;
+  _qs->bits=0;
+}
+
+
+/*Compute the qii flag tracker state that results from coding one more block.
+  The run costs are handled like those of the super block flags: the cost of
+   the run being extended is removed and the cost of the lengthened (or newly
+   started) run is added, with a run of length 4129 being ended at the price
+   of one bit.
+  _qd:  Receives the new state; every field is written.
+  _qs:  The state before the block.
+  _qii: The quantizer index of the block (presumably 0, 1 or 2; confirm
+         against callers).*/
+static void oc_qii_state_advance(oc_qii_state *_qd,
+ const oc_qii_state *_qs,int _qii){
+  ptrdiff_t bits;
+  int       qi01;
+  int       qi01_count;
+  int       qi12;
+  int       qi12_count;
+  bits=_qs->bits;
+  /*The first flag is 0 for qii 0 and 1 for qii 1 or 2.*/
+  qi01=_qii+1>>1;
+  qi01_count=_qs->qi01_count;
+  if(qi01==_qs->qi01){
+    if(qi01_count>=4129){
+      bits++;
+      qi01_count=0;
+    }
+    else bits-=oc_sb_run_bits(qi01_count);
+  }
+  else qi01_count=0;
+  qi01_count++;
+  bits+=oc_sb_run_bits(qi01_count);
+  qi12_count=_qs->qi12_count;
+  /*The second flag is only present for a non-zero qii: 0 for qii 1 and 1 for
+     qii 2.*/
+  if(_qii){
+    qi12=_qii>>1;
+    if(qi12==_qs->qi12){
+      if(qi12_count>=4129){
+        bits++;
+        qi12_count=0;
+      }
+      else bits-=oc_sb_run_bits(qi12_count);
+    }
+    else qi12_count=0;
+    qi12_count++;
+    bits+=oc_sb_run_bits(qi12_count);
+  }
+  /*Otherwise the second run is carried over unchanged.*/
+  else qi12=_qs->qi12;
+  _qd->bits=bits;
+  _qd->qi01=qi01;
+  _qd->qi01_count=qi01_count;
+  _qd->qi12=qi12;
+  _qd->qi12_count=qi12_count;
+}
+
+
+
+/*Temporary encoder state for the analysis pipeline.
+  All arrays of size 3 are indexed by color plane.*/
+struct oc_enc_pipeline_state{
+  /*The loop filter bounding values, filled in by
+     oc_state_loop_filter_init().*/
+  int                 bounding_values[256];
+  /*The per-plane coded block flag cost trackers.*/
+  oc_fr_state         fr[3];
+  /*The per-plane qii flag cost trackers.*/
+  oc_qii_state        qs[3];
+  /*Condensed dequantization tables, indexed by [pli][qii][qti].*/
+  const ogg_uint16_t *dequant[3][3][2];
+  /*Condensed quantization tables, indexed by [pli][qii][qti].*/
+  const oc_iquant    *enquant[3][3][2];
+  /*Skip SSD storage for the current MCU in each plane.*/
+  unsigned           *skip_ssd[3];
+  /*Coded/uncoded fragment lists for each plane for the current MCU.*/
+  ptrdiff_t          *coded_fragis[3];
+  ptrdiff_t          *uncoded_fragis[3];
+  /*The number of entries in each of the lists above.*/
+  ptrdiff_t           ncoded_fragis[3];
+  ptrdiff_t           nuncoded_fragis[3];
+  /*The starting fragment for the current MCU in each plane.*/
+  ptrdiff_t           froffset[3];
+  /*The starting row for the current MCU in each plane.*/
+  int                 fragy0[3];
+  /*The ending row for the current MCU in each plane.*/
+  int                 fragy_end[3];
+  /*The starting superblock for the current MCU in each plane.*/
+  unsigned            sbi0[3];
+  /*The ending superblock for the current MCU in each plane.*/
+  unsigned            sbi_end[3];
+  /*The number of tokens for zzi=1 for each color plane.*/
+  int                 ndct_tokens1[3];
+  /*The outstanding eob_run count for zzi=1 for each color plane.*/
+  int                 eob_run1[3];
+  /*Whether or not the loop filter is enabled.*/
+  int                 loop_filter;
+};
+
+
+/*Initialize the analysis pipeline state from the encoder context.
+  The fields describing the current MCU stripe (froffset, fragy0, fragy_end,
+   sbi0 and sbi_end) are not set here.
+  _enc:  The encoder context to read the configuration from.
+  _pipe: The pipeline state to initialize.*/
+static void oc_enc_pipeline_init(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe){
+  ptrdiff_t *coded_fragis;
+  unsigned   mcu_nvsbs;
+  ptrdiff_t  mcu_nfrags;
+  int        hdec;
+  int        vdec;
+  int        pli;
+  int        qii;
+  int        qti;
+  /*Initialize the per-plane coded block flag trackers.
+    These are used for bit-estimation purposes only; the real flag bits span
+     all three planes, so we can't compute them in parallel.*/
+  for(pli=0;pli<3;pli++)oc_fr_state_init(_pipe->fr+pli);
+  for(pli=0;pli<3;pli++)oc_qii_state_init(_pipe->qs+pli);
+  /*Set up the per-plane skip SSD storage pointers.
+    The three planes share one buffer; each chroma plane gets the luma size
+     shifted down by the number of decimated directions.*/
+  mcu_nvsbs=_enc->mcu_nvsbs;
+  mcu_nfrags=mcu_nvsbs*_enc->state.fplanes[0].nhsbs*16;
+  hdec=!(_enc->state.info.pixel_fmt&1);
+  vdec=!(_enc->state.info.pixel_fmt&2);
+  _pipe->skip_ssd[0]=_enc->mcu_skip_ssd;
+  _pipe->skip_ssd[1]=_pipe->skip_ssd[0]+mcu_nfrags;
+  _pipe->skip_ssd[2]=_pipe->skip_ssd[1]+(mcu_nfrags>>hdec+vdec);
+  /*Set up per-plane pointers to the coded and uncoded fragments lists.
+    Unlike the decoder, each plane's coded and uncoded fragment list is kept
+     separate during the analysis stage; we only make the coded list for all
+     three planes contiguous right before the final packet is output
+     (destroying the uncoded lists, which are no longer needed).
+    The coded list starts at the beginning of the plane's region and the
+     uncoded list pointer starts at its end.*/
+  coded_fragis=_enc->state.coded_fragis;
+  for(pli=0;pli<3;pli++){
+    _pipe->coded_fragis[pli]=coded_fragis;
+    coded_fragis+=_enc->state.fplanes[pli].nfrags;
+    _pipe->uncoded_fragis[pli]=coded_fragis;
+  }
+  memset(_pipe->ncoded_fragis,0,sizeof(_pipe->ncoded_fragis));
+  memset(_pipe->nuncoded_fragis,0,sizeof(_pipe->nuncoded_fragis));
+  /*Set up condensed quantizer tables.
+    Only the entries for the nqis quantizer indices in use are filled in.*/
+  for(pli=0;pli<3;pli++){
+    for(qii=0;qii<_enc->state.nqis;qii++){
+      int qi;
+      qi=_enc->state.qis[qii];
+      for(qti=0;qti<2;qti++){
+        _pipe->dequant[pli][qii][qti]=_enc->state.dequant_tables[qi][pli][qti];
+        _pipe->enquant[pli][qii][qti]=_enc->enquant_tables[qi][pli][qti];
+      }
+    }
+  }
+  /*Initialize the tokenization state.*/
+  for(pli=0;pli<3;pli++){
+    _pipe->ndct_tokens1[pli]=0;
+    _pipe->eob_run1[pli]=0;
+  }
+  /*Initialize the bounding value array for the loop filter.
+    The filter is enabled when the init function returns zero.*/
+  _pipe->loop_filter=!oc_state_loop_filter_init(&_enc->state,
+   _pipe->bounding_values);
+}
+
+/*Sets the current MCU stripe to super block row _sby.
+  _enc:  The encoder context.
+  _pipe: The pipeline state whose per-plane stripe bounds are updated.
+  _sby:  The first luma super block row of the stripe.
+  Return: A non-zero value if this was NOT the last MCU, i.e., if more super
+   block rows remain after this stripe; zero if this was the last one.*/
+static int oc_enc_pipeline_set_stripe(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,int _sby){
+  const oc_fragment_plane *fplane;
+  unsigned                 mcu_nvsbs;
+  int                      sby_end;
+  int                      notdone;
+  int                      vdec;
+  int                      pli;
+  mcu_nvsbs=_enc->mcu_nvsbs;
+  sby_end=_enc->state.fplanes[0].nvsbs;
+  notdone=_sby+mcu_nvsbs<sby_end;
+  if(notdone)sby_end=_sby+mcu_nvsbs;
+  /*The luma plane is never decimated; vdec is switched to the chroma value
+     at the end of the first iteration.*/
+  vdec=0;
+  for(pli=0;pli<3;pli++){
+    fplane=_enc->state.fplanes+pli;
+    _pipe->sbi0[pli]=fplane->sboffset+(_sby>>vdec)*fplane->nhsbs;
+    /*Each super block row spans 4 fragment rows (2 when counted in luma
+       super block rows on a vertically decimated plane).*/
+    _pipe->fragy0[pli]=_sby<<2-vdec;
+    _pipe->froffset[pli]=fplane->froffset
+     +_pipe->fragy0[pli]*(ptrdiff_t)fplane->nhfrags;
+    if(notdone){
+      _pipe->sbi_end[pli]=fplane->sboffset+(sby_end>>vdec)*fplane->nhsbs;
+      _pipe->fragy_end[pli]=sby_end<<2-vdec;
+    }
+    else{
+      /*The last stripe runs to the end of the plane.*/
+      _pipe->sbi_end[pli]=fplane->sboffset+fplane->nsbs;
+      _pipe->fragy_end[pli]=fplane->nvfrags;
+    }
+    vdec=!(_enc->state.info.pixel_fmt&2);
+  }
+  return notdone;
+}
+
+/*Finish processing one plane of the current MCU stripe: copy the uncoded
+   fragments, run DC prediction and DC tokenization, advance the coded
+   fragment list, apply the loop filter (when enabled), and fill the borders.
+  _enc:    The encoder context.
+  _pipe:   The pipeline state for the current stripe.
+  _pli:    The color plane to finish.
+  _sdelay: The number of fragment rows subtracted from the stripe's starting
+            row for the loop filter and border fill.
+  _edelay: The number of fragment rows subtracted from the stripe's ending
+            row for the loop filter and border fill.*/
+static void oc_enc_pipeline_finish_mcu_plane(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,int _pli,int _sdelay,int _edelay){
+  int refi;
+  /*Copy over all the uncoded fragments from this plane and advance the uncoded
+     fragment list.*/
+  _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
+  oc_state_frag_copy_list(&_enc->state,_pipe->uncoded_fragis[_pli],
+   _pipe->nuncoded_fragis[_pli],OC_FRAME_SELF,OC_FRAME_PREV,_pli);
+  _pipe->nuncoded_fragis[_pli]=0;
+  /*Perform DC prediction.*/
+  oc_enc_pred_dc_frag_rows(_enc,_pli,
+   _pipe->fragy0[_pli],_pipe->fragy_end[_pli]);
+  /*Finish DC tokenization.*/
+  oc_enc_tokenize_dc_frag_list(_enc,_pli,
+   _pipe->coded_fragis[_pli],_pipe->ncoded_fragis[_pli],
+   _pipe->ndct_tokens1[_pli],_pipe->eob_run1[_pli]);
+  /*Remember the zzi=1 token state for the next stripe.*/
+  _pipe->ndct_tokens1[_pli]=_enc->ndct_tokens[_pli][1];
+  _pipe->eob_run1[_pli]=_enc->eob_run[_pli][1];
+  /*And advance the coded fragment list.*/
+  _enc->state.ncoded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
+  _pipe->coded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
+  _pipe->ncoded_fragis[_pli]=0;
+  /*Apply the loop filter if necessary.*/
+  refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
+  if(_pipe->loop_filter){
+    oc_state_loop_filter_frag_rows(&_enc->state,_pipe->bounding_values,
+     refi,_pli,_pipe->fragy0[_pli]-_sdelay,_pipe->fragy_end[_pli]-_edelay);
+  }
+  /*Without the loop filter there is nothing to wait for, so the delays are
+     dropped for the border fill below.*/
+  else _sdelay=_edelay=0;
+  /*To fill borders, we have an additional two pixel delay, since a fragment
+     in the next row could filter its top edge, using two pixels from a
+     fragment in this row.
+    But there's no reason to delay a full fragment between the two.*/
+  oc_state_borders_fill_rows(&_enc->state,refi,_pli,
+   (_pipe->fragy0[_pli]-_sdelay<<3)-(_sdelay<<1),
+   (_pipe->fragy_end[_pli]-_edelay<<3)-(_edelay<<1));
+}
+
+
+
+/*Cost information about the coded blocks in a MB.
+  NOTE(review): the field descriptions below are inferred from the names
+   only; confirm against the code that fills this structure in.*/
+struct oc_rd_metric{
+  /*Presumably the AC SSD if the blocks are left uncoded.*/
+  int uncoded_ac_ssd;
+  /*Presumably the AC SSD if the blocks are coded.*/
+  int coded_ac_ssd;
+  /*Presumably the number of bits spent on AC coefficients.*/
+  int ac_bits;
+  /*Presumably a flag describing the DC coefficients.*/
+  int dc_flag;
+};
+
+
+
+/*Codes a single fragment: forms the prediction residual, applies the forward
+   DCT, quantizes and tokenizes the coefficients, and writes the
+   reconstruction into the OC_FRAME_SELF reference frame.
+  Outside of INTRA frames it then checks whether skipping the block would have
+   been cheaper, and if so rolls the tokens back.
+  _pli:           The color plane the fragment belongs to.
+  _fragi:         The index of the fragment to code.
+  _overhead_bits: The cost in bits of flagging the block as coded; negative
+                   values are treated as 0.
+  _mo:            Accumulates SSD and bit totals for blocks that stay coded.
+                  Only dereferenced when the frame is not an INTRA frame.
+  _stack:         The token checkpoint stack pointer; advanced by AC
+                   tokenization and restored if the block is rolled back.
+  Return: 1 if the block was coded (frags[_fragi].coded and .dc are set), or 0
+   if it was skipped (frags[_fragi].coded is cleared).*/
+static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,int _overhead_bits,
+ oc_rd_metric *_mo,oc_token_checkpoint **_stack){
+  OC_ALIGN16(ogg_int16_t  dct[64]);
+  OC_ALIGN16(ogg_int16_t  data[64]);
+  ogg_uint16_t            dc_dequant;
+  const ogg_uint16_t     *dequant;
+  const oc_iquant        *enquant;
+  ptrdiff_t               frag_offs;
+  int                     ystride;
+  const unsigned char    *src;
+  const unsigned char    *ref;
+  unsigned char          *dst;
+  int                     frame_type;
+  int                     nonzero;
+  unsigned                uncoded_ssd;
+  unsigned                coded_ssd;
+  int                     coded_dc;
+  oc_token_checkpoint    *checkpoint;
+  oc_fragment            *frags;
+  int                     mb_mode;
+  int                     mv_offs[2];
+  int                     nmv_offs;
+  int                     ac_bits;
+  int                     borderi;
+  int                     qti;
+  int                     qii;
+  int                     pi;
+  int                     zzi;
+  int                     v;
+  int                     val;
+  int                     d;
+  int                     s;
+  int                     dc;
+  frags=_enc->state.frags;
+  frag_offs=_enc->state.frag_buf_offs[_fragi];
+  ystride=_enc->state.ref_ystride[_pli];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO]+frag_offs;
+  borderi=frags[_fragi].borderi;
+  qii=frags[_fragi].qii;
+  /*Any bit above the low two marks a block flagged for skipping.
+    The mode analysis functions in this file add 4 to a block's qii when they
+     predict a skip; presumably that is what sets these bits here.*/
+  if(qii&~3){
+#if !defined(OC_COLLECT_METRICS)
+    if(_enc->sp_level>=OC_SP_LEVEL_EARLY_SKIP){
+      /*Enable early skip detection.*/
+      frags[_fragi].coded=0;
+      return 0;
+    }
+#endif
+    /*Try and code this block anyway.*/
+    qii&=3;
+    frags[_fragi].qii=qii;
+  }
+  mb_mode=frags[_fragi].mb_mode;
+  ref=_enc->state.ref_frame_data[
+   _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]+frag_offs;
+  dst=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_SELF]]
+   +frag_offs;
+  /*Motion compensation:*/
+  switch(mb_mode){
+    case OC_MODE_INTRA:{
+      nmv_offs=0;
+      oc_enc_frag_sub_128(_enc,data,src,ystride);
+    }break;
+    case OC_MODE_GOLDEN_NOMV:
+    case OC_MODE_INTER_NOMV:{
+      nmv_offs=1;
+      mv_offs[0]=0;
+      oc_enc_frag_sub(_enc,data,src,ref,ystride);
+    }break;
+    default:{
+      const oc_mv *frag_mvs;
+      frag_mvs=(const oc_mv *)_enc->state.frag_mvs;
+      nmv_offs=oc_state_get_mv_offsets(&_enc->state,mv_offs,_pli,
+       frag_mvs[_fragi][0],frag_mvs[_fragi][1]);
+      if(nmv_offs>1){
+        /*With two offsets the predictor is first built in dst, and the
+           residual is taken against that.*/
+        oc_enc_frag_copy2(_enc,dst,
+         ref+mv_offs[0],ref+mv_offs[1],ystride);
+        oc_enc_frag_sub(_enc,data,src,dst,ystride);
+      }
+      else oc_enc_frag_sub(_enc,data,src,ref+mv_offs[0],ystride);
+    }break;
+  }
+#if defined(OC_COLLECT_METRICS)
+  {
+    unsigned satd;
+    switch(nmv_offs){
+      case 0:satd=oc_enc_frag_intra_satd(_enc,src,ystride);break;
+      case 1:{
+        satd=oc_enc_frag_satd_thresh(_enc,src,ref+mv_offs[0],ystride,UINT_MAX);
+      }break;
+      default:{
+        satd=oc_enc_frag_satd_thresh(_enc,src,dst,ystride,UINT_MAX);
+      }
+    }
+    _enc->frag_satd[_fragi]=satd;
+  }
+#endif
+  /*Transform:*/
+  oc_enc_fdct8x8(_enc,dct,data);
+  /*Quantize the DC coefficient:*/
+  qti=mb_mode!=OC_MODE_INTRA;
+  /*The DC coefficient always uses the tables for qii 0.*/
+  enquant=_pipe->enquant[_pli][0][qti];
+  dc_dequant=_pipe->dequant[_pli][0][qti][0];
+  v=dct[0];
+  val=v<<1;
+  s=OC_SIGNMASK(val);
+  val+=dc_dequant+s^s;
+  val=((enquant[0].m*(ogg_int32_t)val>>16)+val>>enquant[0].l)-s;
+  dc=OC_CLAMPI(-580,val,580);
+  nonzero=0;
+  /*Quantize the AC coefficients:*/
+  dequant=_pipe->dequant[_pli][qii][qti];
+  enquant=_pipe->enquant[_pli][qii][qti];
+  for(zzi=1;zzi<64;zzi++){
+    v=dct[OC_FZIG_ZAG[zzi]];
+    d=dequant[zzi];
+    val=v<<1;
+    v=abs(val);
+    if(v>=d){
+      s=OC_SIGNMASK(val);
+      /*The bias added here rounds ties away from zero, since token
+         optimization can only decrease the magnitude of the quantized
+         value.*/
+      val+=d+s^s;
+      /*Note the arithmetic right shift is not guaranteed by ANSI C.
+        Hopefully no one still uses ones-complement architectures.*/
+      val=((enquant[zzi].m*(ogg_int32_t)val>>16)+val>>enquant[zzi].l)-s;
+      data[zzi]=OC_CLAMPI(-580,val,580);
+      /*Track the last zig-zag index holding a non-zero value.*/
+      nonzero=zzi;
+    }
+    else data[zzi]=0;
+  }
+  /*Tokenize.*/
+  /*Remember the stack position so the tokens can be rolled back below.*/
+  checkpoint=*_stack;
+  ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,dct,nonzero+1,
+   _stack,qti?0:3);
+  /*Reconstruct.
+    TODO: nonzero may need to be adjusted after tokenization.*/
+  if(nonzero==0){
+    ogg_int16_t p;
+    int         ci;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_int16_t)(dc*(ogg_int32_t)dc_dequant+15>>5);
+    /*LOOP VECTORIZES.*/
+    for(ci=0;ci<64;ci++)data[ci]=p;
+  }
+  else{
+    data[0]=dc*dc_dequant;
+    oc_idct8x8(&_enc->state,data,nonzero+1);
+  }
+  if(!qti)oc_enc_frag_recon_intra(_enc,dst,ystride,data);
+  else{
+    /*With a single offset the predictor is read straight from the reference
+       frame; otherwise it is the averaged predictor already stored in dst.*/
+    oc_enc_frag_recon_inter(_enc,dst,
+     nmv_offs==1?ref+mv_offs[0]:dst,ystride,data);
+  }
+  frame_type=_enc->state.frame_type;
+#if !defined(OC_COLLECT_METRICS)
+  if(frame_type!=OC_INTRA_FRAME)
+#endif
+  {
+    /*In retrospect, should we have skipped this block?*/
+    oc_enc_frag_sub(_enc,data,src,dst,ystride);
+    coded_ssd=coded_dc=0;
+    if(borderi<0){
+      for(pi=0;pi<64;pi++){
+        coded_ssd+=data[pi]*data[pi];
+        coded_dc+=data[pi];
+      }
+    }
+    else{
+      ogg_int64_t mask;
+      /*Only the pixels whose bit is set in the border mask are counted.*/
+      mask=_enc->state.borders[borderi].mask;
+      for(pi=0;pi<64;pi++,mask>>=1)if(mask&1){
+        coded_ssd+=data[pi]*data[pi];
+        coded_dc+=data[pi];
+      }
+    }
+    /*Scale to match DCT domain.*/
+    coded_ssd<<=4;
+    /*We actually only want the AC contribution to the SSD.*/
+    coded_ssd-=coded_dc*coded_dc>>2;
+#if defined(OC_COLLECT_METRICS)
+    _enc->frag_ssd[_fragi]=coded_ssd;
+  }
+  if(frame_type!=OC_INTRA_FRAME){
+#endif
+    uncoded_ssd=_pipe->skip_ssd[_pli][_fragi-_pipe->froffset[_pli]];
+    /*A skip SSD of UINT_MAX marks a block that must be coded (oc_skip_cost()
+       stores that value when the DC difference is too large).*/
+    if(uncoded_ssd<UINT_MAX){
+      /*Although the fragment coding overhead determination is accurate, it is
+         greedy, using very coarse-grained local information.
+        Allowing it to mildly discourage coding turns out to be beneficial, but
+         it's not clear that allowing it to encourage coding through negative
+         coding overhead deltas is useful.
+        For that reason, we disallow negative coding_overheads.*/
+      if(_overhead_bits<0)_overhead_bits=0;
+      if(uncoded_ssd<=coded_ssd+(_overhead_bits+ac_bits)*_enc->lambda&&
+       /*Don't allow luma blocks to be skipped in 4MV mode when VP3
+          compatibility is enabled.*/
+       (!_enc->vp3_compatible||mb_mode!=OC_MODE_INTER_MV_FOUR||_pli)){
+        /*Hm, not worth it; roll back.*/
+        oc_enc_tokenlog_rollback(_enc,checkpoint,(*_stack)-checkpoint);
+        *_stack=checkpoint;
+        frags[_fragi].coded=0;
+        return 0;
+      }
+    }
+    else _mo->dc_flag=1;
+    _mo->uncoded_ac_ssd+=uncoded_ssd;
+    _mo->coded_ac_ssd+=coded_ssd;
+    _mo->ac_bits+=ac_bits;
+  }
+  oc_qii_state_advance(_pipe->qs+_pli,_pipe->qs+_pli,qii);
+  frags[_fragi].dc=dc;
+  frags[_fragi].coded=1;
+  return 1;
+}
+
+/*Transforms, quantizes and tokenizes the four luma blocks of a macroblock.
+  In non-INTRA frames it then re-checks, with the coded block flag and mode
+   overhead included, whether coding the macroblock was worthwhile, and
+   un-codes all of its luma blocks if not.
+  _mbi:           The index of the macroblock.
+  _mode_overhead: Additional bits charged to the macroblock in that check.
+  Return: The number of luma blocks that remain coded.*/
+static int oc_enc_mb_transform_quantize_luma(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,unsigned _mbi,int _mode_overhead){
+  /*Large enough for the worst case token stack usage of 4 fragments.*/
+  oc_token_checkpoint  tokens[64*4];
+  oc_token_checkpoint *tokens_top;
+  const ptrdiff_t     *mb_fragis;
+  signed char         *mb_modes;
+  oc_fragment         *frags;
+  oc_fr_state         *fr;
+  ptrdiff_t           *coded_list;
+  ptrdiff_t           *skipped_list;
+  ptrdiff_t            coded_count;
+  ptrdiff_t            skipped_count;
+  ptrdiff_t            fragi;
+  oc_rd_metric         metric;
+  oc_fr_state          fr_saved;
+  oc_qii_state         qs_saved;
+  int                  mode;
+  int                  nblocks_coded;
+  int                  bi;
+  /*Save the flag and quantizer coding states so they can be restored if the
+     whole macroblock is rolled back.*/
+  fr=_pipe->fr+0;
+  fr_saved=*fr;
+  qs_saved=_pipe->qs[0];
+  mb_fragis=_enc->state.sb_maps[_mbi>>2][_mbi&3];
+  mb_modes=_enc->state.mb_modes;
+  frags=_enc->state.frags;
+  coded_list=_pipe->coded_fragis[0];
+  coded_count=_pipe->ncoded_fragis[0];
+  skipped_list=_pipe->uncoded_fragis[0];
+  skipped_count=_pipe->nuncoded_fragis[0];
+  mode=mb_modes[_mbi];
+  nblocks_coded=0;
+  tokens_top=tokens;
+  memset(&metric,0,sizeof(metric));
+  for(bi=0;bi<4;bi++){
+    fragi=mb_fragis[bi];
+    frags[fragi].mb_mode=mode;
+    if(!oc_enc_block_transform_quantize(_enc,
+     _pipe,0,fragi,oc_fr_cost1(fr),&metric,&tokens_top)){
+      /*The uncoded list grows downwards from its base pointer.*/
+      skipped_count++;
+      skipped_list[-skipped_count]=fragi;
+      oc_fr_skip_block(fr);
+    }
+    else{
+      oc_fr_code_block(fr);
+      coded_list[coded_count]=fragi;
+      coded_count++;
+      nblocks_coded++;
+    }
+  }
+  if(_enc->state.frame_type!=OC_INTRA_FRAME){
+    if(nblocks_coded>0&&!metric.dc_flag){
+      int cost;
+      /*At least one block looked worth coding on its own.
+        Check that this still holds once the flag and mode overhead for the
+         whole macroblock are charged.*/
+      cost=metric.coded_ac_ssd+_enc->lambda*(metric.ac_bits
+       +oc_fr_cost4(&fr_saved,fr)+_mode_overhead);
+      if(metric.uncoded_ac_ssd<=cost){
+        /*It does not: discard the tokens, restore the saved states, and mark
+           every block of the macroblock as skipped.*/
+        oc_enc_tokenlog_rollback(_enc,tokens,tokens_top-tokens);
+        *fr=fr_saved;
+        _pipe->qs[0]=qs_saved;
+        for(bi=0;bi<4;bi++){
+          fragi=mb_fragis[bi];
+          if(frags[fragi].coded){
+            skipped_count++;
+            skipped_list[-skipped_count]=fragi;
+            frags[fragi].coded=0;
+          }
+          oc_fr_skip_block(fr);
+        }
+        coded_count-=nblocks_coded;
+        nblocks_coded=0;
+      }
+    }
+    switch(nblocks_coded){
+      /*With no coded luma blocks the mode is forced.*/
+      case 0:mb_modes[_mbi]=OC_MODE_INTER_NOMV;break;
+      /*Assume a 1MV with one coded block is always cheaper than a 4MV with
+         one coded block.
+        This may not be strictly true: a 4MV computes chroma MVs using (0,0)
+         for skipped blocks, while a 1MV does not.*/
+      case 1:{
+        if(mode==OC_MODE_INTER_MV_FOUR)mb_modes[_mbi]=OC_MODE_INTER_MV;
+      }break;
+      default:break;
+    }
+  }
+  _pipe->ncoded_fragis[0]=coded_count;
+  _pipe->nuncoded_fragis[0]=skipped_count;
+  return nblocks_coded;
+}
+
+/*Transforms, quantizes and tokenizes the chroma fragments of a range of
+   superblocks in one plane, sorting each fragment into the coded or uncoded
+   list and recording the superblock coded flags.
+  _pli:       The color plane to process.
+  _sbi_start: The first superblock to process.
+  _sbi_end:   One past the last superblock to process.*/
+static void oc_enc_sb_transform_quantize_chroma(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
+  /*Large enough for the worst case token stack usage of 1 fragment.*/
+  oc_token_checkpoint  tokens[64];
+  oc_sb_flags         *sb_flags;
+  oc_fr_state         *fr;
+  ptrdiff_t           *coded_list;
+  ptrdiff_t           *skipped_list;
+  ptrdiff_t            coded_count;
+  ptrdiff_t            skipped_count;
+  int                  sbi;
+  sb_flags=_enc->state.sb_flags;
+  fr=_pipe->fr+_pli;
+  coded_list=_pipe->coded_fragis[_pli];
+  coded_count=_pipe->ncoded_fragis[_pli];
+  skipped_list=_pipe->uncoded_fragis[_pli];
+  skipped_count=_pipe->nuncoded_fragis[_pli];
+  for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
+    oc_rd_metric metric;
+    int          quadi;
+    memset(&metric,0,sizeof(metric));
+    for(quadi=0;quadi<4;quadi++){
+      const ptrdiff_t *quad_fragis;
+      int              bi;
+      quad_fragis=_enc->state.sb_maps[sbi][quadi];
+      for(bi=0;bi<4;bi++){
+        oc_token_checkpoint *tokens_top;
+        ptrdiff_t            fragi;
+        fragi=quad_fragis[bi];
+        /*Negative entries mark map slots with no fragment.*/
+        if(fragi<0)continue;
+        tokens_top=tokens;
+        if(!oc_enc_block_transform_quantize(_enc,
+         _pipe,_pli,fragi,oc_fr_cost1(fr),&metric,&tokens_top)){
+          /*The uncoded list grows downwards from its base pointer.*/
+          skipped_count++;
+          skipped_list[-skipped_count]=fragi;
+          oc_fr_skip_block(fr);
+        }
+        else{
+          coded_list[coded_count]=fragi;
+          coded_count++;
+          oc_fr_code_block(fr);
+        }
+      }
+    }
+    /*Close out the superblock and copy its flags out of the coding state.*/
+    oc_fr_state_flush_sb(fr);
+    sb_flags[sbi].coded_fully=fr->sb_full;
+    sb_flags[sbi].coded_partially=fr->sb_partial;
+  }
+  _pipe->ncoded_fragis[_pli]=coded_count;
+  _pipe->nuncoded_fragis[_pli]=skipped_count;
+}
+
+/*Mode decision is done by exhaustively examining all potential choices.
+  Obviously, doing the motion compensation, fDCT, tokenization, and then
+   counting the bits each token uses is computationally expensive.
+  Theora's EOB runs can also split the cost of these tokens across multiple
+   fragments, and naturally we don't know what the optimal choice of Huffman
+   codes will be until we know all the tokens we're going to encode in all the
+   fragments.
+  So we use a simple approach to estimating the bit cost and distortion of each
+   mode based upon the SATD value of the residual before coding.
+  The mathematics behind the technique are outlined by Kim \cite{Kim03}, but
+   the process (modified somewhat from that of the paper) is very simple.
+  We build a non-linear regression of the mappings from
+   (pre-transform+quantization) SATD to (post-transform+quantization) bits and
+   SSD for each qi.
+  A separate set of mappings is kept for each quantization type and color
+   plane.
+  The mappings are constructed by partitioning the SATD values into a small
+   number of bins (currently 24) and using a linear regression in each bin
+   (as opposed to the 0th-order regression used by Kim).
+  The bit counts and SSD measurements are obtained by examining actual encoded
+   frames, with appropriate lambda values and optimal Huffman codes selected.
+  EOB bits are assigned to the fragment that started the EOB run (as opposed to
+   dividing them among all the blocks in the run; though the latter approach
+   seems more theoretically correct, Monty's testing showed a small improvement
+   with the former, though that may have been merely statistical noise).
+
+  @ARTICLE{Kim03,
+    author="Hyun Mun Kim",
+    title="Adaptive Rate Control Using Nonlinear Regression",
+    journal="IEEE Transactions on Circuits and Systems for Video Technology",
+    volume=13,
+    number=5,
+    pages="432--439",
+    month=May,
+    year=2003
+  }*/
+
+/*Computes (_ssd+_lambda*_rate)/(1<<OC_BIT_SCALE) with rounding, avoiding
+   overflow for large lambda values.
+  The integer and fractional parts of _ssd and _rate are scaled separately,
+   so only the fractional part of _rate is multiplied by _lambda before the
+   final shift.
+  The expansion is wrapped in parentheses so that it can be used safely as an
+   operand of a larger expression.
+  Every argument is evaluated twice, so arguments must not have side
+   effects.*/
+#define OC_MODE_RD_COST(_ssd,_rate,_lambda) \
+ (((_ssd)>>OC_BIT_SCALE)+((_rate)>>OC_BIT_SCALE)*(_lambda) \
+ +(((_ssd)&(1<<OC_BIT_SCALE)-1)+((_rate)&(1<<OC_BIT_SCALE)-1)*(_lambda) \
+ +((1<<OC_BIT_SCALE)>>1)>>OC_BIT_SCALE))
+
+/*Estimates the rate and distortion of a block's DCT coefficients from the
+   SATD of the block after prediction, by interpolating linearly between two
+   neighboring entries of the OC_MODE_RD table.
+  _ssd:  Returns the estimated distortion.
+  _qi:   The quantizer index selecting the table to use.
+  _pli:  The color plane of the block.
+  _qti:  The quantization type of the block.
+  _satd: The SATD of the block.
+  Return: The estimated rate, never less than 0.*/
+static unsigned oc_dct_cost2(unsigned *_ssd,
+ int _qi,int _pli,int _qti,int _satd){
+  unsigned rmse;
+  int      bin;
+  int      offs;
+  int      rate_lo;
+  int      rate_step;
+  int      rmse_lo;
+  int      rmse_step;
+  /*Chroma SATDs vary much less than luma ones, so they are scaled by 4 (a
+     shift by 2 for planes 1 and 2, by 0 for plane 0) to spread them more
+     evenly over the bins.*/
+  _satd<<=(_pli+1)&2;
+  /*The last usable bin is OC_SAD_BINS-2, since entry bin+1 is read too.*/
+  bin=OC_MINI(_satd>>OC_SAD_SHIFT,OC_SAD_BINS-2);
+  offs=_satd-(bin<<OC_SAD_SHIFT);
+  rate_lo=OC_MODE_RD[_qi][_pli][_qti][bin].rate;
+  rate_step=OC_MODE_RD[_qi][_pli][_qti][bin+1].rate-rate_lo;
+  rmse_lo=OC_MODE_RD[_qi][_pli][_qti][bin].rmse;
+  rmse_step=OC_MODE_RD[_qi][_pli][_qti][bin+1].rmse-rmse_lo;
+  rmse=OC_MAXI(rmse_lo+(rmse_step*offs>>OC_SAD_SHIFT),0);
+  *_ssd=rmse*rmse>>(2*OC_RMSE_SCALE-OC_BIT_SCALE);
+  return OC_MAXI(rate_lo+(rate_step*offs>>OC_SAD_SHIFT),0);
+}
+
+/*Select luma block-level quantizers for a MB in an INTRA frame.
+  For every block and every quantizer index, the cheapest chain of choices for
+   the preceding blocks is tracked; the cheapest complete chain is then traced
+   backwards and stored in the qii field of the four luma fragments.
+  _qs:  The quantizer index coding state before this macroblock.
+  _mbi: The index of the macroblock.
+  Return: The estimated R-D cost of the chosen chain.*/
+static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc,
+ const oc_qii_state *_qs,unsigned _mbi){
+  const unsigned char *src;
+  const ptrdiff_t     *frag_buf_offs;
+  const ptrdiff_t     *mb_fragis;
+  const unsigned      *final_cost;
+  oc_fragment         *frags;
+  oc_qii_state         qs[4][3];
+  unsigned             cost[4][3];
+  unsigned             ssd[4][3];
+  unsigned             rate[4][3];
+  int                  prev[3][3];
+  unsigned             satd;
+  unsigned             best_cost;
+  int                  best_qii;
+  int                  qii;
+  int                  lambda;
+  int                  ystride;
+  int                  nqis;
+  int                  bi;
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  mb_fragis=_enc->state.sb_maps[_mbi>>2][_mbi&3];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ystride=_enc->state.ref_ystride[0];
+  /*The first block has no predecessor within the macroblock: each choice is
+     costed directly against the incoming state.*/
+  satd=oc_enc_frag_intra_satd(_enc,src+frag_buf_offs[mb_fragis[0]],ystride);
+  nqis=_enc->state.nqis;
+  lambda=_enc->lambda;
+  for(qii=0;qii<nqis;qii++){
+    oc_qii_state_advance(&qs[0][qii],_qs,qii);
+    rate[0][qii]=oc_dct_cost2(&ssd[0][qii],_enc->state.qis[qii],0,0,satd)
+     +((qs[0][qii].bits-_qs->bits)<<OC_BIT_SCALE);
+    cost[0][qii]=OC_MODE_RD_COST(ssd[0][qii],rate[0][qii],lambda);
+  }
+  for(bi=1;bi<4;bi++){
+    satd=oc_enc_frag_intra_satd(_enc,src+frag_buf_offs[mb_fragis[bi]],ystride);
+    for(qii=0;qii<nqis;qii++){
+      oc_qii_state qt[3];
+      unsigned     cur_ssd;
+      unsigned     cur_rate;
+      unsigned     win_ssd;
+      unsigned     win_rate;
+      unsigned     win_cost;
+      int          win_qij;
+      int          qij;
+      cur_rate=oc_dct_cost2(&cur_ssd,_enc->state.qis[qii],0,0,satd);
+      win_ssd=win_rate=win_cost=0;
+      win_qij=0;
+      /*Try extending the chain that ends in each choice for the previous
+         block; the first candidate always wins, later ones only when they
+         are strictly cheaper.*/
+      for(qij=0;qij<nqis;qij++){
+        unsigned chain_ssd;
+        unsigned chain_rate;
+        unsigned chain_cost;
+        oc_qii_state_advance(&qt[qij],&qs[bi-1][qij],qii);
+        chain_ssd=ssd[bi-1][qij]+cur_ssd;
+        chain_rate=rate[bi-1][qij]+cur_rate
+         +((qt[qij].bits-qs[bi-1][qij].bits)<<OC_BIT_SCALE);
+        chain_cost=OC_MODE_RD_COST(chain_ssd,chain_rate,lambda);
+        if(qij==0||chain_cost<win_cost){
+          win_cost=chain_cost;
+          win_ssd=chain_ssd;
+          win_rate=chain_rate;
+          win_qij=qij;
+        }
+      }
+      qs[bi][qii]=qt[win_qij];
+      cost[bi][qii]=win_cost;
+      ssd[bi][qii]=win_ssd;
+      rate[bi][qii]=win_rate;
+      prev[bi-1][qii]=win_qij;
+    }
+  }
+  /*Pick the cheapest chain ending at the last block.*/
+  final_cost=cost[3];
+  best_qii=0;
+  best_cost=final_cost[0];
+  for(qii=1;qii<nqis;qii++){
+    if(final_cost[qii]<best_cost){
+      best_cost=final_cost[qii];
+      best_qii=qii;
+    }
+  }
+  /*Walk the chain backwards, storing the choice made for each block.*/
+  frags=_enc->state.frags;
+  frags[mb_fragis[3]].qii=best_qii;
+  for(bi=3;bi-->0;){
+    best_qii=prev[bi][best_qii];
+    frags[mb_fragis[bi]].qii=best_qii;
+  }
+  return best_cost;
+}
+
+/*Select a block-level quantizer for a single chroma block in an INTRA frame.
+  Each available quantizer index is costed against the incoming coding state,
+   and the cheapest one (the lowest index on ties) is stored in the fragment's
+   qii field.
+  _qs:    The quantizer index coding state before this block.
+  _pli:   The color plane of the block.
+  _fragi: The index of the fragment.
+  Return: The estimated R-D cost of the chosen quantizer.*/
+static unsigned oc_analyze_intra_chroma_block(oc_enc_ctx *_enc,
+ const oc_qii_state *_qs,int _pli,ptrdiff_t _fragi){
+  const unsigned char *src;
+  oc_qii_state         qt;
+  unsigned             satd;
+  unsigned             best_cost;
+  int                  best_qii;
+  int                  qii;
+  int                  lambda;
+  int                  ystride;
+  int                  nqis;
+  src=_enc->state.ref_frame_data[OC_FRAME_IO]
+   +_enc->state.frag_buf_offs[_fragi];
+  ystride=_enc->state.ref_ystride[_pli];
+  satd=oc_enc_frag_intra_satd(_enc,src,ystride);
+  nqis=_enc->state.nqis;
+  lambda=_enc->lambda;
+  best_qii=0;
+  best_cost=0;
+  for(qii=0;qii<nqis;qii++){
+    unsigned cur_rate;
+    unsigned cur_ssd;
+    unsigned cur_cost;
+    oc_qii_state_advance(&qt,_qs,qii);
+    cur_rate=oc_dct_cost2(&cur_ssd,_enc->state.qis[qii],_pli,0,satd)
+     +((qt.bits-_qs->bits)<<OC_BIT_SCALE);
+    cur_cost=OC_MODE_RD_COST(cur_ssd,cur_rate,lambda);
+    /*The first candidate always wins; later ones must be strictly cheaper.*/
+    if(qii==0||cur_cost<best_cost){
+      best_cost=cur_cost;
+      best_qii=qii;
+    }
+  }
+  _enc->state.frags[_fragi].qii=best_qii;
+  return best_cost;
+}
+
+/*Chooses a quantizer for, then transforms, quantizes and tokenizes, every
+   chroma fragment of a range of superblocks in an INTRA frame.
+  Every fragment visited is appended to the coded fragment list.
+  _pli:       The color plane to process.
+  _sbi_start: The first superblock to process.
+  _sbi_end:   One past the last superblock to process.*/
+static void oc_enc_sb_transform_quantize_intra_chroma(oc_enc_ctx *_enc,
+ oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
+  /*Large enough for the worst case token stack usage of 1 fragment.*/
+  oc_token_checkpoint  tokens[64];
+  ptrdiff_t           *coded_list;
+  ptrdiff_t            coded_count;
+  int                  sbi;
+  coded_list=_pipe->coded_fragis[_pli];
+  coded_count=_pipe->ncoded_fragis[_pli];
+  for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
+    int quadi;
+    for(quadi=0;quadi<4;quadi++){
+      const ptrdiff_t *quad_fragis;
+      int              bi;
+      quad_fragis=_enc->state.sb_maps[sbi][quadi];
+      for(bi=0;bi<4;bi++){
+        oc_token_checkpoint *tokens_top;
+        ptrdiff_t            fragi;
+        fragi=quad_fragis[bi];
+        /*Negative entries mark map slots with no fragment.*/
+        if(fragi<0)continue;
+        oc_analyze_intra_chroma_block(_enc,_pipe->qs+_pli,_pli,fragi);
+        tokens_top=tokens;
+        /*The metric argument may be NULL here: it is only used for
+           non-INTRA frames.*/
+        oc_enc_block_transform_quantize(_enc,
+         _pipe,_pli,fragi,0,NULL,&tokens_top);
+        coded_list[coded_count]=fragi;
+        coded_count++;
+      }
+    }
+  }
+  _pipe->ncoded_fragis[_pli]=coded_count;
+}
+
+/*Analysis stage for an INTRA frame.
+  Works through the frame one MCU stripe at a time: for every valid
+   macroblock it selects the luma quantizers and codes the luma blocks in
+   OC_MODE_INTRA, then codes the chroma planes of the stripe and finishes each
+   plane (DC prediction and tokenization, loop filter, border fill).
+  _recode: When non-zero, the motion search is not run.*/
+void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){
+  oc_enc_pipeline_state   pipe;
+  const unsigned char    *map_idxs;
+  int                     nmap_idxs;
+  oc_sb_flags            *sb_flags;
+  signed char            *mb_modes;
+  const oc_mb_map        *mb_maps;
+  oc_fragment            *frags;
+  unsigned                stripe_sby;
+  unsigned                mcu_nvsbs;
+  int                     notstart;
+  int                     notdone;
+  int                     refi;
+  int                     pli;
+  _enc->state.frame_type=OC_INTRA_FRAME;
+  oc_enc_tokenize_start(_enc);
+  oc_enc_pipeline_init(_enc,&pipe);
+  /*Choose MVs and MB modes and quantize and code luma.
+    Must be done in Hilbert order.*/
+  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  _enc->state.ncoded_fragis[0]=0;
+  _enc->state.ncoded_fragis[1]=0;
+  _enc->state.ncoded_fragis[2]=0;
+  sb_flags=_enc->state.sb_flags;
+  mb_modes=_enc->state.mb_modes;
+  mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
+  frags=_enc->state.frags;
+  /*notstart and notdone double as the start and end delays passed to
+     oc_enc_pipeline_finish_mcu_plane(): 0 for the first and last stripe
+     respectively, 1 otherwise.*/
+  notstart=0;
+  notdone=1;
+  mcu_nvsbs=_enc->mcu_nvsbs;
+  for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
+    unsigned sbi;
+    unsigned sbi_end;
+    notdone=oc_enc_pipeline_set_stripe(_enc,&pipe,stripe_sby);
+    sbi_end=pipe.sbi_end[0];
+    for(sbi=pipe.sbi0[0];sbi<sbi_end;sbi++){
+      int quadi;
+      /*Mode addressing is through Y plane, always 4 MB per SB.*/
+      for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
+        unsigned  mbi;
+        int       mapii;
+        int       mapi;
+        int       bi;
+        ptrdiff_t fragi;
+        mbi=sbi<<2|quadi;
+        /*Motion estimation:
+          A basic 1MV search is run for every macroblock even though this is a
+           keyframe.
+          It is skipped only when re-coding a frame and for the very first
+           frame (curframe_num of 0).*/
+        if(!_recode&&_enc->state.curframe_num>0)oc_mcenc_search(_enc,mbi);
+        oc_analyze_intra_mb_luma(_enc,pipe.qs+0,mbi);
+        mb_modes[mbi]=OC_MODE_INTRA;
+        oc_enc_mb_transform_quantize_luma(_enc,&pipe,mbi,0);
+        /*Propagate the MB mode to the chroma blocks.*/
+        for(mapii=4;mapii<nmap_idxs;mapii++){
+          mapi=map_idxs[mapii];
+          pli=mapi>>2;
+          bi=mapi&3;
+          fragi=mb_maps[mbi][pli][bi];
+          frags[fragi].mb_mode=OC_MODE_INTRA;
+        }
+      }
+    }
+    oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,0,notstart,notdone);
+    /*Code chroma planes.*/
+    for(pli=1;pli<3;pli++){
+      oc_enc_sb_transform_quantize_intra_chroma(_enc,&pipe,
+       pli,pipe.sbi0[pli],pipe.sbi_end[pli]);
+      oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,pli,notstart,notdone);
+    }
+    notstart=1;
+  }
+  /*Finish filling in the reference frame borders.*/
+  refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
+  for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
+  /*Every fragment of an INTRA frame is coded.*/
+  _enc->state.ntotal_coded_fragis=_enc->state.nfrags;
+}
+
+
+
+/*Cost information about a MB mode.*/
+struct oc_mode_choice{
+  /*The combined R-D cost; oc_mode_set_cost() computes it from ssd and
+     rate+overhead.*/
+  unsigned      cost;
+  /*The total estimated distortion of the blocks, as accumulated by
+     oc_analyze_mb_mode_luma() and oc_analyze_mb_mode_chroma().*/
+  unsigned      ssd;
+  /*The total estimated rate of the blocks, accumulated by the same
+     functions.*/
+  unsigned      rate;
+  /*The flag overhead; oc_analyze_mb_mode_luma() stores its accumulated coded
+     block flag cost here, clamped to be non-negative.*/
+  unsigned      overhead;
+  /*The quantizer index chosen for each block of the MB.
+    The mode analysis adds 4 to the entry of a block it expects to skip.*/
+  unsigned char qii[12];
+};
+
+
+
+/*Fills in the combined R-D cost of a mode from its distortion and its total
+   rate (coefficient rate plus overhead).
+  _modec:  The mode choice to update.
+  _lambda: The Lagrangian multiplier applied to the rate.*/
+static void oc_mode_set_cost(oc_mode_choice *_modec,int _lambda){
+  unsigned total_rate;
+  total_rate=_modec->rate+_modec->overhead;
+  _modec->cost=OC_MODE_RD_COST(_modec->ssd,total_rate,_lambda);
+}
+
+/*A set of skip SSDs, one per block of a MB, that disables early skipping:
+   the mode analysis only considers skipping a block whose skip SSD is less
+   than UINT_MAX.*/
+static const unsigned OC_NOSKIP[12]={
+  UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,
+  UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX
+};
+
+/*The estimated number of bits used by a coded chroma block to specify the AC
+   quantizer, scaled by 1<<OC_BIT_SCALE.
+  0xCAE00D1D is log2(3) with 31 fractional bits; it is reduced to
+   OC_BIT_SCALE+1 fractional bits and then halved with rounding.
+  TODO: Currently this is just 0.5*log2(3) (estimating about 50% compression);
+   measurements suggest this is in the right ballpark, but it varies somewhat
+   with lambda.*/
+#define OC_CHROMA_QII_RATE \
+ (((0xCAE00D1DU>>(31-OC_BIT_SCALE))+1)>>1)
+
+/*Estimates the distortion, rate and flag overhead of the four luma blocks of
+   a MB for one candidate mode, greedily choosing for each block in turn
+   between coding it with one of the available quantizers and skipping it.
+  At most three of the four blocks may be chosen for skipping.
+  _modec:     Receives ssd, rate, overhead and the first four qii entries.
+  _fr:        The coded block flag state before this MB.
+  _qs:        The quantizer index coding state before this MB.
+  _frag_satd: The SATD of each block for this mode.
+  _skip_ssd:  The SSD of skipping each block; UINT_MAX forbids skipping.
+  _qti:       The quantization type of the mode.*/
+static void oc_analyze_mb_mode_luma(oc_enc_ctx *_enc,
+ oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
+ const unsigned _frag_satd[12],const unsigned _skip_ssd[12],int _qti){
+  oc_fr_state  fr;
+  oc_qii_state qs;
+  unsigned     ssd;
+  unsigned     rate;
+  int          overhead;
+  int          lambda;
+  int          nqis;
+  int          nskipped;
+  int          bi;
+  lambda=_enc->lambda;
+  nqis=_enc->state.nqis;
+  /*A trellis search would be possible here, but the final skip decisions are
+     not made until after transform+quantization, so its result would not be
+     optimal anyway.
+    A greedy pass is used instead; for most SATD values the differences
+     between the qiis are large enough to drown out the cost of coding the
+     flags.*/
+  fr=*_fr;
+  qs=*_qs;
+  ssd=0;
+  rate=0;
+  overhead=0;
+  nskipped=0;
+  for(bi=0;bi<4;bi++){
+    oc_fr_state  fr_coded;
+    oc_fr_state  fr_skipped;
+    oc_qii_state qt[3];
+    unsigned     satd;
+    unsigned     best_cost;
+    unsigned     best_ssd;
+    unsigned     best_rate;
+    int          best_overhead;
+    int          best_qii;
+    int          skip;
+    int          qii;
+    satd=_frag_satd[bi];
+    /*Cost of flagging the block as coded.*/
+    fr_coded=fr;
+    oc_fr_code_block(&fr_coded);
+    best_overhead=(fr_coded.bits-fr.bits)<<OC_BIT_SCALE;
+    best_cost=best_ssd=best_rate=0;
+    best_qii=0;
+    /*Quantizer 0 is always evaluated and wins by default; the others replace
+       it only when strictly cheaper.*/
+    qii=0;
+    do{
+      unsigned cur_ssd;
+      unsigned cur_rate;
+      unsigned cur_cost;
+      oc_qii_state_advance(qt+qii,&qs,qii);
+      cur_rate=oc_dct_cost2(&cur_ssd,_enc->state.qis[qii],0,_qti,satd)
+       +((qt[qii].bits-qs.bits)<<OC_BIT_SCALE);
+      cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate+best_overhead,lambda);
+      if(qii==0||cur_cost<best_cost){
+        best_cost=cur_cost;
+        best_ssd=cur_ssd;
+        best_rate=cur_rate;
+        best_qii=qii;
+      }
+    }
+    while(++qii<nqis);
+    skip=0;
+    if(nskipped<3&&_skip_ssd[bi]<UINT_MAX){
+      unsigned skip_ssd;
+      unsigned skip_cost;
+      int      skip_overhead;
+      fr_skipped=fr;
+      oc_fr_skip_block(&fr_skipped);
+      skip_overhead=(fr_skipped.bits-fr.bits)<<OC_BIT_SCALE;
+      skip_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
+      skip_cost=OC_MODE_RD_COST(ssd+skip_ssd,rate+skip_overhead,lambda);
+      /*Ties go to skipping.*/
+      if(skip_cost<=best_cost){
+        best_ssd=skip_ssd;
+        best_rate=0;
+        best_overhead=skip_overhead;
+        /*Adding 4 marks the block as one expected to be skipped.*/
+        best_qii+=4;
+        skip=1;
+      }
+    }
+    ssd+=best_ssd;
+    rate+=best_rate;
+    overhead+=best_overhead;
+    if(skip){
+      fr=fr_skipped;
+      nskipped++;
+    }
+    else{
+      fr=fr_coded;
+      qs=qt[best_qii];
+    }
+    _modec->qii[bi]=best_qii;
+  }
+  _modec->ssd=ssd;
+  _modec->rate=rate;
+  _modec->overhead=OC_MAXI(overhead,0);
+}
+
+/*Adds the estimated distortion and rate of the chroma blocks of a MB to a
+   mode choice that already holds the luma totals, choosing for each block
+   between coding it with one of the available quantizers and skipping it.
+  _modec:     Updated in place: ssd, rate, and the qii entries from index 4
+               on.
+  _fr:        Unused.
+  _qs:        Unused.
+  _frag_satd: The SATD of each block for this mode.
+  _skip_ssd:  The SSD of skipping each block; UINT_MAX forbids skipping.
+  _qti:       The quantization type of the mode.*/
+static void oc_analyze_mb_mode_chroma(oc_enc_ctx *_enc,
+ oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
+ const unsigned _frag_satd[12],const unsigned _skip_ssd[12],int _qti){
+  unsigned ssd;
+  unsigned rate;
+  unsigned satd;
+  unsigned best_ssd;
+  unsigned best_rate;
+  int      best_qii;
+  unsigned cur_cost;
+  unsigned cur_ssd;
+  unsigned cur_rate;
+  int      lambda;
+  int      nblocks;
+  int      nqis;
+  int      pli;
+  int      bi;
+  int      qii;
+  lambda=_enc->lambda;
+  nqis=_enc->state.nqis;
+  ssd=_modec->ssd;
+  rate=_modec->rate;
+  /*Because (except in 4:4:4 mode) we aren't considering chroma blocks in coded
+     order, we assume a constant overhead for coded block and qii flags.*/
+  nblocks=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  /*The chroma map entries are split evenly between the two planes: this is
+     the end of the first plane's entries.*/
+  nblocks=(nblocks-4>>1)+4;
+  bi=4;
+  for(pli=1;pli<3;pli++){
+    for(;bi<nblocks;bi++){
+      unsigned best_cost;
+      satd=_frag_satd[bi];
+      best_rate=oc_dct_cost2(&best_ssd,_enc->state.qis[0],pli,_qti,satd)
+       +OC_CHROMA_QII_RATE;
+      best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda);
+      best_qii=0;
+      for(qii=1;qii<nqis;qii++){
+        /*Use this plane's model for every quantizer, not the luma one, so
+           that all candidates for the block are compared on the same
+           basis.*/
+        cur_rate=oc_dct_cost2(&cur_ssd,_enc->state.qis[qii],pli,_qti,satd)
+         +OC_CHROMA_QII_RATE;
+        cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda);
+        if(cur_cost<best_cost){
+          best_cost=cur_cost;
+          best_ssd=cur_ssd;
+          best_rate=cur_rate;
+          best_qii=qii;
+        }
+      }
+      if(_skip_ssd[bi]<UINT_MAX){
+        cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
+        cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate,lambda);
+        /*Ties go to skipping.*/
+        if(cur_cost<=best_cost){
+          best_ssd=cur_ssd;
+          best_rate=0;
+          /*Adding 4 marks the block as one expected to be skipped.*/
+          best_qii+=4;
+        }
+      }
+      rate+=best_rate;
+      ssd+=best_ssd;
+      _modec->qii[bi]=best_qii;
+    }
+    /*Extend the limit to cover the second plane's entries.*/
+    nblocks=(nblocks-4<<1)+4;
+  }
+  _modec->ssd=ssd;
+  _modec->rate=rate;
+}
+
+/*Computes the cost of skipping a single fragment: the AC-only SSD between the
+   source fragment and the corresponding fragment of the previous frame,
+   scaled to match the DCT domain.
+  _src:        The source fragment.
+  _ref:        The fragment at the same position in the previous frame.
+  _ystride:    The stride of both buffers.
+  _borderi:    The fragment's border index; when non-negative, only the
+                pixels set in that border's mask are counted.
+  _dc_dequant: The DC quantizer used for the force-code test.
+  Return: The scaled AC SSD, or UINT_MAX if the magnitude of the summed
+   difference exceeds twice _dc_dequant, in which case the block must be
+   coded.*/
+static unsigned oc_skip_cost_frag_ssd(oc_enc_ctx *_enc,
+ const unsigned char *_src,const unsigned char *_ref,int _ystride,
+ int _borderi,unsigned _dc_dequant){
+  OC_ALIGN16(ogg_int16_t  diff[64]);
+  unsigned                ssd;
+  int                     dc;
+  int                     force_code;
+  int                     pi;
+  oc_enc_frag_sub(_enc,diff,_src,_ref,_ystride);
+  ssd=0;
+  dc=0;
+  if(_borderi>=0){
+    ogg_int64_t mask;
+    mask=_enc->state.borders[_borderi].mask;
+    for(pi=0;pi<64;pi++,mask>>=1){
+      if(!(mask&1))continue;
+      ssd+=diff[pi]*diff[pi];
+      dc+=diff[pi];
+    }
+  }
+  else{
+    for(pi=0;pi<64;pi++){
+      ssd+=diff[pi]*diff[pi];
+      dc+=diff[pi];
+    }
+  }
+  /*Scale to match DCT domain.*/
+  ssd<<=4;
+  /*Only the AC contribution to the SSD is wanted.*/
+  ssd-=dc*dc>>2;
+  /*DC is a special case; if there's more than a full-quantizer improvement in
+     the effective DC component, always force-code the block.
+    OR-ing in -1 turns the result into UINT_MAX.*/
+  force_code=abs(dc)>_dc_dequant<<1;
+  ssd|=-force_code;
+  return ssd;
+}
+
+/*Computes the skip SSD of every block in a MB.
+  Each result is stored both in the caller's array and in the pipeline's
+   per-plane skip SSD storage for the current MCU.
+  _mbi: The index of the macroblock.
+  _ssd: Returns the skip SSDs: the four luma blocks first, then the chroma
+         blocks in map order.*/
+static void oc_skip_cost(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe,
+ unsigned _mbi,unsigned _ssd[12]){
+  const unsigned char    *src;
+  const unsigned char    *ref;
+  const oc_fragment      *frags;
+  const ptrdiff_t        *frag_buf_offs;
+  const ptrdiff_t        *sb_map;
+  const oc_mb_map_plane  *mb_map;
+  const unsigned char    *map_idxs;
+  int                     map_nidxs;
+  unsigned                dc_dequant;
+  unsigned                skip_ssd;
+  int                     ystride;
+  int                     mapii;
+  int                     pli;
+  int                     bi;
+  ptrdiff_t               fragi;
+  ptrdiff_t               frag_offs;
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]];
+  frags=_enc->state.frags;
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  /*Luma: the four blocks of the MB, in superblock map order.*/
+  ystride=_enc->state.ref_ystride[0];
+  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
+  dc_dequant=_enc->state.dequant_tables[_enc->state.qis[0]][0][1][0];
+  for(bi=0;bi<4;bi++){
+    fragi=sb_map[bi];
+    frag_offs=frag_buf_offs[fragi];
+    skip_ssd=oc_skip_cost_frag_ssd(_enc,src+frag_offs,ref+frag_offs,ystride,
+     frags[fragi].borderi,dc_dequant);
+    _ssd[bi]=skip_ssd;
+    _pipe->skip_ssd[0][fragi-_pipe->froffset[0]]=skip_ssd;
+  }
+  /*Chroma: the map entries after the first four are split evenly between the
+     two chroma planes.*/
+  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
+  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  map_nidxs=(map_nidxs-4>>1)+4;
+  mapii=4;
+  for(pli=1;pli<3;pli++){
+    ystride=_enc->state.ref_ystride[pli];
+    dc_dequant=_enc->state.dequant_tables[_enc->state.qis[0]][pli][1][0];
+    for(;mapii<map_nidxs;mapii++){
+      bi=map_idxs[mapii]&3;
+      fragi=mb_map[pli][bi];
+      frag_offs=frag_buf_offs[fragi];
+      skip_ssd=oc_skip_cost_frag_ssd(_enc,src+frag_offs,ref+frag_offs,ystride,
+       frags[fragi].borderi,dc_dequant);
+      _ssd[mapii]=skip_ssd;
+      _pipe->skip_ssd[pli][fragi-_pipe->froffset[pli]]=skip_ssd;
+    }
+    /*Extend the limit to cover the second plane's entries.*/
+    map_nidxs=(map_nidxs-4<<1)+4;
+  }
+}
+
+/*Computes the intra SATD of every fragment in a macro block from the
+   OC_FRAME_IO frame.
+  _enc:       The encoder context.
+  _mbi:       The macro block index.
+  _frag_satd: Returns the SATD of each fragment: entries 0...3 are the luma
+               blocks in sb_maps order, and entries 4 and up are the chroma
+               blocks in OC_MB_MAP_IDXS order.*/
+static void oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi,
+ unsigned _frag_satd[12]){
+  const unsigned char   *src;
+  const ptrdiff_t       *frag_buf_offs;
+  const ptrdiff_t       *luma_fragis;
+  const oc_mb_map_plane *plane_fragis;
+  const unsigned char   *map_idxs;
+  int                    map_nidxs;
+  int                    ystride;
+  int                    i;
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  /*The four luma blocks.*/
+  luma_fragis=_enc->state.sb_maps[_mbi>>2][_mbi&3];
+  ystride=_enc->state.ref_ystride[0];
+  for(i=0;i<4;i++){
+    ptrdiff_t frag_offs;
+    frag_offs=frag_buf_offs[luma_fragis[i]];
+    _frag_satd[i]=oc_enc_frag_intra_satd(_enc,src+frag_offs,ystride);
+  }
+  /*The chroma blocks.
+    Note: This assumes ref_ystride[1]==ref_ystride[2].*/
+  plane_fragis=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
+  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  ystride=_enc->state.ref_ystride[1];
+  for(i=4;i<map_nidxs;i++){
+    ptrdiff_t frag_offs;
+    int       mapi;
+    /*Each map entry packs the plane in its upper bits and the block index
+       within the plane in its low two bits.*/
+    mapi=map_idxs[i];
+    frag_offs=frag_buf_offs[plane_fragis[mapi>>2][mapi&3]];
+    _frag_satd[i]=oc_enc_frag_intra_satd(_enc,src+frag_offs,ystride);
+  }
+}
+
+/*Estimates the cost of coding a macro block in OC_MODE_INTRA.
+  _enc:       The encoder context.
+  _modec:     The mode choice to fill in.
+  _mbi:       The macro block index (not used by this function).
+  _fr:        The fragment run state handed to the luma/chroma analysis.
+  _qs:        The quantizer index state handed to the luma/chroma analysis.
+  _frag_satd: The intra SATD of each fragment in the macro block.
+  _skip_ssd:  The skip SSD of each fragment in the macro block.*/
+static void oc_cost_intra(oc_enc_ctx *_enc,oc_mode_choice *_modec,
+ unsigned _mbi,const oc_fr_state *_fr,const oc_qii_state *_qs,
+ const unsigned _frag_satd[12],const unsigned _skip_ssd[12]){
+  (void)_mbi;
+  /*The final argument is 0 here; the inter estimators in this file pass 1.*/
+  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,_frag_satd,_skip_ssd,0);
+  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,_frag_satd,_skip_ssd,0);
+  /*Add the cost of signaling the mode itself, then derive the final cost.*/
+  _modec->overhead=_modec->overhead
+   +(oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTRA)<<OC_BIT_SCALE);
+  oc_mode_set_cost(_modec,_enc->lambda);
+}
+
+/*Estimates the cost of coding a macro block in an inter mode that uses a
+   single motion vector for the whole macro block.
+  _enc:      The encoder context.
+  _modec:    The mode choice to fill in.
+  _mbi:      The macro block index.
+  _mb_mode:  The mode to estimate; it selects the reference frame through
+              OC_FRAME_FOR_MODE() and the mode signaling cost.
+  _mv:       The motion vector, as an (x,y) pair.
+  _fr:       The fragment run state handed to the luma/chroma analysis.
+  _qs:       The quantizer index state handed to the luma/chroma analysis.
+  _skip_ssd: The skip SSD of each fragment in the macro block.*/
+static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec,
+ unsigned _mbi,int _mb_mode,const signed char *_mv,
+ const oc_fr_state *_fr,const oc_qii_state *_qs,const unsigned _skip_ssd[12]){
+  unsigned               frag_satd[12];
+  const unsigned char   *src;
+  const unsigned char   *ref;
+  const ptrdiff_t       *frag_buf_offs;
+  const ptrdiff_t       *sb_map;
+  const oc_mb_map_plane *mb_map;
+  const unsigned char   *map_idxs;
+  int                    map_nidxs;
+  int                    mv_offs[2];
+  int                    two_refs;
+  int                    ystride;
+  int                    mapii;
+  int                    mapi;
+  int                    dx;
+  int                    dy;
+  ptrdiff_t              frag_offs;
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[
+   _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE(_mb_mode)]];
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  dx=_mv[0];
+  dy=_mv[1];
+  _modec->rate=_modec->ssd=0;
+  /*The four luma blocks.
+    When the MV yields more than one reference offset, the SATD is taken
+     against the pair of offsets; otherwise against the single one.*/
+  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
+  ystride=_enc->state.ref_ystride[0];
+  two_refs=oc_state_get_mv_offsets(&_enc->state,mv_offs,0,dx,dy)>1;
+  for(mapii=0;mapii<4;mapii++){
+    frag_offs=frag_buf_offs[sb_map[mapii]];
+    if(two_refs){
+      frag_satd[mapii]=oc_enc_frag_satd2_thresh(_enc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
+    }
+    else{
+      frag_satd[mapii]=oc_enc_frag_satd_thresh(_enc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ystride,UINT_MAX);
+    }
+  }
+  /*The chroma blocks.
+    Note: This assumes ref_ystride[1]==ref_ystride[2].*/
+  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
+  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  ystride=_enc->state.ref_ystride[1];
+  two_refs=oc_state_get_mv_offsets(&_enc->state,mv_offs,1,dx,dy)>1;
+  for(mapii=4;mapii<map_nidxs;mapii++){
+    mapi=map_idxs[mapii];
+    frag_offs=frag_buf_offs[mb_map[mapi>>2][mapi&3]];
+    if(two_refs){
+      frag_satd[mapii]=oc_enc_frag_satd2_thresh(_enc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
+    }
+    else{
+      frag_satd[mapii]=oc_enc_frag_satd_thresh(_enc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ystride,UINT_MAX);
+    }
+  }
+  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,1);
+  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,1);
+  /*Add the cost of signaling the mode itself, then derive the final cost.*/
+  _modec->overhead=_modec->overhead
+   +(oc_mode_scheme_chooser_cost(&_enc->chooser,_mb_mode)<<OC_BIT_SCALE);
+  oc_mode_set_cost(_modec,_enc->lambda);
+}
+
+/*Estimates the cost of coding a macro block in an inter mode with a (0,0)
+   motion vector.
+  This simply forwards to oc_cost_inter() with a zero MV; see that function
+   for the meaning of the remaining parameters.*/
+static void oc_cost_inter_nomv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
+ unsigned _mbi,int _mb_mode,const oc_fr_state *_fr,const oc_qii_state *_qs,
+ const unsigned _skip_ssd[12]){
+  static const oc_mv zero_mv={0,0};
+  oc_cost_inter(_enc,_modec,_mbi,_mb_mode,zero_mv,_fr,_qs,_skip_ssd);
+}
+
+/*Estimates the cost of coding a macro block in an inter mode with one
+   explicitly coded motion vector.
+  This is oc_cost_inter() plus the estimated cost of coding the MV itself.
+  The MV cost is the growth in the smaller of the two running MV bit totals
+   in _enc->mv_bits: total 0 would grow by the OC_MV_BITS[0] lengths of the
+   two components, total 1 by a flat 12 bits.
+  Return: The number of bits the MV adds to _enc->mv_bits[0].*/
+static int oc_cost_inter1mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
+ unsigned _mbi,int _mb_mode,const signed char *_mv,
+ const oc_fr_state *_fr,const oc_qii_state *_qs,const unsigned _skip_ssd[12]){
+  int mv_bits0;
+  oc_cost_inter(_enc,_modec,_mbi,_mb_mode,_mv,_fr,_qs,_skip_ssd);
+  /*The table is indexed by the MV component biased by 31.*/
+  mv_bits0=OC_MV_BITS[0][_mv[0]+31]+OC_MV_BITS[0][_mv[1]+31];
+  _modec->overhead+=
+   (OC_MINI(_enc->mv_bits[0]+mv_bits0,_enc->mv_bits[1]+12)
+   -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1]))<<OC_BIT_SCALE;
+  /*The overhead changed, so the cost has to be derived again.*/
+  oc_mode_set_cost(_modec,_enc->lambda);
+  return mv_bits0;
+}
+
+/*A mapping from oc_mb_map (raster) ordering to oc_sb_map (Hilbert) ordering.
+  It is used in this file as OC_MB_PHASE[_mbi&3][bi]: the first index is the
+   low two bits of the macro block index, and the second is a luma block
+   index in mb_maps[_mbi][0][] order.
+  The result is the index of the same block in the per-macro-block arrays
+   (frag_satd[], qii[]) that are kept in sb_maps[_mbi>>2][_mbi&3][] order.*/
+static const unsigned char OC_MB_PHASE[4][4]={
+  {0,1,3,2},{0,3,1,2},{0,3,1,2},{2,3,1,0}
+};
+
+/*Estimates the cost of coding a macro block in OC_MODE_INTER_MV_FOUR.
+  As a side effect, the four luma MVs are stored in _enc->state.frag_mvs for
+   the macro block's luma fragments.
+  _enc:      The encoder context.
+  _modec:    The mode choice to fill in.
+  _mbi:      The macro block index.
+  _mv:       The four luma block MVs, in mb_maps[_mbi][0][] order.
+  _fr:       The fragment run state handed to the luma/chroma analysis.
+  _qs:       The quantizer index state handed to the luma/chroma analysis.
+  _skip_ssd: The skip SSD of each fragment in the macro block.*/
+static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
+ unsigned _mbi,oc_mv _mv[4],const oc_fr_state *_fr,const oc_qii_state *_qs,
+ const unsigned _skip_ssd[12]){
+  unsigned               frag_satd[12];
+  oc_mv                  lbmvs[4];
+  oc_mv                  cbmvs[4];
+  const unsigned char   *phase;
+  const unsigned char   *src;
+  const unsigned char   *ref;
+  const ptrdiff_t       *frag_buf_offs;
+  oc_mv                 *frag_mvs;
+  const oc_mb_map_plane *mb_map;
+  const unsigned char   *map_idxs;
+  int                    map_nidxs;
+  int                    ystride;
+  int                    nqis;
+  int                    mv_offs[2];
+  int                    bits0;
+  int                    bits1;
+  int                    mapii;
+  int                    mapi;
+  int                    pli;
+  int                    bi;
+  int                    dx;
+  int                    dy;
+  ptrdiff_t              fragi;
+  ptrdiff_t              frag_offs;
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]];
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  frag_mvs=_enc->state.frag_mvs;
+  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
+  phase=OC_MB_PHASE[_mbi&3];
+  _modec->rate=_modec->ssd=0;
+  /*The four luma blocks, each with its own MV.*/
+  ystride=_enc->state.ref_ystride[0];
+  for(bi=0;bi<4;bi++){
+    fragi=mb_map[0][bi];
+    frag_offs=frag_buf_offs[fragi];
+    dx=_mv[bi][0];
+    dy=_mv[bi][1];
+    /*Save the block MVs as the current ones while we're here; we'll replace
+       them if we don't ultimately choose 4MV mode.*/
+    frag_mvs[fragi][0]=(signed char)dx;
+    frag_mvs[fragi][1]=(signed char)dy;
+    /*The SATD array is kept in sb_maps order, hence the phase lookup.*/
+    if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,dx,dy)>1){
+      frag_satd[phase[bi]]=oc_enc_frag_satd2_thresh(_enc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
+    }
+    else{
+      frag_satd[phase[bi]]=oc_enc_frag_satd_thresh(_enc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ystride,UINT_MAX);
+    }
+  }
+  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,
+   _enc->vp3_compatible?OC_NOSKIP:_skip_ssd,1);
+  /*Figure out which blocks are being skipped and give them (0,0) MVs.
+    A block counts as skipped when its chosen qii is not below nqis.
+    Only the MVs of the remaining blocks are charged for.*/
+  nqis=_enc->state.nqis;
+  bits0=bits1=0;
+  for(bi=0;bi<4;bi++){
+    if(_modec->qii[phase[bi]]<nqis){
+      memcpy(lbmvs+bi,_mv+bi,sizeof(*lbmvs));
+      bits0+=OC_MV_BITS[0][_mv[bi][0]+31]+OC_MV_BITS[0][_mv[bi][1]+31];
+      bits1+=12;
+    }
+    else memset(lbmvs+bi,0,sizeof(*lbmvs));
+  }
+  /*Derive the chroma MVs from the (possibly zeroed) luma MVs.*/
+  (*OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt])(cbmvs,
+   (const oc_mv *)lbmvs);
+  /*The chroma blocks.
+    Note: This assumes ref_ystride[1]==ref_ystride[2].*/
+  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  ystride=_enc->state.ref_ystride[1];
+  for(mapii=4;mapii<map_nidxs;mapii++){
+    mapi=map_idxs[mapii];
+    pli=mapi>>2;
+    bi=mapi&3;
+    frag_offs=frag_buf_offs[mb_map[pli][bi]];
+    dx=cbmvs[bi][0];
+    dy=cbmvs[bi][1];
+    /*TODO: We could save half these calls by re-using the results for the Cb
+       and Cr planes; is it worth it?*/
+    if(oc_state_get_mv_offsets(&_enc->state,mv_offs,pli,dx,dy)>1){
+      frag_satd[mapii]=oc_enc_frag_satd2_thresh(_enc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
+    }
+    else{
+      frag_satd[mapii]=oc_enc_frag_satd_thresh(_enc,src+frag_offs,
+       ref+frag_offs+mv_offs[0],ystride,UINT_MAX);
+    }
+  }
+  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,1);
+  /*The overhead is the mode signaling cost plus the growth in the smaller of
+     the two running MV bit totals.*/
+  _modec->overhead+=
+   (oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTER_MV_FOUR)
+   +OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+bits1)
+   -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1]))<<OC_BIT_SCALE;
+  oc_mode_set_cost(_modec,_enc->lambda);
+}
+
+/*Performs mode decision and MV selection for an inter (delta) frame, and
+   quantizes and tokenizes it one stripe of super blocks at a time.
+  _enc:            The encoder context.
+  _allow_keyframe: If non-zero, the cost of coding the frame as a key frame
+                    is estimated alongside, and the function gives up as soon
+                    as the inter estimate is known to exceed it.
+  _recode:         If non-zero, the motion search and the reset of the
+                    per-macro-block unrefined MVs and refinement flags are
+                    skipped, so that MVs which were already refined are not
+                    refined again.
+  Return: 1 if _allow_keyframe was set and the estimated inter cost exceeded
+           the estimated intra cost, in which case the coded macro block count
+           and the compacted coded fragment list are NOT stored.
+          0 otherwise.*/
+int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
+  oc_set_chroma_mvs_func  set_chroma_mvs;
+  oc_enc_pipeline_state   pipe;
+  oc_qii_state            intra_luma_qs;
+  oc_mv                   last_mv;
+  oc_mv                   prior_mv;
+  ogg_int64_t             interbits;
+  ogg_int64_t             intrabits;
+  const unsigned char    *map_idxs;
+  int                     nmap_idxs;
+  unsigned               *coded_mbis;
+  unsigned               *uncoded_mbis;
+  size_t                  ncoded_mbis;
+  size_t                  nuncoded_mbis;
+  oc_sb_flags            *sb_flags;
+  signed char            *mb_modes;
+  const oc_sb_map        *sb_maps;
+  const oc_mb_map        *mb_maps;
+  oc_mb_enc_info         *embs;
+  oc_fragment            *frags;
+  oc_mv                  *frag_mvs;
+  int                     qi;
+  unsigned                stripe_sby;
+  unsigned                mcu_nvsbs;
+  int                     notstart;
+  int                     notdone;
+  int                     vdec;
+  unsigned                sbi;
+  unsigned                sbi_end;
+  int                     refi;
+  int                     pli;
+  set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt];
+  _enc->state.frame_type=OC_INTER_FRAME;
+  oc_mode_scheme_chooser_reset(&_enc->chooser);
+  oc_enc_tokenize_start(_enc);
+  oc_enc_pipeline_init(_enc,&pipe);
+  /*The key frame estimate tracks its own quantizer index state, separate
+     from the one in the pipeline.*/
+  if(_allow_keyframe)oc_qii_state_init(&intra_luma_qs);
+  _enc->mv_bits[0]=_enc->mv_bits[1]=0;
+  interbits=intrabits=0;
+  last_mv[0]=last_mv[1]=prior_mv[0]=prior_mv[1]=0;
+  /*Choose MVs and MB modes and quantize and code luma.
+    Must be done in Hilbert order.*/
+  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
+  nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
+  qi=_enc->state.qis[0];
+  /*Coded MB indices are appended from the front of the list, while uncoded
+     ones are stored growing downwards from its end.*/
+  coded_mbis=_enc->coded_mbis;
+  uncoded_mbis=coded_mbis+_enc->state.nmbs;
+  ncoded_mbis=0;
+  nuncoded_mbis=0;
+  _enc->state.ncoded_fragis[0]=0;
+  _enc->state.ncoded_fragis[1]=0;
+  _enc->state.ncoded_fragis[2]=0;
+  sb_flags=_enc->state.sb_flags;
+  mb_modes=_enc->state.mb_modes;
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
+  mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
+  embs=_enc->mb_info;
+  frags=_enc->state.frags;
+  frag_mvs=_enc->state.frag_mvs;
+  vdec=!(_enc->state.info.pixel_fmt&2);
+  notstart=0;
+  notdone=1;
+  mcu_nvsbs=_enc->mcu_nvsbs;
+  for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
+    notdone=oc_enc_pipeline_set_stripe(_enc,&pipe,stripe_sby);
+    sbi_end=pipe.sbi_end[0];
+    for(sbi=pipe.sbi0[0];sbi<sbi_end;sbi++){
+      int quadi;
+      /*Mode addressing is through Y plane, always 4 MB per SB.*/
+      for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
+        oc_mode_choice modes[8];
+        unsigned       skip_ssd[12];
+        unsigned       intra_satd[12];
+        int            mb_mv_bits_0;
+        int            mb_gmv_bits_0;
+        int            inter_mv_pref;
+        int            mb_mode;
+        int            dx;
+        int            dy;
+        unsigned       mbi;
+        int            mapii;
+        int            mapi;
+        int            bi;
+        ptrdiff_t      fragi;
+        mbi=sbi<<2|quadi;
+        /*Motion estimation:
+          We always do a basic 1MV search for all macroblocks, coded or not,
+           keyframe or not.*/
+        if(!_recode&&_enc->sp_level<OC_SP_LEVEL_NOMC)oc_mcenc_search(_enc,mbi);
+        dx=dy=0;
+        /*Find the block choice with the lowest estimated coding cost.
+          If a Cb or Cr block is coded but no Y' block from a macro block then
+           the mode MUST be OC_MODE_INTER_NOMV.
+          This is the default state to which the mode data structure is
+           initialised in encoder and decoder at the start of each frame.*/
+        /*Block coding cost is estimated from correlated SATD metrics.*/
+        /*At this point, all blocks that are in frame are still marked coded.*/
+        if(!_recode){
+          memcpy(embs[mbi].unref_mv,
+           embs[mbi].analysis_mv[0],sizeof(embs[mbi].unref_mv));
+          embs[mbi].refined=0;
+        }
+        oc_mb_intra_satd(_enc,mbi,intra_satd);
+        /*Estimate the cost of coding this MB in a keyframe.*/
+        if(_allow_keyframe){
+          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
+           pipe.fr+0,&intra_luma_qs,intra_satd,OC_NOSKIP);
+          intrabits+=modes[OC_MODE_INTRA].rate;
+          for(bi=0;bi<4;bi++){
+            oc_qii_state_advance(&intra_luma_qs,&intra_luma_qs,
+             modes[OC_MODE_INTRA].qii[bi]);
+          }
+        }
+        /*Estimate the cost in a delta frame for various modes.*/
+        oc_skip_cost(_enc,&pipe,mbi,skip_ssd);
+        oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
+         OC_MODE_INTER_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd);
+        /*The delta frame intra estimate is needed at every speed level: the
+           path without motion compensation below compares against it, too.
+          Computing it only for the motion compensated path would leave
+           modes[OC_MODE_INTRA] either uninitialized (when !_allow_keyframe)
+           or holding the key frame estimate made above with a different
+           quantizer state and no skipping.*/
+        oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
+         pipe.fr+0,pipe.qs+0,intra_satd,skip_ssd);
+        if(_enc->sp_level<OC_SP_LEVEL_NOMC){
+          mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
+           OC_MODE_INTER_MV,embs[mbi].unref_mv[OC_FRAME_PREV],
+           pipe.fr+0,pipe.qs+0,skip_ssd);
+          oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST,mbi,
+           OC_MODE_INTER_MV_LAST,last_mv,pipe.fr+0,pipe.qs+0,skip_ssd);
+          oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST2,mbi,
+           OC_MODE_INTER_MV_LAST2,prior_mv,pipe.fr+0,pipe.qs+0,skip_ssd);
+          oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
+           embs[mbi].block_mv,pipe.fr+0,pipe.qs+0,skip_ssd);
+          oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
+           OC_MODE_GOLDEN_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd);
+          mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
+           OC_MODE_GOLDEN_MV,embs[mbi].unref_mv[OC_FRAME_GOLD],
+           pipe.fr+0,pipe.qs+0,skip_ssd);
+          /*The explicit MV modes (2,6,7) have not yet gone through halfpel
+             refinement.
+            We choose the explicit MV mode that's already furthest ahead on
+             R-D cost and refine only that one.
+            We have to be careful to remember which ones we've refined so that
+             we don't refine it again if we re-encode this frame.*/
+          inter_mv_pref=_enc->lambda*3;
+          if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_INTER_MV].cost&&
+           modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_GOLDEN_MV].cost){
+            if(!(embs[mbi].refined&0x80)){
+              oc_mcenc_refine4mv(_enc,mbi);
+              embs[mbi].refined|=0x80;
+            }
+            oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
+             embs[mbi].ref_mv,pipe.fr+0,pipe.qs+0,skip_ssd);
+          }
+          else if(modes[OC_MODE_GOLDEN_MV].cost+inter_mv_pref<
+           modes[OC_MODE_INTER_MV].cost){
+            if(!(embs[mbi].refined&0x40)){
+              oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_GOLD);
+              embs[mbi].refined|=0x40;
+            }
+            mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
+             OC_MODE_GOLDEN_MV,embs[mbi].analysis_mv[0][OC_FRAME_GOLD],
+             pipe.fr+0,pipe.qs+0,skip_ssd);
+          }
+          /*The OC_FRAME_PREV 1MV candidate is always refined (once) and
+             re-estimated, whichever of the branches above was taken.*/
+          if(!(embs[mbi].refined&0x04)){
+            oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_PREV);
+            embs[mbi].refined|=0x04;
+          }
+          mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
+           OC_MODE_INTER_MV,embs[mbi].analysis_mv[0][OC_FRAME_PREV],
+           pipe.fr+0,pipe.qs+0,skip_ssd);
+          /*Finally, pick the mode with the cheapest estimated R-D cost.*/
+          mb_mode=OC_MODE_INTER_NOMV;
+          if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
+            mb_mode=OC_MODE_INTRA;
+          }
+          if(modes[OC_MODE_INTER_MV_LAST].cost<modes[mb_mode].cost){
+            mb_mode=OC_MODE_INTER_MV_LAST;
+          }
+          if(modes[OC_MODE_INTER_MV_LAST2].cost<modes[mb_mode].cost){
+            mb_mode=OC_MODE_INTER_MV_LAST2;
+          }
+          if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){
+            mb_mode=OC_MODE_GOLDEN_NOMV;
+          }
+          if(modes[OC_MODE_GOLDEN_MV].cost<modes[mb_mode].cost){
+            mb_mode=OC_MODE_GOLDEN_MV;
+          }
+          if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[mb_mode].cost){
+            mb_mode=OC_MODE_INTER_MV_FOUR;
+          }
+          /*We prefer OC_MODE_INTER_MV, but not over LAST and LAST2.*/
+          if(mb_mode==OC_MODE_INTER_MV_LAST||mb_mode==OC_MODE_INTER_MV_LAST2){
+            inter_mv_pref=0;
+          }
+          if(modes[OC_MODE_INTER_MV].cost<modes[mb_mode].cost+inter_mv_pref){
+            mb_mode=OC_MODE_INTER_MV;
+          }
+        }
+        else{
+          /*Without motion compensation, only the three modes that need no
+             MV are candidates.*/
+          oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
+           OC_MODE_GOLDEN_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd);
+          mb_mode=OC_MODE_INTER_NOMV;
+          if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
+            mb_mode=OC_MODE_INTRA;
+          }
+          if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){
+            mb_mode=OC_MODE_GOLDEN_NOMV;
+          }
+          mb_mv_bits_0=mb_gmv_bits_0=0;
+        }
+        mb_modes[mbi]=mb_mode;
+        /*Propagate the MVs to the luma blocks.*/
+        if(mb_mode!=OC_MODE_INTER_MV_FOUR){
+          switch(mb_mode){
+            case OC_MODE_INTER_MV:{
+              dx=embs[mbi].analysis_mv[0][OC_FRAME_PREV][0];
+              dy=embs[mbi].analysis_mv[0][OC_FRAME_PREV][1];
+            }break;
+            case OC_MODE_INTER_MV_LAST:{
+              dx=last_mv[0];
+              dy=last_mv[1];
+            }break;
+            case OC_MODE_INTER_MV_LAST2:{
+              dx=prior_mv[0];
+              dy=prior_mv[1];
+            }break;
+            case OC_MODE_GOLDEN_MV:{
+              dx=embs[mbi].analysis_mv[0][OC_FRAME_GOLD][0];
+              dy=embs[mbi].analysis_mv[0][OC_FRAME_GOLD][1];
+            }break;
+          }
+          for(bi=0;bi<4;bi++){
+            fragi=mb_maps[mbi][0][bi];
+            frag_mvs[fragi][0]=(signed char)dx;
+            frag_mvs[fragi][1]=(signed char)dy;
+          }
+        }
+        for(bi=0;bi<4;bi++){
+          fragi=sb_maps[mbi>>2][mbi&3][bi];
+          frags[fragi].qii=modes[mb_mode].qii[bi];
+        }
+        if(oc_enc_mb_transform_quantize_luma(_enc,&pipe,mbi,
+         modes[mb_mode].overhead>>OC_BIT_SCALE)>0){
+          int orig_mb_mode;
+          /*The mode is read back from mb_modes[], as it may no longer be the
+             one chosen above (see the 4MV back-out case below).*/
+          orig_mb_mode=mb_mode;
+          mb_mode=mb_modes[mbi];
+          switch(mb_mode){
+            case OC_MODE_INTER_MV:{
+              memcpy(prior_mv,last_mv,sizeof(prior_mv));
+              /*If we're backing out from 4MV, find the MV we're actually
+                 using.*/
+              if(orig_mb_mode==OC_MODE_INTER_MV_FOUR){
+                /*This loop has no bound of its own: it relies on at least
+                   one luma block being coded.*/
+                for(bi=0;;bi++){
+                  fragi=mb_maps[mbi][0][bi];
+                  if(frags[fragi].coded){
+                    memcpy(last_mv,frag_mvs[fragi],sizeof(last_mv));
+                    dx=frag_mvs[fragi][0];
+                    dy=frag_mvs[fragi][1];
+                    break;
+                  }
+                }
+                mb_mv_bits_0=OC_MV_BITS[0][dx+31]+OC_MV_BITS[0][dy+31];
+              }
+              /*Otherwise we used the original analysis MV.*/
+              else{
+                memcpy(last_mv,
+                 embs[mbi].analysis_mv[0][OC_FRAME_PREV],sizeof(last_mv));
+              }
+              _enc->mv_bits[0]+=mb_mv_bits_0;
+              _enc->mv_bits[1]+=12;
+            }break;
+            case OC_MODE_INTER_MV_LAST2:{
+              /*Swap the last and prior MVs.*/
+              oc_mv tmp_mv;
+              memcpy(tmp_mv,prior_mv,sizeof(tmp_mv));
+              memcpy(prior_mv,last_mv,sizeof(prior_mv));
+              memcpy(last_mv,tmp_mv,sizeof(last_mv));
+            }break;
+            case OC_MODE_GOLDEN_MV:{
+              _enc->mv_bits[0]+=mb_gmv_bits_0;
+              _enc->mv_bits[1]+=12;
+            }break;
+            case OC_MODE_INTER_MV_FOUR:{
+              oc_mv lbmvs[4];
+              oc_mv cbmvs[4];
+              memcpy(prior_mv,last_mv,sizeof(prior_mv));
+              for(bi=0;bi<4;bi++){
+                fragi=mb_maps[mbi][0][bi];
+                if(frags[fragi].coded){
+                  memcpy(last_mv,frag_mvs[fragi],sizeof(last_mv));
+                  memcpy(lbmvs[bi],frag_mvs[fragi],sizeof(lbmvs[bi]));
+                  _enc->mv_bits[0]+=OC_MV_BITS[0][frag_mvs[fragi][0]+31]
+                   +OC_MV_BITS[0][frag_mvs[fragi][1]+31];
+                  _enc->mv_bits[1]+=12;
+                }
+                /*Replace the block MVs for not-coded blocks with (0,0).*/
+                else memset(lbmvs[bi],0,sizeof(lbmvs[bi]));
+              }
+              (*set_chroma_mvs)(cbmvs,(const oc_mv *)lbmvs);
+              for(mapii=4;mapii<nmap_idxs;mapii++){
+                mapi=map_idxs[mapii];
+                pli=mapi>>2;
+                bi=mapi&3;
+                fragi=mb_maps[mbi][pli][bi];
+                frags[fragi].mb_mode=mb_mode;
+                frags[fragi].qii=modes[OC_MODE_INTER_MV_FOUR].qii[mapii];
+                memcpy(frag_mvs[fragi],cbmvs[bi],sizeof(frag_mvs[fragi]));
+              }
+            }break;
+          }
+          coded_mbis[ncoded_mbis++]=mbi;
+          oc_mode_scheme_chooser_update(&_enc->chooser,mb_mode);
+          interbits+=modes[mb_mode].rate+modes[mb_mode].overhead;
+        }
+        else{
+          /*No luma block was coded: record the MB in the uncoded list and
+             fall back to the default mode with a zero MV.*/
+          *(uncoded_mbis-++nuncoded_mbis)=mbi;
+          mb_mode=OC_MODE_INTER_NOMV;
+          dx=dy=0;
+        }
+        /*Propagate final MB mode and MVs to the chroma blocks.
+          This has already been done for 4MV mode, since it requires individual
+           block motion vectors.*/
+        if(mb_mode!=OC_MODE_INTER_MV_FOUR){
+          for(mapii=4;mapii<nmap_idxs;mapii++){
+            mapi=map_idxs[mapii];
+            pli=mapi>>2;
+            bi=mapi&3;
+            fragi=mb_maps[mbi][pli][bi];
+            frags[fragi].mb_mode=mb_mode;
+            /*If we switched from 4MV mode to INTER_MV mode, then the qii
+               values won't have been chosen with the right MV, but it's
+               probably not worth re-estimating them.*/
+            frags[fragi].qii=modes[mb_mode].qii[mapii];
+            frag_mvs[fragi][0]=(signed char)dx;
+            frag_mvs[fragi][1]=(signed char)dy;
+          }
+        }
+      }
+      oc_fr_state_flush_sb(pipe.fr+0);
+      sb_flags[sbi].coded_fully=pipe.fr[0].sb_full;
+      sb_flags[sbi].coded_partially=pipe.fr[0].sb_partial;
+    }
+    oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,0,notstart,notdone);
+    /*Code chroma planes.*/
+    for(pli=1;pli<3;pli++){
+      oc_enc_sb_transform_quantize_chroma(_enc,&pipe,
+       pli,pipe.sbi0[pli],pipe.sbi_end[pli]);
+      oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,pli,notstart,notdone);
+    }
+    notstart=1;
+  }
+  /*Finish filling in the reference frame borders.*/
+  refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
+  for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
+  /*Finish adding flagging overhead costs to inter bit counts to determine if
+     we should have coded a key frame instead.*/
+  if(_allow_keyframe){
+    /*The overheads below only ever add to interbits, so if it is already
+       too large there is no need to total them up.*/
+    if(interbits>intrabits)return 1;
+    /*Technically the chroma plane counts are over-estimations, because they
+       don't account for continuing runs from the luma planes, but the
+       inaccuracy is small.*/
+    for(pli=0;pli<3;pli++)interbits+=pipe.fr[pli].bits<<OC_BIT_SCALE;
+    interbits+=OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
+    interbits+=
+     _enc->chooser.scheme_bits[_enc->chooser.scheme_list[0]]<<OC_BIT_SCALE;
+    if(interbits>intrabits)return 1;
+  }
+  _enc->ncoded_mbis=ncoded_mbis;
+  /*Compact the coded fragment list.
+    The chroma entries are moved down from their plane's froffset so that
+     they directly follow the coded luma entries.*/
+  {
+    ptrdiff_t ncoded_fragis;
+    ncoded_fragis=_enc->state.ncoded_fragis[0];
+    for(pli=1;pli<3;pli++){
+      memmove(_enc->state.coded_fragis+ncoded_fragis,
+       _enc->state.coded_fragis+_enc->state.fplanes[pli].froffset,
+       _enc->state.ncoded_fragis[pli]*sizeof(*_enc->state.coded_fragis));
+      ncoded_fragis+=_enc->state.ncoded_fragis[pli];
+    }
+    _enc->state.ntotal_coded_fragis=ncoded_fragis;
+  }
+  return 0;
+}
+
+#if defined(OC_COLLECT_METRICS)
+# include <stdio.h>
+# include <math.h>
+
+/*TODO: It may be helpful (for block-level quantizers especially) to separate
+   out the contributions from AC and DC into separate tables.*/
+
+# define OC_ZWEIGHT   (0.25)
+
+/*Folds one weighted sample into a set of running mode metrics.
+  _metrics: The accumulator to update.
+  _w:       The weight of the sample.
+  _satd:    The SATD of the sample.
+  _rate:    The rate of the sample, scaled by OC_BIT_SCALE bits.
+  _rmse:    The RMSE of the sample.*/
+static void oc_mode_metrics_add(oc_mode_metrics *_metrics,
+ double _w,int _satd,int _rate,double _rmse){
+  double unscaled_rate;
+  /*Accumulate statistics without the scaling; this lets us change the scale
+     factor yet still use old data.*/
+  unscaled_rate=ldexp(_rate,-OC_BIT_SCALE);
+  /*The second-order terms are updated from the deviation of the new sample
+     from the current weighted means, which are only defined once some weight
+     has been accumulated.*/
+  if(_metrics->fragw>0){
+    double satd_dev;
+    double rate_dev;
+    double rmse_dev;
+    double scale;
+    satd_dev=_satd-_metrics->satd/_metrics->fragw;
+    rate_dev=unscaled_rate-_metrics->rate/_metrics->fragw;
+    rmse_dev=_rmse-_metrics->rmse/_metrics->fragw;
+    scale=_metrics->fragw*_w/(_metrics->fragw+_w);
+    _metrics->satd2+=satd_dev*satd_dev*scale;
+    _metrics->rate2+=rate_dev*rate_dev*scale;
+    _metrics->rmse2+=rmse_dev*rmse_dev*scale;
+    _metrics->satdrate+=satd_dev*rate_dev*scale;
+    _metrics->satdrmse+=satd_dev*rmse_dev*scale;
+  }
+  /*Now the weighted sums and the total weight itself.*/
+  _metrics->satd+=_satd*_w;
+  _metrics->rate+=unscaled_rate*_w;
+  _metrics->rmse+=_rmse*_w;
+  _metrics->fragw+=_w;
+}
+
+/*Combines several sets of mode metrics into one.
+  Sets whose total weight (fragw) is not positive are ignored.
+  _dst: Returns the combined metrics; it is zeroed if every set is empty.
+  _src: The list of metrics to combine.
+  _n:   The number of entries in _src.*/
+static void oc_mode_metrics_merge(oc_mode_metrics *_dst,
+ const oc_mode_metrics *_src,int _n){
+  int first;
+  int i;
+  /*Find a non-empty set of metrics.*/
+  first=0;
+  while(first<_n&&_src[first].fragw<=0)first++;
+  if(first>=_n){
+    memset(_dst,0,sizeof(*_dst));
+    return;
+  }
+  /*Start from a copy of it...*/
+  memcpy(_dst,_src+first,sizeof(*_dst));
+  /*...and fold in each of the remaining non-empty sets in turn.*/
+  for(i=first+1;i<_n;i++){
+    const oc_mode_metrics *src;
+    double                 dst_w;
+    double                 src_w;
+    double                 satd_dev;
+    double                 rate_dev;
+    double                 rmse_dev;
+    double                 scale;
+    src=_src+i;
+    if(!(src->fragw>0))continue;
+    /*The difference between the two weighted means of each quantity
+       contributes an extra term to the second-order sums.
+      These must be computed before the first-order sums are updated.*/
+    dst_w=_dst->fragw;
+    src_w=src->fragw;
+    satd_dev=src->satd/src_w-_dst->satd/dst_w;
+    rate_dev=src->rate/src_w-_dst->rate/dst_w;
+    rmse_dev=src->rmse/src_w-_dst->rmse/dst_w;
+    scale=dst_w*src_w/(dst_w+src_w);
+    _dst->fragw+=src->fragw;
+    _dst->satd+=src->satd;
+    _dst->rate+=src->rate;
+    _dst->rmse+=src->rmse;
+    _dst->satd2+=src->satd2+satd_dev*satd_dev*scale;
+    _dst->rate2+=src->rate2+rate_dev*rate_dev*scale;
+    _dst->rmse2+=src->rmse2+rmse_dev*rmse_dev*scale;
+    _dst->satdrate+=src->satdrate+satd_dev*rate_dev*scale;
+    _dst->satdrmse+=src->satdrmse+satd_dev*rmse_dev*scale;
+  }
+}
+
+/*Compile collected SATD/rate/RMSE metrics into a form that's immediately
+   useful for mode decision.
+  For every plane, quantizer type and SAD bin, the OC_MODE_RD entry for _qi is
+   rebuilt from the OC_MODE_METRICS collected for a window of nearby bins.
+  _enc: The encoder context.
+  _qi:  The quantizer index whose tables are updated.*/
+static void oc_enc_mode_metrics_update(oc_enc_ctx *_enc,int _qi){
+  int pli;
+  int qti;
+  /*NOTE(review): presumably this makes floating point usable again after
+     SIMD code has run; confirm against oc_restore_fpu().*/
+  oc_restore_fpu(&_enc->state);
+  /*Convert raw collected data into cleaned up sample points.*/
+  for(pli=0;pli<3;pli++){
+    for(qti=0;qti<2;qti++){
+      double fragw;
+      int    bin0;
+      int    bin1;
+      int    bin;
+      /*[bin0,bin1) is a window of bins that slides along with bin, and fragw
+         is the total weight of the metrics inside it.
+        All three carry over from one bin to the next.*/
+      fragw=0;
+      bin0=bin1=0;
+      for(bin=0;bin<OC_SAD_BINS;bin++){
+        oc_mode_metrics metrics;
+        /*Entries that cannot be fit below are left as zero.*/
+        OC_MODE_RD[_qi][pli][qti][bin].rate=0;
+        OC_MODE_RD[_qi][pli][qti][bin].rmse=0;
+        /*Find some points on either side of the current bin.*/
+        /*Grow the window to the right until it both reaches past the current
+           bin and holds at least OC_ZWEIGHT of weight.
+          bin1 never exceeds OC_SAD_BINS-1, so the last bin's metrics are
+           never included in the window.*/
+        while((bin1<bin+1||fragw<OC_ZWEIGHT)&&bin1<OC_SAD_BINS-1){
+          fragw+=OC_MODE_METRICS[_qi][pli][qti][bin1++].fragw;
+        }
+        /*Shrink the window from the left for as long as it stays non-empty,
+           still starts before the current bin, and keeps at least OC_ZWEIGHT
+           of weight.*/
+        while(bin0+1<bin&&bin0+1<bin1&&
+         fragw-OC_MODE_METRICS[_qi][pli][qti][bin0].fragw>=OC_ZWEIGHT){
+          fragw-=OC_MODE_METRICS[_qi][pli][qti][bin0++].fragw;
+        }
+        /*Merge statistics and fit lines.*/
+        oc_mode_metrics_merge(&metrics,
+         OC_MODE_METRICS[_qi][pli][qti]+bin0,bin1-bin0);
+        /*A line can only be fit if there is some weight and the SATD values
+           are not all identical (satd2 is the divisor of the slope).*/
+        if(metrics.fragw>0&&metrics.satd2>0){
+          double a;
+          double b;
+          double msatd;
+          double mrate;
+          double mrmse;
+          double rate;
+          double rmse;
+          /*The weighted means of the merged samples.*/
+          msatd=metrics.satd/metrics.fragw;
+          mrate=metrics.rate/metrics.fragw;
+          mrmse=metrics.rmse/metrics.fragw;
+          /*Compute the points on these lines corresponding to the actual bin
+             value.*/
+          /*Rate as a function of SATD: b is the slope and a the intercept of
+             the line through the means.
+            The result is scaled by OC_BIT_SCALE bits, rounded, and passed
+             through OC_CLAMPI() with the 16-bit signed limits before being
+             stored.*/
+          b=metrics.satdrate/metrics.satd2;
+          a=mrate-b*msatd;
+          rate=ldexp(a+b*(bin<<OC_SAD_SHIFT),OC_BIT_SCALE);
+          OC_MODE_RD[_qi][pli][qti][bin].rate=
+           (ogg_int16_t)OC_CLAMPI(-32768,(int)(rate+0.5),32767);
+          /*The same for RMSE as a function of SATD, scaled by OC_RMSE_SCALE
+             bits.*/
+          b=metrics.satdrmse/metrics.satd2;
+          a=mrmse-b*msatd;
+          rmse=ldexp(a+b*(bin<<OC_SAD_SHIFT),OC_RMSE_SCALE);
+          OC_MODE_RD[_qi][pli][qti][bin].rmse=
+           (ogg_int16_t)OC_CLAMPI(-32768,(int)(rmse+0.5),32767);
+        }
+      }
+    }
+  }
+}
+
+
+
+/*The following token skipping code used to also be used in the decoder (and
+   even at one point other places in the encoder).
+  However, it was obsoleted by other optimizations, and is now only used here.
+  It has been moved here to avoid generating the code when it's not needed.*/
+
+/*Determines the number of blocks or coefficients to be skipped for a given
+   token value.
+  _token:      The token value to skip.
+  _extra_bits: The extra bits attached to this token.
+  Return: A positive value indicates that number of coefficients are to be
+           skipped in the current block.
+          Otherwise, the negative of the return value indicates that number of
+           blocks are to be ended.*/
+typedef ptrdiff_t (*oc_token_skip_func)(int _token,int _extra_bits);
+
+/*Handles the simple end of block tokens: the run base comes from the token.*/
+static ptrdiff_t oc_token_skip_eob(int _token,int _extra_bits){
+  int run_base;
+  run_base=1+OC_UNIBBLE_TABLE32(0,1,2,3,7,15,0,0,_token);
+  return -_extra_bits-run_base;
+}
+
+/*The last EOB token has a special case, where an EOB run of size zero ends all
+   the remaining blocks in the frame (_token itself is not examined).*/
+static ptrdiff_t oc_token_skip_eob6(int _token,int _extra_bits){
+  /*We want -PTRDIFF_MAX, but that requires C99; convert before negating so the
+     result does not rely on an out-of-range unsigned-to-signed conversion.*/
+  if(!_extra_bits)return -(ptrdiff_t)(~(size_t)0>>1);
+  return -_extra_bits;
+}
+
+/*Handles the pure zero run tokens: one coefficient more than _extra_bits.*/
+static ptrdiff_t oc_token_skip_zrl(int _token,int _extra_bits){
+  return 1+_extra_bits;
+}
+
+/*Handles a normal coefficient value token (exact oc_token_skip_func type).*/
+static ptrdiff_t oc_token_skip_val(int _token,int _extra_bits){
+  return 1;
+}
+
+/*Handles a category 1A zero run/coefficient value combo token (no extra bits).*/
+static ptrdiff_t oc_token_skip_run_cat1a(int _token,int _extra_bits){
+  return _token-OC_DCT_RUN_CAT1A+2;
+}
+
+/*Handles category 1b, 1c, 2a, and 2b zero run/coefficient value combo tokens.*/
+static ptrdiff_t oc_token_skip_run(int _token,int _extra_bits){
+  int cati;
+  int run_mask;
+  int run_base;
+  cati=_token-OC_DCT_RUN_CAT1B;
+  run_mask=OC_BYTE_TABLE32(3,7,0,1,cati);
+  run_base=OC_BYTE_TABLE32(7,11,2,3,cati);
+  return run_base+(_extra_bits&run_mask);
+}
+
+/*A jump table for computing the number of coefficients or blocks to skip for
+   a given token value.
+  This reduces all the conditional branches, etc., needed to parse these token
+   values down to one indirect jump.*/
+static const oc_token_skip_func OC_TOKEN_SKIP_TABLE[TH_NDCT_TOKENS]={
+  oc_token_skip_eob,/*Tokens 0-5.*/
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob6,/*Token 6.*/
+  oc_token_skip_zrl,/*Tokens 7-8.*/
+  oc_token_skip_zrl,
+  (oc_token_skip_func)oc_token_skip_val,/*Tokens 9-22.*/
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_run_cat1a,/*Tokens 23-27.*/
+  (oc_token_skip_func)oc_token_skip_run_cat1a,
+  (oc_token_skip_func)oc_token_skip_run_cat1a,
+  (oc_token_skip_func)oc_token_skip_run_cat1a,
+  (oc_token_skip_func)oc_token_skip_run_cat1a,
+  oc_token_skip_run,/*Tokens 28-31.*/
+  oc_token_skip_run,
+  oc_token_skip_run,
+  oc_token_skip_run
+};
+
+/*Determines the number of blocks or coefficients to be skipped for a given
+   token value by dispatching through OC_TOKEN_SKIP_TABLE.
+  _token:      The token value to skip.
+  _extra_bits: The extra bits attached to this token.
+  Return: A positive value indicates that number of coefficients are to be
+           skipped in the current block.
+          Otherwise, the negative of the return value indicates that number of
+           blocks are to be ended.
+          0 will never be returned, so that at least one coefficient in one
+           block will always be decoded for every token.*/
+static ptrdiff_t oc_dct_token_skip(int _token,int _extra_bits){
+  return OC_TOKEN_SKIP_TABLE[_token](_token,_extra_bits);
+}
+
+
+/*Folds the coded fragments of the current frame into OC_MODE_METRICS.*/
+void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc){
+  static const unsigned char OC_ZZI_HUFF_OFFSET[64]={
+     0,16,16,16,16,16,32,32,
+    32,32,32,32,32,32,32,48,
+    48,48,48,48,48,48,48,48,
+    48,48,48,48,64,64,64,64,
+    64,64,64,64,64,64,64,64,
+    64,64,64,64,64,64,64,64,
+    64,64,64,64,64,64,64,64,
+    64,64,64,64,64,64,64,64
+  };
+  const oc_fragment *frags;
+  const unsigned    *frag_satd;
+  const unsigned    *frag_ssd;
+  const ptrdiff_t   *coded_fragis;
+  ptrdiff_t          ncoded_fragis;
+  ptrdiff_t          fragii;
+  double             fragw;
+  int                qti;
+  int                qii;
+  int                qi;
+  int                pli;
+  int                zzi;
+  int                token;
+  int                eb;
+  oc_restore_fpu(&_enc->state);
+  /*Load any existing mode metrics if we haven't already.*/
+  if(!oc_has_mode_metrics){
+    FILE *fmetrics;
+    memset(OC_MODE_METRICS,0,sizeof(OC_MODE_METRICS));
+    fmetrics=fopen("modedec.stats","rb");
+    if(fmetrics!=NULL){
+      fread(OC_MODE_METRICS,sizeof(OC_MODE_METRICS),1,fmetrics);
+      fclose(fmetrics);
+    }
+    for(qi=0;qi<64;qi++)oc_enc_mode_metrics_update(_enc,qi);
+    oc_has_mode_metrics=1;
+  }
+  qti=_enc->state.frame_type;
+  frags=_enc->state.frags;
+  frag_satd=_enc->frag_satd;
+  frag_ssd=_enc->frag_ssd;
+  coded_fragis=_enc->state.coded_fragis;
+  ncoded_fragis=fragii=0;
+  /*Weight by the inverse frame size so HD content cannot dominate the stats.*/
+  fragw=1.0/_enc->state.nfrags;
+  for(pli=0;pli<3;pli++){
+    ptrdiff_t ti[64];
+    int       eob_token[64];
+    ptrdiff_t eob_run[64];/*As wide as oc_dct_token_skip()'s return value.*/
+    /*Set up token indices and eob run counts.
+      We don't bother trying to figure out the real cost of the runs that span
+       coefficients; instead we use the costs that were available when R-D
+       token optimization was done.*/
+    for(zzi=0;zzi<64;zzi++){
+      ti[zzi]=_enc->dct_token_offs[pli][zzi];
+      if(ti[zzi]>0){
+        token=_enc->dct_tokens[pli][zzi][0];
+        eb=_enc->extra_bits[pli][zzi][0];
+        eob_token[zzi]=token;
+        eob_run[zzi]=-oc_dct_token_skip(token,eb);
+      }
+      else{
+        eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX;
+        eob_run[zzi]=0;
+      }
+    }
+    /*Scan the list of coded fragments for this plane.*/
+    ncoded_fragis+=_enc->state.ncoded_fragis[pli];
+    for(;fragii<ncoded_fragis;fragii++){
+      ptrdiff_t    fragi;
+      ogg_uint32_t frag_bits;
+      int          huffi;
+      ptrdiff_t    skip;
+      int          mb_mode;
+      unsigned     satd;
+      int          bin;
+      fragi=coded_fragis[fragii];
+      frag_bits=0;
+      for(zzi=0;zzi<64;){
+        if(eob_run[zzi]>0){
+          /*We've reached the end of the block.*/
+          eob_run[zzi]--;
+          break;
+        }
+        huffi=_enc->huff_idxs[qti][zzi>0][pli+1>>1]
+         +OC_ZZI_HUFF_OFFSET[zzi];
+        if(eob_token[zzi]<OC_NDCT_EOB_TOKEN_MAX){
+          /*This token caused an EOB run to be flushed.
+            Therefore it gets the bits associated with it.*/
+          frag_bits+=_enc->huff_codes[huffi][eob_token[zzi]].nbits
+           +OC_DCT_TOKEN_EXTRA_BITS[eob_token[zzi]];
+          eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX;
+        }
+        token=_enc->dct_tokens[pli][zzi][ti[zzi]];
+        eb=_enc->extra_bits[pli][zzi][ti[zzi]];
+        ti[zzi]++;
+        skip=oc_dct_token_skip(token,eb);
+        if(skip<0){
+          eob_token[zzi]=token;
+          eob_run[zzi]=-skip;
+        }
+        else{
+          /*A regular DCT value token; accumulate the bits for it.*/
+          frag_bits+=_enc->huff_codes[huffi][token].nbits
+           +OC_DCT_TOKEN_EXTRA_BITS[token];
+          zzi+=(int)skip;
+        }
+      }
+      mb_mode=frags[fragi].mb_mode;
+      qi=_enc->state.qis[frags[fragi].qii];
+      satd=frag_satd[fragi]<<(pli+1&2);
+      bin=OC_MINI(satd>>OC_SAD_SHIFT,OC_SAD_BINS-1);
+      oc_mode_metrics_add(OC_MODE_METRICS[qi][pli][mb_mode!=OC_MODE_INTRA]+bin,
+       fragw,satd,frag_bits<<OC_BIT_SCALE,sqrt(frag_ssd[fragi]));
+    }
+  }
+  /*Update global SATD/rate/RMSE estimation matrix.*/
+  for(qii=0;qii<_enc->state.nqis;qii++){
+    oc_enc_mode_metrics_update(_enc,_enc->state.qis[qii]);
+  }
+}
+
+void oc_enc_mode_metrics_dump(oc_enc_ctx *_enc){
+  FILE *fmetrics;
+  int   qi;
+  /*Generate sample points for complete list of QI values.*/
+  for(qi=0;qi<64;qi++)oc_enc_mode_metrics_update(_enc,qi);
+  fmetrics=fopen("modedec.stats","wb");/*Save the raw statistics.*/
+  if(fmetrics!=NULL){
+    fwrite(OC_MODE_METRICS,sizeof(OC_MODE_METRICS),1,fmetrics);
+    fclose(fmetrics);
+  }
+  fprintf(stdout,/*Then print a C header defining OC_MODE_RD on stdout.*/
+   "/*File generated by libtheora with OC_COLLECT_METRICS"
+   " defined at compile time.*/\n"
+   "#if !defined(_modedec_H)\n"
+   "# define _modedec_H (1)\n"
+   "\n"
+   "\n"
+   "\n"
+   "# if defined(OC_COLLECT_METRICS)\n"
+   "typedef struct oc_mode_metrics oc_mode_metrics;\n"
+   "# endif\n"
+   "typedef struct oc_mode_rd      oc_mode_rd;\n"
+   "\n"
+   "\n"
+   "\n"
+   "/*The number of extra bits of precision at which to store rate"
+   " metrics.*/\n"
+   "# define OC_BIT_SCALE  (%i)\n"
+   "/*The number of extra bits of precision at which to store RMSE metrics.\n"
+   "  This must be at least half OC_BIT_SCALE (rounded up).*/\n"
+   "# define OC_RMSE_SCALE (%i)\n"
+   "/*The number of bins to partition statistics into.*/\n"
+   "# define OC_SAD_BINS   (%i)\n"
+   "/*The number of bits of precision to drop"
+   " from SAD scores to assign them to a\n"
+   "   bin.*/\n"
+   "# define OC_SAD_SHIFT  (%i)\n"
+   "\n"
+   "\n"
+   "\n"
+   "# if defined(OC_COLLECT_METRICS)\n"
+   "struct oc_mode_metrics{\n"
+   "  double fragw;\n"
+   "  double satd;\n"
+   "  double rate;\n"
+   "  double rmse;\n"
+   "  double satd2;\n"
+   "  double satdrate;\n"
+   "  double rate2;\n"
+   "  double satdrmse;\n"
+   "  double rmse2;\n"
+   "};\n"
+   "\n"
+   "\n"
+   "int             oc_has_mode_metrics;\n"
+   "oc_mode_metrics OC_MODE_METRICS[64][3][2][OC_SAD_BINS];\n"
+   "# endif\n"
+   "\n"
+   "\n"
+   "\n"
+   "struct oc_mode_rd{\n"
+   "  ogg_int16_t rate;\n"
+   "  ogg_int16_t rmse;\n"
+   "};\n"
+   "\n"
+   "\n"
+   "# if !defined(OC_COLLECT_METRICS)\n"
+   "static const\n"
+   "# endif\n"
+   "oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={\n",
+   OC_BIT_SCALE,OC_RMSE_SCALE,OC_SAD_BINS,OC_SAD_SHIFT);
+  for(qi=0;qi<64;qi++){/*One nested initializer per QI, plane and class.*/
+    int pli;
+    fprintf(stdout,"  {\n");
+    for(pli=0;pli<3;pli++){
+      int qti;
+      fprintf(stdout,"    {\n");
+      for(qti=0;qti<2;qti++){
+        int bin;
+        static const char *pl_names[3]={"Y'","Cb","Cr"};
+        static const char *qti_names[2]={"INTRA","INTER"};
+        fprintf(stdout,"      /*%s  qi=%i  %s*/\n",
+         pl_names[pli],qi,qti_names[qti]);
+        fprintf(stdout,"      {\n");
+        fprintf(stdout,"        ");
+        for(bin=0;bin<OC_SAD_BINS;bin++){
+          if(bin&&!(bin&0x3))fprintf(stdout,"\n        ");/*Four per row.*/
+          fprintf(stdout,"{%5i,%5i}",
+           OC_MODE_RD[qi][pli][qti][bin].rate,
+           OC_MODE_RD[qi][pli][qti][bin].rmse);
+          if(bin+1<OC_SAD_BINS)fprintf(stdout,",");
+        }
+        fprintf(stdout,"\n      }");
+        if(qti<1)fprintf(stdout,",");
+        fprintf(stdout,"\n");
+      }
+      fprintf(stdout,"    }");
+      if(pli<2)fprintf(stdout,",");
+      fprintf(stdout,"\n");
+    }
+    fprintf(stdout,"  }");
+    if(qi<63)fprintf(stdout,",");
+    fprintf(stdout,"\n");
+  }
+  fprintf(stdout,
+   "};\n"
+   "\n"
+   "#endif\n");
+}
+#endif
diff --git a/lib/apiwrapper.c b/lib/apiwrapper.c
new file mode 100644
index 0000000..dc959b8
--- /dev/null
+++ b/lib/apiwrapper.c
@@ -0,0 +1,166 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: apiwrapper.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "apiwrapper.h"
+
+
+
+const char *theora_version_string(void){
+  return th_version_string();/*Legacy name; result is passed through.*/
+}
+
+ogg_uint32_t theora_version_number(void){
+  return th_version_number();/*Legacy name; result is passed through.*/
+}
+
+void theora_info_init(theora_info *_ci){
+  memset(_ci,0,sizeof(*_ci));/*Zeroes every field, codec_setup included.*/
+}
+
+void theora_info_clear(theora_info *_ci){
+  th_api_wrapper *wrapper;
+  wrapper=(th_api_wrapper *)_ci->codec_setup;
+  memset(_ci,0,sizeof(*_ci));
+  if(wrapper==NULL)return;
+  /*Run the clear hook, if one is set, before the wrapper itself is freed.*/
+  if(wrapper->clear!=NULL)(*wrapper->clear)(wrapper);
+  _ogg_free(wrapper);
+}
+
+void theora_clear(theora_state *_th){
+  oc_state_dispatch_vtable *vtable;
+  /*Provide compatibility with mixed encoder and decoder shared lib versions.*/
+  vtable=(oc_state_dispatch_vtable *)_th->internal_decode;
+  if(vtable!=NULL)(*vtable->clear)(_th);
+  /*Read the encoder pointer only now, after the decoder hook has run.*/
+  vtable=(oc_state_dispatch_vtable *)_th->internal_encode;
+  if(vtable!=NULL)(*vtable->clear)(_th);
+  if(_th->i!=NULL)theora_info_clear(_th->i);
+  memset(_th,0,sizeof(*_th));
+}
+
+int theora_control(theora_state *_th,int _req,void *_buf,size_t _buf_sz){
+  oc_state_dispatch_vtable *vtable;
+  /*Provide compatibility with mixed encoder and decoder shared lib versions:
+     dispatch through the table stored in the state, preferring the decoder's
+     over the encoder's.*/
+  vtable=(oc_state_dispatch_vtable *)_th->internal_decode;
+  if(vtable==NULL){
+    vtable=(oc_state_dispatch_vtable *)_th->internal_encode;
+    if(vtable==NULL)return TH_EINVAL;
+  }
+  return (*vtable->control)(_th,_req,_buf,_buf_sz);
+}
+
+ogg_int64_t theora_granule_frame(theora_state *_th,ogg_int64_t _gp){
+  oc_state_dispatch_vtable *vtable;
+  /*Provide compatibility with mixed encoder and decoder shared lib versions:
+     dispatch through the table stored in the state, preferring the decoder's
+     over the encoder's.*/
+  vtable=(oc_state_dispatch_vtable *)_th->internal_decode;
+  if(vtable==NULL){
+    vtable=(oc_state_dispatch_vtable *)_th->internal_encode;
+    if(vtable==NULL)return -1;
+  }
+  return (*vtable->granule_frame)(_th,_gp);
+}
+
+double theora_granule_time(theora_state *_th, ogg_int64_t _gp){
+  oc_state_dispatch_vtable *vtable;
+  /*Provide compatibility with mixed encoder and decoder shared lib versions:
+     dispatch through the table stored in the state, preferring the decoder's
+     over the encoder's.*/
+  vtable=(oc_state_dispatch_vtable *)_th->internal_decode;
+  if(vtable==NULL){
+    vtable=(oc_state_dispatch_vtable *)_th->internal_encode;
+    if(vtable==NULL)return -1;
+  }
+  return (*vtable->granule_time)(_th,_gp);
+}
+
+void oc_theora_info2th_info(th_info *_info,const theora_info *_ci){
+  _info->version_major=_ci->version_major;
+  _info->version_minor=_ci->version_minor;
+  _info->version_subminor=_ci->version_subminor;
+  _info->frame_width=_ci->width;/*Names differ: legacy width/height...*/
+  _info->frame_height=_ci->height;/*...become frame_width/frame_height,*/
+  _info->pic_width=_ci->frame_width;/*while legacy frame_width/height...*/
+  _info->pic_height=_ci->frame_height;/*...become pic_width/pic_height.*/
+  _info->pic_x=_ci->offset_x;
+  _info->pic_y=_ci->offset_y;
+  _info->fps_numerator=_ci->fps_numerator;
+  _info->fps_denominator=_ci->fps_denominator;
+  _info->aspect_numerator=_ci->aspect_numerator;
+  _info->aspect_denominator=_ci->aspect_denominator;
+  switch(_ci->colorspace){/*Unrecognized values become TH_CS_UNSPECIFIED.*/
+    case OC_CS_ITU_REC_470M:_info->colorspace=TH_CS_ITU_REC_470M;break;
+    case OC_CS_ITU_REC_470BG:_info->colorspace=TH_CS_ITU_REC_470BG;break;
+    default:_info->colorspace=TH_CS_UNSPECIFIED;break;
+  }
+  switch(_ci->pixelformat){/*Unrecognized values become TH_PF_RSVD.*/
+    case OC_PF_420:_info->pixel_fmt=TH_PF_420;break;
+    case OC_PF_422:_info->pixel_fmt=TH_PF_422;break;
+    case OC_PF_444:_info->pixel_fmt=TH_PF_444;break;
+    default:_info->pixel_fmt=TH_PF_RSVD;
+  }
+  _info->target_bitrate=_ci->target_bitrate;
+  _info->quality=_ci->quality;
+  _info->keyframe_granule_shift=_ci->keyframe_frequency_force>0?
+   OC_MINI(31,oc_ilog(_ci->keyframe_frequency_force-1)):0;/*Clamp to 0..31.*/
+}
+
+int theora_packet_isheader(ogg_packet *_op){
+  return th_packet_isheader(_op);/*Legacy name; result is passed through.*/
+}
+
+int theora_packet_iskeyframe(ogg_packet *_op){
+  return th_packet_iskeyframe(_op);/*Legacy name; result is passed through.*/
+}
+
+int theora_granule_shift(theora_info *_ci){
+  /*This breaks when keyframe_frequency_force is not positive or is larger than
+     2**31 (if your int is more than 32 bits), but that's what the original
+     function does; oc_theora_info2th_info() guards against both cases.*/
+  return oc_ilog(_ci->keyframe_frequency_force-1);
+}
+
+void theora_comment_init(theora_comment *_tc){
+  th_comment_init((th_comment *)_tc);/*NOTE(review): assumes same layout.*/
+}
+
+char *theora_comment_query(theora_comment *_tc,char *_tag,int _count){
+  return th_comment_query((th_comment *)_tc,_tag,_count);/*Forwarded as-is.*/
+}
+
+int theora_comment_query_count(theora_comment *_tc,char *_tag){
+  return th_comment_query_count((th_comment *)_tc,_tag);/*Forwarded as-is.*/
+}
+
+void theora_comment_clear(theora_comment *_tc){
+  th_comment_clear((th_comment *)_tc);/*NOTE(review): assumes same layout.*/
+}
+
+void theora_comment_add(theora_comment *_tc,char *_comment){
+  th_comment_add((th_comment *)_tc,_comment);/*Forwarded as-is.*/
+}
+
+void theora_comment_add_tag(theora_comment *_tc, char *_tag, char *_value){
+  th_comment_add_tag((th_comment *)_tc,_tag,_value);/*Forwarded as-is.*/
+}
diff --git a/lib/apiwrapper.h b/lib/apiwrapper.h
new file mode 100644
index 0000000..93454d7
--- /dev/null
+++ b/lib/apiwrapper.h
@@ -0,0 +1,54 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: apiwrapper.h 13596 2007-08-23 20:05:38Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_apiwrapper_H)
+# define _apiwrapper_H (1)
+# include <ogg/ogg.h>
+# include <theora/theora.h>
+# include "theora/theoradec.h"
+# include "theora/theoraenc.h"
+# include "internal.h"
+
+typedef struct th_api_wrapper th_api_wrapper;
+typedef struct th_api_info    th_api_info;
+
+/*Provide an entry point for the codec setup to clear itself in case we ever
+   want to break pieces off into a common base library shared by encoder and
+   decoder.
+  In addition, this makes several other pieces of the API wrapper cleaner.*/
+typedef void (*oc_setup_clear_func)(void *_ts);
+
+/*Generally only one of these pointers will be non-NULL in any given instance.
+  Technically we do not even really need this struct, since we should be able
+   to figure out which one from "context", but doing it this way makes sure we
+   don't flub it up.*/
+struct th_api_wrapper{
+  oc_setup_clear_func  clear;/*If set, run on this struct before it is freed.*/
+  th_setup_info       *setup;/*NOTE(review): presumably decoder setup data.*/
+  th_dec_ctx          *decode;
+  th_enc_ctx          *encode;
+};
+
+struct th_api_info{
+  th_api_wrapper api;/*NOTE(review): first member; confirm casts rely on it.*/
+  theora_info    info;
+};
+
+
+void oc_theora_info2th_info(th_info *_info,const theora_info *_ci);
+
+#endif
diff --git a/lib/bitpack.c b/lib/bitpack.c
new file mode 100644
index 0000000..8195003
--- /dev/null
+++ b/lib/bitpack.c
@@ -0,0 +1,111 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE OggTheora SOURCE CODE IS (C) COPYRIGHT 1994-2009             *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function: packing variable sized words into an octet stream
+  last mod: $Id: bitpack.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+#include <string.h>
+#include <stdlib.h>
+#include "bitpack.h"
+
+/*We're 'MSb' endian; if we write a word but read individual bits,
+   then we'll read the MSb first.*/
+
+void oc_pack_readinit(oc_pack_buf *_b,unsigned char *_buf,long _bytes){
+  memset(_b,0,sizeof(*_b));/*Empty window, no buffered bits, eof clear.*/
+  _b->stop=_buf+_bytes;
+  _b->ptr=_buf;
+}
+
+static oc_pb_window oc_pack_refill(oc_pack_buf *_b,int _bits){
+  const unsigned char *ptr;
+  const unsigned char *stop;
+  oc_pb_window         window;
+  int                  available;
+  window=_b->window;
+  available=_b->bits;
+  ptr=_b->ptr;
+  stop=_b->stop;
+  while(available<=OC_PB_WINDOW_SIZE-8&&ptr<stop){/*Append whole bytes.*/
+    available+=8;
+    window|=(oc_pb_window)*ptr++<<OC_PB_WINDOW_SIZE-available;
+  }
+  _b->ptr=ptr;/*The window itself is returned, not stored in _b.*/
+  if(_bits>available){/*Still short of the request.*/
+    if(ptr>=stop){
+      _b->eof=1;/*The request ran past the end of the buffer.*/
+      available=OC_LOTS_OF_BITS;/*So later reads skip the refill.*/
+    }
+    else window|=*ptr>>(available&7);/*Peek at part of the next byte.*/
+  }
+  _b->bits=available;
+  return window;
+}
+
+int oc_pack_look1(oc_pack_buf *_b){
+  /*Top up an empty window first.
+    Only the window is stored back here; oc_pack_refill() has already updated
+     the bit count itself.*/
+  if(_b->bits<1)_b->window=oc_pack_refill(_b,1);
+  /*The next unread bit sits in the most significant position of the window.*/
+  return (int)(_b->window>>OC_PB_WINDOW_SIZE-1);
+}
+
+void oc_pack_adv1(oc_pack_buf *_b){
+  _b->bits--;/*One bit fewer is buffered...*/
+  _b->window<<=1;/*...and the consumed bit is shifted out of the top.*/
+}
+
+/*Reads _bits bits, MSb first; here we assume that 0<=_bits&&_bits<=32.*/
+long oc_pack_read(oc_pack_buf *_b,int _bits){
+  oc_pb_window window;
+  int          available;
+  long         result;
+  window=_b->window;
+  available=_b->bits;
+  if(_bits==0)return 0;/*Nothing to read; the shifts below need _bits>=1.*/
+  if(available<_bits){
+    window=oc_pack_refill(_b,_bits);
+    available=_b->bits;/*oc_pack_refill() updated the count in _b.*/
+  }
+  result=window>>OC_PB_WINDOW_SIZE-_bits;/*Take the top _bits bits.*/
+  available-=_bits;
+  window<<=1;/*Two steps, presumably since _bits may equal the window size.*/
+  window<<=_bits-1;
+  _b->bits=available;
+  _b->window=window;
+  return result;
+}
+
+int oc_pack_read1(oc_pack_buf *_b){
+  oc_pb_window win;
+  int          nbits;
+  int          bit;
+  win=_b->window;
+  nbits=_b->bits;
+  /*An empty window has to be topped up before a bit can be taken from it.*/
+  if(nbits<1){
+    win=oc_pack_refill(_b,1);
+    nbits=_b->bits;
+  }
+  /*Take the most significant bit, then shift it out of the window.*/
+  bit=(int)(win>>OC_PB_WINDOW_SIZE-1);
+  _b->window=win<<1;
+  _b->bits=nbits-1;
+  return bit;
+}
+
+long oc_pack_bytes_left(oc_pack_buf *_b){
+  /*Whole bytes still buffered in the window count as available, too.*/
+  return _b->eof?-1:_b->stop-_b->ptr+(_b->bits>>3);
+}
diff --git a/lib/bitpack.h b/lib/bitpack.h
new file mode 100644
index 0000000..a020a29
--- /dev/null
+++ b/lib/bitpack.h
@@ -0,0 +1,59 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE OggTheora SOURCE CODE IS (C) COPYRIGHT 1994-2009             *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function: packing variable sized words into an octet stream
+  last mod: $Id: bitwise.c 7675 2004-09-01 00:34:39Z xiphmont $
+
+ ********************************************************************/
+#if !defined(_bitpack_H)
+# define _bitpack_H (1)
+# include <limits.h>
+
+
+
+typedef unsigned long      oc_pb_window;
+typedef struct oc_pack_buf oc_pack_buf;
+
+
+
+# define OC_PB_WINDOW_SIZE ((int)sizeof(oc_pb_window)*CHAR_BIT)
+/*This is meant to be a large, positive constant that can still be efficiently
+   loaded as an immediate (on platforms like ARM, for example).
+  Even relatively modest values like 100 would work fine.*/
+# define OC_LOTS_OF_BITS (0x40000000)
+
+
+
+struct oc_pack_buf{
+  oc_pb_window         window;/*Buffered bits; the next one is the MSb.*/
+  const unsigned char *ptr;/*Next byte to load into the window.*/
+  const unsigned char *stop;/*One past the last byte of the buffer.*/
+  int                  bits;/*Bits in window (OC_LOTS_OF_BITS after EOF).*/
+  int                  eof;/*Set once a read ran past the buffer end.*/
+};
+
+void oc_pack_readinit(oc_pack_buf *_b,unsigned char *_buf,long _bytes);
+int oc_pack_look1(oc_pack_buf *_b);
+void oc_pack_adv1(oc_pack_buf *_b);
+/*Here we assume 0<=_bits&&_bits<=32.*/
+long oc_pack_read(oc_pack_buf *_b,int _bits);
+int oc_pack_read1(oc_pack_buf *_b);
+/* returns -1 for read beyond EOF, or the number of whole bytes available */
+long oc_pack_bytes_left(oc_pack_buf *_b);
+
+/*These two functions are implemented locally in huffdec.c*/
+/*Read in bits without advancing the bitptr.
+  Here we assume 0<=_bits&&_bits<=32.*/
+/*static int oc_pack_look(oc_pack_buf *_b,int _bits);*/
+/*static void oc_pack_adv(oc_pack_buf *_b,int _bits);*/
+
+#endif
diff --git a/lib/cpu.c b/lib/cpu.c
new file mode 100644
index 0000000..a863aad
--- /dev/null
+++ b/lib/cpu.c
@@ -0,0 +1,226 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+ CPU capability detection for x86 processors.
+  Originally written by Rudolf Marek.
+
+ function:
+  last mod: $Id: cpu.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#include "cpu.h"
+
+#if !defined(OC_X86_ASM)
+static ogg_uint32_t oc_cpu_flags_get(void){
+  return 0;/*Built without OC_X86_ASM: no feature flags are reported.*/
+}
+#else
+# if !defined(_MSC_VER)
+#  if defined(__amd64__)||defined(__x86_64__)
+/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when
+   compiling with -fPIC.*/
+#   define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+  __asm__ __volatile__( \
+   "cpuid\n\t" \
+   :[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
+   :"a"(_op) \
+   :"cc" \
+  )
+#  else
+/*On x86-32, not so much.*/
+#   define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+  __asm__ __volatile__( \
+   "xchgl %%ebx,%[ebx]\n\t" \
+   "cpuid\n\t" \
+   "xchgl %%ebx,%[ebx]\n\t" \
+   :[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
+   :"a"(_op) \
+   :"cc" \
+  )
+#  endif
+# else
+/*Why does MSVC need this complicated rigamarole?
+  At this point I honestly do not care.*/
+
+/*Visual C cpuid helper: runs cpuid function _op and stores eax, ebx, ecx and
+   edx, in that order, in _cpu_info[0..3].
+  The _cpuid builtin of VS2005 is avoided so that VS2003 users still work.*/
+static void oc_cpuid_helper(ogg_uint32_t _cpu_info[4],ogg_uint32_t _op){
+  _asm{
+    mov eax,[_op]
+    mov esi,_cpu_info
+    cpuid
+    mov [esi+0],eax
+    mov [esi+4],ebx
+    mov [esi+8],ecx
+    mov [esi+12],edx
+  }
+}
+
+#  define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+  do{ \
+    ogg_uint32_t cpu_info[4]; \
+    oc_cpuid_helper(cpu_info,_op); \
+    (_eax)=cpu_info[0]; \
+    (_ebx)=cpu_info[1]; \
+    (_ecx)=cpu_info[2]; \
+    (_edx)=cpu_info[3]; \
+  }while(0)
+
+static void oc_detect_cpuid_helper(ogg_uint32_t *_eax,ogg_uint32_t *_ebx){
+  _asm{
+    pushfd          /*Saved copy of the flags, restored by the last popfd.*/
+    pushfd
+    pop eax
+    mov ebx,eax     /*ebx: the original flags.*/
+    xor eax,200000h /*Try to flip bit 21 of the flags.*/
+    push eax
+    popfd
+    pushfd
+    pop eax         /*eax: the flags as read back after the attempt.*/
+    popfd
+    mov ecx,_eax
+    mov [ecx],eax   /*The caller treats *_eax==*_ebx as "no cpuid".*/
+    mov ecx,_ebx
+    mov [ecx],ebx
+  }
+}
+# endif
+
+/*Maps the function 1 cpuid feature bits (_edx, _ecx) to OC_CPU_X86_* flags.*/
+static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+  ogg_uint32_t flags;
+  if(!(_edx&0x00800000))return 0;/*If there isn't even MMX, give up.*/
+  flags=OC_CPU_X86_MMX;
+  if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
+  if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
+  if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
+  if(_ecx&0x00000200)flags|=OC_CPU_X86_SSSE3;/*Bit 9; bit 8 is TM2.*/
+  if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
+  if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
+  return flags;
+}
+
+static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+  ogg_uint32_t caps;
+  /*Nothing else is reported for a chip that lacks even MMX.*/
+  if((_edx&0x00800000)==0)return 0;
+  caps=OC_CPU_X86_MMX;
+  if((_edx&0x00400000)!=0)caps|=OC_CPU_X86_MMXEXT;
+  if((_edx&0x80000000)!=0)caps|=OC_CPU_X86_3DNOW;
+  if((_edx&0x40000000)!=0)caps|=OC_CPU_X86_3DNOWEXT;
+  if((_ecx&0x00000040)!=0)caps|=OC_CPU_X86_SSE4A;
+  if((_ecx&0x00000800)!=0)caps|=OC_CPU_X86_SSE5;
+  return caps;
+}
+
+/*Detects the SIMD extensions of the host CPU as a mask of OC_CPU_X86_* flags.*/
+static ogg_uint32_t oc_cpu_flags_get(void){
+  ogg_uint32_t flags;
+  ogg_uint32_t eax;
+  ogg_uint32_t ebx;
+  ogg_uint32_t ecx;
+  ogg_uint32_t edx;
+# if !defined(__amd64__)&&!defined(__x86_64__)
+  /*Not all x86-32 chips support cpuid, so we have to check.*/
+#  if !defined(_MSC_VER)
+  __asm__ __volatile__(
+   "pushfl\n\t"
+   "pushfl\n\t"
+   "popl %[a]\n\t"
+   "movl %[a],%[b]\n\t"
+   "xorl $0x200000,%[a]\n\t"
+   "pushl %[a]\n\t"
+   "popfl\n\t"
+   "pushfl\n\t"
+   "popl %[a]\n\t"
+   "popfl\n\t"
+   :[a]"=r"(eax),[b]"=r"(ebx)
+   :
+   :"cc"
+  );
+#  else
+  oc_detect_cpuid_helper(&eax,&ebx);
+#  endif
+  if(eax==ebx)return 0;/*No cpuid.*/
+# endif
+  cpuid(0,eax,ebx,ecx,edx);
+  /*         l e t n          I e n i          u n e G*/
+  if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
+   /*      6 8 x M          T e n i          u n e G*/
+   ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
+    /*Intel, Transmeta (tested with Crusoe TM5800):*/
+    cpuid(1,eax,ebx,ecx,edx);
+    flags=oc_parse_intel_flags(edx,ecx);
+  }
+  /*              D M A c          i t n e          h t u A*/
+  else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
+   /*      C S N            y b   e          d o e G*/
+   ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
+    /*AMD, Geode:*/
+    cpuid(0x80000000,eax,ebx,ecx,edx);
+    if(eax<0x80000001)flags=0;
+    else{
+      cpuid(0x80000001,eax,ebx,ecx,edx);
+      flags=oc_parse_amd_flags(edx,ecx);
+    }
+    /*Also check for SSE.*/
+    cpuid(1,eax,ebx,ecx,edx);
+    flags|=oc_parse_intel_flags(edx,ecx);
+  }
+  /*Technically some VIA chips can be configured in the BIOS to return any
+     string here the user wants.
+    There is a special detection method that can be used to identify such
+     processors, but in my opinion, if the user really wants to change it, they
+     deserve what they get.*/
+  /*              s l u a          H r u a          t n e C*/
+  else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
+    /*VIA: I only have documentation for the C7 (Esther) and Isaiah chips
+       (thanks to the engineers from Centaur Technology who provided it).
+      These chips support Intel-like cpuid info.
+      The C3-2 (Nehemiah) cores appear to, as well.*/
+    cpuid(1,eax,ebx,ecx,edx);
+    flags=oc_parse_intel_flags(edx,ecx);
+    cpuid(0x80000000,eax,ebx,ecx,edx);/*eax=highest extended function.*/
+    if(eax>=0x80000001){
+      /*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
+        We need to check this even if the Intel test succeeds to pick up 3DNow!
+         support on these processors.
+        Unlike actual AMD processors, we cannot _rely_ on this info, since
+         some cores (e.g., the 693 stepping of the Nehemiah) claim to support
+         this function, yet return edx=0, despite the Intel test indicating
+         MMX support.
+        Therefore the features detected here are strictly added to those
+         detected by the Intel test.*/
+      /*TODO: How about earlier chips?*/
+      cpuid(0x80000001,eax,ebx,ecx,edx);
+      /*Note: As of the C7, this function returns Intel-style extended feature
+         flags, not AMD-style.
+        Currently, this only defines bits 11, 20, and 29 (0x20100800), which
+         do not conflict with any of the AMD flags we inspect.
+        For the remaining bits, Intel tells us, "Do not count on their value",
+         but VIA assures us that they will all be zero (at least on the C7 and
+         Isaiah chips).
+        In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
+         (0xC0C00000) for something else, we will have to add code to detect
+         the model to decide when it is appropriate to inspect them.*/
+      flags|=oc_parse_amd_flags(edx,ecx);
+    }
+  }
+  else{
+    /*Implement me.*/
+    flags=0;
+  }
+  return flags;
+}
+#endif
diff --git a/lib/cpu.h b/lib/cpu.h
new file mode 100644
index 0000000..a43c957
--- /dev/null
+++ b/lib/cpu.h
@@ -0,0 +1,34 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+ function:
+    last mod: $Id: cpu.h 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#if !defined(_x86_cpu_H)
+# define _x86_cpu_H (1)
+#include "internal.h"
+/*One-bit masks, one per x86 instruction-set extension named below.*/
+#define OC_CPU_X86_MMX      (1<<0)
+#define OC_CPU_X86_3DNOW    (1<<1)
+#define OC_CPU_X86_3DNOWEXT (1<<2)
+#define OC_CPU_X86_MMXEXT   (1<<3)
+#define OC_CPU_X86_SSE      (1<<4)
+#define OC_CPU_X86_SSE2     (1<<5)
+#define OC_CPU_X86_PNI      (1<<6)
+#define OC_CPU_X86_SSSE3    (1<<7)
+#define OC_CPU_X86_SSE4_1   (1<<8)
+#define OC_CPU_X86_SSE4_2   (1<<9)
+#define OC_CPU_X86_SSE4A    (1<<10)
+#define OC_CPU_X86_SSE5     (1<<11)
+
+#endif
diff --git a/lib/dct.h b/lib/dct.h
new file mode 100644
index 0000000..24ba6f1
--- /dev/null
+++ b/lib/dct.h
@@ -0,0 +1,31 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: dct.h 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+/*Definitions shared by the forward and inverse DCT transforms.*/
+#if !defined(_dct_H)
+# define _dct_H (1)
+
+/*OC_CiSj is cos(i*pi/16), i.e. sin(j*pi/16), scaled by 65536 and rounded.*/
+#define OC_C1S7 ((ogg_int32_t)64277)
+#define OC_C2S6 ((ogg_int32_t)60547)
+#define OC_C3S5 ((ogg_int32_t)54491)
+#define OC_C4S4 ((ogg_int32_t)46341)
+#define OC_C5S3 ((ogg_int32_t)36410)
+#define OC_C6S2 ((ogg_int32_t)25080)
+#define OC_C7S1 ((ogg_int32_t)12785)
+
+#endif
diff --git a/lib/decapiwrapper.c b/lib/decapiwrapper.c
new file mode 100644
index 0000000..12ea475
--- /dev/null
+++ b/lib/decapiwrapper.c
@@ -0,0 +1,193 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: decapiwrapper.c 13596 2007-08-23 20:05:38Z tterribe $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "apiwrapper.h"
+#include "decint.h"
+#include "theora/theoradec.h"
+
+static void th_dec_api_clear(th_api_wrapper *_api){
+  if(_api->setup!=NULL)th_setup_free(_api->setup);
+  if(_api->decode!=NULL)th_decode_free(_api->decode);
+  memset(_api,0,sizeof(th_api_wrapper)); /*Leave no stale pointers behind.*/
+}
+/*Dispatch "clear" hook: clears the attached theora_info, then zeroes _td.*/
+static void theora_decode_clear(theora_state *_td){
+  if(_td->i)theora_info_clear(_td->i);
+  memset(_td,0,sizeof(theora_state));
+}
+/*Dispatch "control" hook: forwards the request to th_decode_ctl().*/
+static int theora_decode_control(theora_state *_td,int _req,
+ void *_buf,size_t _buf_sz){
+  th_api_wrapper *api=(th_api_wrapper *)_td->i->codec_setup;
+  return th_decode_ctl(api->decode,_req,_buf,_buf_sz);
+}
+/*Dispatch "granule_frame" hook: forwards to th_granule_frame().*/
+static ogg_int64_t theora_decode_granule_frame(theora_state *_td,
+ ogg_int64_t _gp){
+  return th_granule_frame(((th_api_wrapper *)_td->i->codec_setup)->decode,_gp);
+}
+/*Dispatch "granule_time" hook: forwards to th_granule_time().*/
+static double theora_decode_granule_time(theora_state *_td,ogg_int64_t _gp){
+  return th_granule_time(((th_api_wrapper *)_td->i->codec_setup)->decode,_gp);
+}
+/*Dispatch table stored in _td->internal_decode by theora_decode_init().*/
+static const oc_state_dispatch_vtable OC_DEC_DISPATCH_VTBL={
+  (oc_state_clear_func)theora_decode_clear,
+  (oc_state_control_func)theora_decode_control,
+  (oc_state_granule_frame_func)theora_decode_granule_frame,
+  (oc_state_granule_time_func)theora_decode_granule_time,
+};
+/*th_info -> legacy theora_info; 1 is shifted as unsigned as 1<<31 overflows.*/
+static void th_info2theora_info(theora_info *_ci,const th_info *_info){
+  _ci->version_major=_info->version_major;
+  _ci->version_minor=_info->version_minor;
+  _ci->version_subminor=_info->version_subminor;
+  _ci->width=_info->frame_width;         /*Legacy width/height: the frame.*/
+  _ci->height=_info->frame_height;
+  _ci->frame_width=_info->pic_width;     /*Legacy frame_*: the picture.*/
+  _ci->frame_height=_info->pic_height;
+  _ci->offset_x=_info->pic_x;
+  _ci->offset_y=_info->pic_y;
+  _ci->fps_numerator=_info->fps_numerator;
+  _ci->fps_denominator=_info->fps_denominator;
+  _ci->aspect_numerator=_info->aspect_numerator;
+  _ci->aspect_denominator=_info->aspect_denominator;
+  switch(_info->colorspace){
+    case TH_CS_ITU_REC_470M:_ci->colorspace=OC_CS_ITU_REC_470M;break;
+    case TH_CS_ITU_REC_470BG:_ci->colorspace=OC_CS_ITU_REC_470BG;break;
+    default:_ci->colorspace=OC_CS_UNSPECIFIED;break;
+  }
+  switch(_info->pixel_fmt){
+    case TH_PF_420:_ci->pixelformat=OC_PF_420;break;
+    case TH_PF_422:_ci->pixelformat=OC_PF_422;break;
+    case TH_PF_444:_ci->pixelformat=OC_PF_444;break;
+    default:_ci->pixelformat=OC_PF_RSVD;break;
+  }
+  _ci->target_bitrate=_info->target_bitrate;
+  _ci->quality=_info->quality;
+  _ci->keyframe_frequency_force=(ogg_uint32_t)1<<_info->keyframe_granule_shift;
+}
+
+/*Legacy API: sets up _td to decode the stream whose headers were parsed into
+   _ci; returns 0, or OC_FAULT (missing argument or header state, or out of
+   memory), or OC_EINVAL (th_decode_alloc() failed).*/
+int theora_decode_init(theora_state *_td,theora_info *_ci){
+  th_api_info    *apiinfo;
+  th_api_wrapper *api;
+  th_info         info;
+  if(_td==NULL||_ci==NULL||_ci->codec_setup==NULL)return OC_FAULT;
+  api=(th_api_wrapper *)_ci->codec_setup;
+  /*Allocate our own combined API wrapper/theora_info struct.
+    Both go in one malloc'd block, so freeing the API wrapper frees the info
+     struct too and nobody has to work out whether it needs freeing.*/
+  apiinfo=(th_api_info *)_ogg_calloc(1,sizeof(*apiinfo));
+  if(apiinfo==NULL)return OC_FAULT;
+  /*Make our own copy of the info struct, since its lifetime should be
+     independent of the one we were passed in.*/
+  *&apiinfo->info=*_ci;
+  /*Convert the info struct now instead of saving the one we decoded with
+     theora_decode_header(), since the user might have modified values (color
+     space, aspect ratio, etc.) or be doing something "clever" with headers.*/
+  oc_theora_info2th_info(&info,_ci);
+  /*Don't bother to copy the setup info; th_decode_alloc() makes its own copy
+     of the stuff it needs.*/
+  apiinfo->api.decode=th_decode_alloc(&info,api->setup);
+  if(apiinfo->api.decode==NULL){
+    _ogg_free(apiinfo);
+    return OC_EINVAL;
+  }
+  apiinfo->api.clear=(oc_setup_clear_func)th_dec_api_clear;
+  _td->internal_encode=NULL;
+  /*Provide entry points for ABI compatibility with old decoder shared libs.*/
+  _td->internal_decode=(void *)&OC_DEC_DISPATCH_VTBL;
+  _td->granulepos=0;
+  _td->i=&apiinfo->info;
+  _td->i->codec_setup=&apiinfo->api;
+  return 0;
+}
+
+int theora_decode_header(theora_info *_ci,theora_comment *_cc,ogg_packet *_op){
+  th_api_wrapper *wrapper;
+  th_info         info;
+  int             status;
+  /*The API wrapper is created on first use: unlike the one inside a
+     theora_state, it is not allocated together with a theora_info.*/
+  if(_ci->codec_setup==NULL){
+    wrapper=(th_api_wrapper *)_ogg_calloc(1,sizeof(*wrapper));
+    if(wrapper==NULL)return OC_FAULT;
+    wrapper->clear=(oc_setup_clear_func)th_dec_api_clear;
+    _ci->codec_setup=wrapper;
+  }
+  else wrapper=(th_api_wrapper *)_ci->codec_setup;
+  /*Rebuild the th_info from _ci on every call rather than keeping one of our
+     own between calls.
+    A caller that is not using Ogg encapsulation may be doing something
+     "clever" with the header packets, and that must keep working.*/
+  oc_theora_info2th_info(&info,_ci);
+  /*The cast is only valid because theora_comment and th_comment have
+     identical layouts.
+    Do not change one without the other (or without changing this code)!*/
+  status=th_decode_headerin(&info,(th_comment *)_cc,&wrapper->setup,_op);
+  /*Errors are passed straight through, which relies on both APIs sharing the
+     same error code values and the same set of possible errors.
+    Note that theora_decode_header() really can return OC_NOTFORMAT, even
+     though it is not currently documented to do so.*/
+  if(status<0)return status;
+  /*Any non-negative result is reported to the legacy caller as 0, after
+     copying the (possibly updated) fields back into _ci.*/
+  th_info2theora_info(_ci,&info);
+  return 0;
+}
+
+int theora_decode_packetin(theora_state *_td,ogg_packet *_op){
+  th_api_wrapper *api;
+  ogg_int64_t     granpos;
+  /*Every th_decode_packetin() failure is reported as OC_BADPACKET.*/
+  if(_td==NULL||_td->i==NULL||_td->i->codec_setup==NULL)return OC_FAULT;
+  api=(th_api_wrapper *)_td->i->codec_setup;
+  if(th_decode_packetin(api->decode,_op,&granpos)<0)return OC_BADPACKET;
+  /*Only publish the granule position once the packet has been accepted.*/
+  _td->granulepos=granpos;
+  return 0;
+}
+
+int theora_decode_YUVout(theora_state *_td,yuv_buffer *_yuv){
+  th_api_wrapper  *api;
+  th_dec_ctx      *dec;
+  th_ycbcr_buffer  planes;
+  int              status;
+  if(_td==NULL||_td->i==NULL||_td->i->codec_setup==NULL)return OC_FAULT;
+  api=(th_api_wrapper *)_td->i->codec_setup;
+  dec=(th_dec_ctx *)api->decode;
+  if(dec==NULL)return OC_FAULT;
+  status=th_decode_ycbcr_out(dec,planes);
+  /*On failure, hand the error back and leave *_yuv untouched.*/
+  if(status<0)return status;
+  _yuv->y_width=planes[0].width;
+  _yuv->y_height=planes[0].height;
+  _yuv->y_stride=planes[0].stride;
+  _yuv->y=planes[0].data;
+  _yuv->uv_width=planes[1].width;
+  _yuv->uv_height=planes[1].height;
+  _yuv->uv_stride=planes[1].stride;
+  _yuv->u=planes[1].data;
+  _yuv->v=planes[2].data;
+  return status;
+}
diff --git a/lib/decinfo.c b/lib/decinfo.c
new file mode 100644
index 0000000..845eb13
--- /dev/null
+++ b/lib/decinfo.c
@@ -0,0 +1,246 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: decinfo.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "decint.h"
+
+
+
+/*Reads a run of octets out of the pack buffer and stores them in a byte
+   array.
+  No checking is done to ensure the buffer contains enough data.
+  _opb: The pack buffer to read the octets from.
+  _buf: The byte array to store the unpacked bytes in.
+  _len: The number of octets to unpack.*/
+static void oc_unpack_octets(oc_pack_buf *_opb,char *_buf,size_t _len){
+  size_t i;
+  for(i=0;i<_len;i++){
+    _buf[i]=(char)oc_pack_read(_opb,8);
+  }
+}
+
+/*Unpacks a 32-bit little-endian integer; negative if it won't fit in a long.*/
+static long oc_unpack_length(oc_pack_buf *_opb){
+  unsigned long ret; /*Unsigned: an octet>=128 shifted by 24 overflows long.*/
+  int           i;
+  for(ret=0,i=0;i<4;i++)ret|=(unsigned long)oc_pack_read(_opb,8)<<(i<<3);
+  return ret>(unsigned long)LONG_MAX?-1:(long)ret;
+}
+
+/*Unpacks the body of the info header, field by field in bitstream order.
+  No checking is done along the way for running out of data; that is only
+   tested once, at the very end.
+  On failure, _info may have been partially filled in.
+  _opb:  The pack buffer; oc_dec_headerin() has already consumed the packet
+          type and the codec string.
+  _info: The structure to store the unpacked fields in.
+  Return: 0 on success, TH_EVERSION for a bitstream version we cannot parse,
+   or TH_EBADHEADER for an invalid field or a packet that ran out of data.*/
+static int oc_info_unpack(oc_pack_buf *_opb,th_info *_info){
+  /*Check the codec bitstream version: three octets.*/
+  _info->version_major=(unsigned char)oc_pack_read(_opb,8);
+  _info->version_minor=(unsigned char)oc_pack_read(_opb,8);
+  _info->version_subminor=(unsigned char)oc_pack_read(_opb,8);
+  /*Verify we can parse this bitstream version.
+    We accept earlier minors and all subminors, by spec.*/
+  if(_info->version_major>TH_VERSION_MAJOR||
+   _info->version_major==TH_VERSION_MAJOR&&
+   _info->version_minor>TH_VERSION_MINOR){
+    return TH_EVERSION;
+  }
+  /*Read the encoded frame description.
+    The frame dimensions are 16-bit fields, scaled up by 16 here.*/
+  _info->frame_width=(ogg_uint32_t)oc_pack_read(_opb,16)<<4;
+  _info->frame_height=(ogg_uint32_t)oc_pack_read(_opb,16)<<4;
+  /*The picture dimensions are 24-bit fields and its offsets 8-bit ones.*/
+  _info->pic_width=(ogg_uint32_t)oc_pack_read(_opb,24);
+  _info->pic_height=(ogg_uint32_t)oc_pack_read(_opb,24);
+  _info->pic_x=(ogg_uint32_t)oc_pack_read(_opb,8);
+  _info->pic_y=(ogg_uint32_t)oc_pack_read(_opb,8);
+  /*The frame rate is a pair of 32-bit fields.*/
+  _info->fps_numerator=(ogg_uint32_t)oc_pack_read(_opb,32);
+  _info->fps_denominator=(ogg_uint32_t)oc_pack_read(_opb,32);
+  /*Reject an empty frame, a picture region that does not fit inside the
+     frame, and a zero in either term of the frame rate.*/
+  if(_info->frame_width==0||_info->frame_height==0||
+   _info->pic_width+_info->pic_x>_info->frame_width||
+   _info->pic_height+_info->pic_y>_info->frame_height||
+   _info->fps_numerator==0||_info->fps_denominator==0){
+    return TH_EBADHEADER;
+  }
+  /*Note: The sense of pic_y is inverted in what we pass back to the
+     application compared to how it is stored in the bitstream.
+    This is because the bitstream uses a right-handed coordinate system, while
+     applications expect a left-handed one.*/
+  _info->pic_y=_info->frame_height-_info->pic_height-_info->pic_y;
+  /*The aspect ratio is a pair of 24-bit fields.*/
+  _info->aspect_numerator=(ogg_uint32_t)oc_pack_read(_opb,24);
+  _info->aspect_denominator=(ogg_uint32_t)oc_pack_read(_opb,24);
+  /*An 8-bit color space, then a 24-bit target bitrate.*/
+  _info->colorspace=(th_colorspace)oc_pack_read(_opb,8);
+  _info->target_bitrate=(int)oc_pack_read(_opb,24);
+  /*A 6-bit quality, then a 5-bit keyframe granule shift.*/
+  _info->quality=(int)oc_pack_read(_opb,6);
+  _info->keyframe_granule_shift=(int)oc_pack_read(_opb,5);
+  /*A 2-bit pixel format, one value of which is reserved.*/
+  _info->pixel_fmt=(th_pixel_fmt)oc_pack_read(_opb,2);
+  if(_info->pixel_fmt==TH_PF_RSVD)return TH_EBADHEADER;
+  /*The last 3 bits must be zero, and oc_pack_bytes_left() must not have gone
+     negative.*/
+  if(oc_pack_read(_opb,3)!=0||oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
+  return 0;
+}
+
+/*Unpacks the comment header; on failure the caller runs th_comment_clear().*/
+static int oc_comment_unpack(oc_pack_buf *_opb,th_comment *_tc){
+  long len;
+  int  i;
+  len=oc_unpack_length(_opb);                    /*Vendor string first.*/
+  if(len<0||len>oc_pack_bytes_left(_opb))return TH_EBADHEADER;
+  _tc->vendor=_ogg_malloc((size_t)len+1);
+  if(_tc->vendor==NULL)return TH_EFAULT;
+  oc_unpack_octets(_opb,_tc->vendor,len);
+  _tc->vendor[len]='\0';
+  len=_tc->comments=(int)oc_unpack_length(_opb); /*User comment count.*/
+  if(len<0||len>(LONG_MAX>>2)||len<<2>oc_pack_bytes_left(_opb)){
+    _tc->comments=0;
+    return TH_EBADHEADER;
+  }
+  _tc->comment_lengths=(int *)_ogg_malloc(len*sizeof(*_tc->comment_lengths));
+  _tc->user_comments=(char **)_ogg_malloc(len*sizeof(*_tc->user_comments));
+  if(len>0&&(_tc->comment_lengths==NULL||_tc->user_comments==NULL)){
+    _tc->comments=0;      /*Neither array may be indexed by the loop below.*/
+    return TH_EFAULT;
+  }
+  for(i=0;i<_tc->comments;i++){
+    len=oc_unpack_length(_opb);
+    if(len<0||len>oc_pack_bytes_left(_opb)){
+      _tc->comments=i;
+      return TH_EBADHEADER;
+    }
+    _tc->comment_lengths[i]=len;
+    _tc->user_comments[i]=_ogg_malloc((size_t)len+1);
+    if(_tc->user_comments[i]==NULL){
+      _tc->comments=i;
+      return TH_EFAULT;
+    }
+    oc_unpack_octets(_opb,_tc->user_comments[i],len);
+    _tc->user_comments[i][len]='\0';
+  }
+  return oc_pack_bytes_left(_opb)<0?TH_EBADHEADER:0;
+}
+
+static int oc_setup_unpack(oc_pack_buf *_opb,th_setup_info *_setup){
+  int err;
+  /*The quantizer tables come first in the setup header...*/
+  err=oc_quant_params_unpack(_opb,&_setup->qinfo);
+  /*...and the Huffman trees are only read if those unpacked cleanly.*/
+  if(err>=0)err=oc_huff_trees_unpack(_opb,_setup->huff_tables);
+  return err;
+}
+/*Clears the contents of a setup struct; the struct itself is not freed here.*/
+static void oc_setup_clear(th_setup_info *_setup){
+  oc_quant_params_clear(&_setup->qinfo);
+  oc_huff_trees_clear(_setup->huff_tables);
+}
+
+/*Parses one header packet.
+  Returns 3, 2 or 1 after the info, comment or setup header respectively, 0 at
+   a data packet once all three have been seen, or a negative error code.*/
+static int oc_dec_headerin(oc_pack_buf *_opb,th_info *_info,
+ th_comment *_tc,th_setup_info **_setup,ogg_packet *_op){
+  char buffer[6];
+  int  packtype;
+  int  ret;
+  packtype=(int)oc_pack_read(_opb,8);
+  /*If we're at a data packet and we have received all three headers, we're
+     done; _tc and _setup are not validated until later, so guard them here.*/
+  if(!(packtype&0x80)&&_info->frame_width>0&&
+   _tc!=NULL&&_tc->vendor!=NULL&&_setup!=NULL&&*_setup!=NULL){
+    return 0;
+  }
+  /*Check the codec string.*/
+  oc_unpack_octets(_opb,buffer,6);
+  if(memcmp(buffer,"theora",6)!=0)return TH_ENOTFORMAT;
+  switch(packtype){
+    /*Codec info header.*/
+    case 0x80:{
+      /*This should be the first packet, and we should not already be
+         initialized.*/
+      if(!_op->b_o_s||_info->frame_width>0)return TH_EBADHEADER;
+      ret=oc_info_unpack(_opb,_info);
+      if(ret<0)th_info_clear(_info);
+      else ret=3;
+    }break;
+    /*Comment header.*/
+    case 0x81:{
+      if(_tc==NULL)return TH_EFAULT;
+      /*We should have already decoded the info header, and should not yet have
+         decoded the comment header.*/
+      if(_info->frame_width==0||_tc->vendor!=NULL)return TH_EBADHEADER;
+      ret=oc_comment_unpack(_opb,_tc);
+      if(ret<0)th_comment_clear(_tc);
+      else ret=2;
+    }break;
+    /*Codec setup header.*/
+    case 0x82:{
+      oc_setup_info *setup;
+      if(_tc==NULL||_setup==NULL)return TH_EFAULT;
+      /*We should have already decoded the info header and the comment header,
+         and should not yet have decoded the setup header.*/
+      if(_info->frame_width==0||_tc->vendor==NULL||*_setup!=NULL){
+        return TH_EBADHEADER;
+      }
+      setup=(oc_setup_info *)_ogg_calloc(1,sizeof(*setup));
+      if(setup==NULL)return TH_EFAULT;
+      ret=oc_setup_unpack(_opb,setup);
+      if(ret<0){
+        oc_setup_clear(setup);
+        _ogg_free(setup);
+      }
+      else{
+        *_setup=setup;
+        ret=1;
+      }
+    }break;
+    /*We don't know what this header is.*/
+    default:return TH_EBADHEADER;
+  }
+  return ret;
+}
+
+
+/*Decodes one header packet.
+  Call this with each packet at the start of the stream, in order, until it
+   returns 0; a positive return means a header packet was consumed.*/
+int th_decode_headerin(th_info *_info,th_comment *_tc,
+ th_setup_info **_setup,ogg_packet *_op){
+  oc_pack_buf pack;
+  if(_op==NULL)return TH_EBADHEADER;
+  if(_info==NULL)return TH_EFAULT;
+  oc_pack_readinit(&pack,_op->packet,_op->bytes);
+  return oc_dec_headerin(&pack,_info,_tc,_setup,_op);
+}
+
+void th_setup_free(th_setup_info *_setup){
+  /*Like free(), a NULL pointer is accepted and ignored.*/
+  if(_setup==NULL)return;
+  oc_setup_clear(_setup);
+  _ogg_free(_setup);
+}
diff --git a/lib/decint.h b/lib/decint.h
new file mode 100644
index 0000000..261b676
--- /dev/null
+++ b/lib/decint.h
@@ -0,0 +1,107 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: decint.h 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#include <limits.h>
+#if !defined(_decint_H)
+# define _decint_H (1)
+# include "theora/theoradec.h"
+# include "internal.h"
+# include "bitpack.h"
+
+typedef struct th_setup_info oc_setup_info;
+typedef struct th_dec_ctx    oc_dec_ctx;
+
+# include "huffdec.h"
+# include "dequant.h"
+
+/*Constants for the packet-in state machine specific to the decoder.*/
+
+/*Next packet to read: Data packet.*/
+#define OC_PACKET_DATA (0)
+
+
+/*The contents of a decoded setup header.*/
+struct th_setup_info{
+  /*The Huffman codes.*/
+  oc_huff_node      *huff_tables[TH_NHUFFMAN_TABLES];
+  /*The quantization parameters.*/
+  th_quant_info  qinfo;
+};
+
+
+/*The decoder context (what a th_dec_ctx pointer refers to).*/
+struct th_dec_ctx{
+  /*Shared encoder/decoder state.*/
+  oc_theora_state      state;
+  /*Whether or not packets are ready to be emitted.
+    This takes on negative values while there are remaining header packets to
+     be emitted, reaches 0 when the codec is ready for input, and goes to 1
+     when a frame has been processed and a data packet is ready.*/
+  int                  packet_state;
+  /*Buffer in which to assemble packets.*/
+  oc_pack_buf          opb;
+  /*Huffman decode trees.*/
+  oc_huff_node        *huff_tables[TH_NHUFFMAN_TABLES];
+  /*The index of the first token in each plane for each coefficient.*/
+  ptrdiff_t            ti0[3][64];
+  /*The number of outstanding EOB runs at the start of each coefficient in each
+     plane.*/
+  ptrdiff_t            eob_runs[3][64];
+  /*The DCT token lists.*/
+  unsigned char       *dct_tokens;
+  /*The extra bits associated with DCT tokens.*/
+  unsigned char       *extra_bits;
+  /*The number of DCT tokens unpacked so far.*/
+  int                  dct_tokens_count;
+  /*The out-of-loop post-processing level.*/
+  int                  pp_level;
+  /*The DC scale used for out-of-loop deblocking.*/
+  int                  pp_dc_scale[64];
+  /*The sharpen modifier used for out-of-loop deringing.*/
+  int                  pp_sharp_mod[64];
+  /*The DC quantization index of each block.*/
+  unsigned char       *dc_qis;
+  /*The variance of each block.*/
+  int                 *variances;
+  /*The storage for the post-processed frame buffer.*/
+  unsigned char       *pp_frame_data;
+  /*Whether or not the post-processed frame buffer has space for chroma.*/
+  int                  pp_frame_state;
+  /*The buffer used for the post-processed frame.
+    Note that this is _not_ guaranteed to have the same strides and offsets as
+     the reference frame buffers.*/
+  th_ycbcr_buffer      pp_frame_buf;
+  /*The striped decode callback function.*/
+  th_stripe_callback   stripe_cb;
+# if defined(HAVE_CAIRO)
+  /*Output metrics for debugging.*/
+  int                  telemetry;
+  int                  telemetry_mbmode;
+  int                  telemetry_mv;
+  int                  telemetry_qi;
+  int                  telemetry_bits;
+  int                  telemetry_frame_bytes;
+  int                  telemetry_coding_bytes;
+  int                  telemetry_mode_bytes;
+  int                  telemetry_mv_bytes;
+  int                  telemetry_qi_bytes;
+  int                  telemetry_dc_bytes;
+  unsigned char       *telemetry_frame_data;
+# endif
+};
+
+#endif
diff --git a/lib/decode.c b/lib/decode.c
new file mode 100644
index 0000000..7be6646
--- /dev/null
+++ b/lib/decode.c
@@ -0,0 +1,2943 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: decode.c 16581 2009-09-25 22:56:16Z gmaxwell $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <ogg/ogg.h>
+#include "decint.h"
+#if defined(OC_DUMP_IMAGES)
+# include <stdio.h>
+# include "png.h"
+#endif
+#if defined(HAVE_CAIRO)
+# include <cairo.h>
+#endif
+
+/*Post-processing levels, from none (0) up to OC_PP_LEVEL_MAX (7).*/
+/*No post-processing.*/
+#define OC_PP_LEVEL_DISABLED  (0)
+/*Keep track of DC qi for each block only.*/
+#define OC_PP_LEVEL_TRACKDCQI (1)
+/*Deblock the luma plane.*/
+#define OC_PP_LEVEL_DEBLOCKY  (2)
+/*Dering the luma plane.*/
+#define OC_PP_LEVEL_DERINGY   (3)
+/*Stronger luma plane deringing.*/
+#define OC_PP_LEVEL_SDERINGY  (4)
+/*Deblock the chroma planes.*/
+#define OC_PP_LEVEL_DEBLOCKC  (5)
+/*Dering the chroma planes.*/
+#define OC_PP_LEVEL_DERINGC   (6)
+/*Stronger chroma plane deringing.*/
+#define OC_PP_LEVEL_SDERINGC  (7)
+/*Maximum valid post-processing level.*/
+#define OC_PP_LEVEL_MAX       (7)
+
+
+
+/*The mode alphabets for the various mode coding schemes.
+  Scheme 0 uses a custom alphabet, which is not stored in this table.*/
+static const unsigned char OC_MODE_ALPHABETS[7][OC_NMODES]={
+  /*Last MV dominates.*/
+  {
+    OC_MODE_INTER_MV_LAST,OC_MODE_INTER_MV_LAST2,OC_MODE_INTER_MV,
+    OC_MODE_INTER_NOMV,OC_MODE_INTRA,OC_MODE_GOLDEN_NOMV,OC_MODE_GOLDEN_MV,
+    OC_MODE_INTER_MV_FOUR
+  },
+  {
+    OC_MODE_INTER_MV_LAST,OC_MODE_INTER_MV_LAST2,OC_MODE_INTER_NOMV,
+    OC_MODE_INTER_MV,OC_MODE_INTRA,OC_MODE_GOLDEN_NOMV,OC_MODE_GOLDEN_MV,
+    OC_MODE_INTER_MV_FOUR
+  },
+  {
+    OC_MODE_INTER_MV_LAST,OC_MODE_INTER_MV,OC_MODE_INTER_MV_LAST2,
+    OC_MODE_INTER_NOMV,OC_MODE_INTRA,OC_MODE_GOLDEN_NOMV,OC_MODE_GOLDEN_MV,
+    OC_MODE_INTER_MV_FOUR
+  },
+  {
+    OC_MODE_INTER_MV_LAST,OC_MODE_INTER_MV,OC_MODE_INTER_NOMV,
+    OC_MODE_INTER_MV_LAST2,OC_MODE_INTRA,OC_MODE_GOLDEN_NOMV,
+    OC_MODE_GOLDEN_MV,OC_MODE_INTER_MV_FOUR
+  },
+  /*No MV dominates.*/
+  {
+    OC_MODE_INTER_NOMV,OC_MODE_INTER_MV_LAST,OC_MODE_INTER_MV_LAST2,
+    OC_MODE_INTER_MV,OC_MODE_INTRA,OC_MODE_GOLDEN_NOMV,OC_MODE_GOLDEN_MV,
+    OC_MODE_INTER_MV_FOUR
+  },
+  {
+    OC_MODE_INTER_NOMV,OC_MODE_GOLDEN_NOMV,OC_MODE_INTER_MV_LAST,
+    OC_MODE_INTER_MV_LAST2,OC_MODE_INTER_MV,OC_MODE_INTRA,OC_MODE_GOLDEN_MV,
+    OC_MODE_INTER_MV_FOUR
+  },
+  /*Default ordering.*/
+  {
+    OC_MODE_INTER_NOMV,OC_MODE_INTRA,OC_MODE_INTER_MV,OC_MODE_INTER_MV_LAST,
+    OC_MODE_INTER_MV_LAST2,OC_MODE_GOLDEN_NOMV,OC_MODE_GOLDEN_MV,
+    OC_MODE_INTER_MV_FOUR
+  }
+};
+
+
+/*The original DCT tokens are extended and reordered during the construction of
+   the Huffman tables.
+  The extension means more bits can be read with fewer calls to the bitpacker
+   during the Huffman decoding process (at the cost of larger Huffman tables),
+   and fewer tokens require additional extra bits (reducing the average storage
+   per decoded token).
+  The revised ordering reveals essential information in the token value
+   itself; specifically, whether or not there are additional extra bits to read
+   and the parameter to which those extra bits are applied.
+  The token is used to fetch a code word from the OC_DCT_CODE_WORD table below.
+  The extra bits are added into the code word at the bit position inferred
+   from the token value, giving the final code word from which all required
+   parameters are derived.
+  The number of EOBs and the leading zero run length can be extracted directly.
+  The coefficient magnitude is optionally negated before extraction, according
+   to a 'flip' bit.*/
+
+/*The number of additional extra bits that are decoded with each of the
+   internal DCT tokens that need any (the first 15).*/
+static const unsigned char OC_INTERNAL_DCT_TOKEN_EXTRA_BITS[15]={
+  12,4,3,3,4,4,5,5,8,8,8,8,3,3,6
+};
+
+/*Whether or not an internal token (any expression) needs more extra bits.*/
+#define OC_DCT_TOKEN_NEEDS_MORE(_token) \
+ ((_token)<(sizeof(OC_INTERNAL_DCT_TOKEN_EXTRA_BITS)/ \
+  sizeof(*OC_INTERNAL_DCT_TOKEN_EXTRA_BITS)))
+
+/*This token (OC_DCT_REPEAT_RUN3_TOKEN) requires more than 8 extra bits.*/
+#define OC_DCT_TOKEN_FAT_EOB (0)
+
+/*The number of EOBs to use for an end-of-frame token.
+  Note: We want to set eobs to PTRDIFF_MAX here, but that requires C99, which
+   is not yet available everywhere; this should be equivalent.*/
+#define OC_DCT_EOB_FINISH (~(size_t)0>>1)
+
+/*The location of the (6) run length bits in the code word.
+  These are placed at index 0 and given 8 bits (even though 6 would suffice)
+   because it may be faster to extract the lower byte on some platforms.*/
+#define OC_DCT_CW_RLEN_SHIFT (0)
+/*The location of the (12) EOB bits in the code word.*/
+#define OC_DCT_CW_EOB_SHIFT  (8)
+/*The location of the (1) flip bit in the code word.
+  This must be right under the magnitude bits.*/
+#define OC_DCT_CW_FLIP_BIT   (20)
+/*The location of the (11) token magnitude bits in the code word.
+  These must be last, and rely on a sign-extending right shift.*/
+#define OC_DCT_CW_MAG_SHIFT  (21)
+
+/*Pack the given fields into a code word; _mag is stored as _mag-_flip.*/
+#define OC_DCT_CW_PACK(_eobs,_rlen,_mag,_flip) \
+ ((_eobs)<<OC_DCT_CW_EOB_SHIFT| \
+ (_rlen)<<OC_DCT_CW_RLEN_SHIFT| \
+ (_flip)<<OC_DCT_CW_FLIP_BIT| \
+ (_mag)-(_flip)<<OC_DCT_CW_MAG_SHIFT)
+
+/*A special code word value that signals the end of the frame (a long EOB run
+   of zero).*/
+#define OC_DCT_CW_FINISH (0)
+
+/*The position at which to insert the extra bits in the code word:
+   OC_DCT_CW_EOB_SHIFT for tokens 0-1, OC_DCT_CW_MAG_SHIFT for tokens 2-11, and
+   0 (OC_DCT_CW_RLEN_SHIFT, which is assumed to be zero) for everything else.
+  Each comparison yields 0 or 1; negating it gives an all-zeros or (in two's
+   complement) all-ones mask that selects a term without a branch.
+  We use this formulation because Intel has no useful cmov: it takes 11
+   instructions(!), yet is _still_ faster than a table lookup (just barely) or
+   the naive double-ternary version (which gcc turns into a jump and a cmov).*/
+#define OC_DCT_TOKEN_EB_POS(_token) \
+ ((OC_DCT_CW_EOB_SHIFT-OC_DCT_CW_MAG_SHIFT&-((_token)<2)) \
+ +(OC_DCT_CW_MAG_SHIFT&-((_token)<12)))
+
+/*The code words for each internal token.
+  See the notes at OC_DCT_TOKEN_MAP for the reasons why things are out of
+   order.
+  The table is indexed by internal token value: the coefficient unpackers look
+   up OC_DCT_CODE_WORD[token] and add the extra bits, shifted into place by
+   OC_DCT_TOKEN_EB_POS(), to form the final code word.
+  NOTE(review): judging from how the unpackers take a code word apart, the
+   OC_DCT_CW_PACK arguments appear to be (EOB run, zero run length, magnitude,
+   flip); confirm against the macro definition.*/
+static const ogg_int32_t OC_DCT_CODE_WORD[92]={
+  /*These tokens require additional extra bits for the EOB count.*/
+  /*OC_DCT_REPEAT_RUN3_TOKEN (12 extra bits)*/
+  OC_DCT_CW_FINISH,
+  /*OC_DCT_REPEAT_RUN2_TOKEN (4 extra bits)*/
+  OC_DCT_CW_PACK(16, 0,  0,0),
+  /*These tokens require additional extra bits for the magnitude.*/
+  /*OC_DCT_VAL_CAT5 (4 extra bits-1 already read)*/
+  OC_DCT_CW_PACK( 0, 0, 13,0),
+  OC_DCT_CW_PACK( 0, 0, 13,1),
+  /*OC_DCT_VAL_CAT6 (5 extra bits-1 already read)*/
+  OC_DCT_CW_PACK( 0, 0, 21,0),
+  OC_DCT_CW_PACK( 0, 0, 21,1),
+  /*OC_DCT_VAL_CAT7 (6 extra bits-1 already read)*/
+  OC_DCT_CW_PACK( 0, 0, 37,0),
+  OC_DCT_CW_PACK( 0, 0, 37,1),
+  /*OC_DCT_VAL_CAT8 (10 extra bits-2 already read)*/
+  OC_DCT_CW_PACK( 0, 0, 69,0),
+  OC_DCT_CW_PACK( 0, 0,325,0),
+  OC_DCT_CW_PACK( 0, 0, 69,1),
+  OC_DCT_CW_PACK( 0, 0,325,1),
+  /*These tokens require additional extra bits for the run length.*/
+  /*OC_DCT_RUN_CAT1C (4 extra bits-1 already read)*/
+  OC_DCT_CW_PACK( 0,10, +1,0),
+  OC_DCT_CW_PACK( 0,10, -1,0),
+  /*OC_DCT_ZRL_TOKEN (6 extra bits)
+    Flip is set to distinguish this from OC_DCT_CW_FINISH.*/
+  OC_DCT_CW_PACK( 0, 0,  0,1),
+  /*The remaining tokens require no additional extra bits.*/
+  /*OC_DCT_EOB1_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 1, 0,  0,0),
+  /*OC_DCT_EOB2_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 2, 0,  0,0),
+  /*OC_DCT_EOB3_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 3, 0,  0,0),
+  /*OC_DCT_RUN_CAT1A (1 extra bit-1 already read)x5*/
+  OC_DCT_CW_PACK( 0, 1, +1,0),
+  OC_DCT_CW_PACK( 0, 1, -1,0),
+  OC_DCT_CW_PACK( 0, 2, +1,0),
+  OC_DCT_CW_PACK( 0, 2, -1,0),
+  OC_DCT_CW_PACK( 0, 3, +1,0),
+  OC_DCT_CW_PACK( 0, 3, -1,0),
+  OC_DCT_CW_PACK( 0, 4, +1,0),
+  OC_DCT_CW_PACK( 0, 4, -1,0),
+  OC_DCT_CW_PACK( 0, 5, +1,0),
+  OC_DCT_CW_PACK( 0, 5, -1,0),
+  /*OC_DCT_RUN_CAT2A (2 extra bits-2 already read)*/
+  OC_DCT_CW_PACK( 0, 1, +2,0),
+  OC_DCT_CW_PACK( 0, 1, +3,0),
+  OC_DCT_CW_PACK( 0, 1, -2,0),
+  OC_DCT_CW_PACK( 0, 1, -3,0),
+  /*OC_DCT_RUN_CAT1B (3 extra bits-3 already read)*/
+  OC_DCT_CW_PACK( 0, 6, +1,0),
+  OC_DCT_CW_PACK( 0, 7, +1,0),
+  OC_DCT_CW_PACK( 0, 8, +1,0),
+  OC_DCT_CW_PACK( 0, 9, +1,0),
+  OC_DCT_CW_PACK( 0, 6, -1,0),
+  OC_DCT_CW_PACK( 0, 7, -1,0),
+  OC_DCT_CW_PACK( 0, 8, -1,0),
+  OC_DCT_CW_PACK( 0, 9, -1,0),
+  /*OC_DCT_RUN_CAT2B (3 extra bits-3 already read)*/
+  OC_DCT_CW_PACK( 0, 2, +2,0),
+  OC_DCT_CW_PACK( 0, 3, +2,0),
+  OC_DCT_CW_PACK( 0, 2, +3,0),
+  OC_DCT_CW_PACK( 0, 3, +3,0),
+  OC_DCT_CW_PACK( 0, 2, -2,0),
+  OC_DCT_CW_PACK( 0, 3, -2,0),
+  OC_DCT_CW_PACK( 0, 2, -3,0),
+  OC_DCT_CW_PACK( 0, 3, -3,0),
+  /*OC_DCT_SHORT_ZRL_TOKEN (3 extra bits-3 already read)
+    Flip is set on the first one to distinguish it from OC_DCT_CW_FINISH.*/
+  OC_DCT_CW_PACK( 0, 0,  0,1),
+  OC_DCT_CW_PACK( 0, 1,  0,0),
+  OC_DCT_CW_PACK( 0, 2,  0,0),
+  OC_DCT_CW_PACK( 0, 3,  0,0),
+  OC_DCT_CW_PACK( 0, 4,  0,0),
+  OC_DCT_CW_PACK( 0, 5,  0,0),
+  OC_DCT_CW_PACK( 0, 6,  0,0),
+  OC_DCT_CW_PACK( 0, 7,  0,0),
+  /*OC_ONE_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 0, 0, +1,0),
+  /*OC_MINUS_ONE_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 0, 0, -1,0),
+  /*OC_TWO_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 0, 0, +2,0),
+  /*OC_MINUS_TWO_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 0, 0, -2,0),
+  /*OC_DCT_VAL_CAT2 (1 extra bit-1 already read)x4*/
+  OC_DCT_CW_PACK( 0, 0, +3,0),
+  OC_DCT_CW_PACK( 0, 0, -3,0),
+  OC_DCT_CW_PACK( 0, 0, +4,0),
+  OC_DCT_CW_PACK( 0, 0, -4,0),
+  OC_DCT_CW_PACK( 0, 0, +5,0),
+  OC_DCT_CW_PACK( 0, 0, -5,0),
+  OC_DCT_CW_PACK( 0, 0, +6,0),
+  OC_DCT_CW_PACK( 0, 0, -6,0),
+  /*OC_DCT_VAL_CAT3 (2 extra bits-2 already read)*/
+  OC_DCT_CW_PACK( 0, 0, +7,0),
+  OC_DCT_CW_PACK( 0, 0, +8,0),
+  OC_DCT_CW_PACK( 0, 0, -7,0),
+  OC_DCT_CW_PACK( 0, 0, -8,0),
+  /*OC_DCT_VAL_CAT4 (3 extra bits-3 already read)*/
+  OC_DCT_CW_PACK( 0, 0, +9,0),
+  OC_DCT_CW_PACK( 0, 0,+10,0),
+  OC_DCT_CW_PACK( 0, 0,+11,0),
+  OC_DCT_CW_PACK( 0, 0,+12,0),
+  OC_DCT_CW_PACK( 0, 0, -9,0),
+  OC_DCT_CW_PACK( 0, 0,-10,0),
+  OC_DCT_CW_PACK( 0, 0,-11,0),
+  OC_DCT_CW_PACK( 0, 0,-12,0),
+  /*OC_DCT_REPEAT_RUN1_TOKEN (3 extra bits-3 already read)*/
+  OC_DCT_CW_PACK( 8, 0,  0,0),
+  OC_DCT_CW_PACK( 9, 0,  0,0),
+  OC_DCT_CW_PACK(10, 0,  0,0),
+  OC_DCT_CW_PACK(11, 0,  0,0),
+  OC_DCT_CW_PACK(12, 0,  0,0),
+  OC_DCT_CW_PACK(13, 0,  0,0),
+  OC_DCT_CW_PACK(14, 0,  0,0),
+  OC_DCT_CW_PACK(15, 0,  0,0),
+  /*OC_DCT_REPEAT_RUN0_TOKEN (2 extra bits-2 already read)*/
+  OC_DCT_CW_PACK( 4, 0,  0,0),
+  OC_DCT_CW_PACK( 5, 0,  0,0),
+  OC_DCT_CW_PACK( 6, 0,  0,0),
+  OC_DCT_CW_PACK( 7, 0,  0,0),
+};
+
+
+
+/*Reads one super block run length from the bitstream.
+  _opb: The pack buffer to read from.
+  Return: The decoded run length, in the range 1...4129.*/
+static int oc_sb_run_unpack(oc_pack_buf *_opb){
+  long prefix;
+  long extra;
+  /*Run lengths use the following variable-length code:
+       Codeword            Run Length
+     0                       1
+     10x                     2-3
+     110x                    4-5
+     1110xx                  6-9
+     11110xxx                10-17
+     111110xxxx              18-33
+     111111xxxxxxxxxxxx      34-4129*/
+  /*A lone zero bit is the shortest code.*/
+  prefix=oc_pack_read1(_opb);
+  if(!prefix)return 1;
+  /*The next two bits resolve the 10x and 110x codes.*/
+  prefix=oc_pack_read(_opb,2);
+  if(!(prefix&2))return (int)prefix+2;
+  if(!(prefix&1)){
+    extra=oc_pack_read1(_opb);
+    return (int)extra+4;
+  }
+  /*Three more bits resolve all of the remaining prefixes.*/
+  prefix=oc_pack_read(_opb,3);
+  if(!(prefix&4))return (int)prefix+6;
+  if(!(prefix&2)){
+    /*11110xxx: the top payload bit arrived together with the prefix.*/
+    extra=oc_pack_read(_opb,2);
+    return 10+(int)((prefix&1)<<2)+(int)extra;
+  }
+  if(!(prefix&1)){
+    extra=oc_pack_read(_opb,4);
+    return (int)extra+18;
+  }
+  extra=oc_pack_read(_opb,12);
+  return (int)extra+34;
+}
+
+/*Reads one block run length from the bitstream.
+  _opb: The pack buffer to read from.
+  Return: The decoded run length, in the range 1...30.*/
+static int oc_block_run_unpack(oc_pack_buf *_opb){
+  long head;
+  long tail;
+  /*Run lengths use the following variable-length code:
+     Codeword             Run Length
+     0x                      1-2
+     10x                     3-4
+     110x                    5-6
+     1110xx                  7-10
+     11110xx                 11-14
+     11111xxxx               15-30*/
+  /*The first two bits resolve the 0x and 10x codes.*/
+  head=oc_pack_read(_opb,2);
+  if(!(head&2))return (int)head+1;
+  if(!(head&1)){
+    tail=oc_pack_read1(_opb);
+    return (int)tail+3;
+  }
+  /*Two more bits resolve the 110x and 1110xx codes.*/
+  head=oc_pack_read(_opb,2);
+  if(!(head&2))return (int)head+5;
+  if(!(head&1)){
+    tail=oc_pack_read(_opb,2);
+    return (int)tail+7;
+  }
+  /*Three more bits: either 0xx, or 1 followed by the top two payload bits.*/
+  head=oc_pack_read(_opb,3);
+  if(!(head&4))return (int)head+11;
+  tail=oc_pack_read(_opb,2);
+  return 15+(int)((head&3)<<2)+(int)tail;
+}
+
+
+
+/*Initializes a decoder context from the stream info and setup headers.
+  _dec:   The decoder context to initialize.
+  _info:  The stream parameters, passed on to oc_state_init().
+  _setup: The setup header data; supplies the Huffman tables, the quantization
+           parameters and the loop filter limits.
+  Return: 0 on success, the negative value returned by oc_state_init() or
+           oc_huff_trees_copy() if one of them fails, or TH_EFAULT if the
+           token buffer cannot be allocated.
+          On failure everything allocated so far is released again.*/
+static int oc_dec_init(oc_dec_ctx *_dec,const th_info *_info,
+ const th_setup_info *_setup){
+  int qti;
+  int pli;
+  int qi;
+  int ret;
+  ret=oc_state_init(&_dec->state,_info,3);
+  if(ret<0)return ret;
+  ret=oc_huff_trees_copy(_dec->huff_tables,
+   (const oc_huff_node *const *)_setup->huff_tables);
+  if(ret<0){
+    oc_state_clear(&_dec->state);
+    return ret;
+  }
+  /*For each fragment, allocate one byte for every DCT coefficient token, plus
+     one byte for extra-bits for each token, plus one more byte for the long
+     EOB run, just in case it's the very last token and has a run length of
+     one.*/
+  _dec->dct_tokens=(unsigned char *)_ogg_malloc((64+64+1)*
+   _dec->state.nfrags*sizeof(_dec->dct_tokens[0]));
+  if(_dec->dct_tokens==NULL){
+    oc_huff_trees_clear(_dec->huff_tables);
+    oc_state_clear(&_dec->state);
+    return TH_EFAULT;
+  }
+  /*Point every [qi][pli][qti] slot of the table of pointers at its backing
+     storage before the tables are filled in.*/
+  for(qi=0;qi<64;qi++)for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
+    _dec->state.dequant_tables[qi][pli][qti]=
+     _dec->state.dequant_table_data[qi][pli][qti];
+  }
+  oc_dequant_tables_init(_dec->state.dequant_tables,_dec->pp_dc_scale,
+   &_setup->qinfo);
+  /*Derive the per-qi sharpening modifier from a few low-frequency quantizers
+     of every table, counting the pli==0 tables twice.
+    The tables are indexed [qi][pli][qti], matching the loop above; indexing
+     them as [qti][pli][qi] would read the wrong tables and run past the end
+     of the innermost dimension.*/
+  for(qi=0;qi<64;qi++){
+    int qsum;
+    qsum=0;
+    for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
+      qsum+=(_dec->state.dequant_tables[qi][pli][qti][12]+
+       _dec->state.dequant_tables[qi][pli][qti][17]+
+       _dec->state.dequant_tables[qi][pli][qti][18]+
+       _dec->state.dequant_tables[qi][pli][qti][24])<<(pli==0);
+    }
+    _dec->pp_sharp_mod[qi]=-(qsum>>11);
+  }
+  memcpy(_dec->state.loop_filter_limits,_setup->qinfo.loop_filter_limits,
+   sizeof(_dec->state.loop_filter_limits));
+  /*Post-processing starts out disabled, with none of its buffers allocated.*/
+  _dec->pp_level=OC_PP_LEVEL_DISABLED;
+  _dec->dc_qis=NULL;
+  _dec->variances=NULL;
+  _dec->pp_frame_data=NULL;
+  _dec->stripe_cb.ctx=NULL;
+  _dec->stripe_cb.stripe_decoded=NULL;
+#if defined(HAVE_CAIRO)
+  _dec->telemetry=0;
+  _dec->telemetry_bits=0;
+  _dec->telemetry_qi=0;
+  _dec->telemetry_mbmode=0;
+  _dec->telemetry_mv=0;
+  _dec->telemetry_frame_data=NULL;
+#endif
+  return 0;
+}
+
+/*Releases everything owned by a decoder context.
+  _dec: The decoder context to tear down.*/
+static void oc_dec_clear(oc_dec_ctx *_dec){
+  /*Free the token buffer and the post-processing buffers.*/
+  _ogg_free(_dec->dct_tokens);
+  _ogg_free(_dec->dc_qis);
+  _ogg_free(_dec->variances);
+  _ogg_free(_dec->pp_frame_data);
+#if defined(HAVE_CAIRO)
+  /*The telemetry frame buffer only exists in Cairo-enabled builds.*/
+  _ogg_free(_dec->telemetry_frame_data);
+#endif
+  /*Then tear down the Huffman trees and the shared codec state.*/
+  oc_huff_trees_clear(_dec->huff_tables);
+  oc_state_clear(&_dec->state);
+}
+
+
+/*Parses the frame header at the start of a data packet.
+  This fills in the frame type and the list of one to three qi values in the
+   decoder state.
+  Return: 0 on success, TH_EBADPACKET if this is not a data packet, or
+           TH_EIMPL if the reserved key frame bits are not all zero.*/
+static int oc_dec_frame_header_unpack(oc_dec_ctx *_dec){
+  long bits;
+  int  nqis;
+  /*A data packet starts with a zero bit.*/
+  bits=oc_pack_read1(&_dec->opb);
+  if(bits!=0)return TH_EBADPACKET;
+  /*The next bit is the frame type (I or P).*/
+  bits=oc_pack_read1(&_dec->opb);
+  _dec->state.frame_type=(int)bits;
+  /*The qi list holds up to three 6-bit entries.
+    Each of the first two is followed by a bit saying whether another entry
+     follows.*/
+  nqis=0;
+  for(;;){
+    bits=oc_pack_read(&_dec->opb,6);
+    _dec->state.qis[nqis++]=(unsigned char)bits;
+    if(nqis>=3)break;
+    bits=oc_pack_read1(&_dec->opb);
+    if(!bits)break;
+  }
+  _dec->state.nqis=nqis;
+  if(_dec->state.frame_type!=OC_INTRA_FRAME)return 0;
+  /*Keyframes have 3 unused configuration bits, holdovers from VP3 days.
+    Most of the other unused bits in the VP3 headers were eliminated.
+    I don't know why these remain.*/
+  /*I wanted to eliminate wasted bits, but not all config wiggle room
+     --Monty.*/
+  bits=oc_pack_read(&_dec->opb,3);
+  if(bits!=0)return TH_EIMPL;
+  return 0;
+}
+
+/*Flags every fragment as coded with mode OC_MODE_INTRA.
+  The coded fragment list is rebuilt in coded order along the way, and the
+   per-plane and total coded fragment counts are updated to match.
+  The coded macro block list and the super block flags are left alone, as
+   those are not used when decoding INTRA frames.*/
+static void oc_dec_mark_all_intra(oc_dec_ctx *_dec){
+  const oc_sb_map   *sb_maps;
+  const oc_sb_flags *sb_flags;
+  oc_fragment       *frags;
+  ptrdiff_t         *coded_fragis;
+  ptrdiff_t          ncoded;
+  unsigned           sb_end;
+  unsigned           sbi;
+  int                pli;
+  sb_maps=(const oc_sb_map *)_dec->state.sb_maps;
+  sb_flags=_dec->state.sb_flags;
+  frags=_dec->state.frags;
+  coded_fragis=_dec->state.coded_fragis;
+  ncoded=0;
+  sbi=0;
+  sb_end=0;
+  for(pli=0;pli<3;pli++){
+    ptrdiff_t plane_start;
+    /*Remember where this plane's portion of the list begins.*/
+    plane_start=ncoded;
+    sb_end+=_dec->state.fplanes[pli].nsbs;
+    for(;sbi<sb_end;sbi++){
+      int quadi;
+      for(quadi=0;quadi<4;quadi++){
+        int bi;
+        /*Skip quadrants not marked valid for this super block.*/
+        if(!(sb_flags[sbi].quad_valid&(1<<quadi)))continue;
+        for(bi=0;bi<4;bi++){
+          ptrdiff_t fragi;
+          fragi=sb_maps[sbi][quadi][bi];
+          /*Negative entries do not refer to a fragment.*/
+          if(fragi<0)continue;
+          frags[fragi].coded=1;
+          frags[fragi].mb_mode=OC_MODE_INTRA;
+          coded_fragis[ncoded]=fragi;
+          ncoded++;
+        }
+      }
+    }
+    _dec->state.ncoded_fragis[pli]=ncoded-plane_start;
+  }
+  _dec->state.ntotal_coded_fragis=ncoded;
+}
+
+/*Decodes the bit flags indicating whether each super block is partially coded
+   or not.
+  The flags are run-length coded: an initial flag bit is followed by runs
+   whose lengths are read with oc_sb_run_unpack().
+  The coded_fully flag of every super block is also cleared here.
+  Return: The number of partially coded super blocks.*/
+static unsigned oc_dec_partial_sb_flags_unpack(oc_dec_ctx *_dec){
+  oc_sb_flags *sb_flags;
+  unsigned     nsbs;
+  unsigned     sbi;
+  unsigned     npartial;
+  unsigned     run_count;
+  long         val;
+  int          flag;
+  /*The first bit gives the flag value of the first run.*/
+  val=oc_pack_read1(&_dec->opb);
+  flag=(int)val;
+  sb_flags=_dec->state.sb_flags;
+  nsbs=_dec->state.nsbs;
+  sbi=npartial=0;
+  while(sbi<nsbs){
+    int full_run;
+    run_count=oc_sb_run_unpack(&_dec->opb);
+    /*4129 is the longest run oc_sb_run_unpack() can return (34+4095).*/
+    full_run=run_count>=4129;
+    /*Apply the flag to the run, stopping early at the last super block.*/
+    do{
+      sb_flags[sbi].coded_partially=flag;
+      sb_flags[sbi].coded_fully=0;
+      npartial+=flag;
+      sbi++;
+    }
+    while(--run_count>0&&sbi<nsbs);
+    /*After a maximum-length run the next flag value is coded explicitly;
+       otherwise it simply toggles.*/
+    if(full_run&&sbi<nsbs){
+      val=oc_pack_read1(&_dec->opb);
+      flag=(int)val;
+    }
+    else flag=!flag;
+  }
+  /*TODO: run_count should be 0 here.
+    If it's not, we should issue a warning of some kind.*/
+  return npartial;
+}
+
+/*Decodes the bit flags for whether or not each non-partially-coded super
+   block is fully coded or not.
+  This function should only be called if there is at least one
+   non-partially-coded super block.
+  If every super block turns out to be partially coded, nothing is read from
+   the bitstream and no flags are changed.
+  Nothing is returned: the result is stored in the coded_fully flag of each
+   non-partially-coded super block.*/
+static void oc_dec_coded_sb_flags_unpack(oc_dec_ctx *_dec){
+  oc_sb_flags *sb_flags;
+  unsigned     nsbs;
+  unsigned     sbi;
+  unsigned     run_count;
+  long         val;
+  int          flag;
+  sb_flags=_dec->state.sb_flags;
+  nsbs=_dec->state.nsbs;
+  /*Skip partially coded super blocks.
+    The scan is bounded by nsbs so that a violated precondition cannot walk
+     off the end of the flag array.*/
+  for(sbi=0;sbi<nsbs&&sb_flags[sbi].coded_partially;sbi++);
+  if(sbi>=nsbs)return;
+  /*The first bit gives the flag value of the first run.*/
+  val=oc_pack_read1(&_dec->opb);
+  flag=(int)val;
+  do{
+    int full_run;
+    run_count=oc_sb_run_unpack(&_dec->opb);
+    /*4129 is the longest run oc_sb_run_unpack() can return (34+4095).*/
+    full_run=run_count>=4129;
+    /*Partially coded super blocks do not count against the run.*/
+    for(;sbi<nsbs;sbi++){
+      if(sb_flags[sbi].coded_partially)continue;
+      if(run_count--<=0)break;
+      sb_flags[sbi].coded_fully=flag;
+    }
+    /*After a maximum-length run the next flag value is coded explicitly;
+       otherwise it simply toggles.*/
+    if(full_run&&sbi<nsbs){
+      val=oc_pack_read1(&_dec->opb);
+      flag=(int)val;
+    }
+    else flag=!flag;
+  }
+  while(sbi<nsbs);
+  /*TODO: run_count should be 0 here.
+    If it's not, we should issue a warning of some kind.*/
+}
+
+/*Decodes which fragments are coded in an INTER frame.
+  The super block flags are unpacked first, then the individual block flags
+   of the partially coded super blocks.
+  The coded fragment list is built up front-to-back in coded_fragis, the
+   uncoded fragment list is built back-to-front from coded_fragis+nfrags, and
+   the per-plane and total coded fragment counts are updated.*/
+static void oc_dec_coded_flags_unpack(oc_dec_ctx *_dec){
+  const oc_sb_map   *sb_maps;
+  const oc_sb_flags *sb_flags;
+  oc_fragment       *frags;
+  unsigned           nsbs;
+  unsigned           sbi;
+  unsigned           npartial;
+  long               val;
+  int                pli;
+  int                flag;
+  int                run_count;
+  ptrdiff_t         *coded_fragis;
+  ptrdiff_t         *uncoded_fragis;
+  ptrdiff_t          ncoded_fragis;
+  ptrdiff_t          nuncoded_fragis;
+  ptrdiff_t          prev_ncoded_fragis;
+  npartial=oc_dec_partial_sb_flags_unpack(_dec);
+  /*The fully-coded flags are only present if at least one super block is not
+     partially coded.*/
+  if(npartial<_dec->state.nsbs)oc_dec_coded_sb_flags_unpack(_dec);
+  if(npartial>0){
+    val=oc_pack_read1(&_dec->opb);
+    /*Stored inverted, because the flag is toggled again when the first block
+       run is read below.*/
+    flag=!(int)val;
+  }
+  else flag=0;
+  sb_maps=(const oc_sb_map *)_dec->state.sb_maps;
+  sb_flags=_dec->state.sb_flags;
+  frags=_dec->state.frags;
+  sbi=nsbs=run_count=0;
+  coded_fragis=_dec->state.coded_fragis;
+  /*The uncoded list grows downwards from the end of the fragment index
+     array.*/
+  uncoded_fragis=coded_fragis+_dec->state.nfrags;
+  prev_ncoded_fragis=ncoded_fragis=nuncoded_fragis=0;
+  for(pli=0;pli<3;pli++){
+    /*nsbs is the running end index of this plane's super blocks.*/
+    nsbs+=_dec->state.fplanes[pli].nsbs;
+    for(;sbi<nsbs;sbi++){
+      int quadi;
+      for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
+        int bi;
+        for(bi=0;bi<4;bi++){
+          ptrdiff_t fragi;
+          fragi=sb_maps[sbi][quadi][bi];
+          if(fragi>=0){
+            int coded;
+            if(sb_flags[sbi].coded_fully)coded=1;
+            else if(!sb_flags[sbi].coded_partially)coded=0;
+            else{
+              /*Partially coded super block: the flag comes from the block
+                 run-length code, toggling at the start of each new run.*/
+              if(run_count<=0){
+                run_count=oc_block_run_unpack(&_dec->opb);
+                flag=!flag;
+              }
+              run_count--;
+              coded=flag;
+            }
+            if(coded)coded_fragis[ncoded_fragis++]=fragi;
+            else *(uncoded_fragis-++nuncoded_fragis)=fragi;
+            frags[fragi].coded=coded;
+          }
+        }
+      }
+    }
+    _dec->state.ncoded_fragis[pli]=ncoded_fragis-prev_ncoded_fragis;
+    prev_ncoded_fragis=ncoded_fragis;
+  }
+  _dec->state.ntotal_coded_fragis=ncoded_fragis;
+  /*TODO: run_count should be 0 here.
+    If it's not, we should issue a warning of some kind.*/
+}
+
+
+
+typedef int (*oc_mode_unpack_func)(oc_pack_buf *_opb);
+
+/*Reads a variable-length coded macro block mode index.
+  The index is the number of 1 bits read before the first 0 bit, except that
+   no more than 7 bits are ever read.
+  Return: The mode index, in the range 0...7.*/
+static int oc_vlc_mode_unpack(oc_pack_buf *_opb){
+  int nones;
+  nones=0;
+  while(nones<7){
+    long bit;
+    bit=oc_pack_read1(_opb);
+    if(bit==0)break;
+    nones++;
+  }
+  return nones;
+}
+
+/*Reads a constant-length coded macro block mode index: a plain 3-bit value.
+  Return: The mode index read from the bitstream.*/
+static int oc_clc_mode_unpack(oc_pack_buf *_opb){
+  long idx;
+  idx=oc_pack_read(_opb,3);
+  return (int)idx;
+}
+
+/*Reads the coding mode of every macro block in an INTER frame.
+  A 3-bit scheme number selects the mode alphabet: scheme 0 transmits a custom
+   alphabet, the others pick a row of OC_MODE_ALPHABETS.
+  Scheme 7 uses fixed 3-bit mode indices; all other schemes use the
+   variable-length code.*/
+static void oc_dec_mb_modes_unpack(oc_dec_ctx *_dec){
+  const oc_mb_map     *mb_maps;
+  signed char         *mb_modes;
+  const oc_fragment   *frags;
+  const unsigned char *alphabet;
+  unsigned char        scheme0_alphabet[8];
+  oc_mode_unpack_func  mode_unpack;
+  size_t               nmbs;
+  size_t               mbi;
+  long                 val;
+  int                  mode_scheme;
+  val=oc_pack_read(&_dec->opb,3);
+  mode_scheme=(int)val;
+  if(mode_scheme!=0)alphabet=OC_MODE_ALPHABETS[mode_scheme-1];
+  else{
+    int mi;
+    /*Pre-fill the custom alphabet with a safe default.
+      A bitstream that does not list each index exactly once is most likely
+       corrupt and the rest of the packet garbage, but this way we do not
+       crash and still decode SOMETHING.*/
+    /*LOOP VECTORIZES*/
+    for(mi=0;mi<OC_NMODES;mi++)scheme0_alphabet[mi]=OC_MODE_INTER_NOMV;
+    /*Each 3-bit value names the alphabet slot of the next mode.*/
+    for(mi=0;mi<OC_NMODES;mi++){
+      val=oc_pack_read(&_dec->opb,3);
+      scheme0_alphabet[val]=OC_MODE_ALPHABETS[6][mi];
+    }
+    alphabet=scheme0_alphabet;
+  }
+  mode_unpack=mode_scheme==7?oc_clc_mode_unpack:oc_vlc_mode_unpack;
+  mb_modes=_dec->state.mb_modes;
+  mb_maps=(const oc_mb_map *)_dec->state.mb_maps;
+  nmbs=_dec->state.nmbs;
+  frags=_dec->state.frags;
+  for(mbi=0;mbi<nmbs;mbi++){
+    int has_coded_luma;
+    int bi;
+    if(mb_modes[mbi]==OC_MODE_INVALID)continue;
+    /*Look for a coded luma block in this macro block.*/
+    has_coded_luma=0;
+    for(bi=0;bi<4;bi++){
+      if(frags[mb_maps[mbi][0][bi]].coded){
+        has_coded_luma=1;
+        break;
+      }
+    }
+    /*A mode is only transmitted when there is one; otherwise INTER_NOMV is
+       forced.*/
+    if(has_coded_luma)mb_modes[mbi]=alphabet[(*mode_unpack)(&_dec->opb)];
+    else mb_modes[mbi]=OC_MODE_INTER_NOMV;
+  }
+}
+
+
+
+typedef int (*oc_mv_comp_unpack_func)(oc_pack_buf *_opb);
+
+/*Reads one variable-length coded motion vector component.
+  Return: The signed component value.*/
+static int oc_vlc_mv_comp_unpack(oc_pack_buf *_opb){
+  long code;
+  long sign;
+  int  mag;
+  int  mask;
+  /*The first three bits give either a complete value or a magnitude class.*/
+  code=oc_pack_read(_opb,3);
+  if(code==0)return 0;
+  if(code==1)return 1;
+  if(code==2)return -1;
+  if(code==3||code==4){
+    /*Magnitude 2 or 3, followed by a single sign bit.*/
+    mag=(int)(code-1);
+    sign=oc_pack_read1(_opb);
+  }
+  else{
+    /*Codes 5, 6 and 7: a base magnitude of 4, 8 or 16, then code-2 more bits
+       holding an offset above a trailing sign bit.*/
+    mag=1<<(code-3);
+    code=oc_pack_read(_opb,code-2);
+    mag+=(int)(code>>1);
+    sign=code&1;
+  }
+  /*Negate the magnitude if the sign bit is set: mask is 0 or -1.*/
+  mask=-(int)sign;
+  return (mag+mask)^mask;
+}
+
+/*Reads one constant-length coded motion vector component: 6 bits, with the
+   magnitude in the upper five and the sign in the lowest.
+  Return: The signed component value.*/
+static int oc_clc_mv_comp_unpack(oc_pack_buf *_opb){
+  long raw;
+  int  mag;
+  int  sign_mask;
+  raw=oc_pack_read(_opb,6);
+  mag=((int)raw)>>1;
+  /*All ones when the sign bit is set, zero otherwise.*/
+  sign_mask=-(((int)raw)&1);
+  /*Adding and then XORing the mask negates the magnitude when it is set.*/
+  return (mag+sign_mask)^sign_mask;
+}
+
+/*Unpacks the list of motion vectors for INTER frames, and propagates the macro
+   block modes and motion vectors to the individual fragments.
+  Only coded fragments have their mb_mode and motion vector written.
+  Macro blocks whose mode is OC_MODE_INVALID, or which have no coded
+   fragments, are skipped without reading anything from the bitstream.*/
+static void oc_dec_mv_unpack_and_frag_modes_fill(oc_dec_ctx *_dec){
+  const oc_mb_map        *mb_maps;
+  const signed char      *mb_modes;
+  oc_set_chroma_mvs_func  set_chroma_mvs;
+  oc_mv_comp_unpack_func  mv_comp_unpack;
+  oc_fragment            *frags;
+  oc_mv                  *frag_mvs;
+  const unsigned char    *map_idxs;
+  int                     map_nidxs;
+  oc_mv                   last_mv[2];
+  oc_mv                   cbmvs[4];
+  size_t                  nmbs;
+  size_t                  mbi;
+  long                    val;
+  set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_dec->state.info.pixel_fmt];
+  /*One bit selects the constant-length (set) or variable-length (clear)
+     motion vector component code.*/
+  val=oc_pack_read1(&_dec->opb);
+  mv_comp_unpack=val?oc_clc_mv_comp_unpack:oc_vlc_mv_comp_unpack;
+  map_idxs=OC_MB_MAP_IDXS[_dec->state.info.pixel_fmt];
+  map_nidxs=OC_MB_MAP_NIDXS[_dec->state.info.pixel_fmt];
+  /*last_mv[0] and last_mv[1] are the two most recent motion vectors; both
+     start at zero.*/
+  memset(last_mv,0,sizeof(last_mv));
+  frags=_dec->state.frags;
+  frag_mvs=_dec->state.frag_mvs;
+  mb_maps=(const oc_mb_map *)_dec->state.mb_maps;
+  mb_modes=_dec->state.mb_modes;
+  nmbs=_dec->state.nmbs;
+  for(mbi=0;mbi<nmbs;mbi++){
+    int          mb_mode;
+    mb_mode=mb_modes[mbi];
+    if(mb_mode!=OC_MODE_INVALID){
+      oc_mv        mbmv;
+      ptrdiff_t    fragi;
+      /*The map indices of this macro block's coded fragments, with room for
+         the -1 terminator written in the 4MV case.*/
+      int          coded[13];
+      int          codedi;
+      int          ncoded;
+      int          mapi;
+      int          mapii;
+      /*Search for at least one coded fragment.*/
+      ncoded=mapii=0;
+      do{
+        /*A map index packs the plane in its upper bits and the block within
+           the plane in its low two bits.*/
+        mapi=map_idxs[mapii];
+        fragi=mb_maps[mbi][mapi>>2][mapi&3];
+        if(frags[fragi].coded)coded[ncoded++]=mapi;
+      }
+      while(++mapii<map_nidxs);
+      if(ncoded<=0)continue;
+      switch(mb_mode){
+        case OC_MODE_INTER_MV_FOUR:{
+          oc_mv       lbmvs[4];
+          int         bi;
+          /*Mark the tail of the list, so we don't accidentally go past it.*/
+          coded[ncoded]=-1;
+          /*Read one vector per coded luma block; uncoded luma blocks get a
+             zero vector.
+            NOTE(review): this relies on the luma map indices (0...3) coming
+             first and in order in map_idxs; confirm against OC_MB_MAP_IDXS.*/
+          for(bi=codedi=0;bi<4;bi++){
+            if(coded[codedi]==bi){
+              codedi++;
+              fragi=mb_maps[mbi][0][bi];
+              frags[fragi].mb_mode=mb_mode;
+              lbmvs[bi][0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
+              lbmvs[bi][1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
+              memcpy(frag_mvs[fragi],lbmvs[bi],sizeof(lbmvs[bi]));
+            }
+            else lbmvs[bi][0]=lbmvs[bi][1]=0;
+          }
+          /*The vector of the last coded luma block becomes the new most
+             recent motion vector.*/
+          if(codedi>0){
+            memcpy(last_mv[1],last_mv[0],sizeof(last_mv[1]));
+            memcpy(last_mv[0],lbmvs[coded[codedi-1]],sizeof(last_mv[0]));
+          }
+          /*Any remaining coded fragments take their vectors from the values
+             set_chroma_mvs() derives from the four luma vectors.*/
+          if(codedi<ncoded){
+            (*set_chroma_mvs)(cbmvs,(const oc_mv *)lbmvs);
+            for(;codedi<ncoded;codedi++){
+              mapi=coded[codedi];
+              bi=mapi&3;
+              fragi=mb_maps[mbi][mapi>>2][bi];
+              frags[fragi].mb_mode=mb_mode;
+              memcpy(frag_mvs[fragi],cbmvs[bi],sizeof(cbmvs[bi]));
+            }
+          }
+        }break;
+        case OC_MODE_INTER_MV:{
+          /*A new vector is read and pushed onto the last_mv history.*/
+          memcpy(last_mv[1],last_mv[0],sizeof(last_mv[1]));
+          mbmv[0]=last_mv[0][0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
+          mbmv[1]=last_mv[0][1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
+        }break;
+        /*Reuses the most recent vector; the history is left unchanged.*/
+        case OC_MODE_INTER_MV_LAST:memcpy(mbmv,last_mv[0],sizeof(mbmv));break;
+        case OC_MODE_INTER_MV_LAST2:{
+          /*Reuses the second most recent vector and swaps the two history
+             entries.*/
+          memcpy(mbmv,last_mv[1],sizeof(mbmv));
+          memcpy(last_mv[1],last_mv[0],sizeof(last_mv[1]));
+          memcpy(last_mv[0],mbmv,sizeof(last_mv[0]));
+        }break;
+        case OC_MODE_GOLDEN_MV:{
+          /*A new vector is read, but the last_mv history is not updated.*/
+          mbmv[0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
+          mbmv[1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
+        }break;
+        /*All other modes use a zero vector.*/
+        default:memset(mbmv,0,sizeof(mbmv));break;
+      }
+      /*4MV mode fills in the fragments itself.
+        For all other modes we can use this common code.*/
+      if(mb_mode!=OC_MODE_INTER_MV_FOUR){
+        for(codedi=0;codedi<ncoded;codedi++){
+          mapi=coded[codedi];
+          fragi=mb_maps[mbi][mapi>>2][mapi&3];
+          frags[fragi].mb_mode=mb_mode;
+          memcpy(frag_mvs[fragi],mbmv,sizeof(mbmv));
+        }
+      }
+    }
+  }
+}
+
+/*Unpacks the qi index (qii) of every coded fragment.
+  With a single qi value for the frame nothing is read from the bitstream and
+   every coded fragment gets a qii of 0.
+  Otherwise one or two run-length coded passes assign a qii of 0, 1 or 2.*/
+static void oc_dec_block_qis_unpack(oc_dec_ctx *_dec){
+  oc_fragment     *frags;
+  const ptrdiff_t *coded_fragis;
+  ptrdiff_t        ncoded_fragis;
+  ptrdiff_t        fragii;
+  ptrdiff_t        fragi;
+  ncoded_fragis=_dec->state.ntotal_coded_fragis;
+  /*Nothing to do (and nothing is coded) if no fragments are coded.*/
+  if(ncoded_fragis<=0)return;
+  frags=_dec->state.frags;
+  coded_fragis=_dec->state.coded_fragis;
+  if(_dec->state.nqis==1){
+    /*If this frame has only a single qi value, then just use it for all coded
+       fragments.*/
+    for(fragii=0;fragii<ncoded_fragis;fragii++){
+      frags[coded_fragis[fragii]].qii=0;
+    }
+  }
+  else{
+    long val;
+    int  flag;
+    int  nqi1;
+    int  run_count;
+    /*Otherwise, we decode a qi index for each fragment, using two passes of
+      the same binary RLE scheme used for super-block coded bits.
+     The first pass marks each fragment as having a qii of 0 or greater than
+      0, and the second pass (if necessary), distinguishes between a qii of
+      1 and 2.
+     At first we just store the qii in the fragment.
+     After all the qii's are decoded, we make a final pass to replace them
+      with the corresponding qi's for this frame.
+     NOTE(review): no such final pass is performed in this function; the
+      fragments are left holding qii values when it returns.*/
+    val=oc_pack_read1(&_dec->opb);
+    flag=(int)val;
+    /*nqi1 counts the fragments given a non-zero qii by the first pass.*/
+    nqi1=0;
+    fragii=0;
+    while(fragii<ncoded_fragis){
+      int full_run;
+      run_count=oc_sb_run_unpack(&_dec->opb);
+      /*4129 is the longest run oc_sb_run_unpack() can return (34+4095).*/
+      full_run=run_count>=4129;
+      do{
+        frags[coded_fragis[fragii++]].qii=flag;
+        nqi1+=flag;
+      }
+      while(--run_count>0&&fragii<ncoded_fragis);
+      /*After a maximum-length run the next flag value is coded explicitly;
+         otherwise it simply toggles.*/
+      if(full_run&&fragii<ncoded_fragis){
+        val=oc_pack_read1(&_dec->opb);
+        flag=(int)val;
+      }
+      else flag=!flag;
+    }
+    /*TODO: run_count should be 0 here.
+      If it's not, we should issue a warning of some kind.*/
+    /*If we have 3 different qi's for this frame, and there was at least one
+       fragment with a non-zero qi, make the second pass.*/
+    if(_dec->state.nqis==3&&nqi1>0){
+      /*Skip qii==0 fragments.
+        This terminates because nqi1>0 guarantees a fragment with a non-zero
+         qii.*/
+      for(fragii=0;frags[coded_fragis[fragii]].qii==0;fragii++);
+      val=oc_pack_read1(&_dec->opb);
+      flag=(int)val;
+      do{
+        int full_run;
+        run_count=oc_sb_run_unpack(&_dec->opb);
+        full_run=run_count>=4129;
+        /*Only fragments with a non-zero qii count against the run; a set
+           flag raises their qii from 1 to 2.*/
+        for(;fragii<ncoded_fragis;fragii++){
+          fragi=coded_fragis[fragii];
+          if(frags[fragi].qii==0)continue;
+          if(run_count--<=0)break;
+          frags[fragi].qii+=flag;
+        }
+        if(full_run&&fragii<ncoded_fragis){
+          val=oc_pack_read1(&_dec->opb);
+          flag=(int)val;
+        }
+        else flag=!flag;
+      }
+      while(fragii<ncoded_fragis);
+      /*TODO: run_count should be 0 here.
+        If it's not, we should issue a warning of some kind.*/
+    }
+  }
+}
+
+
+
+/*Unpacks the DC coefficient tokens.
+  Unlike when unpacking the AC coefficient tokens, we actually need to decode
+   the DC coefficient values now so that we can do DC prediction.
+  The raw tokens and their extra bits are also appended to the decoder's token
+   buffer, and the starting token offset and pending EOB run of each plane are
+   recorded in ti0 and eob_runs.
+  _huff_idxs:  The index of the Huffman table to use for each color plane:
+                entry 0 is used for plane 0, entry 1 for planes 1 and 2.
+  _ntoks_left: The number of tokens left to be decoded in each color plane for
+                each coefficient.
+               This is updated as EOB tokens and zero run tokens are decoded.
+  Return: The length of any outstanding EOB run.*/
+static ptrdiff_t oc_dec_dc_coeff_unpack(oc_dec_ctx *_dec,int _huff_idxs[2],
+ ptrdiff_t _ntoks_left[3][64]){
+  unsigned char   *dct_tokens;
+  oc_fragment     *frags;
+  const ptrdiff_t *coded_fragis;
+  ptrdiff_t        ncoded_fragis;
+  ptrdiff_t        fragii;
+  ptrdiff_t        eobs;
+  ptrdiff_t        ti;
+  int              pli;
+  dct_tokens=_dec->dct_tokens;
+  frags=_dec->state.frags;
+  coded_fragis=_dec->state.coded_fragis;
+  ncoded_fragis=fragii=eobs=ti=0;
+  for(pli=0;pli<3;pli++){
+    /*run_counts[n] counts the tokens decoded here that skip n coefficients.*/
+    ptrdiff_t run_counts[64];
+    ptrdiff_t eob_count;
+    ptrdiff_t eobi;
+    int       rli;
+    /*ncoded_fragis is the running end index of this plane's fragments in the
+       coded fragment list.*/
+    ncoded_fragis+=_dec->state.ncoded_fragis[pli];
+    memset(run_counts,0,sizeof(run_counts));
+    _dec->eob_runs[pli][0]=eobs;
+    _dec->ti0[pli][0]=ti;
+    /*Continue any previous EOB run, if there was one.*/
+    eobi=eobs;
+    if(ncoded_fragis-fragii<eobi)eobi=ncoded_fragis-fragii;
+    eob_count=eobi;
+    eobs-=eobi;
+    while(eobi-->0)frags[coded_fragis[fragii++]].dc=0;
+    while(fragii<ncoded_fragis){
+      int token;
+      int cw;
+      int eb;
+      int skip;
+      token=oc_huff_token_decode(&_dec->opb,
+       _dec->huff_tables[_huff_idxs[pli+1>>1]]);
+      dct_tokens[ti++]=(unsigned char)token;
+      if(OC_DCT_TOKEN_NEEDS_MORE(token)){
+        eb=(int)oc_pack_read(&_dec->opb,
+         OC_INTERNAL_DCT_TOKEN_EXTRA_BITS[token]);
+        /*Store the low byte of the extra bits, plus a second byte with the
+           remaining high bits for OC_DCT_TOKEN_FAT_EOB.*/
+        dct_tokens[ti++]=(unsigned char)eb;
+        if(token==OC_DCT_TOKEN_FAT_EOB)dct_tokens[ti++]=(unsigned char)(eb>>8);
+        eb<<=OC_DCT_TOKEN_EB_POS(token);
+      }
+      else eb=0;
+      /*Combine the token's base code word with its positioned extra bits.*/
+      cw=OC_DCT_CODE_WORD[token]+eb;
+      eobs=cw>>OC_DCT_CW_EOB_SHIFT&0xFFF;
+      if(cw==OC_DCT_CW_FINISH)eobs=OC_DCT_EOB_FINISH;
+      if(eobs){
+        /*An EOB run: zero the DC of as many fragments as the run covers in
+           this plane, and carry the remainder forward.*/
+        eobi=OC_MINI(eobs,ncoded_fragis-fragii);
+        eob_count+=eobi;
+        eobs-=eobi;
+        while(eobi-->0)frags[coded_fragis[fragii++]].dc=0;
+      }
+      else{
+        int coeff;
+        skip=(unsigned char)(cw>>OC_DCT_CW_RLEN_SHIFT);
+        /*Apply the flip bit before extracting the magnitude field.*/
+        cw^=-(cw&1<<OC_DCT_CW_FLIP_BIT);
+        coeff=cw>>OC_DCT_CW_MAG_SHIFT;
+        /*A token with a zero run leaves the DC coefficient itself zero.*/
+        if(skip)coeff=0;
+        run_counts[skip]++;
+        frags[coded_fragis[fragii++]].dc=coeff;
+      }
+    }
+    /*Add the total EOB count to the longest run length.*/
+    run_counts[63]+=eob_count;
+    /*And convert the run_counts array to a moment table.*/
+    for(rli=63;rli-->0;)run_counts[rli]+=run_counts[rli+1];
+    /*Finally, subtract off the number of coefficients that have been
+       accounted for by runs started in this coefficient.*/
+    for(rli=64;rli-->0;)_ntoks_left[pli][rli]-=run_counts[rli];
+  }
+  _dec->dct_tokens_count=ti;
+  return eobs;
+}
+
+/*Unpacks the AC coefficient tokens.
+  This can completely discard coefficient values while unpacking, and so is
+   somewhat simpler than unpacking the DC coefficient tokens.
+  The raw tokens and their extra bits are appended to the decoder's token
+   buffer, and the starting token offset and pending EOB run of each plane are
+   recorded in ti0 and eob_runs for this coefficient.
+  _zzi:        The index of the coefficient whose tokens are being unpacked.
+  _huff_idxs:  The index of the Huffman table to use for each color plane:
+                entry 0 is used for plane 0, entry 1 for planes 1 and 2.
+  _ntoks_left: The number of tokens left to be decoded in each color plane for
+                each coefficient.
+               This is updated as EOB tokens and zero run tokens are decoded.
+  _eobs:       The length of any outstanding EOB run from previous
+                coefficients.
+  Return: The length of any outstanding EOB run.
+  NOTE(review): the return type is int while _eobs is a ptrdiff_t; confirm
+   that OC_DCT_EOB_FINISH always fits in an int.*/
+static int oc_dec_ac_coeff_unpack(oc_dec_ctx *_dec,int _zzi,int _huff_idxs[2],
+ ptrdiff_t _ntoks_left[3][64],ptrdiff_t _eobs){
+  unsigned char *dct_tokens;
+  ptrdiff_t      ti;
+  int            pli;
+  dct_tokens=_dec->dct_tokens;
+  /*Continue appending where the previous coefficient left off.*/
+  ti=_dec->dct_tokens_count;
+  for(pli=0;pli<3;pli++){
+    /*run_counts[n] counts the tokens decoded here that skip n coefficients.*/
+    ptrdiff_t run_counts[64];
+    ptrdiff_t eob_count;
+    size_t    ntoks_left;
+    size_t    ntoks;
+    int       rli;
+    _dec->eob_runs[pli][_zzi]=_eobs;
+    _dec->ti0[pli][_zzi]=ti;
+    ntoks_left=_ntoks_left[pli][_zzi];
+    memset(run_counts,0,sizeof(run_counts));
+    eob_count=0;
+    ntoks=0;
+    /*Keep decoding until the tokens so far, plus the pending EOB run, account
+       for every fragment still expecting this coefficient.*/
+    while(ntoks+_eobs<ntoks_left){
+      int token;
+      int cw;
+      int eb;
+      int skip;
+      /*The pending EOB run is consumed in full before the next token.*/
+      ntoks+=_eobs;
+      eob_count+=_eobs;
+      token=oc_huff_token_decode(&_dec->opb,
+       _dec->huff_tables[_huff_idxs[pli+1>>1]]);
+      dct_tokens[ti++]=(unsigned char)token;
+      if(OC_DCT_TOKEN_NEEDS_MORE(token)){
+        eb=(int)oc_pack_read(&_dec->opb,
+         OC_INTERNAL_DCT_TOKEN_EXTRA_BITS[token]);
+        /*Store the low byte of the extra bits, plus a second byte with the
+           remaining high bits for OC_DCT_TOKEN_FAT_EOB.*/
+        dct_tokens[ti++]=(unsigned char)eb;
+        if(token==OC_DCT_TOKEN_FAT_EOB)dct_tokens[ti++]=(unsigned char)(eb>>8);
+        eb<<=OC_DCT_TOKEN_EB_POS(token);
+      }
+      else eb=0;
+      /*Combine the token's base code word with its positioned extra bits.*/
+      cw=OC_DCT_CODE_WORD[token]+eb;
+      skip=(unsigned char)(cw>>OC_DCT_CW_RLEN_SHIFT);
+      _eobs=cw>>OC_DCT_CW_EOB_SHIFT&0xFFF;
+      if(cw==OC_DCT_CW_FINISH)_eobs=OC_DCT_EOB_FINISH;
+      /*Only non-EOB tokens are counted here; a new EOB run is accounted for
+         on the next iteration or after the loop.*/
+      if(_eobs==0){
+        run_counts[skip]++;
+        ntoks++;
+      }
+    }
+    /*Add the portion of the last EOB run actually used by this coefficient.*/
+    eob_count+=ntoks_left-ntoks;
+    /*And remove it from the remaining EOB count.*/
+    _eobs-=ntoks_left-ntoks;
+    /*Add the total EOB count to the longest run length.*/
+    run_counts[63]+=eob_count;
+    /*And convert the run_counts array to a moment table.*/
+    for(rli=63;rli-->0;)run_counts[rli]+=run_counts[rli+1];
+    /*Finally, subtract off the number of coefficients that have been
+       accounted for by runs started in this coefficient.*/
+    for(rli=64-_zzi;rli-->0;)_ntoks_left[pli][_zzi+rli]-=run_counts[rli];
+  }
+  _dec->dct_tokens_count=ti;
+  return _eobs;
+}
+
+/*Tokens describing the DCT coefficients that belong to each fragment are
+   stored in the bitstream grouped by coefficient, not by fragment.
+
+  This means that we either decode all the tokens in order, building up a
+   separate coefficient list for each fragment as we go, and then go back and
+   do the iDCT on each fragment, or we have to create separate lists of tokens
+   for each coefficient, so that we can pull the next token required off the
+   head of the appropriate list when decoding a specific fragment.
+
+  The former was VP3's choice, and it meant 2*w*h extra storage for all the
+   decoded coefficient values.
+
+  We take the second option, which lets us store just one to three bytes per
+   token (generally far fewer than the number of coefficients, due to EOB
+   tokens and zero runs), and which requires us to only maintain a counter for
+   each of the 64 coefficients, instead of a counter for every fragment to
+   determine where the next token goes.
+
+  We actually use 3 counters per coefficient, one for each color plane, so we
+   can decode all color planes simultaneously.
+  This lets color conversion, etc., be done as soon as a full MCU (one or
+   two super block rows) is decoded, while the image data is still in cache.*/
+
+/*Unpacks the DCT tokens of all 64 coefficients for all three color planes.
+  The DC tokens come first, with their own pair of Huffman table indices.
+  The AC coefficients follow in four groups; a second pair of table indices is
+   read for them and advanced by 16 at the start of each group.*/
+static void oc_dec_residual_tokens_unpack(oc_dec_ctx *_dec){
+  static const unsigned char OC_HUFF_LIST_MAX[5]={1,6,15,28,64};
+  ptrdiff_t  ntoks_left[3][64];
+  int        huff_idxs[2];
+  ptrdiff_t  eobs;
+  int        pli;
+  int        zzi;
+  int        hgi;
+  int        hi;
+  /*Every coefficient starts out expecting one token per coded fragment in
+     its plane.*/
+  for(pli=0;pli<3;pli++){
+    ptrdiff_t nplane_coded;
+    nplane_coded=_dec->state.ncoded_fragis[pli];
+    for(zzi=0;zzi<64;zzi++)ntoks_left[pli][zzi]=nplane_coded;
+  }
+  /*Two 4-bit Huffman table indices for the DC tokens.*/
+  for(hi=0;hi<2;hi++)huff_idxs[hi]=(int)oc_pack_read(&_dec->opb,4);
+  _dec->eob_runs[0][0]=0;
+  eobs=oc_dec_dc_coeff_unpack(_dec,huff_idxs,ntoks_left);
+#if defined(HAVE_CAIRO)
+  _dec->telemetry_dc_bytes=oc_pack_bytes_left(&_dec->opb);
+#endif
+  /*Two more 4-bit Huffman table indices for the AC tokens.*/
+  for(hi=0;hi<2;hi++)huff_idxs[hi]=(int)oc_pack_read(&_dec->opb,4);
+  zzi=1;
+  for(hgi=1;hgi<5;hgi++){
+    int zzi_end;
+    /*Each group of coefficients uses the next block of 16 tables.*/
+    for(hi=0;hi<2;hi++)huff_idxs[hi]+=16;
+    zzi_end=OC_HUFF_LIST_MAX[hgi];
+    while(zzi<zzi_end){
+      eobs=oc_dec_ac_coeff_unpack(_dec,zzi,huff_idxs,ntoks_left,eobs);
+      zzi++;
+    }
+  }
+  /*TODO: eobs should be exactly zero, or 4096 or greater.
+    The second case occurs when an EOB run of size zero is encountered, which
+     gets treated as an infinite EOB run (where infinity is PTRDIFF_MAX).
+    If neither of these conditions holds, then a warning should be issued.*/
+}
+
+/*Sets up post-processing state; returns 0 only if PP can run on this frame.*/
+static int oc_dec_postprocess_init(oc_dec_ctx *_dec){
+  /*pp_level 0: disabled; free any memory used and return*/
+  if(_dec->pp_level<=OC_PP_LEVEL_DISABLED){
+    if(_dec->dc_qis!=NULL){
+      _ogg_free(_dec->dc_qis);
+      _dec->dc_qis=NULL;
+      _ogg_free(_dec->variances);
+      _dec->variances=NULL;
+      _ogg_free(_dec->pp_frame_data);
+      _dec->pp_frame_data=NULL;
+    }
+    return 1;
+  }
+  if(_dec->dc_qis==NULL){
+    /*If we haven't been tracking DC quantization indices, only start doing
+       so on an intra frame, where every entry can be set to qis[0].*/
+    if(_dec->state.frame_type!=OC_INTRA_FRAME)return 1;
+    _dec->dc_qis=(unsigned char *)_ogg_malloc(
+     _dec->state.nfrags*sizeof(_dec->dc_qis[0]));
+    if(_dec->dc_qis==NULL)return 1; /*Allocation failed.*/
+    memset(_dec->dc_qis,_dec->state.qis[0],_dec->state.nfrags);
+  }
+  else{
+    unsigned char   *dc_qis;
+    const ptrdiff_t *coded_fragis;
+    ptrdiff_t        ncoded_fragis;
+    ptrdiff_t        fragii;
+    unsigned char    qi0;
+    /*Update the DC quantization index of each coded block.*/
+    dc_qis=_dec->dc_qis;
+    coded_fragis=_dec->state.coded_fragis;
+    ncoded_fragis=_dec->state.ncoded_fragis[0]+
+     _dec->state.ncoded_fragis[1]+_dec->state.ncoded_fragis[2];
+    qi0=(unsigned char)_dec->state.qis[0];
+    for(fragii=0;fragii<ncoded_fragis;fragii++){
+      dc_qis[coded_fragis[fragii]]=qi0;
+    }
+  }
+  /*pp_level 1: Stop after updating DC quantization indices.*/
+  if(_dec->pp_level<=OC_PP_LEVEL_TRACKDCQI){
+    if(_dec->variances!=NULL){
+      _ogg_free(_dec->variances);
+      _dec->variances=NULL;
+      _ogg_free(_dec->pp_frame_data);
+      _dec->pp_frame_data=NULL;
+    }
+    return 1;
+  }
+  if(_dec->variances==NULL){
+    size_t frame_sz;
+    size_t c_sz;
+    int    c_w;
+    int    c_h;
+    frame_sz=_dec->state.info.frame_width*(size_t)_dec->state.info.frame_height;
+    c_w=_dec->state.info.frame_width>>!(_dec->state.info.pixel_fmt&1);
+    c_h=_dec->state.info.frame_height>>!(_dec->state.info.pixel_fmt&2);
+    c_sz=c_w*(size_t)c_h;
+    /*Allocate space for the chroma planes, even if we're not going to use
+       them; this simplifies allocation state management, though it may waste
+       memory on the few systems that don't overcommit pages.*/
+    frame_sz+=c_sz<<1;
+    _dec->pp_frame_data=(unsigned char *)_ogg_malloc(
+     frame_sz*sizeof(_dec->pp_frame_data[0]));
+    _dec->variances=(int *)_ogg_malloc(
+     _dec->state.nfrags*sizeof(_dec->variances[0]));
+    if(_dec->variances==NULL||_dec->pp_frame_data==NULL){
+      _ogg_free(_dec->pp_frame_data);
+      _dec->pp_frame_data=NULL;
+      _ogg_free(_dec->variances);
+      _dec->variances=NULL;
+      return 1; /*Allocation failed.*/
+    }
+    /*Force an update of the PP buffer pointers.*/
+    _dec->pp_frame_state=0;
+  }
+  /*Update the PP buffer pointers if necessary (state 1: luma; 2: all).*/
+  if(_dec->pp_frame_state!=1+(_dec->pp_level>=OC_PP_LEVEL_DEBLOCKC)){
+    if(_dec->pp_level<OC_PP_LEVEL_DEBLOCKC){
+      /*Chroma PP disabled: use only the PP luma plane, stored bottom-up.*/
+      _dec->pp_frame_buf[0].width=_dec->state.info.frame_width;
+      _dec->pp_frame_buf[0].height=_dec->state.info.frame_height;
+      _dec->pp_frame_buf[0].stride=-_dec->pp_frame_buf[0].width;
+      _dec->pp_frame_buf[0].data=_dec->pp_frame_data+
+       (1-_dec->pp_frame_buf[0].height)*(ptrdiff_t)_dec->pp_frame_buf[0].stride;
+    }
+    else{
+      size_t y_sz;
+      size_t c_sz;
+      int    c_w;
+      int    c_h;
+      /*Otherwise, set up pointers to all three PP planes.*/
+      y_sz=_dec->state.info.frame_width*(size_t)_dec->state.info.frame_height;
+      c_w=_dec->state.info.frame_width>>!(_dec->state.info.pixel_fmt&1);
+      c_h=_dec->state.info.frame_height>>!(_dec->state.info.pixel_fmt&2);
+      c_sz=c_w*(size_t)c_h;
+      _dec->pp_frame_buf[0].width=_dec->state.info.frame_width;
+      _dec->pp_frame_buf[0].height=_dec->state.info.frame_height;
+      _dec->pp_frame_buf[0].stride=_dec->pp_frame_buf[0].width;
+      _dec->pp_frame_buf[0].data=_dec->pp_frame_data;
+      _dec->pp_frame_buf[1].width=c_w;
+      _dec->pp_frame_buf[1].height=c_h;
+      _dec->pp_frame_buf[1].stride=_dec->pp_frame_buf[1].width;
+      _dec->pp_frame_buf[1].data=_dec->pp_frame_buf[0].data+y_sz;
+      _dec->pp_frame_buf[2].width=c_w;
+      _dec->pp_frame_buf[2].height=c_h;
+      _dec->pp_frame_buf[2].stride=_dec->pp_frame_buf[2].width;
+      _dec->pp_frame_buf[2].data=_dec->pp_frame_buf[1].data+c_sz;
+      oc_ycbcr_buffer_flip(_dec->pp_frame_buf,_dec->pp_frame_buf);
+    }
+    _dec->pp_frame_state=1+(_dec->pp_level>=OC_PP_LEVEL_DEBLOCKC);
+  }
+  /*If we're not processing chroma, copy the reference frame's chroma planes.*/
+  if(_dec->pp_level<OC_PP_LEVEL_DEBLOCKC){
+    memcpy(_dec->pp_frame_buf+1,
+     _dec->state.ref_frame_bufs[_dec->state.ref_frame_idx[OC_FRAME_SELF]]+1,
+     sizeof(_dec->pp_frame_buf[1])*2);
+  }
+  return 0;
+}
+
+
+/*Working state shared by the per-MCU stages of the decoding pipeline.*/
+typedef struct{
+  int                 bounding_values[256]; /*Set by loop filter init.*/
+  ptrdiff_t           ti[3][64]; /*Next token offset, per plane/coefficient.*/
+  ptrdiff_t           eob_runs[3][64]; /*Pending EOB run, per plane/coeff.*/
+  const ptrdiff_t    *coded_fragis[3]; /*Coded fragment list cursor.*/
+  const ptrdiff_t    *uncoded_fragis[3]; /*Uncoded fragment list cursor.*/
+  ptrdiff_t           ncoded_fragis[3]; /*Coded fragments in current MCU.*/
+  ptrdiff_t           nuncoded_fragis[3]; /*Uncoded fragments in current MCU.*/
+  const ogg_uint16_t *dequant[3][3][2]; /*Tables indexed [pli][qii][qti].*/
+  int                 fragy0[3]; /*First fragment row of the current MCU.*/
+  int                 fragy_end[3]; /*One past its last fragment row.*/
+  int                 pred_last[3][3]; /*Last DC, per plane and reference.*/
+  int                 mcu_nvfrags; /*Set to 4 or 8 by oc_dec_pipeline_init().*/
+  int                 loop_filter; /*Nonzero if loop filter init returned 0.*/
+  int                 pp_level; /*Post-processing level in effect.*/
+}oc_dec_pipeline_state;
+
+
+
+/*Initialize the main decoding pipeline.*/
+static void oc_dec_pipeline_init(oc_dec_ctx *_dec,
+ oc_dec_pipeline_state *_pipe){
+  const ptrdiff_t *coded_fragis;
+  const ptrdiff_t *uncoded_fragis;
+  int              pli;
+  int              qii;
+  int              qti;
+  /*If chroma is sub-sampled in the vertical direction, we have to decode two
+     super block rows of Y' for each super block row of Cb and Cr.*/
+  _pipe->mcu_nvfrags=4<<!(_dec->state.info.pixel_fmt&2);
+  /*Initialize the token and extra bits indices for each plane and
+     coefficient.*/
+  memcpy(_pipe->ti,_dec->ti0,sizeof(_pipe->ti));
+  /*Also copy over the initial EOB run counts.*/
+  memcpy(_pipe->eob_runs,_dec->eob_runs,sizeof(_pipe->eob_runs));
+  /*Per-plane list pointers; the uncoded list runs backward from the end.*/
+  coded_fragis=_dec->state.coded_fragis;
+  uncoded_fragis=coded_fragis+_dec->state.nfrags;
+  for(pli=0;pli<3;pli++){
+    ptrdiff_t ncoded_fragis;
+    _pipe->coded_fragis[pli]=coded_fragis;
+    _pipe->uncoded_fragis[pli]=uncoded_fragis;
+    ncoded_fragis=_dec->state.ncoded_fragis[pli];
+    coded_fragis+=ncoded_fragis;
+    uncoded_fragis+=ncoded_fragis-_dec->state.fplanes[pli].nfrags;
+  }
+  /*Set up condensed quantizer tables.*/
+  for(pli=0;pli<3;pli++){
+    for(qii=0;qii<_dec->state.nqis;qii++){
+      for(qti=0;qti<2;qti++){
+        _pipe->dequant[pli][qii][qti]=
+         _dec->state.dequant_tables[_dec->state.qis[qii]][pli][qti];
+      }
+    }
+  }
+  /*Set the previous DC predictor to 0 for all color planes and frame types.*/
+  memset(_pipe->pred_last,0,sizeof(_pipe->pred_last));
+  /*Initialize the bounding value array for the loop filter.*/
+  _pipe->loop_filter=!oc_state_loop_filter_init(&_dec->state,
+   _pipe->bounding_values);
+  /*Initialize any buffers needed for post-processing.
+    We also save the current post-processing level, to guard against the user
+     changing it from a callback.*/
+  if(!oc_dec_postprocess_init(_dec))_pipe->pp_level=_dec->pp_level;
+  /*If we don't have enough information to post-process, disable it, regardless
+     of the user-requested level.*/
+  else{
+    _pipe->pp_level=OC_PP_LEVEL_DISABLED;
+    memcpy(_dec->pp_frame_buf,
+     _dec->state.ref_frame_bufs[_dec->state.ref_frame_idx[OC_FRAME_SELF]],
+     sizeof(_dec->pp_frame_buf[0])*3);
+  }
+}
+
+/*Undo the DC prediction in a single plane of an MCU (one or two super block
+   rows).
+  As a side effect, the number of coded and uncoded fragments in this plane of
+   the MCU is also computed.*/
+static void oc_dec_dc_unpredict_mcu_plane(oc_dec_ctx *_dec,
+ oc_dec_pipeline_state *_pipe,int _pli){
+  const oc_fragment_plane *fplane;
+  oc_fragment             *frags;
+  int                     *pred_last; /*Last DC value per reference frame.*/
+  ptrdiff_t                ncoded_fragis;
+  ptrdiff_t                fragi;
+  int                      fragx;
+  int                      fragy;
+  int                      fragy0;
+  int                      fragy_end;
+  int                      nhfrags;
+  /*Compute the first and last fragment row of the current MCU for this
+     plane.*/
+  fplane=_dec->state.fplanes+_pli;
+  fragy0=_pipe->fragy0[_pli];
+  fragy_end=_pipe->fragy_end[_pli];
+  nhfrags=fplane->nhfrags;
+  pred_last=_pipe->pred_last[_pli];
+  frags=_dec->state.frags;
+  ncoded_fragis=0;
+  fragi=fplane->froffset+fragy0*(ptrdiff_t)nhfrags;
+  for(fragy=fragy0;fragy<fragy_end;fragy++){
+    if(fragy==0){
+      /*For the first row, all of the cases reduce to just using the previous
+         predictor for the same reference frame.*/
+      for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+        if(frags[fragi].coded){
+          int ref;
+          ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+          pred_last[ref]=frags[fragi].dc+=pred_last[ref];
+          ncoded_fragis++;
+        }
+      }
+    }
+    else{
+      oc_fragment *u_frags; /*u_frags[i] is the fragment above frags[i].*/
+      int          l_ref;   /*Each *_ref is -1 for an unusable neighbor.*/
+      int          ul_ref;
+      int          u_ref;
+      u_frags=frags-nhfrags;
+      l_ref=-1;
+      ul_ref=-1;
+      u_ref=u_frags[fragi].coded?OC_FRAME_FOR_MODE(u_frags[fragi].mb_mode):-1;
+      for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+        int ur_ref;
+        if(fragx+1>=nhfrags)ur_ref=-1;
+        else{
+          ur_ref=u_frags[fragi+1].coded?
+           OC_FRAME_FOR_MODE(u_frags[fragi+1].mb_mode):-1;
+        }
+        if(frags[fragi].coded){
+          int pred;
+          int ref;
+          ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+          /*We break out a separate case based on which of our neighbors use
+             the same reference frame: bit 0 is left, bit 1 up-left, bit 2 up,
+             and bit 3 up-right.
+            This is faster than a generic case, since it reduces lots of poorly
+             predicted jumps to one switch statement, and also lets a number of
+             the multiplications be optimized out by strength reduction.*/
+          switch((l_ref==ref)|(ul_ref==ref)<<1|
+           (u_ref==ref)<<2|(ur_ref==ref)<<3){
+            default:pred=pred_last[ref];break;
+            case  1:
+            case  3:pred=frags[fragi-1].dc;break;
+            case  2:pred=u_frags[fragi-1].dc;break;
+            case  4:
+            case  6:
+            case 12:pred=u_frags[fragi].dc;break;
+            case  5:pred=(frags[fragi-1].dc+u_frags[fragi].dc)/2;break;
+            case  8:pred=u_frags[fragi+1].dc;break;
+            case  9:
+            case 11:
+            case 13:{
+              pred=(75*frags[fragi-1].dc+53*u_frags[fragi+1].dc)/128;
+            }break;
+            case 10:pred=(u_frags[fragi-1].dc+u_frags[fragi+1].dc)/2;break;
+            case 14:{
+              pred=(3*(u_frags[fragi-1].dc+u_frags[fragi+1].dc)
+               +10*u_frags[fragi].dc)/16;
+            }break;
+            case  7:
+            case 15:{
+              int p0; /*Left neighbor.*/
+              int p1; /*Up-left neighbor.*/
+              int p2; /*Up neighbor.*/
+              p0=frags[fragi-1].dc;
+              p1=u_frags[fragi-1].dc;
+              p2=u_frags[fragi].dc;
+              pred=(29*(p0+p2)-26*p1)/32;
+              if(abs(pred-p2)>128)pred=p2;
+              else if(abs(pred-p0)>128)pred=p0;
+              else if(abs(pred-p1)>128)pred=p1;
+            }break;
+          }
+          pred_last[ref]=frags[fragi].dc+=pred;
+          ncoded_fragis++;
+          l_ref=ref;
+        }
+        else l_ref=-1;
+        ul_ref=u_ref;
+        u_ref=ur_ref;
+      }
+    }
+  }
+  _pipe->ncoded_fragis[_pli]=ncoded_fragis;
+  /*Also save the number of uncoded fragments so we know how many to copy.*/
+  _pipe->nuncoded_fragis[_pli]=
+   (fragy_end-fragy0)*(ptrdiff_t)nhfrags-ncoded_fragis;
+}
+
+/*Reconstructs all coded fragments in a single MCU (one or two super block
+   rows).
+  This requires that each coded fragment have a proper macro block mode and
+   motion vector (if not in INTRA mode), and have its DC value decoded, with
+   the DC prediction process reversed, and the number of coded and uncoded
+   fragments in this plane of the MCU be counted.
+  The token lists for each color plane and coefficient should also be filled
+   in, along with initial token offsets, extra bits offsets, and EOB run
+   counts.*/
+static void oc_dec_frags_recon_mcu_plane(oc_dec_ctx *_dec,
+ oc_dec_pipeline_state *_pipe,int _pli){
+  unsigned char       *dct_tokens;
+  const unsigned char *dct_fzig_zag;
+  ogg_uint16_t         dc_quant[2]; /*DC quantizer, indexed by qti.*/
+  const oc_fragment   *frags;
+  const ptrdiff_t     *coded_fragis;
+  ptrdiff_t            ncoded_fragis;
+  ptrdiff_t            fragii;
+  ptrdiff_t           *ti;       /*Next token offset for each coefficient.*/
+  ptrdiff_t           *eob_runs; /*Pending EOB run for each coefficient.*/
+  int                  qti;
+  dct_tokens=_dec->dct_tokens;
+  dct_fzig_zag=_dec->state.opt_data.dct_fzig_zag;
+  frags=_dec->state.frags;
+  coded_fragis=_pipe->coded_fragis[_pli];
+  ncoded_fragis=_pipe->ncoded_fragis[_pli];
+  ti=_pipe->ti[_pli];
+  eob_runs=_pipe->eob_runs[_pli];
+  for(qti=0;qti<2;qti++)dc_quant[qti]=_pipe->dequant[_pli][0][qti][0];
+  for(fragii=0;fragii<ncoded_fragis;fragii++){
+    /*This array is made one element larger because the zig-zag index array
+       uses the final element as a dumping ground for out-of-range indices
+       to protect us from buffer overflow.*/
+    OC_ALIGN8(ogg_int16_t dct_coeffs[65]);
+    const ogg_uint16_t *ac_quant;
+    ptrdiff_t           fragi;
+    int                 last_zzi;
+    int                 zzi;
+    fragi=coded_fragis[fragii];
+    for(zzi=0;zzi<64;zzi++)dct_coeffs[zzi]=0;
+    qti=frags[fragi].mb_mode!=OC_MODE_INTRA;
+    ac_quant=_pipe->dequant[_pli][frags[fragi].qii][qti];
+    /*Decode the AC coefficients.*/
+    for(zzi=0;zzi<64;){
+      int token;
+      last_zzi=zzi;
+      if(eob_runs[zzi]){
+        eob_runs[zzi]--;
+        break;
+      }
+      else{
+        ptrdiff_t eob;
+        int       cw;
+        int       rlen;
+        int       coeff;
+        ptrdiff_t lti; /*Same width as ti[], so large offsets don't truncate.*/
+        lti=ti[zzi];
+        token=dct_tokens[lti++];
+        cw=OC_DCT_CODE_WORD[token];
+        /*These parts could be done branchless, but the branches are fairly
+           predictable and the C code translates into more than a few
+           instructions, so it's worth it to avoid them.*/
+        if(OC_DCT_TOKEN_NEEDS_MORE(token)){
+          cw+=dct_tokens[lti++]<<OC_DCT_TOKEN_EB_POS(token);
+        }
+        eob=cw>>OC_DCT_CW_EOB_SHIFT&0xFFF;
+        if(token==OC_DCT_TOKEN_FAT_EOB){
+          eob+=dct_tokens[lti++]<<8;
+          if(eob==0)eob=OC_DCT_EOB_FINISH;
+        }
+        rlen=(unsigned char)(cw>>OC_DCT_CW_RLEN_SHIFT);
+        cw^=-(cw&1<<OC_DCT_CW_FLIP_BIT);
+        coeff=cw>>OC_DCT_CW_MAG_SHIFT;
+        eob_runs[zzi]=eob;
+        ti[zzi]=lti;
+        zzi+=rlen;
+        dct_coeffs[dct_fzig_zag[zzi]]=(ogg_int16_t)(coeff*(int)ac_quant[zzi]);
+        zzi+=!eob;
+      }
+    }
+    /*TODO: zzi should be exactly 64 here.
+      If it's not, we should report some kind of warning.*/
+    zzi=OC_MINI(zzi,64);
+    dct_coeffs[0]=(ogg_int16_t)frags[fragi].dc;
+    /*last_zzi is always initialized.
+      If your compiler thinks otherwise, it is dumb.*/
+    oc_state_frag_recon(&_dec->state,fragi,_pli,
+     dct_coeffs,last_zzi,dc_quant[qti]);
+  }
+  _pipe->coded_fragis[_pli]+=ncoded_fragis;
+  /*Right now the reconstructed MCU has only the coded blocks in it.*/
+  /*TODO: We make the decision here to always copy the uncoded blocks into it
+     from the reference frame.
+    We could also copy the coded blocks back over the reference frame, if we
+     wait for an additional MCU to be decoded, which might be faster if only a
+     small number of blocks are coded.
+    However, this introduces more latency, creating a larger cache footprint.
+    It's unknown which decision is better, but this one results in simpler
+     code, and the hard case (high bitrate, high resolution) is handled
+     correctly.*/
+  /*Copy the uncoded blocks from the previous reference frame.*/
+  _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
+  oc_state_frag_copy_list(&_dec->state,_pipe->uncoded_fragis[_pli],
+   _pipe->nuncoded_fragis[_pli],OC_FRAME_SELF,OC_FRAME_PREV,_pli);
+}
+
+/*Filter a horizontal block edge.*/
+static void oc_filter_hedge(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src,int _src_ystride,int _qstep,int _flimit,
+ int *_variance0,int *_variance1){
+  unsigned char       *out;
+  const unsigned char *in;
+  int                  p[10];
+  int                  act_a;
+  int                  act_b;
+  int                  step;
+  int                  col;
+  int                  i;
+  /*_src points at the first of ten rows of taps; the block edge lies between
+     taps 4 and 5.
+    Eight output rows, corresponding to taps 1..8, are written to _dst.
+    The activity on each side of the edge is added to *_variance0 and
+     *_variance1, respectively.*/
+  /*Each of the 8 columns is processed independently.*/
+  for(col=0;col<8;col++){
+    /*Gather the ten vertically adjacent source taps of this column.*/
+    in=_src+col;
+    for(i=0;i<10;i++,in+=_src_ystride)p[i]=*in;
+    /*Sum the absolute differences on either side of the edge.*/
+    act_a=0;
+    act_b=0;
+    for(i=0;i<4;i++){
+      act_a+=abs(p[i+1]-p[i]);
+      act_b+=abs(p[i+5]-p[i+6]);
+    }
+    /*Accumulate the activity, saturated at 255, for the caller.*/
+    *_variance0+=OC_MINI(255,act_a);
+    *_variance1+=OC_MINI(255,act_b);
+    /*The jump across the edge is compared against _qstep in both directions.*/
+    step=p[5]-p[4];
+    out=_dst+col;
+    if(act_a>=_flimit||act_b>=_flimit||step>=_qstep||-step>=_qstep){
+      /*Too much activity or too large a jump: copy taps 1..8 unchanged.*/
+      for(i=1;i<=8;i++,out+=_dst_ystride)*out=(unsigned char)p[i];
+      continue;
+    }
+    /*Smooth across the edge with low-pass kernels whose weights sum to 8.*/
+    *out=(unsigned char)(p[0]*3+p[1]*2+p[2]+p[3]+p[4]+4>>3);
+    out+=_dst_ystride;
+    *out=(unsigned char)(p[0]*2+p[1]+p[2]*2+p[3]+p[4]+p[5]+4>>3);
+    out+=_dst_ystride;
+    for(i=0;i<4;i++,out+=_dst_ystride){
+      *out=(unsigned char)(p[i]+p[i+1]+p[i+2]+p[i+3]*2+
+       p[i+4]+p[i+5]+p[i+6]+4>>3);
+    }
+    *out=(unsigned char)(p[4]+p[5]+p[6]+p[7]*2+p[8]+p[9]*2+4>>3);
+    out+=_dst_ystride;
+    *out=(unsigned char)(p[5]+p[6]+p[7]+p[8]*2+p[9]*3+4>>3);
+  }
+}
+
+/*Filter a vertical block edge.*/
+static void oc_filter_vedge(unsigned char *_dst,int _dst_ystride,
+ int _qstep,int _flimit,int *_variances){
+  unsigned char *row;
+  int            p[10];
+  int            act_l;
+  int            act_r;
+  int            step;
+  int            i;
+  int            y;
+  /*Each of the 8 rows is filtered independently, in place.*/
+  for(y=0,row=_dst;y<8;y++,row+=_dst_ystride){
+    /*Gather the ten taps row[-1..8] before any of them are overwritten.*/
+    for(i=0;i<10;i++)p[i]=row[i-1];
+    /*Sum the absolute differences on either side of the edge.*/
+    act_l=0;
+    act_r=0;
+    for(i=0;i<4;i++){
+      act_l+=abs(p[i+1]-p[i]);
+      act_r+=abs(p[i+5]-p[i+6]);
+    }
+    _variances[0]+=OC_MINI(255,act_l);
+    _variances[1]+=OC_MINI(255,act_r);
+    step=p[5]-p[4];
+    /*Leave the row untouched if it is too active or the jump is too large.*/
+    if(act_l>=_flimit||act_r>=_flimit||step>=_qstep||-step>=_qstep)continue;
+    row[0]=(unsigned char)(p[0]*3+p[1]*2+p[2]+p[3]+p[4]+4>>3);
+    row[1]=(unsigned char)(p[0]*2+p[1]+p[2]*2+p[3]+p[4]+p[5]+4>>3);
+    for(i=0;i<4;i++){
+      row[i+2]=(unsigned char)(p[i]+p[i+1]+p[i+2]+p[i+3]*2+
+       p[i+4]+p[i+5]+p[i+6]+4>>3);
+    }
+    row[6]=(unsigned char)(p[4]+p[5]+p[6]+p[7]*2+p[8]+p[9]*2+4>>3);
+    row[7]=(unsigned char)(p[5]+p[6]+p[7]+p[8]*2+p[9]*3+4>>3);
+  }
+}
+/*Deblocks fragment rows _fragy0.._fragy_end-1 of plane _pli into _dst.*/
+static void oc_dec_deblock_frag_rows(oc_dec_ctx *_dec,
+ th_img_plane *_dst,th_img_plane *_src,int _pli,int _fragy0,
+ int _fragy_end){
+  oc_fragment_plane   *fplane;
+  int                 *variance; /*Per-fragment activity accumulators.*/
+  unsigned char       *dc_qi;    /*Per-fragment index into pp_dc_scale[].*/
+  unsigned char       *dst;
+  const unsigned char *src;
+  ptrdiff_t            froffset;
+  int                  dst_ystride;
+  int                  src_ystride;
+  int                  nhfrags;
+  int                  width;
+  int                  notstart; /*Nonzero if _fragy0>0.*/
+  int                  notdone;  /*Nonzero if rows remain after _fragy_end.*/
+  int                  flimit;
+  int                  qstep;
+  int                  y_end;
+  int                  y;
+  int                  x;
+  _dst+=_pli;
+  _src+=_pli;
+  fplane=_dec->state.fplanes+_pli;
+  nhfrags=fplane->nhfrags;
+  froffset=fplane->froffset+_fragy0*(ptrdiff_t)nhfrags;
+  variance=_dec->variances+froffset;
+  dc_qi=_dec->dc_qis+froffset;
+  notstart=_fragy0>0;
+  notdone=_fragy_end<fplane->nvfrags;
+  /*We want to clear an extra row of variances, except at the end.*/
+  memset(variance+(nhfrags&-notstart),0,
+   (_fragy_end+notdone-_fragy0-notstart)*(nhfrags*sizeof(variance[0])));
+  /*Except for the first time, we want to point to the middle of the row.*/
+  y=(_fragy0<<3)+(notstart<<2);
+  dst_ystride=_dst->stride;
+  src_ystride=_src->stride;
+  dst=_dst->data+y*(ptrdiff_t)dst_ystride;
+  src=_src->data+y*(ptrdiff_t)src_ystride;
+  width=_dst->width;
+  for(;y<4;y++){
+    memcpy(dst,src,width*sizeof(dst[0]));
+    dst+=dst_ystride;
+    src+=src_ystride;
+  }
+  /*We also want to skip the last row in the frame for this loop.*/
+  y_end=_fragy_end-!notdone<<3;
+  for(;y<y_end;y+=8){
+    qstep=_dec->pp_dc_scale[*dc_qi];
+    flimit=(qstep*3)>>2;
+    oc_filter_hedge(dst,dst_ystride,src-src_ystride,src_ystride,
+     qstep,flimit,variance,variance+nhfrags);
+    variance++;
+    dc_qi++;
+    for(x=8;x<width;x+=8){
+      qstep=_dec->pp_dc_scale[*dc_qi];
+      flimit=(qstep*3)>>2;
+      oc_filter_hedge(dst+x,dst_ystride,src+x-src_ystride,src_ystride,
+       qstep,flimit,variance,variance+nhfrags);
+      oc_filter_vedge(dst+x-(dst_ystride<<2)-4,dst_ystride,
+       qstep,flimit,variance-1);
+      variance++;
+      dc_qi++;
+    }
+    dst+=dst_ystride<<3;
+    src+=src_ystride<<3;
+  }
+  /*And finally, handle the last row in the frame, if it's in the range.*/
+  if(!notdone){
+    int height;
+    height=_dst->height;
+    for(;y<height;y++){
+      memcpy(dst,src,width*sizeof(dst[0]));
+      dst+=dst_ystride;
+      src+=src_ystride;
+    }
+    /*Filter the last row of vertical block edges.*/
+    dc_qi++;
+    for(x=8;x<width;x+=8){
+      qstep=_dec->pp_dc_scale[*dc_qi++];
+      flimit=(qstep*3)>>2;
+      oc_filter_vedge(dst+x-(dst_ystride<<3)-4,dst_ystride,
+       qstep,flimit,variance++);
+    }
+  }
+}
+/*Derings one 8x8 block in place; _b flags the sides on the plane border.*/
+static void oc_dering_block(unsigned char *_idata,int _ystride,int _b,
+ int _dc_scale,int _sharp_mod,int _strong){
+  static const unsigned char OC_MOD_MAX[2]={24,32};
+  static const unsigned char OC_MOD_SHIFT[2]={1,0};
+  const unsigned char *psrc;
+  const unsigned char *src;
+  const unsigned char *nsrc;
+  unsigned char       *dst;
+  int                  vmod[72]; /*9 rows x 8 columns of vertical weights.*/
+  int                  hmod[72]; /*9 columns x 8 rows of horizontal weights.*/
+  int                  mod_hi;
+  int                  by;
+  int                  bx;
+  mod_hi=OC_MINI(3*_dc_scale,OC_MOD_MAX[_strong]);
+  dst=_idata;
+  src=dst;
+  psrc=src-(_ystride&-!(_b&4)); /*Row above, or this row at the top edge.*/
+  for(by=0;by<9;by++){
+    for(bx=0;bx<8;bx++){
+      int mod;
+      mod=32+_dc_scale-(abs(src[bx]-psrc[bx])<<OC_MOD_SHIFT[_strong]);
+      vmod[(by<<3)+bx]=mod<-64?_sharp_mod:OC_CLAMPI(0,mod,mod_hi);
+    }
+    psrc=src;
+    src+=_ystride&-(!(_b&8)|by<7); /*Stay on row 7 at the bottom edge.*/
+  }
+  nsrc=dst;
+  psrc=dst-!(_b&1); /*Column to the left, or this one at the left edge.*/
+  for(bx=0;bx<9;bx++){
+    src=nsrc;
+    for(by=0;by<8;by++){
+      int mod;
+      mod=32+_dc_scale-(abs(*src-*psrc)<<OC_MOD_SHIFT[_strong]);
+      hmod[(bx<<3)+by]=mod<-64?_sharp_mod:OC_CLAMPI(0,mod,mod_hi);
+      psrc+=_ystride;
+      src+=_ystride;
+    }
+    psrc=nsrc;
+    nsrc+=!(_b&2)|bx<7; /*Stay on column 7 at the right edge.*/
+  }
+  src=dst;
+  psrc=src-(_ystride&-!(_b&4));
+  nsrc=src+_ystride;
+  for(by=0;by<8;by++){
+    int a; /*Center weight: 128 minus the four neighbor weights.*/
+    int b; /*Weighted neighbor sum, plus 64 to round the final >>7.*/
+    int w;
+    a=128;
+    b=64;
+    w=hmod[by];
+    a-=w;
+    b+=w**(src-!(_b&1));
+    w=vmod[by<<3];
+    a-=w;
+    b+=w*psrc[0];
+    w=vmod[by+1<<3];
+    a-=w;
+    b+=w*nsrc[0];
+    w=hmod[(1<<3)+by];
+    a-=w;
+    b+=w*src[1];
+    dst[0]=OC_CLAMP255(a*src[0]+b>>7);
+    for(bx=1;bx<7;bx++){
+      a=128;
+      b=64;
+      w=hmod[(bx<<3)+by];
+      a-=w;
+      b+=w*src[bx-1];
+      w=vmod[(by<<3)+bx];
+      a-=w;
+      b+=w*psrc[bx];
+      w=vmod[(by+1<<3)+bx];
+      a-=w;
+      b+=w*nsrc[bx];
+      w=hmod[(bx+1<<3)+by];
+      a-=w;
+      b+=w*src[bx+1];
+      dst[bx]=OC_CLAMP255(a*src[bx]+b>>7);
+    }
+    a=128;
+    b=64;
+    w=hmod[(7<<3)+by];
+    a-=w;
+    b+=w*src[6];
+    w=vmod[(by<<3)+7];
+    a-=w;
+    b+=w*psrc[7];
+    w=vmod[(by+1<<3)+7];
+    a-=w;
+    b+=w*nsrc[7];
+    w=hmod[(8<<3)+by];
+    a-=w;
+    b+=w*src[7+!(_b&2)];
+    dst[7]=OC_CLAMP255(a*src[7]+b>>7);
+    dst+=_ystride;
+    psrc=src;
+    src=nsrc;
+    nsrc+=_ystride&-(!(_b&8)|by<6);
+  }
+}
+
+#define OC_DERING_THRESH1 (384) /*Above this: one weak dering pass.*/
+#define OC_DERING_THRESH2 (4*OC_DERING_THRESH1) /*Above this: one strong pass.*/
+#define OC_DERING_THRESH3 (5*OC_DERING_THRESH1) /*Luma threshold, strong mode.*/
+#define OC_DERING_THRESH4 (10*OC_DERING_THRESH1) /*Chroma/neighbor threshold.*/
+/*Derings fragment rows _fragy0.._fragy_end-1 of plane _pli of _img.*/
+static void oc_dec_dering_frag_rows(oc_dec_ctx *_dec,th_img_plane *_img,
+ int _pli,int _fragy0,int _fragy_end){
+  th_img_plane      *iplane;
+  oc_fragment_plane *fplane;
+  oc_fragment       *frag;
+  int               *variance;
+  unsigned char     *idata;
+  ptrdiff_t          froffset;
+  int                ystride;
+  int                nhfrags;
+  int                sthresh; /*Variance above which strong mode applies.*/
+  int                strong;  /*Nonzero if pp_level enables strong dering.*/
+  int                y_end;
+  int                width;
+  int                height;
+  int                y;
+  int                x;
+  iplane=_img+_pli;
+  fplane=_dec->state.fplanes+_pli;
+  nhfrags=fplane->nhfrags;
+  froffset=fplane->froffset+_fragy0*(ptrdiff_t)nhfrags;
+  variance=_dec->variances+froffset;
+  frag=_dec->state.frags+froffset;
+  strong=_dec->pp_level>=(_pli?OC_PP_LEVEL_SDERINGC:OC_PP_LEVEL_SDERINGY);
+  sthresh=_pli?OC_DERING_THRESH4:OC_DERING_THRESH3;
+  y=_fragy0<<3;
+  ystride=iplane->stride;
+  idata=iplane->data+y*(ptrdiff_t)ystride;
+  y_end=_fragy_end<<3;
+  width=iplane->width;
+  height=iplane->height;
+  for(;y<y_end;y+=8){
+    for(x=0;x<width;x+=8){
+      int b; /*Border flags: 1 left, 2 right, 4 top, 8 bottom.*/
+      int qi;
+      int var;
+      qi=_dec->state.qis[frag->qii];
+      var=*variance;
+      b=(x<=0)|(x+8>=width)<<1|(y<=0)<<2|(y+8>=height)<<3;
+      if(strong&&var>sthresh){
+        oc_dering_block(idata+x,ystride,b,
+         _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],1);
+        if(_pli||!(b&1)&&*(variance-1)>OC_DERING_THRESH4||
+         !(b&2)&&variance[1]>OC_DERING_THRESH4||
+         !(b&4)&&*(variance-nhfrags)>OC_DERING_THRESH4||
+         !(b&8)&&variance[nhfrags]>OC_DERING_THRESH4){
+          oc_dering_block(idata+x,ystride,b,
+           _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],1);
+          oc_dering_block(idata+x,ystride,b,
+           _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],1);
+        }
+      }
+      else if(var>OC_DERING_THRESH2){
+        oc_dering_block(idata+x,ystride,b,
+         _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],1);
+      }
+      else if(var>OC_DERING_THRESH1){
+        oc_dering_block(idata+x,ystride,b,
+         _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],0);
+      }
+      frag++;
+      variance++;
+    }
+    idata+=ystride<<3;
+  }
+}
+
+
+/*Allocates and initializes a decoder; returns NULL on any failure.*/
+th_dec_ctx *th_decode_alloc(const th_info *_info,const th_setup_info *_setup){
+  oc_dec_ctx *ctx;
+  if(_info==NULL||_setup==NULL)return NULL;
+  ctx=_ogg_malloc(sizeof(*ctx));
+  if(ctx!=NULL&&oc_dec_init(ctx,_info,_setup)>=0){
+    ctx->state.curframe_num=0;
+    return ctx;
+  }
+  _ogg_free(ctx);
+  return NULL;
+}
+
+void th_decode_free(th_dec_ctx *_dec){
+  /*A NULL context is accepted and ignored.*/
+  if(_dec==NULL)return;
+  oc_dec_clear(_dec);
+  _ogg_free(_dec);
+}
+/*Handles a decoder control request; returns 0 on success or a TH_E* code.*/
+int th_decode_ctl(th_dec_ctx *_dec,int _req,void *_buf,size_t _buf_sz){
+  switch(_req){
+  case TH_DECCTL_GET_PPLEVEL_MAX:{
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(int))return TH_EINVAL;
+    (*(int *)_buf)=OC_PP_LEVEL_MAX;
+    return 0;
+  }break;
+  case TH_DECCTL_SET_PPLEVEL:{
+    int pp_level;
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(int))return TH_EINVAL;
+    pp_level=*(int *)_buf;
+    if(pp_level<0||pp_level>OC_PP_LEVEL_MAX)return TH_EINVAL;
+    _dec->pp_level=pp_level;
+    return 0;
+  }break;
+  case TH_DECCTL_SET_GRANPOS:{
+    ogg_int64_t granpos;
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(ogg_int64_t))return TH_EINVAL;
+    granpos=*(ogg_int64_t *)_buf;
+    if(granpos<0)return TH_EINVAL;
+    _dec->state.granpos=granpos;
+    _dec->state.keyframe_num=(granpos>>_dec->state.info.keyframe_granule_shift)
+     -_dec->state.granpos_bias;
+    /*Build the mask in 64 bits: 1<<shift overflows int once shift reaches 31.*/
+    _dec->state.curframe_num=_dec->state.keyframe_num
+     +(granpos&((ogg_int64_t)1<<_dec->state.info.keyframe_granule_shift)-1);
+    return 0;
+  }break;
+  case TH_DECCTL_SET_STRIPE_CB:{
+    th_stripe_callback *cb;
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(th_stripe_callback))return TH_EINVAL;
+    cb=(th_stripe_callback *)_buf;
+    _dec->stripe_cb.ctx=cb->ctx;
+    _dec->stripe_cb.stripe_decoded=cb->stripe_decoded;
+    return 0;
+  }break;
+#ifdef HAVE_CAIRO
+  case TH_DECCTL_SET_TELEMETRY_MBMODE:{
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(int))return TH_EINVAL;
+    _dec->telemetry=1;
+    _dec->telemetry_mbmode=*(int *)_buf;
+    return 0;
+  }break;
+  case TH_DECCTL_SET_TELEMETRY_MV:{
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(int))return TH_EINVAL;
+    _dec->telemetry=1;
+    _dec->telemetry_mv=*(int *)_buf;
+    return 0;
+  }break;
+  case TH_DECCTL_SET_TELEMETRY_QI:{
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(int))return TH_EINVAL;
+    _dec->telemetry=1;
+    _dec->telemetry_qi=*(int *)_buf;
+    return 0;
+  }break;
+  case TH_DECCTL_SET_TELEMETRY_BITS:{
+    if(_dec==NULL||_buf==NULL)return TH_EFAULT;
+    if(_buf_sz!=sizeof(int))return TH_EINVAL;
+    _dec->telemetry=1;
+    _dec->telemetry_bits=*(int *)_buf;
+    return 0;
+  }break;
+#endif
+  default:return TH_EIMPL;
+  }
+}
+
+/*We're decoding an INTER frame, but have no initialized reference
+   buffers (i.e., decoding did not start on a key frame).
+  We initialize them to a solid gray here.*/
+static void oc_dec_init_dummy_frame(th_dec_ctx *_dec){
+  const th_info *hdr;
+  int            luma_w;
+  int            luma_h;
+  int            chroma_w;
+  int            chroma_h;
+  /*Buffer 0 stands in for both references; the new frame goes in buffer 1.*/
+  _dec->state.ref_frame_idx[OC_FRAME_GOLD]=0;
+  _dec->state.ref_frame_idx[OC_FRAME_PREV]=0;
+  _dec->state.ref_frame_idx[OC_FRAME_SELF]=1;
+  hdr=&_dec->state.info;
+  /*Padded plane sizes; chroma is halved where the pixel_fmt bit is clear.*/
+  luma_w=hdr->frame_width+2*OC_UMV_PADDING;
+  luma_h=hdr->frame_height+2*OC_UMV_PADDING;
+  chroma_w=luma_w>>!(hdr->pixel_fmt&1);
+  chroma_h=luma_h>>!(hdr->pixel_fmt&2);
+  /*Fill the luma plane and both chroma planes of buffer 0 with 0x80.*/
+  memset(_dec->state.ref_frame_data[0],0x80,
+   luma_w*(size_t)luma_h+2*(chroma_w*(size_t)chroma_h));
+}
+
+/*Decodes one video data packet and advances the decoder by one frame.
+  _dec:     The decoder context.
+  _op:      The packet to decode; a zero-byte packet is a dropped frame.
+  _granpos: If non-NULL, receives the granule position of the frame.
+  Return: 0 on success, TH_DUPFRAME for a dropped frame, TH_EFAULT if _dec or
+           _op is NULL, or the negative oc_dec_frame_header_unpack() result.*/
+int th_decode_packetin(th_dec_ctx *_dec,const ogg_packet *_op,
+ ogg_int64_t *_granpos){
+  int ret;
+  if(_dec==NULL||_op==NULL)return TH_EFAULT;
+  /*A completely empty packet indicates a dropped frame and is treated exactly
+     like an inter frame with no coded blocks.
+    Only proceed if we have a non-empty packet.*/
+  if(_op->bytes!=0){
+    oc_dec_pipeline_state pipe;
+    th_ycbcr_buffer       stripe_buf;
+    int                   stripe_fragy;
+    int                   refi;
+    int                   pli;
+    int                   notstart;
+    int                   notdone;
+    oc_pack_readinit(&_dec->opb,_op->packet,_op->bytes);
+#if defined(HAVE_CAIRO)
+    _dec->telemetry_frame_bytes=_op->bytes;
+#endif
+    ret=oc_dec_frame_header_unpack(_dec);
+    if(ret<0)return ret;
+    /*Select a buffer for the reconstructed version of this frame: the dummy
+       frame's SELF buffer, or else the lowest index not used by GOLD or PREV.*/
+    if(_dec->state.frame_type!=OC_INTRA_FRAME&&
+     (_dec->state.ref_frame_idx[OC_FRAME_GOLD]<0||
+     _dec->state.ref_frame_idx[OC_FRAME_PREV]<0)){
+      /*No reference frames yet!*/
+      oc_dec_init_dummy_frame(_dec);
+      refi=_dec->state.ref_frame_idx[OC_FRAME_SELF];
+    }
+    else{
+      for(refi=0;refi==_dec->state.ref_frame_idx[OC_FRAME_GOLD]||
+       refi==_dec->state.ref_frame_idx[OC_FRAME_PREV];refi++);
+      _dec->state.ref_frame_idx[OC_FRAME_SELF]=refi;
+    }
+    if(_dec->state.frame_type==OC_INTRA_FRAME){
+      oc_dec_mark_all_intra(_dec);
+      _dec->state.keyframe_num=_dec->state.curframe_num;
+#if defined(HAVE_CAIRO)
+      _dec->telemetry_coding_bytes=
+       _dec->telemetry_mode_bytes=
+       _dec->telemetry_mv_bytes=oc_pack_bytes_left(&_dec->opb);
+#endif
+    }
+    else{
+      oc_dec_coded_flags_unpack(_dec);
+#if defined(HAVE_CAIRO)
+      _dec->telemetry_coding_bytes=oc_pack_bytes_left(&_dec->opb);
+#endif
+      oc_dec_mb_modes_unpack(_dec);
+#if defined(HAVE_CAIRO)
+      _dec->telemetry_mode_bytes=oc_pack_bytes_left(&_dec->opb);
+#endif
+      oc_dec_mv_unpack_and_frag_modes_fill(_dec);
+#if defined(HAVE_CAIRO)
+      _dec->telemetry_mv_bytes=oc_pack_bytes_left(&_dec->opb);
+#endif
+    }
+    oc_dec_block_qis_unpack(_dec);
+#if defined(HAVE_CAIRO)
+    _dec->telemetry_qi_bytes=oc_pack_bytes_left(&_dec->opb);
+#endif
+    oc_dec_residual_tokens_unpack(_dec);
+    /*Update granule position.
+      This must be done before the striped decode callbacks so that the
+       application knows what to do with the frame data.*/
+    _dec->state.granpos=(_dec->state.keyframe_num+_dec->state.granpos_bias<<
+     _dec->state.info.keyframe_granule_shift)
+     +(_dec->state.curframe_num-_dec->state.keyframe_num);
+    _dec->state.curframe_num++;
+    if(_granpos!=NULL)*_granpos=_dec->state.granpos;
+    /*The remaining steps -- DC prediction reversal, reconstructing coded
+       fragments, copying uncoded fragments, loop filtering, extending borders,
+       and out-of-loop post-processing -- are pipelined.
+      I.e., DC prediction reversal, reconstruction, and uncoded fragment
+       copying are done for one or two super block rows, then loop filtering is
+       run as far as it can, then border copying, then post-processing.
+      For 4:2:0 video a Minimum Codable Unit or MCU contains two luma super
+       block rows, and one chroma.
+      Otherwise, an MCU consists of one super block row from each plane.
+      Inside each MCU, we perform all of the steps on one color plane before
+       moving on to the next.
+      After reconstruction, the additional filtering stages introduce a delay
+       since they need some pixels from the next fragment row.
+      Thus the actual number of decoded rows available is slightly smaller for
+       the first MCU, and slightly larger for the last.
+      This lets us operate on the data while it is still in cache, resulting
+       in big performance improvements; the application callback lets further
+       processing (blitting, color conversion, etc.) use the cached data too.*/
+    oc_dec_pipeline_init(_dec,&pipe);
+    oc_ycbcr_buffer_flip(stripe_buf,_dec->pp_frame_buf);
+    notstart=0; /*Set to 1 after the first MCU; adds to each start delay.*/
+    notdone=1; /*Cleared for the last MCU; adds to each end delay.*/
+    for(stripe_fragy=0;notdone;stripe_fragy+=pipe.mcu_nvfrags){
+      int avail_fragy0;
+      int avail_fragy_end;
+      avail_fragy0=avail_fragy_end=_dec->state.fplanes[0].nvfrags;
+      notdone=stripe_fragy+pipe.mcu_nvfrags<avail_fragy_end;
+      for(pli=0;pli<3;pli++){
+        oc_fragment_plane *fplane;
+        int                frag_shift;
+        int                pp_offset;
+        int                sdelay;
+        int                edelay;
+        fplane=_dec->state.fplanes+pli;
+        /*Compute the first and last fragment row of the current MCU for this
+           plane.*/
+        frag_shift=pli!=0&&!(_dec->state.info.pixel_fmt&2);
+        pipe.fragy0[pli]=stripe_fragy>>frag_shift;
+        pipe.fragy_end[pli]=OC_MINI(fplane->nvfrags,
+         pipe.fragy0[pli]+(pipe.mcu_nvfrags>>frag_shift));
+        oc_dec_dc_unpredict_mcu_plane(_dec,&pipe,pli);
+        oc_dec_frags_recon_mcu_plane(_dec,&pipe,pli);
+        sdelay=edelay=0;
+        if(pipe.loop_filter){
+          sdelay+=notstart;
+          edelay+=notdone;
+          oc_state_loop_filter_frag_rows(&_dec->state,pipe.bounding_values,
+           refi,pli,pipe.fragy0[pli]-sdelay,pipe.fragy_end[pli]-edelay);
+        }
+        /*To fill the borders, we have an additional two pixel delay, since a
+           fragment in the next row could filter its top edge, using two pixels
+           from a fragment in this row.
+          But there's no reason to delay a full fragment between the two.*/
+        oc_state_borders_fill_rows(&_dec->state,refi,pli,
+         (pipe.fragy0[pli]-sdelay<<3)-(sdelay<<1),
+         (pipe.fragy_end[pli]-edelay<<3)-(edelay<<1));
+        /*Out-of-loop post-processing.*/
+        pp_offset=3*(pli!=0);
+        if(pipe.pp_level>=OC_PP_LEVEL_DEBLOCKY+pp_offset){
+          /*Perform de-blocking in one plane.*/
+          sdelay+=notstart;
+          edelay+=notdone;
+          oc_dec_deblock_frag_rows(_dec,_dec->pp_frame_buf,
+           _dec->state.ref_frame_bufs[refi],pli,
+           pipe.fragy0[pli]-sdelay,pipe.fragy_end[pli]-edelay);
+          if(pipe.pp_level>=OC_PP_LEVEL_DERINGY+pp_offset){
+            /*Perform de-ringing in one plane.*/
+            sdelay+=notstart;
+            edelay+=notdone;
+            oc_dec_dering_frag_rows(_dec,_dec->pp_frame_buf,pli,
+             pipe.fragy0[pli]-sdelay,pipe.fragy_end[pli]-edelay);
+          }
+        }
+        /*If no post-processing is done, we still need to delay a row for the
+           loop filter, thanks to the strange filtering order VP3 chose.*/
+        else if(pipe.loop_filter){
+          sdelay+=notstart;
+          edelay+=notdone;
+        }
+        /*Compute the intersection of the available rows in all planes.
+          If chroma is sub-sampled, each of its delays counts double, but luma
+           might have more post-processing filters enabled than chroma, so we
+           don't know up front which one is the limiting factor.*/
+        avail_fragy0=OC_MINI(avail_fragy0,pipe.fragy0[pli]-sdelay<<frag_shift);
+        avail_fragy_end=OC_MINI(avail_fragy_end,
+         pipe.fragy_end[pli]-edelay<<frag_shift);
+      }
+      if(_dec->stripe_cb.stripe_decoded!=NULL){
+        /*The callback might want to use the FPU, so make sure it can.
+          Not restoring it until now violates all kinds of ABI restrictions,
+           but none matter since we don't use floating point ourselves.*/
+        oc_restore_fpu(&_dec->state);
+        /*Make the callback, ensuring we flip the sense of the "start" and
+           "end" of the available region upside down.*/
+        (*_dec->stripe_cb.stripe_decoded)(_dec->stripe_cb.ctx,stripe_buf,
+         _dec->state.fplanes[0].nvfrags-avail_fragy_end,
+         _dec->state.fplanes[0].nvfrags-avail_fragy0);
+      }
+      notstart=1;
+    }
+    /*Finish filling in the reference frame borders.*/
+    for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_dec->state,refi,pli);
+    /*Update the reference frame indices.*/
+    if(_dec->state.frame_type==OC_INTRA_FRAME){
+      /*The new frame becomes both the previous and gold reference frames.*/
+      _dec->state.ref_frame_idx[OC_FRAME_GOLD]=
+       _dec->state.ref_frame_idx[OC_FRAME_PREV]=
+       _dec->state.ref_frame_idx[OC_FRAME_SELF];
+    }
+    else{
+      /*Otherwise, just replace the previous reference frame.*/
+      _dec->state.ref_frame_idx[OC_FRAME_PREV]=
+       _dec->state.ref_frame_idx[OC_FRAME_SELF];
+    }
+    /*Restore the FPU before dump_frame, since that _does_ use the FPU (for PNG
+       gamma values, if nothing else).*/
+    oc_restore_fpu(&_dec->state);
+#if defined(OC_DUMP_IMAGES)
+    /*Don't dump images for dropped frames.*/
+    oc_state_dump_frame(&_dec->state,OC_FRAME_SELF,"dec");
+#endif
+    return 0;
+  }
+  else{
+    if(_dec->state.ref_frame_idx[OC_FRAME_GOLD]<0||
+     _dec->state.ref_frame_idx[OC_FRAME_PREV]<0){
+      int refi;
+      /*No reference frames yet!*/
+      oc_dec_init_dummy_frame(_dec);
+      refi=_dec->state.ref_frame_idx[OC_FRAME_PREV];
+      _dec->state.ref_frame_idx[OC_FRAME_SELF]=refi;
+      memcpy(_dec->pp_frame_buf,_dec->state.ref_frame_bufs[refi],
+       sizeof(_dec->pp_frame_buf[0])*3);
+    }
+    /*Just update the granule position and return.*/
+    _dec->state.granpos=(_dec->state.keyframe_num+_dec->state.granpos_bias<<
+     _dec->state.info.keyframe_granule_shift)
+     +(_dec->state.curframe_num-_dec->state.keyframe_num);
+    _dec->state.curframe_num++;
+    if(_granpos!=NULL)*_granpos=_dec->state.granpos;
+    return TH_DUPFRAME;
+  }
+}
+
+/*Fills _ycbcr with the decoder's output frame (pp_frame_buf, passed through
+   oc_ycbcr_buffer_flip()); when cairo telemetry is active, _ycbcr instead
+   points at a copy with overlays drawn.  Returns 0, or TH_EFAULT on NULL.*/
+int th_decode_ycbcr_out(th_dec_ctx *_dec,th_ycbcr_buffer _ycbcr){
+  if(_dec==NULL||_ycbcr==NULL)return TH_EFAULT;
+  oc_ycbcr_buffer_flip(_ycbcr,_dec->pp_frame_buf);
+#if defined(HAVE_CAIRO)
+  /*If telemetry ioctls are active, we need to draw to the output buffer.
+    Stuff the plane into cairo.*/
+  if(_dec->telemetry){
+    cairo_surface_t *cs;
+    unsigned char   *data;
+    unsigned char   *y_row;
+    unsigned char   *u_row;
+    unsigned char   *v_row;
+    unsigned char   *rgb_row;
+    int              cstride;
+    int              w;
+    int              h;
+    int              x;
+    int              y;
+    int              hdec;
+    int              vdec;
+    w=_ycbcr[0].width;
+    h=_ycbcr[0].height;
+    hdec=!(_dec->state.info.pixel_fmt&1);
+    vdec=!(_dec->state.info.pixel_fmt&2);
+    /*Lazy data buffer init.
+      We could try to re-use the post-processing buffer, which would save
+       memory, but complicate the allocation logic there.
+      I don't think anyone cares about memory usage when using telemetry; it is
+       not meant for embedded devices.*/
+    if(_dec->telemetry_frame_data==NULL){
+      _dec->telemetry_frame_data=_ogg_malloc(
+       (w*h+2*(w>>hdec)*(h>>vdec))*sizeof(*_dec->telemetry_frame_data));
+      if(_dec->telemetry_frame_data==NULL)return 0;
+    }
+    cs=cairo_image_surface_create(CAIRO_FORMAT_RGB24,w,h);
+    /*Sadly, no YUV support in Cairo (yet); convert into the RGB buffer.*/
+    data=cairo_image_surface_get_data(cs);
+    if(data==NULL){
+      cairo_surface_destroy(cs);
+      return 0;
+    }
+    cstride=cairo_image_surface_get_stride(cs);
+    y_row=_ycbcr[0].data;
+    u_row=_ycbcr[1].data;
+    v_row=_ycbcr[2].data;
+    rgb_row=data;
+    for(y=0;y<h;y++){
+      for(x=0;x<w;x++){
+        int r;
+        int g;
+        int b;
+        r=(1904000*y_row[x]+2609823*v_row[x>>hdec]-363703744)/1635200;
+        g=(3827562*y_row[x]-1287801*u_row[x>>hdec]
+         -2672387*v_row[x>>hdec]+447306710)/3287200;
+        b=(952000*y_row[x]+1649289*u_row[x>>hdec]-225932192)/817600;
+        rgb_row[4*x+0]=OC_CLAMP255(b);
+        rgb_row[4*x+1]=OC_CLAMP255(g);
+        rgb_row[4*x+2]=OC_CLAMP255(r);
+      }
+      y_row+=_ycbcr[0].stride;
+      u_row+=_ycbcr[1].stride&-((y&1)|!vdec);
+      v_row+=_ycbcr[2].stride&-((y&1)|!vdec);
+      rgb_row+=cstride;
+    }
+    /*Draw coded identifier for each macroblock (stored in Hilbert order).*/
+    {
+      cairo_t           *c;
+      const oc_fragment *frags;
+      oc_mv             *frag_mvs;
+      const signed char *mb_modes;
+      oc_mb_map         *mb_maps;
+      size_t             nmbs;
+      size_t             mbi;
+      int                row2;
+      int                col2;
+      int                qim[3]={0,0,0};
+      if(_dec->state.nqis==2){
+        int bqi;
+        bqi=_dec->state.qis[0];
+        if(_dec->state.qis[1]>bqi)qim[1]=1;
+        if(_dec->state.qis[1]<bqi)qim[1]=-1;
+      }
+      if(_dec->state.nqis==3){
+        int bqi;
+        int cqi;
+        int dqi;
+        bqi=_dec->state.qis[0];
+        cqi=_dec->state.qis[1];
+        dqi=_dec->state.qis[2];
+        if(cqi>bqi&&dqi>bqi){
+          if(dqi>cqi){
+            qim[1]=1;
+            qim[2]=2;
+          }
+          else{
+            qim[1]=2;
+            qim[2]=1;
+          }
+        }
+        else if(cqi<bqi&&dqi<bqi){
+          if(dqi<cqi){
+            qim[1]=-1;
+            qim[2]=-2;
+          }
+          else{
+            qim[1]=-2;
+            qim[2]=-1;
+          }
+        }
+        else{
+          if(cqi<bqi)qim[1]=-1;
+          else qim[1]=1;
+          if(dqi<bqi)qim[2]=-1;
+          else qim[2]=1;
+        }
+      }
+      c=cairo_create(cs);
+      frags=_dec->state.frags;
+      frag_mvs=_dec->state.frag_mvs;
+      mb_modes=_dec->state.mb_modes;
+      mb_maps=_dec->state.mb_maps;
+      nmbs=_dec->state.nmbs;
+      row2=0;
+      col2=0;
+      for(mbi=0;mbi<nmbs;mbi++){
+        float x;
+        float y;
+        int   bi;
+        y=h-(row2+((col2+1>>1)&1))*16-16;
+        x=(col2>>1)*16;
+        cairo_set_line_width(c,1.);
+        /*Keyframe (all intra) red box.*/
+        if(_dec->state.frame_type==OC_INTRA_FRAME){
+          if(_dec->telemetry_mbmode&0x02){
+            cairo_set_source_rgba(c,1.,0,0,.5);
+            cairo_rectangle(c,x+2.5,y+2.5,11,11);
+            cairo_stroke_preserve(c);
+            cairo_set_source_rgba(c,1.,0,0,.25);
+            cairo_fill(c);
+          }
+        }
+        else{
+          const signed char *frag_mv;
+          ptrdiff_t          fragi;
+          for(bi=0;bi<4;bi++){
+            fragi=mb_maps[mbi][0][bi];
+            if(fragi>=0&&frags[fragi].coded){
+              frag_mv=frag_mvs[fragi];
+              break;
+            }
+          }
+          if(bi<4){
+            switch(mb_modes[mbi]){
+              case OC_MODE_INTRA:{
+                if(_dec->telemetry_mbmode&0x02){
+                  cairo_set_source_rgba(c,1.,0,0,.5);
+                  cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                  cairo_stroke_preserve(c);
+                  cairo_set_source_rgba(c,1.,0,0,.25);
+                  cairo_fill(c);
+                }
+              }break;
+              case OC_MODE_INTER_NOMV:{
+                if(_dec->telemetry_mbmode&0x01){
+                  cairo_set_source_rgba(c,0,0,1.,.5);
+                  cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                  cairo_stroke_preserve(c);
+                  cairo_set_source_rgba(c,0,0,1.,.25);
+                  cairo_fill(c);
+                }
+              }break;
+              case OC_MODE_INTER_MV:{
+                if(_dec->telemetry_mbmode&0x04){
+                  cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                  cairo_set_source_rgba(c,0,1.,0,.5);
+                  cairo_stroke(c);
+                }
+                if(_dec->telemetry_mv&0x04){
+                  cairo_move_to(c,x+8+frag_mv[0],y+8-frag_mv[1]);
+                  cairo_set_source_rgba(c,1.,1.,1.,.9);
+                  cairo_set_line_width(c,3.);
+                  cairo_line_to(c,x+8+frag_mv[0]*.66,y+8-frag_mv[1]*.66);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,2.);
+                  cairo_line_to(c,x+8+frag_mv[0]*.33,y+8-frag_mv[1]*.33);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,1.);
+                  cairo_line_to(c,x+8,y+8);
+                  cairo_stroke(c);
+                }
+              }break;
+              case OC_MODE_INTER_MV_LAST:{
+                if(_dec->telemetry_mbmode&0x08){
+                  cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                  cairo_set_source_rgba(c,0,1.,0,.5);
+                  cairo_move_to(c,x+13.5,y+2.5);
+                  cairo_line_to(c,x+2.5,y+8);
+                  cairo_line_to(c,x+13.5,y+13.5);
+                  cairo_stroke(c);
+                }
+                if(_dec->telemetry_mv&0x08){
+                  cairo_move_to(c,x+8+frag_mv[0],y+8-frag_mv[1]);
+                  cairo_set_source_rgba(c,1.,1.,1.,.9);
+                  cairo_set_line_width(c,3.);
+                  cairo_line_to(c,x+8+frag_mv[0]*.66,y+8-frag_mv[1]*.66);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,2.);
+                  cairo_line_to(c,x+8+frag_mv[0]*.33,y+8-frag_mv[1]*.33);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,1.);
+                  cairo_line_to(c,x+8,y+8);
+                  cairo_stroke(c);
+                }
+              }break;
+              case OC_MODE_INTER_MV_LAST2:{
+                if(_dec->telemetry_mbmode&0x10){
+                  cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                  cairo_set_source_rgba(c,0,1.,0,.5);
+                  cairo_move_to(c,x+8,y+2.5);
+                  cairo_line_to(c,x+2.5,y+8);
+                  cairo_line_to(c,x+8,y+13.5);
+                  cairo_move_to(c,x+13.5,y+2.5);
+                  cairo_line_to(c,x+8,y+8);
+                  cairo_line_to(c,x+13.5,y+13.5);
+                  cairo_stroke(c);
+                }
+                if(_dec->telemetry_mv&0x10){
+                  cairo_move_to(c,x+8+frag_mv[0],y+8-frag_mv[1]);
+                  cairo_set_source_rgba(c,1.,1.,1.,.9);
+                  cairo_set_line_width(c,3.);
+                  cairo_line_to(c,x+8+frag_mv[0]*.66,y+8-frag_mv[1]*.66);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,2.);
+                  cairo_line_to(c,x+8+frag_mv[0]*.33,y+8-frag_mv[1]*.33);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,1.);
+                  cairo_line_to(c,x+8,y+8);
+                  cairo_stroke(c);
+                }
+              }break;
+              case OC_MODE_GOLDEN_NOMV:{
+                if(_dec->telemetry_mbmode&0x20){
+                  cairo_set_source_rgba(c,1.,1.,0,.5);
+                  cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                  cairo_stroke_preserve(c);
+                  cairo_set_source_rgba(c,1.,1.,0,.25);
+                  cairo_fill(c);
+                }
+              }break;
+              case OC_MODE_GOLDEN_MV:{
+                if(_dec->telemetry_mbmode&0x40){
+                  cairo_rectangle(c,x+2.5,y+2.5,11,11);
+                  cairo_set_source_rgba(c,1.,1.,0,.5);
+                  cairo_stroke(c);
+                }
+                if(_dec->telemetry_mv&0x40){
+                  cairo_move_to(c,x+8+frag_mv[0],y+8-frag_mv[1]);
+                  cairo_set_source_rgba(c,1.,1.,1.,.9);
+                  cairo_set_line_width(c,3.);
+                  cairo_line_to(c,x+8+frag_mv[0]*.66,y+8-frag_mv[1]*.66);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,2.);
+                  cairo_line_to(c,x+8+frag_mv[0]*.33,y+8-frag_mv[1]*.33);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,1.);
+                  cairo_line_to(c,x+8,y+8);
+                  cairo_stroke(c);
+                }
+              }break;
+              case OC_MODE_INTER_MV_FOUR:{
+                if(_dec->telemetry_mbmode&0x80){
+                  cairo_rectangle(c,x+2.5,y+2.5,4,4);
+                  cairo_rectangle(c,x+9.5,y+2.5,4,4);
+                  cairo_rectangle(c,x+2.5,y+9.5,4,4);
+                  cairo_rectangle(c,x+9.5,y+9.5,4,4);
+                  cairo_set_source_rgba(c,0,1.,0,.5);
+                  cairo_stroke(c);
+                }
+                /*4mv is odd, coded in raster order.*/
+                fragi=mb_maps[mbi][0][0];
+                if(fragi>=0&&frags[fragi].coded&&_dec->telemetry_mv&0x80){
+                  frag_mv=frag_mvs[fragi];
+                  cairo_move_to(c,x+4+frag_mv[0],y+12-frag_mv[1]);
+                  cairo_set_source_rgba(c,1.,1.,1.,.9);
+                  cairo_set_line_width(c,3.);
+                  cairo_line_to(c,x+4+frag_mv[0]*.66,y+12-frag_mv[1]*.66);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,2.);
+                  cairo_line_to(c,x+4+frag_mv[0]*.33,y+12-frag_mv[1]*.33);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,1.);
+                  cairo_line_to(c,x+4,y+12);
+                  cairo_stroke(c);
+                }
+                fragi=mb_maps[mbi][0][1];
+                if(fragi>=0&&frags[fragi].coded&&_dec->telemetry_mv&0x80){
+                  frag_mv=frag_mvs[fragi];
+                  cairo_move_to(c,x+12+frag_mv[0],y+12-frag_mv[1]);
+                  cairo_set_source_rgba(c,1.,1.,1.,.9);
+                  cairo_set_line_width(c,3.);
+                  cairo_line_to(c,x+12+frag_mv[0]*.66,y+12-frag_mv[1]*.66);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,2.);
+                  cairo_line_to(c,x+12+frag_mv[0]*.33,y+12-frag_mv[1]*.33);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,1.);
+                  cairo_line_to(c,x+12,y+12);
+                  cairo_stroke(c);
+                }
+                fragi=mb_maps[mbi][0][2];
+                if(fragi>=0&&frags[fragi].coded&&_dec->telemetry_mv&0x80){
+                  frag_mv=frag_mvs[fragi];
+                  cairo_move_to(c,x+4+frag_mv[0],y+4-frag_mv[1]);
+                  cairo_set_source_rgba(c,1.,1.,1.,.9);
+                  cairo_set_line_width(c,3.);
+                  cairo_line_to(c,x+4+frag_mv[0]*.66,y+4-frag_mv[1]*.66);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,2.);
+                  cairo_line_to(c,x+4+frag_mv[0]*.33,y+4-frag_mv[1]*.33);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,1.);
+                  cairo_line_to(c,x+4,y+4);
+                  cairo_stroke(c);
+                }
+                fragi=mb_maps[mbi][0][3];
+                if(fragi>=0&&frags[fragi].coded&&_dec->telemetry_mv&0x80){
+                  frag_mv=frag_mvs[fragi];
+                  cairo_move_to(c,x+12+frag_mv[0],y+4-frag_mv[1]);
+                  cairo_set_source_rgba(c,1.,1.,1.,.9);
+                  cairo_set_line_width(c,3.);
+                  cairo_line_to(c,x+12+frag_mv[0]*.66,y+4-frag_mv[1]*.66);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,2.);
+                  cairo_line_to(c,x+12+frag_mv[0]*.33,y+4-frag_mv[1]*.33);
+                  cairo_stroke_preserve(c);
+                  cairo_set_line_width(c,1.);
+                  cairo_line_to(c,x+12,y+4);
+                  cairo_stroke(c);
+                }
+              }break;
+            }
+          }
+        }
+        /*qii illustration.*/
+        if(_dec->telemetry_qi&0x2){
+          cairo_set_line_cap(c,CAIRO_LINE_CAP_SQUARE);
+          for(bi=0;bi<4;bi++){
+            ptrdiff_t fragi;
+            int       qiv;
+            int       xp;
+            int       yp;
+            xp=x+(bi&1)*8;
+            yp=y+8-(bi&2)*4;
+            fragi=mb_maps[mbi][0][bi];
+            if(fragi>=0&&frags[fragi].coded){
+              qiv=qim[frags[fragi].qii];
+              cairo_set_line_width(c,3.);
+              cairo_set_source_rgba(c,0.,0.,0.,.5);
+              switch(qiv){
+                /*Double plus:*/
+                case 2:{
+                  if((bi&1)^((bi&2)>>1)){
+                    cairo_move_to(c,xp+2.5,yp+1.5);
+                    cairo_line_to(c,xp+2.5,yp+3.5);
+                    cairo_move_to(c,xp+1.5,yp+2.5);
+                    cairo_line_to(c,xp+3.5,yp+2.5);
+                    cairo_move_to(c,xp+5.5,yp+4.5);
+                    cairo_line_to(c,xp+5.5,yp+6.5);
+                    cairo_move_to(c,xp+4.5,yp+5.5);
+                    cairo_line_to(c,xp+6.5,yp+5.5);
+                  }
+                  else{
+                    cairo_move_to(c,xp+5.5,yp+1.5);
+                    cairo_line_to(c,xp+5.5,yp+3.5);
+                    cairo_move_to(c,xp+4.5,yp+2.5);
+                    cairo_line_to(c,xp+6.5,yp+2.5);
+                    cairo_move_to(c,xp+2.5,yp+4.5);
+                    cairo_line_to(c,xp+2.5,yp+6.5);
+                    cairo_move_to(c,xp+1.5,yp+5.5);
+                    cairo_line_to(c,xp+3.5,yp+5.5);
+                  }
+                  cairo_stroke_preserve(c);
+                  cairo_set_source_rgba(c,0.,1.,1.,1.);
+                }break;
+                /*Double minus:*/
+                case -2:{
+                  cairo_move_to(c,xp+2.5,yp+2.5);
+                  cairo_line_to(c,xp+5.5,yp+2.5);
+                  cairo_move_to(c,xp+2.5,yp+5.5);
+                  cairo_line_to(c,xp+5.5,yp+5.5);
+                  cairo_stroke_preserve(c);
+                  cairo_set_source_rgba(c,1.,1.,1.,1.);
+                }break;
+                /*Plus:*/
+                case 1:{
+                  if((bi&2)==0)yp-=2;
+                  if((bi&1)==0)xp-=2;
+                  cairo_move_to(c,xp+4.5,yp+2.5);
+                  cairo_line_to(c,xp+4.5,yp+6.5);
+                  cairo_move_to(c,xp+2.5,yp+4.5);
+                  cairo_line_to(c,xp+6.5,yp+4.5);
+                  cairo_stroke_preserve(c);
+                  cairo_set_source_rgba(c,.1,1.,.3,1.);
+                }break;
+                /*Minus:*/
+                case -1:{
+                  cairo_move_to(c,xp+2.5,yp+4.5);
+                  cairo_line_to(c,xp+6.5,yp+4.5);
+                  cairo_stroke_preserve(c);
+                  cairo_set_source_rgba(c,1.,.3,.1,1.);
+                }break;
+                default:continue;
+              }
+              cairo_set_line_width(c,1.);
+              cairo_stroke(c);
+            }
+          }
+        }
+        col2++;
+        if((col2>>1)>=_dec->state.nhmbs){
+          col2=0;
+          row2+=2;
+        }
+      }
+      /*Bit usage indicator[s]:*/
+      if(_dec->telemetry_bits){
+        int widths[6];
+        int fpsn;
+        int fpsd;
+        int mult;
+        int fullw;
+        int padw;
+        int i;
+        fpsn=_dec->state.info.fps_numerator;
+        fpsd=_dec->state.info.fps_denominator;
+        mult=(_dec->telemetry_bits>=0xFF?1:_dec->telemetry_bits);
+        fullw=250.f*h*fpsd*mult/fpsn;
+        /*Keep the divisor used below strictly positive.*/
+        if(fullw<1)fullw=1;
+        padw=w-24;
+        /*Header and coded block bits.*/
+        if(_dec->telemetry_frame_bytes<0||
+         _dec->telemetry_frame_bytes==OC_LOTS_OF_BITS){
+          _dec->telemetry_frame_bytes=0;
+        }
+        if(_dec->telemetry_coding_bytes<0||
+         _dec->telemetry_coding_bytes>_dec->telemetry_frame_bytes){
+          _dec->telemetry_coding_bytes=0;
+        }
+        if(_dec->telemetry_mode_bytes<0||
+         _dec->telemetry_mode_bytes>_dec->telemetry_frame_bytes){
+          _dec->telemetry_mode_bytes=0;
+        }
+        if(_dec->telemetry_mv_bytes<0||
+         _dec->telemetry_mv_bytes>_dec->telemetry_frame_bytes){
+          _dec->telemetry_mv_bytes=0;
+        }
+        if(_dec->telemetry_qi_bytes<0||
+         _dec->telemetry_qi_bytes>_dec->telemetry_frame_bytes){
+          _dec->telemetry_qi_bytes=0;
+        }
+        if(_dec->telemetry_dc_bytes<0||
+         _dec->telemetry_dc_bytes>_dec->telemetry_frame_bytes){
+          _dec->telemetry_dc_bytes=0;
+        }
+        widths[0]=padw*(_dec->telemetry_frame_bytes-_dec->telemetry_coding_bytes)/fullw;
+        widths[1]=padw*(_dec->telemetry_coding_bytes-_dec->telemetry_mode_bytes)/fullw;
+        widths[2]=padw*(_dec->telemetry_mode_bytes-_dec->telemetry_mv_bytes)/fullw;
+        widths[3]=padw*(_dec->telemetry_mv_bytes-_dec->telemetry_qi_bytes)/fullw;
+        widths[4]=padw*(_dec->telemetry_qi_bytes-_dec->telemetry_dc_bytes)/fullw;
+        widths[5]=padw*(_dec->telemetry_dc_bytes)/fullw;
+        for(i=0;i<6;i++)if(widths[i]>w)widths[i]=w;
+        cairo_set_source_rgba(c,.0,.0,.0,.6);
+        cairo_rectangle(c,10,h-33,widths[0]+1,5);
+        cairo_rectangle(c,10,h-29,widths[1]+1,5);
+        cairo_rectangle(c,10,h-25,widths[2]+1,5);
+        cairo_rectangle(c,10,h-21,widths[3]+1,5);
+        cairo_rectangle(c,10,h-17,widths[4]+1,5);
+        cairo_rectangle(c,10,h-13,widths[5]+1,5);
+        cairo_fill(c);
+        cairo_set_source_rgb(c,1,0,0);
+        cairo_rectangle(c,10.5,h-32.5,widths[0],4);
+        cairo_fill(c);
+        cairo_set_source_rgb(c,0,1,0);
+        cairo_rectangle(c,10.5,h-28.5,widths[1],4);
+        cairo_fill(c);
+        cairo_set_source_rgb(c,0,0,1);
+        cairo_rectangle(c,10.5,h-24.5,widths[2],4);
+        cairo_fill(c);
+        cairo_set_source_rgb(c,.6,.4,.0);
+        cairo_rectangle(c,10.5,h-20.5,widths[3],4);
+        cairo_fill(c);
+        cairo_set_source_rgb(c,.3,.3,.3);
+        cairo_rectangle(c,10.5,h-16.5,widths[4],4);
+        cairo_fill(c);
+        cairo_set_source_rgb(c,.5,.5,.8);
+        cairo_rectangle(c,10.5,h-12.5,widths[5],4);
+        cairo_fill(c);
+      }
+      /*Master qi indicator[s]:*/
+      if(_dec->telemetry_qi&0x1){
+        cairo_text_extents_t extents;
+        char                 buffer[10];
+        int                  p;
+        int                  y;
+        p=0;
+        y=h-7.5;
+        if(_dec->state.qis[0]>=10)buffer[p++]=48+_dec->state.qis[0]/10;
+        buffer[p++]=48+_dec->state.qis[0]%10;
+        if(_dec->state.nqis>=2){
+          buffer[p++]=' ';
+          if(_dec->state.qis[1]>=10)buffer[p++]=48+_dec->state.qis[1]/10;
+          buffer[p++]=48+_dec->state.qis[1]%10;
+        }
+        if(_dec->state.nqis==3){
+          buffer[p++]=' ';
+          if(_dec->state.qis[2]>=10)buffer[p++]=48+_dec->state.qis[2]/10;
+          buffer[p++]=48+_dec->state.qis[2]%10;
+        }
+        buffer[p++]='\0';
+        cairo_select_font_face(c,"sans",
+         CAIRO_FONT_SLANT_NORMAL,CAIRO_FONT_WEIGHT_BOLD);
+        cairo_set_font_size(c,18);
+        cairo_text_extents(c,buffer,&extents);
+        cairo_set_source_rgb(c,1,1,1);
+        cairo_move_to(c,w-extents.x_advance-10,y);
+        cairo_show_text(c,buffer);
+        cairo_set_source_rgb(c,0,0,0);
+        cairo_move_to(c,w-extents.x_advance-10,y);
+        cairo_text_path(c,buffer);
+        cairo_set_line_width(c,.8);
+        cairo_set_line_join(c,CAIRO_LINE_JOIN_ROUND);
+        cairo_stroke(c);
+      }
+      cairo_destroy(c);
+    }
+    /*Out of the Cairo plane into the telemetry YUV buffer.*/
+    _ycbcr[0].data=_dec->telemetry_frame_data;
+    _ycbcr[0].stride=_ycbcr[0].width;
+    _ycbcr[1].data=_ycbcr[0].data+h*_ycbcr[0].stride;
+    _ycbcr[1].stride=_ycbcr[1].width;
+    _ycbcr[2].data=_ycbcr[1].data+(h>>vdec)*_ycbcr[1].stride;
+    _ycbcr[2].stride=_ycbcr[2].width;
+    y_row=_ycbcr[0].data;
+    u_row=_ycbcr[1].data;
+    v_row=_ycbcr[2].data;
+    rgb_row=data;
+    /*This is one of the few places it's worth handling chroma on a
+       case-by-case basis.*/
+    switch(_dec->state.info.pixel_fmt){
+      case TH_PF_420:{
+        for(y=0;y<h;y+=2){
+          unsigned char *y_row2;
+          unsigned char *rgb_row2;
+          y_row2=y_row+_ycbcr[0].stride;
+          rgb_row2=rgb_row+cstride;
+          for(x=0;x<w;x+=2){
+            int y;
+            int u;
+            int v;
+            y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1]
+             +24966*rgb_row[4*x+0]+4207500)/255000;
+            y_row[x]=OC_CLAMP255(y);
+            y=(65481*rgb_row[4*x+6]+128553*rgb_row[4*x+5]
+             +24966*rgb_row[4*x+4]+4207500)/255000;
+            y_row[x+1]=OC_CLAMP255(y);
+            y=(65481*rgb_row2[4*x+2]+128553*rgb_row2[4*x+1]
+             +24966*rgb_row2[4*x+0]+4207500)/255000;
+            y_row2[x]=OC_CLAMP255(y);
+            y=(65481*rgb_row2[4*x+6]+128553*rgb_row2[4*x+5]
+             +24966*rgb_row2[4*x+4]+4207500)/255000;
+            y_row2[x+1]=OC_CLAMP255(y);
+            u=(-8372*(rgb_row[4*x+2]+rgb_row[4*x+6]
+             +rgb_row2[4*x+2]+rgb_row2[4*x+6])
+             -16436*(rgb_row[4*x+1]+rgb_row[4*x+5]
+             +rgb_row2[4*x+1]+rgb_row2[4*x+5])
+             +24808*(rgb_row[4*x+0]+rgb_row[4*x+4]
+             +rgb_row2[4*x+0]+rgb_row2[4*x+4])+29032005)/225930;
+            v=(39256*(rgb_row[4*x+2]+rgb_row[4*x+6]
+             +rgb_row2[4*x+2]+rgb_row2[4*x+6])
+             -32872*(rgb_row[4*x+1]+rgb_row[4*x+5]
+              +rgb_row2[4*x+1]+rgb_row2[4*x+5])
+             -6384*(rgb_row[4*x+0]+rgb_row[4*x+4]
+              +rgb_row2[4*x+0]+rgb_row2[4*x+4])+45940035)/357510;
+            u_row[x>>1]=OC_CLAMP255(u);
+            v_row[x>>1]=OC_CLAMP255(v);
+          }
+          y_row+=_ycbcr[0].stride<<1;
+          u_row+=_ycbcr[1].stride;
+          v_row+=_ycbcr[2].stride;
+          rgb_row+=cstride<<1;
+        }
+      }break;
+      case TH_PF_422:{
+        for(y=0;y<h;y++){
+          for(x=0;x<w;x+=2){
+            int y;
+            int u;
+            int v;
+            y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1]
+             +24966*rgb_row[4*x+0]+4207500)/255000;
+            y_row[x]=OC_CLAMP255(y);
+            y=(65481*rgb_row[4*x+6]+128553*rgb_row[4*x+5]
+             +24966*rgb_row[4*x+4]+4207500)/255000;
+            y_row[x+1]=OC_CLAMP255(y);
+            u=(-16744*(rgb_row[4*x+2]+rgb_row[4*x+6])
+             -32872*(rgb_row[4*x+1]+rgb_row[4*x+5])
+             +49616*(rgb_row[4*x+0]+rgb_row[4*x+4])+29032005)/225930;
+            v=(78512*(rgb_row[4*x+2]+rgb_row[4*x+6])
+             -65744*(rgb_row[4*x+1]+rgb_row[4*x+5])
+             -12768*(rgb_row[4*x+0]+rgb_row[4*x+4])+45940035)/357510;
+            u_row[x>>1]=OC_CLAMP255(u);
+            v_row[x>>1]=OC_CLAMP255(v);
+          }
+          y_row+=_ycbcr[0].stride;
+          u_row+=_ycbcr[1].stride;
+          v_row+=_ycbcr[2].stride;
+          rgb_row+=cstride;
+        }
+      }break;
+      /*case TH_PF_444:*/
+      default:{
+        for(y=0;y<h;y++){
+          for(x=0;x<w;x++){
+            int y;
+            int u;
+            int v;
+            y=(65481*rgb_row[4*x+2]+128553*rgb_row[4*x+1]
+             +24966*rgb_row[4*x+0]+4207500)/255000;
+            u=(-33488*rgb_row[4*x+2]-65744*rgb_row[4*x+1]
+             +99232*rgb_row[4*x+0]+29032005)/225930;
+            v=(157024*rgb_row[4*x+2]-131488*rgb_row[4*x+1]
+             -25536*rgb_row[4*x+0]+45940035)/357510;
+            y_row[x]=OC_CLAMP255(y);
+            u_row[x]=OC_CLAMP255(u);
+            v_row[x]=OC_CLAMP255(v);
+          }
+          y_row+=_ycbcr[0].stride;
+          u_row+=_ycbcr[1].stride;
+          v_row+=_ycbcr[2].stride;
+          rgb_row+=cstride;
+        }
+      }break;
+    }
+    /*Finished: destroy the surface.*/
+    cairo_surface_destroy(cs);
+  }
+#endif
+  return 0;
+}
diff --git a/lib/dequant.c b/lib/dequant.c
new file mode 100644
index 0000000..e554872
--- /dev/null
+++ b/lib/dequant.c
@@ -0,0 +1,182 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: dequant.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <ogg/ogg.h>
+#include "dequant.h"
+#include "decint.h"
+/*Reads the quantization parameters from _opb into _qinfo; 0 or a TH_E* code.*/
+int oc_quant_params_unpack(oc_pack_buf *_opb,th_quant_info *_qinfo){
+  th_quant_base *base_mats;    /*Scratch copy of every base matrix read.*/
+  long           val;          /*The most recently read field.*/
+  int            nbase_mats;   /*Number of base matrices that follow.*/
+  int            sizes[64];    /*Sizes of the ranges in the current set.*/
+  int            indices[64];  /*Base matrix index at each range endpoint.*/
+  int            nbits;        /*Width in bits of the fields being read.*/
+  int            bmi;
+  int            ci;
+  int            qti;
+  int            pli;
+  int            qri;
+  int            qi;
+  int            i;
+  val=oc_pack_read(_opb,3);    /*Field width of the loop filter limits.*/
+  nbits=(int)val;
+  for(qi=0;qi<64;qi++){
+    val=oc_pack_read(_opb,nbits);
+    _qinfo->loop_filter_limits[qi]=(unsigned char)val;
+  }
+  val=oc_pack_read(_opb,4);    /*Field width, minus one, of the AC scales.*/
+  nbits=(int)val+1;
+  for(qi=0;qi<64;qi++){
+    val=oc_pack_read(_opb,nbits);
+    _qinfo->ac_scale[qi]=(ogg_uint16_t)val;
+  }
+  val=oc_pack_read(_opb,4);    /*Field width, minus one, of the DC scales.*/
+  nbits=(int)val+1;
+  for(qi=0;qi<64;qi++){
+    val=oc_pack_read(_opb,nbits);
+    _qinfo->dc_scale[qi]=(ogg_uint16_t)val;
+  }
+  val=oc_pack_read(_opb,9);    /*Number of base matrices, minus one.*/
+  nbase_mats=(int)val+1;
+  base_mats=_ogg_malloc(nbase_mats*sizeof(base_mats[0]));
+  if(base_mats==NULL)return TH_EFAULT;
+  for(bmi=0;bmi<nbase_mats;bmi++){
+    for(ci=0;ci<64;ci++){
+      val=oc_pack_read(_opb,8);
+      base_mats[bmi][ci]=(unsigned char)val;
+    }
+  }
+  nbits=oc_ilog(nbase_mats-1); /*From here on: width of a base matrix index.*/
+  for(i=0;i<6;i++){            /*One range set per (qti,pli) pair.*/
+    th_quant_ranges *qranges;
+    th_quant_base   *qrbms;
+    int             *qrsizes;
+    qti=i/3;
+    pli=i%3;
+    qranges=_qinfo->qi_ranges[qti]+pli;
+    if(i>0){
+      val=oc_pack_read1(_opb); /*1: a new set follows; 0: reuse an earlier one.*/
+      if(!val){
+        int qtj;
+        int plj;
+        if(qti>0){
+          val=oc_pack_read1(_opb); /*1: same plane of type qti-1; 0: set i-1.*/
+          if(val){
+            qtj=qti-1;
+            plj=pli;
+          }
+          else{
+            qtj=(i-1)/3;
+            plj=(i-1)%3;
+          }
+        }
+        else{
+          qtj=(i-1)/3;
+          plj=(i-1)%3;
+        }
+        *qranges=*(_qinfo->qi_ranges[qtj]+plj); /*Struct copy: pointers shared.*/
+        continue;
+      }
+    }
+    val=oc_pack_read(_opb,nbits);
+    indices[0]=(int)val;
+    for(qi=qri=0;qi<63;){      /*Read ranges until their sizes total 63.*/
+      val=oc_pack_read(_opb,oc_ilog(62-qi));
+      sizes[qri]=(int)val+1;
+      qi+=(int)val+1;
+      val=oc_pack_read(_opb,nbits);
+      indices[++qri]=(int)val;
+    }
+    /*The range sizes overshot 63: the header is invalid.
+      Note: The caller is responsible for cleaning up any partially
+       constructed qinfo.*/
+    if(qi>63){
+      _ogg_free(base_mats);
+      return TH_EBADHEADER;
+    }
+    qranges->nranges=qri;
+    qranges->sizes=qrsizes=(int *)_ogg_malloc(qri*sizeof(qrsizes[0]));
+    if(qranges->sizes==NULL){
+      /*Note: The caller is responsible for cleaning up any partially
+         constructed qinfo.*/
+      _ogg_free(base_mats);
+      return TH_EFAULT;
+    }
+    memcpy(qrsizes,sizes,qri*sizeof(qrsizes[0]));
+    qrbms=(th_quant_base *)_ogg_malloc((qri+1)*sizeof(qrbms[0]));
+    if(qrbms==NULL){
+      /*Note: The caller is responsible for cleaning up any partially
+         constructed qinfo.*/
+      _ogg_free(base_mats);
+      return TH_EFAULT;
+    }
+    qranges->base_matrices=(const th_quant_base *)qrbms;
+    do{                        /*Copy one matrix per endpoint, last one first.*/
+      bmi=indices[qri];
+      /*Note: The caller is responsible for cleaning up any partially
+         constructed qinfo.*/
+      if(bmi>=nbase_mats){
+        _ogg_free(base_mats);
+        return TH_EBADHEADER;
+      }
+      memcpy(qrbms[qri],base_mats[bmi],sizeof(qrbms[qri]));
+    }
+    while(qri-->0);
+  }
+  _ogg_free(base_mats);
+  return 0;
+}
+
+void oc_quant_params_clear(th_quant_info *_qinfo){
+  int i;
+  /*oc_quant_params_unpack() may make a slot a plain struct copy of an earlier
+     slot, so two slots can hold the same pointers.
+    Visit the six (type,plane) slots from last to first, detaching every
+     pointer that an earlier slot also holds, so each buffer is freed once.*/
+  for(i=5;i>=0;i--){
+    th_quant_ranges *cur;
+    int              qti;
+    int              pli;
+    qti=i/3;
+    pli=i%3;
+    cur=_qinfo->qi_ranges[qti]+pli;
+    /*Detach anything shared with the slot immediately before this one.*/
+    if(i>0){
+      const th_quant_ranges *prev;
+      prev=_qinfo->qi_ranges[(i-1)/3]+(i-1)%3;
+      if(cur->sizes==prev->sizes)cur->sizes=NULL;
+      if(cur->base_matrices==prev->base_matrices){
+        cur->base_matrices=NULL;
+      }
+    }
+    /*Detach anything shared with the same plane of the first type.*/
+    if(qti>0){
+      const th_quant_ranges *first;
+      first=_qinfo->qi_ranges[0]+pli;
+      if(cur->sizes==first->sizes)cur->sizes=NULL;
+      if(cur->base_matrices==first->base_matrices){
+        cur->base_matrices=NULL;
+      }
+    }
+    /*What remains is either NULL or held by no earlier slot, so it can be
+       released here.*/
+    _ogg_free((void *)cur->sizes);
+    _ogg_free((void *)cur->base_matrices);
+  }
+}
diff --git a/lib/dequant.h b/lib/dequant.h
new file mode 100644
index 0000000..ef25838
--- /dev/null
+++ b/lib/dequant.h
@@ -0,0 +1,27 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: dequant.h 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#if !defined(_dequant_H)
+# define _dequant_H (1)
+# include "quant.h"
+# include "bitpack.h"
+
+int oc_quant_params_unpack(oc_pack_buf *_opb,
+ th_quant_info *_qinfo);                            /*0, or a negative TH_E* code.*/
+void oc_quant_params_clear(th_quant_info *_qinfo);  /*Frees what unpack allocated.*/
+
+#endif /*_dequant_H*/
diff --git a/lib/encapiwrapper.c b/lib/encapiwrapper.c
new file mode 100644
index 0000000..874f124
--- /dev/null
+++ b/lib/encapiwrapper.c
@@ -0,0 +1,168 @@
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "apiwrapper.h"
+#include "encint.h"
+#include "theora/theoraenc.h"
+
+/*Setup-clear hook installed by theora_encode_init(): frees the wrapped
+   encoder context, if there is one, and zeroes the wrapper.*/
+static void th_enc_api_clear(th_api_wrapper *_api){
+  if(_api->encode)th_encode_free(_api->encode);
+  memset(_api,0,sizeof(*_api));
+}
+
+static void theora_encode_clear(theora_state *_te){
+  if(_te->i!=NULL)theora_info_clear(_te->i); /*Clear the attached info, if any.*/
+  memset(_te,0,sizeof(*_te));                /*Leave no stale pointers behind.*/
+}
+
+static int theora_encode_control(theora_state *_te,int _req,
+ void *_buf,size_t _buf_sz){  /*Dispatch-table hook for control requests.*/
+  th_api_wrapper *api=(th_api_wrapper *)_te->i->codec_setup;
+  return th_encode_ctl(api->encode,_req,_buf,_buf_sz);
+}
+
+static ogg_int64_t theora_encode_granule_frame(theora_state *_te,
+ ogg_int64_t _gp){  /*Dispatch-table hook; defers to th_granule_frame().*/
+  return th_granule_frame(((th_api_wrapper *)_te->i->codec_setup)->encode,_gp);
+}
+/*Dispatch-table hook; defers to th_granule_time() on the wrapped encoder.*/
+static double theora_encode_granule_time(theora_state *_te,ogg_int64_t _gp){
+  return th_granule_time(((th_api_wrapper *)_te->i->codec_setup)->encode,_gp);
+}
+
+static const oc_state_dispatch_vtable OC_ENC_DISPATCH_VTBL={ /*Set as internal_encode.*/
+  (oc_state_clear_func)theora_encode_clear,                  /*Tear down the state.*/
+  (oc_state_control_func)theora_encode_control,              /*th_encode_ctl().*/
+  (oc_state_granule_frame_func)theora_encode_granule_frame,  /*th_granule_frame().*/
+  (oc_state_granule_time_func)theora_encode_granule_time,    /*th_granule_time().*/
+};
+
+int theora_encode_init(theora_state *_te,theora_info *_ci){
+  th_api_info  *wrap;
+  th_info       new_info;
+  ogg_uint32_t  kf_force;
+  /*The API wrapper and our private theora_info live in a single malloc'd
+     block, so that releasing the wrapper releases the info struct too.
+    That way neither theora_info_clear() nor theora_clear() has to work out
+     whether the info struct needs freeing.*/
+  wrap=(th_api_info *)_ogg_malloc(sizeof(*wrap));
+  if(wrap==NULL)return TH_EFAULT;
+  /*Keep a private copy of the caller's info: its lifetime must not depend
+     on the struct we were handed.*/
+  wrap->info=*_ci;
+  /*Convert the legacy info into a th_info and allocate the encoder from it.*/
+  oc_theora_info2th_info(&new_info,_ci);
+  wrap->api.encode=th_encode_alloc(&new_info);
+  if(wrap->api.encode==NULL){
+    _ogg_free(wrap);
+    return OC_EINVAL;
+  }
+  wrap->api.clear=(oc_setup_clear_func)th_enc_api_clear;
+  /*Provide entry points for ABI compatibility with old decoder shared libs.*/
+  _te->internal_encode=(void *)&OC_ENC_DISPATCH_VTBL;
+  _te->internal_decode=NULL;
+  _te->granulepos=0;
+  _te->i=&wrap->info;
+  wrap->info.codec_setup=&wrap->api;
+  /*Set the precise requested keyframe frequency: the forced value when
+     automatic keyframing is enabled, the plain frequency otherwise.*/
+  kf_force=_ci->keyframe_auto_p?
+   _ci->keyframe_frequency_force:_ci->keyframe_frequency;
+  th_encode_ctl(wrap->api.encode,TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE,
+   &kf_force,sizeof(kf_force));
+  /*TODO: Additional codec setup using the extra fields in theora_info.*/
+  return 0;
+}
+
+int theora_encode_YUVin(theora_state *_te,yuv_buffer *_yuv){
+  th_api_wrapper  *api;
+  th_ycbcr_buffer  planes;
+  int              pli;
+  int              ret;
+  api=(th_api_wrapper *)_te->i->codec_setup;
+  /*Repackage the legacy yuv_buffer as three th_ycbcr_buffer planes.*/
+  planes[0].width=_yuv->y_width;
+  planes[0].height=_yuv->y_height;
+  planes[0].stride=_yuv->y_stride;
+  planes[0].data=_yuv->y;
+  for(pli=1;pli<3;pli++){  /*Both chroma planes share size and stride.*/
+    planes[pli].width=_yuv->uv_width;
+    planes[pli].height=_yuv->uv_height;
+    planes[pli].stride=_yuv->uv_stride;
+  }
+  planes[1].data=_yuv->u;
+  planes[2].data=_yuv->v;
+  ret=th_encode_ycbcr_in(api->encode,planes);
+  if(ret>=0)_te->granulepos=api->encode->state.granpos;
+  return ret;
+}
+
+int theora_encode_packetout(theora_state *_te,int _last_p,ogg_packet *_op){
+  /*Legacy entry point: defer to the wrapped encoder context.*/
+  return th_encode_packetout(
+   ((th_api_wrapper *)_te->i->codec_setup)->encode,_last_p,_op);
+}
+
+int theora_encode_header(theora_state *_te,ogg_packet *_op){
+  oc_enc_ctx *enc;
+  int         ret;
+  enc=((th_api_wrapper *)_te->i->codec_setup)->encode;
+  /*Headers may only be requested before encoding has started.*/
+  if(enc->packet_state>OC_PACKET_EMPTY||enc->state.granpos!=0)return TH_EINVAL;
+  /*Rewind the packet state machine so that the header flushed next is the
+     info header.*/
+  enc->packet_state=OC_PACKET_INFO_HDR;
+  ret=th_encode_flushheader(enc,NULL,_op);
+  /*The legacy API reports success as 0: collapse any non-negative result
+     and pass errors through untouched.*/
+  if(ret<0)return ret;
+  return 0;
+}
+
+int theora_encode_comment(theora_comment *_tc,ogg_packet *_op){
+  oggpack_buffer  opb;
+  void           *copy;
+  int             packet_state;
+  int             ret;
+  /*Build the comment header in a scratch buffer; no encoder is involved,
+     hence the NULL state, quantizer and Huffman code arguments.*/
+  packet_state=OC_PACKET_COMMENT_HDR;
+  oggpackB_writeinit(&opb);
+  ret=oc_state_flushheader(NULL,&packet_state,&opb,NULL,NULL,
+   th_version_string(),(th_comment *)_tc,_op);
+  if(ret>=0){
+    /*The packet currently points into opb, which is torn down before we
+       return, so the contents must be copied into a fresh allocation.
+      Presumably the application knows it is supposed to free this.
+      This part works nothing like the Vorbis API, and the documentation on it
+       has been wrong for some time, claiming libtheora owned the memory.*/
+    copy=_ogg_malloc(_op->bytes);
+    if(copy!=NULL){
+      memcpy(copy,_op->packet,_op->bytes);
+      ret=0;
+    }
+    else ret=TH_EFAULT;
+    /*On allocation failure this leaves the caller with a NULL packet.*/
+    _op->packet=copy;
+  }
+  oggpack_writeclear(&opb);
+  return ret;
+}
+
+int theora_encode_tables(theora_state *_te,ogg_packet *_op){
+  oc_enc_ctx *enc;
+  int         ret;
+  enc=((th_api_wrapper *)_te->i->codec_setup)->encode;
+  /*Headers may only be requested before encoding has started.*/
+  if(enc->packet_state>OC_PACKET_EMPTY||enc->state.granpos!=0)return TH_EINVAL;
+  /*Rewind the packet state machine so that the header flushed next is the
+     setup header.*/
+  enc->packet_state=OC_PACKET_SETUP_HDR;
+  ret=th_encode_flushheader(enc,NULL,_op);
+  /*The legacy API reports success as 0: collapse any non-negative result
+     and pass errors through untouched.*/
+  if(ret<0)return ret;
+  return 0;
+}
diff --git a/lib/encfrag.c b/lib/encfrag.c
new file mode 100644
index 0000000..bb814c8
--- /dev/null
+++ b/lib/encfrag.c
@@ -0,0 +1,388 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: encfrag.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+#include <stdlib.h>
+#include <string.h>
+#include "encint.h"
+
+
+void oc_enc_frag_sub(const oc_enc_ctx *_enc,ogg_int16_t _diff[64],
+ const unsigned char *_src,const unsigned char *_ref,int _ystride){
+  (*_enc->opt_vtable.frag_sub)(_diff,_src,_ref,_ystride); /*Via opt_vtable.*/
+}
+
+void oc_enc_frag_sub_c(ogg_int16_t _diff[64],const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  int row;
+  int col;
+  /*_diff is a dense 8x8 block in raster order; _src and _ref hold rows of 8
+     pixels spaced _ystride bytes apart.*/
+  for(row=0;row<8;row++,_src+=_ystride,_ref+=_ystride){
+    for(col=0;col<8;col++)*_diff++=(ogg_int16_t)(_src[col]-_ref[col]);
+  }
+}
+
+void oc_enc_frag_sub_128(const oc_enc_ctx *_enc,ogg_int16_t _diff[64],
+ const unsigned char *_src,int _ystride){
+  (*_enc->opt_vtable.frag_sub_128)(_diff,_src,_ystride); /*Via opt_vtable.*/
+}
+
+void oc_enc_frag_sub_128_c(ogg_int16_t *_diff,
+ const unsigned char *_src,int _ystride){
+  int row;
+  int col;
+  /*Remove a fixed bias of 128 from every pixel of the 8x8 block.*/
+  for(row=0;row<8;row++,_src+=_ystride){
+    for(col=0;col<8;col++)_diff[row*8+col]=(ogg_int16_t)(_src[col]-128);
+  }
+}
+
+unsigned oc_enc_frag_sad(const oc_enc_ctx *_enc,const unsigned char *_x,
+ const unsigned char *_y,int _ystride){
+  return (*_enc->opt_vtable.frag_sad)(_x,_y,_ystride); /*Via opt_vtable.*/
+}
+
+unsigned oc_enc_frag_sad_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  unsigned total;
+  int      row;
+  int      col;
+  /*Sum of absolute differences over an 8x8 block whose rows are _ystride
+     bytes apart in both inputs.*/
+  total=0;
+  for(row=0;row<8;row++,_src+=_ystride,_ref+=_ystride){
+    for(col=0;col<8;col++)total+=abs(_src[col]-_ref[col]);
+  }
+  return total;
+}
+
+unsigned oc_enc_frag_sad_thresh(const oc_enc_ctx *_enc,
+ const unsigned char *_src,const unsigned char *_ref,int _ystride,
+ unsigned _thresh){  /*Dispatches through the encoder's opt_vtable.*/
+  return (*_enc->opt_vtable.frag_sad_thresh)(_src,_ref,_ystride,_thresh);
+}
+
+unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh){
+  unsigned total;
+  int      row;
+  int      col;
+  total=0;
+  /*Stop after the first row that pushes the running total past _thresh.*/
+  for(row=0;row<8&&total<=_thresh;row++){
+    for(col=0;col<8;col++)total+=abs(_src[col]-_ref[col]);
+    _src+=_ystride;
+    _ref+=_ystride;
+  }
+  return total;
+}
+
+unsigned oc_enc_frag_sad2_thresh(const oc_enc_ctx *_enc,
+ const unsigned char *_src,const unsigned char *_ref1,
+ const unsigned char *_ref2,int _ystride,unsigned _thresh){
+  /*Dispatches through the encoder's opt_vtable.*/
+  return (*_enc->opt_vtable.frag_sad2_thresh)(_src,_ref1,_ref2,_ystride,_thresh);
+}
+
+unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh){
+  unsigned total;
+  int      row;
+  int      col;
+  total=0;
+  /*Compare against the truncating average of the two references; stop early.*/
+  for(row=0;row<8&&total<=_thresh;row++){
+    for(col=0;col<8;col++)total+=abs(_src[col]-((_ref1[col]+_ref2[col])>>1));
+    _src+=_ystride;
+    _ref1+=_ystride;
+    _ref2+=_ystride;
+  }
+  return total;
+}
+
+static void oc_diff_hadamard(ogg_int16_t _buf[64],const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  int i;                 /*Index of the source row being transformed.*/
+  for(i=0;i<8;i++){
+    int t0;
+    int t1;
+    int t2;
+    int t3;
+    int t4;
+    int t5;
+    int t6;
+    int t7;
+    int r;               /*Scratch for the in-place butterflies.*/
+    /*Hadamard stage 1, applied to the row of differences _src-_ref:*/
+    t0=_src[0]-_ref[0]+_src[4]-_ref[4];
+    t4=_src[0]-_ref[0]-_src[4]+_ref[4];
+    t1=_src[1]-_ref[1]+_src[5]-_ref[5];
+    t5=_src[1]-_ref[1]-_src[5]+_ref[5];
+    t2=_src[2]-_ref[2]+_src[6]-_ref[6];
+    t6=_src[2]-_ref[2]-_src[6]+_ref[6];
+    t3=_src[3]-_ref[3]+_src[7]-_ref[7];
+    t7=_src[3]-_ref[3]-_src[7]+_ref[7];
+    /*Hadamard stage 2 (in place; r saves the value about to be overwritten):*/
+    r=t0;
+    t0+=t2;
+    t2=r-t2;
+    r=t1;
+    t1+=t3;
+    t3=r-t3;
+    r=t4;
+    t4+=t6;
+    t6=r-t6;
+    r=t5;
+    t5+=t7;
+    t7=r-t7;
+    /*Hadamard stage 3; output k of source row i is stored at _buf[k*8+i]:*/
+    _buf[0*8+i]=(ogg_int16_t)(t0+t1);
+    _buf[1*8+i]=(ogg_int16_t)(t0-t1);
+    _buf[2*8+i]=(ogg_int16_t)(t2+t3);
+    _buf[3*8+i]=(ogg_int16_t)(t2-t3);
+    _buf[4*8+i]=(ogg_int16_t)(t4+t5);
+    _buf[5*8+i]=(ogg_int16_t)(t4-t5);
+    _buf[6*8+i]=(ogg_int16_t)(t6+t7);
+    _buf[7*8+i]=(ogg_int16_t)(t6-t7);
+    _src+=_ystride;
+    _ref+=_ystride;
+  }
+}
+
+static void oc_diff_hadamard2(ogg_int16_t _buf[64],const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
+  int i;                 /*Index of the source row being transformed.*/
+  for(i=0;i<8;i++){
+    int t0;
+    int t1;
+    int t2;
+    int t3;
+    int t4;              /*t4..t7 first hold an averaged reference pixel.*/
+    int t5;
+    int t6;
+    int t7;
+    int r;               /*Scratch; in stage 1 an averaged reference pixel.*/
+    /*Hadamard stage 1; the reference is the average (_ref1+_ref2)>>1:*/
+    r=_ref1[0]+_ref2[0]>>1;
+    t4=_ref1[4]+_ref2[4]>>1;
+    t0=_src[0]-r+_src[4]-t4;
+    t4=_src[0]-r-_src[4]+t4;
+    r=_ref1[1]+_ref2[1]>>1;
+    t5=_ref1[5]+_ref2[5]>>1;
+    t1=_src[1]-r+_src[5]-t5;
+    t5=_src[1]-r-_src[5]+t5;
+    r=_ref1[2]+_ref2[2]>>1;
+    t6=_ref1[6]+_ref2[6]>>1;
+    t2=_src[2]-r+_src[6]-t6;
+    t6=_src[2]-r-_src[6]+t6;
+    r=_ref1[3]+_ref2[3]>>1;
+    t7=_ref1[7]+_ref2[7]>>1;
+    t3=_src[3]-r+_src[7]-t7;
+    t7=_src[3]-r-_src[7]+t7;
+    /*Hadamard stage 2 (in place; r saves the value about to be overwritten):*/
+    r=t0;
+    t0+=t2;
+    t2=r-t2;
+    r=t1;
+    t1+=t3;
+    t3=r-t3;
+    r=t4;
+    t4+=t6;
+    t6=r-t6;
+    r=t5;
+    t5+=t7;
+    t7=r-t7;
+    /*Hadamard stage 3; output k of source row i is stored at _buf[k*8+i]:*/
+    _buf[0*8+i]=(ogg_int16_t)(t0+t1);
+    _buf[1*8+i]=(ogg_int16_t)(t0-t1);
+    _buf[2*8+i]=(ogg_int16_t)(t2+t3);
+    _buf[3*8+i]=(ogg_int16_t)(t2-t3);
+    _buf[4*8+i]=(ogg_int16_t)(t4+t5);
+    _buf[5*8+i]=(ogg_int16_t)(t4-t5);
+    _buf[6*8+i]=(ogg_int16_t)(t6+t7);
+    _buf[7*8+i]=(ogg_int16_t)(t6-t7);
+    _src+=_ystride;
+    _ref1+=_ystride;
+    _ref2+=_ystride;
+  }
+}
+
+static void oc_intra_hadamard(ogg_int16_t _buf[64],const unsigned char *_src,
+ int _ystride){
+  int i;                 /*Index of the source row being transformed.*/
+  for(i=0;i<8;i++){
+    int t0;
+    int t1;
+    int t2;
+    int t3;
+    int t4;
+    int t5;
+    int t6;
+    int t7;
+    int r;               /*Scratch for the in-place butterflies.*/
+    /*Hadamard stage 1, applied directly to the source pixels of this row:*/
+    t0=_src[0]+_src[4];
+    t4=_src[0]-_src[4];
+    t1=_src[1]+_src[5];
+    t5=_src[1]-_src[5];
+    t2=_src[2]+_src[6];
+    t6=_src[2]-_src[6];
+    t3=_src[3]+_src[7];
+    t7=_src[3]-_src[7];
+    /*Hadamard stage 2 (in place; r saves the value about to be overwritten):*/
+    r=t0;
+    t0+=t2;
+    t2=r-t2;
+    r=t1;
+    t1+=t3;
+    t3=r-t3;
+    r=t4;
+    t4+=t6;
+    t6=r-t6;
+    r=t5;
+    t5+=t7;
+    t7=r-t7;
+    /*Hadamard stage 3; output k of source row i is stored at _buf[k*8+i]:*/
+    _buf[0*8+i]=(ogg_int16_t)(t0+t1);
+    _buf[1*8+i]=(ogg_int16_t)(t0-t1);
+    _buf[2*8+i]=(ogg_int16_t)(t2+t3);
+    _buf[3*8+i]=(ogg_int16_t)(t2-t3);
+    _buf[4*8+i]=(ogg_int16_t)(t4+t5);
+    _buf[5*8+i]=(ogg_int16_t)(t4-t5);
+    _buf[6*8+i]=(ogg_int16_t)(t6+t7);
+    _buf[7*8+i]=(ogg_int16_t)(t6-t7);
+    _src+=_ystride;
+  }
+}
+
+unsigned oc_hadamard_sad_thresh(const ogg_int16_t _buf[64],unsigned _thresh){
+  unsigned    sad;       /*Running sum of absolute transform outputs.*/
+  int         t0;
+  int         t1;
+  int         t2;
+  int         t3;
+  int         t4;
+  int         t5;
+  int         t6;
+  int         t7;
+  int         r;         /*Scratch in stage 2; per-row accumulator in stage 3.*/
+  int         i;         /*Index of the row of _buf being transformed.*/
+  sad=0;
+  for(i=0;i<8;i++){
+    /*Hadamard stage 1, applied to row i of _buf:*/
+    t0=_buf[i*8+0]+_buf[i*8+4];
+    t4=_buf[i*8+0]-_buf[i*8+4];
+    t1=_buf[i*8+1]+_buf[i*8+5];
+    t5=_buf[i*8+1]-_buf[i*8+5];
+    t2=_buf[i*8+2]+_buf[i*8+6];
+    t6=_buf[i*8+2]-_buf[i*8+6];
+    t3=_buf[i*8+3]+_buf[i*8+7];
+    t7=_buf[i*8+3]-_buf[i*8+7];
+    /*Hadamard stage 2 (in place; r saves the value about to be overwritten):*/
+    r=t0;
+    t0+=t2;
+    t2=r-t2;
+    r=t1;
+    t1+=t3;
+    t3=r-t3;
+    r=t4;
+    t4+=t6;
+    t6=r-t6;
+    r=t5;
+    t5+=t7;
+    t7=r-t7;
+    /*Hadamard stage 3; only the absolute values of the outputs are summed:*/
+    r=abs(t0+t1);
+    r+=abs(t0-t1);
+    r+=abs(t2+t3);
+    r+=abs(t2-t3);
+    r+=abs(t4+t5);
+    r+=abs(t4-t5);
+    r+=abs(t6+t7);
+    r+=abs(t6-t7);
+    sad+=r;
+    if(sad>_thresh)break; /*Early out: the partial sum is what is returned.*/
+  }
+  return sad;
+}
+
+unsigned oc_enc_frag_satd_thresh(const oc_enc_ctx *_enc,
+ const unsigned char *_src,const unsigned char *_ref,int _ystride,
+ unsigned _thresh){  /*Dispatches through the encoder's opt_vtable.*/
+  return (*_enc->opt_vtable.frag_satd_thresh)(_src,_ref,_ystride,_thresh);
+}
+
+unsigned oc_enc_frag_satd_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh){
+  ogg_int16_t buf[64];                         /*Row pass output, transposed.*/
+  oc_diff_hadamard(buf,_src,_ref,_ystride);    /*Transform rows of _src-_ref.*/
+  return oc_hadamard_sad_thresh(buf,_thresh);  /*Second pass and abs sum.*/
+}
+
+unsigned oc_enc_frag_satd2_thresh(const oc_enc_ctx *_enc,
+ const unsigned char *_src,const unsigned char *_ref1,
+ const unsigned char *_ref2,int _ystride,unsigned _thresh){
+  /*Dispatches through the encoder's opt_vtable.*/
+  return (*_enc->opt_vtable.frag_satd2_thresh)(_src,_ref1,_ref2,_ystride,_thresh);
+}
+
+unsigned oc_enc_frag_satd2_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh){
+  ogg_int16_t buf[64];                         /*Row pass output, transposed.*/
+  oc_diff_hadamard2(buf,_src,_ref1,_ref2,_ystride); /*Averaged-reference rows.*/
+  return oc_hadamard_sad_thresh(buf,_thresh);  /*Second pass and abs sum.*/
+}
+
+unsigned oc_enc_frag_intra_satd(const oc_enc_ctx *_enc,
+ const unsigned char *_src,int _ystride){
+  return (*_enc->opt_vtable.frag_intra_satd)(_src,_ystride); /*Via opt_vtable.*/
+}
+
+unsigned oc_enc_frag_intra_satd_c(const unsigned char *_src,int _ystride){
+  ogg_int16_t buf[64];
+  oc_intra_hadamard(buf,_src,_ystride);
+  /*UINT_MAX rules out an early exit; the sum of buf[0..7] is the first output
+     of the second pass on row 0, whose absolute value is removed again.*/
+  return oc_hadamard_sad_thresh(buf,UINT_MAX)-abs(buf[0]+buf[1]+buf[2]+buf[3]+buf[4]+buf[5]+buf[6]+buf[7]);
+}
+
+void oc_enc_frag_copy2(const oc_enc_ctx *_enc,unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride){
+  (*_enc->opt_vtable.frag_copy2)(_dst,_src1,_src2,_ystride); /*Via opt_vtable.*/
+}
+
+void oc_enc_frag_copy2_c(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride){
+  int k;
+  /*Every output pixel is the truncating average of the two sources.*/
+  for(k=0;k<64;k++){
+    int off;
+    /*Row k/8, column k%8 of the 8x8 block.*/
+    off=(k>>3)*_ystride+(k&7);
+    _dst[off]=(_src1[off]+_src2[off])>>1;
+  }
+}
+
+void oc_enc_frag_recon_intra(const oc_enc_ctx *_enc,
+ unsigned char *_dst,int _ystride,const ogg_int16_t _residue[64]){
+  (*_enc->opt_vtable.frag_recon_intra)(_dst,_ystride,_residue); /*Via opt_vtable.*/
+}
+
+void oc_enc_frag_recon_inter(const oc_enc_ctx *_enc,unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){
+  (*_enc->opt_vtable.frag_recon_inter)(_dst,_src,_ystride,_residue); /*Via opt_vtable.*/
+}
diff --git a/lib/encinfo.c b/lib/encinfo.c
new file mode 100644
index 0000000..83be1da
--- /dev/null
+++ b/lib/encinfo.c
@@ -0,0 +1,121 @@
+#include <stdlib.h>
+#include <string.h>
+#include "internal.h"
+#include "enquant.h"
+#include "huffenc.h"
+
+
+
+/*Appends a run of octets, taken in order from a byte array, to a pack buffer.
+  _opb: The pack buffer that receives the octets.
+  _buf: The first of the bytes to append.
+  _len: How many octets to append.*/
+static void oc_pack_octets(oggpack_buffer *_opb,const char *_buf,int _len){
+  /*A zero or negative length writes nothing.*/
+  while(_len-->0)oggpackB_write(_opb,*_buf++,8);
+}
+
+/*Writes the header packet selected by *_packet_state into _op and advances it.
+  Returns 0 if no header remains, a TH_E* error, or else the new state plus 3.*/
+int oc_state_flushheader(oc_theora_state *_state,int *_packet_state,
+ oggpack_buffer *_opb,const th_quant_info *_qinfo,
+ const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS],
+ const char *_vendor,th_comment *_tc,ogg_packet *_op){
+  unsigned char *packet;
+  int            b_o_s;
+  if(_op==NULL)return TH_EFAULT;
+  switch(*_packet_state){
+    /*Codec info header.*/
+    case OC_PACKET_INFO_HDR:{
+      if(_state==NULL)return TH_EFAULT;
+      oggpackB_reset(_opb);
+      /*Mark this packet as the info header.*/
+      oggpackB_write(_opb,0x80,8);
+      /*Write the codec string.*/
+      oc_pack_octets(_opb,"theora",6);
+      /*Write the codec bitstream version.*/
+      oggpackB_write(_opb,TH_VERSION_MAJOR,8);
+      oggpackB_write(_opb,TH_VERSION_MINOR,8);
+      oggpackB_write(_opb,TH_VERSION_SUB,8);
+      /*Describe the encoded frame.*/
+      oggpackB_write(_opb,_state->info.frame_width>>4,16);
+      oggpackB_write(_opb,_state->info.frame_height>>4,16);
+      oggpackB_write(_opb,_state->info.pic_width,24);
+      oggpackB_write(_opb,_state->info.pic_height,24);
+      oggpackB_write(_opb,_state->info.pic_x,8);
+      oggpackB_write(_opb,_state->info.pic_y,8);
+      oggpackB_write(_opb,_state->info.fps_numerator,32);
+      oggpackB_write(_opb,_state->info.fps_denominator,32);
+      oggpackB_write(_opb,_state->info.aspect_numerator,24);
+      oggpackB_write(_opb,_state->info.aspect_denominator,24);
+      oggpackB_write(_opb,_state->info.colorspace,8);
+      oggpackB_write(_opb,_state->info.target_bitrate,24);
+      oggpackB_write(_opb,_state->info.quality,6);
+      oggpackB_write(_opb,_state->info.keyframe_granule_shift,5);
+      oggpackB_write(_opb,_state->info.pixel_fmt,2);
+      /*Spare configuration bits.*/
+      oggpackB_write(_opb,0,3);
+      b_o_s=1;  /*Only the info header is flagged as beginning the stream.*/
+    }break;
+    /*Comment header.*/
+    case OC_PACKET_COMMENT_HDR:{
+      int vendor_len;
+      int i;
+      if(_tc==NULL)return TH_EFAULT;
+      vendor_len=strlen(_vendor);
+      oggpackB_reset(_opb);
+      /*Mark this packet as the comment header.*/
+      oggpackB_write(_opb,0x81,8);
+      /*Write the codec string.*/
+      oc_pack_octets(_opb,"theora",6);
+      /*Write the vendor string.
+        NOTE(review): lengths use oggpack_write(), not oggpackB_write(); confirm.*/
+      oggpack_write(_opb,vendor_len,32);
+      oc_pack_octets(_opb,_vendor,vendor_len);
+      oggpack_write(_opb,_tc->comments,32);
+      for(i=0;i<_tc->comments;i++){
+        if(_tc->user_comments[i]!=NULL){
+          oggpack_write(_opb,_tc->comment_lengths[i],32);
+          oc_pack_octets(_opb,_tc->user_comments[i],_tc->comment_lengths[i]);
+        }
+        else oggpack_write(_opb,0,32);  /*A NULL comment is written as empty.*/
+      }
+      b_o_s=0;
+    }break;
+    /*Codec setup header.*/
+    case OC_PACKET_SETUP_HDR:{
+      int ret;
+      oggpackB_reset(_opb);
+      /*Mark this packet as the setup header.*/
+      oggpackB_write(_opb,0x82,8);
+      /*Write the codec string.*/
+      oc_pack_octets(_opb,"theora",6);
+      /*Write the quantizer tables.*/
+      oc_quant_params_pack(_opb,_qinfo);
+      /*Write the huffman codes.*/
+      ret=oc_huff_codes_pack(_opb,_codes);
+      /*This should never happen, because we validate the tables when they
+         are set.
+        If you see this, there's a good chance memory is being corrupted.*/
+      if(ret<0)return ret;
+      b_o_s=0;
+    }break;
+    /*No more headers to emit.*/
+    default:return 0;
+  }
+  /*This is kind of fugly: we hand the user a buffer which they do not own.
+    We will overwrite it when the next packet is output, so the user better be
+     done with it by then.
+    Vorbis is little better: it hands back buffers that it will free the next
+     time the headers are requested, or when the encoder is cleared.
+    Hopefully libogg2 will make this much cleaner.*/
+  packet=oggpackB_get_buffer(_opb);
+  /*If there's no packet, malloc failed while writing.*/
+  if(packet==NULL)return TH_EFAULT;
+  _op->packet=packet;
+  _op->bytes=oggpackB_bytes(_opb);
+  _op->b_o_s=b_o_s;
+  _op->e_o_s=0;
+  _op->granulepos=0;
+  _op->packetno=*_packet_state+3;
+  return ++(*_packet_state)+3;  /*Advance to the next header state.*/
+}
diff --git a/lib/encint.h b/lib/encint.h
new file mode 100644
index 0000000..97897d5
--- /dev/null
+++ b/lib/encint.h
@@ -0,0 +1,493 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: encint.h 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+#if !defined(_encint_H)
+# define _encint_H (1)
+# if defined(HAVE_CONFIG_H)
+#  include "config.h"
+# endif
+# include "theora/theoraenc.h"
+# include "internal.h"
+# include "ocintrin.h"
+# include "mathops.h"
+# include "enquant.h"
+# include "huffenc.h"
+/*# define OC_COLLECT_METRICS*/
+
+
+
+typedef oc_mv                         oc_mv2[2];
+
+typedef struct oc_enc_opt_vtable      oc_enc_opt_vtable;
+typedef struct oc_mb_enc_info         oc_mb_enc_info;
+typedef struct oc_mode_scheme_chooser oc_mode_scheme_chooser;
+typedef struct oc_iir_filter          oc_iir_filter;
+typedef struct oc_frame_metrics       oc_frame_metrics;
+typedef struct oc_rc_state            oc_rc_state;
+typedef struct th_enc_ctx             oc_enc_ctx;
+typedef struct oc_token_checkpoint    oc_token_checkpoint;
+
+
+
+/*Constants for the packet-out state machine specific to the encoder.*/
+
+/*Next packet to emit: Data packet, but none are ready yet.*/
+#define OC_PACKET_EMPTY (0)
+/*Next packet to emit: Data packet, and one is ready.*/
+#define OC_PACKET_READY (1)
+
+/*All features enabled.*/
+#define OC_SP_LEVEL_SLOW       (0)
+/*Enable early skip.*/
+#define OC_SP_LEVEL_EARLY_SKIP (1)
+/*Disable motion compensation.*/
+#define OC_SP_LEVEL_NOMC       (2)
+/*Maximum valid speed level.*/
+#define OC_SP_LEVEL_MAX        (2)
+
+
+/*The bits used for each of the MB mode codebooks.*/
+extern const unsigned char OC_MODE_BITS[2][OC_NMODES];
+
+/*The bits used for each of the MV codebooks.*/
+extern const unsigned char OC_MV_BITS[2][64];
+
+/*The minimum value that can be stored in a SB run for each codeword.
+  The last entry is one more than the maximum length of a single SB run.*/
+extern const ogg_uint16_t  OC_SB_RUN_VAL_MIN[8];
+/*The bits used for each SB run codeword.*/
+extern const unsigned char OC_SB_RUN_CODE_NBITS[7];
+
+/*The bits used for each block run length (starting with 1).*/
+extern const unsigned char OC_BLOCK_RUN_CODE_NBITS[30];
+
+
+
+/*Encoder specific functions with accelerated variants.*/
+struct oc_enc_opt_vtable{
+  unsigned (*frag_sad)(const unsigned char *_src,
+   const unsigned char *_ref,int _ystride);
+  unsigned (*frag_sad_thresh)(const unsigned char *_src,
+   const unsigned char *_ref,int _ystride,unsigned _thresh);
+  unsigned (*frag_sad2_thresh)(const unsigned char *_src,
+   const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+   unsigned _thresh);
+  unsigned (*frag_satd_thresh)(const unsigned char *_src,
+   const unsigned char *_ref,int _ystride,unsigned _thresh);
+  unsigned (*frag_satd2_thresh)(const unsigned char *_src,
+   const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+   unsigned _thresh);
+  unsigned (*frag_intra_satd)(const unsigned char *_src,int _ystride);
+  void     (*frag_sub)(ogg_int16_t _diff[64],const unsigned char *_src,
+   const unsigned char *_ref,int _ystride);
+  void     (*frag_sub_128)(ogg_int16_t _diff[64],
+   const unsigned char *_src,int _ystride);
+  void     (*frag_copy2)(unsigned char *_dst,
+   const unsigned char *_src1,const unsigned char *_src2,int _ystride);
+  void     (*frag_recon_intra)(unsigned char *_dst,int _ystride,
+   const ogg_int16_t _residue[64]);
+  void     (*frag_recon_inter)(unsigned char *_dst,
+   const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
+  void     (*fdct8x8)(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+};
+
+
+void oc_enc_vtable_init(oc_enc_ctx *_enc);
+
+
+
+/*Encoder-specific macroblock information.*/
+struct oc_mb_enc_info{
+  /*Neighboring macro blocks that have MVs available from the current frame.*/
+  unsigned      cneighbors[4];
+  /*Neighboring macro blocks to use for MVs from the previous frame.*/
+  unsigned      pneighbors[4];
+  /*The number of current-frame neighbors.*/
+  unsigned char ncneighbors;
+  /*The number of previous-frame neighbors.*/
+  unsigned char npneighbors;
+  /*Flags indicating which MB modes have been refined.*/
+  unsigned char refined;
+  /*Motion vectors for a macro block for the current frame and the
+     previous two frames.
+    Each is a set of 2 vectors against OC_FRAME_GOLD and OC_FRAME_PREV, which
+     can be used to estimate constant velocity and constant acceleration
+     predictors.
+    Uninitialized MVs are (0,0).*/
+  oc_mv2        analysis_mv[3];
+  /*Current unrefined analysis MVs.*/
+  oc_mv         unref_mv[2];
+  /*Unrefined block MVs.*/
+  oc_mv         block_mv[4];
+  /*Refined block MVs.*/
+  oc_mv         ref_mv[4];
+  /*Minimum motion estimation error from the analysis stage.*/
+  ogg_uint16_t  error[2];
+  /*MB error for half-pel refinement for each frame type.*/
+  unsigned      satd[2];
+  /*Block error for half-pel refinement.*/
+  unsigned      block_satd[4];
+};
+
+
+
+/*State machine to estimate the opportunity cost of coding a MB mode.*/
+struct oc_mode_scheme_chooser{
+  /*Pointers to the lists containing the index of each mode in the mode
+     alphabet used by each scheme.
+    The first entry points to the dynamic scheme0_ranks, while the remaining 7
+     point to the constant entries stored in OC_MODE_SCHEMES.*/
+  const unsigned char *mode_ranks[8];
+  /*The ranks for each mode when coded with scheme 0.
+    These are optimized so that the more frequent modes have lower ranks.*/
+  unsigned char        scheme0_ranks[OC_NMODES];
+  /*The list of modes, sorted in descending order of frequency, that
+     corresponds to the ranks above.*/
+  unsigned char        scheme0_list[OC_NMODES];
+  /*The number of times each mode has been chosen so far.*/
+  int                  mode_counts[OC_NMODES];
+  /*The list of mode coding schemes, sorted in ascending order of bit cost.*/
+  unsigned char        scheme_list[8];
+  /*The number of bits used by each mode coding scheme.*/
+  ptrdiff_t            scheme_bits[8];
+};
+
+
+void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser);
+
+
+
+/*A 2nd order low-pass Bessel follower.
+  We use this for rate control because it has fast reaction time, but is
+   critically damped.*/
+struct oc_iir_filter{
+  ogg_int32_t c[2];
+  ogg_int64_t g;
+  ogg_int32_t x[2];
+  ogg_int32_t y[2];
+};
+
+
+
+/*The 2-pass metrics associated with a single frame.*/
+struct oc_frame_metrics{
+  /*The log base 2 of the scale factor for this frame in Q24 format.*/
+  ogg_int32_t   log_scale;
+  /*The number of application-requested duplicates of this frame.*/
+  unsigned      dup_count:31;
+  /*The frame type from pass 1.*/
+  unsigned      frame_type:1;
+};
+
+
+
+/*Rate control state information.*/
+struct oc_rc_state{
+  /*The target average bits per frame.*/
+  ogg_int64_t        bits_per_frame;
+  /*The current buffer fullness (bits available to be used).*/
+  ogg_int64_t        fullness;
+  /*The target buffer fullness.
+    This is where we'd like to be by the last keyframe that appears in the next
+     buf_delay frames.*/
+  ogg_int64_t        target;
+  /*The maximum buffer fullness (total size of the buffer).*/
+  ogg_int64_t        max;
+  /*The log of the number of pixels in a frame in Q57 format.*/
+  ogg_int64_t        log_npixels;
+  /*The exponent used in the rate model in Q8 format.*/
+  unsigned           exp[2];
+  /*The number of frames to distribute the buffer usage over.*/
+  int                buf_delay;
+  /*The total drop count from the previous frame.
+    This includes duplicates explicitly requested via the
+     TH_ENCCTL_SET_DUP_COUNT API as well as frames we chose to drop ourselves.*/
+  ogg_uint32_t       prev_drop_count;
+  /*The log of an estimated scale factor used to obtain the real framerate, for
+     VFR sources or, e.g., 12 fps content doubled to 24 fps, etc.*/
+  ogg_int64_t        log_drop_scale;
+  /*The log of the estimated scale factor for the rate model in Q57 format.*/
+  ogg_int64_t        log_scale[2];
+  /*The log of the target quantizer level in Q57 format.*/
+  ogg_int64_t        log_qtarget;
+  /*Will we drop frames to meet bitrate target?*/
+  unsigned char      drop_frames;
+  /*Do we respect the maximum buffer fullness?*/
+  unsigned char      cap_overflow;
+  /*Can the reservoir go negative?*/
+  unsigned char      cap_underflow;
+  /*Second-order lowpass filters to track scale and VFR.*/
+  oc_iir_filter      scalefilter[2];
+  int                inter_count;
+  int                inter_delay;
+  int                inter_delay_target;
+  oc_iir_filter      vfrfilter;
+  /*Two-pass mode state.
+    0 => 1-pass encoding.
+    1 => 1st pass of 2-pass encoding.
+    2 => 2nd pass of 2-pass encoding.*/
+  int                twopass;
+  /*Buffer for current frame metrics.*/
+  unsigned char      twopass_buffer[48];
+  /*The number of bytes in the frame metrics buffer.
+    When 2-pass encoding is enabled, this is set to 0 after each frame is
+     submitted, and must be non-zero before the next frame will be accepted.*/
+  int                twopass_buffer_bytes;
+  int                twopass_buffer_fill;
+  /*Whether or not to force the next frame to be a keyframe.*/
+  unsigned char      twopass_force_kf;
+  /*The metrics for the previous frame.*/
+  oc_frame_metrics   prev_metrics;
+  /*The metrics for the current frame.*/
+  oc_frame_metrics   cur_metrics;
+  /*The buffered metrics for future frames.*/
+  oc_frame_metrics  *frame_metrics;
+  int                nframe_metrics;
+  int                cframe_metrics;
+  /*The index of the current frame in the circular metric buffer.*/
+  int                frame_metrics_head;
+  /*The frame count of each type (keyframes, delta frames, and dup frames);
+     32 bits limits us to 2.268 years at 60 fps.*/
+  ogg_uint32_t       frames_total[3];
+  /*The number of frames of each type yet to be processed.*/
+  ogg_uint32_t       frames_left[3];
+  /*The sum of the scale values for each frame type.*/
+  ogg_int64_t        scale_sum[2];
+  /*The start of the window over which the current scale sums are taken.*/
+  int                scale_window0;
+  /*The end of the window over which the current scale sums are taken.*/
+  int                scale_window_end;
+  /*The frame count of each type in the current 2-pass window; this does not
+     include dup frames.*/
+  int                nframes[3];
+  /*The total accumulated estimation bias.*/
+  ogg_int64_t        rate_bias;
+};
+
+
+void oc_rc_state_init(oc_rc_state *_rc,oc_enc_ctx *_enc);
+void oc_rc_state_clear(oc_rc_state *_rc);
+
+void oc_enc_rc_resize(oc_enc_ctx *_enc);
+int oc_enc_select_qi(oc_enc_ctx *_enc,int _qti,int _clamp);
+void oc_enc_calc_lambda(oc_enc_ctx *_enc,int _frame_type);
+int oc_enc_update_rc_state(oc_enc_ctx *_enc,
+ long _bits,int _qti,int _qi,int _trial,int _droppable);
+int oc_enc_rc_2pass_out(oc_enc_ctx *_enc,unsigned char **_buf);
+int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes);
+
+
+
+/*The internal encoder state.*/
+struct th_enc_ctx{
+  /*Shared encoder/decoder state.*/
+  oc_theora_state          state;
+  /*Buffer in which to assemble packets.*/
+  oggpack_buffer           opb;
+  /*Encoder-specific macroblock information.*/
+  oc_mb_enc_info          *mb_info;
+  /*DC coefficients after prediction.*/
+  ogg_int16_t             *frag_dc;
+  /*The list of coded macro blocks, in coded order.*/
+  unsigned                *coded_mbis;
+  /*The number of coded macro blocks.*/
+  size_t                   ncoded_mbis;
+  /*Whether or not packets are ready to be emitted.
+    This takes on negative values while there are remaining header packets to
+     be emitted, reaches 0 when the codec is ready for input, and becomes
+     positive when a frame has been processed and data packets are ready.*/
+  int                      packet_state;
+  /*The maximum distance between keyframes.*/
+  ogg_uint32_t             keyframe_frequency_force;
+  /*The number of duplicates to produce for the next frame.*/
+  ogg_uint32_t             dup_count;
+  /*The number of duplicates remaining to be emitted for the current frame.*/
+  ogg_uint32_t             nqueued_dups;
+  /*The number of duplicates emitted for the last frame.*/
+  ogg_uint32_t             prev_dup_count;
+  /*The current speed level.*/
+  int                      sp_level;
+  /*Whether or not VP3 compatibility mode has been enabled.*/
+  unsigned char            vp3_compatible;
+  /*Whether or not any INTER frames have been coded.*/
+  unsigned char            coded_inter_frame;
+  /*Whether or not previous frame was dropped.*/
+  unsigned char            prevframe_dropped;
+  /*Stores most recently chosen Huffman tables for each frame type, DC and AC
+     coefficients, and luma and chroma tokens.
+    The actual Huffman table used for a given coefficient depends not only on
+     the choice made here, but also its index in the zig-zag ordering.*/
+  unsigned char            huff_idxs[2][2][2];
+  /*Current count of bits used by each MV coding mode.*/
+  size_t                   mv_bits[2];
+  /*The mode scheme chooser for estimating mode coding costs.*/
+  oc_mode_scheme_chooser   chooser;
+  /*The number of vertical super blocks in an MCU.*/
+  int                      mcu_nvsbs;
+  /*The SSD error for skipping each fragment in the current MCU.*/
+  unsigned                *mcu_skip_ssd;
+  /*The DCT token lists for each coefficient and each plane.*/
+  unsigned char          **dct_tokens[3];
+  /*The extra bits associated with each DCT token.*/
+  ogg_uint16_t           **extra_bits[3];
+  /*The number of DCT tokens for each coefficient for each plane.*/
+  ptrdiff_t                ndct_tokens[3][64];
+  /*Pending EOB runs for each coefficient for each plane.*/
+  ogg_uint16_t             eob_run[3][64];
+  /*The offset of the first DCT token for each coefficient for each plane.*/
+  unsigned char            dct_token_offs[3][64];
+  /*The last DC coefficient for each plane and reference frame.*/
+  int                      dc_pred_last[3][3];
+#if defined(OC_COLLECT_METRICS)
+  /*Fragment SATD statistics for MB mode estimation metrics.*/
+  unsigned                *frag_satd;
+  /*Fragment SSD statistics for MB mode estimation metrics.*/
+  unsigned                *frag_ssd;
+#endif
+  /*The R-D optimization parameter.*/
+  int                      lambda;
+  /*The huffman tables in use.*/
+  th_huff_code             huff_codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
+  /*The quantization parameters in use.*/
+  th_quant_info            qinfo;
+  oc_iquant               *enquant_tables[64][3][2];
+  oc_iquant_table          enquant_table_data[64][3][2];
+  /*An "average" quantizer for each quantizer type (INTRA or INTER) and qi
+     value.
+    This is used to parameterize the rate control decisions.
+    They are kept in the log domain to simplify later processing.
+    Keep in mind these are DCT domain quantizers, and so are scaled by an
+     additional factor of 4 from the pixel domain.*/
+  ogg_int64_t              log_qavg[2][64];
+  /*The buffer state used to drive rate control.*/
+  oc_rc_state              rc;
+  /*Table for encoder acceleration functions.*/
+  oc_enc_opt_vtable        opt_vtable;
+};
+
+
+void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode);
+int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode);
+#if defined(OC_COLLECT_METRICS)
+void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc);
+void oc_enc_mode_metrics_dump(oc_enc_ctx *_enc);
+#endif
+
+
+
+/*Perform fullpel motion search for a single MB against both reference frames.*/
+void oc_mcenc_search(oc_enc_ctx *_enc,int _mbi);
+/*Refine a MB MV for one frame.*/
+void oc_mcenc_refine1mv(oc_enc_ctx *_enc,int _mbi,int _frame);
+/*Refine the block MVs.*/
+void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi);
+
+
+
+/*Used to rollback a tokenlog transaction when we retroactively decide to skip
+   a fragment.
+  A checkpoint is taken right before each token is added.*/
+struct oc_token_checkpoint{
+  /*The color plane the token was added to.*/
+  unsigned char pli;
+  /*The zig-zag index the token was added to.*/
+  unsigned char zzi;
+  /*The outstanding EOB run count before the token was added.*/
+  ogg_uint16_t  eob_run;
+  /*The token count before the token was added.*/
+  ptrdiff_t     ndct_tokens;
+};
+
+
+
+void oc_enc_tokenize_start(oc_enc_ctx *_enc);
+int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
+ ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ int _zzi,oc_token_checkpoint **_stack,int _acmin);
+void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc,
+ const oc_token_checkpoint *_stack,int _n);
+void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
+ int _pli,int _fragy0,int _frag_yend);
+void oc_enc_tokenize_dc_frag_list(oc_enc_ctx *_enc,int _pli,
+ const ptrdiff_t *_coded_fragis,ptrdiff_t _ncoded_fragis,
+ int _prev_ndct_tokens1,int _prev_eob_run1);
+void oc_enc_tokenize_finish(oc_enc_ctx *_enc);
+
+
+
+/*Utility routine to encode one of the header packets.*/
+int oc_state_flushheader(oc_theora_state *_state,int *_packet_state,
+ oggpack_buffer *_opb,const th_quant_info *_qinfo,
+ const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS],
+ const char *_vendor,th_comment *_tc,ogg_packet *_op);
+
+
+
+/*Encoder-specific accelerated functions.*/
+void oc_enc_frag_sub(const oc_enc_ctx *_enc,ogg_int16_t _diff[64],
+ const unsigned char *_src,const unsigned char *_ref,int _ystride);
+void oc_enc_frag_sub_128(const oc_enc_ctx *_enc,ogg_int16_t _diff[64],
+ const unsigned char *_src,int _ystride);
+unsigned oc_enc_frag_sad(const oc_enc_ctx *_enc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_sad_thresh(const oc_enc_ctx *_enc,
+ const unsigned char *_src,const unsigned char *_ref,int _ystride,
+ unsigned _thresh);
+unsigned oc_enc_frag_sad2_thresh(const oc_enc_ctx *_enc,
+ const unsigned char *_src,const unsigned char *_ref1,
+ const unsigned char *_ref2,int _ystride,unsigned _thresh);
+unsigned oc_enc_frag_satd_thresh(const oc_enc_ctx *_enc,
+ const unsigned char *_src,const unsigned char *_ref,int _ystride,
+ unsigned _thresh);
+unsigned oc_enc_frag_satd2_thresh(const oc_enc_ctx *_enc,
+ const unsigned char *_src,const unsigned char *_ref1,
+ const unsigned char *_ref2,int _ystride,unsigned _thresh);
+unsigned oc_enc_frag_intra_satd(const oc_enc_ctx *_enc,
+ const unsigned char *_src,int _ystride);
+void oc_enc_frag_copy2(const oc_enc_ctx *_enc,unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride);
+void oc_enc_frag_recon_intra(const oc_enc_ctx *_enc,
+ unsigned char *_dst,int _ystride,const ogg_int16_t _residue[64]);
+void oc_enc_frag_recon_inter(const oc_enc_ctx *_enc,unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
+void oc_enc_fdct8x8(const oc_enc_ctx *_enc,ogg_int16_t _y[64],
+ const ogg_int16_t _x[64]);
+
+/*Default pure-C implementations.*/
+void oc_enc_vtable_init_c(oc_enc_ctx *_enc);
+
+void oc_enc_frag_sub_c(ogg_int16_t _diff[64],
+ const unsigned char *_src,const unsigned char *_ref,int _ystride);
+void oc_enc_frag_sub_128_c(ogg_int16_t _diff[64],
+ const unsigned char *_src,int _ystride);
+void oc_enc_frag_copy2_c(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride);
+unsigned oc_enc_frag_sad_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh);
+unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh);
+unsigned oc_enc_frag_satd_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh);
+unsigned oc_enc_frag_satd2_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh);
+unsigned oc_enc_frag_intra_satd_c(const unsigned char *_src,int _ystride);
+void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+
+#endif
diff --git a/lib/encode.c b/lib/encode.c
new file mode 100644
index 0000000..0c5ea6a
--- /dev/null
+++ b/lib/encode.c
@@ -0,0 +1,1615 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: encode.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+#include <stdlib.h>
+#include <string.h>
+#include "encint.h"
+#if defined(OC_X86_ASM)
+# include "x86/x86enc.h"
+#endif
+
+
+
+/*The default quantization parameters used by VP3.1.*/
+static const int OC_VP31_RANGE_SIZES[1]={63};
+static const th_quant_base OC_VP31_BASES_INTRA_Y[2]={
+  {
+     16, 11, 10, 16, 24, 40, 51, 61,
+     12, 12, 14, 19, 26, 58, 60, 55,
+     14, 13, 16, 24, 40, 57, 69, 56,
+     14, 17, 22, 29, 51, 87, 80, 62,
+     18, 22, 37, 58, 68,109,103, 77,
+     24, 35, 55, 64, 81,104,113, 92,
+     49, 64, 78, 87,103,121,120,101,
+     72, 92, 95, 98,112,100,103, 99
+  },
+  {
+     16, 11, 10, 16, 24, 40, 51, 61,
+     12, 12, 14, 19, 26, 58, 60, 55,
+     14, 13, 16, 24, 40, 57, 69, 56,
+     14, 17, 22, 29, 51, 87, 80, 62,
+     18, 22, 37, 58, 68,109,103, 77,
+     24, 35, 55, 64, 81,104,113, 92,
+     49, 64, 78, 87,103,121,120,101,
+     72, 92, 95, 98,112,100,103, 99
+  }
+};
+static const th_quant_base OC_VP31_BASES_INTRA_C[2]={
+  {
+     17, 18, 24, 47, 99, 99, 99, 99,
+     18, 21, 26, 66, 99, 99, 99, 99,
+     24, 26, 56, 99, 99, 99, 99, 99,
+     47, 66, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99
+  },
+  {
+     17, 18, 24, 47, 99, 99, 99, 99,
+     18, 21, 26, 66, 99, 99, 99, 99,
+     24, 26, 56, 99, 99, 99, 99, 99,
+     47, 66, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99
+  }
+};
+static const th_quant_base OC_VP31_BASES_INTER[2]={
+  {
+     16, 16, 16, 20, 24, 28, 32, 40,
+     16, 16, 20, 24, 28, 32, 40, 48,
+     16, 20, 24, 28, 32, 40, 48, 64,
+     20, 24, 28, 32, 40, 48, 64, 64,
+     24, 28, 32, 40, 48, 64, 64, 64,
+     28, 32, 40, 48, 64, 64, 64, 96,
+     32, 40, 48, 64, 64, 64, 96,128,
+     40, 48, 64, 64, 64, 96,128,128
+  },
+  {
+     16, 16, 16, 20, 24, 28, 32, 40,
+     16, 16, 20, 24, 28, 32, 40, 48,
+     16, 20, 24, 28, 32, 40, 48, 64,
+     20, 24, 28, 32, 40, 48, 64, 64,
+     24, 28, 32, 40, 48, 64, 64, 64,
+     28, 32, 40, 48, 64, 64, 64, 96,
+     32, 40, 48, 64, 64, 64, 96,128,
+     40, 48, 64, 64, 64, 96,128,128
+  }
+};
+
+const th_quant_info TH_VP31_QUANT_INFO={
+  {
+    220,200,190,180,170,170,160,160,
+    150,150,140,140,130,130,120,120,
+    110,110,100,100, 90, 90, 90, 80,
+     80, 80, 70, 70, 70, 60, 60, 60,
+     60, 50, 50, 50, 50, 40, 40, 40,
+     40, 40, 30, 30, 30, 30, 30, 30,
+     30, 20, 20, 20, 20, 20, 20, 20,
+     20, 10, 10, 10, 10, 10, 10, 10
+  },
+  {
+    500,450,400,370,340,310,285,265,
+    245,225,210,195,185,180,170,160,
+    150,145,135,130,125,115,110,107,
+    100, 96, 93, 89, 85, 82, 75, 74,
+     70, 68, 64, 60, 57, 56, 52, 50,
+     49, 45, 44, 43, 40, 38, 37, 35,
+     33, 32, 30, 29, 28, 25, 24, 22,
+     21, 19, 18, 17, 15, 13, 12, 10
+  },
+  {
+    30,25,20,20,15,15,14,14,
+    13,13,12,12,11,11,10,10,
+     9, 9, 8, 8, 7, 7, 7, 7,
+     6, 6, 6, 6, 5, 5, 5, 5,
+     4, 4, 4, 4, 3, 3, 3, 3,
+     2, 2, 2, 2, 2, 2, 2, 2,
+     0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0
+  },
+  {
+    {
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTRA_Y},
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTRA_C},
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTRA_C}
+    },
+    {
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTER},
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTER},
+      {1,OC_VP31_RANGE_SIZES,OC_VP31_BASES_INTER}
+    }
+  }
+};
+
+/*The current default quantization parameters.*/
+static const int OC_DEF_QRANGE_SIZES[3]={32,16,15};
+static const th_quant_base OC_DEF_BASES_INTRA_Y[4]={
+  {
+     15, 15, 15, 15, 15, 15, 15, 15,
+     15, 15, 15, 15, 15, 15, 15, 15,
+     15, 15, 15, 15, 15, 15, 15, 15,
+     15, 15, 15, 15, 15, 15, 15, 15,
+     15, 15, 15, 15, 15, 15, 15, 15,
+     15, 15, 15, 15, 15, 15, 15, 15,
+     15, 15, 15, 15, 15, 15, 15, 15,
+     15, 15, 15, 15, 15, 15, 15, 15,
+  },
+  {
+     15, 12, 12, 15, 18, 20, 20, 21,
+     13, 13, 14, 17, 18, 21, 21, 20,
+     14, 14, 15, 18, 20, 21, 21, 21,
+     14, 16, 17, 19, 20, 21, 21, 21,
+     16, 17, 20, 21, 21, 21, 21, 21,
+     18, 19, 20, 21, 21, 21, 21, 21,
+     20, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21
+  },
+  {
+     16, 12, 11, 16, 20, 25, 27, 28,
+     13, 13, 14, 18, 21, 28, 28, 27,
+     14, 13, 16, 20, 25, 28, 28, 28,
+     14, 16, 19, 22, 27, 29, 29, 28,
+     17, 19, 25, 28, 28, 30, 30, 29,
+     20, 24, 27, 28, 29, 30, 30, 29,
+     27, 28, 29, 29, 30, 30, 30, 30,
+     29, 29, 29, 29, 30, 30, 30, 29
+  },
+  {
+     16, 11, 10, 16, 24, 40, 51, 61,
+     12, 12, 14, 19, 26, 58, 60, 55,
+     14, 13, 16, 24, 40, 57, 69, 56,
+     14, 17, 22, 29, 51, 87, 80, 62,
+     18, 22, 37, 58, 68,109,103, 77,
+     24, 35, 55, 64, 81,104,113, 92,
+     49, 64, 78, 87,103,121,120,101,
+     72, 92, 95, 98,112,100,103, 99
+  }
+};
+static const th_quant_base OC_DEF_BASES_INTRA_C[4]={
+  {
+     19, 19, 19, 19, 19, 19, 19, 19,
+     19, 19, 19, 19, 19, 19, 19, 19,
+     19, 19, 19, 19, 19, 19, 19, 19,
+     19, 19, 19, 19, 19, 19, 19, 19,
+     19, 19, 19, 19, 19, 19, 19, 19,
+     19, 19, 19, 19, 19, 19, 19, 19,
+     19, 19, 19, 19, 19, 19, 19, 19,
+     19, 19, 19, 19, 19, 19, 19, 19
+  },
+  {
+     18, 18, 21, 25, 26, 26, 26, 26,
+     18, 20, 22, 26, 26, 26, 26, 26,
+     21, 22, 25, 26, 26, 26, 26, 26,
+     25, 26, 26, 26, 26, 26, 26, 26,
+     26, 26, 26, 26, 26, 26, 26, 26,
+     26, 26, 26, 26, 26, 26, 26, 26,
+     26, 26, 26, 26, 26, 26, 26, 26,
+     26, 26, 26, 26, 26, 26, 26, 26
+  },
+  {
+     17, 18, 22, 31, 36, 36, 36, 36,
+     18, 20, 24, 34, 36, 36, 36, 36,
+     22, 24, 33, 36, 36, 36, 36, 36,
+     31, 34, 36, 36, 36, 36, 36, 36,
+     36, 36, 36, 36, 36, 36, 36, 36,
+     36, 36, 36, 36, 36, 36, 36, 36,
+     36, 36, 36, 36, 36, 36, 36, 36,
+     36, 36, 36, 36, 36, 36, 36, 36
+  },
+  {
+     17, 18, 24, 47, 99, 99, 99, 99,
+     18, 21, 26, 66, 99, 99, 99, 99,
+     24, 26, 56, 99, 99, 99, 99, 99,
+     47, 66, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99,
+     99, 99, 99, 99, 99, 99, 99, 99
+  }
+};
+static const th_quant_base OC_DEF_BASES_INTER[4]={
+  {
+     21, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21,
+     21, 21, 21, 21, 21, 21, 21, 21
+  },
+  {
+     18, 18, 18, 21, 23, 24, 25, 27,
+     18, 18, 21, 23, 24, 25, 27, 28,
+     18, 21, 23, 24, 25, 27, 28, 29,
+     21, 23, 24, 25, 27, 28, 29, 29,
+     23, 24, 25, 27, 28, 29, 29, 29,
+     24, 25, 27, 28, 29, 29, 29, 30,
+     25, 27, 28, 29, 29, 29, 30, 30,
+     27, 28, 29, 29, 29, 30, 30, 30
+  },
+  {
+     17, 17, 17, 20, 23, 26, 28, 32,
+     17, 17, 20, 23, 26, 28, 32, 34,
+     17, 20, 23, 26, 28, 32, 34, 37,
+     20, 23, 26, 28, 32, 34, 37, 37,
+     23, 26, 28, 32, 34, 37, 37, 37,
+     26, 28, 32, 34, 37, 37, 37, 41,
+     28, 32, 34, 37, 37, 37, 41, 42,
+     32, 34, 37, 37, 37, 41, 42, 42
+  },
+  {
+     16, 16, 16, 20, 24, 28, 32, 40,
+     16, 16, 20, 24, 28, 32, 40, 48,
+     16, 20, 24, 28, 32, 40, 48, 64,
+     20, 24, 28, 32, 40, 48, 64, 64,
+     24, 28, 32, 40, 48, 64, 64, 64,
+     28, 32, 40, 48, 64, 64, 64, 96,
+     32, 40, 48, 64, 64, 64, 96,128,
+     40, 48, 64, 64, 64, 96,128,128
+  }
+};
+
+const th_quant_info TH_DEF_QUANT_INFO={
+  {
+    365,348,333,316,300,287,277,265,
+    252,240,229,219,206,197,189,180,
+    171,168,160,153,146,139,132,127,
+    121,115,110,107,101, 97, 94, 89,
+     85, 83, 78, 73, 72, 67, 66, 62,
+     60, 59, 56, 53, 52, 48, 47, 43,
+     42, 40, 36, 35, 34, 33, 31, 30,
+     28, 25, 24, 22, 20, 17, 14, 10
+  },
+  {
+    365,348,333,316,300,287,277,265,
+    252,240,229,219,206,197,189,180,
+    171,168,160,153,146,139,132,127,
+    121,115,110,107,101, 97, 94, 89,
+     85, 83, 78, 73, 72, 67, 66, 62,
+     60, 59, 56, 53, 52, 48, 47, 43,
+     42, 40, 36, 35, 34, 33, 31, 30,
+     28, 25, 24, 22, 20, 17, 14, 10
+  },
+  {
+    30,25,20,20,15,15,14,14,
+    13,13,12,12,11,11,10,10,
+     9, 9, 8, 8, 7, 7, 7, 7,
+     6, 6, 6, 6, 5, 5, 5, 5,
+     4, 4, 4, 4, 3, 3, 3, 3,
+     2, 2, 2, 2, 2, 2, 2, 2,
+     0, 0, 0, 0, 0, 0, 0, 0,
+     0, 0, 0, 0, 0, 0, 0, 0
+  },
+  {
+    {
+      {3,OC_DEF_QRANGE_SIZES,OC_DEF_BASES_INTRA_Y},
+      {3,OC_DEF_QRANGE_SIZES,OC_DEF_BASES_INTRA_C},
+      {3,OC_DEF_QRANGE_SIZES,OC_DEF_BASES_INTRA_C}
+    },
+    {
+      {3,OC_DEF_QRANGE_SIZES,OC_DEF_BASES_INTER},
+      {3,OC_DEF_QRANGE_SIZES,OC_DEF_BASES_INTER},
+      {3,OC_DEF_QRANGE_SIZES,OC_DEF_BASES_INTER}
+    }
+  }
+};
+
+
+
+/*The Huffman codes used for macro block modes.*/
+
+const unsigned char OC_MODE_BITS[2][OC_NMODES]={
+  /*Codebook 0: a maximally skewed prefix code.*/
+  {1,2,3,4,5,6,7,7},
+  /*Codebook 1: a fixed-length code.*/
+  {3,3,3,3,3,3,3,3}
+};
+
+static const unsigned char OC_MODE_CODES[2][OC_NMODES]={
+  /*Codebook 0: a maximally skewed prefix code.*/
+  {0x00,0x02,0x06,0x0E,0x1E,0x3E,0x7E,0x7F},
+  /*Codebook 1: a fixed-length code.*/
+  {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07}
+};
+
+
+/*The Huffman codes used for motion vectors.*/
+
+/*The length in bits of each motion vector component code word, indexed by
+   [scheme][component+31] (see oc_enc_mv_pack()).
+  Each row lists 63 values, laid out so that the entry for a zero component
+   sits alone on the middle line; the 64th element of each row is left to be
+   zero-initialized.*/
+const unsigned char OC_MV_BITS[2][64]={
+  /*Codebook 0: VLC code.*/
+  {
+      8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+    8,7,7,7,7,7,7,7,7,6,6,6,6,4,4,3,
+    3,
+    3,4,4,6,6,6,6,7,7,7,7,7,7,7,7,8,
+    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+  },
+  /*Codebook 1: (5 bit magnitude, 1 bit sign).
+    This wastes a code word (0x01, negative zero), or a bit (0x00, positive
+     zero, requires only 5 bits to uniquely decode), but is hopefully not used
+     very often.*/
+  {
+      6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+    6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+    6,
+    6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+    6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
+  }
+};
+
+/*The bit pattern of each motion vector component code word, indexed by
+   [scheme][component+31] (see oc_enc_mv_pack()).
+  Each entry is written using the number of bits given by the corresponding
+   entry of OC_MV_BITS.
+  As in that table, only 63 values are listed per row, with the entry for a
+   zero component alone on the middle line.*/
+static const unsigned char OC_MV_CODES[2][64]={
+  /*Codebook 0: VLC code.*/
+  {
+         0xFF,0xFD,0xFB,0xF9,0xF7,0xF5,0xF3,
+    0xF1,0xEF,0xED,0xEB,0xE9,0xE7,0xE5,0xE3,
+    0xE1,0x6F,0x6D,0x6B,0x69,0x67,0x65,0x63,
+    0x61,0x2F,0x2D,0x2B,0x29,0x09,0x07,0x02,
+    0x00,
+    0x01,0x06,0x08,0x28,0x2A,0x2C,0x2E,0x60,
+    0x62,0x64,0x66,0x68,0x6A,0x6C,0x6E,0xE0,
+    0xE2,0xE4,0xE6,0xE8,0xEA,0xEC,0xEE,0xF0,
+    0xF2,0xF4,0xF6,0xF8,0xFA,0xFC,0xFE
+  },
+  /*Codebook 1: (5 bit magnitude, 1 bit sign).*/
+  {
+         0x3F,0x3D,0x3B,0x39,0x37,0x35,0x33,
+    0x31,0x2F,0x2D,0x2B,0x29,0x27,0x25,0x23,
+    0x21,0x1F,0x1D,0x1B,0x19,0x17,0x15,0x13,
+    0x11,0x0F,0x0D,0x0B,0x09,0x07,0x05,0x03,
+    0x00,
+    0x02,0x04,0x06,0x08,0x0A,0x0C,0x0E,0x10,
+    0x12,0x14,0x16,0x18,0x1A,0x1C,0x1E,0x20,
+    0x22,0x24,0x26,0x28,0x2A,0x2C,0x2E,0x30,
+    0x32,0x34,0x36,0x38,0x3A,0x3C,0x3E
+  }
+};
+
+
+
+/*Super block run coding scheme:
+   Codeword             Run Length
+   0                       1
+   10x                     2-3
+   110x                    4-5
+   1110xx                  6-9
+   11110xxx                10-17
+   111110xxxx              18-33
+   111111xxxxxxxxxxxx      34-4129*/
+/*The smallest run length covered by each row of the coding scheme, followed
+   by a sentinel (4130) one past the longest codable run.
+  oc_sb_run_pack() compares against entry i+1 to find the row for a run.*/
+const ogg_uint16_t    OC_SB_RUN_VAL_MIN[8]={1,2,4,6,10,18,34,4130};
+/*The code word of each row with all of its offset ("x") bits set to zero.
+  The offset of the run length from the row's minimum is added to this.*/
+static const unsigned OC_SB_RUN_CODE_PREFIX[7]={
+  0,4,0xC,0x38,0xF0,0x3E0,0x3F000
+};
+/*The total length in bits (prefix plus offset bits) of each row's code word.*/
+const unsigned char   OC_SB_RUN_CODE_NBITS[7]={1,3,4,6,8,10,18};
+
+
+/*Emits the code word for one super block run length into an oggpack_buffer.
+  Runs of 4129 or more are split up: every full chunk of 4129 is written as
+   the all-ones 18-bit code word, followed by an explicit flag bit for
+   whatever is coded next (if anything).
+  _opb:       The buffer to write to.
+  _run_count: The length of the run, which must be positive.
+  _flag:      The flag value shared by every entry in this run.
+  _done:      Non-zero if this is the last run that will be encoded.*/
+static void oc_sb_run_pack(oggpack_buffer *_opb,ptrdiff_t _run_count,
+ int _flag,int _done){
+  int row;
+  while(_run_count>=4129){
+    /*The longest code word: stands for a run of exactly 4129.*/
+    oggpackB_write(_opb,0x3FFFF,18);
+    _run_count-=4129;
+    if(_run_count>0){
+      /*The same run continues, so the flag that follows is unchanged.*/
+      oggpackB_write(_opb,_flag,1);
+    }
+    else{
+      /*The run ended exactly on a chunk boundary.
+        If another run follows, it uses the opposite flag value.*/
+      if(!_done)oggpackB_write(_opb,!_flag,1);
+      return;
+    }
+  }
+  /*Locate the row of the coding scheme that contains what is left.*/
+  row=0;
+  while(_run_count>=OC_SB_RUN_VAL_MIN[row+1])row++;
+  oggpackB_write(_opb,
+   OC_SB_RUN_CODE_PREFIX[row]+_run_count-OC_SB_RUN_VAL_MIN[row],
+   OC_SB_RUN_CODE_NBITS[row]);
+}
+
+
+
+/*Block run coding scheme:
+   Codeword             Run Length
+   0x                      1-2
+   10x                     3-4
+   110x                    5-6
+   1110xx                  7-10
+   11110xx                 11-14
+   11111xxxx               15-30*/
+/*The length in bits of the code word for each block run length.
+  Entry i describes a run of length i+1 (see oc_block_run_pack()).*/
+const unsigned char OC_BLOCK_RUN_CODE_NBITS[30]={
+  2,2,3,3,4,4,6,6,6,6,7,7,7,7,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9
+};
+/*The complete code word (prefix and offset bits) for each block run length.
+  Entry i describes a run of length i+1.*/
+static const ogg_uint16_t  OC_BLOCK_RUN_CODE_PATTERN[30]={
+        0x000,0x001,0x004,0x005,0x00C,0x00D,0x038,
+  0x039,0x03A,0x03B,0x078,0x079,0x07A,0x07B,0x1F0,
+  0x1F1,0x1F2,0x1F3,0x1F4,0x1F5,0x1F6,0x1F7,0x1F8,
+  0x1F9,0x1FA,0x1FB,0x1FC,0x1FD,0x1FE,0x1FF
+};
+
+
+/*Emits the code word for one block run length into an oggpack_buffer.
+  _opb:       The buffer to write to.
+  _run_count: The length of the run.
+              This must be in the range 1...30; no check is made here.*/
+static void oc_block_run_pack(oggpack_buffer *_opb,int _run_count){
+  int idx;
+  /*The tables are zero-based, while run lengths start at one.*/
+  idx=_run_count-1;
+  oggpackB_write(_opb,OC_BLOCK_RUN_CODE_PATTERN[idx],
+   OC_BLOCK_RUN_CODE_NBITS[idx]);
+}
+
+
+
+/*Writes the frame header: the packet type bit, the frame type, the list of
+   quantizer indices (qis) in use, and, for key frames only, three reserved
+   zero bits.*/
+static void oc_enc_frame_header_pack(oc_enc_ctx *_enc){
+  oggpack_buffer *opb;
+  int             qii;
+  opb=&_enc->opb;
+  /*A leading zero bit marks this as a data packet.*/
+  oggpackB_write(opb,0,1);
+  /*Then comes the frame type (key frame or delta frame).*/
+  oggpackB_write(opb,_enc->state.frame_type,1);
+  /*The qi list: 6 bits per entry, with a "more follows" bit after each of the
+     first two entries.
+    No such bit is written after a third entry.*/
+  oggpackB_write(opb,_enc->state.qis[0],6);
+  for(qii=1;qii<3;qii++){
+    int more;
+    more=_enc->state.nqis>qii;
+    oggpackB_write(opb,more,1);
+    if(!more)break;
+    oggpackB_write(opb,_enc->state.qis[qii],6);
+  }
+  if(_enc->state.frame_type==OC_INTRA_FRAME){
+    /*Key frames carry 3 unused configuration bits inherited from VP3.
+      Most other unused VP3 header bits were removed; these were kept as
+       room for future expansion, although a single spare bit in every frame
+       would have been far more useful.*/
+    oggpackB_write(opb,0,3);
+  }
+}
+
+/*Writes the run-length coded "partially coded" flag of every super block.
+  The value of the first flag is written explicitly; after that, each run
+   implies the opposite value of the run before it.
+  Return: The number of partially coded super blocks.*/
+static unsigned oc_enc_partial_sb_flags_pack(oc_enc_ctx *_enc){
+  const oc_sb_flags *sb_flags;
+  unsigned           nsbs;
+  unsigned           sbi;
+  unsigned           npartial;
+  int                flag;
+  sb_flags=_enc->state.sb_flags;
+  nsbs=_enc->state.nsbs;
+  flag=sb_flags[0].coded_partially;
+  oggpackB_write(&_enc->opb,flag,1);
+  npartial=0;
+  sbi=0;
+  do{
+    unsigned run_start;
+    unsigned run_count;
+    /*Advance to the first super block whose flag differs from this run's.*/
+    run_start=sbi;
+    while(sbi<nsbs&&sb_flags[sbi].coded_partially==flag)sbi++;
+    run_count=sbi-run_start;
+    /*Each entry of the run contributes the flag value to the total.*/
+    npartial+=run_count*flag;
+    oc_sb_run_pack(&_enc->opb,run_count,flag,sbi>=nsbs);
+    flag=!flag;
+  }
+  while(sbi<nsbs);
+  return npartial;
+}
+
+/*Writes the run-length coded "fully coded" flag of every super block that is
+   not partially coded.
+  The value of the first flag is written explicitly; after that, each run
+   implies the opposite value of the run before it.
+  The initial search below has no bounds check, so at least one super block
+   must not be partially coded; oc_enc_coded_flags_pack() only calls this
+   function when that is the case.*/
+static void oc_enc_coded_sb_flags_pack(oc_enc_ctx *_enc){
+  const oc_sb_flags *sb_flags;
+  unsigned           nsbs;
+  unsigned           sbi;
+  int                flag;
+  sb_flags=_enc->state.sb_flags;
+  nsbs=_enc->state.nsbs;
+  /*Partially coded super blocks were already described by the previous flag
+     list, so begin at the first one that is not.*/
+  sbi=0;
+  while(sb_flags[sbi].coded_partially)sbi++;
+  flag=sb_flags[sbi].coded_fully;
+  oggpackB_write(&_enc->opb,flag,1);
+  do{
+    unsigned run_count;
+    run_count=0;
+    while(sbi<nsbs){
+      /*Partially coded super blocks neither extend nor end a run.*/
+      if(!sb_flags[sbi].coded_partially){
+        if(sb_flags[sbi].coded_fully!=flag)break;
+        run_count++;
+      }
+      sbi++;
+    }
+    oc_sb_run_pack(&_enc->opb,run_count,flag,sbi>=nsbs);
+    flag=!flag;
+  }
+  while(sbi<nsbs);
+}
+
+/*Writes all of the coded block flags for a frame.
+  First come the partially coded and (when needed) fully coded super block
+   flag lists.
+  Then, if any super block is partially coded, the coded flags of the
+   individual blocks inside those super blocks are written as block runs.*/
+static void oc_enc_coded_flags_pack(oc_enc_ctx *_enc){
+  const oc_sb_map   *sb_maps;
+  const oc_sb_flags *sb_flags;
+  const oc_fragment *frags;
+  unsigned           nsbs;
+  unsigned           npartial;
+  unsigned           sbi;
+  int                run_count;
+  int                flag;
+  int                pli;
+  npartial=oc_enc_partial_sb_flags_pack(_enc);
+  /*The second list is empty if every super block is partially coded.*/
+  if(npartial<_enc->state.nsbs)oc_enc_coded_sb_flags_pack(_enc);
+  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
+  sb_flags=_enc->state.sb_flags;
+  nsbs=_enc->state.nsbs;
+  frags=_enc->state.frags;
+  /*Find the first partially coded super block, if there is one.*/
+  sbi=0;
+  while(sbi<nsbs&&!sb_flags[sbi].coded_partially)sbi++;
+  /*Without any partial super blocks there are no block flags to store.*/
+  if(sbi>=nsbs)return;
+  /*The first block flag is written explicitly.*/
+  flag=frags[sb_maps[sbi][0][0]].coded;
+  oggpackB_write(&_enc->opb,flag,1);
+  run_count=0;
+  /*Walk the super blocks one plane at a time, with nsbs tracking the index
+     one past the last super block of the current plane.*/
+  nsbs=sbi=0;
+  for(pli=0;pli<3;pli++){
+    nsbs+=_enc->state.fplanes[pli].nsbs;
+    for(;sbi<nsbs;sbi++){
+      int bi;
+      if(!sb_flags[sbi].coded_partially)continue;
+      /*Visit the 16 map entries as 4 quadrants of 4 blocks each.*/
+      for(bi=0;bi<16;bi++){
+        ptrdiff_t fragi;
+        fragi=sb_maps[sbi][bi>>2][bi&3];
+        /*Negative entries do not refer to a fragment.*/
+        if(fragi<0)continue;
+        if(frags[fragi].coded==flag)run_count++;
+        else{
+          /*The flag changed: emit the finished run and start a new one.*/
+          oc_block_run_pack(&_enc->opb,run_count);
+          flag=!flag;
+          run_count=1;
+        }
+      }
+    }
+  }
+  /*Flush any trailing block coded run.*/
+  if(run_count>0)oc_block_run_pack(&_enc->opb,run_count);
+}
+
+/*Writes the macro block mode coding scheme, followed by the coding mode of
+   every coded macro block.*/
+static void oc_enc_mb_modes_pack(oc_enc_ctx *_enc){
+  const unsigned char *ranks;
+  const unsigned char *code_lens;
+  const unsigned char *codes;
+  const signed char   *mb_modes;
+  unsigned            *coded_mbis;
+  size_t               ncoded_mbis;
+  unsigned             mbii;
+  int                  scheme;
+  int                  book;
+  int                  ri;
+  /*The chooser keeps the best scheme at the head of its list.*/
+  scheme=_enc->chooser.scheme_list[0];
+  oggpackB_write(&_enc->opb,scheme,3);
+  /*Scheme 0 is the only one that also transmits the mode frequency
+     ordering.*/
+  if(scheme==0){
+    for(ri=0;ri<OC_NMODES;ri++){
+      oggpackB_write(&_enc->opb,_enc->chooser.scheme0_ranks[ri],3);
+    }
+  }
+  /*The second codebook is used only when scheme+1 reaches 8.*/
+  book=scheme+1>>3;
+  code_lens=OC_MODE_BITS[book];
+  codes=OC_MODE_CODES[book];
+  ranks=_enc->chooser.mode_ranks[scheme];
+  mb_modes=_enc->state.mb_modes;
+  coded_mbis=_enc->coded_mbis;
+  ncoded_mbis=_enc->ncoded_mbis;
+  for(mbii=0;mbii<ncoded_mbis;mbii++){
+    int rank;
+    /*Translate the macro block's mode into its rank under this scheme.*/
+    rank=ranks[mb_modes[coded_mbis[mbii]]];
+    oggpackB_write(&_enc->opb,codes[rank],code_lens[rank]);
+  }
+}
+
+/*Writes one motion vector as two code words: first X, then Y.
+  _mv_scheme: The index of the codebook to use (0 or 1).
+  _dx:        The X component.
+  _dy:        The Y component.
+  Each component is offset by 31 to form a table index.*/
+static void oc_enc_mv_pack(oc_enc_ctx *_enc,int _mv_scheme,int _dx,int _dy){
+  const unsigned char *codes;
+  const unsigned char *code_lens;
+  codes=OC_MV_CODES[_mv_scheme];
+  code_lens=OC_MV_BITS[_mv_scheme];
+  oggpackB_write(&_enc->opb,codes[_dx+31],code_lens[_dx+31]);
+  oggpackB_write(&_enc->opb,codes[_dy+31],code_lens[_dy+31]);
+}
+
+/*Writes the motion vector coding scheme, followed by the motion vectors of
+   every coded macro block whose mode carries explicit vectors.*/
+static void oc_enc_mvs_pack(oc_enc_ctx *_enc){
+  const oc_mb_map    *mb_maps;
+  const oc_fragment  *frags;
+  const oc_mv        *frag_mvs;
+  const signed char  *mb_modes;
+  const unsigned     *coded_mbis;
+  size_t              ncoded_mbis;
+  unsigned            mbii;
+  int                 mv_scheme;
+  /*Use the second codebook only if it is strictly cheaper.*/
+  mv_scheme=_enc->mv_bits[1]<_enc->mv_bits[0];
+  oggpackB_write(&_enc->opb,mv_scheme,1);
+  mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
+  mb_modes=_enc->state.mb_modes;
+  frags=_enc->state.frags;
+  frag_mvs=(const oc_mv *)_enc->state.frag_mvs;
+  coded_mbis=_enc->coded_mbis;
+  ncoded_mbis=_enc->ncoded_mbis;
+  /*Macro blocks are visited in Hilbert scan order, but the MVs inside a
+     macro block are coded in raster order.*/
+  for(mbii=0;mbii<ncoded_mbis;mbii++){
+    ptrdiff_t fragi;
+    unsigned  mbi;
+    int       mb_mode;
+    int       bi;
+    mbi=coded_mbis[mbii];
+    mb_mode=mb_modes[mbi];
+    if(mb_mode==OC_MODE_INTER_MV||mb_mode==OC_MODE_GOLDEN_MV){
+      /*A single MV is coded for the whole macro block, taken from the first
+         coded block of plane 0.
+        The search is unbounded, so such a block must exist.*/
+      bi=0;
+      while(!frags[mb_maps[mbi][0][bi]].coded)bi++;
+      fragi=mb_maps[mbi][0][bi];
+      oc_enc_mv_pack(_enc,mv_scheme,frag_mvs[fragi][0],frag_mvs[fragi][1]);
+    }
+    else if(mb_mode==OC_MODE_INTER_MV_FOUR){
+      /*One MV is coded for each coded block of plane 0.*/
+      for(bi=0;bi<4;bi++){
+        fragi=mb_maps[mbi][0][bi];
+        if(!frags[fragi].coded)continue;
+        oc_enc_mv_pack(_enc,mv_scheme,frag_mvs[fragi][0],frag_mvs[fragi][1]);
+      }
+    }
+  }
+}
+
+/*Writes the quantizer index selection (qii) of every coded block.
+  Nothing is written if the frame uses a single qi or has no coded blocks.
+  A first pass run-length codes whether or not each block's qii is non-zero.
+  If three qis are in use and at least one block has a non-zero qii, a second
+   pass over just those blocks run-length codes qii-1.*/
+static void oc_enc_block_qis_pack(oc_enc_ctx *_enc){
+  const oc_fragment *frags;
+  ptrdiff_t         *coded_fragis;
+  ptrdiff_t          ncoded_fragis;
+  ptrdiff_t          fragii;
+  ptrdiff_t          run_count;
+  ptrdiff_t          nqi0;
+  int                flag;
+  if(_enc->state.nqis<=1)return;
+  ncoded_fragis=_enc->state.ntotal_coded_fragis;
+  if(ncoded_fragis<=0)return;
+  coded_fragis=_enc->state.coded_fragis;
+  frags=_enc->state.frags;
+  /*First pass: zero versus non-zero.*/
+  flag=frags[coded_fragis[0]].qii!=0;
+  oggpackB_write(&_enc->opb,flag,1);
+  nqi0=0;
+  fragii=0;
+  while(fragii<ncoded_fragis){
+    ptrdiff_t run_start;
+    run_start=fragii;
+    while(fragii<ncoded_fragis
+     &&(frags[coded_fragis[fragii]].qii!=0)==flag){
+      fragii++;
+    }
+    run_count=fragii-run_start;
+    /*Keep a tally of the blocks that use the first qi.*/
+    if(!flag)nqi0+=run_count;
+    oc_sb_run_pack(&_enc->opb,run_count,flag,fragii>=ncoded_fragis);
+    flag=!flag;
+  }
+  /*A second pass is needed only to tell the second qi from the third.*/
+  if(_enc->state.nqis<3||nqi0>=ncoded_fragis)return;
+  /*Start at the first block with a non-zero qii; one is known to exist.*/
+  fragii=0;
+  while(!frags[coded_fragis[fragii]].qii)fragii++;
+  flag=frags[coded_fragis[fragii]].qii-1;
+  oggpackB_write(&_enc->opb,flag,1);
+  while(fragii<ncoded_fragis){
+    run_count=0;
+    while(fragii<ncoded_fragis){
+      int qii;
+      qii=frags[coded_fragis[fragii]].qii;
+      /*Blocks that use the first qi take no part in this pass.*/
+      if(qii){
+        if(qii-1!=flag)break;
+        run_count++;
+      }
+      fragii++;
+    }
+    oc_sb_run_pack(&_enc->opb,run_count,flag,fragii>=ncoded_fragis);
+    flag=!flag;
+  }
+}
+
+/*Tallies how often each token value occurs within a range of zig-zag
+   indices.
+  Both output arrays are cleared before counting starts.
+  _zzi_start:      The first zig-zag index to include.
+  _zzi_end:        The first zig-zag index to not include.
+  _token_counts_y: Returns the token counts for plane 0 (Y').
+  _token_counts_c: Returns the combined token counts for planes 1 and 2 (Cb
+                    and Cr).*/
+static void oc_enc_count_tokens(oc_enc_ctx *_enc,int _zzi_start,int _zzi_end,
+ ptrdiff_t _token_counts_y[32],ptrdiff_t _token_counts_c[32]){
+  int pli;
+  memset(_token_counts_y,0,32*sizeof(*_token_counts_y));
+  memset(_token_counts_c,0,32*sizeof(*_token_counts_c));
+  for(pli=0;pli<3;pli++){
+    ptrdiff_t *counts;
+    int        zzi;
+    /*Both chroma planes accumulate into the same set of counters.*/
+    counts=pli?_token_counts_c:_token_counts_y;
+    for(zzi=_zzi_start;zzi<_zzi_end;zzi++){
+      const unsigned char *tokens;
+      ptrdiff_t            ntokens;
+      ptrdiff_t            ti;
+      tokens=_enc->dct_tokens[pli][zzi];
+      ntokens=_enc->ndct_tokens[pli][zzi];
+      /*Entries of the list before dct_token_offs are not counted.*/
+      for(ti=_enc->dct_token_offs[pli][zzi];ti<ntokens;ti++){
+        counts[tokens[ti]]++;
+      }
+    }
+  }
+}
+
+/*Adds up the cost in bits of coding a set of tokens with each of the 16
+   candidate Huffman tables of one Huffman group.
+  The costs are added on top of whatever _bit_counts already contains.
+  _hgi:          The Huffman group; its tables start at index _hgi<<4.
+  _token_counts: The number of times each token occurs.
+  _bit_counts:   The running bit total of each candidate table.*/
+static void oc_enc_count_bits(oc_enc_ctx *_enc,int _hgi,
+ const ptrdiff_t _token_counts[32],size_t _bit_counts[16]){
+  int first;
+  int huffi;
+  first=_hgi<<4;
+  for(huffi=0;huffi<16;huffi++){
+    const th_huff_code *codes;
+    size_t              nbits;
+    int                 token;
+    codes=_enc->huff_codes[first+huffi];
+    nbits=_bit_counts[huffi];
+    for(token=0;token<32;token++){
+      nbits+=_token_counts[token]*codes[token].nbits;
+    }
+    _bit_counts[huffi]=nbits;
+  }
+}
+
+/*Returns the index of the smallest of the 16 bit counts.
+  When several entries tie for the minimum, the lowest index wins.*/
+static int oc_select_huff_idx(size_t _bit_counts[16]){
+  size_t best_bits;
+  int    best_huffi;
+  int    huffi;
+  best_huffi=0;
+  best_bits=_bit_counts[0];
+  for(huffi=1;huffi<16;huffi++){
+    if(_bit_counts[huffi]<best_bits){
+      best_bits=_bit_counts[huffi];
+      best_huffi=huffi;
+    }
+  }
+  return best_huffi;
+}
+
+/*Writes the tokens, along with any extra bits they carry, for a range of
+   zig-zag indices in all three planes.
+  _zzi_start: The first zig-zag index to include.
+  _zzi_end:   The first zig-zag index to not include.
+  _huff_idxs: The Huffman table to use for plane 0 (entry 0) and for planes 1
+               and 2 (entry 1).*/
+static void oc_enc_huff_group_pack(oc_enc_ctx *_enc,
+ int _zzi_start,int _zzi_end,const int _huff_idxs[2]){
+  int zzi;
+  int pli;
+  for(zzi=_zzi_start;zzi<_zzi_end;zzi++)for(pli=0;pli<3;pli++){
+    const th_huff_code  *codes;
+    const unsigned char *tokens;
+    const ogg_uint16_t  *eb;
+    ptrdiff_t            ntokens;
+    ptrdiff_t            ti;
+    /*pli+1>>1 maps planes 0, 1 and 2 to entries 0, 1 and 1.*/
+    codes=_enc->huff_codes[_huff_idxs[pli+1>>1]];
+    tokens=_enc->dct_tokens[pli][zzi];
+    eb=_enc->extra_bits[pli][zzi];
+    ntokens=_enc->ndct_tokens[pli][zzi];
+    for(ti=_enc->dct_token_offs[pli][zzi];ti<ntokens;ti++){
+      const th_huff_code *code;
+      int                 neb;
+      code=codes+tokens[ti];
+      oggpackB_write(&_enc->opb,code->pattern,code->nbits);
+      /*Only some tokens are followed by extra bits.*/
+      neb=OC_DCT_TOKEN_EXTRA_BITS[tokens[ti]];
+      if(neb)oggpackB_write(&_enc->opb,eb[ti],neb);
+    }
+  }
+}
+
+/*Chooses the cheapest Huffman tables for the DC and AC token lists and
+   writes the table indices and tokens to the packet.
+  One luma/chroma pair of table indices is chosen for the DC tokens and
+   another for all of the AC tokens; both pairs are also recorded in
+   _enc->huff_idxs for the current frame type.*/
+static void oc_enc_residual_tokens_pack(oc_enc_ctx *_enc){
+  /*Huffman group hgi covers the zig-zag indices from OC_HUFF_GROUP_MIN[hgi]
+     up to, but not including, OC_HUFF_GROUP_MIN[hgi+1].*/
+  static const unsigned char OC_HUFF_GROUP_MIN[6]={0,1,6,15,28,64};
+  ptrdiff_t token_counts_y[32];
+  ptrdiff_t token_counts_c[32];
+  size_t    bits_y[16];
+  size_t    bits_c[16];
+  int       dc_idxs[2];
+  int       ac_idxs[2];
+  int       group_idxs[2];
+  int       frame_type;
+  int       hgi;
+  frame_type=_enc->state.frame_type;
+  /*The DC tokens: cost them with every table of group 0.*/
+  memset(bits_y,0,sizeof(bits_y));
+  memset(bits_c,0,sizeof(bits_c));
+  oc_enc_count_tokens(_enc,0,1,token_counts_y,token_counts_c);
+  oc_enc_count_bits(_enc,0,token_counts_y,bits_y);
+  oc_enc_count_bits(_enc,0,token_counts_c,bits_c);
+  dc_idxs[0]=oc_select_huff_idx(bits_y);
+  dc_idxs[1]=oc_select_huff_idx(bits_c);
+  /*Write the chosen DC tables, followed by the DC token list.*/
+  oggpackB_write(&_enc->opb,dc_idxs[0],4);
+  oggpackB_write(&_enc->opb,dc_idxs[1],4);
+  _enc->huff_idxs[frame_type][0][0]=(unsigned char)dc_idxs[0];
+  _enc->huff_idxs[frame_type][0][1]=(unsigned char)dc_idxs[1];
+  oc_enc_huff_group_pack(_enc,0,1,dc_idxs);
+  /*The AC tokens: a single index is shared by groups 1 through 4, so sum the
+     cost over all four groups before choosing.*/
+  memset(bits_y,0,sizeof(bits_y));
+  memset(bits_c,0,sizeof(bits_c));
+  for(hgi=1;hgi<5;hgi++){
+    oc_enc_count_tokens(_enc,OC_HUFF_GROUP_MIN[hgi],OC_HUFF_GROUP_MIN[hgi+1],
+     token_counts_y,token_counts_c);
+    oc_enc_count_bits(_enc,hgi,token_counts_y,bits_y);
+    oc_enc_count_bits(_enc,hgi,token_counts_c,bits_c);
+  }
+  ac_idxs[0]=oc_select_huff_idx(bits_y);
+  ac_idxs[1]=oc_select_huff_idx(bits_c);
+  /*Write the chosen AC tables, followed by the AC token lists.*/
+  oggpackB_write(&_enc->opb,ac_idxs[0],4);
+  oggpackB_write(&_enc->opb,ac_idxs[1],4);
+  _enc->huff_idxs[frame_type][1][0]=(unsigned char)ac_idxs[0];
+  _enc->huff_idxs[frame_type][1][1]=(unsigned char)ac_idxs[1];
+  for(hgi=1;hgi<5;hgi++){
+    /*Each group's 16 tables follow those of the group before it.*/
+    group_idxs[0]=ac_idxs[0]+(hgi<<4);
+    group_idxs[1]=ac_idxs[1]+(hgi<<4);
+    oc_enc_huff_group_pack(_enc,
+     OC_HUFF_GROUP_MIN[hgi],OC_HUFF_GROUP_MIN[hgi+1],group_idxs);
+  }
+}
+
+/*Assembles the complete packet for the current frame and marks it as ready
+   to be flushed.
+  A frame with no coded blocks produces an empty (0 byte) packet, which is
+   enough to drop the frame.*/
+static void oc_enc_frame_pack(oc_enc_ctx *_enc){
+  oggpackB_reset(&_enc->opb);
+  /*There is nothing to write unless at least one block is coded.*/
+  if(_enc->state.ntotal_coded_fragis>0){
+    int is_delta;
+    oc_enc_frame_header_pack(_enc);
+    is_delta=_enc->state.frame_type==OC_INTER_FRAME;
+    if(is_delta){
+      /*Only delta frames need coded block flags, MB modes, and MVs.*/
+      oc_enc_coded_flags_pack(_enc);
+      oc_enc_mb_modes_pack(_enc);
+      oc_enc_mvs_pack(_enc);
+    }
+    oc_enc_block_qis_pack(_enc);
+    /*The token lists must be finished before they can be packed.*/
+    oc_enc_tokenize_finish(_enc);
+    oc_enc_residual_tokens_pack(_enc);
+  }
+  /*The packet is now complete, even if it is empty.*/
+  _enc->packet_state=OC_PACKET_READY;
+#if defined(OC_COLLECT_METRICS)
+  oc_enc_mode_metrics_collect(_enc);
+#endif
+}
+
+
+/*Fills in the encoder's table of optimizable functions with the portable C
+   implementations.
+  oc_enc_init() calls this when OC_X86_ASM is not defined.*/
+void oc_enc_vtable_init_c(oc_enc_ctx *_enc){
+  /*The implementations prefixed with oc_enc_ are encoder-specific.
+    The rest we re-use from the decoder.*/
+  _enc->opt_vtable.frag_sad=oc_enc_frag_sad_c;
+  _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_c;
+  _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_c;
+  _enc->opt_vtable.frag_satd_thresh=oc_enc_frag_satd_thresh_c;
+  _enc->opt_vtable.frag_satd2_thresh=oc_enc_frag_satd2_thresh_c;
+  _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_c;
+  _enc->opt_vtable.frag_sub=oc_enc_frag_sub_c;
+  _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_c;
+  _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_c;
+  /*These two have no oc_enc_ prefix: they are the shared implementations.*/
+  _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
+  _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
+  _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_c;
+}
+
+/*Initialize the macro block neighbor lists for MC analysis.
+  For every valid macro block this records the indices of the neighboring
+   macro blocks to consult in the current frame (cneighbors) and in the
+   previous frame (pneighbors).
+  Neighbors that lie outside the frame or are marked OC_MODE_INVALID are
+   left out.
+  This assumes that the entire mb_info memory region has been initialized with
+   zeros; in particular, the neighbor counts are incremented, never reset.*/
+static void oc_enc_mb_info_init(oc_enc_ctx *_enc){
+  oc_mb_enc_info    *embs;
+  const signed char *mb_modes;
+  unsigned           nhsbs;
+  unsigned           nvsbs;
+  unsigned           nhmbs;
+  unsigned           nvmbs;
+  unsigned           sby;
+  mb_modes=_enc->state.mb_modes;
+  embs=_enc->mb_info;
+  /*The super block dimensions are those of plane 0.*/
+  nhsbs=_enc->state.fplanes[0].nhsbs;
+  nvsbs=_enc->state.fplanes[0].nvsbs;
+  nhmbs=_enc->state.nhmbs;
+  nvmbs=_enc->state.nvmbs;
+  for(sby=0;sby<nvsbs;sby++){
+    unsigned sbx;
+    for(sbx=0;sbx<nhsbs;sbx++){
+      int quadi;
+      for(quadi=0;quadi<4;quadi++){
+        /*Because of the Hilbert curve ordering the macro blocks are
+           visited in, the available neighbors change depending on where in
+           a super block the macro block is located.
+          Only the first three vectors are used in the median calculation
+           for the optimal predictor, and so the most important should be
+           listed first.
+          Additional vectors are used, so there will always be at least 3,
+           except for in the upper-left most macro block.*/
+        /*The number of current neighbors for each macro block position.*/
+        static const unsigned char NCNEIGHBORS[4]={4,3,2,4};
+        /*The offset of each current neighbor in the X direction.
+          Rows with fewer than 4 neighbors are padded with implicit zeros,
+           which the NCNEIGHBORS bound keeps from being read.*/
+        static const signed char   CDX[4][4]={
+          {-1,0,1,-1},
+          {-1,0,-1,},
+          {-1,-1},
+          {-1,0,0,1}
+        };
+        /*The offset of each current neighbor in the Y direction.*/
+        static const signed char   CDY[4][4]={
+          {0,-1,-1,-1},
+          {0,-1,-1},
+          {0,-1},
+          {0,-1,1,-1}
+        };
+        /*The offset of each previous neighbor in the X direction.*/
+        static const signed char   PDX[4]={-1,0,1,0};
+        /*The offset of each previous neighbor in the Y direction.*/
+        static const signed char   PDY[4]={0,-1,0,1};
+        unsigned mbi;
+        int      mbx;
+        int      mby;
+        unsigned nmbi;
+        int      nmbx;
+        int      nmby;
+        int      ni;
+        /*Four macro blocks per super block: + binds tighter than <<, so this
+           is ((sby*nhsbs+sbx)<<2)+quadi.*/
+        mbi=(sby*nhsbs+sbx<<2)+quadi;
+        if(mb_modes[mbi]==OC_MODE_INVALID)continue;
+        /*Quadrants 0,1,2,3 sit at (x,y) offsets (0,0),(0,1),(1,1),(1,0)
+           within the super block.*/
+        mbx=2*sbx+(quadi>>1);
+        mby=2*sby+(quadi+1>>1&1);
+        /*Fill in the neighbors with current motion vectors available.*/
+        for(ni=0;ni<NCNEIGHBORS[quadi];ni++){
+          nmbx=mbx+CDX[quadi][ni];
+          nmby=mby+CDY[quadi][ni];
+          /*The negative tests come first, so the comparisons against the
+             unsigned limits only ever see non-negative values.*/
+          if(nmbx<0||nmbx>=nhmbs||nmby<0||nmby>=nvmbs)continue;
+          /*Convert the neighbor's (x,y) position back into a macro block
+             index: locate its super block, then its quadrant within it.*/
+          nmbi=(nmby&~1)*nhmbs+((nmbx&~1)<<1)+OC_MB_MAP[nmby&1][nmbx&1];
+          if(mb_modes[nmbi]==OC_MODE_INVALID)continue;
+          embs[mbi].cneighbors[embs[mbi].ncneighbors++]=nmbi;
+        }
+        /*Fill in the neighbors with previous motion vectors available.*/
+        for(ni=0;ni<4;ni++){
+          nmbx=mbx+PDX[ni];
+          nmby=mby+PDY[ni];
+          if(nmbx<0||nmbx>=nhmbs||nmby<0||nmby>=nvmbs)continue;
+          nmbi=(nmby&~1)*nhmbs+((nmbx&~1)<<1)+OC_MB_MAP[nmby&1][nmbx&1];
+          if(mb_modes[nmbi]==OC_MODE_INVALID)continue;
+          embs[mbi].pneighbors[embs[mbi].npneighbors++]=nmbi;
+        }
+      }
+    }
+  }
+}
+
+/*Sets the Huffman codes to use.
+  The codes are validated by packing them into the encoder's bit buffer (which
+   is reset first); they are copied into the encoder only if that succeeds.
+  _codes: The Huffman codes.
+          This can be NULL, in which case TH_VP31_HUFF_CODES is used.
+  Return: 0 on success, TH_EFAULT if _enc is NULL, TH_EINVAL if the packet
+           state is already past OC_PACKET_SETUP_HDR, or the negative value
+           returned by oc_huff_codes_pack().*/
+static int oc_enc_set_huffman_codes(oc_enc_ctx *_enc,
+ const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]){
+  const th_huff_code (*codes)[TH_NDCT_TOKENS];
+  int                  ret;
+  if(_enc==NULL)return TH_EFAULT;
+  if(_enc->packet_state>OC_PACKET_SETUP_HDR)return TH_EINVAL;
+  codes=_codes!=NULL?_codes:TH_VP31_HUFF_CODES;
+  /*Packing the codes doubles as validation.*/
+  oggpackB_reset(&_enc->opb);
+  ret=oc_huff_codes_pack(&_enc->opb,codes);
+  if(ret>=0){
+    memcpy(_enc->huff_codes,codes,sizeof(_enc->huff_codes));
+    ret=0;
+  }
+  return ret;
+}
+
+/*Installs a set of quantization parameters and rebuilds everything derived
+   from them: the dequantization and quantization tables, the loop filter
+   limits, and the log_qavg table.
+  This may only be called before the setup header is written.
+  If it is called multiple times, only the last call has any effect.
+  _qinfo: The quantization parameters.
+          These are described in more detail in theoraenc.h.
+          This can be NULL, in which case the default quantization parameters
+           will be used.
+  Return: 0 on success, TH_EFAULT if _enc is NULL, or TH_EINVAL if the packet
+           state is already past OC_PACKET_SETUP_HDR.*/
+static int oc_enc_set_quant_params(oc_enc_ctx *_enc,
+ const th_quant_info *_qinfo){
+  const th_quant_info *qinfo;
+  int                  qi;
+  int                  pli;
+  int                  qti;
+  if(_enc==NULL)return TH_EFAULT;
+  if(_enc->packet_state>OC_PACKET_SETUP_HDR)return TH_EINVAL;
+  qinfo=_qinfo!=NULL?_qinfo:&TH_DEF_QUANT_INFO;
+  /*TODO: Analyze for packing purposes instead of just doing a shallow copy.*/
+  memcpy(&_enc->qinfo,qinfo,sizeof(_enc->qinfo));
+  /*Point every table slot at its own backing storage before filling it.*/
+  for(qi=0;qi<64;qi++){
+    for(pli=0;pli<3;pli++){
+      for(qti=0;qti<2;qti++){
+        _enc->state.dequant_tables[qi][pli][qti]=
+         _enc->state.dequant_table_data[qi][pli][qti];
+        _enc->enquant_tables[qi][pli][qti]=
+         _enc->enquant_table_data[qi][pli][qti];
+      }
+    }
+  }
+  oc_enquant_tables_init(_enc->state.dequant_tables,
+   _enc->enquant_tables,qinfo);
+  memcpy(_enc->state.loop_filter_limits,qinfo->loop_filter_limits,
+   sizeof(_enc->state.loop_filter_limits));
+  oc_enquant_qavg_init(_enc->log_qavg,_enc->state.dequant_tables,
+   _enc->state.info.pixel_fmt);
+  return 0;
+}
+
+static void oc_enc_clear(oc_enc_ctx *_enc);
+
+/*Initializes an encoder context from the requested stream parameters.
+  The parameters are sanitized on a private copy first; _info itself is not
+   modified.
+  _enc:  The (uninitialized) encoder context to set up.
+  _info: The requested stream parameters.
+  Return: 0 on success, the negative value returned by oc_state_init() if the
+           shared state could not be initialized, or TH_EFAULT if a buffer
+           could not be allocated (in which case everything allocated here
+           has already been released with oc_enc_clear()).*/
+static int oc_enc_init(oc_enc_ctx *_enc,const th_info *_info){
+  th_info   info;
+  size_t    mcu_nmbs;
+  ptrdiff_t mcu_nfrags;
+  int       hdec;
+  int       vdec;
+  int       ret;
+  int       pli;
+  /*Clean up the requested settings.*/
+  memcpy(&info,_info,sizeof(info));
+  info.version_major=TH_VERSION_MAJOR;
+  info.version_minor=TH_VERSION_MINOR;
+  info.version_subminor=TH_VERSION_SUB;
+  if(info.quality>63)info.quality=63;
+  if(info.quality<0)info.quality=32;
+  if(info.target_bitrate<0)info.target_bitrate=0;
+  /*The granule shift is used as a shift count below (and when building
+     granule positions), so it must not be negative or exceed 31.
+    This is the same upper limit th_encode_ctl() applies when it enlarges
+     the shift.*/
+  info.keyframe_granule_shift=OC_CLAMPI(0,info.keyframe_granule_shift,31);
+  /*Initialize the shared encoder/decoder state.*/
+  ret=oc_state_init(&_enc->state,&info,4);
+  if(ret<0)return ret;
+  /*The allocations below are all checked together further down.*/
+  _enc->mb_info=_ogg_calloc(_enc->state.nmbs,sizeof(*_enc->mb_info));
+  _enc->frag_dc=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_dc));
+  _enc->coded_mbis=
+   (unsigned *)_ogg_malloc(_enc->state.nmbs*sizeof(*_enc->coded_mbis));
+  hdec=!(_enc->state.info.pixel_fmt&1);
+  vdec=!(_enc->state.info.pixel_fmt&2);
+  /*If chroma is sub-sampled in the vertical direction, we have to encode two
+     super block rows of Y' for each super block row of Cb and Cr.*/
+  _enc->mcu_nvsbs=1<<vdec;
+  mcu_nmbs=_enc->mcu_nvsbs*_enc->state.fplanes[0].nhsbs*(size_t)4;
+  mcu_nfrags=4*mcu_nmbs+(8*mcu_nmbs>>hdec+vdec);
+  _enc->mcu_skip_ssd=(unsigned *)_ogg_malloc(
+   mcu_nfrags*sizeof(*_enc->mcu_skip_ssd));
+  for(pli=0;pli<3;pli++){
+    _enc->dct_tokens[pli]=(unsigned char **)oc_malloc_2d(64,
+     _enc->state.fplanes[pli].nfrags,sizeof(**_enc->dct_tokens));
+    _enc->extra_bits[pli]=(ogg_uint16_t **)oc_malloc_2d(64,
+     _enc->state.fplanes[pli].nfrags,sizeof(**_enc->extra_bits));
+  }
+#if defined(OC_COLLECT_METRICS)
+  _enc->frag_satd=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_satd));
+  _enc->frag_ssd=_ogg_calloc(_enc->state.nfrags,sizeof(*_enc->frag_ssd));
+#endif
+#if defined(OC_X86_ASM)
+  oc_enc_vtable_init_x86(_enc);
+#else
+  oc_enc_vtable_init_c(_enc);
+#endif
+  /*Shift an unsigned 32-bit one, as th_encode_ctl() does: a plain int shifted
+     by 31 would overflow.*/
+  _enc->keyframe_frequency_force=
+   (ogg_uint32_t)1U<<_enc->state.info.keyframe_granule_shift;
+  _enc->state.qis[0]=_enc->state.info.quality;
+  _enc->state.nqis=1;
+  oc_rc_state_init(&_enc->rc,_enc);
+  oggpackB_writeinit(&_enc->opb);
+  /*Everything oc_enc_clear() releases has been set up by this point, so it is
+     safe to call if any allocation failed.*/
+  if(_enc->mb_info==NULL||_enc->frag_dc==NULL||_enc->coded_mbis==NULL||
+   _enc->mcu_skip_ssd==NULL||_enc->dct_tokens[0]==NULL||
+   _enc->dct_tokens[1]==NULL||_enc->dct_tokens[2]==NULL||
+   _enc->extra_bits[0]==NULL||_enc->extra_bits[1]==NULL||
+   _enc->extra_bits[2]==NULL
+#if defined(OC_COLLECT_METRICS)
+   ||_enc->frag_satd==NULL||_enc->frag_ssd==NULL
+#endif
+   ){
+    oc_enc_clear(_enc);
+    return TH_EFAULT;
+  }
+  oc_mode_scheme_chooser_init(&_enc->chooser);
+  oc_enc_mb_info_init(_enc);
+  memset(_enc->huff_idxs,0,sizeof(_enc->huff_idxs));
+  /*Reset the packet-out state machine.*/
+  _enc->packet_state=OC_PACKET_INFO_HDR;
+  _enc->dup_count=0;
+  _enc->nqueued_dups=0;
+  _enc->prev_dup_count=0;
+  /*Enable speed optimizations up through early skip by default.*/
+  _enc->sp_level=OC_SP_LEVEL_EARLY_SKIP;
+  /*Disable VP3 compatibility by default.*/
+  _enc->vp3_compatible=0;
+  /*No INTER frames coded yet.*/
+  _enc->coded_inter_frame=0;
+  memcpy(_enc->huff_codes,TH_VP31_HUFF_CODES,sizeof(_enc->huff_codes));
+  oc_enc_set_quant_params(_enc,NULL);
+  return 0;
+}
+
+/*Releases everything owned by an encoder context, in the reverse of the
+   order in which oc_enc_init() set it up.
+  The context structure itself is not freed.*/
+static void oc_enc_clear(oc_enc_ctx *_enc){
+  int pli;
+  oc_rc_state_clear(&_enc->rc);
+#if defined(OC_COLLECT_METRICS)
+  /*Dump the collected metrics while their buffers are still allocated.*/
+  oc_enc_mode_metrics_dump(_enc);
+#endif
+  oggpackB_writeclear(&_enc->opb);
+#if defined(OC_COLLECT_METRICS)
+  _ogg_free(_enc->frag_ssd);
+  _ogg_free(_enc->frag_satd);
+#endif
+  /*Free the per-plane token storage, last plane first.*/
+  for(pli=2;pli>=0;pli--){
+    oc_free_2d(_enc->extra_bits[pli]);
+    oc_free_2d(_enc->dct_tokens[pli]);
+  }
+  _ogg_free(_enc->mcu_skip_ssd);
+  _ogg_free(_enc->coded_mbis);
+  _ogg_free(_enc->frag_dc);
+  _ogg_free(_enc->mb_info);
+  /*The shared encoder/decoder state goes last.*/
+  oc_state_clear(&_enc->state);
+}
+
+/*Drops the current frame: the packet is emptied and the previous frame's
+   reconstruction stands in for the current one.*/
+static void oc_enc_drop_frame(th_enc_ctx *_enc){
+  /*An empty packet is what signals the dropped frame.*/
+  oggpackB_reset(&_enc->opb);
+  /*Let motion vector analysis know that a frame was dropped.*/
+  _enc->prevframe_dropped=1;
+  /*Re-use the previous frame's reconstruction as our own.*/
+  _enc->state.ref_frame_idx[OC_FRAME_SELF]=
+   _enc->state.ref_frame_idx[OC_FRAME_PREV];
+}
+
+/*Encodes the current frame as a key frame.
+  The very first frame is encoded twice: the first pass is a dry run that
+   primes the feed-forward statistics (and rate control, if enabled), and the
+   second pass produces the packet that is actually used.
+  _recode: Non-zero if this frame has already been through analysis once.*/
+static void oc_enc_compress_keyframe(oc_enc_ctx *_enc,int _recode){
+  /*With rate control enabled, it chooses the quantizer.*/
+  if(_enc->state.info.target_bitrate>0){
+    _enc->state.qis[0]=oc_enc_select_qi(_enc,OC_INTRA_FRAME,
+     _enc->state.curframe_num>0);
+    _enc->state.nqis=1;
+  }
+  oc_enc_calc_lambda(_enc,OC_INTRA_FRAME);
+  oc_enc_analyze_intra(_enc,_recode);
+  oc_enc_frame_pack(_enc);
+  /*Only the first pass over the first frame needs to be repeated.*/
+  if(_recode||_enc->state.curframe_num!=0)return;
+  /*What we just produced was a dry run; feed its size to rate control.*/
+  if(_enc->state.info.target_bitrate>0){
+    oc_enc_update_rc_state(_enc,oggpackB_bytes(&_enc->opb)<<3,
+     OC_INTRA_FRAME,_enc->state.qis[0],1,0);
+  }
+  oc_enc_compress_keyframe(_enc,1);
+}
+
+/*Encodes the current frame as a delta frame, falling back to a key frame if
+   mode analysis asks for one.
+  The first delta frame is encoded twice: the first pass is a dry run that
+   primes the feed-forward statistics (and rate control, if enabled).
+  _recode: Non-zero if this frame has already been through analysis once.*/
+static void oc_enc_compress_frame(oc_enc_ctx *_enc,int _recode){
+  /*With rate control enabled, it chooses the quantizer.*/
+  if(_enc->state.info.target_bitrate>0){
+    _enc->state.qis[0]=oc_enc_select_qi(_enc,OC_INTER_FRAME,1);
+    _enc->state.nqis=1;
+  }
+  oc_enc_calc_lambda(_enc,OC_INTER_FRAME);
+  if(oc_enc_analyze_inter(_enc,_enc->rc.twopass!=2,_recode)){
+    /*Mode analysis decided this should have been a keyframe; start over.*/
+    oc_enc_compress_keyframe(_enc,1);
+    return;
+  }
+  oc_enc_frame_pack(_enc);
+  /*Every delta frame after the first is finished at this point.*/
+  if(_enc->coded_inter_frame)return;
+  /*This was the first INTER frame, so what we just produced was a dry run.*/
+  _enc->coded_inter_frame=1;
+  if(_enc->state.info.target_bitrate>0){
+    /*Rate control needs to be primed with the result as well.*/
+    oc_enc_update_rc_state(_enc,oggpackB_bytes(&_enc->opb)<<3,
+     OC_INTER_FRAME,_enc->state.qis[0],1,0);
+  }
+  oc_enc_compress_frame(_enc,1);
+}
+
+/*Computes the granule position of the next packet to be output from the
+   current internal state, and stores it in _enc->state.granpos.*/
+static void oc_enc_set_granpos(oc_enc_ctx *_enc){
+  unsigned dup_offs;
+  /*Account for the duplicate frames that have been emitted so far.*/
+  dup_offs=_enc->prev_dup_count-_enc->nqueued_dups;
+  if(_enc->state.frame_type!=OC_INTRA_FRAME){
+    /*Delta frame: the last keyframe goes in the high part, and the distance
+       from it to the current frame goes in the low part.*/
+    _enc->state.granpos=
+     (_enc->state.keyframe_num+_enc->state.granpos_bias<<
+     _enc->state.info.keyframe_granule_shift)
+     +_enc->state.curframe_num-_enc->state.keyframe_num+dup_offs;
+  }
+  else{
+    /*Key frame: the current frame itself goes in the high part.*/
+    _enc->state.granpos=(_enc->state.curframe_num+_enc->state.granpos_bias<<
+     _enc->state.info.keyframe_granule_shift)+dup_offs;
+  }
+}
+
+
+/*Allocates and initializes an encoder context.
+  _info: The requested stream parameters.
+  Return: The new context, or NULL if _info is NULL, memory could not be
+           allocated, or initialization failed.*/
+th_enc_ctx *th_encode_alloc(const th_info *_info){
+  oc_enc_ctx *enc;
+  if(_info==NULL)return NULL;
+  enc=_ogg_malloc(sizeof(*enc));
+  if(enc!=NULL&&oc_enc_init(enc,_info)>=0)return enc;
+  /*Either the allocation or the initialization failed.*/
+  _ogg_free(enc);
+  return NULL;
+}
+
+/*Releases an encoder context and everything it owns.
+  _enc: The context to free.
+        This may be NULL, in which case nothing is done.*/
+void th_encode_free(th_enc_ctx *_enc){
+  if(_enc==NULL)return;
+  oc_enc_clear(_enc);
+  _ogg_free(_enc);
+}
+
+/*Apply an encoder control request (one of the TH_ENCCTL_* codes) to _enc.*/
+int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz){
+  switch(_req){
+    case TH_ENCCTL_SET_HUFFMAN_CODES:{
+      if(_buf==NULL&&_buf_sz!=0||
+       _buf!=NULL&&_buf_sz!=sizeof(th_huff_table)*TH_NHUFFMAN_TABLES){
+        return TH_EINVAL;
+      }
+      return oc_enc_set_huffman_codes(_enc,(const th_huff_table *)_buf);
+    }break;
+    case TH_ENCCTL_SET_QUANT_PARAMS:{
+      if(_buf==NULL&&_buf_sz!=0||
+       _buf!=NULL&&_buf_sz!=sizeof(th_quant_info)){
+        return TH_EINVAL;
+      }
+      return oc_enc_set_quant_params(_enc,(th_quant_info *)_buf);
+    }break;
+    case TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE:{
+      ogg_uint32_t keyframe_frequency_force;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(keyframe_frequency_force))return TH_EINVAL;
+      keyframe_frequency_force=*(ogg_uint32_t *)_buf;
+      if(keyframe_frequency_force<=0)keyframe_frequency_force=1;
+      if(_enc->packet_state==OC_PACKET_INFO_HDR){
+        /*It's still early enough to enlarge keyframe_granule_shift.*/
+        _enc->state.info.keyframe_granule_shift=OC_CLAMPI(
+         _enc->state.info.keyframe_granule_shift,
+         OC_ILOG_32(keyframe_frequency_force-1),31);
+      }
+      _enc->keyframe_frequency_force=OC_MINI(keyframe_frequency_force,
+       (ogg_uint32_t)1U<<_enc->state.info.keyframe_granule_shift);
+      *(ogg_uint32_t *)_buf=_enc->keyframe_frequency_force;
+      return 0;
+    }break;
+    case TH_ENCCTL_SET_VP3_COMPATIBLE:{
+      int vp3_compatible;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(vp3_compatible))return TH_EINVAL;
+      vp3_compatible=*(int *)_buf;
+      _enc->vp3_compatible=vp3_compatible;
+      if(oc_enc_set_huffman_codes(_enc,TH_VP31_HUFF_CODES)<0)vp3_compatible=0;
+      if(oc_enc_set_quant_params(_enc,&TH_VP31_QUANT_INFO)<0)vp3_compatible=0;
+      if(_enc->state.info.pixel_fmt!=TH_PF_420||
+       _enc->state.info.pic_width<_enc->state.info.frame_width||
+       _enc->state.info.pic_height<_enc->state.info.frame_height||
+      /*With more than 4095 super blocks, VP3's RLE coding might overflow.
+        We could overcome this by ensuring we flip the coded/not-coded flags on
+         at least one super block in the frame, but we pick the simple solution
+         of just telling the user the stream will be incompatible instead.
+        It's unlikely the old VP3 codec would be able to decode streams at this
+         resolution in real time in the first place.*/
+       _enc->state.nsbs>4095){
+        vp3_compatible=0;
+      }
+      *(int *)_buf=vp3_compatible;
+      return 0;
+    }break;
+    case TH_ENCCTL_GET_SPLEVEL_MAX:{
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(int))return TH_EINVAL;
+      *(int *)_buf=OC_SP_LEVEL_MAX;
+      return 0;
+    }break;
+    case TH_ENCCTL_SET_SPLEVEL:{
+      int speed;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(speed))return TH_EINVAL;
+      speed=*(int *)_buf;
+      if(speed<0||speed>OC_SP_LEVEL_MAX)return TH_EINVAL;
+      _enc->sp_level=speed;
+      return 0;
+    }break;
+    case TH_ENCCTL_GET_SPLEVEL:{
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(int))return TH_EINVAL;
+      *(int *)_buf=_enc->sp_level;
+      return 0;
+    }break;
+    case TH_ENCCTL_SET_DUP_COUNT:{
+      int dup_count;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(dup_count))return TH_EINVAL;
+      dup_count=OC_MAXI(*(int *)_buf,0);     /*Negative means "none".*/
+      if(dup_count>=_enc->keyframe_frequency_force)return TH_EINVAL;
+      _enc->dup_count=dup_count;
+      return 0;
+    }break;
+    case TH_ENCCTL_SET_QUALITY:{
+      int qi;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(qi))return TH_EINVAL; /*Checked before the read.*/
+      if(_enc->state.info.target_bitrate>0)return TH_EINVAL;
+      qi=*(int *)_buf;
+      if(qi<0||qi>63)return TH_EINVAL;
+      _enc->state.info.quality=qi;
+      _enc->state.qis[0]=(unsigned char)qi;
+      _enc->state.nqis=1;
+      return 0;
+    }break;
+    case TH_ENCCTL_SET_BITRATE:{
+      long bitrate;
+      int  reset;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(bitrate))return TH_EINVAL; /*Checked before the read.*/
+      bitrate=*(long *)_buf;
+      if(bitrate<=0)return TH_EINVAL;
+      reset=_enc->state.info.target_bitrate<=0;
+      _enc->state.info.target_bitrate=bitrate>INT_MAX?INT_MAX:bitrate;
+      if(reset)oc_rc_state_init(&_enc->rc,_enc);
+      else oc_enc_rc_resize(_enc);
+      return 0;
+    }break;
+    case TH_ENCCTL_SET_RATE_FLAGS:{
+      int set;
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(set))return TH_EINVAL;
+      if(_enc->state.info.target_bitrate<=0)return TH_EINVAL;
+      set=*(int *)_buf;
+      _enc->rc.drop_frames=set&TH_RATECTL_DROP_FRAMES;
+      _enc->rc.cap_overflow=set&TH_RATECTL_CAP_OVERFLOW;
+      _enc->rc.cap_underflow=set&TH_RATECTL_CAP_UNDERFLOW;
+      return 0;
+    }break;
+    case TH_ENCCTL_SET_RATE_BUFFER:{
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_buf_sz!=sizeof(int))return TH_EINVAL;
+      if(_enc->state.info.target_bitrate<=0)return TH_EINVAL;
+      _enc->rc.buf_delay=*(int *)_buf;
+      oc_enc_rc_resize(_enc);
+      *(int *)_buf=_enc->rc.buf_delay; /*Report the value left after resizing.*/
+      return 0;
+    }break;
+    case TH_ENCCTL_2PASS_OUT:{
+      if(_enc==NULL||_buf==NULL)return TH_EFAULT;
+      if(_enc->state.info.target_bitrate<=0||
+       _enc->state.curframe_num>=0&&_enc->rc.twopass!=1||
+       _buf_sz!=sizeof(unsigned char *)){
+        return TH_EINVAL;
+      }
+      return oc_enc_rc_2pass_out(_enc,(unsigned char **)_buf);
+    }break;
+    case TH_ENCCTL_2PASS_IN:{
+      if(_enc==NULL)return TH_EFAULT;  /*A NULL _buf is passed through.*/
+      if(_enc->state.info.target_bitrate<=0||
+       _enc->state.curframe_num>=0&&_enc->rc.twopass!=2){
+        return TH_EINVAL;
+      }
+      return oc_enc_rc_2pass_in(_enc,_buf,_buf_sz);
+    }break;
+    default:return TH_EIMPL;
+  }
+}
+
+int th_encode_flushheader(th_enc_ctx *_enc,th_comment *_tc,ogg_packet *_op){
+  /*The header packet itself is produced by oc_state_flushheader().*/
+  return _enc==NULL?TH_EFAULT:oc_state_flushheader(&_enc->state,
+   &_enc->packet_state,&_enc->opb,&_enc->qinfo,
+   (const th_huff_table *)_enc->huff_codes,th_version_string(),_tc,_op);
+}
+
+/*Copy the picture region of _src into _dst, then synthesize _dst's padding.*/
+static void oc_img_plane_copy_pad(th_img_plane *_dst,th_img_plane *_src,
+ ogg_int32_t _pic_x,ogg_int32_t _pic_y,
+ ogg_int32_t _pic_width,ogg_int32_t _pic_height){
+  unsigned char *dst;
+  int            dstride;
+  ogg_uint32_t   frame_width;
+  ogg_uint32_t   frame_height;
+  ogg_uint32_t   y;
+  frame_width=_dst->width;
+  frame_height=_dst->height;
+  /*If we have _no_ data, just encode a dull green (every sample is set to 0).*/
+  if(_pic_width==0||_pic_height==0){
+    dst=_dst->data;
+    dstride=_dst->stride;
+    for(y=0;y<frame_height;y++){
+      memset(dst,0,frame_width*sizeof(*dst));
+      dst+=dstride;
+    }
+  }
+  /*Otherwise, copy what we do have, and add our own padding.*/
+  else{
+    unsigned char *dst_data;
+    unsigned char *src_data;
+    unsigned char *src;
+    int            sstride;
+    ogg_uint32_t   x;
+    /*Step 1: Copy the data we do have.*/
+    dstride=_dst->stride;
+    sstride=_src->stride;
+    dst_data=_dst->data;
+    src_data=_src->data;
+    dst=dst_data+_pic_y*(ptrdiff_t)dstride+_pic_x;
+    src=src_data+_pic_y*(ptrdiff_t)sstride+_pic_x;
+    for(y=0;y<_pic_height;y++){
+      memcpy(dst,src,_pic_width);
+      dst+=dstride;
+      src+=sstride;
+    }
+    /*Step 2: Low-pass extension into the padding.  Left side: 1-2-1 vertically.*/
+    for(x=_pic_x;x-->0;){
+      dst=dst_data+_pic_y*(ptrdiff_t)dstride+x;
+      for(y=0;y<_pic_height;y++){ /*The masks clamp taps to the first/last row.*/
+        dst[0]=(dst[1]<<1)+(dst-(dstride&-(y>0)))[1]
+         +(dst+(dstride&-(y+1<_pic_height)))[1]+2>>2;
+        dst+=dstride;
+      }
+    }
+    /*Right side: the mirror image, filtering the column to the left.*/
+    for(x=_pic_x+_pic_width;x<frame_width;x++){
+      dst=dst_data+_pic_y*(ptrdiff_t)dstride+x-1;
+      for(y=0;y<_pic_height;y++){
+        dst[1]=(dst[0]<<1)+(dst-(dstride&-(y>0)))[0]
+         +(dst+(dstride&-(y+1<_pic_height)))[0]+2>>2;
+        dst+=dstride;
+      }
+    }
+    /*Top: each new row is a 1-2-1 horizontal filter of the row below it.*/
+    dst=dst_data+_pic_y*(ptrdiff_t)dstride;
+    for(y=_pic_y;y-->0;){
+      for(x=0;x<frame_width;x++){
+        (dst-dstride)[x]=(dst[x]<<1)+dst[x-(x>0)]
+         +dst[x+(x+1<frame_width)]+2>>2;
+      }
+      dst-=dstride;
+    }
+    /*Bottom: likewise, filtering the row above across the full frame width.*/
+    dst=dst_data+(_pic_y+_pic_height)*(ptrdiff_t)dstride;
+    for(y=_pic_y+_pic_height;y<frame_height;y++){
+      for(x=0;x<frame_width;x++){
+        dst[x]=((dst-dstride)[x]<<1)+(dst-dstride)[x-(x>0)]
+         +(dst-dstride)[x+(x+1<frame_width)]+2>>2;
+      }
+      dst+=dstride;
+    }
+  }
+}
+
+/*Submit one frame of image data for encoding; returns 0 or a TH_E* error.*/
+int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _img){
+  th_ycbcr_buffer img;
+  int             cframe_width;
+  int             cframe_height;
+  int             cpic_width;
+  int             cpic_height;
+  int             cpic_x;
+  int             cpic_y;
+  int             hdec;
+  int             vdec;
+  int             pli;
+  int             refi;
+  int             drop;
+  /*Step 1: validate parameters.*/
+  if(_enc==NULL||_img==NULL)return TH_EFAULT;
+  if(_enc->packet_state==OC_PACKET_DONE)return TH_EINVAL;
+  if(_enc->rc.twopass&&_enc->rc.twopass_buffer_bytes==0)return TH_EINVAL;
+  if((ogg_uint32_t)_img[0].width!=_enc->state.info.frame_width||
+   (ogg_uint32_t)_img[0].height!=_enc->state.info.frame_height){
+    return TH_EINVAL;
+  }
+  hdec=!(_enc->state.info.pixel_fmt&1); /*Right shift giving chroma width.*/
+  vdec=!(_enc->state.info.pixel_fmt&2); /*Right shift giving chroma height.*/
+  cframe_width=_enc->state.info.frame_width>>hdec;
+  cframe_height=_enc->state.info.frame_height>>vdec;
+  if(_img[1].width!=cframe_width||_img[2].width!=cframe_width||
+   _img[1].height!=cframe_height||_img[2].height!=cframe_height){
+    return TH_EINVAL;
+  }
+  /*Step 2: Copy the input to our internal buffer, flipped upside down.
+    This lets us add padding, if necessary, so we don't have to worry about
+     dereferencing possibly invalid addresses, and allows us to use the same
+     strides and fragment offsets for both the input frame and the reference
+     frames.*/
+  oc_ycbcr_buffer_flip(img,_img);
+  oc_img_plane_copy_pad(_enc->state.ref_frame_bufs[OC_FRAME_IO]+0,img+0,
+   _enc->state.info.pic_x,_enc->state.info.pic_y,
+   _enc->state.info.pic_width,_enc->state.info.pic_height);
+  cpic_x=_enc->state.info.pic_x>>hdec;
+  cpic_y=_enc->state.info.pic_y>>vdec;
+  cpic_width=(_enc->state.info.pic_x+_enc->state.info.pic_width+hdec>>hdec)
+   -cpic_x;
+  cpic_height=(_enc->state.info.pic_y+_enc->state.info.pic_height+vdec>>vdec)
+   -cpic_y;
+  for(pli=1;pli<3;pli++){
+    oc_img_plane_copy_pad(_enc->state.ref_frame_bufs[OC_FRAME_IO]+pli,img+pli,
+     cpic_x,cpic_y,cpic_width,cpic_height);
+  }
+  /*Step 3: Update the buffer state.*/
+  if(_enc->state.ref_frame_idx[OC_FRAME_SELF]>=0){
+    _enc->state.ref_frame_idx[OC_FRAME_PREV]=
+     _enc->state.ref_frame_idx[OC_FRAME_SELF];
+    if(_enc->state.frame_type==OC_INTRA_FRAME){
+      /*The new frame becomes both the previous and gold reference frames.*/
+      _enc->state.keyframe_num=_enc->state.curframe_num;
+      _enc->state.ref_frame_idx[OC_FRAME_GOLD]=
+       _enc->state.ref_frame_idx[OC_FRAME_SELF];
+    }
+  }
+  /*Select a free buffer to use for the reconstructed version of this frame.*/
+  for(refi=0;refi==_enc->state.ref_frame_idx[OC_FRAME_GOLD]||
+   refi==_enc->state.ref_frame_idx[OC_FRAME_PREV];refi++);
+  _enc->state.ref_frame_idx[OC_FRAME_SELF]=refi;
+  _enc->state.curframe_num+=_enc->prev_dup_count+1; /*Dups count as frames.*/
+  /*Step 4: Compress the frame.*/
+  /*Start with a keyframe, and don't allow the generation of invalid files that
+     overflow the keyframe_granule_shift.*/
+  if(_enc->rc.twopass_force_kf||_enc->state.curframe_num==0||
+   _enc->state.curframe_num-_enc->state.keyframe_num+_enc->dup_count>=
+   _enc->keyframe_frequency_force){
+    oc_enc_compress_keyframe(_enc,0);
+    drop=0;
+  }
+  else{
+    oc_enc_compress_frame(_enc,0);
+    drop=1;
+  }
+  oc_restore_fpu(&_enc->state);
+  /*drop currently indicates if the frame is droppable.*/
+  if(_enc->state.info.target_bitrate>0){
+    drop=oc_enc_update_rc_state(_enc,oggpackB_bytes(&_enc->opb)<<3,
+     _enc->state.frame_type,_enc->state.qis[0],0,drop);
+  }
+  else drop=0;
+  /*drop now indicates if the frame was dropped.*/
+  if(drop)oc_enc_drop_frame(_enc);
+  else _enc->prevframe_dropped=0;
+  _enc->packet_state=OC_PACKET_READY;
+  /*The requested duplicates are queued for output; the request is consumed.*/
+  _enc->prev_dup_count=_enc->nqueued_dups=_enc->dup_count;
+  _enc->dup_count=0;
+#if defined(OC_DUMP_IMAGES)
+  oc_enc_set_granpos(_enc);
+  oc_state_dump_frame(&_enc->state,OC_FRAME_IO,"src");
+  oc_state_dump_frame(&_enc->state,OC_FRAME_SELF,"rec");
+#endif
+  return 0;
+}
+
+/*Retrieve the next packet: the frame just encoded, or one of the zero-byte
+   packets that stand in for its queued duplicates.*/
+int th_encode_packetout(th_enc_ctx *_enc,int _last_p,ogg_packet *_op){
+  unsigned char *data;
+  long           nbytes;
+  if(_enc==NULL||_op==NULL)return TH_EFAULT;
+  /*Duplicates, and all packets in the first pass of 2-pass mode, carry no
+     data.*/
+  data=NULL;
+  nbytes=0;
+  if(_enc->packet_state==OC_PACKET_READY){
+    _enc->packet_state=OC_PACKET_EMPTY;
+    if(_enc->rc.twopass!=1){
+      data=oggpackB_get_buffer(&_enc->opb);
+      /*If there's no packet, malloc failed while writing; it's lost forever.*/
+      if(data==NULL)return TH_EFAULT;
+      nbytes=oggpackB_bytes(&_enc->opb);
+    }
+  }
+  /*In any state other than READY or EMPTY there is nothing to emit.*/
+  else if(_enc->packet_state!=OC_PACKET_EMPTY)return 0;
+  else if(_enc->nqueued_dups>0)_enc->nqueued_dups--;
+  else{
+    /*Nothing is queued; just latch the end of the stream if requested.*/
+    if(_last_p)_enc->packet_state=OC_PACKET_DONE;
+    return 0;
+  }
+  _op->packet=data;
+  _op->bytes=nbytes;
+  /*The stream only ends once the last queued duplicate has gone out.*/
+  _last_p=_last_p&&_enc->nqueued_dups<=0;
+  _op->b_o_s=0;
+  _op->e_o_s=_last_p;
+  oc_enc_set_granpos(_enc);
+  _op->packetno=th_granule_frame(_enc,_enc->state.granpos)+3;
+  _op->granulepos=_enc->state.granpos;
+  if(_last_p)_enc->packet_state=OC_PACKET_DONE;
+  return 1+_enc->nqueued_dups;
+}
diff --git a/lib/encoder_disabled.c b/lib/encoder_disabled.c
new file mode 100644
index 0000000..0cbf664
--- /dev/null
+++ b/lib/encoder_disabled.c
@@ -0,0 +1,67 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function: stub encoder API for builds with encoding disabled
+  last mod: $Id: encoder_disabled.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+#include "apiwrapper.h"
+#include "encint.h"
+
+th_enc_ctx *th_encode_alloc(const th_info *_info){
+  return NULL; /*Encoding is disabled in this build: no context is created.*/
+}
+
+void th_encode_free(th_enc_ctx *_enc){/*Nothing to do in this stub.*/}
+
+
+int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz){
+  return OC_DISABLED; /*Every request is refused; the arguments are ignored.*/
+}
+
+int th_encode_flushheader(th_enc_ctx *_enc,th_comment *_tc,ogg_packet *_op){
+  return OC_DISABLED; /*No header packet is produced; _op is left untouched.*/
+}
+
+int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _img){
+  return OC_DISABLED; /*The frame is not read or encoded.*/
+}
+
+int th_encode_packetout(th_enc_ctx *_enc,int _last_p,ogg_packet *_op){
+  return OC_DISABLED; /*No packet is produced; _op is left untouched.*/
+}
+
+
+
+int theora_encode_init(theora_state *_te,theora_info *_ci){
+  return OC_DISABLED; /*Stub: _te is not initialized.*/
+}
+
+int theora_encode_YUVin(theora_state *_te,yuv_buffer *_yuv){
+  return OC_DISABLED; /*Stub: the frame is ignored.*/
+}
+
+int theora_encode_packetout(theora_state *_te,int _last_p,ogg_packet *_op){
+  return OC_DISABLED; /*Stub: no packet is produced.*/
+}
+
+int theora_encode_header(theora_state *_te,ogg_packet *_op){
+  return OC_DISABLED; /*Stub: no header packet is written.*/
+}
+
+int theora_encode_comment(theora_comment *_tc,ogg_packet *_op){
+  return OC_DISABLED; /*Stub: no comment packet is written.*/
+}
+
+int theora_encode_tables(theora_state *_te,ogg_packet *_op){
+  return OC_DISABLED; /*Stub: no tables packet is written.*/
+}
diff --git a/lib/enquant.c b/lib/enquant.c
new file mode 100644
index 0000000..3372fed
--- /dev/null
+++ b/lib/enquant.c
@@ -0,0 +1,274 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function: encoder quantization tables and quantization parameter packing
+  last mod: $Id: enquant.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+#include <stdlib.h>
+#include <string.h>
+#include "encint.h"
+
+
+
+/*Pack the quantization parameters in _qinfo into the bit-packer _opb.*/
+void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo){
+  const th_quant_ranges *qranges;
+  const th_quant_base   *base_mats[2*3*64]; /*Unique base matrices found.*/
+  int                    indices[2][3][64]; /*[qti][pli][qri] -> base_mats.*/
+  int                    nbase_mats;
+  int                    nbits;
+  int                    ci;
+  int                    qi;
+  int                    qri;
+  int                    qti;
+  int                    pli;
+  int                    qtj;
+  int                    plj;
+  int                    bmi;
+  int                    i;
+  i=_qinfo->loop_filter_limits[0];
+  for(qi=1;qi<64;qi++)i=OC_MAXI(i,_qinfo->loop_filter_limits[qi]);
+  nbits=OC_ILOG_32(i); /*Field width derived from the largest limit.*/
+  oggpackB_write(_opb,nbits,3);
+  for(qi=0;qi<64;qi++){
+    oggpackB_write(_opb,_qinfo->loop_filter_limits[qi],nbits);
+  }
+  /*580 bits for VP3.*/
+  i=1;
+  for(qi=0;qi<64;qi++)i=OC_MAXI(_qinfo->ac_scale[qi],i);
+  nbits=OC_ILOGNZ_32(i);
+  oggpackB_write(_opb,nbits-1,4);
+  for(qi=0;qi<64;qi++)oggpackB_write(_opb,_qinfo->ac_scale[qi],nbits);
+  /*516 bits for VP3.*/
+  i=1;
+  for(qi=0;qi<64;qi++)i=OC_MAXI(_qinfo->dc_scale[qi],i);
+  nbits=OC_ILOGNZ_32(i);
+  oggpackB_write(_opb,nbits-1,4);
+  for(qi=0;qi<64;qi++)oggpackB_write(_opb,_qinfo->dc_scale[qi],nbits);
+  /*Consolidate any duplicate base matrices.*/
+  nbase_mats=0;
+  for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
+    qranges=_qinfo->qi_ranges[qti]+pli;
+    for(qri=0;qri<=qranges->nranges;qri++){
+      for(bmi=0;;bmi++){
+        if(bmi>=nbase_mats){ /*Not seen before: append it to the list.*/
+          base_mats[bmi]=qranges->base_matrices+qri;
+          indices[qti][pli][qri]=nbase_mats++;
+          break;
+        }
+        else if(memcmp(base_mats[bmi][0],qranges->base_matrices[qri],
+         sizeof(base_mats[bmi][0]))==0){
+          indices[qti][pli][qri]=bmi;
+          break;
+        }
+      }
+    }
+  }
+  /*Write out the list of unique base matrices.
+    1545 bits for VP3 matrices.*/
+  oggpackB_write(_opb,nbase_mats-1,9);
+  for(bmi=0;bmi<nbase_mats;bmi++){
+    for(ci=0;ci<64;ci++)oggpackB_write(_opb,base_mats[bmi][0][ci],8);
+  }
+  /*Now store quant ranges and their indices into the base matrix list.
+    46 bits for VP3 matrices.*/
+  nbits=OC_ILOG_32(nbase_mats-1);
+  for(i=0;i<6;i++){
+    qti=i/3;
+    pli=i%3;
+    qranges=_qinfo->qi_ranges[qti]+pli;
+    if(i>0){
+      if(qti>0){
+        if(qranges->nranges==_qinfo->qi_ranges[qti-1][pli].nranges&&
+         memcmp(qranges->sizes,_qinfo->qi_ranges[qti-1][pli].sizes,
+         qranges->nranges*sizeof(qranges->sizes[0]))==0&&
+         memcmp(indices[qti][pli],indices[qti-1][pli],
+         (qranges->nranges+1)*sizeof(indices[qti][pli][0]))==0){
+          oggpackB_write(_opb,1,2); /*Same as the (qti-1,pli) set.*/
+          continue;
+        }
+      }
+      qtj=(i-1)/3;
+      plj=(i-1)%3;
+      if(qranges->nranges==_qinfo->qi_ranges[qtj][plj].nranges&&
+       memcmp(qranges->sizes,_qinfo->qi_ranges[qtj][plj].sizes,
+       qranges->nranges*sizeof(qranges->sizes[0]))==0&&
+       memcmp(indices[qti][pli],indices[qtj][plj],
+       (qranges->nranges+1)*sizeof(indices[qti][pli][0]))==0){
+        oggpackB_write(_opb,0,1+(qti>0)); /*Same as the preceding set.*/
+        continue;
+      }
+      oggpackB_write(_opb,1,1); /*An explicit set follows.*/
+    }
+    oggpackB_write(_opb,indices[qti][pli][0],nbits);
+    for(qi=qri=0;qi<63;qri++){
+      oggpackB_write(_opb,qranges->sizes[qri]-1,OC_ILOG_32(62-qi));
+      qi+=qranges->sizes[qri];
+      oggpackB_write(_opb,indices[qti][pli][qri+1],nbits);
+    }
+  }
+}
+
+static void oc_iquant_init(oc_iquant *_this,ogg_uint16_t _d){
+  ogg_uint32_t recip;
+  int          shift;
+  /*The multiplier and shift are computed for twice the given divisor.*/
+  _d<<=1;
+  _this->l=shift=OC_ILOGNZ_32(_d)-1;
+  recip=((ogg_uint32_t)1<<(16+shift))/_d+1;
+  _this->m=(ogg_int16_t)(recip-0x10000);
+}
+
+/*Builds both table sets.  See comments at oc_dequant_tables_init() for how
+   the quantization tables' storage should be initialized.*/
+void oc_enquant_tables_init(ogg_uint16_t *_dequant[64][3][2],
+ oc_iquant *_enquant[64][3][2],const th_quant_info *_qinfo){
+  int qi;
+  int pli;
+  int qti;
+  /*Initialize the dequantization tables first.*/
+  oc_dequant_tables_init(_dequant,NULL,_qinfo);
+  /*Derive the quantization tables directly from the dequantization tables.*/
+  for(qi=0;qi<64;qi++)for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
+    int zzi;
+    int plj;
+    int qtj;
+    int dupe;
+    dupe=0;
+    for(qtj=0;qtj<=qti;qtj++){ /*Scan the tables already visited for this qi.*/
+      for(plj=0;plj<(qtj<qti?3:pli);plj++){
+        if(_dequant[qi][pli][qti]==_dequant[qi][plj][qtj]){
+          dupe=1;
+          break;
+        }
+      }
+      if(dupe)break; /*Keep plj and qtj pointing at the match.*/
+    }
+    if(dupe){ /*Shared dequant table: share the enquant table too.*/
+      _enquant[qi][pli][qti]=_enquant[qi][plj][qtj];
+      continue;
+    }
+    /*In the original VP3.2 code, the rounding offset and the size of the
+       dead zone around 0 were controlled by a "sharpness" parameter.
+      We now R-D optimize the tokens for each block after quantization,
+       so the rounding offset should always be 1/2, and an explicit dead
+       zone is unnecessary.
+      Hence, all of that VP3.2 code is gone from here, and the remaining
+       floating point code has been implemented as equivalent integer
+       code with exact precision.*/
+    for(zzi=0;zzi<64;zzi++){
+      oc_iquant_init(_enquant[qi][pli][qti]+zzi,
+       _dequant[qi][pli][qti][zzi]);
+    }
+  }
+}
+
+
+
+/*This table gives the square root of the fraction of the squared magnitude of
+   each DCT coefficient relative to the total, scaled by 2**16, for both INTRA
+   and INTER modes.
+  These values were measured after motion-compensated prediction, before
+   quantization, over a large set of test video (from QCIF to 1080p) encoded at
+   all possible rates.
+  The DC coefficient takes into account the DPCM prediction (using the
+   quantized values from neighboring blocks, as the encoder does, but still
+   before quantization of the coefficient in the current block).
+  The results differ significantly from the expected variance (e.g., using an
+   AR(1) model of the signal with rho=0.95, as is frequently done to compute
+   the coding gain of the DCT).
+  We use them to estimate an "average" quantizer for a given quantizer matrix,
+   as this is used to parameterize a number of the rate control decisions.
+  These values are themselves probably quantizer-matrix dependent, since the
+   shape of the matrix affects the noise distribution in the reference frames,
+   but they should at least give us _some_ amount of adaptivity to different
+   matrices, as opposed to hard-coding a table of average Q values for the
+   current set.
+  The main features they capture are that a) only a few of the quantizers in
+   the upper-left corner contribute anything significant at all (though INTER
+   mode is significantly flatter) and b) the DPCM prediction of the DC
+   coefficient gives a very minor improvement in the INTRA case and a quite
+   significant one in the INTER case (over the expected variance).*/
+static const ogg_uint16_t OC_RPSD[2][64]={ /*Indexed as [qti][ci].*/
+  {
+    52725,17370,10399, 6867, 5115, 3798, 2942, 2076,
+    17370, 9900, 6948, 4994, 3836, 2869, 2229, 1619,
+    10399, 6948, 5516, 4202, 3376, 2573, 2015, 1461,
+     6867, 4994, 4202, 3377, 2800, 2164, 1718, 1243,
+     5115, 3836, 3376, 2800, 2391, 1884, 1530, 1091,
+     3798, 2869, 2573, 2164, 1884, 1495, 1212,  873,
+     2942, 2229, 2015, 1718, 1530, 1212, 1001,  704,
+     2076, 1619, 1461, 1243, 1091,  873,  704,  474
+  },
+  {
+    23411,15604,13529,11601,10683, 8958, 7840, 6142,
+    15604,11901,10718, 9108, 8290, 6961, 6023, 4487,
+    13529,10718, 9961, 8527, 7945, 6689, 5742, 4333,
+    11601, 9108, 8527, 7414, 7084, 5923, 5175, 3743,
+    10683, 8290, 7945, 7084, 6771, 5754, 4793, 3504,
+     8958, 6961, 6689, 5923, 5754, 4679, 3936, 2989,
+     7840, 6023, 5742, 5175, 4793, 3936, 3522, 2558,
+     6142, 4487, 4333, 3743, 3504, 2989, 2558, 1829
+  }
+};
+
+/*The fraction of the squared magnitude of the residuals in each color channel
+   relative to the total, scaled by 2**16, for each pixel format.
+  These values were measured after motion-compensated prediction, before
+   quantization, over a large set of test video encoded at all possible rates.
+  TODO: These values are only from INTER frames; they should be re-measured for
+   INTRA frames.*/
+static const ogg_uint16_t OC_PCD[4][3]={ /*Indexed as [pixel format][plane].*/
+  {59926, 3038, 2572},
+  {55201, 5597, 4738},
+  {55201, 5597, 4738},
+  {47682, 9669, 8185}
+};
+
+
+/*Compute an "average" quantizer for each qi level.
+  INTRA and INTER each get their own average, since their behavior is very
+   different, but the three color planes are folded into a single value.
+  The result is a harmonic average of the squared quantizer, weighted by the
+   expected squared magnitude of the DCT coefficients.
+  If the DCT coefficients were Laplacian-distributed (not quite true), this
+   would preserve the product Q*lambda, with lambda=sqrt(2/sigma**2) the
+   distribution parameter (unrelated to the R-D optimization lambda used
+   elsewhere), and Q*lambda completely determines the coefficient entropy.*/
+void oc_enquant_qavg_init(ogg_int64_t _log_qavg[2][64],
+ ogg_uint16_t *_dequant[64][3][2],int _pixel_fmt){
+  int qti;
+  int qi;
+  for(qti=0;qti<2;qti++){
+    for(qi=0;qi<64;qi++){
+      ogg_int64_t wsum;
+      int         pli;
+      wsum=0;
+      for(pli=0;pli<3;pli++){
+        ogg_uint32_t plane_sum;
+        int          ci;
+        plane_sum=0;
+        for(ci=0;ci<64;ci++){
+          unsigned qd;
+          unsigned rq;
+          qd=_dequant[qi][pli][qti][OC_IZIG_ZAG[ci]];
+          rq=(OC_RPSD[qti][ci]+(qd>>1))/qd;
+          plane_sum+=rq*(ogg_uint32_t)rq;
+        }
+        wsum+=OC_PCD[_pixel_fmt][pli]*(ogg_int64_t)plane_sum;
+      }
+      /*qavg=1.0/sqrt(wsum), stored in the log domain.*/
+      _log_qavg[qti][qi]=(OC_Q57(48)-oc_blog64(wsum))>>1;
+    }
+  }
+}
diff --git a/lib/enquant.h b/lib/enquant.h
new file mode 100644
index 0000000..d62df10
--- /dev/null
+++ b/lib/enquant.h
@@ -0,0 +1,27 @@
+#if !defined(_enquant_H)
+# define _enquant_H (1)
+# include "quant.h"
+
+typedef struct oc_iquant oc_iquant;
+
+#define OC_QUANT_MAX_LOG (OC_Q57(OC_STATIC_ILOG_32(OC_QUANT_MAX)-1))
+
+/*Used to compute x/d via ((x*m>>16)+x>>l)+(x<0)
+   (i.e., one 16x16->16 mul, 2 shifts, and 2 adds).
+  This is not an approximation; for 16-bit x and d, it is exact.*/
+struct oc_iquant{
+  ogg_int16_t m; /*The multiplier in the formula above.*/
+  ogg_int16_t l; /*The final right shift in the formula above.*/
+};
+
+typedef oc_iquant        oc_iquant_table[64];
+
+
+/*These functions are implemented in enquant.c.*/
+void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo);
+void oc_enquant_tables_init(ogg_uint16_t *_dequant[64][3][2],
+ oc_iquant *_enquant[64][3][2],const th_quant_info *_qinfo);
+void oc_enquant_qavg_init(ogg_int64_t _log_qavg[2][64],
+ ogg_uint16_t *_dequant[64][3][2],int _pixel_fmt);
+
+#endif
diff --git a/lib/fdct.c b/lib/fdct.c
new file mode 100644
index 0000000..dc3a66f
--- /dev/null
+++ b/lib/fdct.c
@@ -0,0 +1,422 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: fdct.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+#include "encint.h"
+#include "dct.h"
+
+
+
+/*Performs a forward 8 point Type-II DCT transform.
+  The output is scaled by a factor of 2 from the orthonormal version of the
+   transform.
+  _y: The buffer to store the result in.
+      Data will be placed in the first 8 entries (e.g., in a row of an 8x8 block).
+  _x: The input coefficients.
+      Every 8th entry is used (e.g., from a column of an 8x8 block).*/
+static void oc_fdct8(ogg_int16_t _y[8],const ogg_int16_t *_x){
+  int t0;
+  int t1;
+  int t2;
+  int t3;
+  int t4;
+  int t5;
+  int t6;
+  int t7;
+  int r; /*r, s, u and v are scratch values for the butterflies and rotations.*/
+  int s;
+  int u;
+  int v;
+  /*Stage 1: butterflies on mirrored input pairs (inputs are 8 entries apart).*/
+  /*0-7 butterfly.*/
+  t0=_x[0<<3]+(int)_x[7<<3];
+  t7=_x[0<<3]-(int)_x[7<<3];
+  /*1-6 butterfly.*/
+  t1=_x[1<<3]+(int)_x[6<<3];
+  t6=_x[1<<3]-(int)_x[6<<3];
+  /*2-5 butterfly.*/
+  t2=_x[2<<3]+(int)_x[5<<3];
+  t5=_x[2<<3]-(int)_x[5<<3];
+  /*3-4 butterfly.*/
+  t3=_x[3<<3]+(int)_x[4<<3];
+  t4=_x[3<<3]-(int)_x[4<<3];
+  /*Stage 2: butterflies on the sums t0...t3 and on the differences t6, t5.*/
+  /*0-3 butterfly.*/
+  r=t0+t3;
+  t3=t0-t3;
+  t0=r;
+  /*1-2 butterfly.*/
+  r=t1+t2;
+  t2=t1-t2;
+  t1=r;
+  /*6-5 butterfly.*/
+  r=t6+t5;
+  t5=t6-t5;
+  t6=r;
+  /*Stages 3 and 4 are where all the approximation occurs.
+    These are chosen to be as close to an exact inverse of the approximations
+     made in the iDCT as possible, while still using mostly 16-bit arithmetic.
+    We use some 16x16->32 signed MACs, but those still commonly execute in 1
+     cycle on a 16-bit DSP.
+    For example, s=(27146*t5+0x4000>>16)+t5+(t5!=0) is an exact inverse of
+     t5=(OC_C4S4*s>>16).
+    That is, applying the latter to the output of the former will recover t5
+     exactly (over the valid input range of t5, -23171...23169).
+    We increase the rounding bias to 0xB500 in this particular case so that
+     errors inverting the subsequent butterfly are not one-sided (e.g., the
+     mean error is very close to zero).
+    The (t5!=0) term could be replaced simply by 1, but we want to send 0 to 0.
+    The fDCT of an all-zeros block will still not be zero, because of the
+     biases we added at the very beginning of the process, but it will be close
+     enough that it is guaranteed to round to zero.*/
+  /*Stage 3:*/
+  /*4-5 butterfly.*/
+  s=(27146*t5+0xB500>>16)+t5+(t5!=0)>>1;
+  r=t4+s;
+  t5=t4-s;
+  t4=r;
+  /*7-6 butterfly.*/
+  s=(27146*t6+0xB500>>16)+t6+(t6!=0)>>1;
+  r=t7+s;
+  t6=t7-s;
+  t7=r;
+  /*Stage 4:*/
+  /*0-1 butterfly.*/
+  r=(27146*t0+0x4000>>16)+t0+(t0!=0);
+  s=(27146*t1+0xB500>>16)+t1+(t1!=0);
+  u=r+s>>1;
+  v=r-u;
+  _y[0]=u;
+  _y[4]=v;
+  /*3-2 rotation by 6pi/16*/
+  u=(OC_C6S2*t2+OC_C2S6*t3+0x6CB7>>16)+(t3!=0);
+  s=(OC_C6S2*u>>16)-t2;
+  v=(s*21600+0x2800>>18)+s+(s!=0);
+  _y[2]=u;
+  _y[6]=v;
+  /*6-5 rotation by 3pi/16*/
+  u=(OC_C5S3*t6+OC_C3S5*t5+0x0E3D>>16)+(t5!=0);
+  s=t6-(OC_C5S3*u>>16);
+  v=(s*26568+0x3400>>17)+s+(s!=0);
+  _y[5]=u;
+  _y[3]=v;
+  /*7-4 rotation by 7pi/16*/
+  u=(OC_C7S1*t4+OC_C1S7*t7+0x7B1B>>16)+(t7!=0);
+  s=(OC_C7S1*u>>16)-t4;
+  v=(s*20539+0x3000>>20)+s+(s!=0);
+  _y[1]=u;
+  _y[7]=v;
+}
+
+void oc_enc_fdct8x8(const oc_enc_ctx *_enc,ogg_int16_t _y[64],
+ const ogg_int16_t _x[64]){
+  (*_enc->opt_vtable.fdct8x8)(_y,_x); /*Dispatch through the encoder's vtable.*/
+}
+
+/*Performs a forward 8x8 Type-II DCT transform.
+  The output is scaled by a factor of 4 relative to the orthonormal version
+   of the transform.
+  _y: The buffer to store the result in.
+      This may be the same as _x.
+  _x: The input coefficients; all 64 are copied out before _y is written.*/
+void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+  const ogg_int16_t *in;
+  ogg_int16_t       *end;
+  ogg_int16_t       *out;
+  ogg_int16_t        w[64]; /*Scratch: scaled input, later the second pass.*/
+  int                i;
+  /*Add two extra bits of working precision to improve accuracy; any more and
+     we could overflow.*/
+  for(i=0;i<64;i++)w[i]=_x[i]<<2;
+  /*These biases correct for some systematic error that remains in the full
+     fDCT->iDCT round trip.*/
+  w[0]+=(w[0]!=0)+1;
+  w[1]++;
+  w[8]--;
+  /*Transform columns of w into rows of _y.*/
+  for(in=w,out=_y,end=out+64;out<end;in++,out+=8)oc_fdct8(out,in);
+  /*Transform columns of _y into rows of w.*/
+  for(in=_y,out=w,end=out+64;out<end;in++,out+=8)oc_fdct8(out,in);
+  /*Round the result back to the external working precision (which is still
+     scaled by four relative to the orthogonal result).
+    TODO: We should just update the external working precision.*/
+  for(i=0;i<64;i++)_y[i]=w[i]+2>>2;
+}
+
+
+
+/*This does not seem to outperform simple LFE border padding before MC.
+  It yields higher PSNR, but much higher bitrate usage.*/
+#if 0
+typedef struct oc_extension_info oc_extension_info;
+
+
+
+/*Information needed to pad boundary blocks.
+  We multiply each row/column by an extension matrix that fills in the padding
+   values as a linear combination of the active values, so that an equivalent
+   number of coefficients are forced to zero.
+  This costs at most 16 multiplies, the same as a 1-D fDCT itself, and as
+   little as 7 multiplies.
+  We compute the extension matrices for every possible shape in advance, as
+   there are only 35.
+  The coefficients for all matrices are stored in a single array to take
+   advantage of the overlap and repetitiveness of many of the shapes.
+  A similar technique is applied to the offsets into this array.
+  This reduces the required table storage by about 48%.
+  See tools/extgen.c for details.
+  We could conceivably do the same for all 256 possible shapes.*/
+struct oc_extension_info{
+  /*The mask of the active pixels in the shape.*/
+  short                     mask;
+  /*The number of active pixels in the shape.*/
+  short                     na;
+  /*The extension matrix.
+    This is (8-na)xna: one row of na weights for each padding pixel.*/
+  const ogg_int16_t *const *ext;
+  /*The pixel indices: na active pixels followed by 8-na padding pixels.*/
+  unsigned char             pi[8];
+  /*The coefficient indices: na unconstrained coefficients followed by 8-na
+     coefficients to be forced to zero.*/
+  unsigned char             ci[8];
+};
+
+
+/*The number of shapes we need (the size of OC_EXTENSION_INFO).*/
+#define OC_NSHAPES   (35)
+
+static const ogg_int16_t OC_EXT_COEFFS[229]={ /*Pool indexed via OC_EXT_ROWS.*/
+  0x7FFF,0xE1F8,0x6903,0xAA79,0x5587,0x7FFF,0x1E08,0x7FFF,
+  0x5587,0xAA79,0x6903,0xE1F8,0x7FFF,0x0000,0x0000,0x0000,
+  0x7FFF,0x0000,0x0000,0x7FFF,0x8000,0x7FFF,0x0000,0x0000,
+  0x7FFF,0xE1F8,0x1E08,0xB0A7,0xAA1D,0x337C,0x7FFF,0x4345,
+  0x2267,0x4345,0x7FFF,0x337C,0xAA1D,0xB0A7,0x8A8C,0x4F59,
+  0x03B4,0xE2D6,0x7FFF,0x2CF3,0x7FFF,0xE2D6,0x03B4,0x4F59,
+  0x8A8C,0x1103,0x7AEF,0x5225,0xDF60,0xC288,0xDF60,0x5225,
+  0x7AEF,0x1103,0x668A,0xD6EE,0x3A16,0x0E6C,0xFA07,0x0E6C,
+  0x3A16,0xD6EE,0x668A,0x2A79,0x2402,0x980F,0x50F5,0x4882,
+  0x50F5,0x980F,0x2402,0x2A79,0xF976,0x2768,0x5F22,0x2768,
+  0xF976,0x1F91,0x76C1,0xE9AE,0x76C1,0x1F91,0x7FFF,0xD185,
+  0x0FC8,0xD185,0x7FFF,0x4F59,0x4345,0xED62,0x4345,0x4F59,
+  0xF574,0x5D99,0x2CF3,0x5D99,0xF574,0x5587,0x3505,0x30FC,
+  0xF482,0x953C,0xEAC4,0x7FFF,0x4F04,0x7FFF,0xEAC4,0x953C,
+  0xF482,0x30FC,0x4F04,0x273D,0xD8C3,0x273D,0x1E09,0x61F7,
+  0x1E09,0x273D,0xD8C3,0x273D,0x4F04,0x30FC,0xA57E,0x153C,
+  0x6AC4,0x3C7A,0x1E08,0x3C7A,0x6AC4,0x153C,0xA57E,0x7FFF,
+  0xA57E,0x5A82,0x6AC4,0x153C,0xC386,0xE1F8,0xC386,0x153C,
+  0x6AC4,0x5A82,0xD8C3,0x273D,0x7FFF,0xE1F7,0x7FFF,0x273D,
+  0xD8C3,0x4F04,0x30FC,0xD8C3,0x273D,0xD8C3,0x30FC,0x4F04,
+  0x1FC8,0x67AD,0x1853,0xE038,0x1853,0x67AD,0x1FC8,0x4546,
+  0xE038,0x1FC8,0x3ABA,0x1FC8,0xE038,0x4546,0x3505,0x5587,
+  0xF574,0xBC11,0x78F4,0x4AFB,0xE6F3,0x4E12,0x3C11,0xF8F4,
+  0x4AFB,0x3C7A,0xF88B,0x3C11,0x78F4,0xCAFB,0x7FFF,0x08CC,
+  0x070C,0x236D,0x5587,0x236D,0x070C,0xF88B,0x3C7A,0x4AFB,
+  0xF8F4,0x3C11,0x7FFF,0x153C,0xCAFB,0x153C,0x7FFF,0x1E08,
+  0xE1F8,0x7FFF,0x08CC,0x7FFF,0xCAFB,0x78F4,0x3C11,0x4E12,
+  0xE6F3,0x4AFB,0x78F4,0xBC11,0xFE3D,0x7FFF,0xFE3D,0x2F3A,
+  0x7FFF,0x2F3A,0x89BC,0x7FFF,0x89BC
+};
+
+static const ogg_int16_t *const OC_EXT_ROWS[96]={ /*Rows in OC_EXT_COEFFS.*/
+  OC_EXT_COEFFS+   0,OC_EXT_COEFFS+   0,OC_EXT_COEFFS+   0,OC_EXT_COEFFS+   0,
+  OC_EXT_COEFFS+   0,OC_EXT_COEFFS+   0,OC_EXT_COEFFS+   0,OC_EXT_COEFFS+   6,
+  OC_EXT_COEFFS+  27,OC_EXT_COEFFS+  38,OC_EXT_COEFFS+  43,OC_EXT_COEFFS+  32,
+  OC_EXT_COEFFS+  49,OC_EXT_COEFFS+  58,OC_EXT_COEFFS+  67,OC_EXT_COEFFS+  71,
+  OC_EXT_COEFFS+  62,OC_EXT_COEFFS+  53,OC_EXT_COEFFS+  12,OC_EXT_COEFFS+  15,
+  OC_EXT_COEFFS+  14,OC_EXT_COEFFS+  13,OC_EXT_COEFFS+  76,OC_EXT_COEFFS+  81,
+  OC_EXT_COEFFS+  86,OC_EXT_COEFFS+  91,OC_EXT_COEFFS+  96,OC_EXT_COEFFS+  98,
+  OC_EXT_COEFFS+  93,OC_EXT_COEFFS+  88,OC_EXT_COEFFS+  83,OC_EXT_COEFFS+  78,
+  OC_EXT_COEFFS+  12,OC_EXT_COEFFS+  15,OC_EXT_COEFFS+  15,OC_EXT_COEFFS+  12,
+  OC_EXT_COEFFS+  12,OC_EXT_COEFFS+  15,OC_EXT_COEFFS+  12,OC_EXT_COEFFS+  15,
+  OC_EXT_COEFFS+  15,OC_EXT_COEFFS+  12,OC_EXT_COEFFS+ 103,OC_EXT_COEFFS+ 108,
+  OC_EXT_COEFFS+ 126,OC_EXT_COEFFS+  16,OC_EXT_COEFFS+ 137,OC_EXT_COEFFS+ 141,
+  OC_EXT_COEFFS+  20,OC_EXT_COEFFS+ 130,OC_EXT_COEFFS+ 113,OC_EXT_COEFFS+ 116,
+  OC_EXT_COEFFS+ 146,OC_EXT_COEFFS+ 153,OC_EXT_COEFFS+ 160,OC_EXT_COEFFS+ 167,
+  OC_EXT_COEFFS+ 170,OC_EXT_COEFFS+ 163,OC_EXT_COEFFS+ 156,OC_EXT_COEFFS+ 149,
+  OC_EXT_COEFFS+ 119,OC_EXT_COEFFS+ 122,OC_EXT_COEFFS+ 174,OC_EXT_COEFFS+ 177,
+  OC_EXT_COEFFS+ 182,OC_EXT_COEFFS+ 187,OC_EXT_COEFFS+ 192,OC_EXT_COEFFS+ 197,
+  OC_EXT_COEFFS+ 202,OC_EXT_COEFFS+ 207,OC_EXT_COEFFS+ 210,OC_EXT_COEFFS+ 215,
+  OC_EXT_COEFFS+ 179,OC_EXT_COEFFS+ 189,OC_EXT_COEFFS+  24,OC_EXT_COEFFS+ 204,
+  OC_EXT_COEFFS+ 184,OC_EXT_COEFFS+ 194,OC_EXT_COEFFS+ 212,OC_EXT_COEFFS+ 199,
+  OC_EXT_COEFFS+ 217,OC_EXT_COEFFS+ 100,OC_EXT_COEFFS+ 134,OC_EXT_COEFFS+ 135,
+  OC_EXT_COEFFS+ 135,OC_EXT_COEFFS+  12,OC_EXT_COEFFS+  15,OC_EXT_COEFFS+ 134,
+  OC_EXT_COEFFS+ 134,OC_EXT_COEFFS+ 135,OC_EXT_COEFFS+ 220,OC_EXT_COEFFS+ 223,
+  OC_EXT_COEFFS+ 226,OC_EXT_COEFFS+ 227,OC_EXT_COEFFS+ 224,OC_EXT_COEFFS+ 221
+};
+
+static const oc_extension_info OC_EXTENSION_INFO[OC_NSHAPES]={ /*mask,na,ext,pi,ci*/
+  {0x7F,7,OC_EXT_ROWS+  0,{0,1,2,3,4,5,6,7},{0,1,2,4,5,6,7,3}},
+  {0xFE,7,OC_EXT_ROWS+  7,{1,2,3,4,5,6,7,0},{0,1,2,4,5,6,7,3}},
+  {0x3F,6,OC_EXT_ROWS+  8,{0,1,2,3,4,5,7,6},{0,1,3,4,6,7,5,2}},
+  {0xFC,6,OC_EXT_ROWS+ 10,{2,3,4,5,6,7,1,0},{0,1,3,4,6,7,5,2}},
+  {0x1F,5,OC_EXT_ROWS+ 12,{0,1,2,3,4,7,6,5},{0,2,3,5,7,6,4,1}},
+  {0xF8,5,OC_EXT_ROWS+ 15,{3,4,5,6,7,2,1,0},{0,2,3,5,7,6,4,1}},
+  {0x0F,4,OC_EXT_ROWS+ 18,{0,1,2,3,7,6,5,4},{0,2,4,6,7,5,3,1}},
+  {0xF0,4,OC_EXT_ROWS+ 18,{4,5,6,7,3,2,1,0},{0,2,4,6,7,5,3,1}},
+  {0x07,3,OC_EXT_ROWS+ 22,{0,1,2,7,6,5,4,3},{0,3,6,7,5,4,2,1}},
+  {0xE0,3,OC_EXT_ROWS+ 27,{5,6,7,4,3,2,1,0},{0,3,6,7,5,4,2,1}},
+  {0x03,2,OC_EXT_ROWS+ 32,{0,1,7,6,5,4,3,2},{0,4,7,6,5,3,2,1}},
+  {0xC0,2,OC_EXT_ROWS+ 32,{6,7,5,4,3,2,1,0},{0,4,7,6,5,3,2,1}},
+  {0x01,1,OC_EXT_ROWS+  0,{0,7,6,5,4,3,2,1},{0,7,6,5,4,3,2,1}},
+  {0x80,1,OC_EXT_ROWS+  0,{7,6,5,4,3,2,1,0},{0,7,6,5,4,3,2,1}},
+  {0x7E,6,OC_EXT_ROWS+ 42,{1,2,3,4,5,6,7,0},{0,1,2,5,6,7,4,3}},
+  {0x7C,5,OC_EXT_ROWS+ 44,{2,3,4,5,6,7,1,0},{0,1,4,5,7,6,3,2}},
+  {0x3E,5,OC_EXT_ROWS+ 47,{1,2,3,4,5,7,6,0},{0,1,4,5,7,6,3,2}},
+  {0x78,4,OC_EXT_ROWS+ 50,{3,4,5,6,7,2,1,0},{0,4,5,7,6,3,2,1}},
+  {0x3C,4,OC_EXT_ROWS+ 54,{2,3,4,5,7,6,1,0},{0,3,4,7,6,5,2,1}},
+  {0x1E,4,OC_EXT_ROWS+ 58,{1,2,3,4,7,6,5,0},{0,4,5,7,6,3,2,1}},
+  {0x70,3,OC_EXT_ROWS+ 62,{4,5,6,7,3,2,1,0},{0,5,7,6,4,3,2,1}},
+  {0x38,3,OC_EXT_ROWS+ 67,{3,4,5,7,6,2,1,0},{0,5,6,7,4,3,2,1}},
+  {0x1C,3,OC_EXT_ROWS+ 72,{2,3,4,7,6,5,1,0},{0,5,6,7,4,3,2,1}},
+  {0x0E,3,OC_EXT_ROWS+ 77,{1,2,3,7,6,5,4,0},{0,5,7,6,4,3,2,1}},
+  {0x60,2,OC_EXT_ROWS+ 82,{5,6,7,4,3,2,1,0},{0,2,7,6,5,4,3,1}},
+  {0x30,2,OC_EXT_ROWS+ 36,{4,5,7,6,3,2,1,0},{0,4,7,6,5,3,2,1}},
+  {0x18,2,OC_EXT_ROWS+ 90,{3,4,7,6,5,2,1,0},{0,1,7,6,5,4,3,2}},
+  {0x0C,2,OC_EXT_ROWS+ 34,{2,3,7,6,5,4,1,0},{0,4,7,6,5,3,2,1}},
+  {0x06,2,OC_EXT_ROWS+ 84,{1,2,7,6,5,4,3,0},{0,2,7,6,5,4,3,1}},
+  {0x40,1,OC_EXT_ROWS+  0,{6,7,5,4,3,2,1,0},{0,7,6,5,4,3,2,1}},
+  {0x20,1,OC_EXT_ROWS+  0,{5,7,6,4,3,2,1,0},{0,7,6,5,4,3,2,1}},
+  {0x10,1,OC_EXT_ROWS+  0,{4,7,6,5,3,2,1,0},{0,7,6,5,4,3,2,1}},
+  {0x08,1,OC_EXT_ROWS+  0,{3,7,6,5,4,2,1,0},{0,7,6,5,4,3,2,1}},
+  {0x04,1,OC_EXT_ROWS+  0,{2,7,6,5,4,3,1,0},{0,7,6,5,4,3,2,1}},
+  {0x02,1,OC_EXT_ROWS+  0,{1,7,6,5,4,3,2,0},{0,7,6,5,4,3,2,1}}
+};
+
+
+
+/*Pads a single column of a partial block and then performs a forward Type-II
+   DCT on the result.
+  The input is scaled by a factor of 4 and biased appropriately for the current
+   fDCT implementation.
+  The output is scaled by an additional factor of 2 from the orthonormal
+   version of the transform.
+  _y: The buffer to store the result in.
+      Data will be placed in the first 8 entries (e.g., in a row of an 8x8 block).
+  _x: The input coefficients; padding entries may be overwritten in place.
+      Every 8th entry is used (e.g., from a column of an 8x8 block).
+  _e: The extension information for the shape.*/
+static void oc_fdct8_ext(ogg_int16_t _y[8],ogg_int16_t *_x,
+ const oc_extension_info *_e){
+  const unsigned char *pi;
+  int                  na;
+  na=_e->na;
+  pi=_e->pi;
+  if(na==1){
+    int ci;
+    /*While the branch below is still correct for shapes with na==1, we can
+       perform the entire transform with just 1 multiply in this case instead
+       of 23. NOTE(review): _x[pi[0]] lacks the <<3 stride used below; confirm.*/
+    _y[0]=(ogg_int16_t)(OC_DIV2_16(OC_C4S4*(_x[pi[0]])));
+    for(ci=1;ci<8;ci++)_y[ci]=0;
+  }
+  else{
+    const ogg_int16_t *const *ext;
+    int                       zpi;
+    int                       api;
+    int                       nz;
+    /*First multiply by the extension matrix to compute the padding values.*/
+    nz=8-na;
+    ext=_e->ext;
+    for(zpi=0;zpi<nz;zpi++){
+      ogg_int32_t v;
+      v=0;
+      for(api=0;api<na;api++){
+        v+=ext[zpi][api]*(ogg_int32_t)(_x[pi[api]<<3]<<1);
+      }
+      _x[pi[na+zpi]<<3]=(ogg_int16_t)(v+0x8000>>16)+1>>1;
+    }
+    oc_fdct8(_y,_x); /*With the padding filled in, run the ordinary 1-D fDCT.*/
+  }
+}
+
+/*Performs a forward 8x8 Type-II DCT transform on blocks which overlap the
+   border of the picture region.
+  This method ONLY works with rectangular regions.
+  _border: A description of which pixels are inside the border.
+  _y:      The buffer to store the result in.
+           This may be the same as _x.
+  _x:      The input pixel values.
+           Pixel values outside the border will be ignored.*/
+void oc_fdct8x8_border(const oc_border_info *_border,
+ ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+  ogg_int16_t             *in;
+  ogg_int16_t             *out;
+  ogg_int16_t              w[64]; /*Working copy of the block, scaled by 4.*/
+  ogg_int64_t              mask;  /*Consumed 8 bits at a time below.*/
+  const oc_extension_info *cext;  /*Shape matching cmask, or NULL if full.*/
+  const oc_extension_info *rext;  /*Shape matching rmask, or NULL if full.*/
+  int                      cmask; /*Bit ri is set if mask byte ri is non-zero.*/
+  int                      rmask; /*The OR of all 8 mask bytes.*/
+  int                      ri;
+  int                      ci;
+  /*Identify the shapes of the non-zero rows and columns.*/
+  rmask=cmask=0;
+  mask=_border->mask;
+  for(ri=0;ri<8;ri++){
+    /*This aggregation is _only_ correct for rectangular masks.*/
+    cmask|=((mask&0xFF)!=0)<<ri;
+    rmask|=mask&0xFF;
+    mask>>=8;
+  }
+  /*Find the associated extension info for these shapes.*/
+  if(cmask==0xFF)cext=NULL;
+  else for(cext=OC_EXTENSION_INFO;cext->mask!=cmask;){
+    /*If we somehow can't find the shape, then just do an unpadded fDCT.
+      It won't be efficient, but it should still be correct.*/
+    if(++cext>=OC_EXTENSION_INFO+OC_NSHAPES){
+      oc_enc_fdct8x8_c(_y,_x);
+      return;
+    }
+  }
+  if(rmask==0xFF)rext=NULL;
+  else for(rext=OC_EXTENSION_INFO;rext->mask!=rmask;){
+    /*If we somehow can't find the shape, then just do an unpadded fDCT.
+      It won't be efficient, but it should still be correct.*/
+    if(++rext>=OC_EXTENSION_INFO+OC_NSHAPES){
+      oc_enc_fdct8x8_c(_y,_x);
+      return;
+    }
+  }
+  /*Add two extra bits of working precision to improve accuracy; any more and
+     we could overflow.*/
+  for(ci=0;ci<64;ci++)w[ci]=_x[ci]<<2;
+  /*These biases correct for some systematic error that remains in the full
+     fDCT->iDCT round trip.
+    We can safely add them before padding, since if these pixel values are
+     overwritten, we didn't care what they were anyway (and the unbiased values
+     will usually yield smaller DCT coefficient magnitudes).*/
+  w[0]+=(w[0]!=0)+1;
+  w[1]++;
+  w[8]--;
+  /*Transform the columns.
+    We can ignore zero columns without a problem.*/
+  in=w;
+  out=_y;
+  if(cext==NULL)for(ci=0;ci<8;ci++)oc_fdct8(out+(ci<<3),in+ci);
+  else for(ci=0;ci<8;ci++)if(rmask&(1<<ci))oc_fdct8_ext(out+(ci<<3),in+ci,cext);
+  /*Transform the rows.
+    We transform even rows that are supposedly zero, because rounding errors
+     may make them slightly non-zero, and this will give a more precise
+     reconstruction with very small quantizers.*/
+  in=_y;
+  out=w;
+  if(rext==NULL)for(ri=0;ri<8;ri++)oc_fdct8(out+(ri<<3),in+ri);
+  else for(ri=0;ri<8;ri++)oc_fdct8_ext(out+(ri<<3),in+ri,rext);
+  /*Round the result back to the external working precision (which is still
+     scaled by four relative to the orthogonal result).
+    TODO: We should just update the external working precision.*/
+  for(ci=0;ci<64;ci++)_y[ci]=w[ci]+2>>2;
+}
+#endif
diff --git a/lib/fragment.c b/lib/fragment.c
new file mode 100644
index 0000000..15372e9
--- /dev/null
+++ b/lib/fragment.c
@@ -0,0 +1,87 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: fragment.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+#include <string.h>
+#include "internal.h"
+
+void oc_frag_copy(const oc_theora_state *_state,unsigned char *_dst,
+ const unsigned char *_src,int _ystride){
+  (*_state->opt_vtable.frag_copy)(_dst,_src,_ystride); /*Dispatch via vtable.*/
+}
+
+void oc_frag_copy_c(unsigned char *_dst,const unsigned char *_src,int _ystride){
+  int rows_left;
+  /*Copy 8 rows of 8 bytes each; consecutive rows are _ystride bytes apart in
+     both the source and the destination.*/
+  for(rows_left=8;rows_left>0;rows_left--,_dst+=_ystride,_src+=_ystride){
+    memcpy(_dst,_src,8*sizeof(*_dst));
+  }
+}
+
+void oc_frag_recon_intra(const oc_theora_state *_state,unsigned char *_dst,
+ int _ystride,const ogg_int16_t _residue[64]){
+  _state->opt_vtable.frag_recon_intra(_dst,_ystride,_residue); /*Via vtable.*/
+}
+
+void oc_frag_recon_intra_c(unsigned char *_dst,int _ystride,
+ const ogg_int16_t _residue[64]){
+  const ogg_int16_t *row;
+  /*Each output pixel is the residue plus 128, passed through OC_CLAMP255.*/
+  for(row=_residue;row<_residue+64;row+=8,_dst+=_ystride){
+    int x;
+    for(x=0;x<8;x++)_dst[x]=OC_CLAMP255(row[x]+128);
+  }
+}
+
+void oc_frag_recon_inter(const oc_theora_state *_state,unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){
+  _state->opt_vtable.frag_recon_inter(_dst,_src,_ystride,_residue); /*Via vtable.*/
+}
+
+void oc_frag_recon_inter_c(unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){
+  const ogg_int16_t *row;
+  /*Each output pixel is the predictor pixel from _src plus the residue,
+     passed through OC_CLAMP255.*/
+  for(row=_residue;row<_residue+64;row+=8,_dst+=_ystride,_src+=_ystride){
+    int x;
+    for(x=0;x<8;x++)_dst[x]=OC_CLAMP255(row[x]+_src[x]);
+  }
+}
+
+void oc_frag_recon_inter2(const oc_theora_state *_state,unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride,
+ const ogg_int16_t _residue[64]){
+  _state->opt_vtable.frag_recon_inter2(_dst,_src1,_src2,_ystride,_residue); /*Via vtable.*/
+}
+
+void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]){
+  const ogg_int16_t *row;
+  /*The predictor is the truncated average of the two reference pixels.*/
+  for(row=_residue;row<_residue+64;row+=8,_dst+=_ystride){
+    int x;
+    for(x=0;x<8;x++)_dst[x]=OC_CLAMP255(row[x]+(_src1[x]+_src2[x]>>1));
+    _src1+=_ystride;
+    _src2+=_ystride;
+  }
+}
+
+void oc_restore_fpu(const oc_theora_state *_state){
+  _state->opt_vtable.restore_fpu(); /*Dispatch through the state's vtable.*/
+}
+
+void oc_restore_fpu_c(void){} /*The C version is intentionally empty.*/
diff --git a/lib/huffdec.c b/lib/huffdec.c
new file mode 100644
index 0000000..8cf27f0
--- /dev/null
+++ b/lib/huffdec.c
@@ -0,0 +1,489 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: huffdec.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <ogg/ogg.h>
+#include "huffdec.h"
+#include "decint.h"
+
+
+/*Our own offsetof: the ANSI one is broken on some platforms (e.g., older DECs).*/
+#define _ogg_offsetof(_type,_field)\
+ ((size_t)((char *)&((_type *)0)->_field-(char *)0))
+
+/*The number of internal tokens for each spec token, indexed by spec token value.*/
+static const unsigned char OC_DCT_TOKEN_MAP_ENTRIES[TH_NDCT_TOKENS]={
+  1,1,1,4,8,1,1,8,1,1,1,1,1,2,2,2,2,4,8,2,2,2,4,2,2,2,2,2,8,2,4,8
+};
+
+/*The map from external spec-defined tokens to internal tokens.
+  This is constructed so that any extra bits read with the original token value
+   can be masked off the least significant bits of its internal token index.
+  In addition, all of the tokens which require additional extra bits are placed
+   at the start of the list, and grouped by type.
+  OC_DCT_REPEAT_RUN3_TOKEN is placed first, as it is an extra-special case, so
+   giving it index 0 may simplify comparisons on some architectures.
+  These requirements require some substantial reordering.*/
+static const unsigned char OC_DCT_TOKEN_MAP[TH_NDCT_TOKENS]={
+  /*OC_DCT_EOB1_TOKEN (0 extra bits)*/
+  15,
+  /*OC_DCT_EOB2_TOKEN (0 extra bits)*/
+  16,
+  /*OC_DCT_EOB3_TOKEN (0 extra bits)*/
+  17,
+  /*OC_DCT_REPEAT_RUN0_TOKEN (2 extra bits)*/
+  88,
+  /*OC_DCT_REPEAT_RUN1_TOKEN (3 extra bits)*/
+  80,
+  /*OC_DCT_REPEAT_RUN2_TOKEN (4 extra bits)*/
+   1,
+  /*OC_DCT_REPEAT_RUN3_TOKEN (12 extra bits)*/
+   0,
+  /*OC_DCT_SHORT_ZRL_TOKEN (3 extra bits)*/
+  48,
+  /*OC_DCT_ZRL_TOKEN (6 extra bits)*/
+  14,
+  /*OC_ONE_TOKEN (0 extra bits)*/
+  56,
+  /*OC_MINUS_ONE_TOKEN (0 extra bits)*/
+  57,
+  /*OC_TWO_TOKEN (0 extra bits)*/
+  58,
+  /*OC_MINUS_TWO_TOKEN (0 extra bits)*/
+  59,
+  /*OC_DCT_VAL_CAT2 (4 consecutive spec tokens, 1 extra bit each)*/
+  60,
+  62,
+  64,
+  66,
+  /*OC_DCT_VAL_CAT3 (2 extra bits)*/
+  68,
+  /*OC_DCT_VAL_CAT4 (3 extra bits)*/
+  72,
+  /*OC_DCT_VAL_CAT5 (4 extra bits)*/
+   2,
+  /*OC_DCT_VAL_CAT6 (5 extra bits)*/
+   4,
+  /*OC_DCT_VAL_CAT7 (6 extra bits)*/
+   6,
+  /*OC_DCT_VAL_CAT8 (10 extra bits)*/
+   8,
+  /*OC_DCT_RUN_CAT1A (5 consecutive spec tokens, 1 extra bit each)*/
+  18,
+  20,
+  22,
+  24,
+  26,
+  /*OC_DCT_RUN_CAT1B (3 extra bits)*/
+  32,
+  /*OC_DCT_RUN_CAT1C (4 extra bits)*/
+  12,
+  /*OC_DCT_RUN_CAT2A (2 extra bits)*/
+  28,
+  /*OC_DCT_RUN_CAT2B (3 extra bits)*/
+  40
+};
+
+/*These three functions are really part of the bitpack.c module, but
+   they are only used here.
+  Declaring local static versions so they can be inlined saves considerable
+   function call overhead.*/
+
+static oc_pb_window oc_pack_refill(oc_pack_buf *_b,int _bits){
+  const unsigned char *ptr;
+  const unsigned char *stop;
+  oc_pb_window         window;
+  int                  available;
+  window=_b->window;
+  available=_b->bits;
+  ptr=_b->ptr;
+  stop=_b->stop;
+  /*This version of _refill() doesn't bother setting eof because we won't
+     check for it after we've started decoding DCT tokens.*/
+  if(ptr>=stop)available=OC_LOTS_OF_BITS;
+  while(available<=OC_PB_WINDOW_SIZE-8){ /*Pull in whole bytes while they fit.*/
+    available+=8;
+    window|=(oc_pb_window)*ptr++<<OC_PB_WINDOW_SIZE-available;
+    if(ptr>=stop)available=OC_LOTS_OF_BITS;
+  }
+  _b->ptr=ptr;
+  if(_bits>available)window|=*ptr>>(available&7); /*Peek at part of the next byte.*/
+  _b->bits=available;
+  return window; /*Not stored in _b here; oc_pack_look() writes it back.*/
+}
+
+
+/*Returns the next _bits bits (from the top of the window) without advancing.
+  Here we assume 0<=_bits&&_bits<=32; the window is refilled if it runs short.*/
+static long oc_pack_look(oc_pack_buf *_b,int _bits){
+  oc_pb_window window;
+  int          available;
+  long         result;
+  window=_b->window;
+  available=_b->bits;
+  if(_bits==0)return 0;
+  if(_bits>available)_b->window=window=oc_pack_refill(_b,_bits);
+  result=window>>OC_PB_WINDOW_SIZE-_bits;
+  return result;
+}
+
+/*Advance the bit pointer by _bits bits, discarding them from the window.*/
+static void oc_pack_adv(oc_pack_buf *_b,int _bits){
+  /*We ignore the special cases for _bits==0 and _bits==32 here, since they are
+     never actually used.
+    OC_HUFF_SLUSH (defined below) would have to be at least 27 to actually read
+     32 bits in a single go, and would require a 32 GB lookup table (assuming
+     8 byte pointers, since 4 byte pointers couldn't fit such a table).*/
+  _b->window<<=_bits;
+  _b->bits-=_bits;
+}
+
+
+/*The log_2 of how far the size of a lookup table is allowed to grow relative
+   to the number of unique nodes it contains.
+  E.g., if OC_HUFF_SLUSH is 2, then at most 75% of the space in the tree is
+   wasted (each node will have an amortized cost of at most 20 bytes when using
+   4-byte pointers).
+  Larger numbers can decode tokens with fewer read operations, while smaller
+   numbers may save more space (requiring as little as 8 bytes amortized per
+   node, though there will be more nodes).
+  With a sample file:
+  32233473 read calls are required when no tree collapsing is done (100.0%).
+  19269269 read calls are required when OC_HUFF_SLUSH is 0 (59.8%).
+  11144969 read calls are required when OC_HUFF_SLUSH is 1 (34.6%).
+  10538563 read calls are required when OC_HUFF_SLUSH is 2 (32.7%).
+  10192578 read calls are required when OC_HUFF_SLUSH is 3 (31.6%).
+  Since a value of 1 gets us the vast majority of the speed-up with only a
+   small amount of wasted memory, this is what we use.*/
+#define OC_HUFF_SLUSH (1)
+
+
+/*Computes how many bytes a single Huffman tree node occupies.
+  The fixed part of the node ends where its nodes[] member begins; an
+   internal node is followed by a table of 1<<_nbits child pointers.
+  _nbits: The depth of the subtree the node represents (0 for a leaf, which
+           carries no child table).
+  Return: The storage needed for the node, in bytes.*/
+static size_t oc_huff_node_size(int _nbits){
+  size_t header;
+  header=_ogg_offsetof(oc_huff_node,nodes);
+  if(_nbits<=0)return header;
+  return header+sizeof(oc_huff_node *)*(1<<_nbits);
+}
+
+static oc_huff_node *oc_huff_node_init(char **_storage,size_t _size,int _nbits){
+  oc_huff_node *ret;
+  ret=(oc_huff_node *)*_storage; /*The new node lives at the storage cursor.*/
+  ret->nbits=(unsigned char)_nbits;
+  (*_storage)+=_size; /*Advance the cursor past the node just claimed.*/
+  return ret;
+}
+
+
+/*Determines the size in bytes of a Huffman tree.
+  _node: The root of the tree to measure.
+         A child that fills several consecutive table slots (its depth is
+          less than the parent's nbits) is only counted once.
+  Return: The number of bytes required to store the tree.*/
+static size_t oc_huff_tree_size(const oc_huff_node *_node){
+  size_t size;
+  size=oc_huff_node_size(_node->nbits);
+  if(_node->nbits){
+    int nchildren;
+    int i;
+    nchildren=1<<_node->nbits;
+    for(i=0;i<nchildren;i+=1<<_node->nbits-_node->nodes[i]->depth){
+      size+=oc_huff_tree_size(_node->nodes[i]);
+    }
+  }
+  return size;
+}
+
+
+/*Unpacks a sub-tree from the given buffer.
+  _opb:      The buffer to unpack from.
+  _binodes:  The nodes to store the sub-tree in.
+  _nbinodes: The number of nodes available for the sub-tree.
+  Return: The number of nodes used on success, or a negative value on error.*/
+static int oc_huff_tree_unpack(oc_pack_buf *_opb,
+ oc_huff_node *_binodes,int _nbinodes){
+  oc_huff_node *binode;
+  long          bits;
+  int           nused;
+  if(_nbinodes<1)return TH_EBADHEADER;
+  binode=_binodes;
+  nused=0;
+  bits=oc_pack_read1(_opb);
+  if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
+  /*Read an internal node:*/
+  if(!bits){
+    int ret;
+    nused++;
+    binode->nbits=1;
+    binode->depth=1;
+    binode->nodes[0]=_binodes+nused;
+    ret=oc_huff_tree_unpack(_opb,_binodes+nused,_nbinodes-nused);
+    if(ret>=0){
+      nused+=ret;
+      binode->nodes[1]=_binodes+nused;
+      ret=oc_huff_tree_unpack(_opb,_binodes+nused,_nbinodes-nused);
+    }
+    if(ret<0)return ret;
+    nused+=ret;
+  }
+  /*Read a leaf node:*/
+  else{
+    int ntokens;
+    int token;
+    int i;
+    bits=oc_pack_read(_opb,OC_NDCT_TOKEN_BITS);
+    if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
+    /*Find out how many internal tokens we translate this external token into.*/
+    ntokens=OC_DCT_TOKEN_MAP_ENTRIES[bits];
+    if(_nbinodes<2*ntokens-1)return TH_EBADHEADER;
+    /*Fill in a complete binary tree pointing to the internal tokens.*/
+    for(i=1;i<ntokens;i<<=1){
+      int j;
+      binode=_binodes+nused;
+      nused+=i;
+      for(j=0;j<i;j++){
+        binode[j].nbits=1;
+        binode[j].depth=1;
+        binode[j].nodes[0]=_binodes+nused+2*j;
+        binode[j].nodes[1]=_binodes+nused+2*j+1;
+      }
+    }
+    /*And now the leaf nodes with those tokens.*/
+    token=OC_DCT_TOKEN_MAP[bits];
+    for(i=0;i<ntokens;i++){
+      binode=_binodes+nused++;
+      binode->nbits=0;
+      binode->depth=1;
+      binode->token=token+i;
+    }
+  }
+  return nused;
+}
+
+/*Measures how far below the given root the nearest leaf lies.
+  The tree must be binary.
+  _binode: The root of the sub-tree to measure.
+           _binode->nbits must be 0 or 1.
+  Return: The number of levels between the root and its shallowest leaf; this
+           is 0 when the root is itself a leaf.*/
+static int oc_huff_tree_mindepth(oc_huff_node *_binode){
+  int left;
+  int right;
+  if(!_binode->nbits)return 0;
+  left=oc_huff_tree_mindepth(_binode->nodes[0]);
+  right=oc_huff_tree_mindepth(_binode->nodes[1]);
+  return 1+OC_MINI(left,right);
+}
+
+/*Counts the nodes a jump table of the given depth would point at: every
+   leaf at that depth or shallower, plus every internal node found at exactly
+   that depth.
+  The tree must be binary.
+  _binode: The root of the sub-tree to examine (nbits must be 0 or 1).
+  _depth:  The number of levels below _binode that the table would span.
+  Return: The number of entries such a jump table would contain.*/
+static int oc_huff_tree_occupancy(oc_huff_node *_binode,int _depth){
+  int count;
+  if(_depth<=0||_binode->nbits==0)return 1;
+  count=oc_huff_tree_occupancy(_binode->nodes[0],_depth-1);
+  count+=oc_huff_tree_occupancy(_binode->nodes[1],_depth-1);
+  return count;
+}
+
+/*Makes a copy of the given Huffman tree, carving nodes out of *_storage.
+  _node: The Huffman tree to copy; _storage: Cursor into the output buffer.
+  Return: The copy of the Huffman tree.*/
+static oc_huff_node *oc_huff_tree_copy(const oc_huff_node *_node,
+ char **_storage){
+  oc_huff_node *ret;
+  ret=oc_huff_node_init(_storage,oc_huff_node_size(_node->nbits),_node->nbits);
+  ret->depth=_node->depth;
+  if(_node->nbits){
+    int nchildren;
+    int i;
+    int inext;
+    nchildren=1<<_node->nbits;
+    for(i=0;i<nchildren;){
+      ret->nodes[i]=oc_huff_tree_copy(_node->nodes[i],_storage);
+      inext=i+(1<<_node->nbits-ret->nodes[i]->depth);
+      while(++i<inext)ret->nodes[i]=ret->nodes[i-1]; /*Repeat for shared slots.*/
+    }
+  }
+  else ret->token=_node->token;
+  return ret;
+}
+
+/*Computes the storage oc_huff_tree_collapse() will need for a sub-tree.
+  _binode: The root of the binary sub-tree; _binode->nbits must be 0 or 1.
+  _depth:  How many more levels to descend before a collapsed root starts.
+  Return: The size of the collapsed sub-tree(s), per oc_huff_node_size().*/
+static size_t oc_huff_tree_collapse_size(oc_huff_node *_binode,int _depth){
+  size_t total;
+  int    depth;
+  int    prev;
+  int    cur;
+  if(_depth>0&&_binode->nbits!=0){
+    return oc_huff_tree_collapse_size(_binode->nodes[0],_depth-1)+
+     oc_huff_tree_collapse_size(_binode->nodes[1],_depth-1);
+  }
+  /*Same depth search as oc_huff_tree_collapse(): deepen while it pays off.*/
+  depth=oc_huff_tree_mindepth(_binode);
+  for(cur=1<<depth;;depth++){
+    prev=cur;
+    cur=oc_huff_tree_occupancy(_binode,depth+1);
+    if(cur<=prev||cur<1<<OC_MAXI(depth+1-OC_HUFF_SLUSH,0))break;
+  }
+  total=oc_huff_node_size(depth);
+  if(depth<=0)return total;
+  return total+oc_huff_tree_collapse_size(_binode->nodes[0],depth-1)+
+   oc_huff_tree_collapse_size(_binode->nodes[1],depth-1);
+}
+
+static oc_huff_node *oc_huff_tree_collapse(oc_huff_node *_binode,
+ char **_storage);
+
+/*Populates a collapsed node's child table from the binary sub-tree below it.
+  Each node placed in the table is itself collapsed, recursively, first.
+  A leaf met above the table's depth is repeated in every slot it covers.
+  Nothing is freed here; the only change to the binary nodes is their depth.
+  _nodes:   The table, or the slice of it, to fill: 1<<_level entries.
+  _binode:  The root of the binary sub-tree that supplies the entries.
+            _binode->nbits must be 0 or 1.
+  _level:   The number of levels left between _binode and the table's depth.
+            0 means _binode itself is stored, whether leaf or internal.
+  _depth:   The depth of the table's nodes, relative to their parent.
+  _storage: Handed on to oc_huff_tree_collapse() for every node stored.*/
+static void oc_huff_node_fill(oc_huff_node **_nodes,
+ oc_huff_node *_binode,int _level,int _depth,char **_storage){
+  if(_level>0&&_binode->nbits!=0){
+    int half;
+    /*The 0 branch fills the lower half of the slice, the 1 branch the upper.*/
+    half=1<<(_level-1);
+    oc_huff_node_fill(_nodes,_binode->nodes[0],_level-1,_depth,_storage);
+    oc_huff_node_fill(_nodes+half,_binode->nodes[1],_level-1,_depth,_storage);
+  }
+  else{
+    int si;
+    /*Record how many bits it really took to get here (see oc_huff_node).*/
+    _binode->depth=(unsigned char)(_depth-_level);
+    _nodes[0]=oc_huff_tree_collapse(_binode,_storage);
+    for(si=(1<<_level)-1;si>0;si--)_nodes[si]=_nodes[0];
+  }
+}
+
+/*Collapses the top of a binary sub-tree into a single multi-bit lookup node,
+   choosing the deepest table that is still sufficiently full, and then does
+   the same, recursively, for every node that table points at.
+  _binode:  The root of the sub-tree to collapse.
+            _binode->nbits must be 0 or 1.
+  _storage: Handed to oc_huff_node_init() and oc_huff_tree_copy() for each
+             node created.
+  Return: The new root of the collapsed sub-tree.*/
+static oc_huff_node *oc_huff_tree_collapse(oc_huff_node *_binode,
+ char **_storage){
+  oc_huff_node *root;
+  int           depth;
+  int           prev;
+  int           cur;
+  /*Start at the shallowest leaf and keep deepening the table while each new
+     level adds entries and leaves it at least 1/(1<<OC_HUFF_SLUSH) full.*/
+  depth=oc_huff_tree_mindepth(_binode);
+  for(cur=1<<depth;;depth++){
+    prev=cur;
+    cur=oc_huff_tree_occupancy(_binode,depth+1);
+    if(cur<=prev||cur<1<<OC_MAXI(depth+1-OC_HUFF_SLUSH,0))break;
+  }
+  /*Nothing to merge at this node: copy the binary sub-tree as it is.*/
+  if(depth<=1)return oc_huff_tree_copy(_binode,_storage);
+  root=oc_huff_node_init(_storage,oc_huff_node_size(depth),depth);
+  root->depth=_binode->depth;
+  oc_huff_node_fill(root->nodes,_binode,depth,depth,_storage);
+  return root;
+}
+
+/*Unpacks a set of Huffman trees, and reduces each one to its collapsed
+   representation.
+  _opb:   The buffer to unpack the trees from.
+  _nodes: The table to fill with the roots of the collapsed trees.
+          If an error is returned, the entries filled by earlier iterations
+           are not freed here.
+  Return: 0 on success, or a negative value on error.*/
+int oc_huff_trees_unpack(oc_pack_buf *_opb,
+ oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]){
+  int ti;
+  for(ti=0;ti<TH_NHUFFMAN_TABLES;ti++){
+    oc_huff_node  binodes[511];
+    char         *buf;
+    int           nused;
+    /*First expand the tree, one bit per node, into scratch space.*/
+    nused=oc_huff_tree_unpack(_opb,binodes,sizeof(binodes)/sizeof(*binodes));
+    if(nused<0)return nused;
+    /*Then allocate exactly as much room as its collapsed form will take.*/
+    buf=(char *)_ogg_calloc(1,oc_huff_tree_collapse_size(binodes,0));
+    if(buf==NULL)return TH_EFAULT;
+    /*And collapse it into that room.*/
+    _nodes[ti]=oc_huff_tree_collapse(binodes,&buf);
+  }
+  return 0;
+}
+
+/*Makes a copy of the given set of Huffman trees.
+  _dst: The array to store the copy in.
+  _src: The array of trees to copy.
+  Return: 0 on success, or TH_EFAULT if an allocation fails, in which case the
+           trees copied so far are freed (their _dst slots are not reset).*/
+int oc_huff_trees_copy(oc_huff_node *_dst[TH_NHUFFMAN_TABLES],
+ const oc_huff_node *const _src[TH_NHUFFMAN_TABLES]){
+  int ti;
+  for(ti=0;ti<TH_NHUFFMAN_TABLES;ti++){
+    char *buf;
+    buf=(char *)_ogg_calloc(1,oc_huff_tree_size(_src[ti]));
+    if(buf!=NULL)_dst[ti]=oc_huff_tree_copy(_src[ti],&buf);
+    else{
+      while(--ti>=0)_ogg_free(_dst[ti]);
+      return TH_EFAULT;
+    }
+  }
+  return 0;
+}
+
+/*Releases the memory behind every tree in a set of Huffman trees.
+  _nodes: The array of tree roots to free; the pointers are not reset.*/
+void oc_huff_trees_clear(oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]){
+  oc_huff_node **end;
+  for(end=_nodes+TH_NHUFFMAN_TABLES;_nodes<end;_nodes++)_ogg_free(*_nodes);
+}
+
+/*Decodes one token from the bitstream by walking the given Huffman tree.
+  _opb:  The buffer to unpack the token from.
+  _node: The root of the tree to unpack the token with.
+  Return: The token value stored in the leaf that is reached.*/
+int oc_huff_token_decode(oc_pack_buf *_opb,const oc_huff_node *_node){
+  for(;;){
+    if(_node->nbits==0)return _node->token;
+    /*Look ahead a full table index, but advance the stream only by the depth
+       of the child that index selects.*/
+    _node=_node->nodes[oc_pack_look(_opb,_node->nbits)];
+    oc_pack_adv(_opb,_node->depth);
+  }
+}
diff --git a/lib/huffdec.h b/lib/huffdec.h
new file mode 100644
index 0000000..d7ffa0e
--- /dev/null
+++ b/lib/huffdec.h
@@ -0,0 +1,92 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: huffdec.h 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#if !defined(_huffdec_H)
+# define _huffdec_H (1)
+# include "huffman.h"
+# include "bitpack.h"
+
+
+
+typedef struct oc_huff_node oc_huff_node;
+
+/*A node in the Huffman tree.
+  Instead of storing every branching in the tree, subtrees can be collapsed
+   into one node, with a table of size 1<<nbits pointing directly to its
+   descendants nbits levels down.
+  This allows more than one bit to be read at a time, and avoids following all
+   the intermediate branches with next to no increased code complexity once
+   the collapsed tree has been built.
+  We do _not_ require that a subtree be complete to be collapsed, but instead
+   store duplicate pointers in the table, and record the actual depth of the
+   node below its parent.
+  This tells us the number of bits to advance the stream after reaching it.
+
+  This turns out to be equivalent to the method described in \cite{Hash95},
+   without the requirement that codewords be sorted by length.
+  If the codewords were sorted by length (so-called ``canonical-codes''), they
+   could be decoded much faster via either Lindell and Moffat's approach or
+   Hashemian's Condensed Huffman Code approach, the latter of which has an
+   extremely small memory footprint.
+  We can't use Choueka et al.'s finite state machine approach, which is
+   extremely fast, because we can't allow multiple symbols to be output at a
+   time; the codebook can and does change between symbols.
+  It also has very large memory requirements, which impairs cache coherency.
+
+  @ARTICLE{Hash95,
+    author="Reza Hashemian",
+    title="Memory Efficient and High-Speed Search {Huffman} Coding",
+    journal="{IEEE} Transactions on Communications",
+    volume=43,
+    number=10,
+    pages="2576--2581",
+    month=Oct,
+    year=1995
+  }*/
+struct oc_huff_node{
+  /*The number of bits of the code needed to descend through this node.
+    0 indicates a leaf node.
+    Otherwise there are 1<<nbits entries in the nodes table, which is
+     indexed by reading nbits bits from the stream.*/
+  unsigned char  nbits;
+  /*The value of a token stored in a leaf node.
+    The value in non-leaf nodes is undefined.*/
+  unsigned char  token;
+  /*The depth of the current node, relative to its parent in the collapsed
+     tree.
+    This can be less than the parent's nbits value, in which case the parent's
+     table holds 1<<(nbits-depth) pointers to this node, and the bitstream
+     should only be advanced depth bits after reaching this node.*/
+  unsigned char  depth;
+  /*The table of child nodes.
+    The ACTUAL size of this array is 1<<nbits, despite what the declaration
+     below claims.
+    The exception is that for leaf nodes the size is 0.*/
+  oc_huff_node  *nodes[2];
+};
+
+
+
+int oc_huff_trees_unpack(oc_pack_buf *_opb,
+ oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]);
+int oc_huff_trees_copy(oc_huff_node *_dst[TH_NHUFFMAN_TABLES],
+ const oc_huff_node *const _src[TH_NHUFFMAN_TABLES]);
+void oc_huff_trees_clear(oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]);
+int oc_huff_token_decode(oc_pack_buf *_opb,const oc_huff_node *_node);
+
+
+#endif
diff --git a/lib/huffenc.c b/lib/huffenc.c
new file mode 100644
index 0000000..bf624e0
--- /dev/null
+++ b/lib/huffenc.c
@@ -0,0 +1,910 @@
+#include <stdlib.h>
+#include <string.h>
+#include <ogg/ogg.h>
+#include "huffenc.h"
+
+
+
+/*The default Huffman codes used for VP3.1.*/
+const th_huff_code TH_VP31_HUFF_CODES[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]={
+  {
+    {0x002D, 6},{0x0026, 7},{0x0166, 9},{0x004E, 8},
+    {0x02CE,10},{0x059E,11},{0x027D,11},{0x0008, 5},
+    {0x04F9,12},{0x000F, 4},{0x000E, 4},{0x001B, 5},
+    {0x0006, 4},{0x0008, 4},{0x0005, 4},{0x001A, 5},
+    {0x0015, 5},{0x0007, 4},{0x000C, 4},{0x0001, 3},
+    {0x0000, 3},{0x0009, 4},{0x0017, 5},{0x0029, 6},
+    {0x0028, 6},{0x00B2, 8},{0x04F8,12},{0x059F,11},
+    {0x009E, 9},{0x013F,10},{0x0012, 6},{0x0058, 7}
+  },
+  {
+    {0x0010, 5},{0x0047, 7},{0x01FF, 9},{0x008C, 8},
+    {0x03FC,10},{0x046A,11},{0x0469,11},{0x0022, 6},
+    {0x11A1,13},{0x000E, 4},{0x000D, 4},{0x0004, 4},
+    {0x0005, 4},{0x0009, 4},{0x0006, 4},{0x001E, 5},
+    {0x0016, 5},{0x0007, 4},{0x000C, 4},{0x0001, 3},
+    {0x0000, 3},{0x000A, 4},{0x0017, 5},{0x007D, 7},
+    {0x007E, 7},{0x011B, 9},{0x08D1,12},{0x03FD,10},
+    {0x046B,11},{0x11A0,13},{0x007C, 7},{0x00FE, 8}
+  },
+  {
+    {0x0016, 5},{0x0020, 6},{0x0086, 8},{0x0087, 8},
+    {0x0367,10},{0x06CC,11},{0x06CB,11},{0x006E, 7},
+    {0x366D,14},{0x000F, 4},{0x000E, 4},{0x0004, 4},
+    {0x0005, 4},{0x000A, 4},{0x0006, 4},{0x001A, 5},
+    {0x0011, 5},{0x0007, 4},{0x000C, 4},{0x0001, 3},
+    {0x0000, 3},{0x0009, 4},{0x0017, 5},{0x006F, 7},
+    {0x006D, 7},{0x0364,10},{0x0D9A,12},{0x06CA,11},
+    {0x1B37,13},{0x366C,14},{0x0042, 7},{0x00D8, 8}
+  },
+  {
+    {0x0000, 4},{0x002D, 6},{0x00F7, 8},{0x0058, 7},
+    {0x0167, 9},{0x02CB,10},{0x02CA,10},{0x000E, 6},
+    {0x1661,13},{0x0003, 3},{0x0002, 3},{0x0008, 4},
+    {0x0009, 4},{0x000D, 4},{0x0002, 4},{0x001F, 5},
+    {0x0017, 5},{0x0001, 4},{0x000C, 4},{0x000E, 4},
+    {0x000A, 4},{0x0006, 5},{0x0078, 7},{0x000F, 6},
+    {0x007A, 7},{0x0164, 9},{0x0599,11},{0x02CD,10},
+    {0x0B31,12},{0x1660,13},{0x0079, 7},{0x00F6, 8}
+  },
+  {
+    {0x0003, 4},{0x003C, 6},{0x000F, 7},{0x007A, 7},
+    {0x001D, 8},{0x0020, 9},{0x0072,10},{0x0006, 6},
+    {0x0399,13},{0x0004, 3},{0x0005, 3},{0x0005, 4},
+    {0x0006, 4},{0x000E, 4},{0x0004, 4},{0x0000, 4},
+    {0x0019, 5},{0x0002, 4},{0x000D, 4},{0x0007, 4},
+    {0x001F, 5},{0x0030, 6},{0x0011, 8},{0x0031, 6},
+    {0x0005, 6},{0x0021, 9},{0x00E7,11},{0x0038, 9},
+    {0x01CD,12},{0x0398,13},{0x007B, 7},{0x0009, 7}
+  },
+  {
+    {0x0009, 4},{0x0002, 5},{0x0074, 7},{0x0007, 6},
+    {0x00EC, 8},{0x00D1, 9},{0x01A6,10},{0x0006, 6},
+    {0x0D21,13},{0x0005, 3},{0x0006, 3},{0x0008, 4},
+    {0x0007, 4},{0x000F, 4},{0x0004, 4},{0x0000, 4},
+    {0x001C, 5},{0x0002, 4},{0x0005, 4},{0x0003, 4},
+    {0x000C, 5},{0x0035, 7},{0x01A7,10},{0x001B, 6},
+    {0x0077, 7},{0x01A5,10},{0x0349,11},{0x00D0, 9},
+    {0x0691,12},{0x0D20,13},{0x0075, 7},{0x00ED, 8}
+  },
+  {
+    {0x000A, 4},{0x000C, 5},{0x0012, 6},{0x001B, 6},
+    {0x00B7, 8},{0x016C, 9},{0x0099, 9},{0x005A, 7},
+    {0x16D8,13},{0x0007, 3},{0x0006, 3},{0x0009, 4},
+    {0x0008, 4},{0x0000, 3},{0x0005, 4},{0x0017, 5},
+    {0x000E, 5},{0x0002, 4},{0x0003, 4},{0x000F, 5},
+    {0x001A, 6},{0x004D, 8},{0x2DB3,14},{0x002C, 6},
+    {0x0011, 6},{0x02DA,10},{0x05B7,11},{0x0098, 9},
+    {0x0B6D,12},{0x2DB2,14},{0x0010, 6},{0x0027, 7}
+  },
+  {
+    {0x000D, 4},{0x000F, 5},{0x001D, 6},{0x0008, 5},
+    {0x0051, 7},{0x0056, 8},{0x00AF, 9},{0x002A, 7},
+    {0x148A,13},{0x0007, 3},{0x0000, 2},{0x0008, 4},
+    {0x0009, 4},{0x000C, 4},{0x0006, 4},{0x0017, 5},
+    {0x000B, 5},{0x0016, 5},{0x0015, 5},{0x0009, 5},
+    {0x0050, 7},{0x00AE, 9},{0x2917,14},{0x001C, 6},
+    {0x0014, 6},{0x0290,10},{0x0523,11},{0x0149, 9},
+    {0x0A44,12},{0x2916,14},{0x0053, 7},{0x00A5, 8}
+  },
+  {
+    {0x0001, 4},{0x001D, 6},{0x00F5, 8},{0x00F4, 8},
+    {0x024D,10},{0x0499,11},{0x0498,11},{0x0001, 5},
+    {0x0021, 6},{0x0006, 3},{0x0005, 3},{0x0006, 4},
+    {0x0005, 4},{0x0002, 4},{0x0007, 5},{0x0025, 6},
+    {0x007B, 7},{0x001C, 6},{0x0020, 6},{0x000D, 6},
+    {0x0048, 7},{0x0092, 8},{0x0127, 9},{0x000E, 4},
+    {0x0004, 4},{0x0011, 5},{0x000C, 6},{0x003C, 6},
+    {0x000F, 5},{0x0000, 5},{0x001F, 5},{0x0013, 5}
+  },
+  {
+    {0x0005, 4},{0x003C, 6},{0x0040, 7},{0x000D, 7},
+    {0x0031, 9},{0x0061,10},{0x0060,10},{0x0002, 5},
+    {0x00F5, 8},{0x0006, 3},{0x0005, 3},{0x0007, 4},
+    {0x0006, 4},{0x0002, 4},{0x0009, 5},{0x0025, 6},
+    {0x0007, 6},{0x0021, 6},{0x0024, 6},{0x0010, 6},
+    {0x0041, 7},{0x00F4, 8},{0x0019, 8},{0x000E, 4},
+    {0x0003, 4},{0x0011, 5},{0x0011, 6},{0x003F, 6},
+    {0x003E, 6},{0x007B, 7},{0x0000, 4},{0x0013, 5}
+  },
+  {
+    {0x000A, 4},{0x0007, 5},{0x0001, 6},{0x0009, 6},
+    {0x0131, 9},{0x0261,10},{0x0260,10},{0x0015, 6},
+    {0x0001, 7},{0x0007, 3},{0x0006, 3},{0x0008, 4},
+    {0x0007, 4},{0x0006, 4},{0x0012, 5},{0x002F, 6},
+    {0x0014, 6},{0x0027, 6},{0x002D, 6},{0x0016, 6},
+    {0x004D, 7},{0x0099, 8},{0x0000, 7},{0x0004, 4},
+    {0x0001, 4},{0x0005, 5},{0x0017, 6},{0x002E, 6},
+    {0x002C, 6},{0x0008, 6},{0x0006, 5},{0x0001, 5}
+  },
+  {
+    {0x0000, 3},{0x000E, 5},{0x0017, 6},{0x002A, 6},
+    {0x0010, 7},{0x00F9,10},{0x00F8,10},{0x001E, 7},
+    {0x003F, 8},{0x0007, 3},{0x0006, 3},{0x0009, 4},
+    {0x0008, 4},{0x0006, 4},{0x000F, 5},{0x0005, 5},
+    {0x0016, 6},{0x0029, 6},{0x002B, 6},{0x0015, 6},
+    {0x0050, 7},{0x0011, 7},{0x007D, 9},{0x0004, 4},
+    {0x0017, 5},{0x0006, 5},{0x0014, 6},{0x002C, 6},
+    {0x002D, 6},{0x000E, 6},{0x0009, 6},{0x0051, 7}
+  },
+  {
+    {0x0002, 3},{0x0018, 5},{0x002F, 6},{0x000D, 5},
+    {0x0053, 7},{0x0295,10},{0x0294,10},{0x00A4, 8},
+    {0x007C, 8},{0x0000, 2},{0x0007, 3},{0x0009, 4},
+    {0x0008, 4},{0x001B, 5},{0x000C, 5},{0x0028, 6},
+    {0x006A, 7},{0x001E, 6},{0x001D, 6},{0x0069, 7},
+    {0x00D7, 8},{0x007D, 8},{0x014B, 9},{0x0019, 5},
+    {0x0016, 5},{0x002E, 6},{0x001C, 6},{0x002B, 6},
+    {0x002A, 6},{0x0068, 7},{0x003F, 7},{0x00D6, 8}
+  },
+  {
+    {0x0002, 3},{0x001B, 5},{0x000C, 5},{0x0018, 5},
+    {0x0029, 6},{0x007F, 8},{0x02F0,10},{0x0198, 9},
+    {0x0179, 9},{0x0000, 2},{0x0007, 3},{0x0009, 4},
+    {0x0008, 4},{0x001A, 5},{0x000D, 5},{0x002A, 6},
+    {0x0064, 7},{0x001E, 6},{0x0067, 7},{0x005F, 7},
+    {0x00CD, 8},{0x007E, 8},{0x02F1,10},{0x0016, 5},
+    {0x000E, 5},{0x002E, 6},{0x0065, 7},{0x002B, 6},
+    {0x0028, 6},{0x003E, 7},{0x00BD, 8},{0x0199, 9}
+  },
+  {
+    {0x0002, 3},{0x0007, 4},{0x0016, 5},{0x0006, 4},
+    {0x0036, 6},{0x005C, 7},{0x015D, 9},{0x015C, 9},
+    {0x02BF,10},{0x0000, 2},{0x0007, 3},{0x0009, 4},
+    {0x0008, 4},{0x0018, 5},{0x0034, 6},{0x002A, 6},
+    {0x005E, 7},{0x006A, 7},{0x0064, 7},{0x005D, 7},
+    {0x00CB, 8},{0x00AD, 8},{0x02BE,10},{0x0014, 5},
+    {0x0033, 6},{0x006E, 7},{0x005F, 7},{0x006F, 7},
+    {0x006B, 7},{0x00CA, 8},{0x00AC, 8},{0x015E, 9}
+  },
+  {
+    {0x000F, 4},{0x001D, 5},{0x0018, 5},{0x000B, 4},
+    {0x0019, 5},{0x0029, 6},{0x00D6, 8},{0x0551,11},
+    {0x0AA1,12},{0x0001, 2},{0x0000, 2},{0x0009, 4},
+    {0x0008, 4},{0x001B, 5},{0x0038, 6},{0x0028, 6},
+    {0x0057, 7},{0x006A, 7},{0x0068, 7},{0x0056, 7},
+    {0x00E5, 8},{0x0155, 9},{0x0AA0,12},{0x0073, 7},
+    {0x0069, 7},{0x00D7, 8},{0x00AB, 8},{0x00E4, 8},
+    {0x00A9, 8},{0x0151, 9},{0x0150, 9},{0x02A9,10}
+  },
+  {
+    {0x0008, 5},{0x0025, 7},{0x017A, 9},{0x02F7,10},
+    {0x0BDB,12},{0x17B4,13},{0x2F6B,14},{0x001D, 5},
+    {0x2F6A,14},{0x0008, 4},{0x0007, 4},{0x0001, 4},
+    {0x0002, 4},{0x000A, 4},{0x0006, 4},{0x0000, 4},
+    {0x001C, 5},{0x0009, 4},{0x000D, 4},{0x000F, 4},
+    {0x000C, 4},{0x0003, 4},{0x000A, 5},{0x0016, 5},
+    {0x0013, 6},{0x005D, 7},{0x0024, 7},{0x00BC, 8},
+    {0x005C, 7},{0x05EC,11},{0x000B, 5},{0x005F, 7}
+  },
+  {
+    {0x000F, 5},{0x0010, 6},{0x004B, 8},{0x00C6, 8},
+    {0x031D,10},{0x0C71,12},{0x0C70,12},{0x0001, 4},
+    {0x0C73,12},{0x0008, 4},{0x0009, 4},{0x0002, 4},
+    {0x0003, 4},{0x000B, 4},{0x0006, 4},{0x0000, 4},
+    {0x001C, 5},{0x0005, 4},{0x000D, 4},{0x000F, 4},
+    {0x000A, 4},{0x0019, 5},{0x0013, 6},{0x001D, 5},
+    {0x0030, 6},{0x0062, 7},{0x0024, 7},{0x004A, 8},
+    {0x018F, 9},{0x0C72,12},{0x000E, 5},{0x0011, 6}
+  },
+  {
+    {0x001B, 5},{0x0003, 6},{0x008D, 8},{0x0040, 7},
+    {0x0239,10},{0x0471,11},{0x08E0,12},{0x0003, 4},
+    {0x11C3,13},{0x000A, 4},{0x0009, 4},{0x0004, 4},
+    {0x0005, 4},{0x000E, 4},{0x0007, 4},{0x0001, 4},
+    {0x001E, 5},{0x0006, 4},{0x000C, 4},{0x000B, 4},
+    {0x0002, 4},{0x0000, 5},{0x0041, 7},{0x001F, 5},
+    {0x0022, 6},{0x0002, 6},{0x008F, 8},{0x008C, 8},
+    {0x011D, 9},{0x11C2,13},{0x001A, 5},{0x0021, 6}
+  },
+  {
+    {0x001F, 5},{0x0003, 6},{0x0003, 7},{0x0043, 7},
+    {0x000B, 9},{0x0015,10},{0x0051,12},{0x0003, 4},
+    {0x0050,12},{0x000D, 4},{0x000C, 4},{0x0004, 4},
+    {0x0006, 4},{0x000E, 4},{0x000A, 4},{0x0001, 4},
+    {0x001E, 5},{0x0005, 4},{0x0009, 4},{0x0007, 4},
+    {0x0011, 5},{0x0002, 6},{0x0004, 8},{0x0002, 4},
+    {0x002D, 6},{0x0020, 6},{0x0042, 7},{0x0001, 7},
+    {0x0000, 7},{0x0029,11},{0x0017, 5},{0x002C, 6}
+  },
+  {
+    {0x0003, 4},{0x001F, 6},{0x003A, 7},{0x005D, 7},
+    {0x0173, 9},{0x02E4,10},{0x172D,13},{0x0004, 4},
+    {0x172C,13},{0x000F, 4},{0x000E, 4},{0x0009, 4},
+    {0x0008, 4},{0x000C, 4},{0x000A, 4},{0x0001, 4},
+    {0x0016, 5},{0x0002, 4},{0x0005, 4},{0x001A, 5},
+    {0x002F, 6},{0x0038, 7},{0x05CA,11},{0x0006, 4},
+    {0x0037, 6},{0x001E, 6},{0x003B, 7},{0x0039, 7},
+    {0x00B8, 8},{0x0B97,12},{0x0000, 4},{0x0036, 6}
+  },
+  {
+    {0x0006, 4},{0x0037, 6},{0x005D, 7},{0x000C, 6},
+    {0x00B9, 8},{0x02E3,10},{0x05C4,11},{0x0004, 4},
+    {0x1715,13},{0x0000, 3},{0x000F, 4},{0x0008, 4},
+    {0x0007, 4},{0x000C, 4},{0x0009, 4},{0x001D, 5},
+    {0x0016, 5},{0x001C, 5},{0x001A, 5},{0x000B, 5},
+    {0x005E, 7},{0x0170, 9},{0x1714,13},{0x000A, 4},
+    {0x000A, 5},{0x0036, 6},{0x005F, 7},{0x001B, 7},
+    {0x001A, 7},{0x0B8B,12},{0x0002, 4},{0x0007, 5}
+  },
+  {
+    {0x000C, 4},{0x000B, 5},{0x0079, 7},{0x0022, 6},
+    {0x00F0, 8},{0x0119, 9},{0x0230,10},{0x001D, 5},
+    {0x08C4,12},{0x0001, 3},{0x0000, 3},{0x000A, 4},
+    {0x0009, 4},{0x000B, 4},{0x0007, 4},{0x001C, 5},
+    {0x003D, 6},{0x000D, 5},{0x0008, 5},{0x0015, 6},
+    {0x008D, 8},{0x118B,13},{0x118A,13},{0x000D, 4},
+    {0x0010, 5},{0x0009, 5},{0x0014, 6},{0x0047, 7},
+    {0x00F1, 8},{0x0463,11},{0x001F, 5},{0x000C, 5}
+  },
+  {
+    {0x0000, 3},{0x001A, 5},{0x0033, 6},{0x000C, 5},
+    {0x0046, 7},{0x01E3, 9},{0x03C5,10},{0x0017, 5},
+    {0x1E21,13},{0x0002, 3},{0x0001, 3},{0x0009, 4},
+    {0x000A, 4},{0x0007, 4},{0x001B, 5},{0x003D, 6},
+    {0x001B, 6},{0x0022, 6},{0x0079, 7},{0x00F0, 8},
+    {0x1E20,13},{0x1E23,13},{0x1E22,13},{0x000E, 4},
+    {0x0016, 5},{0x0018, 5},{0x0032, 6},{0x001A, 6},
+    {0x0047, 7},{0x0789,11},{0x001F, 5},{0x0010, 5}
+  },
+  {
+    {0x001D, 5},{0x0061, 7},{0x004E, 8},{0x009E, 9},
+    {0x027C,11},{0x09F5,13},{0x09F4,13},{0x0003, 4},
+    {0x0060, 7},{0x0000, 3},{0x000F, 4},{0x000B, 4},
+    {0x000A, 4},{0x0009, 4},{0x0005, 4},{0x000D, 5},
+    {0x0031, 6},{0x0008, 5},{0x0038, 6},{0x0012, 6},
+    {0x0026, 7},{0x013F,10},{0x04FB,12},{0x000D, 4},
+    {0x0002, 4},{0x000C, 5},{0x0039, 6},{0x001C, 6},
+    {0x000F, 5},{0x001D, 6},{0x0008, 4},{0x0019, 5}
+  },
+  {
+    {0x0007, 4},{0x0019, 6},{0x00AB, 8},{0x00AA, 8},
+    {0x0119,10},{0x0461,12},{0x0460,12},{0x001B, 5},
+    {0x0047, 8},{0x0001, 3},{0x0000, 3},{0x000C, 4},
+    {0x000B, 4},{0x0009, 4},{0x0005, 4},{0x000D, 5},
+    {0x0035, 6},{0x003D, 6},{0x003C, 6},{0x0018, 6},
+    {0x0022, 7},{0x008D, 9},{0x0231,11},{0x000E, 4},
+    {0x001F, 5},{0x0009, 5},{0x002B, 6},{0x0010, 6},
+    {0x0034, 6},{0x0054, 7},{0x0008, 4},{0x0014, 5}
+  },
+  {
+    {0x000C, 4},{0x0005, 5},{0x0008, 6},{0x005B, 7},
+    {0x004D, 9},{0x0131,11},{0x0261,12},{0x001A, 5},
+    {0x0012, 7},{0x0000, 3},{0x000F, 4},{0x000A, 4},
+    {0x0009, 4},{0x0006, 4},{0x001B, 5},{0x0006, 5},
+    {0x001C, 6},{0x002C, 6},{0x0015, 6},{0x005A, 7},
+    {0x0027, 8},{0x0099,10},{0x0260,12},{0x000E, 4},
+    {0x0004, 4},{0x000F, 5},{0x0007, 5},{0x001D, 6},
+    {0x000B, 5},{0x0014, 6},{0x0008, 4},{0x0017, 5}
+  },
+  {
+    {0x000F, 4},{0x0013, 5},{0x0075, 7},{0x0024, 6},
+    {0x0095, 8},{0x0251,10},{0x04A0,11},{0x0010, 5},
+    {0x00C8, 8},{0x0002, 3},{0x0001, 3},{0x0001, 4},
+    {0x0000, 4},{0x001A, 5},{0x0011, 5},{0x002C, 6},
+    {0x0065, 7},{0x0074, 7},{0x004B, 7},{0x00C9, 8},
+    {0x0129, 9},{0x0943,12},{0x0942,12},{0x0003, 3},
+    {0x000A, 4},{0x001C, 5},{0x0018, 5},{0x0033, 6},
+    {0x0017, 5},{0x002D, 6},{0x001B, 5},{0x003B, 6}
+  },
+  {
+    {0x0003, 3},{0x001A, 5},{0x002D, 6},{0x0038, 6},
+    {0x0028, 7},{0x0395,10},{0x0E51,12},{0x0037, 6},
+    {0x00E4, 8},{0x0001, 3},{0x0000, 3},{0x001F, 5},
+    {0x001E, 5},{0x0017, 5},{0x003A, 6},{0x0073, 7},
+    {0x002A, 7},{0x002B, 7},{0x0029, 7},{0x01CB, 9},
+    {0x0729,11},{0x1CA1,13},{0x1CA0,13},{0x0004, 3},
+    {0x000A, 4},{0x0004, 4},{0x0018, 5},{0x0036, 6},
+    {0x000B, 5},{0x002C, 6},{0x0019, 5},{0x003B, 6}
+  },
+  {
+    {0x0004, 3},{0x0004, 4},{0x003F, 6},{0x0017, 5},
+    {0x0075, 7},{0x01F5, 9},{0x07D1,11},{0x0017, 6},
+    {0x01F6, 9},{0x0001, 3},{0x0000, 3},{0x001B, 5},
+    {0x001A, 5},{0x000A, 5},{0x0032, 6},{0x0074, 7},
+    {0x00F8, 8},{0x00F9, 8},{0x01F7, 9},{0x03E9,10},
+    {0x0FA0,12},{0x1F43,13},{0x1F42,13},{0x0003, 3},
+    {0x000A, 4},{0x001E, 5},{0x001C, 5},{0x003B, 6},
+    {0x0018, 5},{0x0016, 6},{0x0016, 5},{0x0033, 6}
+  },
+  {
+    {0x0004, 3},{0x0007, 4},{0x0018, 5},{0x001E, 5},
+    {0x0036, 6},{0x0031, 7},{0x0177, 9},{0x0077, 7},
+    {0x0176, 9},{0x0001, 3},{0x0000, 3},{0x001A, 5},
+    {0x0019, 5},{0x003A, 6},{0x0019, 6},{0x005C, 7},
+    {0x00BA, 8},{0x0061, 8},{0x00C1, 9},{0x0180,10},
+    {0x0302,11},{0x0607,12},{0x0606,12},{0x0002, 3},
+    {0x000A, 4},{0x001F, 5},{0x001C, 5},{0x0037, 6},
+    {0x0016, 5},{0x0076, 7},{0x000D, 5},{0x002F, 6}
+  },
+  {
+    {0x0000, 3},{0x000A, 4},{0x001A, 5},{0x000C, 4},
+    {0x001D, 5},{0x0039, 6},{0x0078, 7},{0x005E, 7},
+    {0x0393,11},{0x0002, 3},{0x0001, 3},{0x0016, 5},
+    {0x000F, 5},{0x002E, 6},{0x005F, 7},{0x0073, 8},
+    {0x00E5, 9},{0x01C8,10},{0x0E4A,13},{0x1C97,14},
+    {0x1C96,14},{0x0E49,13},{0x0E48,13},{0x0004, 3},
+    {0x0006, 4},{0x001F, 5},{0x001B, 5},{0x001D, 6},
+    {0x0038, 6},{0x0038, 7},{0x003D, 6},{0x0079, 7}
+  },
+  {
+    {0x000B, 5},{0x002B, 7},{0x0054, 8},{0x01B7, 9},
+    {0x06D9,11},{0x0DB1,12},{0x0DB0,12},{0x0002, 4},
+    {0x00AB, 9},{0x0009, 4},{0x000A, 4},{0x0007, 4},
+    {0x0008, 4},{0x000F, 4},{0x000C, 4},{0x0003, 4},
+    {0x001D, 5},{0x0004, 4},{0x000B, 4},{0x0006, 4},
+    {0x001A, 5},{0x0003, 6},{0x00AA, 9},{0x0001, 4},
+    {0x0000, 5},{0x0014, 6},{0x006C, 7},{0x00DA, 8},
+    {0x0002, 6},{0x036D,10},{0x001C, 5},{0x0037, 6}
+  },
+  {
+    {0x001D, 5},{0x0004, 6},{0x00B6, 8},{0x006A, 8},
+    {0x05B9,11},{0x16E1,13},{0x16E0,13},{0x0007, 4},
+    {0x016F, 9},{0x000C, 4},{0x000D, 4},{0x0009, 4},
+    {0x0008, 4},{0x000F, 4},{0x000A, 4},{0x0003, 4},
+    {0x0017, 5},{0x0002, 4},{0x0004, 4},{0x001C, 5},
+    {0x002C, 6},{0x006B, 8},{0x0B71,12},{0x0005, 4},
+    {0x0003, 5},{0x001B, 6},{0x005A, 7},{0x0034, 7},
+    {0x0005, 6},{0x02DD,10},{0x0000, 4},{0x000C, 5}
+  },
+  {
+    {0x0003, 4},{0x007F, 7},{0x00A1, 8},{0x00A0, 8},
+    {0x020C,10},{0x0834,12},{0x106B,13},{0x0007, 4},
+    {0x0082, 8},{0x000E, 4},{0x000D, 4},{0x000B, 4},
+    {0x000C, 4},{0x0000, 3},{0x0009, 4},{0x0002, 4},
+    {0x0011, 5},{0x001E, 5},{0x0015, 5},{0x003E, 6},
+    {0x0040, 7},{0x041B,11},{0x106A,13},{0x0006, 4},
+    {0x000A, 5},{0x0029, 6},{0x007E, 7},{0x0051, 7},
+    {0x0021, 6},{0x0107, 9},{0x0004, 4},{0x000B, 5}
+  },
+  {
+    {0x0007, 4},{0x001B, 6},{0x00F6, 8},{0x00E9, 8},
+    {0x03A1,10},{0x0740,11},{0x0E82,12},{0x001F, 5},
+    {0x01EF, 9},{0x0001, 3},{0x0002, 3},{0x000B, 4},
+    {0x000C, 4},{0x000D, 4},{0x0008, 4},{0x001C, 5},
+    {0x0003, 5},{0x0012, 5},{0x0002, 5},{0x0075, 7},
+    {0x01D1, 9},{0x1D07,13},{0x1D06,13},{0x000A, 4},
+    {0x0013, 5},{0x003B, 6},{0x001A, 6},{0x007A, 7},
+    {0x003C, 6},{0x01EE, 9},{0x0000, 4},{0x000C, 5}
+  },
+  {
+    {0x000D, 4},{0x003D, 6},{0x0042, 7},{0x0037, 7},
+    {0x00D9, 9},{0x0362,11},{0x06C6,12},{0x001F, 5},
+    {0x0086, 8},{0x0001, 3},{0x0002, 3},{0x000C, 4},
+    {0x000B, 4},{0x000A, 4},{0x0001, 4},{0x000F, 5},
+    {0x0025, 6},{0x003C, 6},{0x001A, 6},{0x0087, 8},
+    {0x01B0,10},{0x0D8F,13},{0x0D8E,13},{0x000E, 4},
+    {0x0013, 5},{0x000C, 5},{0x0024, 6},{0x0020, 6},
+    {0x0011, 5},{0x006D, 8},{0x0000, 4},{0x000E, 5}
+  },
+  {
+    {0x0000, 3},{0x0012, 5},{0x0076, 7},{0x0077, 7},
+    {0x014D, 9},{0x0533,11},{0x14C9,13},{0x0013, 5},
+    {0x00A5, 8},{0x0002, 3},{0x0003, 3},{0x000B, 4},
+    {0x000C, 4},{0x0008, 4},{0x001A, 5},{0x002B, 6},
+    {0x0075, 7},{0x0074, 7},{0x00A7, 8},{0x0298,10},
+    {0x14C8,13},{0x14CB,13},{0x14CA,13},{0x000F, 4},
+    {0x001C, 5},{0x0007, 5},{0x002A, 6},{0x0028, 6},
+    {0x001B, 5},{0x00A4, 8},{0x0002, 4},{0x0006, 5}
+  },
+  {
+    {0x0002, 3},{0x001A, 5},{0x002B, 6},{0x003A, 6},
+    {0x00ED, 8},{0x0283,10},{0x0A0A,12},{0x0004, 5},
+    {0x00A1, 8},{0x0004, 3},{0x0003, 3},{0x000B, 4},
+    {0x000C, 4},{0x001F, 5},{0x0006, 5},{0x0077, 7},
+    {0x00A3, 8},{0x00A2, 8},{0x0140, 9},{0x1417,13},
+    {0x1416,13},{0x0A09,12},{0x0A08,12},{0x0000, 3},
+    {0x001E, 5},{0x0007, 5},{0x002A, 6},{0x0029, 6},
+    {0x001C, 5},{0x00EC, 8},{0x001B, 5},{0x0005, 5}
+  },
+  {
+    {0x0002, 3},{0x0002, 4},{0x0018, 5},{0x001D, 5},
+    {0x0035, 6},{0x00E4, 8},{0x01CF,11},{0x001D, 7},
+    {0x0072, 9},{0x0004, 3},{0x0005, 3},{0x0006, 4},
+    {0x0007, 4},{0x0006, 5},{0x0073, 7},{0x0038, 8},
+    {0x01CE,11},{0x039B,12},{0x0398,12},{0x0733,13},
+    {0x0732,13},{0x0735,13},{0x0734,13},{0x0000, 3},
+    {0x001F, 5},{0x001B, 5},{0x0034, 6},{0x000F, 6},
+    {0x001E, 5},{0x00E5, 8},{0x0019, 5},{0x0038, 6}
+  },
+  {
+    {0x0016, 5},{0x0050, 7},{0x0172, 9},{0x02E7,10},
+    {0x1732,13},{0x2E67,14},{0x2E66,14},{0x0006, 4},
+    {0x0051, 7},{0x0001, 3},{0x0000, 3},{0x000D, 4},
+    {0x000C, 4},{0x0009, 4},{0x001C, 5},{0x0009, 5},
+    {0x001C, 6},{0x001D, 6},{0x005D, 7},{0x00B8, 8},
+    {0x05CD,11},{0x1731,13},{0x1730,13},{0x000F, 4},
+    {0x0005, 4},{0x000F, 5},{0x0008, 5},{0x0029, 6},
+    {0x001D, 5},{0x002F, 6},{0x0008, 4},{0x0015, 5}
+  },
+  {
+    {0x0009, 4},{0x0021, 6},{0x0040, 7},{0x00AD, 8},
+    {0x02B0,10},{0x1589,13},{0x1588,13},{0x001C, 5},
+    {0x005F, 7},{0x0000, 3},{0x000F, 4},{0x000D, 4},
+    {0x000C, 4},{0x0006, 4},{0x0011, 5},{0x002A, 6},
+    {0x0057, 7},{0x005E, 7},{0x0041, 7},{0x0159, 9},
+    {0x0563,11},{0x158B,13},{0x158A,13},{0x0001, 3},
+    {0x0005, 4},{0x0014, 5},{0x003B, 6},{0x002E, 6},
+    {0x0004, 4},{0x003A, 6},{0x0007, 4},{0x0016, 5}
+  },
+  {
+    {0x000E, 4},{0x0007, 5},{0x0046, 7},{0x0045, 7},
+    {0x0064, 9},{0x032A,12},{0x0657,13},{0x0018, 5},
+    {0x000D, 6},{0x0000, 3},{0x000F, 4},{0x000A, 4},
+    {0x000B, 4},{0x001A, 5},{0x0036, 6},{0x0047, 7},
+    {0x0044, 7},{0x0018, 7},{0x0033, 8},{0x00CB,10},
+    {0x0656,13},{0x0329,12},{0x0328,12},{0x0002, 3},
+    {0x0006, 4},{0x0019, 5},{0x000E, 5},{0x0037, 6},
+    {0x0009, 4},{0x000F, 5},{0x0002, 4},{0x0010, 5}
+  },
+  {
+    {0x0003, 3},{0x0018, 5},{0x0023, 6},{0x0077, 7},
+    {0x0194, 9},{0x1956,13},{0x32AF,14},{0x003A, 6},
+    {0x0076, 7},{0x0002, 3},{0x0001, 3},{0x001F, 5},
+    {0x001E, 5},{0x0014, 5},{0x0022, 6},{0x0064, 7},
+    {0x0197, 9},{0x0196, 9},{0x032B,10},{0x0654,11},
+    {0x32AE,14},{0x1955,13},{0x1954,13},{0x0000, 3},
+    {0x0009, 4},{0x001C, 5},{0x0015, 5},{0x0010, 5},
+    {0x000D, 4},{0x0017, 5},{0x0016, 5},{0x0033, 6}
+  },
+  {
+    {0x0005, 3},{0x0006, 4},{0x003E, 6},{0x0010, 5},
+    {0x0048, 7},{0x093F,12},{0x24FA,14},{0x0032, 6},
+    {0x0067, 7},{0x0002, 3},{0x0001, 3},{0x001B, 5},
+    {0x001E, 5},{0x0034, 6},{0x0066, 7},{0x0092, 8},
+    {0x0126, 9},{0x024E,10},{0x049E,11},{0x49F7,15},
+    {0x49F6,15},{0x24F9,14},{0x24F8,14},{0x0000, 3},
+    {0x0007, 4},{0x0018, 5},{0x0011, 5},{0x003F, 6},
+    {0x000E, 4},{0x0013, 5},{0x0035, 6},{0x0025, 6}
+  },
+  {
+    {0x0005, 3},{0x0008, 4},{0x0012, 5},{0x001C, 5},
+    {0x001C, 6},{0x00EA, 9},{0x1D75,14},{0x001E, 6},
+    {0x0066, 7},{0x0001, 3},{0x0002, 3},{0x001B, 5},
+    {0x001A, 5},{0x001F, 6},{0x003B, 7},{0x0074, 8},
+    {0x01D6,10},{0x03AF,11},{0x1D74,14},{0x1D77,14},
+    {0x1D76,14},{0x0EB9,13},{0x0EB8,13},{0x000F, 4},
+    {0x0006, 4},{0x0013, 5},{0x003B, 6},{0x003A, 6},
+    {0x0000, 3},{0x0018, 5},{0x0032, 6},{0x0067, 7}
+  },
+  {
+    {0x0004, 3},{0x000A, 4},{0x001B, 5},{0x000C, 4},
+    {0x000D, 5},{0x00E6, 8},{0x0684,11},{0x0072, 7},
+    {0x00E7, 8},{0x0002, 3},{0x0001, 3},{0x0017, 5},
+    {0x0016, 5},{0x0018, 6},{0x00D1, 8},{0x01A0, 9},
+    {0x0686,11},{0x0D0F,12},{0x0D0A,12},{0x1A17,13},
+    {0x1A16,13},{0x1A1D,13},{0x1A1C,13},{0x000F, 4},
+    {0x001D, 5},{0x000E, 5},{0x0035, 6},{0x0038, 6},
+    {0x0000, 3},{0x000F, 5},{0x0019, 6},{0x0069, 7}
+  },
+  {
+    {0x0003, 3},{0x000C, 4},{0x001B, 5},{0x0000, 3},
+    {0x0003, 4},{0x002E, 6},{0x0051, 9},{0x00BC, 8},
+    {0x0053, 9},{0x0004, 3},{0x0002, 3},{0x0016, 5},
+    {0x0015, 5},{0x0015, 7},{0x0050, 9},{0x00A4,10},
+    {0x0294,12},{0x052B,13},{0x052A,13},{0x052D,13},
+    {0x052C,13},{0x052F,13},{0x052E,13},{0x000E, 4},
+    {0x001A, 5},{0x0004, 5},{0x0028, 6},{0x0029, 6},
+    {0x000F, 4},{0x000B, 6},{0x005F, 7},{0x00BD, 8}
+  },
+  {
+    {0x0003, 4},{0x0009, 6},{0x00D0, 8},{0x01A3, 9},
+    {0x0344,10},{0x0D14,12},{0x1A2B,13},{0x0004, 4},
+    {0x0015, 7},{0x0000, 3},{0x000F, 4},{0x000B, 4},
+    {0x000C, 4},{0x000E, 4},{0x0009, 4},{0x001B, 5},
+    {0x000A, 5},{0x0014, 5},{0x000D, 5},{0x002A, 6},
+    {0x0014, 7},{0x068B,11},{0x1A2A,13},{0x0008, 4},
+    {0x000B, 5},{0x002B, 6},{0x000B, 6},{0x0069, 7},
+    {0x0035, 6},{0x0008, 6},{0x0007, 4},{0x000C, 5}
+  },
+  {
+    {0x000A, 4},{0x003C, 6},{0x0032, 7},{0x0030, 7},
+    {0x00C5, 9},{0x0621,12},{0x0620,12},{0x001F, 5},
+    {0x0033, 7},{0x0001, 3},{0x0000, 3},{0x000E, 4},
+    {0x000D, 4},{0x000C, 4},{0x0004, 4},{0x000D, 5},
+    {0x0026, 6},{0x0027, 6},{0x0014, 6},{0x0063, 8},
+    {0x0189,10},{0x0623,12},{0x0622,12},{0x000B, 4},
+    {0x0012, 5},{0x003D, 6},{0x0022, 6},{0x0015, 6},
+    {0x000B, 5},{0x0023, 6},{0x0007, 4},{0x0010, 5}
+  },
+  {
+    {0x000F, 4},{0x000C, 5},{0x0043, 7},{0x0010, 6},
+    {0x0044, 8},{0x0114,10},{0x0455,12},{0x0018, 5},
+    {0x0023, 7},{0x0001, 3},{0x0000, 3},{0x000E, 4},
+    {0x000D, 4},{0x0009, 4},{0x0019, 5},{0x0009, 5},
+    {0x0017, 6},{0x0016, 6},{0x0042, 7},{0x008B, 9},
+    {0x0454,12},{0x0457,12},{0x0456,12},{0x000B, 4},
+    {0x0015, 5},{0x000A, 5},{0x0029, 6},{0x0020, 6},
+    {0x000D, 5},{0x0028, 6},{0x0007, 4},{0x0011, 5}
+  },
+  {
+    {0x0001, 3},{0x001A, 5},{0x0029, 6},{0x002A, 6},
+    {0x00A0, 8},{0x0285,10},{0x1425,13},{0x0002, 5},
+    {0x0000, 7},{0x0002, 3},{0x0003, 3},{0x000C, 4},
+    {0x000B, 4},{0x0008, 4},{0x0012, 5},{0x0001, 6},
+    {0x0051, 7},{0x0001, 7},{0x0143, 9},{0x0508,11},
+    {0x1424,13},{0x1427,13},{0x1426,13},{0x000F, 4},
+    {0x001C, 5},{0x0003, 5},{0x0037, 6},{0x002B, 6},
+    {0x0013, 5},{0x0036, 6},{0x001D, 5},{0x0001, 5}
+  },
+  {
+    {0x0004, 3},{0x001F, 5},{0x003D, 6},{0x0006, 5},
+    {0x0016, 7},{0x0053, 9},{0x014A,11},{0x0034, 6},
+    {0x002A, 8},{0x0002, 3},{0x0003, 3},{0x000B, 4},
+    {0x000C, 4},{0x001C, 5},{0x0037, 6},{0x0017, 7},
+    {0x002B, 8},{0x0028, 8},{0x00A4,10},{0x052D,13},
+    {0x052C,13},{0x052F,13},{0x052E,13},{0x0000, 3},
+    {0x001D, 5},{0x0007, 5},{0x0004, 5},{0x0035, 6},
+    {0x0014, 5},{0x0036, 6},{0x0015, 5},{0x003C, 6}
+  },
+  {
+    {0x0004, 3},{0x000A, 4},{0x0007, 5},{0x001D, 5},
+    {0x0009, 6},{0x01F3, 9},{0x07C7,11},{0x0008, 6},
+    {0x01F0, 9},{0x0003, 3},{0x0002, 3},{0x000D, 4},
+    {0x000C, 4},{0x0017, 5},{0x007D, 7},{0x01F2, 9},
+    {0x07C6,11},{0x07C5,11},{0x1F12,13},{0x3E27,14},
+    {0x3E26,14},{0x1F11,13},{0x1F10,13},{0x0000, 3},
+    {0x001E, 5},{0x0006, 5},{0x0039, 6},{0x0038, 6},
+    {0x003F, 6},{0x002C, 6},{0x0005, 5},{0x002D, 6}
+  },
+  {
+    {0x0002, 3},{0x0007, 4},{0x0018, 5},{0x0003, 4},
+    {0x0005, 5},{0x0035, 7},{0x004F, 9},{0x0012, 7},
+    {0x04E5,13},{0x0005, 3},{0x0004, 3},{0x000D, 4},
+    {0x000E, 4},{0x0033, 6},{0x0026, 8},{0x009D,10},
+    {0x04E4,13},{0x04E7,13},{0x04E6,13},{0x04E1,13},
+    {0x04E0,13},{0x04E3,13},{0x04E2,13},{0x0000, 3},
+    {0x001F, 5},{0x000C, 5},{0x003D, 6},{0x003C, 6},
+    {0x0032, 6},{0x0034, 7},{0x001B, 6},{0x0008, 6}
+  },
+  {
+    {0x0000, 3},{0x0004, 4},{0x001C, 5},{0x000F, 4},
+    {0x0002, 4},{0x0007, 5},{0x0075, 7},{0x00E8, 8},
+    {0x1D2A,13},{0x0005, 3},{0x0004, 3},{0x000D, 4},
+    {0x000C, 4},{0x0077, 7},{0x0E96,12},{0x3A57,14},
+    {0x3A56,14},{0x3A5D,14},{0x3A5C,14},{0x3A5F,14},
+    {0x3A5E,14},{0x1D29,13},{0x1D28,13},{0x0003, 3},
+    {0x0006, 5},{0x000A, 5},{0x002C, 7},{0x0017, 6},
+    {0x0076, 7},{0x01D3, 9},{0x03A4,10},{0x002D, 7}
+  },
+  {
+    {0x000A, 4},{0x0024, 6},{0x00BF, 8},{0x0085, 8},
+    {0x0211,10},{0x0842,12},{0x1087,13},{0x0018, 5},
+    {0x0020, 6},{0x0001, 3},{0x0002, 3},{0x000E, 4},
+    {0x000D, 4},{0x0007, 4},{0x0013, 5},{0x0025, 6},
+    {0x005E, 7},{0x0043, 7},{0x00BE, 8},{0x0109, 9},
+    {0x1086,13},{0x0841,12},{0x0840,12},{0x000F, 4},
+    {0x0001, 4},{0x0011, 5},{0x0000, 5},{0x002E, 6},
+    {0x0019, 5},{0x0001, 5},{0x0006, 4},{0x0016, 5}
+  },
+  {
+    {0x0002, 3},{0x000F, 5},{0x006F, 7},{0x0061, 7},
+    {0x0374,10},{0x1BA8,13},{0x3753,14},{0x0012, 5},
+    {0x0036, 6},{0x0000, 3},{0x0001, 3},{0x000A, 4},
+    {0x000B, 4},{0x001A, 5},{0x0031, 6},{0x0060, 7},
+    {0x00DC, 8},{0x01BB, 9},{0x06EB,11},{0x1BAB,13},
+    {0x3752,14},{0x3755,14},{0x3754,14},{0x000E, 4},
+    {0x0006, 4},{0x0013, 5},{0x000E, 5},{0x003E, 6},
+    {0x0008, 4},{0x001E, 5},{0x0019, 5},{0x003F, 6}
+  },
+  {
+    {0x0003, 3},{0x001C, 5},{0x0025, 6},{0x0024, 6},
+    {0x01DA, 9},{0x1DBD,13},{0x3B7C,14},{0x003C, 6},
+    {0x003D, 6},{0x0000, 3},{0x0001, 3},{0x000B, 4},
+    {0x000A, 4},{0x000B, 5},{0x0077, 7},{0x00EC, 8},
+    {0x03B6,10},{0x076E,11},{0x1DBF,13},{0x76FB,15},
+    {0x76FA,15},{0x3B79,14},{0x3B78,14},{0x000D, 4},
+    {0x001F, 5},{0x0013, 5},{0x000A, 5},{0x0008, 5},
+    {0x000C, 4},{0x0008, 4},{0x0009, 5},{0x003A, 6}
+  },
+  {
+    {0x0005, 3},{0x0003, 4},{0x0004, 5},{0x0010, 5},
+    {0x008F, 8},{0x0475,11},{0x11D1,13},{0x0079, 7},
+    {0x0027, 6},{0x0002, 3},{0x0003, 3},{0x0001, 4},
+    {0x0000, 4},{0x0026, 6},{0x0046, 7},{0x011C, 9},
+    {0x0477,11},{0x08ED,12},{0x11D0,13},{0x11D3,13},
+    {0x11D2,13},{0x11D9,13},{0x11D8,13},{0x000D, 4},
+    {0x001F, 5},{0x0012, 5},{0x0005, 5},{0x003D, 6},
+    {0x000C, 4},{0x000E, 4},{0x0022, 6},{0x0078, 7}
+  },
+  {
+    {0x0005, 3},{0x000C, 4},{0x001B, 5},{0x0000, 4},
+    {0x0006, 6},{0x03E2,10},{0x3E3D,14},{0x000F, 7},
+    {0x0034, 6},{0x0003, 3},{0x0002, 3},{0x001E, 5},
+    {0x001D, 5},{0x007D, 7},{0x01F0, 9},{0x07C6,11},
+    {0x3E3C,14},{0x3E3F,14},{0x3E3E,14},{0x3E39,14},
+    {0x3E38,14},{0x3E3B,14},{0x3E3A,14},{0x0008, 4},
+    {0x001C, 5},{0x0002, 5},{0x003F, 6},{0x0035, 6},
+    {0x0009, 4},{0x0001, 3},{0x000E, 7},{0x00F9, 8}
+  },
+  {
+    {0x0004, 3},{0x000B, 4},{0x0001, 4},{0x000A, 4},
+    {0x001E, 6},{0x00E0, 9},{0x0E1E,13},{0x0071, 8},
+    {0x0039, 7},{0x0007, 3},{0x0006, 3},{0x000D, 5},
+    {0x000C, 5},{0x0020, 7},{0x01C2,10},{0x1C3F,14},
+    {0x1C3E,14},{0x0E19,13},{0x0E18,13},{0x0E1B,13},
+    {0x0E1A,13},{0x0E1D,13},{0x0E1C,13},{0x0000, 4},
+    {0x0009, 5},{0x001D, 6},{0x001F, 6},{0x0011, 6},
+    {0x0005, 4},{0x0001, 3},{0x0043, 8},{0x0042, 8}
+  },
+  {
+    {0x0004, 3},{0x000D, 4},{0x0007, 4},{0x0002, 3},
+    {0x0014, 5},{0x016C, 9},{0x16D1,13},{0x02DF,10},
+    {0x016E, 9},{0x0000, 2},{0x0007, 3},{0x002C, 6},
+    {0x002B, 6},{0x02DE,10},{0x16D0,13},{0x16D3,13},
+    {0x16D2,13},{0x2DB5,14},{0x2DB4,14},{0x2DB7,14},
+    {0x2DB6,14},{0x16D9,13},{0x16D8,13},{0x000C, 5},
+    {0x002A, 6},{0x005A, 7},{0x001B, 6},{0x001A, 6},
+    {0x0017, 5},{0x000C, 4},{0x05B7,11},{0x05B5,11}
+  },
+  {
+    {0x0002, 2},{0x000F, 4},{0x001C, 5},{0x000C, 4},
+    {0x003B, 6},{0x01AC, 9},{0x1AD8,13},{0x35B3,14},
+    {0x35B2,14},{0x0001, 2},{0x0000, 2},{0x0069, 7},
+    {0x0068, 7},{0x35BD,14},{0x35BC,14},{0x35BF,14},
+    {0x35BE,14},{0x35B9,14},{0x35B8,14},{0x35BB,14},
+    {0x35BA,14},{0x35B5,14},{0x35B4,14},{0x01A9, 9},
+    {0x01A8, 9},{0x035A,10},{0x00D7, 8},{0x00D5, 8},
+    {0x003A, 6},{0x001B, 5},{0x35B7,14},{0x35B6,14}
+  },
+  {
+    {0x0000, 3},{0x0010, 5},{0x0072, 7},{0x0071, 7},
+    {0x0154, 9},{0x0AAB,12},{0x0AA8,12},{0x0014, 5},
+    {0x0070, 7},{0x0002, 3},{0x0003, 3},{0x000C, 4},
+    {0x000B, 4},{0x0003, 4},{0x0011, 5},{0x0073, 7},
+    {0x0054, 7},{0x00AB, 8},{0x02AB,10},{0x1553,13},
+    {0x1552,13},{0x1555,13},{0x1554,13},{0x000D, 4},
+    {0x001E, 5},{0x0012, 5},{0x003E, 6},{0x002B, 6},
+    {0x0002, 4},{0x003F, 6},{0x001D, 5},{0x0013, 5}
+  },
+  {
+    {0x0003, 3},{0x001F, 5},{0x0029, 6},{0x003D, 6},
+    {0x000C, 7},{0x0069,10},{0x0345,13},{0x0002, 5},
+    {0x0028, 6},{0x0002, 3},{0x0001, 3},{0x000E, 4},
+    {0x000C, 4},{0x0015, 5},{0x0007, 6},{0x001B, 8},
+    {0x006B,10},{0x006A,10},{0x0344,13},{0x0347,13},
+    {0x0346,13},{0x01A1,12},{0x01A0,12},{0x000B, 4},
+    {0x001A, 5},{0x0012, 5},{0x0000, 5},{0x003C, 6},
+    {0x0008, 4},{0x001B, 5},{0x0013, 5},{0x0001, 5}
+  },
+  {
+    {0x0004, 3},{0x0004, 4},{0x003F, 6},{0x0014, 5},
+    {0x0056, 7},{0x015C, 9},{0x15D5,13},{0x003C, 6},
+    {0x002A, 6},{0x0000, 3},{0x0001, 3},{0x000E, 4},
+    {0x000D, 4},{0x000C, 5},{0x00AF, 8},{0x02BB,10},
+    {0x15D4,13},{0x15D7,13},{0x15D6,13},{0x15D1,13},
+    {0x15D0,13},{0x15D3,13},{0x15D2,13},{0x000B, 4},
+    {0x0019, 5},{0x000D, 5},{0x003E, 6},{0x0031, 6},
+    {0x0007, 4},{0x0005, 4},{0x003D, 6},{0x0030, 6}
+  },
+  {
+    {0x0005, 3},{0x0008, 4},{0x001A, 5},{0x0000, 4},
+    {0x0036, 6},{0x0011, 8},{0x0106,12},{0x000A, 7},
+    {0x006E, 7},{0x0002, 3},{0x0003, 3},{0x0003, 4},
+    {0x0002, 4},{0x006F, 7},{0x0021, 9},{0x020F,13},
+    {0x020E,13},{0x0101,12},{0x0100,12},{0x0103,12},
+    {0x0102,12},{0x0105,12},{0x0104,12},{0x000C, 4},
+    {0x001E, 5},{0x0003, 5},{0x003E, 6},{0x003F, 6},
+    {0x0009, 4},{0x000E, 4},{0x000B, 7},{0x0009, 7}
+  },
+  {
+    {0x0002, 3},{0x000E, 4},{0x001E, 5},{0x000C, 4},
+    {0x001F, 5},{0x006E, 7},{0x00AD,10},{0x00AF,10},
+    {0x0014, 7},{0x0004, 3},{0x0003, 3},{0x001A, 5},
+    {0x0017, 5},{0x002A, 8},{0x0576,13},{0x0AEF,14},
+    {0x0AEE,14},{0x0571,13},{0x0570,13},{0x0573,13},
+    {0x0572,13},{0x0575,13},{0x0574,13},{0x0003, 4},
+    {0x0016, 5},{0x0004, 5},{0x0036, 6},{0x000B, 6},
+    {0x000A, 4},{0x0000, 3},{0x006F, 7},{0x00AC,10}
+  },
+  {
+    {0x0004, 3},{0x0005, 4},{0x0003, 3},{0x0001, 3},
+    {0x0004, 4},{0x002F, 6},{0x0526,11},{0x1495,13},
+    {0x00A6, 8},{0x0007, 3},{0x0006, 3},{0x002D, 6},
+    {0x002C, 6},{0x1494,13},{0x1497,13},{0x1496,13},
+    {0x1491,13},{0x1490,13},{0x1493,13},{0x1492,13},
+    {0x293D,14},{0x293C,14},{0x293F,14},{0x0000, 3},
+    {0x0028, 6},{0x00A5, 8},{0x0148, 9},{0x00A7, 8},
+    {0x002E, 6},{0x0015, 5},{0x0A4E,12},{0x293E,14}
+  },
+  {
+    {0x0004, 3},{0x0005, 4},{0x0003, 3},{0x0001, 3},
+    {0x0004, 4},{0x002F, 6},{0x0526,11},{0x1495,13},
+    {0x00A6, 8},{0x0007, 3},{0x0006, 3},{0x002D, 6},
+    {0x002C, 6},{0x1494,13},{0x1497,13},{0x1496,13},
+    {0x1491,13},{0x1490,13},{0x1493,13},{0x1492,13},
+    {0x293D,14},{0x293C,14},{0x293F,14},{0x0000, 3},
+    {0x0028, 6},{0x00A5, 8},{0x0148, 9},{0x00A7, 8},
+    {0x002E, 6},{0x0015, 5},{0x0A4E,12},{0x293E,14}
+  },
+  {
+    {0x0004, 3},{0x0005, 4},{0x0003, 3},{0x0001, 3},
+    {0x0004, 4},{0x002F, 6},{0x0526,11},{0x1495,13},
+    {0x00A6, 8},{0x0007, 3},{0x0006, 3},{0x002D, 6},
+    {0x002C, 6},{0x1494,13},{0x1497,13},{0x1496,13},
+    {0x1491,13},{0x1490,13},{0x1493,13},{0x1492,13},
+    {0x293D,14},{0x293C,14},{0x293F,14},{0x0000, 3},
+    {0x0028, 6},{0x00A5, 8},{0x0148, 9},{0x00A7, 8},
+    {0x002E, 6},{0x0015, 5},{0x0A4E,12},{0x293E,14}
+  },
+  {
+    {0x0003, 3},{0x0011, 5},{0x0020, 6},{0x0074, 7},
+    {0x010D, 9},{0x0863,12},{0x0860,12},{0x000A, 5},
+    {0x0075, 7},{0x0001, 3},{0x0000, 3},{0x000B, 4},
+    {0x000A, 4},{0x0018, 5},{0x0038, 6},{0x0042, 7},
+    {0x010F, 9},{0x010E, 9},{0x0219,10},{0x10C3,13},
+    {0x10C2,13},{0x10C5,13},{0x10C4,13},{0x000F, 4},
+    {0x0004, 4},{0x0019, 5},{0x000B, 5},{0x0039, 6},
+    {0x0009, 4},{0x001B, 5},{0x001A, 5},{0x003B, 6}
+  },
+  {
+    {0x0005, 3},{0x0001, 4},{0x003E, 6},{0x0001, 5},
+    {0x00E2, 8},{0x1C6F,13},{0x38D9,14},{0x0039, 6},
+    {0x001F, 6},{0x0002, 3},{0x0001, 3},{0x0009, 4},
+    {0x0008, 4},{0x0000, 5},{0x0070, 7},{0x01C7, 9},
+    {0x038C,10},{0x071A,11},{0x38D8,14},{0x38DB,14},
+    {0x38DA,14},{0x38DD,14},{0x38DC,14},{0x000D, 4},
+    {0x001D, 5},{0x000E, 5},{0x003F, 6},{0x003C, 6},
+    {0x000C, 4},{0x0006, 4},{0x003D, 6},{0x001E, 6}
+  },
+  {
+    {0x0006, 3},{0x000B, 4},{0x0011, 5},{0x001E, 5},
+    {0x0074, 7},{0x03AA,10},{0x1D5C,13},{0x0001, 6},
+    {0x0021, 6},{0x0001, 3},{0x0002, 3},{0x0007, 4},
+    {0x0006, 4},{0x003E, 6},{0x00EB, 8},{0x01D4, 9},
+    {0x0EAF,12},{0x3ABB,14},{0x3ABA,14},{0x1D59,13},
+    {0x1D58,13},{0x1D5B,13},{0x1D5A,13},{0x000A, 4},
+    {0x001C, 5},{0x0001, 5},{0x003F, 6},{0x003B, 6},
+    {0x0001, 4},{0x0009, 4},{0x0020, 6},{0x0000, 6}
+  },
+  {
+    {0x0004, 3},{0x000A, 4},{0x0017, 5},{0x0004, 4},
+    {0x0016, 6},{0x016A, 9},{0x16B1,13},{0x0017, 7},
+    {0x005B, 7},{0x0006, 3},{0x0007, 3},{0x0001, 4},
+    {0x0000, 4},{0x000A, 6},{0x02D7,10},{0x0B5A,12},
+    {0x16B0,13},{0x16B3,13},{0x16B2,13},{0x2D6D,14},
+    {0x2D6C,14},{0x2D6F,14},{0x2D6E,14},{0x0006, 4},
+    {0x000A, 5},{0x0004, 5},{0x002C, 6},{0x0017, 6},
+    {0x0003, 4},{0x0007, 4},{0x0016, 7},{0x00B4, 8}
+  },
+  {
+    {0x0005, 3},{0x000D, 4},{0x0005, 4},{0x0009, 4},
+    {0x0033, 6},{0x0193, 9},{0x192C,13},{0x0061, 8},
+    {0x0031, 7},{0x0000, 2},{0x0007, 3},{0x0010, 5},
+    {0x0011, 5},{0x00C8, 8},{0x192F,13},{0x325B,14},
+    {0x325A,14},{0x1929,13},{0x1928,13},{0x192B,13},
+    {0x192A,13},{0x325D,14},{0x325C,14},{0x0018, 5},
+    {0x001A, 6},{0x001B, 6},{0x0065, 7},{0x0019, 6},
+    {0x0004, 4},{0x0007, 4},{0x0060, 8},{0x0324,10}
+  },
+  {
+    {0x0006, 3},{0x0000, 3},{0x0002, 4},{0x000F, 4},
+    {0x0039, 6},{0x01D9, 9},{0x1D82,13},{0x0761,11},
+    {0x03BE,10},{0x0001, 2},{0x0002, 2},{0x000F, 6},
+    {0x000E, 6},{0x0762,11},{0x3B07,14},{0x3B06,14},
+    {0x3B1D,14},{0x3B1C,14},{0x3B1F,14},{0x3B1E,14},
+    {0x3B19,14},{0x3B18,14},{0x3B1B,14},{0x0038, 6},
+    {0x01DE, 9},{0x00ED, 8},{0x03BF,10},{0x00EE, 8},
+    {0x003A, 6},{0x0006, 5},{0x0EC0,12},{0x3B1A,14}
+  },
+  {
+    {0x0000, 2},{0x0002, 3},{0x000F, 5},{0x0006, 4},
+    {0x001C, 6},{0x01D0,10},{0x0E8C,13},{0x1D1B,14},
+    {0x1D1A,14},{0x0003, 2},{0x0002, 2},{0x00EA, 9},
+    {0x00E9, 9},{0x0E89,13},{0x0E88,13},{0x0E8B,13},
+    {0x0E8A,13},{0x1D65,14},{0x1D64,14},{0x1D67,14},
+    {0x1D66,14},{0x1D61,14},{0x1D60,14},{0x03AD,11},
+    {0x1D63,14},{0x1D62,14},{0x1D1D,14},{0x1D1C,14},
+    {0x003B, 7},{0x01D7,10},{0x1D1F,14},{0x1D1E,14}
+  },
+  {
+    {0x0002, 2},{0x000F, 4},{0x001C, 5},{0x000C, 4},
+    {0x003B, 6},{0x01AC, 9},{0x1AD8,13},{0x35B3,14},
+    {0x35B2,14},{0x0001, 2},{0x0000, 2},{0x0069, 7},
+    {0x0068, 7},{0x35BD,14},{0x35BC,14},{0x35BF,14},
+    {0x35BE,14},{0x35B9,14},{0x35B8,14},{0x35BB,14},
+    {0x35BA,14},{0x35B5,14},{0x35B4,14},{0x01A9, 9},
+    {0x01A8, 9},{0x035A,10},{0x00D7, 8},{0x00D5, 8},
+    {0x003A, 6},{0x001B, 5},{0x35B7,14},{0x35B6,14}
+  }
+};
+
+
+
+/*One Huffman code, in the aligned form used while rebuilding the code tree.*/
+typedef struct{
+  /*The bit pattern, left-shifted so that the MSB of all patterns is aligned
+     (each code is padded on the right up to the table's longest code).*/
+  ogg_uint32_t pattern;
+  /*The amount the bit pattern was shifted (longest length minus its own).*/
+  int          shift;
+  /*The token this bit pattern represents (its index in the code table).*/
+  int          token;
+}oc_huff_entry;
+
+
+
+/*qsort() comparator that orders oc_huff_entry structures by bit pattern.
+  _c1: The first entry to compare.
+  _c2: The second entry to compare.
+  Return: -1 if _c1<_c2, 1 if _c1>_c2, and 0 when the patterns are equal.*/
+static int huff_entry_cmp(const void *_c1,const void *_c2){
+  const oc_huff_entry *lhs;
+  const oc_huff_entry *rhs;
+  lhs=(const oc_huff_entry *)_c1;
+  rhs=(const oc_huff_entry *)_c2;
+  return (lhs->pattern>rhs->pattern)-(lhs->pattern<rhs->pattern);
+}
+
+/*Encodes a description of the given Huffman tables.
+  Although the codes are stored in the encoder as flat arrays, in the bit
+   stream and in the decoder they are structured as a tree.
+  This function recovers the tree structure from the flat arrays and writes it
+   out, validating as it goes: on error _opb may already hold a partial tree.
+  The codes MUST form a full Huffman code, not merely a prefix-free code.
+  _opb:   The buffer to store the tree in.
+  _codes: The Huffman tables to pack.
+  Return: 0 on success, or TH_EINVAL if a table is not a full, prefix-free
+   code or contains a code longer than 32 bits.*/
+int oc_huff_codes_pack(oggpack_buffer *_opb,
+ const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]){
+  int i;
+  for(i=0;i<TH_NHUFFMAN_TABLES;i++){
+    oc_huff_entry entries[TH_NDCT_TOKENS];
+    ogg_uint32_t  mask;
+    int           bpos;
+    int           maxlen;
+    int           j;
+    /*First, find the maximum code length so we can align all the bit
+       patterns.*/
+    maxlen=_codes[i][0].nbits;
+    for(j=1;j<TH_NDCT_TOKENS;j++)maxlen=OC_MAXI(_codes[i][j].nbits,maxlen);
+    /*A pattern holds 32 bits, so a longer code can never be aligned.*/
+    if(maxlen>32)return TH_EINVAL;
+    /*Unsigned, two half-shifts: well defined for every maxlen from 0 to 32.*/
+    mask=((ogg_uint32_t)1<<(maxlen>>1)<<(maxlen+1>>1))-1;
+    /*Copy over the codes into our temporary workspace.
+      The bit patterns are aligned, and the original entry each code is from
+       is stored as well.*/
+    for(j=0;j<TH_NDCT_TOKENS;j++){
+      entries[j].shift=maxlen-_codes[i][j].nbits;
+      entries[j].pattern=_codes[i][j].pattern<<entries[j].shift&mask;
+      entries[j].token=j;
+    }
+    /*Sort the codes into ascending order.
+      This is the order the leaves of the tree will be traversed.*/
+    qsort(entries,TH_NDCT_TOKENS,sizeof(entries[0]),huff_entry_cmp);
+    /*For each leaf of the tree:*/
+    bpos=maxlen;
+    for(j=0;j<TH_NDCT_TOKENS;j++){
+      ogg_uint32_t bit;
+      /*If this code has any bits at all.*/
+      if(entries[j].shift<maxlen){
+        /*Descend into the tree, writing a bit for each branch.*/
+        for(;bpos>entries[j].shift;bpos--)oggpackB_write(_opb,0,1);
+        /*Mark this as a leaf node, and write its value.*/
+        oggpackB_write(_opb,1,1);
+        oggpackB_write(_opb,entries[j].token,5);
+        /*For each 1 branch we've descended, back up the tree until we reach a
+           0 branch.*/
+        bit=(ogg_uint32_t)1<<bpos;
+        for(;entries[j].pattern&bit;bpos++)bit<<=1;
+        /*Validate the code.*/
+        if(j+1<TH_NDCT_TOKENS){
+          mask=~(bit-1)<<1;
+          /*The next entry should have a 1 bit where we had a 0, and should
+             match our code above that bit.
+            This verifies both fullness and prefix-freeness simultaneously.*/
+          if(!(entries[j+1].pattern&bit)||
+           (entries[j].pattern&mask)!=(entries[j+1].pattern&mask)){
+            return TH_EINVAL;
+          }
+        }
+        /*If there are no more codes, we should have ascended back to the top
+           of the tree.*/
+        else if(bpos<maxlen)return TH_EINVAL;
+      }
+    }
+  }
+  return 0;
+}
diff --git a/lib/huffenc.h b/lib/huffenc.h
new file mode 100644
index 0000000..c5a3956
--- /dev/null
+++ b/lib/huffenc.h
@@ -0,0 +1,19 @@
+#if !defined(_huffenc_H)
+# define _huffenc_H (1)
+# include "huffman.h"
+
+
+/*One Huffman table: a th_huff_code for each of the TH_NDCT_TOKENS tokens.*/
+typedef th_huff_code                  th_huff_table[TH_NDCT_TOKENS];
+
+
+/*NOTE(review): presumably the stock VP3.1 code tables - confirm.*/
+extern const th_huff_code
+ TH_VP31_HUFF_CODES[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
+
+/*Writes all TH_NHUFFMAN_TABLES tables in _codes to _opb in their tree form.
+  Return: 0 on success, or TH_EINVAL if a table is not a valid Huffman code.*/
+int oc_huff_codes_pack(oggpack_buffer *_opb,
+ const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]);
+
+#endif
diff --git a/lib/huffman.h b/lib/huffman.h
new file mode 100644
index 0000000..36cf757
--- /dev/null
+++ b/lib/huffman.h
@@ -0,0 +1,70 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: huffman.h 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#if !defined(_huffman_H)
+# define _huffman_H (1)
+# include "theora/codec.h"
+# include "ocintrin.h"
+
+/*The range of valid quantized DCT coefficient values.
+  VP3 used 511 in the encoder, but the bitstream is capable of 580.*/
+#define OC_DCT_VAL_RANGE         (580)
+/*NOTE(review): presumably the bits in a token index (32 tokens) - confirm.*/
+#define OC_NDCT_TOKEN_BITS       (5)
+/*Token indices 0-6: the EOB and repeat-run tokens.*/
+#define OC_DCT_EOB1_TOKEN        (0)
+#define OC_DCT_EOB2_TOKEN        (1)
+#define OC_DCT_EOB3_TOKEN        (2)
+#define OC_DCT_REPEAT_RUN0_TOKEN (3)
+#define OC_DCT_REPEAT_RUN1_TOKEN (4)
+#define OC_DCT_REPEAT_RUN2_TOKEN (5)
+#define OC_DCT_REPEAT_RUN3_TOKEN (6)
+/*Token indices 7-8: the zero-run (ZRL) tokens.*/
+#define OC_DCT_SHORT_ZRL_TOKEN   (7)
+#define OC_DCT_ZRL_TOKEN         (8)
+/*Token indices 9-12: the fixed values +1, -1, +2 and -2.*/
+#define OC_ONE_TOKEN             (9)
+#define OC_MINUS_ONE_TOKEN       (10)
+#define OC_TWO_TOKEN             (11)
+#define OC_MINUS_TWO_TOKEN       (12)
+/*Token indices 13-22: value categories (CAT2 covers four indices).*/
+#define OC_DCT_VAL_CAT2          (13)
+#define OC_DCT_VAL_CAT3          (17)
+#define OC_DCT_VAL_CAT4          (18)
+#define OC_DCT_VAL_CAT5          (19)
+#define OC_DCT_VAL_CAT6          (20)
+#define OC_DCT_VAL_CAT7          (21)
+#define OC_DCT_VAL_CAT8          (22)
+/*Token indices 23-31: run categories (CAT1A covers five indices).*/
+#define OC_DCT_RUN_CAT1A         (23)
+#define OC_DCT_RUN_CAT1B         (28)
+#define OC_DCT_RUN_CAT1C         (29)
+#define OC_DCT_RUN_CAT2A         (30)
+#define OC_DCT_RUN_CAT2B         (31)
+/*One past the last token index of each group above.*/
+#define OC_NDCT_EOB_TOKEN_MAX    (7)
+#define OC_NDCT_ZRL_TOKEN_MAX    (9)
+#define OC_NDCT_VAL_MAX          (23)
+#define OC_NDCT_VAL_CAT1_MAX     (13)
+#define OC_NDCT_VAL_CAT2_MAX     (17)
+#define OC_NDCT_VAL_CAT2_SIZE    (OC_NDCT_VAL_CAT2_MAX-OC_DCT_VAL_CAT2)
+#define OC_NDCT_RUN_MAX          (32)
+#define OC_NDCT_RUN_CAT1A_MAX    (28)
+/*Indexed by token.  NOTE(review): defined elsewhere; contents not shown.*/
+extern const unsigned char OC_DCT_TOKEN_EXTRA_BITS[TH_NDCT_TOKENS];
+
+#endif
diff --git a/lib/idct.c b/lib/idct.c
new file mode 100644
index 0000000..0e68ac7
--- /dev/null
+++ b/lib/idct.c
@@ -0,0 +1,335 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: idct.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#include <string.h>
+#include "internal.h"
+#include "dct.h"
+
+/*Performs a one-dimensional, 8 point inverse Type-II DCT on a single row.
+  The result is twice as large as the orthonormal version of the transform
+   would produce.
+  _y: Where to store the result.
+      The 8 outputs are written 8 entries apart (e.g., down a column of an
+       8x8 block).
+  _x: The coefficients to transform.
+      All 8 entries are read (e.g., from a row of an 8x8 block).*/
+static void idct8(ogg_int16_t *_y,const ogg_int16_t _x[8]){
+  ogg_int32_t t[8];
+  ogg_int32_t d;
+  /*Stage 1: every input is read here, before anything is stored to _y.*/
+  /*Pair 0-1: scaled sum and difference of _x[0] and _x[4].*/
+  t[0]=OC_C4S4*(ogg_int16_t)(_x[0]+_x[4])>>16;
+  t[1]=OC_C4S4*(ogg_int16_t)(_x[0]-_x[4])>>16;
+  /*Pair 2-3: rotation by 6pi/16.*/
+  t[2]=(OC_C6S2*_x[2]>>16)-(OC_C2S6*_x[6]>>16);
+  t[3]=(OC_C2S6*_x[2]>>16)+(OC_C6S2*_x[6]>>16);
+  /*Pair 4-7: rotation by 7pi/16.*/
+  t[4]=(OC_C7S1*_x[1]>>16)-(OC_C1S7*_x[7]>>16);
+  t[7]=(OC_C1S7*_x[1]>>16)+(OC_C7S1*_x[7]>>16);
+  /*Pair 5-6: rotation by 3pi/16.*/
+  t[5]=(OC_C3S5*_x[5]>>16)-(OC_C5S3*_x[3]>>16);
+  t[6]=(OC_C5S3*_x[5]>>16)+(OC_C3S5*_x[3]>>16);
+  /*Stage 2: the differences are narrowed to 16 bits, then scaled.*/
+  /*4-5 butterfly.*/
+  d=t[4]-t[5];
+  t[4]+=t[5];
+  t[5]=OC_C4S4*(ogg_int16_t)d>>16;
+  /*7-6 butterfly.*/
+  d=t[7]-t[6];
+  t[7]+=t[6];
+  t[6]=OC_C4S4*(ogg_int16_t)d>>16;
+  /*Stage 3: plain sum and difference butterflies.*/
+  /*0-3 butterfly.*/
+  d=t[0]-t[3];
+  t[0]+=t[3];
+  t[3]=d;
+  /*1-2 butterfly.*/
+  d=t[1]-t[2];
+  t[1]+=t[2];
+  t[2]=d;
+  /*6-5 butterfly.*/
+  d=t[6]-t[5];
+  t[6]+=t[5];
+  t[5]=d;
+  /*Stage 4: mirrored outputs, stored 8 entries apart.*/
+  /*0-7 butterfly.*/
+  _y[0<<3]=(ogg_int16_t)(t[0]+t[7]);
+  _y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
+  /*1-6 butterfly.*/
+  _y[1<<3]=(ogg_int16_t)(t[1]+t[6]);
+  _y[6<<3]=(ogg_int16_t)(t[1]-t[6]);
+  /*2-5 butterfly.*/
+  _y[2<<3]=(ogg_int16_t)(t[2]+t[5]);
+  _y[5<<3]=(ogg_int16_t)(t[2]-t[5]);
+  /*3-4 butterfly.*/
+  _y[3<<3]=(ogg_int16_t)(t[3]+t[4]);
+  _y[4<<3]=(ogg_int16_t)(t[3]-t[4]);
+}
+
+/*Performs a one-dimensional, 8 point inverse Type-II DCT on a row whose last
+   4 coefficients are zero.
+  The result is twice as large as the orthonormal version of the transform
+   would produce.
+  _y: Where to store the result.
+      The 8 outputs are written 8 entries apart (e.g., down a column of an
+       8x8 block).
+  _x: The coefficients to transform.
+      Only _x[0] through _x[3] are read; the rest are taken to be 0.*/
+static void idct8_4(ogg_int16_t *_y,const ogg_int16_t _x[8]){
+  ogg_int32_t t[8];
+  ogg_int32_t d;
+  /*Stage 1: with half the inputs zero, each term is a single product.*/
+  t[0]=OC_C4S4*_x[0]>>16;
+  t[4]=OC_C7S1*_x[1]>>16;
+  t[7]=OC_C1S7*_x[1]>>16;
+  t[2]=OC_C6S2*_x[2]>>16;
+  t[3]=OC_C2S6*_x[2]>>16;
+  t[5]=-(OC_C5S3*_x[3]>>16);
+  t[6]=OC_C3S5*_x[3]>>16;
+  /*Stage 2: 4-5 and 7-6 butterflies; differences are narrowed to 16 bits.*/
+  d=t[4]-t[5];
+  t[4]+=t[5];
+  t[5]=OC_C4S4*(ogg_int16_t)d>>16;
+  d=t[7]-t[6];
+  t[7]+=t[6];
+  t[6]=OC_C4S4*(ogg_int16_t)d>>16;
+  /*Stage 3: t[1] and t[2] are both formed from t[0] before it is updated.*/
+  t[1]=t[0]+t[2];
+  t[2]=t[0]-t[2];
+  d=t[0]-t[3];
+  t[0]+=t[3];
+  t[3]=d;
+  d=t[6]-t[5];
+  t[6]+=t[5];
+  t[5]=d;
+  /*Stage 4: mirrored outputs, stored 8 entries apart.*/
+  _y[0<<3]=(ogg_int16_t)(t[0]+t[7]);
+  _y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
+  _y[1<<3]=(ogg_int16_t)(t[1]+t[6]);
+  _y[6<<3]=(ogg_int16_t)(t[1]-t[6]);
+  _y[2<<3]=(ogg_int16_t)(t[2]+t[5]);
+  _y[5<<3]=(ogg_int16_t)(t[2]-t[5]);
+  _y[3<<3]=(ogg_int16_t)(t[3]+t[4]);
+  _y[4<<3]=(ogg_int16_t)(t[3]-t[4]);
+}
+
+/*Performs a one-dimensional, 8 point inverse Type-II DCT on a row whose last
+   5 coefficients are zero.
+  The result is twice as large as the orthonormal version of the transform
+   would produce.
+  _y: Where to store the result.
+      The 8 outputs are written 8 entries apart (e.g., down a column of an
+       8x8 block).
+  _x: The coefficients to transform.
+      Only _x[0] through _x[2] are read; the rest are taken to be 0.*/
+static void idct8_3(ogg_int16_t *_y,const ogg_int16_t _x[8]){
+  ogg_int32_t t[8];
+  ogg_int32_t d;
+  /*Stage 1: each term is a single product of one of the three inputs.*/
+  t[0]=OC_C4S4*_x[0]>>16;
+  t[4]=OC_C7S1*_x[1]>>16;
+  t[7]=OC_C1S7*_x[1]>>16;
+  t[2]=OC_C6S2*_x[2]>>16;
+  t[3]=OC_C2S6*_x[2]>>16;
+  /*Stage 2: t[5] and t[6] are scaled copies of t[4] and t[7].*/
+  t[6]=OC_C4S4*t[7]>>16;
+  t[5]=OC_C4S4*t[4]>>16;
+  /*Stage 3: t[1] and t[2] are both formed from t[0] before it is updated.*/
+  t[1]=t[0]+t[2];
+  t[2]=t[0]-t[2];
+  d=t[0]-t[3];
+  t[0]+=t[3];
+  t[3]=d;
+  d=t[6]-t[5];
+  t[6]+=t[5];
+  t[5]=d;
+  /*Stage 4: mirrored outputs, stored 8 entries apart.*/
+  _y[0<<3]=(ogg_int16_t)(t[0]+t[7]);
+  _y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
+  _y[1<<3]=(ogg_int16_t)(t[1]+t[6]);
+  _y[6<<3]=(ogg_int16_t)(t[1]-t[6]);
+  _y[2<<3]=(ogg_int16_t)(t[2]+t[5]);
+  _y[5<<3]=(ogg_int16_t)(t[2]-t[5]);
+  _y[3<<3]=(ogg_int16_t)(t[3]+t[4]);
+  _y[4<<3]=(ogg_int16_t)(t[3]-t[4]);
+}
+
+/*Performs a one-dimensional, 8 point inverse Type-II DCT on a row whose last
+   6 coefficients are zero.
+  The result is twice as large as the orthonormal version of the transform
+   would produce.
+  _y: Where to store the result.
+      The 8 outputs are written 8 entries apart (e.g., down a column of an
+       8x8 block).
+  _x: The coefficients to transform.
+      Only _x[0] and _x[1] are read; the rest are taken to be 0.*/
+static void idct8_2(ogg_int16_t *_y,const ogg_int16_t _x[8]){
+  ogg_int32_t t[8];
+  ogg_int32_t d;
+  /*Stage 1: each term is a single product of one of the two inputs.*/
+  t[0]=OC_C4S4*_x[0]>>16;
+  t[4]=OC_C7S1*_x[1]>>16;
+  t[7]=OC_C1S7*_x[1]>>16;
+  /*Stage 2: t[5] and t[6] are scaled copies of t[4] and t[7].*/
+  t[6]=OC_C4S4*t[7]>>16;
+  t[5]=OC_C4S4*t[4]>>16;
+  /*Stage 3: only the 6-5 butterfly has anything to combine.*/
+  d=t[6]-t[5];
+  t[6]+=t[5];
+  t[5]=d;
+  /*Stage 4: t[0] feeds all four mirrored output pairs.*/
+  _y[0<<3]=(ogg_int16_t)(t[0]+t[7]);
+  _y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
+  _y[1<<3]=(ogg_int16_t)(t[0]+t[6]);
+  _y[6<<3]=(ogg_int16_t)(t[0]-t[6]);
+  _y[2<<3]=(ogg_int16_t)(t[0]+t[5]);
+  _y[5<<3]=(ogg_int16_t)(t[0]-t[5]);
+  _y[3<<3]=(ogg_int16_t)(t[0]+t[4]);
+  _y[4<<3]=(ogg_int16_t)(t[0]-t[4]);
+}
+
+/*Performs a one-dimensional, 8 point inverse Type-II DCT on a row that holds
+   nothing but its first coefficient.
+  The result is twice as large as the orthonormal version of the transform
+   would produce.
+  _y: Where to store the result.
+      The same value is written to 8 entries, 8 entries apart.
+  _x: The coefficients to transform; only _x[0] is read.*/
+static void idct8_1(ogg_int16_t *_y,const ogg_int16_t _x[1]){
+  ogg_int16_t dc;
+  int         i;
+  dc=(ogg_int16_t)(OC_C4S4*_x[0]>>16);
+  for(i=0;i<8;i++)_y[i<<3]=dc;
+}
+
+/*Performs an inverse 8x8 Type-II DCT transform.
+  The input is assumed to be scaled by a factor of 4 relative to the
+   orthonormal version of the transform.
+  All coefficients but the first 3 in zig-zag scan order are assumed to be 0:
+   x  x  0  0  0  0  0  0
+   x  0  0  0  0  0  0  0
+   0  0  0  0  0  0  0  0
+   0  0  0  0  0  0  0  0
+   0  0  0  0  0  0  0  0
+   0  0  0  0  0  0  0  0
+   0  0  0  0  0  0  0  0
+   0  0  0  0  0  0  0  0
+  Only _x[0], _x[1], and _x[8] are read, and all reads are finished before
+   anything is stored to _y.
+  _y: The buffer to store the result in.
+      This may be the same as _x.
+  _x: The input coefficients.*/
+static void oc_idct8x8_3(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+  ogg_int16_t w[64];
+  int         i;
+  /*First pass: rows 0 and 1 of x become columns 0 and 1 of w.*/
+  idct8_2(w,_x);
+  idct8_1(w+1,_x+8);
+  /*Second pass: row i of w becomes column i of y.*/
+  for(i=0;i<8;i++)idct8_2(_y+i,w+(i<<3));
+  /*Adjust for the scale factor: add 8, then shift right by 4.*/
+  for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
+}
+
+/*Performs an inverse 8x8 Type-II DCT transform.
+  The input is assumed to be scaled by a factor of 4 relative to the
+   orthonormal version of the transform.
+  All coefficients but the first 10 in zig-zag scan order are assumed to be 0:
+   x  x  x  x  0  0  0  0
+   x  x  x  0  0  0  0  0
+   x  x  0  0  0  0  0  0
+   x  0  0  0  0  0  0  0
+   0  0  0  0  0  0  0  0
+   0  0  0  0  0  0  0  0
+   0  0  0  0  0  0  0  0
+   0  0  0  0  0  0  0  0
+  Only the 10 entries marked x are read, and all reads are finished before
+   anything is stored to _y.
+  _y: The buffer to store the result in.
+      This may be the same as _x.
+  _x: The input coefficients.*/
+static void oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+  ogg_int16_t w[64];
+  int         i;
+  /*First pass: rows 0 through 3 of x become columns 0 through 3 of w.*/
+  idct8_4(w,_x);
+  idct8_3(w+1,_x+8);
+  idct8_2(w+2,_x+16);
+  idct8_1(w+3,_x+24);
+  /*Second pass: row i of w becomes column i of y.*/
+  for(i=0;i<8;i++)idct8_4(_y+i,w+(i<<3));
+  /*Adjust for the scale factor: add 8, then shift right by 4.*/
+  for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
+}
+
+/*Performs an inverse 8x8 Type-II DCT transform on a full block.
+  The input is assumed to be scaled by a factor of 4 relative to the
+   orthonormal version of the transform.
+  All 64 coefficients are consumed by the first pass, before anything is
+   stored to _y.
+  _y: The buffer to store the result in.
+      This may be the same as _x.
+  _x: The input coefficients.*/
+static void oc_idct8x8_slow(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+  ogg_int16_t w[64];
+  int         i;
+  /*First pass: row i of x becomes column i of w.*/
+  for(i=0;i<8;i++)idct8(w+i,_x+(i<<3));
+  /*Second pass: row i of w becomes column i of y.*/
+  for(i=0;i<8;i++)idct8(_y+i,w+(i<<3));
+  /*Adjust for the scale factor: add 8, then shift right by 4.*/
+  for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
+}
+
+void oc_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],
+ int _last_zzi){
+  _state->opt_vtable.idct8x8(_y,_last_zzi); /*Dispatch via the vtable.*/
+}
+
+/*Performs an inverse 8x8 Type-II DCT transform, in place.
+  The input is assumed to be scaled by a factor of 4 relative to the
+   orthonormal version of the transform.
+  _y:        The coefficients on entry; overwritten with the result.
+  _last_zzi: The zig-zag index reached before the block's last token.*/
+void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi){
+  /*_last_zzi is not quite the number of coefficients decoded for this block:
+     it is the value zzi had BEFORE the block's final token was decoded.
+    Usually that token is an EOB (continuing an EOB run from an earlier block
+     counts too), and then _last_zzi equals the coefficient count.
+    The two differ when the final token is NOT an EOB, but instead fills the
+     block up to exactly 64 coefficients: _last_zzi is then smaller than 64.
+    If that token was not a pure zero run, _last_zzi is at least 46, which
+     selects the same transform below as 64 would, so nothing changes.
+    If it WAS a pure zero run of length 63, however, _last_zzi is 1 although
+     64 coefficients were decoded.
+    The smallest transform is then chosen, where the true coefficient count
+     would have chosen the full one.
+    Likewise, a zero run of length 64 leaves _last_zzi at 0.
+    The DC coefficient is still transformed in that case, since DC prediction
+     may have given it a non-zero value.
+    Convoluted as it is, this is arguably the right behavior: a block that
+     ends in a long zero run gets the same cheap transform as one that ends in
+     an ordinary EOB token.
+    It could be smarter... several separate zero runs at the end of a block
+     will defeat it, but an encoder that emits those deserves what it gets.
+    Needless to say, this approach was inherited from VP3.*/
+  /*Choose the cheapest routine that still covers every coefficient which may
+     be non-zero: below zig-zag index 3, below index 10, or the whole block.*/
+  if(_last_zzi>=10)oc_idct8x8_slow(_y,_y);
+  else if(_last_zzi>=3)oc_idct8x8_10(_y,_y);
+  else oc_idct8x8_3(_y,_y);
+}
diff --git a/lib/info.c b/lib/info.c
new file mode 100644
index 0000000..6b97629
--- /dev/null
+++ b/lib/info.c
@@ -0,0 +1,131 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: info.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include "internal.h"
+
+
+
+/*This is more or less strncasecmp (which doesn't exist everywhere), plus a
+   check that _s1 continues with '=' after the _n compared characters.
+  Return: 0 if both hold, 1 otherwise.
+  Note: _n must not exceed the length of at least one of the strings.*/
+static int oc_tagcompare(const char *_s1,const char *_s2,int _n){
+  int c;
+  /*toupper() is only defined for unsigned char values (and EOF).*/
+  for(c=0;c<_n;c++)if(toupper((unsigned char)_s1[c])!=
+   toupper((unsigned char)_s2[c]))return !0;
+  return _s1[c]!='=';
+}
+
+
+/*Zeroes _info, then sets this library's version and granule shift defaults.*/
+void th_info_init(th_info *_info){
+  memset(_info,0,sizeof(*_info));
+  _info->version_major=TH_VERSION_MAJOR;
+  _info->version_minor=TH_VERSION_MINOR;
+  _info->version_subminor=TH_VERSION_SUB;
+  _info->keyframe_granule_shift=6;
+}
+/*Resets every byte of _info to zero; nothing is freed here.*/
+void th_info_clear(th_info *_info){
+  memset(_info,0,sizeof(*_info));
+}
+
+
+/*Resets every byte of _tc to zero, without freeing anything it pointed to.*/
+void th_comment_init(th_comment *_tc){
+  memset(_tc,0,sizeof(*_tc));
+}
+/*Appends a copy of _comment to _tc; the count is unchanged if malloc fails.*/
+void th_comment_add(th_comment *_tc,char *_comment){
+  char **list;
+  int   *lens;
+  int    len;
+  int    idx;
+  idx=_tc->comments;
+  /*Size both arrays for comments+2 entries; the spare is for the NULL below.*/
+  list=_ogg_realloc(_tc->user_comments,(idx+2)*sizeof(*_tc->user_comments));
+  if(list==NULL)return;
+  _tc->user_comments=list;
+  lens=_ogg_realloc(_tc->comment_lengths,
+   (idx+2)*sizeof(*_tc->comment_lengths));
+  if(lens==NULL)return;
+  _tc->comment_lengths=lens;
+  lens[idx]=len=strlen(_comment);
+  list[idx]=_ogg_malloc(len+1);
+  if(list[idx]==NULL)return;
+  memcpy(list[idx],_comment,len+1);
+  list[++_tc->comments]=NULL;
+}
+/*Joins _tag and _val into "_tag=_val" and adds that comment to _tc.*/
+void th_comment_add_tag(th_comment *_tc,char *_tag,char *_val){
+  char *buf;
+  int   ntag;
+  int   nval;
+  ntag=strlen(_tag);
+  nval=strlen(_val);
+  /*The two extra bytes hold the '=' separator and the terminating '\0'.*/
+  buf=_ogg_malloc(ntag+nval+2);
+  if(buf==NULL)return;
+  memcpy(buf+ntag+1,_val,nval+1);
+  buf[ntag]='=';
+  memcpy(buf,_tag,ntag);
+  th_comment_add(_tc,buf);
+  _ogg_free(buf);  /*th_comment_add() copies the string, so release ours.*/
+}
+/*Returns the value of the _count'th (from 0) comment tagged _tag, else NULL.*/
+char *th_comment_query(th_comment *_tc,char *_tag,int _count){
+  long ci;
+  int  ntag;
+  int  nseen;
+  ntag=strlen(_tag);
+  nseen=0;
+  for(ci=0;ci<_tc->comments;ci++){
+    /*Skip comments for other tags, and matches before the requested one.*/
+    if(oc_tagcompare(_tc->user_comments[ci],_tag,ntag))continue;
+    if(nseen++!=_count)continue;
+    /*The result points into _tc's own storage, just past the '='.*/
+    return _tc->user_comments[ci]+ntag+1;
+  }
+  return NULL;
+}
+/*Counts the comments in _tc whose tag matches _tag (ignoring case).*/
+int th_comment_query_count(th_comment *_tc,char *_tag){
+  long ci;
+  int  ntag;
+  int  nmatches;
+  ntag=strlen(_tag);
+  nmatches=0;
+  /*oc_tagcompare() returns 0 on a match, so add its logical negation.*/
+  for(ci=0;ci<_tc->comments;ci++)
+   nmatches+=!oc_tagcompare(_tc->user_comments[ci],_tag,ntag);
+  return nmatches;
+}
+/*Frees everything _tc owns and zeroes it; a NULL _tc is ignored.*/
+void th_comment_clear(th_comment *_tc){
+  long ci;
+  if(_tc==NULL)return;
+  for(ci=0;ci<_tc->comments;ci++)_ogg_free(_tc->user_comments[ci]);
+  _ogg_free(_tc->user_comments);
+  _ogg_free(_tc->comment_lengths);
+  _ogg_free(_tc->vendor);
+  /*Leave the struct in the same all-zero state th_comment_init() produces.*/
+  memset(_tc,0,sizeof(*_tc));
+}
diff --git a/lib/internal.c b/lib/internal.c
new file mode 100644
index 0000000..0fe4f63
--- /dev/null
+++ b/lib/internal.c
@@ -0,0 +1,262 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: internal.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <limits.h>
+#include <string.h>
+#include "internal.h"
+
+
+
+/*A map from the index in the zig zag scan to the coefficient number in a
+   block.
+  All zig zag indices beyond 63 are sent to coefficient 64, so that zero runs
+   past the end of a block in bogus streams get mapped to a known location.*/
+const unsigned char OC_FZIG_ZAG[128]={
+   0, 1, 8,16, 9, 2, 3,10,
+  17,24,32,25,18,11, 4, 5,
+  12,19,26,33,40,48,41,34,
+  27,20,13, 6, 7,14,21,28,
+  35,42,49,56,57,50,43,36,
+  29,22,15,23,30,37,44,51,
+  58,59,52,45,38,31,39,46,
+  53,60,61,54,47,55,62,63,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64
+};
+
+/*A map from the coefficient number in a block to its index in the zig zag
+   scan.*/
+const unsigned char OC_IZIG_ZAG[64]={
+   0, 1, 5, 6,14,15,27,28,
+   2, 4, 7,13,16,26,29,42,
+   3, 8,12,17,25,30,41,43,
+   9,11,18,24,31,40,44,53,
+  10,19,23,32,39,45,52,54,
+  20,22,33,38,46,51,55,60,
+  21,34,37,47,50,56,59,61,
+  35,36,48,49,57,58,62,63
+};
+
+/*A map from physical macro block ordering to bitstream macro block
+   ordering within a super block.*/
+const unsigned char OC_MB_MAP[2][2]={{0,3},{1,2}};
+
+/*A list of the indices in the oc_mb_map array that can be valid for each of
+   the various chroma decimation types (unused trailing entries are zero).*/
+const unsigned char OC_MB_MAP_IDXS[TH_PF_NFORMATS][12]={
+  {0,1,2,3,4,8},
+  {0,1,2,3,4,5,8,9},
+  {0,1,2,3,4,6,8,10},
+  {0,1,2,3,4,5,6,7,8,9,10,11}
+};
+
+/*The number of indices in the oc_mb_map array that can be valid for each of
+   the various chroma decimation types.*/
+const unsigned char OC_MB_MAP_NIDXS[TH_PF_NFORMATS]={6,8,8,12};
+
+/*The number of extra bits that are coded with each of the DCT tokens.
+  Each DCT token has some fixed number of additional bits (possibly 0) stored
+   after the token itself, containing, for example, coefficient magnitude,
+   sign bits, etc.*/
+const unsigned char OC_DCT_TOKEN_EXTRA_BITS[TH_NDCT_TOKENS]={
+  0,0,0,2,3,4,12,3,6,
+  0,0,0,0,
+  1,1,1,1,2,3,4,5,6,10,
+  1,1,1,1,1,3,4,
+  2,3
+};
+
+
+/*Returns the number of significant bits in _v, which is 0 when _v is 0.*/
+int oc_ilog(unsigned _v){
+  int nbits;
+  for(nbits=0;_v!=0;_v>>=1)nbits++;
+  return nbits;
+}
+
+
+
+/*The function used to fill in the chroma plane motion vectors for a macro
+   block when 4 different motion vectors are specified in the luma plane.
+  This version is for use with chroma decimated in the X and Y directions
+   (4:2:0): only _cbmvs[0] is set, derived from the sum of all 4 luma vectors.
+  _cbmvs: The chroma block-level motion vectors to fill in.
+  _lbmvs: The luma block-level motion vectors.*/
+static void oc_set_chroma_mvs00(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
+  int ci;
+  for(ci=0;ci<2;ci++){
+    int d;
+    d=_lbmvs[0][ci]+_lbmvs[1][ci]+_lbmvs[2][ci]+_lbmvs[3][ci];
+    _cbmvs[0][ci]=(signed char)OC_DIV_ROUND_POW2(d,2,2);
+  }
+}
+
+/*The function used to fill in the chroma plane motion vectors for a macro
+   block when 4 different motion vectors are specified in the luma plane.
+  This version is for use with chroma decimated in the Y direction.
+  _cbmvs: The chroma block-level motion vectors to fill in.
+  _lbmvs: The luma block-level motion vectors.*/
+static void oc_set_chroma_mvs01(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
+  int bi;
+  /*Chroma block bi is derived from luma blocks bi and bi+2.*/
+  for(bi=0;bi<2;bi++){
+    int dx;
+    int dy;
+    dx=_lbmvs[bi][0]+_lbmvs[bi+2][0];
+    dy=_lbmvs[bi][1]+_lbmvs[bi+2][1];
+    _cbmvs[bi][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
+    _cbmvs[bi][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
+  }
+}
+
+/*The function used to fill in the chroma plane motion vectors for a macro
+   block when 4 different motion vectors are specified in the luma plane.
+  This version is for use with chroma decimated in the X direction (4:2:2).
+  _cbmvs: The chroma block-level motion vectors to fill in.
+  _lbmvs: The luma block-level motion vectors.*/
+static void oc_set_chroma_mvs10(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
+  int bi;
+  /*Chroma blocks 0 and 2 are derived from luma blocks bi and bi+1.*/
+  for(bi=0;bi<4;bi+=2){
+    int dx;
+    int dy;
+    dx=_lbmvs[bi][0]+_lbmvs[bi+1][0];
+    dy=_lbmvs[bi][1]+_lbmvs[bi+1][1];
+    _cbmvs[bi][0]=(signed char)OC_DIV_ROUND_POW2(dx,1,1);
+    _cbmvs[bi][1]=(signed char)OC_DIV_ROUND_POW2(dy,1,1);
+  }
+}
+
+/*The function used to fill in the chroma plane motion vectors for a macro
+   block when 4 different motion vectors are specified in the luma plane.
+  This version is for use with no chroma decimation (4:4:4).
+  All four luma vectors are copied to the chroma vectors unchanged; there is
+   no macro-block level vector argument.
+  _cbmvs: The chroma block-level motion vectors to fill in.
+  _lbmvs: The luma block-level motion vectors.*/
+static void oc_set_chroma_mvs11(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]){
+  memcpy(_cbmvs,_lbmvs,4*sizeof(_lbmvs[0]));
+}
+
+/*A table of functions used to fill in the chroma plane motion vectors for a
+   macro block when 4 different motion vectors are specified in the luma
+   plane; it has TH_PF_NFORMATS entries, presumably one per pixel format.*/
+const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS]={
+  (oc_set_chroma_mvs_func)oc_set_chroma_mvs00,
+  (oc_set_chroma_mvs_func)oc_set_chroma_mvs01,
+  (oc_set_chroma_mvs_func)oc_set_chroma_mvs10,
+  (oc_set_chroma_mvs_func)oc_set_chroma_mvs11
+};
+
+
+/*Allocates a _height x _width array of _sz-byte elements in one block: the
+   row pointers come first, followed by the rows, so oc_free_2d() frees both.
+  Return: The row pointer array, or NULL on failure or size_t overflow.*/
+void **oc_malloc_2d(size_t _height,size_t _width,size_t _sz){
+  size_t   rowsz;
+  size_t   colsz;
+  void   **p;
+  char    *datptr;
+  char    *ret;
+  /*Reject dimensions whose total size cannot be represented in a size_t.*/
+  if(_width>0&&_sz>((size_t)-1)/_width)return NULL;
+  rowsz=_sz*_width;
+  if(rowsz>((size_t)-1)-sizeof(void *))return NULL;
+  if(_height>((size_t)-1)/(rowsz+sizeof(void *)))return NULL;
+  colsz=_height*sizeof(void *);
+  /*Alloc array and row pointers.*/
+  ret=(char *)_ogg_malloc(rowsz*_height+colsz);
+  if(ret==NULL)return NULL;
+  /*Initialize the array.*/
+  p=(void **)ret;
+  for(datptr=ret+colsz;_height-->0;p++,datptr+=rowsz)*p=(void *)datptr;
+  return (void **)ret;
+}
+/*Like oc_malloc_2d(), but the element storage is zero-initialized: one block
+   holds the row pointers followed by the rows, so oc_free_2d() frees both.
+  Return: The row pointer array, or NULL on failure or size_t overflow.*/
+void **oc_calloc_2d(size_t _height,size_t _width,size_t _sz){
+  size_t   colsz;
+  size_t   rowsz;
+  void   **p;
+  char    *datptr;
+  char    *ret;
+  /*Reject dimensions whose total size cannot be represented in a size_t.*/
+  if(_width>0&&_sz>((size_t)-1)/_width)return NULL;
+  rowsz=_sz*_width;
+  if(rowsz>((size_t)-1)-sizeof(void *))return NULL;
+  if(_height>((size_t)-1)/(rowsz+sizeof(void *)))return NULL;
+  colsz=_height*sizeof(void *);
+  /*Alloc array and row pointers.*/
+  ret=(char *)_ogg_calloc(rowsz*_height+colsz,1);
+  if(ret==NULL)return NULL;
+  /*Initialize the array.*/
+  p=(void **)ret;
+  for(datptr=ret+colsz;_height-->0;p++,datptr+=rowsz)*p=(void *)datptr;
+  return (void **)ret;
+}
+/*Frees an array from oc_malloc_2d() or oc_calloc_2d() (one block, one free).*/
+void oc_free_2d(void *_ptr){
+  _ogg_free(_ptr);
+}
+/*Fills in a Y'CbCr buffer with a pointer to the image data in the first
+   buffer, but with the opposite vertical orientation: each plane's stride is
+   negated and its data pointer is moved to what was the last row.
+  _dst: The destination buffer.  This can be the same as _src, since the new
+         data pointer is computed from the values already stored in _dst.
+  _src: The source buffer.*/
+void oc_ycbcr_buffer_flip(th_ycbcr_buffer _dst,
+ const th_ycbcr_buffer _src){
+  int pli;
+  for(pli=0;pli<3;pli++){
+    _dst[pli].width=_src[pli].width;
+    _dst[pli].height=_src[pli].height;
+    _dst[pli].stride=-_src[pli].stride;
+    _dst[pli].data=_src[pli].data
+     +(1-_dst[pli].height)*(ptrdiff_t)_dst[pli].stride;
+  }
+}
+/*Returns the library's vendor string, OC_VENDOR_STRING.*/
+const char *th_version_string(void){
+  return OC_VENDOR_STRING;
+}
+/*Returns the bitstream version packed as (major<<16)+(minor<<8)+subminor.*/
+ogg_uint32_t th_version_number(void){
+  return (TH_VERSION_MAJOR<<16)+(TH_VERSION_MINOR<<8)+TH_VERSION_SUB;
+}
+/*Determines the packet type.
+  Note that this correctly interprets a 0-byte packet as a video data packet.
+  Return: 1 for a header packet, 0 for a data packet.*/
+int th_packet_isheader(ogg_packet *_op){
+  if(_op->bytes<=0)return 0;
+  return _op->packet[0]>>7;
+}
+/*Determines the frame type of a video data packet; a 0-byte packet counts as
+   a delta frame.
+  Return: 1 for a key frame, 0 for a delta frame, and -1 for a header packet.*/
+int th_packet_iskeyframe(ogg_packet *_op){
+  if(_op->bytes<=0)return 0;
+  if(_op->packet[0]&0x80)return -1;
+  return !(_op->packet[0]&0x40);
+}
diff --git a/lib/internal.h b/lib/internal.h
new file mode 100644
index 0000000..d81263e
--- /dev/null
+++ b/lib/internal.h
@@ -0,0 +1,509 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: internal.h 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+#if !defined(_internal_H)
+# define _internal_H (1)
+# include <stdlib.h>
+# include <limits.h>
+# if defined(HAVE_CONFIG_H)
+#  include <config.h>
+# endif
+# include "theora/codec.h"
+# include "theora/theora.h"
+
+# if defined(_MSC_VER)
+/*Disable missing EMMS warnings.*/
+#  pragma warning(disable:4799)
+/*Thank you Microsoft, I know the order of operations.*/
+#  pragma warning(disable:4554)
+# endif
+/*You, too, gcc.*/
+# if defined(__GNUC_PREREQ)
+#  if __GNUC_PREREQ(4,2)
+#   pragma GCC diagnostic ignored "-Wparentheses"
+#  endif
+# endif
+
+# include "ocintrin.h"
+# include "huffman.h"
+# include "quant.h"
+
+/*Some assembly constructs require aligned operands.*/
+# if defined(OC_X86_ASM)
+#  if defined(__GNUC__)
+#   define OC_ALIGN8(expr) expr __attribute__((aligned(8)))
+#   define OC_ALIGN16(expr) expr __attribute__((aligned(16)))
+#  elif defined(_MSC_VER)
+#   define OC_ALIGN8(expr) __declspec (align(8)) expr
+#   define OC_ALIGN16(expr) __declspec (align(16)) expr
+#  endif
+# endif
+# if !defined(OC_ALIGN8)
+#  define OC_ALIGN8(expr) expr
+# endif
+# if !defined(OC_ALIGN16)
+#  define OC_ALIGN16(expr) expr
+# endif
+
+
+
+typedef struct oc_sb_flags              oc_sb_flags;
+typedef struct oc_border_info           oc_border_info;
+typedef struct oc_fragment              oc_fragment;
+typedef struct oc_fragment_plane        oc_fragment_plane;
+typedef struct oc_base_opt_vtable       oc_base_opt_vtable;
+typedef struct oc_base_opt_data         oc_base_opt_data;
+typedef struct oc_state_dispatch_vtable oc_state_dispatch_vtable;
+typedef struct oc_theora_state          oc_theora_state;
+
+
+
+/*This library's version.*/
+# define OC_VENDOR_STRING "Xiph.Org libtheora 1.1 20090822 (Thusnelda)"
+
+/*Theora bitstream version.*/
+# define TH_VERSION_MAJOR (3)
+# define TH_VERSION_MINOR (2)
+# define TH_VERSION_SUB   (1)
+# define TH_VERSION_CHECK(_info,_maj,_min,_sub) \
+ ((_info)->version_major>(_maj)||(_info)->version_major==(_maj)&& \
+ ((_info)->version_minor>(_min)||(_info)->version_minor==(_min)&& \
+ (_info)->version_subminor>=(_sub)))
+
+/*A keyframe.*/
+#define OC_INTRA_FRAME (0)
+/*A predicted frame.*/
+#define OC_INTER_FRAME (1)
+/*A frame of unknown type (frame type decision has not yet been made).*/
+#define OC_UNKWN_FRAME (-1)
+
+/*The amount of padding to add to the reconstructed frame buffers on all
+   sides.
+  This is used to allow unrestricted motion vectors without special casing.
+  This must be a multiple of 2.*/
+#define OC_UMV_PADDING (16)
+
+/*Frame classification indices.*/
+/*The previous golden frame.*/
+#define OC_FRAME_GOLD (0)
+/*The previous frame.*/
+#define OC_FRAME_PREV (1)
+/*The current frame.*/
+#define OC_FRAME_SELF (2)
+
+/*The input or output buffer.*/
+#define OC_FRAME_IO   (3)
+
+/*Macroblock modes.*/
+/*Macro block is invalid: It is never coded.*/
+#define OC_MODE_INVALID        (-1)
+/*Encoded difference from the same macro block in the previous frame.*/
+#define OC_MODE_INTER_NOMV     (0)
+/*Encoded with no motion compensated prediction.*/
+#define OC_MODE_INTRA          (1)
+/*Encoded difference from the previous frame offset by the given motion 
+  vector.*/
+#define OC_MODE_INTER_MV       (2)
+/*Encoded difference from the previous frame offset by the last coded motion 
+  vector.*/
+#define OC_MODE_INTER_MV_LAST  (3)
+/*Encoded difference from the previous frame offset by the second to last 
+  coded motion vector.*/
+#define OC_MODE_INTER_MV_LAST2 (4)
+/*Encoded difference from the same macro block in the previous golden 
+  frame.*/
+#define OC_MODE_GOLDEN_NOMV    (5)
+/*Encoded difference from the previous golden frame offset by the given motion 
+  vector.*/
+#define OC_MODE_GOLDEN_MV      (6)
+/*Encoded difference from the previous frame offset by the individual motion 
+  vectors given for each block.*/
+#define OC_MODE_INTER_MV_FOUR  (7)
+/*The number of (coded) modes.*/
+#define OC_NMODES              (8)
+
+/*Determines the reference frame used for a given MB mode.*/
+#define OC_FRAME_FOR_MODE(_x) \
+ OC_UNIBBLE_TABLE32(OC_FRAME_PREV,OC_FRAME_SELF,OC_FRAME_PREV,OC_FRAME_PREV, \
+  OC_FRAME_PREV,OC_FRAME_GOLD,OC_FRAME_GOLD,OC_FRAME_PREV,(_x))
+
+/*Constants for the packet state machine common between encoder and decoder.*/
+
+/*Next packet to emit/read: Codec info header.*/
+#define OC_PACKET_INFO_HDR    (-3)
+/*Next packet to emit/read: Comment header.*/
+#define OC_PACKET_COMMENT_HDR (-2)
+/*Next packet to emit/read: Codec setup header.*/
+#define OC_PACKET_SETUP_HDR   (-1)
+/*No more packets to emit/read.*/
+#define OC_PACKET_DONE        (INT_MAX)
+
+
+
+/*Super blocks are 32x32 segments of pixels in a single color plane indexed
+   in image order.
+  Internally, super blocks are broken up into four quadrants, each of which
+   contains a 2x2 pattern of blocks, each of which is an 8x8 block of pixels.
+  Quadrants, and the blocks within them, are indexed in a special order called
+   a "Hilbert curve" within the super block.
+
+  In order to differentiate between the Hilbert-curve indexing strategy and
+   the regular image order indexing strategy, blocks indexed in image order
+   are called "fragments".
+  Fragments are indexed in image order, left to right, then bottom to top,
+   from Y' plane to Cb plane to Cr plane.
+
+  The co-located fragments in all image planes corresponding to the location
+   of a single quadrant of a luma plane super block form a macro block.
+  Thus there is only a single set of macro blocks for all planes, each of which
+   contains between 6 and 12 fragments, depending on the pixel format.
+  Therefore macro block information is kept in a separate set of arrays from
+   super blocks to avoid unused space in the other planes.
+  The lists are indexed in super block order.
+  That is, the macro block corresponding to the macro block mbi in (luma plane)
+   super block sbi is at index (sbi<<2|mbi).
+  Thus the number of macro blocks in each dimension is always twice the number
+   of super blocks, even when only an odd number fall inside the coded frame.
+  These "extra" macro blocks are just an artifact of our internal data layout,
+   and not part of the coded stream; they are flagged with a negative MB mode.*/
+
+
+
+/*A single quadrant of the map from a super block to fragment numbers.*/
+typedef ptrdiff_t       oc_sb_map_quad[4];
+/*A map from a super block to fragment numbers.*/
+typedef oc_sb_map_quad  oc_sb_map[4];
+/*A single plane of the map from a macro block to fragment numbers.*/
+typedef ptrdiff_t       oc_mb_map_plane[4];
+/*A map from a macro block to fragment numbers.*/
+typedef oc_mb_map_plane oc_mb_map[3];
+/*A motion vector.*/
+typedef signed char     oc_mv[2];
+
+
+
+/*Super block information.*/
+struct oc_sb_flags{
+  unsigned char coded_fully:1;
+  unsigned char coded_partially:1;
+  unsigned char quad_valid:4;
+};
+
+
+
+/*Information about a fragment which intersects the border of the displayable
+   region.
+  This marks which pixels belong to the displayable region.*/
+struct oc_border_info{
+  /*A bit mask marking which pixels are in the displayable region.
+    Pixel (x,y) corresponds to bit (y<<3|x).*/
+  ogg_int64_t mask;
+  /*The number of pixels in the displayable region.
+    This is always positive, and always less than 64.*/
+  int         npixels;
+};
+
+
+
+/*Fragment information.*/
+struct oc_fragment{
+  /*A flag indicating whether or not this fragment is coded.*/
+  unsigned   coded:1;
+  /*A flag indicating that this entire fragment lies outside the displayable
+     region of the frame.
+    Note the contrast with an invalid macro block, which is outside the coded
+     frame, not just the displayable one.
+    There are no fragments outside the coded frame by construction.*/
+  unsigned   invalid:1;
+  /*The index of the quality index used for this fragment's AC coefficients.*/
+  unsigned   qii:6;
+  /*The mode of the macroblock this fragment belongs to.*/
+  unsigned   mb_mode:3;
+  /*The index of the associated border information for fragments which lie
+     partially outside the displayable region.
+    For fragments completely inside or outside this region, this is -1.
+    Note that the C standard requires an explicit signed keyword for bitfield
+     types, since some compilers may treat them as unsigned without it.*/
+  signed int borderi:5;
+  /*The prediction-corrected DC component.
+    Note that the C standard requires an explicit signed keyword for bitfield
+     types, since some compilers may treat them as unsigned without it.*/
+  signed int dc:16;
+};
+
+
+
+/*A description of each fragment plane.*/
+struct oc_fragment_plane{
+  /*The number of fragments in the horizontal direction.*/
+  int       nhfrags;
+  /*The number of fragments in the vertical direction.*/
+  int       nvfrags;
+  /*The offset of the first fragment in the plane.*/
+  ptrdiff_t froffset;
+  /*The total number of fragments in the plane.*/
+  ptrdiff_t nfrags;
+  /*The number of super blocks in the horizontal direction.*/
+  unsigned  nhsbs;
+  /*The number of super blocks in the vertical direction.*/
+  unsigned  nvsbs;
+  /*The offset of the first super block in the plane.*/
+  unsigned  sboffset;
+  /*The total number of super blocks in the plane.*/
+  unsigned  nsbs;
+};
+
+
+
+/*The shared (encoder and decoder) functions that have accelerated variants.*/
+struct oc_base_opt_vtable{
+  void (*frag_copy)(unsigned char *_dst,
+   const unsigned char *_src,int _ystride);
+  void (*frag_recon_intra)(unsigned char *_dst,int _ystride,
+   const ogg_int16_t _residue[64]);
+  void (*frag_recon_inter)(unsigned char *_dst,
+   const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
+  void (*frag_recon_inter2)(unsigned char *_dst,const unsigned char *_src1,
+   const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
+  void (*idct8x8)(ogg_int16_t _y[64],int _last_zzi);
+  void (*state_frag_recon)(const oc_theora_state *_state,ptrdiff_t _fragi,
+   int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
+  void (*state_frag_copy_list)(const oc_theora_state *_state,
+   const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
+   int _dst_frame,int _src_frame,int _pli);
+  void (*state_loop_filter_frag_rows)(const oc_theora_state *_state,
+   int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);  
+  void (*restore_fpu)(void);
+};
+
+/*The shared (encoder and decoder) tables that vary according to which variants
+   of the above functions are used.*/
+struct oc_base_opt_data{
+  const unsigned char *dct_fzig_zag;
+};
+
+
+/*State information common to both the encoder and decoder.*/
+struct oc_theora_state{
+  /*The stream information.*/
+  th_info             info;
+  /*Table for shared accelerated functions.*/
+  oc_base_opt_vtable  opt_vtable;
+  /*Table for shared data used by accelerated functions.*/
+  oc_base_opt_data    opt_data;
+  /*CPU flags to detect the presence of extended instruction sets.*/
+  ogg_uint32_t        cpu_flags;
+  /*The fragment plane descriptions.*/
+  oc_fragment_plane   fplanes[3];
+  /*The list of fragments, indexed in image order.*/
+  oc_fragment        *frags;
+  /*The offset into the reference frame buffer to the upper-left pixel of
+     each fragment.*/
+  ptrdiff_t          *frag_buf_offs;
+  /*The motion vector for each fragment.*/
+  oc_mv              *frag_mvs;
+  /*The total number of fragments in a single frame.*/
+  ptrdiff_t           nfrags;
+  /*The list of super block maps, indexed in image order.*/
+  oc_sb_map          *sb_maps;
+  /*The list of super block flags, indexed in image order.*/
+  oc_sb_flags        *sb_flags;
+  /*The total number of super blocks in a single frame.*/
+  unsigned            nsbs;
+  /*The fragments from each color plane that belong to each macro block.
+    Fragments are stored in image order (left to right then top to bottom).
+    When chroma components are decimated, the extra fragments have an index of
+     -1.*/
+  oc_mb_map          *mb_maps;
+  /*The list of macro block modes.
+    A negative number indicates the macro block lies entirely outside the
+     coded frame.*/
+  signed char        *mb_modes;
+  /*The number of macro blocks in the X direction.*/
+  unsigned            nhmbs;
+  /*The number of macro blocks in the Y direction.*/
+  unsigned            nvmbs;
+  /*The total number of macro blocks.*/
+  size_t              nmbs;
+  /*The list of coded fragments, in coded order.
+    Uncoded fragments are stored in reverse order from the end of the list.*/
+  ptrdiff_t          *coded_fragis;
+  /*The number of coded fragments in each plane.*/
+  ptrdiff_t           ncoded_fragis[3];
+  /*The total number of coded fragments.*/
+  ptrdiff_t           ntotal_coded_fragis;
+  /*The index of the buffers being used for each OC_FRAME_* reference frame.*/
+  int                 ref_frame_idx[4];
+  /*The actual buffers used for the previously decoded frames.*/
+  th_ycbcr_buffer     ref_frame_bufs[4];
+  /*The storage for the reference frame buffers.*/
+  unsigned char      *ref_frame_data[4];
+  /*The strides for each plane in the reference frames.*/
+  int                 ref_ystride[3];
+  /*The number of unique border patterns.*/
+  int                 nborders;
+  /*The unique border patterns for all border fragments.
+    The borderi field of fragments which straddle the border indexes this
+     list.*/
+  oc_border_info      borders[16];
+  /*The frame number of the last keyframe.*/
+  ogg_int64_t         keyframe_num;
+  /*The frame number of the current frame.*/
+  ogg_int64_t         curframe_num;
+  /*The granpos of the current frame.*/
+  ogg_int64_t         granpos;
+  /*The type of the current frame.*/
+  unsigned char       frame_type;
+  /*The bias to add to the frame count when computing granule positions.*/
+  unsigned char       granpos_bias;
+  /*The number of quality indices used in the current frame.*/
+  unsigned char       nqis;
+  /*The quality indices of the current frame.*/
+  unsigned char       qis[3];
+  /*The dequantization tables, stored in zig-zag order, and indexed by
+     qi, pli, qti, and zzi.*/
+  ogg_uint16_t       *dequant_tables[64][3][2];
+  OC_ALIGN16(oc_quant_table      dequant_table_data[64][3][2]);
+  /*Loop filter strength parameters.*/
+  unsigned char       loop_filter_limits[64];
+};
+
+
+
+/*The function type used to fill in the chroma plane motion vectors for a
+   macro block when 4 different motion vectors are specified in the luma
+   plane.
+  Functions of this type are stored in OC_SET_CHROMA_MVS_TABLE, declared
+   below.
+  _cbmvs: The chroma block-level motion vectors to fill in.
+  _lbmvs: The luma block-level motion vectors.*/
+typedef void (*oc_set_chroma_mvs_func)(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]);
+
+
+
+/*A map from the index in the zig zag scan to the coefficient number in a
+   block.*/
+extern const unsigned char OC_FZIG_ZAG[128];
+/*A map from the coefficient number in a block to its index in the zig zag
+   scan.*/
+extern const unsigned char OC_IZIG_ZAG[64];
+/*A map from physical macro block ordering to bitstream macro block
+   ordering within a super block.*/
+extern const unsigned char OC_MB_MAP[2][2];
+/*A list of the indices in the oc_mb_map array that can be valid for each of
+   the various chroma decimation types.*/
+extern const unsigned char OC_MB_MAP_IDXS[TH_PF_NFORMATS][12];
+/*The number of indices in the oc_mb_map array that can be valid for each of
+   the various chroma decimation types.*/
+extern const unsigned char OC_MB_MAP_NIDXS[TH_PF_NFORMATS];
+/*A table of functions used to fill in the Cb,Cr plane motion vectors for a
+   macro block when 4 different motion vectors are specified in the luma
+   plane.*/
+extern const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS];
+
+
+
+int oc_ilog(unsigned _v);
+void **oc_malloc_2d(size_t _height,size_t _width,size_t _sz);
+void **oc_calloc_2d(size_t _height,size_t _width,size_t _sz);
+void oc_free_2d(void *_ptr);
+
+void oc_ycbcr_buffer_flip(th_ycbcr_buffer _dst,
+ const th_ycbcr_buffer _src);
+
+int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs);
+void oc_state_clear(oc_theora_state *_state);
+void oc_state_vtable_init_c(oc_theora_state *_state);
+void oc_state_borders_fill_rows(oc_theora_state *_state,int _refi,int _pli,
+ int _y0,int _yend);
+void oc_state_borders_fill_caps(oc_theora_state *_state,int _refi,int _pli);
+void oc_state_borders_fill(oc_theora_state *_state,int _refi);
+void oc_state_fill_buffer_ptrs(oc_theora_state *_state,int _buf_idx,
+ th_ycbcr_buffer _img);
+int oc_state_mbi_for_pos(oc_theora_state *_state,int _mbx,int _mby);
+int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
+ int _pli,int _dx,int _dy);
+
+int oc_state_loop_filter_init(oc_theora_state *_state,int *_bv);
+void oc_state_loop_filter(oc_theora_state *_state,int _frame);
+#if defined(OC_DUMP_IMAGES)
+int oc_state_dump_frame(const oc_theora_state *_state,int _frame,
+ const char *_suf);
+#endif
+
+/*Shared accelerated functions.*/
+void oc_frag_copy(const oc_theora_state *_state,unsigned char *_dst,
+ const unsigned char *_src,int _ystride);
+void oc_frag_recon_intra(const oc_theora_state *_state,
+ unsigned char *_dst,int _dst_ystride,const ogg_int16_t _residue[64]);
+void oc_frag_recon_inter(const oc_theora_state *_state,unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
+void oc_frag_recon_inter2(const oc_theora_state *_state,
+ unsigned char *_dst,const unsigned char *_src1,const unsigned char *_src2,
+ int _ystride,const ogg_int16_t _residue[64]);
+void oc_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],int _last_zzi);
+void oc_state_frag_recon(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_state_frag_copy_list(const oc_theora_state *_state,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
+ int _dst_frame,int _src_frame,int _pli);
+void oc_state_loop_filter_frag_rows(const oc_theora_state *_state,
+ int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+void oc_restore_fpu(const oc_theora_state *_state);
+
+/*Default pure-C implementations.*/
+void oc_frag_copy_c(unsigned char *_dst,
+ const unsigned char *_src,int _src_ystride);
+void oc_frag_recon_intra_c(unsigned char *_dst,int _dst_ystride,
+ const ogg_int16_t _residue[64]);
+void oc_frag_recon_inter_c(unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
+void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
+void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi);
+void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_state_frag_copy_list_c(const oc_theora_state *_state,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
+ int _dst_frame,int _src_frame,int _pli);
+void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,
+ int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+void oc_restore_fpu_c(void);
+
+/*We need a way to call a few encoder functions without introducing a link-time
+   dependency into the decoder, while still allowing the old alpha API which
+   does not distinguish between encoder and decoder objects to be used.
+  We do this by placing a function table at the start of the encoder object
+   which can dispatch into the encoder library.
+  We do a similar thing for the decoder in case we ever decide to split off a
+   common base library.*/
+typedef void (*oc_state_clear_func)(theora_state *_th);
+typedef int (*oc_state_control_func)(theora_state *th,int _req,
+ void *_buf,size_t _buf_sz);
+typedef ogg_int64_t (*oc_state_granule_frame_func)(theora_state *_th,
+ ogg_int64_t _granulepos);
+typedef double (*oc_state_granule_time_func)(theora_state *_th,
+ ogg_int64_t _granulepos);
+
+
+struct oc_state_dispatch_vtable{
+  oc_state_clear_func         clear;
+  oc_state_control_func       control;
+  oc_state_granule_frame_func granule_frame;
+  oc_state_granule_time_func  granule_time;
+};
+
+#endif
diff --git a/lib/mathops.c b/lib/mathops.c
new file mode 100644
index 0000000..d3fb909
--- /dev/null
+++ b/lib/mathops.c
@@ -0,0 +1,296 @@
+#include "mathops.h"
+#include <limits.h>
+
+/*The fastest fallback strategy for platforms with fast multiplication appears
+   to be based on de Bruijn sequences~\cite{LP98}.
+  Tests confirmed this to be true even on an ARM11, where it is actually faster
+   than using the native clz instruction.
+  Define OC_ILOG_NODEBRUIJN to use a simpler fallback on platforms where
+   multiplication or table lookups are too expensive.
+
+  @UNPUBLISHED{LP98,
+    author="Charles E. Leiserson and Harald Prokop",
+    title="Using de {Bruijn} Sequences to Index a 1 in a Computer Word",
+    month=Jun,
+    year=1998,
+    note="\url{http://supertech.csail.mit.edu/papers/debruijn.pdf}"
+  }*/
+#if !defined(OC_ILOG_NODEBRUIJN)&& \
+ (!defined(OC_CLZ32)||!defined(OC_CLZ64)&&LONG_MAX<9223372036854775807LL)
+static const unsigned char OC_DEBRUIJN_IDX32[32]={
+   0, 1,28, 2,29,14,24, 3,30,22,20,15,25,17, 4, 8,
+  31,27,13,23,21,19,16, 7,26,12,18, 6,11, 5,10, 9
+};
+#endif /*Only the de Bruijn fallbacks of oc_ilog32()/oc_ilog64() use it.*/
+
+int oc_ilog32(ogg_uint32_t _v){
+#if defined(OC_CLZ32)
+  return (OC_CLZ32_OFFS-OC_CLZ32(_v))&-(_v!=0);
+#else
+/*No count-leading-zeros primitive is available, so use a portable fallback.
+  On a Pentium M, this branchless binary search tested as the fastest of the
+   multiplication-free versions, ahead of branches and a 256-entry LUT.*/
+# if defined(OC_ILOG_NODEBRUIJN)
+  int nbits;
+  int step;
+  nbits=_v>0;
+  step=(_v>0xFFFFU)<<4;
+  _v>>=step;
+  nbits|=step;
+  step=(_v>0xFFU)<<3;
+  _v>>=step;
+  nbits|=step;
+  step=(_v>0xFU)<<2;
+  _v>>=step;
+  nbits|=step;
+  step=(_v>3)<<1;
+  _v>>=step;
+  nbits|=step;
+  nbits+=_v>1;
+  return nbits;
+/*With a fast multiplier, this de Bruijn sequence lookup is quicker.*/
+# else
+  int nbits;
+  nbits=_v>0;
+  _v|=_v>>1;
+  _v|=_v>>2;
+  _v|=_v>>4;
+  _v|=_v>>8;
+  _v|=_v>>16;
+  _v=(_v>>1)+1;
+  nbits+=OC_DEBRUIJN_IDX32[(_v*0x77CB531U>>27)&0x1F];
+  return nbits;
+# endif
+#endif
+}
+
+int oc_ilog64(ogg_int64_t _v){
+#if defined(OC_CLZ64)
+  return (OC_CLZ64_OFFS-OC_CLZ64(_v))&-!!_v;
+#else
+# if defined(OC_ILOG_NODEBRUIJN)
+  ogg_uint32_t v;
+  int          ret;
+  int          m;
+  ret=_v>0;
+  m=(_v>0xFFFFFFFFU)<<5; /*32 if _v needs more than 32 bits, else 0.*/
+  v=(ogg_uint32_t)(_v>>m);
+  ret|=m;
+  m=(v>0xFFFFU)<<4;
+  v>>=m;
+  ret|=m;
+  m=(v>0xFFU)<<3;
+  v>>=m;
+  ret|=m;
+  m=(v>0xFU)<<2;
+  v>>=m;
+  ret|=m;
+  m=(v>3)<<1;
+  v>>=m;
+  ret|=m;
+  ret+=v>1;
+  return ret;
+# else
+/*If we don't have a 64-bit word, split it into two 32-bit halves.*/
+#  if LONG_MAX<9223372036854775807LL
+  ogg_uint32_t v;
+  int          ret;
+  int          m;
+  ret=_v>0;
+  m=(_v>0xFFFFFFFFU)<<5; /*32 if _v needs more than 32 bits, else 0.*/
+  v=(ogg_uint32_t)(_v>>m);
+  ret|=m;
+  v|=v>>1;
+  v|=v>>2;
+  v|=v>>4;
+  v|=v>>8;
+  v|=v>>16;
+  v=(v>>1)+1;
+  ret+=OC_DEBRUIJN_IDX32[v*0x77CB531U>>27&0x1F];
+  return ret;
+/*Otherwise hash in one 64-bit multiply (unsigned, so overflow is defined).*/
+#  else
+  static const unsigned char OC_DEBRUIJN_IDX64[64]={
+     0, 1, 2, 7, 3,13, 8,19, 4,25,14,28, 9,34,20,40,
+     5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57,
+    63, 6,12,18,24,27,33,39,16,37,45,47,30,53,49,56,
+    62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58
+  };
+  int ret;
+  ret=_v>0;
+  _v|=_v>>1;
+  _v|=_v>>2;
+  _v|=_v>>4;
+  _v|=_v>>8;
+  _v|=_v>>16;
+  _v|=_v>>32;
+  _v=(_v>>1)+1;
+  ret+=OC_DEBRUIJN_IDX64[(unsigned long)_v*0x218A392CD3D5DBFUL>>58&0x3F];
+  return ret;
+#  endif
+# endif
+#endif
+}
+
+/*OC_ATANH_LOG2[i]=round(2**(62+i)*atanh(2**(-(i+1)))/log(2)), for i=0...31.*/
+static const ogg_int64_t OC_ATANH_LOG2[32]={
+  0x32B803473F7AD0F4LL,0x2F2A71BD4E25E916LL,0x2E68B244BB93BA06LL,
+  0x2E39FB9198CE62E4LL,0x2E2E683F68565C8FLL,0x2E2B850BE2077FC1LL,
+  0x2E2ACC58FE7B78DBLL,0x2E2A9E2DE52FD5F2LL,0x2E2A92A338D53EECLL,
+  0x2E2A8FC08F5E19B6LL,0x2E2A8F07E51A485ELL,0x2E2A8ED9BA8AF388LL,
+  0x2E2A8ECE2FE7384ALL,0x2E2A8ECB4D3E4B1ALL,0x2E2A8ECA94940FE8LL,
+  0x2E2A8ECA6669811DLL,0x2E2A8ECA5ADEDD6ALL,0x2E2A8ECA57FC347ELL,
+  0x2E2A8ECA57438A43LL,0x2E2A8ECA57155FB4LL,0x2E2A8ECA5709D510LL,
+  0x2E2A8ECA5706F267LL,0x2E2A8ECA570639BDLL,0x2E2A8ECA57060B92LL,
+  0x2E2A8ECA57060008LL,0x2E2A8ECA5705FD25LL,0x2E2A8ECA5705FC6CLL,
+  0x2E2A8ECA5705FC3ELL,0x2E2A8ECA5705FC33LL,0x2E2A8ECA5705FC30LL,
+  0x2E2A8ECA5705FC2FLL,0x2E2A8ECA5705FC2FLL
+};
+
+/*Computes the binary exponential (2**_z) of _z, a log base 2 in Q57 format.*/
+ogg_int64_t oc_bexp64(ogg_int64_t _z){
+  ogg_int64_t w;
+  ogg_int64_t z;
+  int         ipart;
+  ipart=(int)(_z>>57);
+  if(ipart<0)return 0; /*Negative logs are reported as 0.*/
+  if(ipart>=63)return 0x7FFFFFFFFFFFFFFFLL; /*Saturate at the largest value.*/
+  z=_z-OC_Q57(ipart);
+  if(z){
+    ogg_int64_t mask;
+    long        wlo;
+    int         i;
+    /*C doesn't give us 64x64->128 muls, so we use CORDIC.
+      This is not particularly fast, but it's not being used in time-critical
+       code; it is very accurate.*/
+    /*z is the fractional part of the log in Q62 format.
+      We need 1 bit of headroom since the magnitude can get larger than 1
+       during the iteration, and a sign bit.*/
+    z<<=5;
+    /*w is the exponential in Q61 format (since it also needs headroom and can
+       get as large as 2.0); we could get another bit if we dropped the sign,
+       but we'll recover that bit later anyway.
+      Ideally this should start out as
+        \lim_{n->\infty} 2^{61}/\prod_{i=1}^n \sqrt{1-2^{-2i}}
+       but in order to guarantee convergence we have to repeat iterations 4,
+        13 (=3*4+1), and 40 (=3*13+1, etc.), so it winds up somewhat larger.*/
+    w=0x26A3D0E401DD846DLL;
+    for(i=0;;i++){
+      mask=-(z<0); /*All ones if z is negative, otherwise zero.*/
+      w+=(w>>i+1)+mask^mask; /*(x+mask)^mask negates x when mask is -1.*/
+      z-=OC_ATANH_LOG2[i]+mask^mask;
+      /*Repeat iteration 4.*/
+      if(i>=3)break;
+      z<<=1;
+    }
+    for(;;i++){
+      mask=-(z<0);
+      w+=(w>>i+1)+mask^mask;
+      z-=OC_ATANH_LOG2[i]+mask^mask;
+      /*Repeat iteration 13.*/
+      if(i>=12)break;
+      z<<=1;
+    }
+    for(;i<32;i++){
+      mask=-(z<0);
+      w+=(w>>i+1)+mask^mask;
+      z=z-(OC_ATANH_LOG2[i]+mask^mask)<<1;
+    }
+    wlo=0;
+    /*Skip the remaining iterations unless we really require that much
+       precision.
+      We could have bailed out earlier for smaller iparts, but that would
+       require initializing w from a table, as the limit doesn't converge to
+       61-bit precision until n=30.*/
+    if(ipart>30){
+      /*For these iterations, we just update the low bits, as the high bits
+         can't possibly be affected.
+        OC_ATANH_LOG2 has also converged (it actually did so one iteration
+         earlier, but that's no reason for an extra special case).*/
+      for(;;i++){
+        mask=-(z<0);
+        wlo+=(w>>i)+mask^mask;
+        z-=OC_ATANH_LOG2[31]+mask^mask;
+        /*Repeat iteration 40.*/
+        if(i>=39)break;
+        z<<=1;
+      }
+      for(;i<61;i++){
+        mask=-(z<0);
+        wlo+=(w>>i)+mask^mask;
+        z=z-(OC_ATANH_LOG2[31]+mask^mask)<<1;
+      }
+    }
+    w=(w<<1)+wlo;
+  }
+  else w=(ogg_int64_t)1<<62;
+  if(ipart<62)w=(w>>61-ipart)+1>>1; /*Scale by 2**ipart, rounding to nearest.*/
+  return w;
+}
+
+/*Computes the binary logarithm of _w, returned in Q57 format (-1 if _w<=0).*/
+ogg_int64_t oc_blog64(ogg_int64_t _w){
+  ogg_int64_t z;
+  int         ipart;
+  if(_w<=0)return -1;
+  ipart=OC_ILOGNZ_64(_w)-1;
+  /*Normalize _w so that its leading one sits at bit 61.*/
+  if(ipart>61)_w>>=ipart-61;
+  else _w<<=61-ipart;
+  z=0;
+  if(_w&_w-1){ /*Only values with more than one bit set have a fraction.*/
+    ogg_int64_t x;
+    ogg_int64_t y;
+    ogg_int64_t u;
+    ogg_int64_t mask;
+    int         i;
+    /*C doesn't give us 64x64->128 muls, so we use CORDIC.
+      This is not particularly fast, but it's not being used in time-critical
+       code; it is very accurate.*/
+    /*z is the fractional part of the log in Q61 format.*/
+    /*x and y are the cosh() and sinh(), respectively, in Q61 format.
+      We are computing z=2*atanh(y/x)=2*atanh((_w-1)/(_w+1)).*/
+    x=_w+((ogg_int64_t)1<<61);
+    y=_w-((ogg_int64_t)1<<61);
+    for(i=0;i<4;i++){
+      mask=-(y<0); /*All ones if y is negative, otherwise zero.*/
+      z+=(OC_ATANH_LOG2[i]>>i)+mask^mask; /*(v+mask)^mask negates v if y<0.*/
+      u=x>>i+1;
+      x-=(y>>i+1)+mask^mask;
+      y-=u+mask^mask;
+    }
+    /*Repeat iteration 4.*/
+    for(i--;i<13;i++){
+      mask=-(y<0);
+      z+=(OC_ATANH_LOG2[i]>>i)+mask^mask;
+      u=x>>i+1;
+      x-=(y>>i+1)+mask^mask;
+      y-=u+mask^mask;
+    }
+    /*Repeat iteration 13.*/
+    for(i--;i<32;i++){
+      mask=-(y<0);
+      z+=(OC_ATANH_LOG2[i]>>i)+mask^mask;
+      u=x>>i+1;
+      x-=(y>>i+1)+mask^mask;
+      y-=u+mask^mask;
+    }
+    /*OC_ATANH_LOG2 has converged.*/
+    for(;i<40;i++){
+      mask=-(y<0);
+      z+=(OC_ATANH_LOG2[31]>>i)+mask^mask;
+      u=x>>i+1;
+      x-=(y>>i+1)+mask^mask;
+      y-=u+mask^mask;
+    }
+    /*Repeat iteration 40.*/
+    for(i--;i<62;i++){
+      mask=-(y<0);
+      z+=(OC_ATANH_LOG2[31]>>i)+mask^mask;
+      u=x>>i+1;
+      x-=(y>>i+1)+mask^mask;
+      y-=u+mask^mask;
+    }
+    z=z+8>>4; /*Round the Q61 fraction to Q57.*/
+  }
+  return OC_Q57(ipart)+z;
+}
diff --git a/lib/mathops.h b/lib/mathops.h
new file mode 100644
index 0000000..efbc537
--- /dev/null
+++ b/lib/mathops.h
@@ -0,0 +1,141 @@
+#if !defined(_mathops_H)
+# define _mathops_H (1)
+# include <ogg/ogg.h>
+
+# ifdef __GNUC_PREREQ
+#  if __GNUC_PREREQ(3,4)
+#   include <limits.h>
+/*Note the casts to (int) below: this prevents OC_CLZ{32|64}_OFFS from
+   "upgrading" the type of an entire expression to an (unsigned) size_t.*/
+#   if INT_MAX>=2147483647
+#    define OC_CLZ32_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
+#    define OC_CLZ32(_x) (__builtin_clz(_x))
+#   elif LONG_MAX>=2147483647L
+#    define OC_CLZ32_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
+#    define OC_CLZ32(_x) (__builtin_clzl(_x))
+#   endif
+#   if INT_MAX>=9223372036854775807LL
+#    define OC_CLZ64_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
+#    define OC_CLZ64(_x) (__builtin_clz(_x))
+#   elif LONG_MAX>=9223372036854775807LL
+#    define OC_CLZ64_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
+#    define OC_CLZ64(_x) (__builtin_clzl(_x))
+#   elif LLONG_MAX>=9223372036854775807LL|| \
+     __LONG_LONG_MAX__>=9223372036854775807LL
+#    define OC_CLZ64_OFFS ((int)sizeof(unsigned long long)*CHAR_BIT)
+#    define OC_CLZ64(_x) (__builtin_clzll(_x))
+#   endif
+#  endif /*__GNUC_PREREQ(3,4)*/
+# endif /*defined(__GNUC_PREREQ)*/
+
+
+
+/**
+ * oc_ilog32 - Integer binary logarithm of a 32-bit value.
+ * @_v: A 32-bit value.
+ * Returns floor(log2(_v))+1, or 0 if _v==0.
+ * This is the number of bits that would be required to represent _v in two's
+ *  complement notation with all of the leading zeros stripped.
+ * The OC_ILOG_32() or OC_ILOGNZ_32() macros may be able to use a builtin
+ *  function instead, which should be faster.
+ */
+int oc_ilog32(ogg_uint32_t _v);
+/**
+ * oc_ilog64 - Integer binary logarithm of a 64-bit value.
+ * @_v: A 64-bit value.
+ * Returns floor(log2(_v))+1, or 0 if _v==0.
+ * This is the number of bits that would be required to represent _v in two's
+ *  complement notation with all of the leading zeros stripped.
+ * The OC_ILOG_64() or OC_ILOGNZ_64() macros may be able to use a builtin
+ *  function instead, which should be faster.
+ */
+int oc_ilog64(ogg_int64_t _v);
+
+
+# if defined(OC_CLZ32)
+/**
+ * OC_ILOGNZ_32 - Integer binary logarithm of a non-zero 32-bit value.
+ * @_v: A non-zero 32-bit value.
+ * Returns floor(log2(_v))+1.
+ * This is the number of bits that would be required to represent _v in two's
+ *  complement notation with all of the leading zeros stripped.
+ * If _v is zero, the return value is undefined; use OC_ILOG_32() instead.
+ */
+#  define OC_ILOGNZ_32(_v) (OC_CLZ32_OFFS-OC_CLZ32(_v))
+/**
+ * OC_ILOG_32 - Integer binary logarithm of a 32-bit value.
+ * @_v: A 32-bit value.
+ * Returns floor(log2(_v))+1, or 0 if _v==0.
+ * This is the number of bits that would be required to represent _v in two's
+ *  complement notation with all of the leading zeros stripped.
+ */
+#  define OC_ILOG_32(_v)   (OC_ILOGNZ_32(_v)&-!!(_v))
+# else /*No OC_CLZ32: both macros call the out-of-line oc_ilog32().*/
+#  define OC_ILOGNZ_32(_v) (oc_ilog32(_v))
+#  define OC_ILOG_32(_v)   (oc_ilog32(_v))
+# endif /*defined(OC_CLZ32)*/
+
+# if defined(OC_CLZ64)
+/**
+ * OC_ILOGNZ_64 - Integer binary logarithm of a non-zero 64-bit value.
+ * @_v: A non-zero 64-bit value.
+ * Returns floor(log2(_v))+1.
+ * This is the number of bits that would be required to represent _v in two's
+ *  complement notation with all of the leading zeros stripped.
+ * If _v is zero, the return value is undefined; use OC_ILOG_64() instead.
+ */
+#  define OC_ILOGNZ_64(_v) (OC_CLZ64_OFFS-OC_CLZ64(_v))
+/**
+ * OC_ILOG_64 - Integer binary logarithm of a 64-bit value.
+ * @_v: A 64-bit value.
+ * Returns floor(log2(_v))+1, or 0 if _v==0.
+ * This is the number of bits that would be required to represent _v in two's
+ *  complement notation with all of the leading zeros stripped.
+ */
+#  define OC_ILOG_64(_v)   (OC_ILOGNZ_64(_v)&-!!(_v))
+# else /*No OC_CLZ64: both macros call the out-of-line oc_ilog64().*/
+#  define OC_ILOGNZ_64(_v) (oc_ilog64(_v))
+#  define OC_ILOG_64(_v)   (oc_ilog64(_v))
+# endif /*defined(OC_CLZ64)*/
+/*Helpers for OC_STATIC_ILOG_{32|64}: OC_STATIC_ILOGn covers 2**n-bit values.*/
+# define OC_STATIC_ILOG0(_v) (!!(_v))
+# define OC_STATIC_ILOG1(_v) (((_v)&0x2)?2:OC_STATIC_ILOG0(_v))
+# define OC_STATIC_ILOG2(_v) \
+ (((_v)&0xC)?2+OC_STATIC_ILOG1((_v)>>2):OC_STATIC_ILOG1(_v))
+# define OC_STATIC_ILOG3(_v) \
+ (((_v)&0xF0)?4+OC_STATIC_ILOG2((_v)>>4):OC_STATIC_ILOG2(_v))
+# define OC_STATIC_ILOG4(_v) \
+ (((_v)&0xFF00)?8+OC_STATIC_ILOG3((_v)>>8):OC_STATIC_ILOG3(_v))
+# define OC_STATIC_ILOG5(_v) \
+ (((_v)&0xFFFF0000)?16+OC_STATIC_ILOG4((_v)>>16):OC_STATIC_ILOG4(_v))
+# define OC_STATIC_ILOG6(_v) \
+ (((_v)&0xFFFFFFFF00000000ULL)?32+OC_STATIC_ILOG5((_v)>>32):OC_STATIC_ILOG5(_v))
+/**
+ * OC_STATIC_ILOG_32 - The integer logarithm of an (unsigned, 32-bit) constant.
+ * @_v: A non-negative 32-bit constant.
+ * Returns floor(log2(_v))+1, or 0 if _v==0.
+ * This is the number of bits that would be required to represent _v in two's
+ *  complement notation with all of the leading zeros stripped.
+ * This macro is suitable for evaluation at compile time, but it should not be
+ *  used on values that can change at runtime, as it operates via exhaustive
+ *  search.
+ */
+# define OC_STATIC_ILOG_32(_v) (OC_STATIC_ILOG5((ogg_uint32_t)(_v)))
+/**
+ * OC_STATIC_ILOG_64 - The integer logarithm of an (unsigned, 64-bit) constant.
+ * @_v: A non-negative 64-bit constant.
+ * Returns floor(log2(_v))+1, or 0 if _v==0.
+ * This is the number of bits that would be required to represent _v in two's
+ *  complement notation with all of the leading zeros stripped.
+ * This macro is suitable for evaluation at compile time, but it should not be
+ *  used on values that can change at runtime, as it operates via exhaustive
+ *  search.
+ */
+# define OC_STATIC_ILOG_64(_v) (OC_STATIC_ILOG6((ogg_int64_t)(_v)))
+/*Converts an integer to Q57 fixed point, i.e., shifts it left by 57 bits.*/
+#define OC_Q57(_v) ((ogg_int64_t)(_v)<<57)
+/*Binary exponential and logarithm; mathops.c describes both as using Q57.*/
+ogg_int64_t oc_bexp64(ogg_int64_t _z);
+ogg_int64_t oc_blog64(ogg_int64_t _w);
+
+#endif
diff --git a/lib/mcenc.c b/lib/mcenc.c
new file mode 100644
index 0000000..797e81f
--- /dev/null
+++ b/lib/mcenc.c
@@ -0,0 +1,767 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id$
+
+ ********************************************************************/
+#include <stdlib.h>
+#include <limits.h>
+#include <string.h>
+#include "encint.h"
+
+
+
+typedef struct oc_mcenc_ctx           oc_mcenc_ctx;
+
+
+
+/*Temporary state used for motion estimation.*/
+struct oc_mcenc_ctx{
+  /*The candidate motion vectors, stored as {x,y} pairs.*/
+  int                candidates[13][2];
+  /*The index in candidates[] of the first Set B candidate.*/
+  int                setb0;
+  /*The total number of entries of candidates[] that are in use.*/
+  int                ncandidates;
+};
+
+
+
+/*The maximum Y plane SAD value for accepting the median predictor.*/
+#define OC_YSAD_THRESH1            (256)
+/*The amount to right shift the minimum error by when inflating it for
+   computing the second maximum Y plane SAD threshold.*/
+#define OC_YSAD_THRESH2_SCALE_BITS (4)
+/*The amount to add to the second maximum Y plane threshold when inflating
+   it.*/
+#define OC_YSAD_THRESH2_OFFSET     (64)
+
+/*The vector offsets in the X direction for each search site in the square
+   pattern.*/
+static const int OC_SQUARE_DX[9]={-1,0,1,-1,0,1,-1,0,1};
+/*The vector offsets in the Y direction for each search site in the square
+   pattern.*/
+static const int OC_SQUARE_DY[9]={-1,-1,-1,0,0,0,1,1,1};
+/*The number of sites to search for each boundary condition in the square
+   pattern.
+  Bit flags for the boundary conditions are as follows:
+  1: -16==dx
+  2:      dx==15(.5)
+  4: -16==dy
+  8:      dy==15(.5)*/
+static const int OC_SQUARE_NSITES[11]={8,5,5,0,5,3,3,0,5,3,3};
+/*The list of sites to search for each boundary condition in the square
+   pattern.*/
+static const int OC_SQUARE_SITES[11][8]={
+  /* -15.5<dx<15(.5),   -15.5<dy<15(.5)*/
+  {0,1,2,3,5,6,7,8},
+  /*-15.5==dx,          -15.5<dy<15(.5)*/
+  {1,2,5,7,8},
+  /*     dx==15(.5),    -15.5<dy<15(.5)*/
+  {0,1,3,6,7},
+  /*-15.5==dx==15(.5),  -15.5<dy<15(.5)*/
+  {-1},
+  /* -15.5<dx<15(.5),  -15.5==dy*/
+  {3,5,6,7,8},
+  /*-15.5==dx,         -15.5==dy*/
+  {5,7,8},
+  /*     dx==15(.5),   -15.5==dy*/
+  {3,6,7},
+  /*-15.5==dx==15(.5), -15.5==dy*/
+  {-1},
+  /* -15.5<dx<15(.5),         dy==15(.5)*/
+  {0,1,2,3,5},
+  /*-15.5==dx,                dy==15(.5)*/
+  {1,2,5},
+  /*       dx==15(.5),        dy==15(.5)*/
+  {0,1,3}
+};
+
+
+static void oc_mcenc_find_candidates(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc,
+ int _accum[2],int _mbi,int _frame){
+  oc_mb_enc_info *embs;
+  oc_mb_enc_info *emb;
+  int             med[3][2];
+  int             n;
+  unsigned        nmbi;
+  int             i;
+  embs=_enc->mb_info;
+  emb=embs+_mbi;
+  /*Slot 0 is reserved for the median predictor, which is filled in below.*/
+  n=1;
+  /*Set A, part 1: the vectors of the macro blocks listed in cneighbors[].*/
+  for(i=0;i<emb->ncneighbors;i++){
+    nmbi=emb->cneighbors[i];
+    _mcenc->candidates[n][0]=embs[nmbi].analysis_mv[0][_frame][0];
+    _mcenc->candidates[n][1]=embs[nmbi].analysis_mv[0][_frame][1];
+    n++;
+  }
+  /*Set A, part 2: the accumulated vector, this macro block's analysis_mv[1]
+     offset by the accumulator, and the (0,0) vector.*/
+  _mcenc->candidates[n][0]=OC_CLAMPI(-31,_accum[0],31);
+  _mcenc->candidates[n][1]=OC_CLAMPI(-31,_accum[1],31);
+  n++;
+  _mcenc->candidates[n][0]=OC_CLAMPI(-31,
+   emb->analysis_mv[1][_frame][0]+_accum[0],31);
+  _mcenc->candidates[n][1]=OC_CLAMPI(-31,
+   emb->analysis_mv[1][_frame][1]+_accum[1],31);
+  n++;
+  _mcenc->candidates[n][0]=0;
+  _mcenc->candidates[n][1]=0;
+  n++;
+  /*The best predictor is the component-wise median of the first three
+     vectors of set A; three exchanges leave it in the middle slot.*/
+  memcpy(med,_mcenc->candidates+1,sizeof(med));
+  for(i=0;i<2;i++){
+    OC_SORT2I(med[0][i],med[1][i]);
+    OC_SORT2I(med[1][i],med[2][i]);
+    OC_SORT2I(med[0][i],med[1][i]);
+    _mcenc->candidates[0][i]=med[1][i];
+  }
+  /*Set B: accelerated predictors, 2*analysis_mv[1]-analysis_mv[2]+_accum.
+    NOTE(review): nmbi steps through this macro block and its pneighbors, but
+     each pass reads emb (embs[_mbi]), so the same vector is stored every
+     time; confirm whether embs[nmbi] was intended.*/
+  _mcenc->setb0=n;
+  nmbi=_mbi;
+  for(i=0;;i++){
+    _mcenc->candidates[n][0]=OC_CLAMPI(-31,
+     2*emb->analysis_mv[1][_frame][0]
+     -emb->analysis_mv[2][_frame][0]+_accum[0],31);
+    _mcenc->candidates[n][1]=OC_CLAMPI(-31,
+     2*emb->analysis_mv[1][_frame][1]
+     -emb->analysis_mv[2][_frame][1]+_accum[1],31);
+    n++;
+    if(i>=emb->npneighbors)break;
+    nmbi=emb->pneighbors[i];
+  }
+  /*Truncate to full-pel positions.*/
+  for(i=0;i<n;i++){
+    _mcenc->candidates[i][0]=OC_DIV2(_mcenc->candidates[i][0]);
+    _mcenc->candidates[i][1]=OC_DIV2(_mcenc->candidates[i][1]);
+  }
+  _mcenc->ncandidates=n;
+}
+
+#if 0 /*Compiled out: same loop as oc_satd16_halfpel(), using the SAD call.*/
+static unsigned oc_sad16_halfpel(const oc_enc_ctx *_enc,
+ const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],
+ int _mvoffset0,int _mvoffset1,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _best_err){
+  unsigned err;
+  int      bi;
+  err=0;
+  for(bi=0;bi<4;bi++){
+    ptrdiff_t frag_offs;
+    frag_offs=_frag_buf_offs[_fragis[bi]];
+    err+=oc_enc_frag_sad2_thresh(_enc,_src+frag_offs,_ref+frag_offs+_mvoffset0,
+     _ref+frag_offs+_mvoffset1,_ystride,_best_err-err);
+  }
+  return err;
+}
+#endif
+
+static unsigned oc_satd16_halfpel(const oc_enc_ctx *_enc,
+ const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],
+ int _mvoffset0,int _mvoffset1,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _best_err){
+  unsigned total;
+  int      blk;
+  /*Sum over the four blocks; each call is given _best_err less the sum so far.*/
+  for(total=0,blk=0;blk<4;blk++){
+    ptrdiff_t offs;
+    offs=_frag_buf_offs[_fragis[blk]];
+    total+=oc_enc_frag_satd2_thresh(_enc,_src+offs,_ref+offs+_mvoffset0,
+     _ref+offs+_mvoffset1,_ystride,_best_err-total);
+  }
+  return total;
+}
+
+static unsigned oc_mcenc_ysad_check_mbcandidate_fullpel(const oc_enc_ctx *_enc,
+ const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],int _dx,int _dy,
+ const unsigned char *_src,const unsigned char *_ref,int _ystride,
+ unsigned _block_err[4]){
+  unsigned total;
+  int      shift;
+  int      blk;
+  /*The displacement of the candidate vector (_dx,_dy) in a plane whose rows
+     are _ystride apart.*/
+  shift=_dx+_dy*_ystride;
+  /*Record each block's SAD in _block_err[] and return their sum.*/
+  for(total=0,blk=0;blk<4;blk++){
+    ptrdiff_t offs;
+    offs=_frag_buf_offs[_fragis[blk]];
+    _block_err[blk]=oc_enc_frag_sad(_enc,
+     _src+offs,_ref+offs+shift,_ystride);
+    total+=_block_err[blk];
+  }
+  return total;
+}
+
+static int oc_mcenc_ysatd_check_mbcandidate_fullpel(const oc_enc_ctx *_enc,
+ const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],int _dx,int _dy,
+ const unsigned char *_src,const unsigned char *_ref,int _ystride){
+  int total;
+  int shift;
+  int blk;
+  shift=_dx+_dy*_ystride;
+  /*Sum the four block results; every call is passed UINT_MAX as its limit.*/
+  for(total=0,blk=0;blk<4;blk++){
+    ptrdiff_t offs;
+    offs=_frag_buf_offs[_fragis[blk]];
+    total+=oc_enc_frag_satd_thresh(_enc,
+     _src+offs,_ref+offs+shift,_ystride,UINT_MAX);
+  }
+  return total;
+}
+
+static unsigned oc_mcenc_ysatd_check_bcandidate_fullpel(const oc_enc_ctx *_enc,
+ ptrdiff_t _frag_offs,int _dx,int _dy,
+ const unsigned char *_src,const unsigned char *_ref,int _ystride){
+  _ref+=_frag_offs+_dx+_dy*_ystride; /*Step to the displaced block.*/
+  return oc_enc_frag_satd_thresh(_enc,_src+_frag_offs,_ref,_ystride,UINT_MAX);
+}
+
+/*Perform a motion vector search for this macro block against a single
+   reference frame.
+  As a bonus, individual block motion vectors are computed as well, as much of
+   the work can be shared.
+  The actual motion vector is stored in the appropriate place in the
+   oc_mb_enc_info structure.
+  _enc:      The encoder context.
+  _accum:    Drop frame/golden MV accumulators.
+  _mbi:      The macro block index.
+  _frame:    The frame to search, either OC_FRAME_PREV or OC_FRAME_GOLD.*/
+void oc_mcenc_search_frame(oc_enc_ctx *_enc,int _accum[2],int _mbi,int _frame){
+  /*Note: Traditionally this search is done using a rate-distortion objective
+     function of the form D+lambda*R.
+    However, xiphmont tested this and found it produced a small degradation,
+     while requiring extra computation.
+    This is most likely due to Theora's peculiar MV encoding scheme: MVs are
+     not coded relative to a predictor, and the only truly cheap way to use a
+     MV is in the LAST or LAST2 MB modes, which are not being considered here.
+    Therefore if we use the MV found here, it's only because both LAST and
+     LAST2 performed poorly, and therefore the MB is not likely to be uniform
+     or suffer from the aperture problem.
+    Furthermore we would like to re-use the MV found here for as many MBs as
+     possible, so picking a slightly sub-optimal vector to save a bit or two
+     may cause increased degradation in many blocks to come.
+    We could artificially reduce lambda to compensate, but it's faster to just
+     disable it entirely, and use D (the distortion) as the sole criterion.*/
+  oc_mcenc_ctx         mcenc;
+  const ptrdiff_t     *frag_buf_offs;
+  const ptrdiff_t     *fragis;
+  const unsigned char *src;
+  const unsigned char *ref;
+  int                  ystride;
+  oc_mb_enc_info      *embs;
+  ogg_int32_t          hit_cache[31];
+  ogg_int32_t          hitbit;
+  unsigned             best_block_err[4];
+  unsigned             block_err[4];
+  unsigned             best_err;
+  int                  best_vec[2];
+  int                  best_block_vec[4][2];
+  int                  candx;
+  int                  candy;
+  int                  bi;
+  embs=_enc->mb_info;
+  /*Find some candidate motion vectors.*/
+  oc_mcenc_find_candidates(_enc,&mcenc,_accum,_mbi,_frame);
+  /*Clear the cache of locations we've examined (row candy+15, bit candx+15).*/
+  memset(hit_cache,0,sizeof(hit_cache));
+  /*Start with the median predictor.*/
+  candx=mcenc.candidates[0][0];
+  candy=mcenc.candidates[0][1];
+  hit_cache[candy+15]|=(ogg_int32_t)1<<candx+15;
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  fragis=_enc->state.mb_maps[_mbi][0];
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame]];
+  ystride=_enc->state.ref_ystride[0];
+  /*TODO: customize error function for speed/(quality+size) tradeoff.*/
+  best_err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
+   frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
+  best_vec[0]=candx;
+  best_vec[1]=candy;
+  if(_frame==OC_FRAME_PREV){
+    for(bi=0;bi<4;bi++){
+      best_block_err[bi]=block_err[bi];
+      best_block_vec[bi][0]=candx;
+      best_block_vec[bi][1]=candy;
+    }
+  }
+  /*If this predictor fails, move on to set A.*/
+  if(best_err>OC_YSAD_THRESH1){
+    unsigned err;
+    unsigned t2;
+    int      ncs;
+    int      ci;
+    /*Compute the early termination threshold for set A.*/
+    t2=embs[_mbi].error[_frame];
+    ncs=OC_MINI(3,embs[_mbi].ncneighbors);
+    for(ci=0;ci<ncs;ci++){
+      t2=OC_MAXI(t2,embs[embs[_mbi].cneighbors[ci]].error[_frame]);
+    }
+    t2+=(t2>>OC_YSAD_THRESH2_SCALE_BITS)+OC_YSAD_THRESH2_OFFSET;
+    /*Examine the candidates in set A.*/
+    for(ci=1;ci<mcenc.setb0;ci++){
+      candx=mcenc.candidates[ci][0];
+      candy=mcenc.candidates[ci][1];
+      /*If we've already examined this vector, then we would be using it if it
+         was better than what we are using.*/
+      hitbit=(ogg_int32_t)1<<candx+15;
+      if(hit_cache[candy+15]&hitbit)continue;
+      hit_cache[candy+15]|=hitbit;
+      err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
+       frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
+      if(err<best_err){
+        best_err=err;
+        best_vec[0]=candx;
+        best_vec[1]=candy;
+      }
+      if(_frame==OC_FRAME_PREV){
+        for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){
+          best_block_err[bi]=block_err[bi];
+          best_block_vec[bi][0]=candx;
+          best_block_vec[bi][1]=candy;
+        }
+      }
+    }
+    if(best_err>t2){
+      /*Examine the candidates in set B.*/
+      for(;ci<mcenc.ncandidates;ci++){
+        candx=mcenc.candidates[ci][0];
+        candy=mcenc.candidates[ci][1];
+        hitbit=(ogg_int32_t)1<<candx+15;
+        if(hit_cache[candy+15]&hitbit)continue;
+        hit_cache[candy+15]|=hitbit;
+        err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
+         frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
+        if(err<best_err){
+          best_err=err;
+          best_vec[0]=candx;
+          best_vec[1]=candy;
+        }
+        if(_frame==OC_FRAME_PREV){
+          for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){
+            best_block_err[bi]=block_err[bi];
+            best_block_vec[bi][0]=candx;
+            best_block_vec[bi][1]=candy;
+          }
+        }
+      }
+      /*Use the same threshold for set B as in set A.*/
+      if(best_err>t2){
+        int best_site;
+        int nsites;
+        int sitei;
+        int site;
+        int b;
+        /*Square pattern search.*/
+        for(;;){
+          best_site=4;
+          /*Compose the bit flags for boundary conditions.*/
+          b=OC_DIV16(-best_vec[0]+1)|OC_DIV16(best_vec[0]+1)<<1|
+           OC_DIV16(-best_vec[1]+1)<<2|OC_DIV16(best_vec[1]+1)<<3;
+          nsites=OC_SQUARE_NSITES[b];
+          for(sitei=0;sitei<nsites;sitei++){
+            site=OC_SQUARE_SITES[b][sitei];
+            candx=best_vec[0]+OC_SQUARE_DX[site];
+            candy=best_vec[1]+OC_SQUARE_DY[site];
+            hitbit=(ogg_int32_t)1<<candx+15;
+            if(hit_cache[candy+15]&hitbit)continue;
+            hit_cache[candy+15]|=hitbit;
+            err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
+             frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
+            if(err<best_err){
+              best_err=err;
+              best_site=site;
+            }
+            if(_frame==OC_FRAME_PREV){
+              for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){
+                best_block_err[bi]=block_err[bi];
+                best_block_vec[bi][0]=candx;
+                best_block_vec[bi][1]=candy;
+              }
+            }
+          }
+          if(best_site==4)break;
+          best_vec[0]+=OC_SQUARE_DX[best_site];
+          best_vec[1]+=OC_SQUARE_DY[best_site];
+        }
+        /*Final 4-MV search.*/
+        /*Simply use 1/4 of the macro block set A and B threshold as the
+           individual block threshold.*/
+        if(_frame==OC_FRAME_PREV){
+          t2>>=2;
+          for(bi=0;bi<4;bi++){
+            if(best_block_err[bi]>t2){
+              /*Square pattern search.
+                We do this in a slightly interesting manner.
+                We continue to check the SAD of all four blocks in the
+                 macro block.
+                This gives us two things:
+                 1) We can continue to use the hit_cache to avoid duplicate
+                     checks.
+                    Otherwise we could continue to read it, but not write to it
+                     without saving and restoring it for each block.
+                    Note that we could still eliminate a large number of
+                     duplicate checks by taking into account the site we came
+                     from when choosing the site list.
+                    We can still do that to avoid extra hit_cache queries, and
+                     it might even be a speed win.
+                 2) It gives us a slightly better chance of escaping local
+                     minima.
+                    We would not be here if we weren't doing a fairly bad job
+                     in finding a good vector, and checking these vectors can
+                     save us from 100 to several thousand points off our SAD 1
+                     in 15 times.
+                TODO: Is this a good idea?
+                Who knows.
+                It needs more testing.*/
+              for(;;){
+                int bestx;
+                int besty;
+                int bj;
+                bestx=best_block_vec[bi][0];
+                besty=best_block_vec[bi][1];
+                /*Compose the bit flags for boundary conditions.*/
+                b=OC_DIV16(-bestx+1)|OC_DIV16(bestx+1)<<1|
+                 OC_DIV16(-besty+1)<<2|OC_DIV16(besty+1)<<3;
+                nsites=OC_SQUARE_NSITES[b];
+                for(sitei=0;sitei<nsites;sitei++){
+                  site=OC_SQUARE_SITES[b][sitei];
+                  candx=bestx+OC_SQUARE_DX[site];
+                  candy=besty+OC_SQUARE_DY[site];
+                  hitbit=(ogg_int32_t)1<<candx+15;
+                  if(hit_cache[candy+15]&hitbit)continue;
+                  hit_cache[candy+15]|=hitbit;
+                  err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
+                   frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
+                  if(err<best_err){
+                    best_err=err;
+                    best_vec[0]=candx;
+                    best_vec[1]=candy;
+                  }
+                  for(bj=0;bj<4;bj++)if(block_err[bj]<best_block_err[bj]){
+                    best_block_err[bj]=block_err[bj];
+                    best_block_vec[bj][0]=candx;
+                    best_block_vec[bj][1]=candy;
+                  }
+                }
+                if(best_block_vec[bi][0]==bestx&&best_block_vec[bi][1]==besty){
+                  break;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  embs[_mbi].error[_frame]=(ogg_uint16_t)best_err;
+  candx=best_vec[0];
+  candy=best_vec[1];
+  embs[_mbi].satd[_frame]=oc_mcenc_ysatd_check_mbcandidate_fullpel(_enc,
+   frag_buf_offs,fragis,candx,candy,src,ref,ystride);
+  embs[_mbi].analysis_mv[0][_frame][0]=(signed char)(candx<<1);
+  embs[_mbi].analysis_mv[0][_frame][1]=(signed char)(candy<<1);
+  if(_frame==OC_FRAME_PREV){
+    for(bi=0;bi<4;bi++){
+      candx=best_block_vec[bi][0];
+      candy=best_block_vec[bi][1];
+      embs[_mbi].block_satd[bi]=oc_mcenc_ysatd_check_bcandidate_fullpel(_enc,
+       frag_buf_offs[fragis[bi]],candx,candy,src,ref,ystride);
+      embs[_mbi].block_mv[bi][0]=(signed char)(candx<<1);
+      embs[_mbi].block_mv[bi][1]=(signed char)(candy<<1);
+    }
+  }
+}
+
+/*Runs the full-pel motion search for one macro block against both the
+    previous frame and the golden frame, maintaining the three-entry history
+    of analysis motion vectors used as predictors.
+  _enc: The encoder context.
+  _mbi: The index of the macro block to search.*/
+void oc_mcenc_search(oc_enc_ctx *_enc,int _mbi){
+  oc_mv2 *mvs;
+  int     accum_p[2];
+  int     accum_g[2];
+  int     ci;
+  mvs=_enc->mb_info[_mbi].analysis_mv;
+  /*Each vector component is handled independently, so the x and y updates
+     are done one component at a time.*/
+  for(ci=0;ci<2;ci++){
+    /*The PREV accumulator is only non-zero if the previous frame was
+       dropped.*/
+    accum_p[ci]=_enc->prevframe_dropped?mvs[0][OC_FRAME_PREV][ci]:0;
+    accum_g[ci]=mvs[2][OC_FRAME_GOLD][ci];
+    mvs[0][OC_FRAME_PREV][ci]-=mvs[2][OC_FRAME_PREV][ci];
+  }
+  /*Age the motion vector predictors by one frame: entries 0 and 1 become
+     entries 1 and 2.*/
+  memmove(mvs+1,mvs,2*sizeof(*mvs));
+  /*Search against the previous frame.*/
+  oc_mcenc_search_frame(_enc,accum_p,_mbi,OC_FRAME_PREV);
+  for(ci=0;ci<2;ci++){
+    mvs[2][OC_FRAME_PREV][ci]=accum_p[ci];
+    /*GOLDEN MVs are absolute offsets from some frame in the past, whereas
+       PREV MVs are relative offsets from the frame before.
+      Convert the GOLDEN history into the relative form so that predictor
+       calculation makes sense.*/
+    mvs[1][OC_FRAME_GOLD][ci]-=mvs[2][OC_FRAME_GOLD][ci];
+    mvs[2][OC_FRAME_GOLD][ci]-=accum_g[ci];
+  }
+  /*Search against the golden frame.*/
+  oc_mcenc_search_frame(_enc,accum_g,_mbi,OC_FRAME_GOLD);
+  /*Restore the GOLDEN history to absolute offset form.
+    The newest MV is already an absolute offset.*/
+  for(ci=0;ci<2;ci++){
+    mvs[2][OC_FRAME_GOLD][ci]+=accum_g[ci];
+    mvs[1][OC_FRAME_GOLD][ci]+=mvs[2][OC_FRAME_GOLD][ci];
+  }
+}
+
+#if 0
+/*SAD-based counterpart of oc_mcenc_ysatd_halfpel_mbrefine(); currently
+    compiled out.
+  Tries the eight candidate sites listed in OC_SQUARE_SITES[0] around the
+   given full-pel vector and keeps the one with the lowest error.
+  _enc:      The encoder context.
+  _mbi:      The macro block index.
+  _vec:      On input, the full-pel vector.
+             On output, the refined vector in half-pel units.
+  _best_err: The error of the full-pel vector.
+  _frame:    The reference frame to search (an index into ref_frame_idx).
+  Return: The error of the refined vector.*/
+static int oc_mcenc_ysad_halfpel_mbrefine(const oc_enc_ctx *_enc,int _mbi,
+ int _vec[2],int _best_err,int _frame){
+  const unsigned char *src;
+  const unsigned char *ref;
+  const ptrdiff_t     *frag_buf_offs;
+  const ptrdiff_t     *fragis;
+  int                  offset_y[9];
+  int                  ystride;
+  int                  mvoffset_base;
+  int                  best_site;
+  int                  sitei;
+  int                  err;
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  /*The parameter is named _frame; the former "_framei" was undeclared.*/
+  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame]];
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  fragis=_enc->state.mb_maps[_mbi][0];
+  ystride=_enc->state.ref_ystride[0];
+  mvoffset_base=_vec[0]+_vec[1]*ystride;
+  /*Row offset for each site index.
+    Entry 4 is left unset; presumably it is the center site, which is never
+     listed in OC_SQUARE_SITES[0] (TODO confirm).*/
+  offset_y[0]=offset_y[1]=offset_y[2]=-ystride;
+  offset_y[3]=offset_y[5]=0;
+  offset_y[6]=offset_y[7]=offset_y[8]=ystride;
+  best_site=4;
+  for(sitei=0;sitei<8;sitei++){
+    int site;
+    int xmask;
+    int ymask;
+    int dx;
+    int dy;
+    int mvoffset0;
+    int mvoffset1;
+    site=OC_SQUARE_SITES[0][sitei];
+    dx=OC_SQUARE_DX[site];
+    dy=OC_SQUARE_DY[site];
+    /*The following code SHOULD be equivalent to calling
+       oc_state_get_mv_offsets() for the luma plane with the half-pel vector
+       ((_vec[0]<<1)+dx,(_vec[1]<<1)+dy).
+      However, it should also be much faster, as it involves no multiplies and
+       doesn't have to handle chroma vectors.*/
+    xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx);
+    ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
+    mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask);
+    mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask);
+    err=oc_sad16_halfpel(_enc,frag_buf_offs,fragis,
+     mvoffset0,mvoffset1,src,ref,ystride,_best_err);
+    if(err<_best_err){
+      _best_err=err;
+      best_site=site;
+    }
+  }
+  /*Convert to half-pel units and apply the winning site's displacement.*/
+  _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site];
+  _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site];
+  return _best_err;
+}
+#endif
+
+/*Refines a full-pel macro block motion vector to half-pel precision using
+    the luma SATD of the whole macro block.
+  Tries the eight candidate sites listed in OC_SQUARE_SITES[0] around the
+   given full-pel vector and keeps the one with the lowest SATD; if none
+   beats _best_err, site 4 is kept.
+  _enc:      The encoder context.
+  _mbi:      The macro block index.
+  _vec:      On input, the full-pel vector.
+             On output, the refined vector in half-pel units.
+  _best_err: The SATD of the full-pel vector.
+  _frame:    The reference frame to search (an index into ref_frame_idx).
+  Return: The SATD of the refined vector.*/
+static unsigned oc_mcenc_ysatd_halfpel_mbrefine(const oc_enc_ctx *_enc,
+ int _mbi,int _vec[2],unsigned _best_err,int _frame){
+  const unsigned char *src;
+  const unsigned char *ref;
+  const ptrdiff_t     *frag_buf_offs;
+  const ptrdiff_t     *fragis;
+  int                  offset_y[9];
+  int                  ystride;
+  int                  mvoffset_base;
+  int                  best_site;
+  int                  sitei;
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame]];
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  fragis=_enc->state.mb_maps[_mbi][0];
+  ystride=_enc->state.ref_ystride[0];
+  mvoffset_base=_vec[0]+_vec[1]*ystride;
+  /*Row offset for each site index.
+    Entry 4 is left unset; presumably it is the center site, which is never
+     listed in OC_SQUARE_SITES[0] (TODO confirm).*/
+  offset_y[0]=offset_y[1]=offset_y[2]=-ystride;
+  offset_y[3]=offset_y[5]=0;
+  offset_y[6]=offset_y[7]=offset_y[8]=ystride;
+  best_site=4;
+  for(sitei=0;sitei<8;sitei++){
+    /*Kept unsigned so the comparison against _best_err is not a mixed
+       signed/unsigned one, matching oc_mcenc_ysatd_halfpel_brefine().*/
+    unsigned err;
+    int      site;
+    int      xmask;
+    int      ymask;
+    int      dx;
+    int      dy;
+    int      mvoffset0;
+    int      mvoffset1;
+    site=OC_SQUARE_SITES[0][sitei];
+    dx=OC_SQUARE_DX[site];
+    dy=OC_SQUARE_DY[site];
+    /*The following code SHOULD be equivalent to calling
+       oc_state_get_mv_offsets() for the luma plane with the half-pel vector
+       ((_vec[0]<<1)+dx,(_vec[1]<<1)+dy).
+      However, it should also be much faster, as it involves no multiplies and
+       doesn't have to handle chroma vectors.*/
+    xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx);
+    ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
+    mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask);
+    mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask);
+    err=oc_satd16_halfpel(_enc,frag_buf_offs,fragis,
+     mvoffset0,mvoffset1,src,ref,ystride,_best_err);
+    if(err<_best_err){
+      _best_err=err;
+      best_site=site;
+    }
+  }
+  /*Convert to half-pel units and apply the winning site's displacement.*/
+  _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site];
+  _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site];
+  return _best_err;
+}
+
+/*Refines the macro block level analysis vector for one reference frame to
+    half-pel precision, updating the stored SATD to match.
+  The stored vector is halved with OC_DIV2 before refinement and written back
+   in the units produced by oc_mcenc_ysatd_halfpel_mbrefine().
+  _enc:   The encoder context.
+  _mbi:   The macro block index.
+  _frame: The reference frame whose vector should be refined.*/
+void oc_mcenc_refine1mv(oc_enc_ctx *_enc,int _mbi,int _frame){
+  oc_mb_enc_info *emb;
+  int             mv[2];
+  emb=_enc->mb_info+_mbi;
+  mv[0]=OC_DIV2(emb->analysis_mv[0][_frame][0]);
+  mv[1]=OC_DIV2(emb->analysis_mv[0][_frame][1]);
+  /*The current SATD serves as the score to beat.*/
+  emb->satd[_frame]=oc_mcenc_ysatd_halfpel_mbrefine(_enc,_mbi,mv,
+   emb->satd[_frame],_frame);
+  emb->analysis_mv[0][_frame][0]=(signed char)mv[0];
+  emb->analysis_mv[0][_frame][1]=(signed char)mv[1];
+}
+
+#if 0
+/*SAD-based counterpart of oc_mcenc_ysatd_halfpel_brefine(); currently
+    compiled out.
+  Tries the eight candidate sites listed in OC_SQUARE_SITES[0] around the
+   given full-pel vector for a single block and keeps the one with the lowest
+   error.
+  _enc:      The encoder context.
+  _vec:      On input, the full-pel vector.
+             On output, the refined vector in half-pel units.
+  _src:      The source block.
+  _ref:      The co-located position in the reference frame.
+  _ystride:  The stride used for both _src and _ref.
+  _offset_y: The row offset to apply for each site index.
+  _best_err: The error of the full-pel vector.
+  Return: The error of the refined vector.*/
+static int oc_mcenc_ysad_halfpel_brefine(const oc_enc_ctx *_enc,
+ int _vec[2],const unsigned char *_src,const unsigned char *_ref,int _ystride,
+ int _offset_y[9],unsigned _best_err){
+  int mvoffset_base;
+  int best_site;
+  int sitei;
+  mvoffset_base=_vec[0]+_vec[1]*_ystride;
+  best_site=4;
+  for(sitei=0;sitei<8;sitei++){
+    unsigned err;
+    int      site;
+    int      xmask;
+    int      ymask;
+    int      dx;
+    int      dy;
+    int      mvoffset0;
+    int      mvoffset1;
+    site=OC_SQUARE_SITES[0][sitei];
+    dx=OC_SQUARE_DX[site];
+    dy=OC_SQUARE_DY[site];
+    /*The following code SHOULD be equivalent to calling
+       oc_state_get_mv_offsets() for the luma plane with the half-pel vector
+       ((_vec[0]<<1)+dx,(_vec[1]<<1)+dy).
+      However, it should also be much faster, as it involves no multiplies and
+       doesn't have to handle chroma vectors.*/
+    xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx);
+    ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
+    mvoffset0=mvoffset_base+(dx&xmask)+(_offset_y[site]&ymask);
+    mvoffset1=mvoffset_base+(dx&~xmask)+(_offset_y[site]&~ymask);
+    /*The stride parameter is named _ystride; the former "ystride" was
+       undeclared in this function.*/
+    err=oc_enc_frag_sad2_thresh(_enc,_src,
+     _ref+mvoffset0,_ref+mvoffset1,_ystride,_best_err);
+    if(err<_best_err){
+      _best_err=err;
+      best_site=site;
+    }
+  }
+  /*Convert to half-pel units and apply the winning site's displacement.*/
+  _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site];
+  _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site];
+  return _best_err;
+}
+#endif
+
+/*Refines a full-pel motion vector for a single block to half-pel precision
+    using SATD.
+  Each of the eight candidate sites listed in OC_SQUARE_SITES[0] is scored,
+   and the best one that beats _best_err is kept; otherwise site 4 is used.
+  _enc:      The encoder context.
+  _vec:      On input, the full-pel vector.
+             On output, the refined vector in half-pel units.
+  _src:      The source block.
+  _ref:      The co-located position in the reference frame.
+  _ystride:  The stride used for both _src and _ref.
+  _offset_y: The row offset to apply for each site index.
+  _best_err: The SATD of the full-pel vector.
+  Return: The SATD of the refined vector.*/
+static unsigned oc_mcenc_ysatd_halfpel_brefine(const oc_enc_ctx *_enc,
+ int _vec[2],const unsigned char *_src,const unsigned char *_ref,int _ystride,
+ int _offset_y[9],unsigned _best_err){
+  int fullpel_offs;
+  int winner;
+  int si;
+  fullpel_offs=_vec[0]+_vec[1]*_ystride;
+  winner=4;
+  for(si=0;si<8;si++){
+    unsigned satd;
+    int      site;
+    int      dx;
+    int      dy;
+    int      xmask;
+    int      ymask;
+    int      offs0;
+    int      offs1;
+    site=OC_SQUARE_SITES[0][si];
+    dx=OC_SQUARE_DX[site];
+    dy=OC_SQUARE_DY[site];
+    /*This SHOULD produce the same two offsets as
+        oc_state_get_mv_offsets(&_enc->state,&mvoffsets,0,
+         (_vec[0]<<1)+dx,(_vec[1]<<1)+dy);
+      but without any multiplies and without chroma vector handling, so it
+       should be much faster.*/
+    xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx);
+    ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
+    offs0=fullpel_offs+(dx&xmask)+(_offset_y[site]&ymask);
+    offs1=fullpel_offs+(dx&~xmask)+(_offset_y[site]&~ymask);
+    satd=oc_enc_frag_satd2_thresh(_enc,_src,
+     _ref+offs0,_ref+offs1,_ystride,_best_err);
+    /*Only a strict improvement replaces the current best.*/
+    if(satd>=_best_err)continue;
+    _best_err=satd;
+    winner=site;
+  }
+  /*Convert to half-pel units and apply the winning site's displacement.*/
+  _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[winner];
+  _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[winner];
+  return _best_err;
+}
+
+/*Refines the four per-block motion vectors of a macro block to half-pel
+    precision against the OC_FRAME_PREV reference frame.
+  The starting vectors are read from block_mv (halved with OC_DIV2), the
+   refined vectors are stored in ref_mv, and block_satd is updated with the
+   refined scores.
+  _enc: The encoder context.
+  _mbi: The macro block index.*/
+void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi){
+  oc_mb_enc_info      *emb;
+  const ptrdiff_t     *frag_buf_offs;
+  const ptrdiff_t     *fragis;
+  const unsigned char *src;
+  const unsigned char *ref;
+  int                  offset_y[9];
+  int                  ystride;
+  int                  bi;
+  int                  ci;
+  emb=_enc->mb_info+_mbi;
+  src=_enc->state.ref_frame_data[OC_FRAME_IO];
+  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]];
+  frag_buf_offs=_enc->state.frag_buf_offs;
+  fragis=_enc->state.mb_maps[_mbi][0];
+  ystride=_enc->state.ref_ystride[0];
+  /*Sites 0-2 get -ystride, sites 3 and 5 get 0, and sites 6-8 get +ystride.
+    Entry 4 is deliberately left unset, as in the macro block level
+     refinement.*/
+  for(ci=0;ci<3;ci++){
+    offset_y[ci]=-ystride;
+    offset_y[6+ci]=ystride;
+  }
+  offset_y[3]=offset_y[5]=0;
+  for(bi=0;bi<4;bi++){
+    ptrdiff_t offs;
+    int       mv[2];
+    offs=frag_buf_offs[fragis[bi]];
+    mv[0]=OC_DIV2(emb->block_mv[bi][0]);
+    mv[1]=OC_DIV2(emb->block_mv[bi][1]);
+    /*The current block SATD serves as the score to beat.*/
+    emb->block_satd[bi]=oc_mcenc_ysatd_halfpel_brefine(_enc,mv,
+     src+offs,ref+offs,ystride,offset_y,emb->block_satd[bi]);
+    emb->ref_mv[bi][0]=(signed char)mv[0];
+    emb->ref_mv[bi][1]=(signed char)mv[1];
+  }
+}
diff --git a/lib/modedec.h b/lib/modedec.h
new file mode 100644
index 0000000..ea12c64
--- /dev/null
+++ b/lib/modedec.h
@@ -0,0 +1,4027 @@
+/*File generated by libtheora with OC_COLLECT_METRICS defined at compile time.*/
+#if !defined(_modedec_H)
+# define _modedec_H (1)
+
+
+
+# if defined(OC_COLLECT_METRICS)
+typedef struct oc_mode_metrics oc_mode_metrics;
+# endif
+typedef struct oc_mode_rd      oc_mode_rd;
+
+
+
+/*The number of extra bits of precision at which to store rate metrics.*/
+# define OC_BIT_SCALE  (6)
+/*The number of extra bits of precision at which to store RMSE metrics.
+  This must be at least half OC_BIT_SCALE (rounded up).*/
+# define OC_RMSE_SCALE (5)
+/*The number of bins to partition statistics into.*/
+# define OC_SAD_BINS   (24)
+/*The number of bits of precision to drop from SAD scores to assign them to a
+   bin.*/
+# define OC_SAD_SHIFT  (9)
+
+
+
+# if defined(OC_COLLECT_METRICS)
+/*Statistics gathered when OC_COLLECT_METRICS is defined.
+  One instance is kept per entry of OC_MODE_METRICS, which is indexed the same
+   way as OC_MODE_RD.
+  NOTE(review): The field names suggest a weight, weighted sums of SATD, rate,
+   and RMSE, and their second-order moments/cross terms; confirm against the
+   code that accumulates them.*/
+struct oc_mode_metrics{
+  double fragw;
+  double satd;
+  double rate;
+  double rmse;
+  double satd2;
+  double satdrate;
+  double rate2;
+  double satdrmse;
+  double rmse2;
+};
+
+
+int             oc_has_mode_metrics;
+oc_mode_metrics OC_MODE_METRICS[64][3][2][OC_SAD_BINS];
+# endif
+
+
+
+/*One entry of the OC_MODE_RD table: a (rate, RMSE) pair for a single
+   statistics bin.
+  NOTE(review): Presumably rate carries OC_BIT_SCALE extra bits of precision
+   and rmse carries OC_RMSE_SCALE extra bits; confirm against the users of
+   the table.*/
+struct oc_mode_rd{
+  ogg_int16_t rate;
+  ogg_int16_t rmse;
+};
+
+
+# if !defined(OC_COLLECT_METRICS)
+static const
+# endif
+oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={
+  {
+    {
+      /*Y'  qi=0  INTRA*/
+      {
+        {   87,  -66},{  132, 1611},{  197, 3474},{  285, 5130},
+        {  376, 6419},{  450, 7545},{  521, 8587},{  600, 9587},
+        {  689,10498},{  790,11348},{  899,12158},{ 1030,12855},
+        { 1166,13459},{ 1276,14052},{ 1353,14732},{ 1444,15425},
+        { 1535,16101},{ 1609,16856},{ 1697,17532},{ 1823,17995},
+        { 1962,18426},{ 2085,18919},{ 2201,19503},{ 2304,20307}
+      },
+      /*Y'  qi=0  INTER*/
+      {
+        {   32, -105},{   40, 1268},{   54, 2919},{   91, 4559},
+        {  118, 6244},{  132, 7932},{  142, 9514},{  149,10989},
+        {  155,12375},{  161,13679},{  168,14958},{  176,16215},
+        {  187,17431},{  196,18623},{  207,19790},{  218,20941},
+        {  230,22083},{  246,23213},{  265,24333},{  292,25439},
+        {  328,26512},{  372,27538},{  427,28522},{  494,29479}
+      }
+    },
+    {
+      /*Cb  qi=0  INTRA*/
+      {
+        {    1,    6},{   27,  368},{   52,  738},{   67, 1171},
+        {   80, 1642},{   99, 2134},{  110, 2642},{  112, 3144},
+        {  126, 3578},{  154, 3967},{  167, 4387},{  172, 4839},
+        {  191, 5278},{  208, 5666},{  220, 6036},{  223, 6398},
+        {  227, 6814},{  253, 7157},{  284, 7403},{  292, 7699},
+        {  314, 7983},{  339, 8203},{  363, 8460},{  399, 8919}
+      },
+      /*Cb  qi=0  INTER*/
+      {
+        {   68,  -55},{   63,  275},{   58,  602},{   53,  936},
+        {   50, 1290},{   54, 1691},{   58, 2116},{   62, 2553},
+        {   67, 2992},{   72, 3422},{   78, 3843},{   84, 4253},
+        {   89, 4658},{   94, 5062},{   98, 5455},{  100, 5848},
+        {  102, 6231},{  104, 6604},{  104, 6982},{  105, 7359},
+        {  105, 7733},{  104, 8104},{  105, 8465},{  111, 8828}
+      }
+    },
+    {
+      /*Cr  qi=0  INTRA*/
+      {
+        {    1,    8},{   23,  375},{   47,  759},{   63, 1220},
+        {   71, 1693},{   82, 2171},{   94, 2652},{  109, 3103},
+        {  125, 3567},{  133, 3995},{  151, 4375},{  168, 4819},
+        {  174, 5244},{  190, 5635},{  215, 6005},{  242, 6347},
+        {  257, 6758},{  280, 7068},{  311, 7336},{  326, 7652},
+        {  346, 7968},{  372, 8213},{  388, 8515},{  408, 9060}
+      },
+      /*Cr  qi=0  INTER*/
+      {
+        {   69,    0},{   60,  314},{   49,  624},{   45,  943},
+        {   45, 1285},{   49, 1691},{   55, 2130},{   62, 2560},
+        {   71, 2973},{   79, 3385},{   85, 3800},{   89, 4207},
+        {   92, 4620},{   95, 5037},{   96, 5436},{   97, 5839},
+        {   98, 6252},{   99, 6653},{   99, 7038},{  103, 7426},
+        {  107, 7810},{  108, 8178},{  107, 8539},{  106, 8937}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=1  INTRA*/
+      {
+        {   81,  -71},{  133, 1610},{  203, 3460},{  296, 5083},
+        {  392, 6342},{  467, 7454},{  541, 8486},{  625, 9466},
+        {  716,10352},{  823,11181},{  940,11961},{ 1074,12643},
+        { 1211,13233},{ 1324,13807},{ 1408,14489},{ 1504,15167},
+        { 1598,15824},{ 1679,16544},{ 1788,17161},{ 1928,17579},
+        { 2070,17991},{ 2202,18456},{ 2324,19021},{ 2425,19894}
+      },
+      /*Y'  qi=1  INTER*/
+      {
+        {   34,    4},{   40, 1307},{   55, 2914},{   93, 4555},
+        {  120, 6243},{  134, 7912},{  144, 9468},{  152,10918},
+        {  158,12275},{  164,13569},{  171,14846},{  180,16098},
+        {  191,17310},{  204,18484},{  216,19636},{  228,20779},
+        {  242,21912},{  261,23036},{  286,24146},{  320,25221},
+        {  363,26265},{  418,27261},{  485,28203},{  551,29148}
+      }
+    },
+    {
+      /*Cb  qi=1  INTRA*/
+      {
+        {    1,    6},{   28,  367},{   52,  738},{   68, 1172},
+        {   86, 1644},{  106, 2135},{  115, 2642},{  119, 3141},
+        {  132, 3569},{  157, 3951},{  172, 4366},{  177, 4819},
+        {  194, 5258},{  211, 5638},{  224, 6006},{  233, 6367},
+        {  236, 6784},{  258, 7121},{  299, 7357},{  319, 7637},
+        {  337, 7921},{  358, 8141},{  381, 8367},{  401, 8768}
+      },
+      /*Cb  qi=1  INTER*/
+      {
+        {   95,  -31},{   81,  295},{   67,  614},{   53,  953},
+        {   48, 1305},{   51, 1700},{   56, 2125},{   61, 2563},
+        {   67, 3008},{   73, 3435},{   79, 3844},{   85, 4251},
+        {   90, 4663},{   95, 5073},{   98, 5458},{  100, 5844},
+        {  101, 6231},{  102, 6606},{  102, 6980},{  103, 7347},
+        {  104, 7726},{  105, 8096},{  105, 8453},{  105, 8789}
+      }
+    },
+    {
+      /*Cr  qi=1  INTRA*/
+      {
+        {    1,    8},{   25,  375},{   50,  759},{   65, 1221},
+        {   74, 1695},{   86, 2172},{  101, 2651},{  117, 3101},
+        {  129, 3561},{  135, 3985},{  153, 4368},{  171, 4807},
+        {  182, 5223},{  202, 5608},{  225, 5964},{  251, 6300},
+        {  271, 6697},{  295, 6978},{  324, 7235},{  348, 7558},
+        {  367, 7877},{  394, 8101},{  413, 8386},{  409, 8945}
+      },
+      /*Cr  qi=1  INTER*/
+      {
+        {   66,   11},{   59,  323},{   51,  631},{   44,  949},
+        {   44, 1292},{   49, 1703},{   56, 2140},{   62, 2566},
+        {   69, 2991},{   77, 3397},{   84, 3799},{   89, 4211},
+        {   93, 4634},{   94, 5049},{   95, 5444},{   96, 5854},
+        {   94, 6260},{   95, 6640},{   96, 7032},{  101, 7423},
+        {  104, 7790},{  105, 8158},{  109, 8527},{  108, 8872}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=2  INTRA*/
+      {
+        {   87,  -72},{  139, 1607},{  213, 3426},{  315, 4992},
+        {  416, 6217},{  495, 7315},{  574, 8317},{  666, 9265},
+        {  763,10124},{  875,10906},{ 1001,11654},{ 1147,12305},
+        { 1289,12865},{ 1407,13424},{ 1503,14076},{ 1610,14724},
+        { 1720,15342},{ 1815,16020},{ 1937,16579},{ 2084,16981},
+        { 2236,17371},{ 2385,17779},{ 2536,18250},{ 2689,18931}
+      },
+      /*Y'  qi=2  INTER*/
+      {
+        {   30,   -2},{   40, 1308},{   57, 2921},{   96, 4567},
+        {  122, 6260},{  136, 7902},{  148, 9418},{  156,10826},
+        {  162,12157},{  169,13448},{  177,14709},{  188,15938},
+        {  200,17133},{  213,18295},{  228,19433},{  245,20564},
+        {  264,21685},{  289,22790},{  323,23876},{  368,24916},
+        {  427,25906},{  499,26837},{  585,27700},{  680,28514}
+      }
+    },
+    {
+      /*Cb  qi=2  INTRA*/
+      {
+        {    1,    6},{   30,  367},{   58,  738},{   77, 1172},
+        {   93, 1645},{  111, 2137},{  123, 2642},{  126, 3133},
+        {  136, 3553},{  162, 3934},{  178, 4352},{  183, 4803},
+        {  199, 5231},{  220, 5596},{  235, 5957},{  245, 6314},
+        {  256, 6718},{  286, 7048},{  320, 7285},{  336, 7568},
+        {  366, 7829},{  387, 8045},{  405, 8261},{  445, 8550}
+      },
+      /*Cb  qi=2  INTER*/
+      {
+        {  115,  -61},{   93,  277},{   71,  609},{   54,  963},
+        {   49, 1329},{   53, 1715},{   58, 2138},{   63, 2583},
+        {   69, 3017},{   75, 3442},{   81, 3857},{   88, 4263},
+        {   93, 4667},{   96, 5065},{  101, 5451},{  101, 5832},
+        {  102, 6213},{  103, 6593},{  103, 6968},{  104, 7336},
+        {  104, 7710},{  105, 8076},{  106, 8440},{  106, 8822}
+      }
+    },
+    {
+      /*Cr  qi=2  INTRA*/
+      {
+        {    1,    8},{   27,  375},{   54,  759},{   70, 1222},
+        {   79, 1696},{   89, 2173},{  106, 2652},{  123, 3098},
+        {  135, 3553},{  143, 3972},{  161, 4348},{  181, 4782},
+        {  194, 5189},{  213, 5565},{  235, 5907},{  266, 6229},
+        {  286, 6618},{  311, 6897},{  339, 7152},{  362, 7454},
+        {  392, 7721},{  416, 7946},{  429, 8227},{  458, 8540}
+      },
+      /*Cr  qi=2  INTER*/
+      {
+        {   74,   20},{   63,  330},{   51,  635},{   44,  942},
+        {   47, 1287},{   54, 1710},{   59, 2147},{   65, 2571},
+        {   72, 2996},{   79, 3413},{   86, 3820},{   91, 4230},
+        {   93, 4642},{   95, 5046},{   95, 5442},{   95, 5839},
+        {   96, 6243},{   97, 6641},{   99, 7021},{  101, 7396},
+        {  103, 7764},{  106, 8138},{  109, 8507},{  114, 8851}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=3  INTRA*/
+      {
+        {   91,  -67},{  141, 1606},{  219, 3405},{  328, 4929},
+        {  433, 6122},{  515, 7209},{  598, 8204},{  693, 9145},
+        {  796, 9986},{  912,10756},{ 1045,11471},{ 1200,12079},
+        { 1345,12640},{ 1471,13179},{ 1571,13809},{ 1678,14450},
+        { 1798,15047},{ 1905,15701},{ 2043,16205},{ 2202,16569},
+        { 2351,16971},{ 2501,17393},{ 2660,17851},{ 2825,18455}
+      },
+      /*Y'  qi=3  INTER*/
+      {
+        {   53, -164},{   38, 1314},{   59, 2917},{   99, 4563},
+        {  124, 6253},{  139, 7882},{  150, 9375},{  159,10749},
+        {  166,12059},{  173,13349},{  183,14608},{  194,15826},
+        {  208,17003},{  223,18150},{  240,19287},{  259,20411},
+        {  284,21508},{  317,22593},{  359,23656},{  414,24671},
+        {  483,25634},{  569,26519},{  670,27332},{  786,28072}
+      }
+    },
+    {
+      /*Cb  qi=3  INTRA*/
+      {
+        {    1,    5},{   31,  367},{   58,  739},{   78, 1173},
+        {   96, 1645},{  113, 2134},{  125, 2638},{  133, 3127},
+        {  148, 3542},{  171, 3915},{  184, 4328},{  192, 4776},
+        {  209, 5197},{  230, 5556},{  245, 5909},{  252, 6261},
+        {  272, 6641},{  304, 6942},{  330, 7184},{  342, 7477},
+        {  380, 7736},{  404, 7962},{  428, 8151},{  469, 8430}
+      },
+      /*Cb  qi=3  INTER*/
+      {
+        {   86,  -29},{   72,  296},{   58,  618},{   46,  964},
+        {   47, 1338},{   51, 1743},{   56, 2158},{   63, 2594},
+        {   69, 3035},{   77, 3455},{   84, 3859},{   89, 4266},
+        {   94, 4673},{   98, 5074},{  101, 5460},{  101, 5842},
+        {  101, 6217},{  101, 6593},{  102, 6964},{  104, 7325},
+        {  103, 7696},{  103, 8056},{  104, 8430},{  103, 8792}
+      }
+    },
+    {
+      /*Cr  qi=3  INTRA*/
+      {
+        {    1,    8},{   27,  374},{   56,  759},{   74, 1221},
+        {   83, 1696},{   96, 2173},{  113, 2650},{  127, 3091},
+        {  140, 3542},{  151, 3960},{  164, 4334},{  188, 4764},
+        {  208, 5144},{  224, 5493},{  250, 5841},{  278, 6162},
+        {  298, 6548},{  334, 6816},{  365, 7045},{  388, 7343},
+        {  419, 7613},{  443, 7836},{  455, 8105},{  484, 8445}
+      },
+      /*Cr  qi=3  INTER*/
+      {
+        {   76,   26},{   65,  332},{   53,  638},{   45,  945},
+        {   45, 1304},{   53, 1725},{   60, 2153},{   68, 2584},
+        {   74, 3007},{   81, 3425},{   87, 3844},{   91, 4253},
+        {   94, 4657},{   95, 5061},{   94, 5462},{   94, 5856},
+        {   95, 6250},{   96, 6635},{   97, 7014},{  101, 7393},
+        {  104, 7761},{  106, 8137},{  109, 8506},{  111, 8823}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=4  INTRA*/
+      {
+        {   80,  -67},{  143, 1603},{  227, 3378},{  344, 4861},
+        {  454, 6026},{  537, 7104},{  626, 8089},{  725, 9006},
+        {  830, 9827},{  950,10581},{ 1089,11270},{ 1257,11826},
+        { 1409,12366},{ 1535,12912},{ 1640,13528},{ 1753,14173},
+        { 1884,14756},{ 2007,15368},{ 2148,15852},{ 2307,16212},
+        { 2464,16591},{ 2614,17019},{ 2785,17455},{ 2970,17963}
+      },
+      /*Y'  qi=4  INTER*/
+      {
+        {   50, -145},{   38, 1324},{   61, 2921},{  102, 4566},
+        {  127, 6248},{  142, 7845},{  154, 9300},{  163,10656},
+        {  169,11965},{  177,13246},{  188,14495},{  202,15702},
+        {  218,16864},{  236,18003},{  256,19124},{  278,20233},
+        {  307,21330},{  347,22398},{  398,23437},{  463,24429},
+        {  546,25343},{  649,26170},{  767,26935},{  888,27674}
+      }
+    },
+    {
+      /*Cb  qi=4  INTRA*/
+      {
+        {    1,    5},{   33,  367},{   61,  739},{   80, 1173},
+        {   98, 1646},{  114, 2136},{  126, 2639},{  137, 3124},
+        {  152, 3535},{  176, 3903},{  194, 4307},{  206, 4753},
+        {  222, 5165},{  242, 5508},{  260, 5857},{  272, 6205},
+        {  294, 6559},{  332, 6848},{  356, 7104},{  364, 7389},
+        {  396, 7637},{  415, 7878},{  446, 8064},{  506, 8294}
+      },
+      /*Cb  qi=4  INTER*/
+      {
+        {   86,  -15},{   73,  308},{   60,  627},{   46,  967},
+        {   47, 1343},{   51, 1754},{   56, 2183},{   63, 2615},
+        {   70, 3044},{   79, 3459},{   85, 3866},{   90, 4276},
+        {   94, 4686},{   97, 5088},{  100, 5467},{  102, 5837},
+        {  102, 6205},{  101, 6569},{  103, 6939},{  104, 7317},
+        {  105, 7690},{  107, 8043},{  107, 8394},{  111, 8736}
+      }
+    },
+    {
+      /*Cr  qi=4  INTRA*/
+      {
+        {    1,    7},{   28,  375},{   57,  759},{   79, 1221},
+        {   92, 1697},{  105, 2174},{  122, 2648},{  135, 3085},
+        {  146, 3530},{  157, 3947},{  171, 4316},{  195, 4737},
+        {  218, 5117},{  239, 5445},{  268, 5767},{  295, 6074},
+        {  315, 6460},{  355, 6735},{  392, 6933},{  418, 7218},
+        {  448, 7495},{  471, 7688},{  481, 7954},{  504, 8313}
+      },
+      /*Cr  qi=4  INTER*/
+      {
+        {   68,   28},{   57,  334},{   47,  639},{   43,  953},
+        {   48, 1314},{   54, 1736},{   59, 2169},{   69, 2592},
+        {   78, 3017},{   84, 3434},{   88, 3850},{   92, 4260},
+        {   95, 4663},{   96, 5068},{   95, 5455},{   95, 5839},
+        {   96, 6243},{   97, 6626},{   98, 7006},{  101, 7390},
+        {  104, 7755},{  108, 8115},{  111, 8471},{  110, 8825}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=5  INTRA*/
+      {
+        {   84,  -69},{  147, 1599},{  237, 3350},{  360, 4796},
+        {  475, 5934},{  562, 6992},{  657, 7953},{  765, 8837},
+        {  874, 9641},{  998,10384},{ 1146,11047},{ 1322,11572},
+        { 1484,12076},{ 1617,12609},{ 1731,13203},{ 1856,13806},
+        { 1995,14367},{ 2132,14936},{ 2289,15386},{ 2460,15721},
+        { 2635,16066},{ 2802,16442},{ 2980,16805},{ 3177,17272}
+      },
+      /*Y'  qi=5  INTER*/
+      {
+        {   38,  -86},{   37, 1349},{   64, 2920},{  105, 4563},
+        {  129, 6236},{  145, 7809},{  158, 9236},{  167,10572},
+        {  174,11871},{  182,13141},{  195,14368},{  212,15558},
+        {  230,16706},{  250,17828},{  274,18944},{  303,20041},
+        {  342,21116},{  394,22152},{  460,23144},{  543,24073},
+        {  648,24919},{  773,25673},{  922,26323},{ 1084,26924}
+      }
+    },
+    {
+      /*Cb  qi=5  INTRA*/
+      {
+        {    1,    5},{   34,  367},{   63,  739},{   82, 1174},
+        {  102, 1647},{  119, 2137},{  134, 2639},{  145, 3121},
+        {  161, 3529},{  189, 3891},{  207, 4290},{  216, 4721},
+        {  232, 5113},{  258, 5455},{  277, 5798},{  294, 6124},
+        {  322, 6427},{  352, 6697},{  370, 6982},{  384, 7283},
+        {  423, 7529},{  448, 7766},{  478, 7943},{  527, 8151}
+      },
+      /*Cb  qi=5  INTER*/
+      {
+        {   83,  -49},{   69,  284},{   55,  611},{   48,  961},
+        {   49, 1355},{   52, 1769},{   58, 2191},{   65, 2616},
+        {   73, 3041},{   80, 3460},{   87, 3868},{   92, 4276},
+        {   95, 4682},{   98, 5077},{  100, 5459},{  102, 5827},
+        {  102, 6200},{  102, 6568},{  103, 6930},{  103, 7303},
+        {  104, 7672},{  106, 8032},{  106, 8391},{  106, 8727}
+      }
+    },
+    {
+      /*Cr  qi=5  INTRA*/
+      {
+        {    1,    8},{   28,  375},{   57,  760},{   81, 1222},
+        {   99, 1696},{  111, 2175},{  125, 2648},{  140, 3079},
+        {  152, 3520},{  162, 3927},{  179, 4294},{  203, 4714},
+        {  225, 5080},{  254, 5389},{  286, 5703},{  318, 5997},
+        {  342, 6364},{  380, 6640},{  416, 6837},{  445, 7103},
+        {  473, 7370},{  497, 7562},{  514, 7811},{  549, 8148}
+      },
+      /*Cr  qi=5  INTER*/
+      {
+        {   60,    6},{   54,  323},{   46,  638},{   43,  958},
+        {   45, 1329},{   54, 1749},{   61, 2175},{   70, 2600},
+        {   79, 3021},{   85, 3437},{   89, 3847},{   93, 4254},
+        {   95, 4660},{   96, 5065},{   95, 5456},{   95, 5849},
+        {   96, 6243},{   96, 6621},{   97, 6996},{  101, 7366},
+        {  104, 7722},{  107, 8088},{  111, 8448},{  119, 8816}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=6  INTRA*/
+      {
+        {   88,  -69},{  151, 1593},{  251, 3294},{  387, 4681},
+        {  507, 5790},{  601, 6837},{  702, 7787},{  813, 8648},
+        {  927, 9427},{ 1059,10152},{ 1213,10787},{ 1399,11284},
+        { 1568,11781},{ 1705,12312},{ 1823,12890},{ 1957,13482},
+        { 2106,14036},{ 2249,14600},{ 2411,15042},{ 2588,15359},
+        { 2772,15699},{ 2947,16062},{ 3127,16429},{ 3320,16849}
+      },
+      /*Y'  qi=6  INTER*/
+      {
+        {   44,  -80},{   36, 1346},{   69, 2919},{  111, 4563},
+        {  136, 6216},{  154, 7746},{  168, 9139},{  178,10461},
+        {  185,11747},{  195,13007},{  211,14229},{  230,15408},
+        {  250,16547},{  274,17663},{  302,18769},{  339,19851},
+        {  386,20907},{  446,21933},{  527,22884},{  631,23746},
+        {  760,24512},{  914,25178},{ 1087,25758},{ 1278,26262}
+      }
+    },
+    {
+      /*Cb  qi=6  INTRA*/
+      {
+        {    1,    4},{   36,  367},{   66,  739},{   84, 1174},
+        {  105, 1648},{  126, 2139},{  140, 2639},{  149, 3116},
+        {  164, 3523},{  194, 3880},{  217, 4271},{  226, 4694},
+        {  243, 5077},{  270, 5407},{  291, 5742},{  310, 6061},
+        {  340, 6340},{  373, 6609},{  394, 6890},{  409, 7189},
+        {  444, 7434},{  469, 7652},{  499, 7853},{  559, 8135}
+      },
+      /*Cb  qi=6  INTER*/
+      {
+        {   68,  -46},{   60,  291},{   50,  623},{   49,  971},
+        {   50, 1357},{   55, 1781},{   61, 2211},{   69, 2634},
+        {   78, 3052},{   86, 3466},{   91, 3882},{   95, 4292},
+        {   98, 4691},{  101, 5080},{  102, 5458},{  103, 5830},
+        {  103, 6192},{  104, 6554},{  104, 6916},{  106, 7278},
+        {  108, 7641},{  110, 8004},{  112, 8371},{  112, 8758}
+      }
+    },
+    {
+      /*Cr  qi=6  INTRA*/
+      {
+        {    1,    8},{   29,  375},{   59,  760},{   84, 1223},
+        {   99, 1698},{  112, 2176},{  129, 2647},{  143, 3076},
+        {  156, 3510},{  168, 3906},{  189, 4269},{  220, 4682},
+        {  241, 5047},{  266, 5342},{  299, 5649},{  331, 5954},
+        {  357, 6309},{  393, 6579},{  431, 6765},{  467, 6997},
+        {  501, 7276},{  520, 7488},{  525, 7749},{  548, 8146}
+      },
+      /*Cr  qi=6  INTER*/
+      {
+        {   94,   31},{   69,  335},{   47,  641},{   43,  967},
+        {   50, 1350},{   57, 1772},{   65, 2197},{   74, 2625},
+        {   83, 3043},{   90, 3454},{   94, 3867},{   97, 4273},
+        {   98, 4671},{   99, 5068},{   99, 5461},{   98, 5857},
+        {   98, 6245},{   99, 6610},{  103, 6975},{  105, 7345},
+        {  108, 7712},{  111, 8073},{  113, 8415},{  119, 8768}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=7  INTRA*/
+      {
+        {   92,  -70},{  156, 1590},{  261, 3267},{  403, 4618},
+        {  529, 5704},{  628, 6730},{  736, 7657},{  856, 8491},
+        {  978, 9246},{ 1118, 9943},{ 1281,10550},{ 1472,11028},
+        { 1645,11507},{ 1793,12008},{ 1924,12565},{ 2067,13130},
+        { 2229,13638},{ 2388,14160},{ 2558,14584},{ 2744,14886},
+        { 2932,15194},{ 3116,15531},{ 3311,15858},{ 3538,16197}
+      },
+      /*Y'  qi=7  INTER*/
+      {
+        {   43,   -8},{   36, 1351},{   71, 2923},{  112, 4568},
+        {  138, 6201},{  157, 7705},{  171, 9083},{  181,10390},
+        {  189,11664},{  202,12910},{  220,14121},{  241,15281},
+        {  266,16401},{  295,17507},{  328,18608},{  371,19677},
+        {  430,20701},{  508,21676},{  604,22588},{  727,23397},
+        {  878,24093},{ 1055,24690},{ 1263,25151},{ 1496,25504}
+      }
+    },
+    {
+      /*Cb  qi=7  INTRA*/
+      {
+        {    1,    5},{   40,  367},{   72,  740},{   89, 1175},
+        {  108, 1649},{  129, 2140},{  143, 2637},{  154, 3110},
+        {  169, 3507},{  198, 3860},{  224, 4237},{  235, 4652},
+        {  253, 5037},{  282, 5358},{  307, 5674},{  329, 5986},
+        {  361, 6273},{  393, 6527},{  419, 6777},{  435, 7078},
+        {  467, 7342},{  495, 7554},{  529, 7757},{  591, 8053}
+      },
+      /*Cb  qi=7  INTER*/
+      {
+        {   79,  -33},{   68,  299},{   56,  627},{   50,  978},
+        {   51, 1366},{   55, 1786},{   61, 2213},{   70, 2642},
+        {   80, 3062},{   87, 3474},{   92, 3886},{   96, 4292},
+        {   99, 4684},{  102, 5072},{  103, 5450},{  104, 5814},
+        {  104, 6176},{  104, 6538},{  107, 6905},{  110, 7270},
+        {  110, 7625},{  110, 7978},{  111, 8340},{  117, 8674}
+      }
+    },
+    {
+      /*Cr  qi=7  INTRA*/
+      {
+        {    2,    7},{   31,  375},{   62,  760},{   87, 1223},
+        {  103, 1698},{  115, 2175},{  131, 2644},{  147, 3066},
+        {  161, 3494},{  175, 3889},{  199, 4250},{  229, 4653},
+        {  250, 5001},{  279, 5275},{  311, 5577},{  343, 5889},
+        {  376, 6227},{  417, 6486},{  457, 6689},{  484, 6925},
+        {  518, 7174},{  544, 7393},{  549, 7662},{  577, 8050}
+      },
+      /*Cr  qi=7  INTER*/
+      {
+        {   89,   22},{   62,  332},{   45,  641},{   47,  976},
+        {   52, 1363},{   59, 1779},{   67, 2203},{   76, 2628},
+        {   84, 3046},{   90, 3460},{   94, 3875},{   98, 4272},
+        {   99, 4666},{   98, 5063},{   98, 5459},{   98, 5849},
+        {   99, 6226},{  101, 6594},{  104, 6957},{  109, 7324},
+        {  109, 7686},{  111, 8042},{  115, 8379},{  119, 8699}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=8  INTRA*/
+      {
+        {   91,  -69},{  160, 1585},{  274, 3226},{  423, 4538},
+        {  557, 5596},{  664, 6595},{  778, 7506},{  905, 8319},
+        { 1038, 9035},{ 1186, 9701},{ 1355,10292},{ 1554,10754},
+        { 1739,11196},{ 1904,11639},{ 2047,12184},{ 2194,12763},
+        { 2361,13256},{ 2529,13753},{ 2709,14155},{ 2902,14433},
+        { 3100,14723},{ 3292,15026},{ 3489,15327},{ 3714,15705}
+      },
+      /*Y'  qi=8  INTER*/
+      {
+        {   32, -157},{   33, 1346},{   74, 2914},{  116, 4554},
+        {  142, 6172},{  162, 7648},{  177, 9004},{  186,10300},
+        {  196,11570},{  210,12808},{  231,14001},{  256,15150},
+        {  285,16259},{  319,17352},{  359,18435},{  415,19475},
+        {  489,20470},{  584,21400},{  703,22246},{  852,22968},
+        { 1038,23556},{ 1253,24032},{ 1503,24367},{ 1778,24628}
+      }
+    },
+    {
+      /*Cb  qi=8  INTRA*/
+      {
+        {    1,    4},{   42,  367},{   75,  740},{   93, 1176},
+        {  111, 1649},{  128, 2139},{  144, 2635},{  157, 3103},
+        {  174, 3494},{  206, 3844},{  233, 4207},{  251, 4605},
+        {  277, 4980},{  304, 5284},{  335, 5584},{  359, 5888},
+        {  393, 6152},{  432, 6398},{  455, 6656},{  471, 6956},
+        {  502, 7193},{  528, 7405},{  562, 7630},{  603, 7922}
+      },
+      /*Cb  qi=8  INTER*/
+      {
+        {   77,  -37},{   68,  299},{   58,  632},{   50,  991},
+        {   50, 1382},{   55, 1799},{   62, 2226},{   73, 2647},
+        {   82, 3066},{   90, 3480},{   94, 3891},{   96, 4296},
+        {   98, 4687},{  101, 5073},{  103, 5456},{  104, 5817},
+        {  105, 6170},{  106, 6523},{  107, 6886},{  108, 7250},
+        {  109, 7600},{  110, 7955},{  111, 8305},{  112, 8641}
+      }
+    },
+    {
+      /*Cr  qi=8  INTRA*/
+      {
+        {    2,    7},{   33,  375},{   64,  760},{   92, 1224},
+        {  111, 1700},{  122, 2173},{  137, 2637},{  156, 3055},
+        {  172, 3476},{  186, 3856},{  211, 4211},{  242, 4597},
+        {  263, 4939},{  292, 5214},{  335, 5489},{  376, 5772},
+        {  406, 6099},{  440, 6378},{  483, 6578},{  517, 6797},
+        {  550, 7049},{  571, 7283},{  583, 7560},{  618, 7967}
+      },
+      /*Cr  qi=8  INTER*/
+      {
+        {   74,   25},{   58,  328},{   43,  637},{   45,  980},
+        {   51, 1371},{   59, 1788},{   69, 2207},{   79, 2630},
+        {   86, 3051},{   91, 3470},{   95, 3880},{   97, 4280},
+        {   98, 4680},{   97, 5074},{   96, 5456},{   97, 5839},
+        {   99, 6219},{  101, 6583},{  103, 6945},{  106, 7312},
+        {  110, 7671},{  114, 8009},{  115, 8345},{  117, 8686}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=9  INTRA*/
+      {
+        {  104,  -68},{  164, 1580},{  288, 3173},{  448, 4439},
+        {  587, 5485},{  702, 6465},{  824, 7351},{  958, 8148},
+        { 1096, 8845},{ 1253, 9480},{ 1432,10047},{ 1640,10494},
+        { 1835,10926},{ 2015,11350},{ 2166,11871},{ 2321,12428},
+        { 2508,12876},{ 2684,13345},{ 2866,13741},{ 3069,13991},
+        { 3281,14243},{ 3487,14518},{ 3689,14813},{ 3911,15175}
+      },
+      /*Y'  qi=9  INTER*/
+      {
+        {   47, -140},{   34, 1348},{   77, 2915},{  119, 4552},
+        {  145, 6150},{  166, 7600},{  182, 8936},{  192,10221},
+        {  203,11482},{  220,12711},{  244,13886},{  274,15012},
+        {  308,16111},{  349,17190},{  401,18244},{  470,19257},
+        {  561,20209},{  680,21069},{  830,21822},{ 1010,22463},
+        { 1227,22971},{ 1482,23328},{ 1769,23544},{ 2077,23655}
+      }
+    },
+    {
+      /*Cb  qi=9  INTRA*/
+      {
+        {    1,    5},{   43,  367},{   76,  740},{   95, 1176},
+        {  114, 1649},{  135, 2138},{  153, 2629},{  165, 3091},
+        {  184, 3481},{  217, 3831},{  244, 4187},{  260, 4572},
+        {  290, 4930},{  320, 5231},{  351, 5521},{  379, 5812},
+        {  414, 6055},{  452, 6307},{  483, 6564},{  502, 6848},
+        {  525, 7115},{  554, 7321},{  589, 7533},{  626, 7833}
+      },
+      /*Cb  qi=9  INTER*/
+      {
+        {  101,  -43},{   81,  298},{   62,  637},{   49,  989},
+        {   51, 1381},{   56, 1806},{   65, 2231},{   74, 2653},
+        {   84, 3071},{   91, 3482},{   95, 3892},{   97, 4293},
+        {   99, 4684},{  101, 5066},{  103, 5437},{  103, 5793},
+        {  103, 6148},{  104, 6511},{  105, 6867},{  107, 7221},
+        {  110, 7572},{  111, 7926},{  112, 8283},{  116, 8625}
+      }
+    },
+    {
+      /*Cr  qi=9  INTRA*/
+      {
+        {    2,    7},{   35,  375},{   66,  761},{   93, 1224},
+        {  112, 1700},{  126, 2173},{  144, 2633},{  165, 3047},
+        {  183, 3458},{  199, 3835},{  224, 4191},{  257, 4558},
+        {  283, 4887},{  309, 5176},{  351, 5446},{  397, 5713},
+        {  433, 6017},{  469, 6283},{  508, 6480},{  546, 6687},
+        {  579, 6945},{  600, 7182},{  610, 7434},{  623, 7793}
+      },
+      /*Cr  qi=9  INTER*/
+      {
+        {   77,   15},{   57,  330},{   45,  640},{   48,  980},
+        {   54, 1380},{   61, 1802},{   70, 2220},{   80, 2639},
+        {   87, 3057},{   92, 3474},{   94, 3882},{   98, 4282},
+        {   98, 4675},{   97, 5062},{   97, 5450},{   98, 5829},
+        {  100, 6197},{  101, 6561},{  104, 6927},{  107, 7289},
+        {  113, 7638},{  117, 7978},{  119, 8311},{  117, 8629}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=10  INTRA*/
+      {
+        {  101,  -69},{  168, 1574},{  299, 3143},{  465, 4386},
+        {  610, 5410},{  736, 6353},{  866, 7207},{ 1006, 7982},
+        { 1153, 8655},{ 1319, 9261},{ 1504, 9812},{ 1719,10248},
+        { 1928,10653},{ 2116,11056},{ 2282,11550},{ 2458,12070},
+        { 2654,12492},{ 2846,12923},{ 3043,13291},{ 3249,13537},
+        { 3466,13764},{ 3682,13999},{ 3896,14268},{ 4145,14548}
+      },
+      /*Y'  qi=10  INTER*/
+      {
+        {   48,  -94},{   34, 1355},{   81, 2920},{  124, 4545},
+        {  151, 6113},{  174, 7532},{  190, 8850},{  201,10125},
+        {  214,11379},{  235,12591},{  264,13745},{  299,14859},
+        {  338,15948},{  388,17008},{  456,18029},{  546,18988},
+        {  661,19877},{  808,20666},{  993,21321},{ 1218,21835},
+        { 1481,22203},{ 1783,22420},{ 2117,22504},{ 2469,22481}
+      }
+    },
+    {
+      /*Cb  qi=10  INTRA*/
+      {
+        {    2,    4},{   44,  367},{   79,  740},{   99, 1178},
+        {  117, 1652},{  137, 2141},{  156, 2630},{  170, 3089},
+        {  192, 3474},{  227, 3813},{  259, 4157},{  282, 4526},
+        {  310, 4860},{  342, 5140},{  377, 5425},{  400, 5714},
+        {  436, 5952},{  475, 6194},{  496, 6468},{  522, 6748},
+        {  559, 6996},{  587, 7216},{  617, 7433},{  673, 7678}
+      },
+      /*Cb  qi=10  INTER*/
+      {
+        {   87,  -37},{   72,  301},{   58,  636},{   49,  995},
+        {   51, 1394},{   57, 1819},{   66, 2241},{   78, 2660},
+        {   87, 3074},{   93, 3482},{   97, 3891},{   99, 4294},
+        {  101, 4678},{  103, 5050},{  105, 5414},{  106, 5773},
+        {  107, 6134},{  108, 6485},{  110, 6832},{  113, 7187},
+        {  113, 7547},{  114, 7887},{  117, 8230},{  112, 8590}
+      }
+    },
+    {
+      /*Cr  qi=10  INTRA*/
+      {
+        {    2,    7},{   38,  375},{   69,  761},{   96, 1224},
+        {  116, 1701},{  131, 2175},{  148, 2634},{  168, 3041},
+        {  190, 3439},{  211, 3802},{  238, 4151},{  271, 4506},
+        {  297, 4824},{  331, 5103},{  373, 5360},{  415, 5632},
+        {  459, 5928},{  500, 6176},{  535, 6386},{  573, 6586},
+        {  608, 6834},{  629, 7079},{  642, 7337},{  686, 7680}
+      },
+      /*Cr  qi=10  INTER*/
+      {
+        {   81,   34},{   63,  333},{   50,  633},{   48,  987},
+        {   53, 1397},{   61, 1820},{   71, 2237},{   83, 2651},
+        {   91, 3065},{   95, 3479},{   98, 3882},{  100, 4279},
+        {  101, 4673},{  101, 5054},{  100, 5429},{  101, 5801},
+        {  102, 6173},{  104, 6541},{  108, 6904},{  110, 7264},
+        {  114, 7609},{  119, 7945},{  123, 8275},{  128, 8615}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=11  INTRA*/
+      {
+        {  110,  -66},{  176, 1564},{  316, 3087},{  492, 4296},
+        {  645, 5299},{  781, 6217},{  924, 7039},{ 1075, 7776},
+        { 1232, 8421},{ 1410, 9005},{ 1607, 9532},{ 1834, 9929},
+        { 2053,10300},{ 2249,10697},{ 2427,11184},{ 2619,11682},
+        { 2826,12083},{ 3019,12508},{ 3225,12869},{ 3452,13064},
+        { 3670,13280},{ 3890,13519},{ 4123,13750},{ 4367,14059}
+      },
+      /*Y'  qi=11  INTER*/
+      {
+        {   72, -115},{   32, 1354},{   83, 2911},{  126, 4534},
+        {  154, 6080},{  178, 7475},{  194, 8779},{  205,10047},
+        {  222,11290},{  246,12488},{  281,13621},{  322,14714},
+        {  372,15786},{  436,16821},{  519,17813},{  628,18728},
+        {  770,19549},{  950,20254},{ 1175,20800},{ 1443,21197},
+        { 1752,21446},{ 2095,21555},{ 2457,21553},{ 2808,21544}
+      }
+    },
+    {
+      /*Cb  qi=11  INTRA*/
+      {
+        {    2,    4},{   45,  367},{   81,  740},{  101, 1177},
+        {  121, 1650},{  142, 2136},{  159, 2621},{  174, 3075},
+        {  199, 3451},{  234, 3778},{  265, 4117},{  297, 4473},
+        {  333, 4789},{  367, 5054},{  402, 5319},{  427, 5613},
+        {  462, 5871},{  503, 6107},{  532, 6336},{  560, 6584},
+        {  601, 6842},{  631, 7092},{  662, 7292},{  721, 7497}
+      },
+      /*Cb  qi=11  INTER*/
+      {
+        {  117,  -24},{   93,  308},{   69,  638},{   52,  993},
+        {   52, 1395},{   58, 1822},{   68, 2246},{   80, 2665},
+        {   89, 3082},{   94, 3492},{   96, 3900},{   98, 4299},
+        {  101, 4679},{  103, 5047},{  104, 5405},{  106, 5763},
+        {  106, 6120},{  107, 6474},{  109, 6823},{  112, 7163},
+        {  115, 7516},{  117, 7868},{  118, 8213},{  119, 8561}
+      }
+    },
+    {
+      /*Cr  qi=11  INTRA*/
+      {
+        {    2,    7},{   40,  375},{   75,  761},{  100, 1224},
+        {  119, 1700},{  137, 2169},{  154, 2622},{  178, 3025},
+        {  198, 3416},{  220, 3770},{  255, 4114},{  294, 4459},
+        {  323, 4756},{  359, 5028},{  399, 5292},{  438, 5556},
+        {  483, 5827},{  518, 6073},{  551, 6298},{  598, 6501},
+        {  634, 6754},{  652, 6997},{  670, 7211},{  689, 7560}
+      },
+      /*Cr  qi=11  INTER*/
+      {
+        {   75,   30},{   61,  334},{   51,  639},{   49,  995},
+        {   53, 1403},{   62, 1821},{   73, 2237},{   84, 2654},
+        {   91, 3070},{   95, 3485},{   96, 3890},{   98, 4287},
+        {   98, 4672},{   99, 5050},{   99, 5427},{  100, 5798},
+        {  103, 6169},{  105, 6528},{  107, 6881},{  113, 7233},
+        {  118, 7580},{  121, 7916},{  125, 8240},{  130, 8551}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=12  INTRA*/
+      {
+        {  104,  -69},{  182, 1557},{  335, 3040},{  521, 4205},
+        {  684, 5178},{  831, 6068},{  986, 6854},{ 1151, 7559},
+        { 1323, 8169},{ 1523, 8704},{ 1736, 9192},{ 1978, 9558},
+        { 2213, 9908},{ 2421,10298},{ 2613,10757},{ 2822,11208},
+        { 3042,11585},{ 3250,11991},{ 3474,12308},{ 3710,12480},
+        { 3939,12687},{ 4174,12902},{ 4416,13102},{ 4672,13369}
+      },
+      /*Y'  qi=12  INTER*/
+      {
+        {   52,  -91},{   34, 1355},{   86, 2911},{  129, 4518},
+        {  159, 6037},{  184, 7405},{  200, 8694},{  213, 9955},
+        {  232,11185},{  263,12360},{  304,13479},{  354,14555},
+        {  415,15601},{  495,16608},{  601,17549},{  738,18400},
+        {  915,19136},{ 1139,19724},{ 1414,20150},{ 1731,20412},
+        { 2090,20520},{ 2473,20509},{ 2851,20442},{ 3227,20328}
+      }
+    },
+    {
+      /*Cb  qi=12  INTRA*/
+      {
+        {    1,    4},{   46,  367},{   85,  740},{  109, 1178},
+        {  126, 1650},{  145, 2134},{  165, 2617},{  182, 3061},
+        {  209, 3428},{  245, 3749},{  281, 4077},{  316, 4417},
+        {  354, 4718},{  392, 4970},{  430, 5217},{  456, 5501},
+        {  490, 5771},{  534, 5996},{  571, 6207},{  600, 6458},
+        {  644, 6697},{  675, 6942},{  707, 7151},{  766, 7342}
+      },
+      /*Cb  qi=12  INTER*/
+      {
+        {   84,  -24},{   73,  311},{   60,  644},{   52,  998},
+        {   53, 1398},{   60, 1825},{   71, 2249},{   83, 2665},
+        {   90, 3081},{   94, 3490},{   97, 3893},{   99, 4286},
+        {  102, 4663},{  104, 5032},{  105, 5393},{  106, 5751},
+        {  107, 6102},{  108, 6445},{  111, 6788},{  113, 7136},
+        {  114, 7483},{  117, 7828},{  121, 8163},{  122, 8496}
+      }
+    },
+    {
+      /*Cr  qi=12  INTRA*/
+      {
+        {    3,    7},{   41,  375},{   78,  761},{  106, 1225},
+        {  124, 1700},{  140, 2167},{  163, 2616},{  188, 3010},
+        {  213, 3385},{  240, 3718},{  271, 4062},{  309, 4406},
+        {  345, 4691},{  387, 4956},{  430, 5212},{  469, 5467},
+        {  513, 5729},{  554, 5970},{  587, 6176},{  633, 6395},
+        {  673, 6659},{  692, 6868},{  712, 7061},{  758, 7259}
+      },
+      /*Cr  qi=12  INTER*/
+      {
+        {   73,   31},{   59,  335},{   48,  638},{   50,  998},
+        {   56, 1410},{   65, 1827},{   75, 2240},{   85, 2657},
+        {   92, 3073},{   95, 3485},{   97, 3888},{   99, 4279},
+        {   98, 4663},{   99, 5042},{  101, 5412},{  102, 5779},
+        {  105, 6142},{  107, 6498},{  108, 6848},{  113, 7198},
+        {  118, 7540},{  121, 7867},{  127, 8188},{  132, 8508}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=13  INTRA*/
+      {
+        {  109,  -68},{  187, 1551},{  347, 3010},{  541, 4153},
+        {  709, 5107},{  864, 5975},{ 1026, 6745},{ 1194, 7433},
+        { 1375, 8021},{ 1581, 8550},{ 1803, 9026},{ 2054, 9371},
+        { 2301, 9713},{ 2522,10082},{ 2728,10515},{ 2949,10956},
+        { 3184,11297},{ 3408,11653},{ 3643,11946},{ 3886,12100},
+        { 4124,12277},{ 4377,12459},{ 4632,12635},{ 4898,12861}
+      },
+      /*Y'  qi=13  INTER*/
+      {
+        {   48,  -78},{   35, 1357},{   89, 2914},{  133, 4512},
+        {  164, 6004},{  190, 7348},{  207, 8627},{  222, 9881},
+        {  247,11096},{  284,12251},{  333,13350},{  392,14407},
+        {  466,15426},{  565,16391},{  696,17279},{  865,18058},
+        { 1085,18689},{ 1358,19156},{ 1684,19456},{ 2050,19605},
+        { 2447,19614},{ 2855,19524},{ 3243,19398},{ 3611,19201}
+      }
+    },
+    {
+      /*Cb  qi=13  INTRA*/
+      {
+        {    2,    4},{   47,  367},{   86,  741},{  108, 1179},
+        {  127, 1651},{  150, 2133},{  173, 2611},{  194, 3050},
+        {  222, 3417},{  262, 3733},{  303, 4048},{  337, 4375},
+        {  378, 4657},{  420, 4897},{  456, 5148},{  486, 5422},
+        {  518, 5682},{  558, 5903},{  592, 6113},{  623, 6372},
+        {  662, 6628},{  700, 6833},{  751, 6989},{  805, 7147}
+      },
+      /*Cb  qi=13  INTER*/
+      {
+        {   94,  -34},{   78,  303},{   60,  638},{   51,  994},
+        {   54, 1406},{   61, 1836},{   73, 2253},{   84, 2668},
+        {   92, 3082},{   96, 3492},{   99, 3894},{  101, 4284},
+        {  103, 4659},{  105, 5023},{  106, 5376},{  108, 5726},
+        {  109, 6070},{  110, 6418},{  113, 6765},{  117, 7105},
+        {  119, 7448},{  122, 7784},{  126, 8119},{  131, 8463}
+      }
+    },
+    {
+      /*Cr  qi=13  INTRA*/
+      {
+        {    3,    7},{   43,  375},{   80,  762},{  110, 1226},
+        {  131, 1701},{  149, 2166},{  172, 2610},{  196, 2999},
+        {  221, 3359},{  254, 3679},{  292, 4005},{  332, 4329},
+        {  369, 4612},{  408, 4880},{  456, 5139},{  500, 5388},
+        {  544, 5631},{  581, 5877},{  615, 6101},{  660, 6316},
+        {  692, 6594},{  714, 6795},{  736, 6997},{  789, 7290}
+      },
+      /*Cr  qi=13  INTER*/
+      {
+        {   73,   28},{   61,  336},{   46,  642},{   50, 1003},
+        {   58, 1414},{   67, 1832},{   79, 2245},{   87, 2660},
+        {   93, 3075},{   97, 3484},{   99, 3888},{  100, 4277},
+        {  100, 4651},{  100, 5027},{  101, 5403},{  102, 5765},
+        {  105, 6116},{  109, 6470},{  113, 6825},{  119, 7163},
+        {  124, 7497},{  127, 7827},{  131, 8137},{  135, 8437}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=14  INTRA*/
+      {
+        {  113,  -68},{  191, 1545},{  358, 2981},{  559, 4104},
+        {  733, 5044},{  896, 5890},{ 1066, 6636},{ 1241, 7304},
+        { 1428, 7886},{ 1642, 8402},{ 1872, 8871},{ 2128, 9219},
+        { 2380, 9547},{ 2609, 9908},{ 2825,10321},{ 3055,10728},
+        { 3294,11076},{ 3523,11425},{ 3766,11689},{ 4013,11845},
+        { 4254,12022},{ 4506,12209},{ 4759,12383},{ 5013,12637}
+      },
+      /*Y'  qi=14  INTER*/
+      {
+        {   58,  -82},{   38, 1362},{   93, 2914},{  138, 4492},
+        {  171, 5962},{  198, 7289},{  216, 8559},{  234, 9804},
+        {  263,11005},{  306,12143},{  363,13222},{  434,14259},
+        {  523,15255},{  639,16188},{  794,17021},{ 1000,17717},
+        { 1262,18260},{ 1575,18645},{ 1943,18841},{ 2356,18872},
+        { 2782,18802},{ 3194,18682},{ 3576,18559},{ 3923,18447}
+      }
+    },
+    {
+      /*Cb  qi=14  INTRA*/
+      {
+        {    2,    3},{   50,  367},{   91,  741},{  114, 1180},
+        {  134, 1651},{  157, 2131},{  181, 2601},{  208, 3028},
+        {  239, 3391},{  279, 3706},{  322, 4000},{  361, 4309},
+        {  406, 4587},{  445, 4822},{  482, 5067},{  515, 5344},
+        {  546, 5612},{  589, 5821},{  626, 6020},{  655, 6276},
+        {  701, 6523},{  748, 6717},{  796, 6876},{  815, 7151}
+      },
+      /*Cb  qi=14  INTER*/
+      {
+        {   80,  -43},{   68,  301},{   56,  644},{   50, 1004},
+        {   54, 1412},{   63, 1836},{   75, 2253},{   87, 2670},
+        {   94, 3083},{   98, 3487},{  101, 3885},{  103, 4271},
+        {  106, 4645},{  107, 5004},{  108, 5358},{  109, 5705},
+        {  112, 6047},{  115, 6388},{  118, 6731},{  121, 7081},
+        {  126, 7421},{  129, 7747},{  132, 8076},{  137, 8419}
+      }
+    },
+    {
+      /*Cr  qi=14  INTRA*/
+      {
+        {    3,    6},{   45,  375},{   85,  762},{  116, 1226},
+        {  138, 1700},{  158, 2163},{  180, 2602},{  206, 2985},
+        {  236, 3333},{  270, 3639},{  310, 3956},{  359, 4258},
+        {  397, 4524},{  430, 4802},{  478, 5068},{  527, 5316},
+        {  572, 5560},{  613, 5802},{  654, 6012},{  699, 6216},
+        {  734, 6489},{  755, 6707},{  775, 6898},{  841, 7111}
+      },
+      /*Cr  qi=14  INTER*/
+      {
+        {   78,    0},{   59,  322},{   46,  649},{   51, 1016},
+        {   58, 1422},{   68, 1839},{   81, 2253},{   90, 2666},
+        {   95, 3080},{   98, 3486},{  101, 3881},{  102, 4268},
+        {  102, 4644},{  103, 5017},{  105, 5382},{  106, 5743},
+        {  108, 6093},{  112, 6442},{  118, 6791},{  124, 7130},
+        {  127, 7463},{  133, 7784},{  138, 8085},{  142, 8395}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=15  INTRA*/
+      {
+        {  111,  -66},{  197, 1538},{  370, 2949},{  579, 4050},
+        {  762, 4968},{  933, 5798},{ 1112, 6520},{ 1299, 7161},
+        { 1497, 7725},{ 1723, 8219},{ 1967, 8654},{ 2234, 8990},
+        { 2499, 9302},{ 2740, 9637},{ 2968,10039},{ 3215,10414},
+        { 3473,10709},{ 3721,11015},{ 3971,11270},{ 4228,11402},
+        { 4487,11543},{ 4752,11707},{ 5011,11871},{ 5290,12099}
+      },
+      /*Y'  qi=15  INTER*/
+      {
+        {   59, -113},{   37, 1349},{   95, 2904},{  139, 4478},
+        {  174, 5929},{  201, 7244},{  220, 8505},{  241, 9736},
+        {  275,10922},{  327,12040},{  395,13097},{  477,14114},
+        {  585,15071},{  730,15947},{  917,16714},{ 1162,17326},
+        { 1468,17770},{ 1833,18029},{ 2251,18111},{ 2694,18068},
+        { 3125,17968},{ 3529,17845},{ 3908,17713},{ 4260,17587}
+      }
+    },
+    {
+      /*Cb  qi=15  INTRA*/
+      {
+        {    2,    3},{   51,  367},{   94,  741},{  120, 1180},
+        {  140, 1651},{  160, 2129},{  184, 2591},{  213, 3010},
+        {  246, 3371},{  289, 3680},{  335, 3969},{  374, 4274},
+        {  418, 4546},{  460, 4783},{  498, 5019},{  532, 5280},
+        {  565, 5553},{  608, 5765},{  647, 5958},{  683, 6193},
+        {  732, 6433},{  782, 6620},{  832, 6769},{  848, 7027}
+      },
+      /*Cb  qi=15  INTER*/
+      {
+        {   71,  -52},{   63,  296},{   54,  644},{   50, 1010},
+        {   53, 1417},{   64, 1837},{   77, 2253},{   88, 2666},
+        {   95, 3079},{   98, 3487},{  100, 3882},{  103, 4264},
+        {  106, 4633},{  108, 4991},{  109, 5343},{  109, 5693},
+        {  112, 6038},{  114, 6371},{  119, 6709},{  123, 7051},
+        {  125, 7385},{  130, 7716},{  135, 8050},{  140, 8374}
+      }
+    },
+    {
+      /*Cr  qi=15  INTRA*/
+      {
+        {    2,    6},{   47,  375},{   87,  763},{  119, 1225},
+        {  143, 1699},{  162, 2158},{  185, 2595},{  213, 2971},
+        {  246, 3315},{  279, 3618},{  320, 3920},{  372, 4210},
+        {  409, 4480},{  446, 4756},{  496, 5017},{  542, 5263},
+        {  590, 5487},{  639, 5721},{  687, 5923},{  724, 6132},
+        {  753, 6417},{  781, 6622},{  805, 6806},{  856, 6977}
+      },
+      /*Cr  qi=15  INTER*/
+      {
+        {   71,    3},{   61,  326},{   52,  651},{   50, 1017},
+        {   58, 1422},{   69, 1837},{   82, 2251},{   90, 2668},
+        {   95, 3080},{   98, 3484},{  101, 3877},{  102, 4257},
+        {  102, 4632},{  101, 5005},{  103, 5370},{  106, 5733},
+        {  110, 6082},{  116, 6424},{  120, 6774},{  124, 7106},
+        {  130, 7427},{  135, 7748},{  141, 8052},{  147, 8333}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=16  INTRA*/
+      {
+        {  114,  -63},{  206, 1525},{  396, 2887},{  618, 3945},
+        {  816, 4832},{ 1002, 5626},{ 1196, 6319},{ 1401, 6923},
+        { 1616, 7458},{ 1857, 7928},{ 2121, 8334},{ 2405, 8645},
+        { 2685, 8934},{ 2938, 9255},{ 3175, 9638},{ 3433, 9990},
+        { 3707,10263},{ 3958,10577},{ 4218,10807},{ 4488,10906},
+        { 4760,11028},{ 5037,11148},{ 5306,11286},{ 5625,11463}
+      },
+      /*Y'  qi=16  INTER*/
+      {
+        {   69, -153},{   39, 1348},{   98, 2894},{  144, 4448},
+        {  181, 5872},{  209, 7167},{  228, 8422},{  254, 9644},
+        {  297,10810},{  359,11908},{  438,12944},{  539,13930},
+        {  672,14842},{  850,15650},{ 1085,16318},{ 1391,16793},
+        { 1769,17082},{ 2200,17198},{ 2659,17174},{ 3116,17072},
+        { 3547,16948},{ 3943,16819},{ 4299,16701},{ 4611,16644}
+      }
+    },
+    {
+      /*Cb  qi=16  INTRA*/
+      {
+        {    3,    4},{   54,  367},{   97,  742},{  122, 1181},
+        {  143, 1651},{  168, 2123},{  197, 2575},{  226, 2985},
+        {  263, 3338},{  314, 3631},{  367, 3903},{  409, 4200},
+        {  453, 4468},{  491, 4703},{  528, 4932},{  566, 5188},
+        {  601, 5459},{  647, 5672},{  693, 5844},{  734, 6058},
+        {  784, 6305},{  836, 6460},{  882, 6602},{  905, 6891}
+      },
+      /*Cb  qi=16  INTER*/
+      {
+        {   75,  -64},{   67,  292},{   56,  645},{   51, 1016},
+        {   54, 1421},{   66, 1842},{   79, 2257},{   89, 2670},
+        {   95, 3082},{   98, 3488},{  101, 3879},{  104, 4258},
+        {  106, 4623},{  108, 4974},{  109, 5321},{  113, 5664},
+        {  116, 6001},{  117, 6341},{  123, 6677},{  128, 7004},
+        {  130, 7336},{  136, 7671},{  143, 7996},{  148, 8310}
+      }
+    },
+    {
+      /*Cr  qi=16  INTRA*/
+      {
+        {    4,    7},{   50,  375},{   90,  763},{  124, 1225},
+        {  148, 1698},{  168, 2154},{  195, 2582},{  227, 2948},
+        {  263, 3279},{  302, 3575},{  343, 3865},{  394, 4137},
+        {  439, 4402},{  482, 4672},{  533, 4925},{  579, 5165},
+        {  626, 5382},{  675, 5616},{  725, 5812},{  769, 5991},
+        {  810, 6242},{  848, 6430},{  868, 6615},{  944, 6732}
+      },
+      /*Cr  qi=16  INTER*/
+      {
+        {   78,   11},{   62,  327},{   49,  650},{   50, 1025},
+        {   59, 1431},{   72, 1841},{   83, 2253},{   90, 2671},
+        {   95, 3084},{   98, 3487},{  100, 3879},{  101, 4254},
+        {  102, 4625},{  103, 4994},{  106, 5355},{  108, 5708},
+        {  111, 6058},{  115, 6400},{  121, 6733},{  128, 7058},
+        {  134, 7374},{  140, 7691},{  146, 7993},{  146, 8317}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=17  INTRA*/
+      {
+        {  112,  -59},{  210, 1515},{  409, 2850},{  640, 3882},
+        {  844, 4748},{ 1038, 5529},{ 1240, 6206},{ 1452, 6803},
+        { 1676, 7330},{ 1925, 7792},{ 2194, 8201},{ 2483, 8512},
+        { 2766, 8801},{ 3027, 9121},{ 3279, 9482},{ 3548, 9810},
+        { 3825,10069},{ 4088,10345},{ 4362,10544},{ 4638,10644},
+        { 4915,10744},{ 5196,10850},{ 5471,10981},{ 5802,11136}
+      },
+      /*Y'  qi=17  INTER*/
+      {
+        {   70, -147},{   45, 1349},{  106, 2894},{  155, 4425},
+        {  195, 5818},{  225, 7099},{  247, 8348},{  278, 9565},
+        {  328,10717},{  399,11794},{  491,12807},{  609,13760},
+        {  766,14623},{  984,15349},{ 1274,15902},{ 1642,16256},
+        { 2082,16411},{ 2563,16409},{ 3048,16315},{ 3508,16194},
+        { 3924,16064},{ 4306,15938},{ 4656,15828},{ 4966,15733}
+      }
+    },
+    {
+      /*Cb  qi=17  INTRA*/
+      {
+        {    3,    4},{   57,  367},{  101,  742},{  126, 1182},
+        {  148, 1650},{  175, 2118},{  207, 2565},{  241, 2966},
+        {  279, 3307},{  331, 3588},{  389, 3845},{  435, 4132},
+        {  474, 4408},{  517, 4641},{  560, 4869},{  602, 5122},
+        {  638, 5389},{  672, 5610},{  716, 5787},{  758, 6002},
+        {  817, 6226},{  869, 6393},{  916, 6530},{  950, 6799}
+      },
+      /*Cb  qi=17  INTER*/
+      {
+        {  105,  -65},{   86,  288},{   66,  638},{   54, 1014},
+        {   59, 1427},{   71, 1844},{   86, 2257},{   95, 2668},
+        {  100, 3075},{  103, 3476},{  106, 3867},{  110, 4241},
+        {  112, 4598},{  114, 4948},{  117, 5294},{  121, 5633},
+        {  123, 5968},{  126, 6301},{  131, 6637},{  136, 6968},
+        {  144, 7287},{  152, 7606},{  158, 7931},{  162, 8262}
+      }
+    },
+    {
+      /*Cr  qi=17  INTRA*/
+      {
+        {    4,    6},{   55,  376},{   97,  765},{  128, 1226},
+        {  152, 1696},{  175, 2144},{  204, 2568},{  241, 2928},
+        {  282, 3250},{  323, 3530},{  368, 3811},{  420, 4089},
+        {  463, 4347},{  505, 4609},{  562, 4860},{  609, 5094},
+        {  655, 5303},{  709, 5535},{  759, 5740},{  803, 5913},
+        {  844, 6153},{  879, 6350},{  905, 6527},{  972, 6637}
+      },
+      /*Cr  qi=17  INTER*/
+      {
+        {   88,    8},{   68,  330},{   51,  653},{   54, 1028},
+        {   65, 1433},{   77, 1845},{   89, 2257},{   96, 2669},
+        {  100, 3081},{  102, 3481},{  105, 3867},{  106, 4245},
+        {  108, 4613},{  110, 4971},{  112, 5328},{  115, 5679},
+        {  120, 6019},{  127, 6355},{  133, 6686},{  140, 7007},
+        {  149, 7316},{  158, 7618},{  166, 7924},{  170, 8232}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=18  INTRA*/
+      {
+        {  122,  -58},{  216, 1506},{  425, 2815},{  665, 3822},
+        {  882, 4666},{ 1088, 5425},{ 1301, 6084},{ 1529, 6653},
+        { 1766, 7162},{ 2026, 7611},{ 2312, 7987},{ 2612, 8278},
+        { 2913, 8551},{ 3196, 8840},{ 3454, 9184},{ 3734, 9490},
+        { 4030, 9725},{ 4305, 9973},{ 4585,10162},{ 4864,10251},
+        { 5150,10324},{ 5443,10420},{ 5727,10536},{ 6053,10682}
+      },
+      /*Y'  qi=18  INTER*/
+      {
+        {   66, -143},{   47, 1351},{  108, 2886},{  158, 4401},
+        {  200, 5775},{  232, 7044},{  256, 8288},{  292, 9493},
+        {  351,10625},{  434,11679},{  541,12665},{  681,13578},
+        {  875,14379},{ 1136,15025},{ 1483,15475},{ 1914,15709},
+        { 2399,15767},{ 2907,15699},{ 3400,15579},{ 3852,15453},
+        { 4259,15332},{ 4630,15221},{ 4976,15121},{ 5294,15061}
+      }
+    },
+    {
+      /*Cb  qi=18  INTRA*/
+      {
+        {    2,    3},{   61,  367},{  107,  743},{  131, 1182},
+        {  155, 1648},{  183, 2110},{  220, 2542},{  260, 2927},
+        {  303, 3265},{  359, 3540},{  416, 3785},{  462, 4063},
+        {  506, 4334},{  553, 4567},{  595, 4797},{  636, 5049},
+        {  676, 5304},{  717, 5516},{  759, 5698},{  801, 5904},
+        {  861, 6133},{  911, 6311},{  962, 6443},{ 1021, 6645}
+      },
+      /*Cb  qi=18  INTER*/
+      {
+        {  126,    5},{   95,  326},{   66,  643},{   55, 1015},
+        {   60, 1427},{   73, 1843},{   87, 2256},{   96, 2667},
+        {  101, 3073},{  104, 3470},{  108, 3853},{  111, 4226},
+        {  114, 4584},{  117, 4928},{  119, 5274},{  122, 5612},
+        {  126, 5942},{  130, 6271},{  136, 6606},{  141, 6931},
+        {  148, 7247},{  156, 7568},{  164, 7891},{  173, 8211}
+      }
+    },
+    {
+      /*Cr  qi=18  INTRA*/
+      {
+        {    4,    6},{   59,  376},{  104,  765},{  133, 1226},
+        {  156, 1692},{  184, 2136},{  218, 2548},{  260, 2893},
+        {  308, 3204},{  348, 3481},{  397, 3751},{  448, 4024},
+        {  490, 4281},{  541, 4523},{  593, 4776},{  634, 5022},
+        {  685, 5236},{  748, 5455},{  812, 5638},{  856, 5818},
+        {  891, 6048},{  928, 6230},{  961, 6405},{ 1055, 6449}
+      },
+      /*Cr  qi=18  INTER*/
+      {
+        {   81,   34},{   68,  342},{   57,  652},{   59, 1027},
+        {   67, 1439},{   80, 1848},{   91, 2257},{   97, 2670},
+        {  100, 3076},{  103, 3473},{  106, 3857},{  108, 4231},
+        {  109, 4599},{  110, 4958},{  113, 5307},{  119, 5650},
+        {  125, 5991},{  130, 6325},{  138, 6651},{  147, 6971},
+        {  153, 7278},{  162, 7578},{  172, 7874},{  177, 8156}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=19  INTRA*/
+      {
+        {  128,  -55},{  228, 1495},{  448, 2775},{  699, 3758},
+        {  931, 4571},{ 1154, 5296},{ 1386, 5914},{ 1636, 6450},
+        { 1894, 6930},{ 2177, 7342},{ 2479, 7698},{ 2792, 7976},
+        { 3099, 8235},{ 3392, 8517},{ 3658, 8853},{ 3938, 9155},
+        { 4242, 9371},{ 4527, 9605},{ 4810, 9781},{ 5089, 9853},
+        { 5378, 9920},{ 5674,10009},{ 5972,10110},{ 6336,10196}
+      },
+      /*Y'  qi=19  INTER*/
+      {
+        {   69, -147},{   49, 1353},{  111, 2883},{  162, 4381},
+        {  205, 5737},{  237, 6996},{  264, 8232},{  307, 9421},
+        {  376,10534},{  472,11567},{  596,12525},{  761,13395},
+        {  990,14130},{ 1298,14694},{ 1695,15053},{ 2172,15195},
+        { 2696,15173},{ 3213,15075},{ 3696,14948},{ 4141,14829},
+        { 4541,14721},{ 4910,14609},{ 5245,14506},{ 5536,14399}
+      }
+    },
+    {
+      /*Cb  qi=19  INTRA*/
+      {
+        {    3,    3},{   61,  367},{  109,  743},{  135, 1182},
+        {  161, 1646},{  191, 2101},{  229, 2524},{  273, 2898},
+        {  318, 3221},{  376, 3490},{  436, 3731},{  487, 3994},
+        {  539, 4251},{  584, 4485},{  621, 4721},{  664, 4967},
+        {  709, 5225},{  752, 5431},{  801, 5595},{  846, 5796},
+        {  912, 6011},{  959, 6193},{ 1015, 6321},{ 1121, 6504}
+      },
+      /*Cb  qi=19  INTER*/
+      {
+        {  126,    4},{   97,  329},{   69,  649},{   56, 1017},
+        {   61, 1432},{   74, 1846},{   88, 2255},{   98, 2663},
+        {  103, 3065},{  106, 3460},{  110, 3844},{  114, 4211},
+        {  117, 4564},{  120, 4911},{  122, 5253},{  125, 5588},
+        {  129, 5916},{  135, 6241},{  142, 6567},{  149, 6885},
+        {  155, 7206},{  163, 7527},{  174, 7843},{  188, 8145}
+      }
+    },
+    {
+      /*Cr  qi=19  INTRA*/
+      {
+        {    5,    6},{   61,  376},{  106,  765},{  135, 1225},
+        {  160, 1689},{  192, 2126},{  229, 2531},{  271, 2869},
+        {  321, 3168},{  370, 3433},{  421, 3704},{  476, 3965},
+        {  520, 4212},{  572, 4452},{  629, 4691},{  671, 4939},
+        {  724, 5152},{  792, 5347},{  858, 5510},{  895, 5696},
+        {  939, 5905},{  991, 6056},{ 1027, 6244},{ 1127, 6333}
+      },
+      /*Cr  qi=19  INTER*/
+      {
+        {   80,   45},{   66,  344},{   55,  654},{   56, 1030},
+        {   66, 1440},{   80, 1850},{   91, 2259},{   98, 2668},
+        {  102, 3072},{  104, 3466},{  107, 3845},{  109, 4215},
+        {  110, 4578},{  112, 4933},{  116, 5283},{  122, 5625},
+        {  129, 5963},{  136, 6287},{  143, 6611},{  151, 6927},
+        {  160, 7229},{  170, 7528},{  181, 7818},{  191, 8092}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=20  INTRA*/
+      {
+        {  129,  -50},{  238, 1481},{  469, 2728},{  730, 3684},
+        {  974, 4473},{ 1213, 5171},{ 1463, 5763},{ 1729, 6281},
+        { 2002, 6744},{ 2299, 7146},{ 2613, 7492},{ 2940, 7746},
+        { 3265, 7978},{ 3571, 8228},{ 3853, 8543},{ 4156, 8815},
+        { 4476, 9001},{ 4775, 9218},{ 5070, 9373},{ 5352, 9446},
+        { 5649, 9510},{ 5956, 9580},{ 6268, 9660},{ 6647, 9705}
+      },
+      /*Y'  qi=20  INTER*/
+      {
+        {   64,  -93},{   52, 1340},{  116, 2862},{  170, 4344},
+        {  216, 5678},{  249, 6928},{  281, 8155},{  333, 9326},
+        {  418,10410},{  533,11411},{  683,12329},{  890,13127},
+        { 1183,13750},{ 1579,14162},{ 2066,14357},{ 2611,14370},
+        { 3159,14284},{ 3675,14167},{ 4142,14053},{ 4568,13953},
+        { 4961,13852},{ 5320,13755},{ 5649,13675},{ 5933,13610}
+      }
+    },
+    {
+      /*Cb  qi=20  INTRA*/
+      {
+        {    3,    3},{   62,  367},{  112,  743},{  140, 1183},
+        {  165, 1646},{  196, 2099},{  235, 2517},{  284, 2883},
+        {  334, 3198},{  393, 3460},{  457, 3690},{  509, 3945},
+        {  560, 4198},{  605, 4435},{  647, 4658},{  699, 4888},
+        {  742, 5155},{  788, 5350},{  835, 5517},{  880, 5730},
+        {  956, 5914},{ 1007, 6060},{ 1053, 6199},{ 1158, 6358}
+      },
+      /*Cb  qi=20  INTER*/
+      {
+        {  128,   -6},{   96,  322},{   66,  653},{   54, 1025},
+        {   63, 1431},{   79, 1844},{   91, 2256},{   99, 2665},
+        {  104, 3065},{  107, 3455},{  111, 3831},{  115, 4189},
+        {  120, 4539},{  123, 4885},{  126, 5219},{  130, 5548},
+        {  135, 5876},{  141, 6199},{  149, 6519},{  156, 6837},
+        {  166, 7153},{  179, 7468},{  189, 7784},{  194, 8102}
+      }
+    },
+    {
+      /*Cr  qi=20  INTRA*/
+      {
+        {    4,    6},{   63,  376},{  109,  765},{  139, 1225},
+        {  165, 1689},{  199, 2124},{  239, 2523},{  285, 2852},
+        {  340, 3140},{  388, 3398},{  438, 3662},{  499, 3914},
+        {  547, 4155},{  596, 4392},{  652, 4634},{  699, 4877},
+        {  759, 5074},{  824, 5257},{  883, 5428},{  936, 5589},
+        {  986, 5790},{ 1030, 5960},{ 1074, 6119},{ 1172, 6191}
+      },
+      /*Cr  qi=20  INTER*/
+      {
+        {   92,   40},{   70,  345},{   55,  658},{   57, 1034},
+        {   69, 1441},{   84, 1852},{   94, 2261},{   98, 2669},
+        {  102, 3074},{  105, 3465},{  107, 3841},{  110, 4206},
+        {  112, 4562},{  116, 4915},{  121, 5260},{  127, 5591},
+        {  134, 5920},{  142, 6246},{  153, 6562},{  163, 6870},
+        {  173, 7170},{  186, 7463},{  198, 7746},{  199, 8030}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=21  INTRA*/
+      {
+        {  130,  -51},{  244, 1476},{  483, 2705},{  756, 3635},
+        { 1013, 4396},{ 1266, 5070},{ 1530, 5647},{ 1806, 6153},
+        { 2093, 6600},{ 2411, 6976},{ 2739, 7299},{ 3079, 7534},
+        { 3422, 7744},{ 3738, 7987},{ 4032, 8274},{ 4348, 8533},
+        { 4675, 8721},{ 4989, 8909},{ 5291, 9051},{ 5577, 9111},
+        { 5879, 9163},{ 6190, 9228},{ 6506, 9286},{ 6899, 9295}
+      },
+      /*Y'  qi=21  INTER*/
+      {
+        {   64,  -56},{   55, 1341},{  119, 2859},{  174, 4324},
+        {  223, 5640},{  258, 6880},{  295, 8096},{  359, 9246},
+        {  460,10302},{  595,11268},{  778,12131},{ 1032,12857},
+        { 1387,13385},{ 1850,13683},{ 2399,13774},{ 2976,13729},
+        { 3527,13619},{ 4034,13504},{ 4492,13401},{ 4912,13291},
+        { 5298,13209},{ 5648,13137},{ 5974,13046},{ 6308,12977}
+      }
+    },
+    {
+      /*Cb  qi=21  INTRA*/
+      {
+        {    4,    3},{   64,  367},{  114,  743},{  141, 1183},
+        {  166, 1645},{  201, 2092},{  247, 2502},{  299, 2856},
+        {  352, 3158},{  413, 3412},{  480, 3642},{  536, 3893},
+        {  588, 4137},{  637, 4367},{  678, 4598},{  725, 4834},
+        {  774, 5083},{  827, 5269},{  883, 5420},{  930, 5633},
+        {  999, 5829},{ 1057, 5959},{ 1113, 6082},{ 1200, 6265}
+      },
+      /*Cb  qi=21  INTER*/
+      {
+        {  109,   -8},{   84,  321},{   62,  654},{   54, 1028},
+        {   64, 1434},{   80, 1847},{   92, 2259},{  100, 2664},
+        {  105, 3060},{  109, 3445},{  114, 3815},{  118, 4172},
+        {  122, 4519},{  126, 4861},{  128, 5194},{  133, 5520},
+        {  139, 5847},{  146, 6169},{  155, 6487},{  166, 6801},
+        {  177, 7114},{  189, 7423},{  201, 7729},{  208, 8035}
+      }
+    },
+    {
+      /*Cr  qi=21  INTRA*/
+      {
+        {    4,    6},{   64,  377},{  111,  766},{  144, 1225},
+        {  174, 1683},{  206, 2114},{  248, 2506},{  302, 2824},
+        {  357, 3099},{  404, 3357},{  455, 3622},{  519, 3867},
+        {  573, 4098},{  625, 4331},{  683, 4571},{  733, 4802},
+        {  793, 4994},{  863, 5173},{  926, 5337},{  978, 5492},
+        { 1030, 5685},{ 1079, 5856},{ 1126, 6027},{ 1217, 6159}
+      },
+      /*Cr  qi=21  INTER*/
+      {
+        {   82,   29},{   67,  341},{   55,  660},{   58, 1038},
+        {   71, 1443},{   85, 1851},{   95, 2258},{   99, 2666},
+        {  103, 3069},{  107, 3456},{  110, 3826},{  112, 4188},
+        {  114, 4544},{  118, 4891},{  124, 5231},{  132, 5567},
+        {  139, 5894},{  148, 6210},{  159, 6520},{  171, 6822},
+        {  185, 7111},{  196, 7403},{  209, 7691},{  225, 7945}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=22  INTRA*/
+      {
+        {  128,  -45},{  254, 1463},{  507, 2662},{  794, 3562},
+        { 1070, 4292},{ 1340, 4941},{ 1622, 5492},{ 1920, 5968},
+        { 2229, 6387},{ 2565, 6742},{ 2911, 7047},{ 3263, 7264},
+        { 3615, 7464},{ 3944, 7689},{ 4258, 7950},{ 4591, 8183},
+        { 4934, 8347},{ 5259, 8517},{ 5573, 8634},{ 5870, 8683},
+        { 6186, 8723},{ 6508, 8762},{ 6831, 8801},{ 7232, 8830}
+      },
+      /*Y'  qi=22  INTER*/
+      {
+        {   77,  -48},{   57, 1343},{  122, 2853},{  180, 4299},
+        {  231, 5597},{  269, 6826},{  314, 8025},{  393, 9150},
+        {  512,10179},{  673,11103},{  894,11908},{ 1207,12542},
+        { 1635,12956},{ 2166,13148},{ 2755,13167},{ 3345,13088},
+        { 3895,12966},{ 4386,12848},{ 4832,12746},{ 5252,12647},
+        { 5634,12563},{ 5978,12497},{ 6299,12412},{ 6633,12338}
+      }
+    },
+    {
+      /*Cb  qi=22  INTRA*/
+      {
+        {    4,    3},{   66,  367},{  122,  744},{  153, 1182},
+        {  177, 1640},{  213, 2080},{  263, 2475},{  323, 2811},
+        {  382, 3103},{  451, 3346},{  522, 3568},{  581, 3814},
+        {  633, 4054},{  674, 4288},{  719, 4523},{  768, 4756},
+        {  823, 4979},{  883, 5162},{  937, 5325},{  996, 5510},
+        { 1070, 5687},{ 1129, 5807},{ 1193, 5929},{ 1311, 6099}
+      },
+      /*Cb  qi=22  INTER*/
+      {
+        {  107,   -5},{   83,  322},{   61,  653},{   55, 1030},
+        {   66, 1436},{   81, 1845},{   94, 2253},{  102, 2656},
+        {  107, 3050},{  111, 3435},{  115, 3804},{  119, 4158},
+        {  124, 4501},{  128, 4835},{  132, 5164},{  138, 5490},
+        {  146, 5812},{  154, 6128},{  163, 6442},{  174, 6754},
+        {  188, 7060},{  205, 7361},{  219, 7662},{  233, 7953}
+      }
+    },
+    {
+      /*Cr  qi=22  INTRA*/
+      {
+        {    4,    6},{   67,  378},{  118,  767},{  151, 1222},
+        {  182, 1675},{  221, 2097},{  269, 2476},{  329, 2774},
+        {  389, 3039},{  444, 3292},{  500, 3545},{  560, 3788},
+        {  615, 4020},{  671, 4251},{  734, 4484},{  781, 4712},
+        {  850, 4887},{  925, 5060},{  981, 5229},{ 1031, 5369},
+        { 1092, 5549},{ 1148, 5715},{ 1200, 5861},{ 1291, 5943}
+      },
+      /*Cr  qi=22  INTER*/
+      {
+        {   88,   34},{   69,  340},{   57,  657},{   60, 1039},
+        {   73, 1445},{   87, 1851},{   96, 2257},{  100, 2662},
+        {  103, 3058},{  107, 3442},{  111, 3812},{  115, 4172},
+        {  118, 4524},{  123, 4864},{  129, 5199},{  136, 5531},
+        {  145, 5855},{  156, 6168},{  170, 6468},{  184, 6765},
+        {  193, 7066},{  207, 7353},{  222, 7628},{  230, 7900}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=23  INTRA*/
+      {
+        {  126,  -40},{  257, 1458},{  521, 2636},{  825, 3501},
+        { 1111, 4207},{ 1391, 4842},{ 1684, 5385},{ 1992, 5858},
+        { 2311, 6277},{ 2653, 6626},{ 3005, 6929},{ 3366, 7134},
+        { 3729, 7311},{ 4071, 7526},{ 4396, 7770},{ 4734, 7986},
+        { 5086, 8131},{ 5421, 8286},{ 5735, 8404},{ 6033, 8456},
+        { 6357, 8486},{ 6682, 8525},{ 7003, 8573},{ 7387, 8604}
+      },
+      /*Y'  qi=23  INTER*/
+      {
+        {   64,  -57},{   60, 1345},{  124, 2853},{  185, 4284},
+        {  239, 5565},{  282, 6783},{  336, 7967},{  429, 9069},
+        {  568,10063},{  758,10943},{ 1028,11679},{ 1407,12216},
+        { 1909,12520},{ 2502,12616},{ 3126,12573},{ 3722,12461},
+        { 4258,12344},{ 4742,12236},{ 5185,12136},{ 5590,12052},
+        { 5970,11980},{ 6315,11901},{ 6631,11826},{ 6954,11769}
+      }
+    },
+    {
+      /*Cb  qi=23  INTRA*/
+      {
+        {    3,    3},{   70,  367},{  124,  744},{  151, 1182},
+        {  181, 1637},{  222, 2071},{  276, 2460},{  343, 2785},
+        {  403, 3072},{  468, 3317},{  542, 3534},{  605, 3773},
+        {  659, 4009},{  703, 4243},{  747, 4479},{  795, 4707},
+        {  852, 4923},{  908, 5105},{  972, 5254},{ 1043, 5423},
+        { 1118, 5594},{ 1172, 5731},{ 1240, 5853},{ 1365, 6005}
+      },
+      /*Cb  qi=23  INTER*/
+      {
+        {  109,  -10},{   87,  325},{   63,  650},{   57, 1031},
+        {   67, 1439},{   83, 1847},{   96, 2253},{  103, 2652},
+        {  109, 3041},{  114, 3421},{  117, 3789},{  122, 4141},
+        {  128, 4480},{  134, 4811},{  139, 5138},{  144, 5463},
+        {  152, 5781},{  161, 6096},{  174, 6404},{  185, 6714},
+        {  198, 7023},{  216, 7320},{  233, 7621},{  245, 7935}
+      }
+    },
+    {
+      /*Cr  qi=23  INTRA*/
+      {
+        {    5,    6},{   70,  379},{  122,  768},{  155, 1222},
+        {  187, 1671},{  231, 2088},{  283, 2459},{  346, 2750},
+        {  411, 3009},{  465, 3261},{  523, 3509},{  585, 3746},
+        {  639, 3980},{  695, 4219},{  754, 4449},{  803, 4671},
+        {  873, 4840},{  953, 5001},{ 1015, 5156},{ 1071, 5286},
+        { 1137, 5464},{ 1191, 5629},{ 1249, 5782},{ 1359, 5885}
+      },
+      /*Cr  qi=23  INTER*/
+      {
+        {   84,   29},{   69,  343},{   58,  660},{   62, 1041},
+        {   75, 1448},{   88, 1853},{   97, 2258},{  102, 2659},
+        {  105, 3050},{  108, 3430},{  113, 3799},{  116, 4155},
+        {  121, 4505},{  126, 4845},{  132, 5176},{  142, 5504},
+        {  153, 5826},{  165, 6133},{  180, 6432},{  197, 6722},
+        {  212, 7005},{  226, 7287},{  244, 7555},{  258, 7828}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=24  INTRA*/
+      {
+        {  125,  -34},{  268, 1444},{  547, 2590},{  866, 3422},
+        { 1172, 4098},{ 1476, 4702},{ 1790, 5222},{ 2117, 5678},
+        { 2453, 6080},{ 2811, 6418},{ 3178, 6700},{ 3552, 6895},
+        { 3928, 7055},{ 4286, 7243},{ 4627, 7477},{ 4981, 7674},
+        { 5344, 7802},{ 5683, 7944},{ 6009, 8043},{ 6313, 8082},
+        { 6633, 8111},{ 6959, 8151},{ 7280, 8197},{ 7660, 8221}
+      },
+      /*Y'  qi=24  INTER*/
+      {
+        {   62,  -63},{   68, 1345},{  134, 2840},{  199, 4245},
+        {  256, 5508},{  304, 6715},{  371, 7880},{  484, 8950},
+        {  652, 9899},{  892,10709},{ 1238,11334},{ 1722,11722},
+        { 2326,11875},{ 2983,11864},{ 3616,11783},{ 4189,11678},
+        { 4707,11570},{ 5178,11476},{ 5617,11395},{ 6017,11319},
+        { 6380,11252},{ 6720,11185},{ 7044,11126},{ 7377,11118}
+      }
+    },
+    {
+      /*Cb  qi=24  INTRA*/
+      {
+        {    4,    3},{   75,  367},{  132,  745},{  159, 1182},
+        {  187, 1634},{  230, 2061},{  289, 2439},{  361, 2753},
+        {  425, 3034},{  492, 3278},{  566, 3490},{  630, 3720},
+        {  686, 3956},{  732, 4190},{  777, 4420},{  829, 4637},
+        {  894, 4840},{  958, 5012},{ 1023, 5155},{ 1090, 5326},
+        { 1165, 5502},{ 1226, 5622},{ 1299, 5717},{ 1408, 5887}
+      },
+      /*Cb  qi=24  INTER*/
+      {
+        {  110,   35},{   92,  337},{   70,  651},{   63, 1033},
+        {   74, 1440},{   91, 1846},{  102, 2248},{  109, 2644},
+        {  114, 3031},{  120, 3404},{  127, 3762},{  133, 4109},
+        {  138, 4445},{  144, 4772},{  151, 5094},{  159, 5411},
+        {  168, 5728},{  180, 6037},{  195, 6338},{  210, 6640},
+        {  227, 6944},{  249, 7236},{  272, 7528},{  299, 7809}
+      }
+    },
+    {
+      /*Cr  qi=24  INTRA*/
+      {
+        {    5,    6},{   72,  380},{  124,  770},{  158, 1222},
+        {  195, 1668},{  240, 2079},{  297, 2438},{  367, 2715},
+        {  433, 2966},{  488, 3218},{  549, 3467},{  609, 3701},
+        {  664, 3935},{  728, 4165},{  792, 4379},{  845, 4586},
+        {  917, 4744},{  995, 4898},{ 1063, 5049},{ 1120, 5187},
+        { 1190, 5359},{ 1249, 5522},{ 1304, 5672},{ 1397, 5806}
+      },
+      /*Cr  qi=24  INTER*/
+      {
+        {   91,   56},{   73,  353},{   61,  664},{   66, 1045},
+        {   80, 1449},{   95, 1851},{  103, 2250},{  107, 2648},
+        {  111, 3038},{  116, 3413},{  120, 3774},{  124, 4128},
+        {  130, 4471},{  138, 4802},{  145, 5130},{  156, 5453},
+        {  171, 5764},{  187, 6061},{  204, 6355},{  220, 6643},
+        {  238, 6923},{  254, 7204},{  275, 7475},{  289, 7752}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=25  INTRA*/
+      {
+        {  125,  -28},{  285, 1426},{  582, 2540},{  917, 3351},
+        { 1244, 3997},{ 1569, 4570},{ 1903, 5071},{ 2258, 5498},
+        { 2626, 5866},{ 3002, 6182},{ 3382, 6448},{ 3770, 6623},
+        { 4162, 6760},{ 4528, 6934},{ 4882, 7144},{ 5249, 7328},
+        { 5610, 7453},{ 5958, 7578},{ 6291, 7672},{ 6597, 7708},
+        { 6928, 7715},{ 7258, 7737},{ 7575, 7781},{ 7950, 7829}
+      },
+      /*Y'  qi=25  INTER*/
+      {
+        {   64,  -16},{   72, 1348},{  139, 2832},{  206, 4218},
+        {  268, 5465},{  322, 6659},{  403, 7803},{  540, 8838},
+        {  747, 9734},{ 1044,10465},{ 1473,10981},{ 2048,11249},
+        { 2717,11311},{ 3397,11257},{ 4025,11161},{ 4589,11052},
+        { 5099,10947},{ 5560,10859},{ 5989,10786},{ 6389,10717},
+        { 6753,10652},{ 7078,10592},{ 7389,10535},{ 7697,10460}
+      }
+    },
+    {
+      /*Cb  qi=25  INTRA*/
+      {
+        {    3,    3},{   78,  368},{  133,  745},{  159, 1180},
+        {  193, 1627},{  242, 2046},{  304, 2411},{  381, 2714},
+        {  456, 2983},{  527, 3224},{  598, 3437},{  667, 3655},
+        {  726, 3888},{  776, 4117},{  826, 4333},{  883, 4543},
+        {  954, 4727},{ 1019, 4878},{ 1095, 5014},{ 1171, 5187},
+        { 1255, 5342},{ 1319, 5458},{ 1396, 5546},{ 1536, 5678}
+      },
+      /*Cb  qi=25  INTER*/
+      {
+        {  117,   32},{   89,  342},{   67,  660},{   64, 1037},
+        {   77, 1441},{   93, 1845},{  105, 2243},{  113, 2633},
+        {  120, 3016},{  125, 3387},{  131, 3739},{  137, 4080},
+        {  144, 4416},{  152, 4741},{  160, 5057},{  169, 5369},
+        {  180, 5680},{  193, 5990},{  209, 6294},{  227, 6594},
+        {  249, 6888},{  269, 7180},{  294, 7467},{  317, 7768}
+      }
+    },
+    {
+      /*Cr  qi=25  INTRA*/
+      {
+        {    6,    6},{   74,  380},{  129,  770},{  165, 1220},
+        {  201, 1658},{  253, 2061},{  315, 2410},{  388, 2676},
+        {  462, 2920},{  523, 3166},{  584, 3404},{  647, 3637},
+        {  701, 3870},{  769, 4086},{  838, 4296},{  898, 4491},
+        {  980, 4627},{ 1065, 4759},{ 1126, 4920},{ 1187, 5058},
+        { 1283, 5180},{ 1347, 5332},{ 1404, 5475},{ 1527, 5534}
+      },
+      /*Cr  qi=25  INTER*/
+      {
+        {   92,   41},{   75,  347},{   64,  664},{   70, 1045},
+        {   85, 1448},{   98, 1849},{  105, 2245},{  110, 2637},
+        {  115, 3023},{  120, 3395},{  126, 3753},{  131, 4102},
+        {  136, 4439},{  145, 4768},{  156, 5094},{  168, 5410},
+        {  184, 5717},{  203, 6010},{  221, 6300},{  239, 6577},
+        {  262, 6847},{  282, 7123},{  303, 7390},{  322, 7665}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=26  INTRA*/
+      {
+        {  130,  -24},{  292, 1423},{  594, 2525},{  943, 3307},
+        { 1289, 3921},{ 1633, 4467},{ 1991, 4943},{ 2368, 5348},
+        { 2753, 5696},{ 3148, 5991},{ 3545, 6247},{ 3942, 6415},
+        { 4342, 6535},{ 4726, 6690},{ 5093, 6883},{ 5466, 7047},
+        { 5840, 7159},{ 6202, 7274},{ 6545, 7351},{ 6855, 7375},
+        { 7186, 7384},{ 7517, 7416},{ 7840, 7447},{ 8238, 7450}
+      },
+      /*Y'  qi=26  INTER*/
+      {
+        {   52,   16},{   75, 1336},{  143, 2815},{  213, 4191},
+        {  278, 5427},{  339, 6611},{  436, 7734},{  600, 8732},
+        {  843, 9579},{ 1195,10243},{ 1702,10660},{ 2355,10825},
+        { 3070,10820},{ 3755,10743},{ 4372,10643},{ 4925,10538},
+        { 5426,10440},{ 5882,10354},{ 6296,10290},{ 6686,10224},
+        { 7049,10163},{ 7380,10113},{ 7672,10062},{ 7937,10021}
+      }
+    },
+    {
+      /*Cb  qi=26  INTRA*/
+      {
+        {    4,    3},{   79,  368},{  138,  745},{  167, 1180},
+        {  200, 1623},{  252, 2034},{  322, 2389},{  403, 2682},
+        {  480, 2941},{  558, 3176},{  631, 3393},{  700, 3608},
+        {  766, 3825},{  819, 4046},{  868, 4265},{  926, 4472},
+        { 1002, 4645},{ 1070, 4800},{ 1151, 4924},{ 1242, 5063},
+        { 1325, 5221},{ 1393, 5338},{ 1464, 5431},{ 1595, 5559}
+      },
+      /*Cb  qi=26  INTER*/
+      {
+        {   98,   33},{   83,  343},{   65,  662},{   65, 1037},
+        {   80, 1437},{   96, 1839},{  107, 2238},{  115, 2628},
+        {  122, 3007},{  128, 3373},{  134, 3722},{  142, 4060},
+        {  149, 4390},{  158, 4713},{  167, 5029},{  178, 5341},
+        {  191, 5647},{  208, 5948},{  227, 6244},{  247, 6539},
+        {  269, 6833},{  295, 7114},{  328, 7388},{  369, 7658}
+      }
+    },
+    {
+      /*Cr  qi=26  INTRA*/
+      {
+        {    5,    6},{   75,  380},{  133,  769},{  172, 1217},
+        {  212, 1652},{  266, 2048},{  333, 2384},{  412, 2643},
+        {  490, 2880},{  552, 3124},{  616, 3365},{  681, 3594},
+        {  739, 3816},{  810, 4024},{  880, 4224},{  945, 4405},
+        { 1029, 4538},{ 1114, 4674},{ 1183, 4822},{ 1254, 4946},
+        { 1346, 5063},{ 1417, 5201},{ 1478, 5345},{ 1597, 5411}
+      },
+      /*Cr  qi=26  INTER*/
+      {
+        {   97,   29},{   75,  342},{   62,  667},{   70, 1047},
+        {   87, 1447},{  100, 1846},{  107, 2242},{  113, 2633},
+        {  118, 3016},{  123, 3382},{  128, 3737},{  135, 4082},
+        {  142, 4417},{  151, 4746},{  162, 5066},{  176, 5377},
+        {  194, 5679},{  217, 5963},{  239, 6244},{  260, 6522},
+        {  284, 6789},{  309, 7052},{  335, 7313},{  355, 7582}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=27  INTRA*/
+      {
+        {  118,  -10},{  308, 1404},{  630, 2473},{  997, 3227},
+        { 1360, 3819},{ 1719, 4354},{ 2086, 4829},{ 2470, 5233},
+        { 2863, 5576},{ 3267, 5870},{ 3677, 6117},{ 4085, 6268},
+        { 4499, 6376},{ 4888, 6521},{ 5257, 6705},{ 5638, 6865},
+        { 6020, 6962},{ 6394, 7056},{ 6744, 7130},{ 7051, 7158},
+        { 7386, 7164},{ 7717, 7185},{ 8042, 7209},{ 8444, 7206}
+      },
+      /*Y'  qi=27  INTER*/
+      {
+        {   54,   19},{   77, 1333},{  147, 2806},{  221, 4166},
+        {  290, 5390},{  360, 6564},{  474, 7665},{  664, 8630},
+        {  949, 9423},{ 1370,10002},{ 1958,10323},{ 2670,10414},
+        { 3406,10375},{ 4086,10285},{ 4691,10182},{ 5233,10085},
+        { 5724, 9994},{ 6169, 9918},{ 6582, 9863},{ 6962, 9813},
+        { 7316, 9759},{ 7645, 9707},{ 7948, 9660},{ 8262, 9623}
+      }
+    },
+    {
+      /*Cb  qi=27  INTRA*/
+      {
+        {    4,    3},{   79,  368},{  137,  745},{  166, 1180},
+        {  200, 1622},{  253, 2030},{  324, 2381},{  407, 2671},
+        {  487, 2925},{  567, 3156},{  640, 3372},{  712, 3580},
+        {  782, 3792},{  833, 4015},{  887, 4227},{  954, 4422},
+        { 1031, 4592},{ 1103, 4738},{ 1187, 4856},{ 1280, 4990},
+        { 1371, 5135},{ 1442, 5244},{ 1520, 5321},{ 1684, 5398}
+      },
+      /*Cb  qi=27  INTER*/
+      {
+        {  113,   20},{   90,  338},{   66,  661},{   67, 1034},
+        {   82, 1438},{   97, 1842},{  108, 2238},{  115, 2624},
+        {  123, 3000},{  130, 3361},{  138, 3708},{  146, 4040},
+        {  155, 4367},{  164, 4688},{  174, 4999},{  186, 5306},
+        {  203, 5609},{  222, 5908},{  243, 6202},{  268, 6494},
+        {  295, 6781},{  326, 7058},{  367, 7319},{  420, 7551}
+      }
+    },
+    {
+      /*Cr  qi=27  INTRA*/
+      {
+        {    5,    6},{   75,  380},{  133,  770},{  173, 1217},
+        {  214, 1650},{  268, 2040},{  337, 2375},{  418, 2631},
+        {  496, 2862},{  558, 3104},{  625, 3346},{  692, 3571},
+        {  753, 3786},{  825, 3989},{  896, 4182},{  969, 4352},
+        { 1059, 4479},{ 1144, 4614},{ 1212, 4757},{ 1284, 4871},
+        { 1380, 4982},{ 1457, 5125},{ 1528, 5267},{ 1651, 5346}
+      },
+      /*Cr  qi=27  INTER*/
+      {
+        {   92,   24},{   74,  341},{   61,  669},{   71, 1049},
+        {   88, 1448},{  100, 1849},{  107, 2243},{  113, 2631},
+        {  119, 3010},{  125, 3373},{  131, 3723},{  137, 4064},
+        {  146, 4396},{  159, 4720},{  172, 5033},{  189, 5340},
+        {  210, 5636},{  233, 5920},{  256, 6197},{  282, 6465},
+        {  310, 6730},{  332, 7000},{  359, 7259},{  385, 7515}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=28  INTRA*/
+      {
+        {  116,   -8},{  314, 1400},{  640, 2458},{ 1013, 3197},
+        { 1386, 3768},{ 1762, 4279},{ 2151, 4733},{ 2558, 5117},
+        { 2970, 5442},{ 3393, 5714},{ 3820, 5935},{ 4243, 6069},
+        { 4671, 6161},{ 5074, 6289},{ 5456, 6457},{ 5849, 6598},
+        { 6244, 6689},{ 6632, 6777},{ 6984, 6833},{ 7294, 6855},
+        { 7625, 6862},{ 7961, 6875},{ 8302, 6890},{ 8720, 6883}
+      },
+      /*Y'  qi=28  INTER*/
+      {
+        {   54,    8},{   81, 1333},{  154, 2793},{  231, 4138},
+        {  304, 5352},{  384, 6512},{  519, 7585},{  743, 8508},
+        { 1082, 9236},{ 1587, 9717},{ 2267, 9928},{ 3034, 9944},
+        { 3775, 9878},{ 4438, 9786},{ 5031, 9686},{ 5563, 9601},
+        { 6042, 9523},{ 6481, 9456},{ 6890, 9405},{ 7266, 9356},
+        { 7614, 9313},{ 7933, 9265},{ 8238, 9220},{ 8545, 9193}
+      }
+    },
+    {
+      /*Cb  qi=28  INTRA*/
+      {
+        {    3,    3},{   80,  368},{  138,  746},{  168, 1179},
+        {  208, 1615},{  268, 2014},{  345, 2354},{  432, 2637},
+        {  515, 2884},{  595, 3108},{  669, 3323},{  745, 3533},
+        {  818, 3740},{  876, 3953},{  932, 4160},{ 1003, 4349},
+        { 1088, 4501},{ 1154, 4648},{ 1241, 4768},{ 1349, 4889},
+        { 1441, 5023},{ 1524, 5113},{ 1611, 5187},{ 1783, 5283}
+      },
+      /*Cb  qi=28  INTER*/
+      {
+        {  117,   29},{   91,  341},{   65,  663},{   68, 1038},
+        {   85, 1440},{  100, 1841},{  110, 2234},{  119, 2616},
+        {  127, 2985},{  135, 3342},{  142, 3685},{  151, 4015},
+        {  162, 4337},{  174, 4652},{  186, 4960},{  201, 5264},
+        {  218, 5567},{  239, 5863},{  266, 6149},{  295, 6434},
+        {  328, 6715},{  371, 6976},{  409, 7239},{  460, 7477}
+      }
+    },
+    {
+      /*Cr  qi=28  INTRA*/
+      {
+        {    6,    7},{   79,  381},{  138,  771},{  178, 1215},
+        {  222, 1644},{  285, 2026},{  359, 2347},{  441, 2597},
+        {  521, 2827},{  588, 3066},{  655, 3303},{  725, 3523},
+        {  791, 3728},{  870, 3920},{  950, 4103},{ 1030, 4265},
+        { 1121, 4388},{ 1198, 4520},{ 1266, 4659},{ 1356, 4759},
+        { 1461, 4865},{ 1540, 4993},{ 1619, 5115},{ 1786, 5160}
+      },
+      /*Cr  qi=28  INTER*/
+      {
+        {   96,   18},{   78,  340},{   66,  672},{   74, 1051},
+        {   90, 1450},{  103, 1845},{  110, 2235},{  116, 2619},
+        {  122, 2995},{  129, 3356},{  137, 3702},{  146, 4038},
+        {  156, 4365},{  168, 4684},{  182, 4995},{  203, 5297},
+        {  227, 5588},{  253, 5866},{  282, 6131},{  311, 6394},
+        {  339, 6664},{  366, 6918},{  400, 7171},{  424, 7450}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=29  INTRA*/
+      {
+        {  112,    7},{  334, 1382},{  681, 2410},{ 1081, 3112},
+        { 1484, 3650},{ 1894, 4128},{ 2316, 4547},{ 2749, 4905},
+        { 3188, 5208},{ 3634, 5458},{ 4079, 5666},{ 4517, 5791},
+        { 4952, 5870},{ 5359, 5983},{ 5754, 6137},{ 6165, 6268},
+        { 6568, 6351},{ 6958, 6423},{ 7320, 6471},{ 7638, 6490},
+        { 7979, 6490},{ 8313, 6499},{ 8651, 6517},{ 9085, 6499}
+      },
+      /*Y'  qi=29  INTER*/
+      {
+        {   55,   15},{   85, 1336},{  160, 2780},{  242, 4104},
+        {  323, 5302},{  418, 6443},{  586, 7480},{  859, 8342},
+        { 1278, 8982},{ 1888, 9347},{ 2658, 9457},{ 3457, 9425},
+        { 4192, 9343},{ 4842, 9247},{ 5417, 9162},{ 5935, 9086},
+        { 6404, 9011},{ 6841, 8952},{ 7241, 8907},{ 7609, 8867},
+        { 7953, 8832},{ 8267, 8792},{ 8562, 8740},{ 8836, 8701}
+      }
+    },
+    {
+      /*Cb  qi=29  INTRA*/
+      {
+        {    5,    3},{   84,  368},{  144,  746},{  176, 1175},
+        {  219, 1604},{  285, 1991},{  372, 2318},{  462, 2591},
+        {  546, 2833},{  628, 3058},{  704, 3274},{  788, 3473},
+        {  870, 3664},{  935, 3865},{  995, 4059},{ 1072, 4239},
+        { 1167, 4388},{ 1248, 4518},{ 1334, 4634},{ 1429, 4765},
+        { 1536, 4884},{ 1628, 4964},{ 1716, 5038},{ 1885, 5128}
+      },
+      /*Cb  qi=29  INTER*/
+      {
+        {  126,   25},{   95,  340},{   69,  662},{   71, 1039},
+        {   88, 1440},{  102, 1839},{  113, 2227},{  122, 2604},
+        {  132, 2969},{  141, 3320},{  151, 3659},{  161, 3985},
+        {  172, 4301},{  186, 4612},{  200, 4917},{  219, 5213},
+        {  241, 5509},{  265, 5800},{  296, 6081},{  329, 6360},
+        {  369, 6633},{  414, 6899},{  465, 7148},{  520, 7387}
+      }
+    },
+    {
+      /*Cr  qi=29  INTRA*/
+      {
+        {    6,    7},{   82,  382},{  142,  772},{  185, 1211},
+        {  233, 1632},{  303, 2000},{  388, 2306},{  475, 2550},
+        {  556, 2779},{  627, 3007},{  707, 3237},{  778, 3459},
+        {  843, 3654},{  927, 3834},{ 1012, 4012},{ 1101, 4152},
+        { 1197, 4262},{ 1275, 4399},{ 1359, 4511},{ 1455, 4596},
+        { 1562, 4708},{ 1644, 4833},{ 1719, 4954},{ 1888, 4988}
+      },
+      /*Cr  qi=29  INTER*/
+      {
+        {  101,   28},{   81,  343},{   67,  673},{   75, 1053},
+        {   93, 1450},{  106, 1844},{  113, 2230},{  119, 2610},
+        {  127, 2980},{  135, 3334},{  143, 3676},{  153, 4007},
+        {  165, 4330},{  180, 4645},{  201, 4951},{  224, 5243},
+        {  253, 5522},{  284, 5794},{  314, 6060},{  345, 6322},
+        {  381, 6578},{  419, 6828},{  455, 7073},{  495, 7316}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=30  INTRA*/
+      {
+        {  112,    8},{  335, 1380},{  682, 2401},{ 1083, 3093},
+        { 1489, 3619},{ 1902, 4092},{ 2332, 4511},{ 2777, 4865},
+        { 3231, 5156},{ 3693, 5394},{ 4153, 5585},{ 4605, 5689},
+        { 5049, 5764},{ 5468, 5871},{ 5875, 6004},{ 6295, 6120},
+        { 6706, 6201},{ 7099, 6273},{ 7461, 6311},{ 7785, 6320},
+        { 8128, 6322},{ 8469, 6331},{ 8806, 6342},{ 9220, 6338}
+      },
+      /*Y'  qi=30  INTER*/
+      {
+        {   58,    8},{   90, 1340},{  169, 2771},{  257, 4079},
+        {  345, 5266},{  459, 6387},{  660, 7383},{  990, 8180},
+        { 1496, 8726},{ 2203, 8992},{ 3029, 9038},{ 3833, 8984},
+        { 4549, 8900},{ 5183, 8813},{ 5745, 8735},{ 6250, 8674},
+        { 6715, 8619},{ 7138, 8565},{ 7529, 8528},{ 7899, 8495},
+        { 8234, 8465},{ 8550, 8429},{ 8856, 8395},{ 9160, 8374}
+      }
+    },
+    {
+      /*Cb  qi=30  INTRA*/
+      {
+        {    7,    3},{   88,  369},{  149,  747},{  185, 1175},
+        {  232, 1599},{  304, 1976},{  392, 2293},{  486, 2557},
+        {  573, 2797},{  656, 3027},{  735, 3243},{  819, 3442},
+        {  903, 3629},{  966, 3828},{ 1025, 4027},{ 1105, 4204},
+        { 1201, 4343},{ 1282, 4469},{ 1379, 4575},{ 1486, 4689},
+        { 1588, 4813},{ 1678, 4900},{ 1767, 4969},{ 1911, 5080}
+      },
+      /*Cb  qi=30  INTER*/
+      {
+        {  120,   23},{   96,  336},{   72,  661},{   75, 1043},
+        {   91, 1441},{  105, 1837},{  117, 2221},{  127, 2592},
+        {  137, 2953},{  148, 3301},{  159, 3635},{  170, 3959},
+        {  184, 4271},{  199, 4578},{  216, 4879},{  238, 5175},
+        {  262, 5466},{  294, 5750},{  332, 6027},{  373, 6298},
+        {  421, 6559},{  473, 6805},{  526, 7053},{  587, 7298}
+      }
+    },
+    {
+      /*Cr  qi=30  INTRA*/
+      {
+        {   10,    7},{   89,  384},{  147,  773},{  192, 1211},
+        {  245, 1627},{  322, 1984},{  412, 2280},{  501, 2520},
+        {  583, 2750},{  654, 2982},{  736, 3207},{  810, 3419},
+        {  873, 3614},{  957, 3794},{ 1048, 3965},{ 1139, 4102},
+        { 1237, 4208},{ 1327, 4328},{ 1408, 4448},{ 1496, 4545},
+        { 1604, 4652},{ 1699, 4760},{ 1780, 4877},{ 1937, 4942}
+      },
+      /*Cr  qi=30  INTER*/
+      {
+        {  115,   26},{   89,  342},{   70,  672},{   79, 1055},
+        {   96, 1451},{  108, 1841},{  116, 2222},{  124, 2599},
+        {  132, 2965},{  141, 3316},{  151, 3655},{  163, 3984},
+        {  178, 4301},{  197, 4609},{  219, 4909},{  247, 5195},
+        {  280, 5469},{  317, 5734},{  351, 5991},{  383, 6248},
+        {  423, 6500},{  467, 6744},{  502, 6995},{  558, 7226}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=31  INTRA*/
+      {
+        {  116,   20},{  359, 1361},{  732, 2350},{ 1162, 3010},
+        { 1597, 3507},{ 2042, 3950},{ 2503, 4339},{ 2974, 4670},
+        { 3446, 4951},{ 3922, 5179},{ 4394, 5357},{ 4858, 5454},
+        { 5313, 5519},{ 5734, 5626},{ 6154, 5755},{ 6585, 5859},
+        { 7004, 5928},{ 7408, 5998},{ 7775, 6039},{ 8102, 6048},
+        { 8442, 6051},{ 8790, 6054},{ 9136, 6057},{ 9554, 6041}
+      },
+      /*Y'  qi=31  INTER*/
+      {
+        {   53,   12},{   90, 1340},{  169, 2765},{  259, 4062},
+        {  353, 5236},{  483, 6340},{  713, 7305},{ 1086, 8059},
+        { 1651, 8548},{ 2423, 8751},{ 3288, 8754},{ 4106, 8674},
+        { 4827, 8572},{ 5451, 8482},{ 6007, 8407},{ 6514, 8344},
+        { 6970, 8282},{ 7397, 8225},{ 7795, 8193},{ 8159, 8161},
+        { 8498, 8120},{ 8814, 8093},{ 9127, 8066},{ 9432, 8040}
+      }
+    },
+    {
+      /*Cb  qi=31  INTRA*/
+      {
+        {    7,    3},{   88,  369},{  149,  746},{  185, 1173},
+        {  234, 1595},{  308, 1967},{  399, 2278},{  494, 2537},
+        {  583, 2774},{  669, 2997},{  755, 3204},{  847, 3390},
+        {  936, 3569},{ 1008, 3759},{ 1078, 3942},{ 1162, 4104},
+        { 1262, 4238},{ 1352, 4364},{ 1442, 4470},{ 1557, 4567},
+        { 1676, 4674},{ 1759, 4781},{ 1850, 4853},{ 2043, 4897}
+      },
+      /*Cb  qi=31  INTER*/
+      {
+        {  121,   23},{   96,  335},{   72,  660},{   74, 1043},
+        {   90, 1440},{  105, 1834},{  116, 2217},{  127, 2586},
+        {  138, 2945},{  148, 3293},{  159, 3626},{  172, 3945},
+        {  185, 4256},{  202, 4559},{  223, 4856},{  245, 5150},
+        {  272, 5440},{  306, 5719},{  346, 5989},{  391, 6253},
+        {  443, 6511},{  510, 6743},{  583, 6965},{  651, 7182}
+      }
+    },
+    {
+      /*Cr  qi=31  INTRA*/
+      {
+        {   10,    7},{   88,  384},{  147,  773},{  192, 1209},
+        {  247, 1622},{  326, 1974},{  417, 2262},{  509, 2500},
+        {  596, 2726},{  670, 2949},{  754, 3170},{  836, 3370},
+        {  912, 3548},{  999, 3724},{ 1093, 3888},{ 1198, 4000},
+        { 1304, 4095},{ 1384, 4230},{ 1470, 4347},{ 1577, 4422},
+        { 1696, 4513},{ 1798, 4620},{ 1869, 4746},{ 1991, 4798}
+      },
+      /*Cr  qi=31  INTER*/
+      {
+        {  113,   32},{   88,  345},{   69,  674},{   79, 1055},
+        {   96, 1451},{  108, 1839},{  115, 2218},{  123, 2592},
+        {  132, 2957},{  141, 3308},{  151, 3643},{  163, 3968},
+        {  179, 4285},{  200, 4590},{  225, 4886},{  254, 5169},
+        {  291, 5436},{  330, 5696},{  368, 5951},{  409, 6200},
+        {  452, 6448},{  493, 6695},{  536, 6940},{  571, 7204}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=32  INTRA*/
+      {
+        {  123,   26},{  370, 1356},{  756, 2321},{ 1211, 2944},
+        { 1674, 3408},{ 2148, 3826},{ 2639, 4193},{ 3138, 4504},
+        { 3634, 4765},{ 4133, 4973},{ 4625, 5137},{ 5101, 5225},
+        { 5567, 5274},{ 6002, 5363},{ 6437, 5482},{ 6885, 5566},
+        { 7312, 5625},{ 7723, 5686},{ 8101, 5721},{ 8429, 5732},
+        { 8769, 5728},{ 9120, 5726},{ 9472, 5723},{ 9918, 5700}
+      },
+      /*Y'  qi=32  INTER*/
+      {
+        {   54,   -3},{   95, 1343},{  179, 2750},{  276, 4027},
+        {  382, 5185},{  543, 6256},{  830, 7161},{ 1301, 7815},
+        { 2003, 8172},{ 2883, 8266},{ 3779, 8217},{ 4578, 8127},
+        { 5274, 8035},{ 5886, 7952},{ 6430, 7887},{ 6929, 7835},
+        { 7380, 7779},{ 7796, 7737},{ 8190, 7705},{ 8552, 7672},
+        { 8896, 7640},{ 9210, 7612},{ 9510, 7589},{ 9746, 7552}
+      }
+    },
+    {
+      /*Cb  qi=32  INTRA*/
+      {
+        {    6,    3},{   89,  369},{  153,  746},{  193, 1167},
+        {  247, 1577},{  330, 1935},{  429, 2236},{  528, 2494},
+        {  620, 2732},{  712, 2948},{  801, 3146},{  898, 3325},
+        {  999, 3489},{ 1078, 3664},{ 1155, 3832},{ 1251, 3985},
+        { 1360, 4115},{ 1451, 4236},{ 1549, 4338},{ 1667, 4433},
+        { 1797, 4522},{ 1891, 4613},{ 1989, 4687},{ 2162, 4776}
+      },
+      /*Cb  qi=32  INTER*/
+      {
+        {  116,   -1},{   98,  321},{   80,  656},{   80, 1042},
+        {   96, 1438},{  110, 1827},{  122, 2205},{  133, 2570},
+        {  144, 2925},{  157, 3268},{  170, 3597},{  185, 3911},
+        {  202, 4216},{  221, 4516},{  244, 4809},{  273, 5096},
+        {  308, 5376},{  350, 5644},{  401, 5907},{  459, 6160},
+        {  520, 6401},{  592, 6630},{  676, 6837},{  758, 7050}
+      }
+    },
+    {
+      /*Cr  qi=32  INTRA*/
+      {
+        {   12,    7},{   91,  386},{  152,  773},{  201, 1202},
+        {  261, 1603},{  347, 1942},{  447, 2223},{  540, 2460},
+        {  626, 2684},{  711, 2901},{  801, 3115},{  887, 3312},
+        {  969, 3480},{ 1068, 3633},{ 1176, 3779},{ 1283, 3885},
+        { 1392, 3969},{ 1485, 4090},{ 1573, 4206},{ 1686, 4274},
+        { 1813, 4354},{ 1911, 4459},{ 2004, 4563},{ 2162, 4590}
+      },
+      /*Cr  qi=32  INTER*/
+      {
+        {  129,    5},{   98,  334},{   75,  673},{   84, 1055},
+        {  101, 1448},{  113, 1832},{  121, 2206},{  129, 2577},
+        {  140, 2937},{  151, 3282},{  163, 3614},{  179, 3932},
+        {  198, 4240},{  221, 4542},{  252, 4830},{  290, 5102},
+        {  329, 5364},{  373, 5618},{  420, 5864},{  468, 6105},
+        {  513, 6351},{  564, 6587},{  624, 6810},{  697, 7017}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=33  INTRA*/
+      {
+        {  115,   36},{  388, 1338},{  791, 2289},{ 1258, 2899},
+        { 1732, 3352},{ 2220, 3760},{ 2730, 4117},{ 3244, 4415},
+        { 3751, 4662},{ 4261, 4858},{ 4766, 5012},{ 5249, 5094},
+        { 5719, 5141},{ 6159, 5225},{ 6597, 5333},{ 7044, 5416},
+        { 7474, 5472},{ 7893, 5531},{ 8268, 5570},{ 8591, 5580},
+        { 8931, 5578},{ 9283, 5579},{ 9634, 5582},{10067, 5560}
+      },
+      /*Y'  qi=33  INTER*/
+      {
+        {   65,  -14},{  102, 1345},{  190, 2736},{  294, 3999},
+        {  411, 5146},{  597, 6192},{  934, 7045},{ 1488, 7622},
+        { 2281, 7895},{ 3213, 7937},{ 4108, 7871},{ 4883, 7784},
+        { 5556, 7709},{ 6150, 7643},{ 6685, 7585},{ 7176, 7539},
+        { 7620, 7502},{ 8034, 7466},{ 8427, 7435},{ 8793, 7409},
+        { 9136, 7386},{ 9446, 7364},{ 9743, 7339},{10025, 7303}
+      }
+    },
+    {
+      /*Cb  qi=33  INTRA*/
+      {
+        {    5,    3},{   92,  369},{  159,  746},{  203, 1163},
+        {  263, 1564},{  353, 1911},{  458, 2204},{  557, 2460},
+        {  650, 2697},{  744, 2913},{  836, 3110},{  934, 3292},
+        { 1036, 3454},{ 1125, 3616},{ 1204, 3781},{ 1298, 3932},
+        { 1410, 4058},{ 1507, 4170},{ 1606, 4265},{ 1725, 4358},
+        { 1853, 4445},{ 1955, 4535},{ 2067, 4597},{ 2258, 4663}
+      },
+      /*Cb  qi=33  INTER*/
+      {
+        {  109,   37},{   94,  343},{   81,  662},{   85, 1042},
+        {  102, 1436},{  116, 1823},{  128, 2195},{  141, 2554},
+        {  154, 2906},{  167, 3246},{  183, 3570},{  202, 3881},
+        {  220, 4185},{  241, 4482},{  268, 4772},{  302, 5053},
+        {  341, 5328},{  388, 5592},{  446, 5846},{  507, 6096},
+        {  581, 6328},{  670, 6534},{  762, 6731},{  842, 6922}
+      }
+    },
+    {
+      /*Cr  qi=33  INTRA*/
+      {
+        {   11,    7},{   93,  387},{  158,  774},{  211, 1197},
+        {  278, 1589},{  372, 1917},{  475, 2191},{  569, 2429},
+        {  658, 2655},{  744, 2868},{  835, 3083},{  926, 3271},
+        { 1010, 3430},{ 1110, 3586},{ 1224, 3724},{ 1336, 3826},
+        { 1449, 3908},{ 1547, 4021},{ 1636, 4136},{ 1751, 4200},
+        { 1886, 4277},{ 1977, 4384},{ 2070, 4474},{ 2232, 4510}
+      },
+      /*Cr  qi=33  INTER*/
+      {
+        {   77,    9},{   90,  347},{   80,  674},{   91, 1053},
+        {  107, 1444},{  119, 1825},{  127, 2196},{  137, 2563},
+        {  149, 2919},{  161, 3259},{  176, 3588},{  194, 3905},
+        {  217, 4209},{  246, 4504},{  280, 4786},{  320, 5055},
+        {  364, 5316},{  409, 5565},{  460, 5804},{  517, 6039},
+        {  578, 6264},{  640, 6489},{  701, 6721},{  772, 6948}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=34  INTRA*/
+      {
+        {  124,   40},{  401, 1333},{  823, 2262},{ 1318, 2842},
+        { 1823, 3265},{ 2339, 3650},{ 2872, 3991},{ 3405, 4274},
+        { 3926, 4513},{ 4448, 4704},{ 4961, 4845},{ 5450, 4921},
+        { 5925, 4971},{ 6372, 5053},{ 6813, 5160},{ 7264, 5242},
+        { 7704, 5291},{ 8124, 5346},{ 8500, 5382},{ 8831, 5384},
+        { 9178, 5380},{ 9525, 5387},{ 9869, 5389},{10310, 5356}
+      },
+      /*Y'  qi=34  INTER*/
+      {
+        {   64,  -17},{  101, 1344},{  190, 2730},{  299, 3981},
+        {  430, 5110},{  648, 6127},{ 1036, 6933},{ 1664, 7445},
+        { 2535, 7652},{ 3504, 7653},{ 4402, 7572},{ 5173, 7479},
+        { 5843, 7400},{ 6441, 7334},{ 6976, 7280},{ 7464, 7231},
+        { 7910, 7189},{ 8332, 7157},{ 8730, 7125},{ 9091, 7103},
+        { 9422, 7086},{ 9753, 7061},{10067, 7036},{10316, 7029}
+      }
+    },
+    {
+      /*Cb  qi=34  INTRA*/
+      {
+        {    5,    3},{   91,  369},{  158,  746},{  204, 1162},
+        {  266, 1561},{  358, 1903},{  466, 2189},{  570, 2439},
+        {  665, 2671},{  765, 2880},{  864, 3069},{  970, 3238},
+        { 1079, 3392},{ 1174, 3545},{ 1265, 3693},{ 1360, 3841},
+        { 1471, 3968},{ 1572, 4083},{ 1675, 4181},{ 1804, 4255},
+        { 1939, 4332},{ 2048, 4411},{ 2155, 4484},{ 2339, 4584}
+      },
+      /*Cb  qi=34  INTER*/
+      {
+        {   99,   44},{   92,  345},{   82,  661},{   86, 1043},
+        {  101, 1436},{  116, 1821},{  128, 2191},{  140, 2549},
+        {  154, 2898},{  168, 3235},{  185, 3556},{  203, 3865},
+        {  224, 4166},{  248, 4457},{  278, 4741},{  315, 5021},
+        {  361, 5289},{  416, 5546},{  483, 5792},{  559, 6025},
+        {  651, 6237},{  752, 6432},{  849, 6626},{  967, 6790}
+      }
+    },
+    {
+      /*Cr  qi=34  INTRA*/
+      {
+        {   11,    7},{   93,  387},{  158,  773},{  212, 1195},
+        {  282, 1584},{  378, 1909},{  483, 2179},{  578, 2414},
+        {  671, 2633},{  766, 2837},{  866, 3038},{  960, 3223},
+        { 1049, 3376},{ 1158, 3520},{ 1285, 3644},{ 1400, 3740},
+        { 1505, 3828},{ 1616, 3928},{ 1713, 4030},{ 1820, 4104},
+        { 1957, 4185},{ 2063, 4280},{ 2160, 4355},{ 2320, 4341}
+      },
+      /*Cr  qi=34  INTER*/
+      {
+        {   78,   11},{   89,  347},{   79,  674},{   90, 1053},
+        {  106, 1444},{  117, 1823},{  127, 2192},{  137, 2558},
+        {  149, 2912},{  163, 3249},{  178, 3574},{  197, 3888},
+        {  222, 4189},{  252, 4481},{  293, 4755},{  341, 5013},
+        {  386, 5268},{  436, 5512},{  498, 5743},{  563, 5970},
+        {  622, 6200},{  694, 6415},{  776, 6622},{  871, 6818}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=35  INTRA*/
+      {
+        {  116,   51},{  433, 1312},{  881, 2221},{ 1406, 2771},
+        { 1948, 3156},{ 2511, 3501},{ 3085, 3811},{ 3654, 4066},
+        { 4212, 4273},{ 4763, 4444},{ 5298, 4572},{ 5799, 4638},
+        { 6285, 4678},{ 6747, 4746},{ 7203, 4838},{ 7673, 4905},
+        { 8124, 4950},{ 8552, 5003},{ 8938, 5027},{ 9275, 5026},
+        { 9628, 5019},{ 9981, 5024},{10331, 5030},{10795, 5000}
+      },
+      /*Y'  qi=35  INTER*/
+      {
+        {   71,  -10},{  108, 1348},{  203, 2710},{  325, 3938},
+        {  485, 5040},{  766, 6000},{ 1267, 6706},{ 2048, 7089},
+        { 3037, 7191},{ 4032, 7146},{ 4903, 7061},{ 5648, 6977},
+        { 6301, 6912},{ 6884, 6857},{ 7413, 6812},{ 7898, 6775},
+        { 8342, 6739},{ 8764, 6710},{ 9160, 6688},{ 9519, 6668},
+        { 9859, 6646},{10190, 6625},{10492, 6612},{10755, 6595}
+      }
+    },
+    {
+      /*Cb  qi=35  INTRA*/
+      {
+        {    6,    3},{   95,  369},{  164,  746},{  214, 1156},
+        {  287, 1542},{  390, 1869},{  504, 2143},{  611, 2388},
+        {  712, 2613},{  822, 2811},{  937, 2987},{ 1055, 3147},
+        { 1174, 3285},{ 1286, 3420},{ 1386, 3560},{ 1488, 3698},
+        { 1604, 3814},{ 1714, 3916},{ 1825, 4008},{ 1958, 4088},
+        { 2101, 4159},{ 2224, 4226},{ 2339, 4292},{ 2538, 4383}
+      },
+      /*Cb  qi=35  INTER*/
+      {
+        {   98,   41},{   90,  348},{   86,  665},{   92, 1042},
+        {  108, 1432},{  122, 1812},{  136, 2175},{  151, 2528},
+        {  165, 2872},{  182, 3202},{  202, 3516},{  225, 3819},
+        {  251, 4112},{  281, 4398},{  320, 4675},{  367, 4944},
+        {  421, 5204},{  493, 5450},{  579, 5679},{  672, 5892},
+        {  785, 6082},{  906, 6258},{ 1026, 6432},{ 1153, 6592}
+      }
+    },
+    {
+      /*Cr  qi=35  INTRA*/
+      {
+        {   12,    7},{   98,  388},{  166,  773},{  226, 1187},
+        {  306, 1563},{  411, 1874},{  524, 2134},{  622, 2365},
+        {  721, 2577},{  826, 2768},{  947, 2946},{ 1066, 3106},
+        { 1163, 3250},{ 1274, 3395},{ 1417, 3508},{ 1539, 3590},
+        { 1639, 3671},{ 1754, 3765},{ 1865, 3855},{ 1979, 3921},
+        { 2127, 3998},{ 2249, 4085},{ 2346, 4172},{ 2473, 4210}
+      },
+      /*Cr  qi=35  INTER*/
+      {
+        {   86,   12},{   94,  354},{   85,  677},{   96, 1052},
+        {  113, 1439},{  125, 1811},{  135, 2177},{  147, 2537},
+        {  160, 2884},{  177, 3215},{  195, 3535},{  219, 3842},
+        {  252, 4133},{  292, 4413},{  339, 4680},{  396, 4928},
+        {  455, 5169},{  514, 5408},{  588, 5626},{  672, 5835},
+        {  750, 6051},{  837, 6257},{  943, 6442},{ 1073, 6595}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=36  INTRA*/
+      {
+        {  116,   52},{  432, 1312},{  881, 2215},{ 1407, 2759},
+        { 1948, 3140},{ 2511, 3484},{ 3090, 3789},{ 3672, 4036},
+        { 4243, 4236},{ 4803, 4397},{ 5346, 4517},{ 5856, 4581},
+        { 6350, 4614},{ 6821, 4675},{ 7286, 4763},{ 7754, 4832},
+        { 8201, 4875},{ 8631, 4922},{ 9015, 4948},{ 9351, 4945},
+        { 9706, 4941},{10061, 4948},{10408, 4949},{10878, 4923}
+      },
+      /*Y'  qi=36  INTER*/
+      {
+        {   63,  -16},{  114, 1332},{  216, 2690},{  343, 3914},
+        {  515, 5009},{  829, 5939},{ 1399, 6586},{ 2263, 6901},
+        { 3290, 6967},{ 4272, 6920},{ 5115, 6847},{ 5839, 6779},
+        { 6478, 6726},{ 7051, 6685},{ 7571, 6649},{ 8050, 6614},
+        { 8495, 6587},{ 8908, 6567},{ 9298, 6550},{ 9673, 6530},
+        {10005, 6512},{10324, 6499},{10640, 6483},{10936, 6487}
+      }
+    },
+    {
+      /*Cb  qi=36  INTRA*/
+      {
+        {    6,    3},{   98,  370},{  170,  746},{  225, 1150},
+        {  306, 1527},{  416, 1845},{  534, 2116},{  642, 2363},
+        {  743, 2591},{  851, 2794},{  964, 2972},{ 1081, 3133},
+        { 1198, 3275},{ 1311, 3410},{ 1411, 3547},{ 1519, 3680},
+        { 1642, 3789},{ 1750, 3892},{ 1860, 3982},{ 1998, 4054},
+        { 2141, 4129},{ 2256, 4204},{ 2372, 4278},{ 2567, 4356}
+      },
+      /*Cb  qi=36  INTER*/
+      {
+        {  107,   30},{   96,  346},{   88,  667},{  100, 1039},
+        {  115, 1426},{  128, 1804},{  142, 2164},{  158, 2512},
+        {  176, 2851},{  195, 3178},{  218, 3491},{  243, 3791},
+        {  270, 4084},{  307, 4365},{  348, 4638},{  397, 4908},
+        {  464, 5157},{  545, 5392},{  635, 5620},{  734, 5831},
+        {  854, 6015},{  993, 6170},{ 1124, 6327},{ 1234, 6502}
+      }
+    },
+    {
+      /*Cr  qi=36  INTRA*/
+      {
+        {   12,    7},{  102,  388},{  172,  773},{  239, 1182},
+        {  328, 1546},{  439, 1848},{  554, 2106},{  651, 2341},
+        {  747, 2561},{  850, 2757},{  972, 2934},{ 1086, 3097},
+        { 1182, 3245},{ 1302, 3382},{ 1447, 3491},{ 1572, 3567},
+        { 1677, 3641},{ 1793, 3733},{ 1899, 3828},{ 2013, 3894},
+        { 2163, 3967},{ 2283, 4059},{ 2387, 4142},{ 2559, 4145}
+      },
+      /*Cr  qi=36  INTER*/
+      {
+        {   98,  -10},{   96,  347},{   89,  676},{  102, 1048},
+        {  118, 1433},{  130, 1804},{  141, 2167},{  154, 2523},
+        {  171, 2866},{  190, 3194},{  212, 3508},{  240, 3809},
+        {  276, 4099},{  320, 4377},{  372, 4638},{  428, 4887},
+        {  492, 5122},{  560, 5353},{  638, 5572},{  725, 5779},
+        {  814, 5985},{  902, 6192},{ 1013, 6377},{ 1155, 6527}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=37  INTRA*/
+      {
+        {  109,   58},{  445, 1302},{  927, 2177},{ 1489, 2689},
+        { 2053, 3052},{ 2632, 3387},{ 3230, 3683},{ 3830, 3922},
+        { 4417, 4114},{ 4992, 4266},{ 5546, 4375},{ 6067, 4430},
+        { 6571, 4459},{ 7046, 4516},{ 7513, 4599},{ 7991, 4663},
+        { 8445, 4706},{ 8883, 4749},{ 9273, 4771},{ 9612, 4770},
+        { 9970, 4765},{10325, 4773},{10672, 4778},{11106, 4758}
+      },
+      /*Y'  qi=37  INTER*/
+      {
+        {   56,  -14},{  114, 1333},{  218, 2683},{  354, 3894},
+        {  550, 4966},{  916, 5854},{ 1569, 6437},{ 2520, 6685},
+        { 3596, 6704},{ 4585, 6635},{ 5424, 6556},{ 6147, 6489},
+        { 6787, 6437},{ 7358, 6395},{ 7876, 6358},{ 8361, 6325},
+        { 8807, 6294},{ 9229, 6271},{ 9631, 6253},{10002, 6238},
+        {10356, 6228},{10678, 6212},{10975, 6197},{11274, 6185}
+      }
+    },
+    {
+      /*Cb  qi=37  INTRA*/
+      {
+        {    6,    3},{   99,  370},{  171,  746},{  227, 1149},
+        {  309, 1522},{  421, 1836},{  541, 2104},{  652, 2347},
+        {  757, 2572},{  871, 2768},{  989, 2936},{ 1111, 3087},
+        { 1238, 3223},{ 1357, 3352},{ 1465, 3486},{ 1576, 3612},
+        { 1709, 3705},{ 1828, 3801},{ 1937, 3895},{ 2076, 3967},
+        { 2220, 4035},{ 2345, 4104},{ 2466, 4173},{ 2680, 4265}
+      },
+      /*Cb  qi=37  INTER*/
+      {
+        {  111,   27},{   97,  344},{   87,  667},{   99, 1038},
+        {  115, 1425},{  128, 1802},{  143, 2160},{  159, 2506},
+        {  176, 2843},{  198, 3167},{  220, 3477},{  247, 3774},
+        {  280, 4061},{  321, 4338},{  368, 4608},{  427, 4867},
+        {  501, 5109},{  595, 5332},{  701, 5544},{  818, 5738},
+        {  956, 5905},{ 1105, 6066},{ 1248, 6217},{ 1381, 6353}
+      }
+    },
+    {
+      /*Cr  qi=37  INTRA*/
+      {
+        {   12,    7},{  102,  388},{  173,  773},{  242, 1180},
+        {  331, 1541},{  444, 1839},{  562, 2095},{  662, 2326},
+        {  763, 2540},{  871, 2728},{ 1003, 2892},{ 1130, 3045},
+        { 1230, 3188},{ 1350, 3321},{ 1503, 3418},{ 1634, 3492},
+        { 1737, 3568},{ 1856, 3653},{ 1970, 3744},{ 2091, 3802},
+        { 2247, 3871},{ 2371, 3962},{ 2477, 4041},{ 2655, 4052}
+      },
+      /*Cr  qi=37  INTER*/
+      {
+        {   89,   -9},{   97,  347},{   88,  677},{  102, 1048},
+        {  118, 1432},{  130, 1802},{  141, 2163},{  154, 2517},
+        {  172, 2857},{  192, 3181},{  216, 3494},{  246, 3793},
+        {  286, 4074},{  337, 4343},{  395, 4600},{  464, 4837},
+        {  534, 5066},{  608, 5289},{  694, 5501},{  788, 5704},
+        {  893, 5901},{ 1010, 6088},{ 1151, 6249},{ 1331, 6374}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=38  INTRA*/
+      {
+        {  107,   65},{  476, 1286},{  968, 2148},{ 1548, 2641},
+        { 2141, 2979},{ 2757, 3289},{ 3390, 3564},{ 4020, 3784},
+        { 4632, 3957},{ 5224, 4097},{ 5794, 4201},{ 6326, 4250},
+        { 6828, 4274},{ 7309, 4322},{ 7790, 4401},{ 8271, 4463},
+        { 8729, 4498},{ 9165, 4540},{ 9552, 4566},{ 9901, 4560},
+        {10266, 4552},{10617, 4563},{10964, 4572},{11393, 4567}
+      },
+      /*Y'  qi=38  INTER*/
+      {
+        {   57,  -13},{  118, 1332},{  233, 2665},{  386, 3856},
+        {  620, 4899},{ 1070, 5722},{ 1849, 6211},{ 2898, 6384},
+        { 3989, 6376},{ 4947, 6311},{ 5754, 6249},{ 6454, 6199},
+        { 7077, 6161},{ 7640, 6132},{ 8159, 6101},{ 8639, 6076},
+        { 9081, 6054},{ 9502, 6037},{ 9900, 6027},{10274, 6012},
+        {10621, 5999},{10938, 5991},{11237, 5977},{11557, 5966}
+      }
+    },
+    {
+      /*Cb  qi=38  INTRA*/
+      {
+        {    8,    3},{  104,  370},{  179,  744},{  243, 1139},
+        {  338, 1498},{  458, 1801},{  584, 2060},{  700, 2297},
+        {  812, 2514},{  935, 2699},{ 1061, 2858},{ 1189, 3007},
+        { 1321, 3141},{ 1446, 3266},{ 1563, 3388},{ 1684, 3512},
+        { 1816, 3614},{ 1942, 3702},{ 2055, 3793},{ 2201, 3857},
+        { 2357, 3923},{ 2477, 3994},{ 2593, 4061},{ 2768, 4178}
+      },
+      /*Cb  qi=38  INTER*/
+      {
+        {  118,   24},{  102,  342},{   91,  663},{  101, 1040},
+        {  116, 1427},{  131, 1799},{  147, 2152},{  168, 2491},
+        {  191, 2822},{  215, 3139},{  244, 3441},{  276, 3731},
+        {  316, 4013},{  363, 4286},{  423, 4546},{  495, 4795},
+        {  584, 5028},{  691, 5242},{  814, 5439},{  959, 5608},
+        { 1119, 5759},{ 1277, 5906},{ 1449, 6035},{ 1655, 6144}
+      }
+    },
+    {
+      /*Cr  qi=38  INTRA*/
+      {
+        {   12,    6},{  106,  387},{  182,  771},{  261, 1168},
+        {  364, 1514},{  483, 1802},{  603, 2053},{  707, 2282},
+        {  817, 2489},{  933, 2670},{ 1074, 2825},{ 1210, 2967},
+        { 1320, 3104},{ 1444, 3229},{ 1599, 3324},{ 1735, 3396},
+        { 1846, 3464},{ 1971, 3547},{ 2086, 3646},{ 2206, 3711},
+        { 2366, 3773},{ 2499, 3859},{ 2603, 3945},{ 2766, 3952}
+      },
+      /*Cr  qi=38  INTER*/
+      {
+        {   86,   -9},{   91,  352},{   85,  680},{  102, 1053},
+        {  119, 1435},{  132, 1799},{  146, 2153},{  162, 2501},
+        {  183, 2835},{  209, 3154},{  240, 3458},{  278, 3751},
+        {  327, 4025},{  388, 4284},{  455, 4532},{  529, 4766},
+        {  616, 4980},{  711, 5188},{  815, 5386},{  920, 5583},
+        { 1042, 5770},{ 1186, 5936},{ 1348, 6080},{ 1542, 6196}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=39  INTRA*/
+      {
+        {  103,   66},{  479, 1283},{  998, 2125},{ 1610, 2591},
+        { 2223, 2913},{ 2855, 3214},{ 3501, 3482},{ 4146, 3698},
+        { 4772, 3868},{ 5376, 3999},{ 5956, 4095},{ 6496, 4140},
+        { 7008, 4162},{ 7499, 4209},{ 7987, 4282},{ 8478, 4338},
+        { 8947, 4374},{ 9385, 4417},{ 9783, 4437},{10143, 4433},
+        {10504, 4424},{10866, 4435},{11225, 4444},{11665, 4430}
+      },
+      /*Y'  qi=39  INTER*/
+      {
+        {   56,    2},{  118, 1332},{  235, 2660},{  395, 3843},
+        {  653, 4867},{ 1153, 5652},{ 2003, 6089},{ 3113, 6214},
+        { 4228, 6178},{ 5189, 6102},{ 6002, 6031},{ 6707, 5976},
+        { 7336, 5936},{ 7901, 5900},{ 8424, 5870},{ 8915, 5844},
+        { 9361, 5822},{ 9784, 5807},{10187, 5794},{10571, 5778},
+        {10931, 5763},{11264, 5751},{11582, 5742},{11916, 5730}
+      }
+    },
+    {
+      /*Cb  qi=39  INTRA*/
+      {
+        {    8,    3},{  104,  370},{  179,  744},{  244, 1138},
+        {  340, 1496},{  461, 1796},{  588, 2053},{  705, 2288},
+        {  820, 2503},{  945, 2684},{ 1073, 2840},{ 1210, 2981},
+        { 1352, 3106},{ 1480, 3225},{ 1603, 3342},{ 1728, 3464},
+        { 1865, 3559},{ 1990, 3645},{ 2106, 3734},{ 2258, 3796},
+        { 2413, 3856},{ 2540, 3920},{ 2667, 3986},{ 2887, 4060}
+      },
+      /*Cb  qi=39  INTER*/
+      {
+        {  119,   19},{  103,  340},{   90,  664},{  100, 1040},
+        {  115, 1426},{  131, 1797},{  148, 2148},{  169, 2486},
+        {  192, 2816},{  217, 3131},{  247, 3432},{  282, 3721},
+        {  324, 3999},{  374, 4268},{  435, 4526},{  520, 4766},
+        {  621, 4990},{  738, 5194},{  878, 5376},{ 1035, 5543},
+        { 1202, 5686},{ 1374, 5819},{ 1545, 5950},{ 1729, 6064}
+      }
+    },
+    {
+      /*Cr  qi=39  INTRA*/
+      {
+        {   12,    6},{  106,  387},{  182,  771},{  262, 1167},
+        {  365, 1512},{  486, 1798},{  608, 2047},{  713, 2274},
+        {  824, 2479},{  945, 2655},{ 1091, 2804},{ 1231, 2941},
+        { 1346, 3073},{ 1475, 3194},{ 1633, 3282},{ 1778, 3345},
+        { 1891, 3414},{ 2013, 3501},{ 2138, 3584},{ 2266, 3640},
+        { 2428, 3701},{ 2568, 3782},{ 2674, 3863},{ 2816, 3894}
+      },
+      /*Cr  qi=39  INTER*/
+      {
+        {   88,   -7},{   92,  352},{   85,  680},{  102, 1053},
+        {  119, 1434},{  132, 1797},{  146, 2151},{  163, 2498},
+        {  185, 2830},{  211, 3147},{  243, 3451},{  285, 3735},
+        {  337, 4005},{  401, 4260},{  477, 4499},{  565, 4721},
+        {  655, 4937},{  749, 5148},{  858, 5344},{  979, 5529},
+        { 1110, 5710},{ 1264, 5871},{ 1460, 5990},{ 1677, 6086}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=40  INTRA*/
+      {
+        {   98,   71},{  491, 1274},{ 1023, 2103},{ 1641, 2559},
+        { 2257, 2877},{ 2898, 3171},{ 3566, 3429},{ 4233, 3629},
+        { 4881, 3784},{ 5499, 3906},{ 6088, 3997},{ 6631, 4040},
+        { 7145, 4060},{ 7640, 4107},{ 8128, 4178},{ 8618, 4233},
+        { 9077, 4267},{ 9514, 4304},{ 9919, 4324},{10277, 4317},
+        {10635, 4312},{10985, 4324},{11338, 4331},{11792, 4334}
+      },
+      /*Y'  qi=40  INTER*/
+      {
+        {   63,  -26},{  125, 1331},{  256, 2640},{  439, 3801},
+        {  757, 4782},{ 1391, 5474},{ 2399, 5805},{ 3582, 5870},
+        { 4678, 5824},{ 5600, 5763},{ 6386, 5710},{ 7076, 5667},
+        { 7693, 5637},{ 8252, 5610},{ 8775, 5586},{ 9255, 5571},
+        { 9694, 5556},{10115, 5541},{10530, 5530},{10903, 5522},
+        {11242, 5515},{11596, 5501},{11904, 5482},{12205, 5475}
+      }
+    },
+    {
+      /*Cb  qi=40  INTRA*/
+      {
+        {    8,    3},{  108,  371},{  189,  743},{  265, 1128},
+        {  371, 1475},{  499, 1767},{  628, 2022},{  746, 2256},
+        {  864, 2467},{  991, 2647},{ 1124, 2801},{ 1270, 2933},
+        { 1412, 3054},{ 1547, 3165},{ 1677, 3277},{ 1804, 3393},
+        { 1946, 3483},{ 2078, 3569},{ 2201, 3651},{ 2352, 3711},
+        { 2513, 3766},{ 2643, 3826},{ 2775, 3880},{ 3025, 3919}
+      },
+      /*Cb  qi=40  INTER*/
+      {
+        {  114,   35},{  104,  349},{   96,  667},{  106, 1040},
+        {  121, 1423},{  138, 1789},{  158, 2132},{  184, 2464},
+        {  212, 2787},{  242, 3095},{  279, 3389},{  321, 3671},
+        {  374, 3941},{  438, 4199},{  517, 4446},{  617, 4673},
+        {  740, 4881},{  891, 5064},{ 1058, 5225},{ 1239, 5372},
+        { 1441, 5499},{ 1638, 5610},{ 1840, 5719},{ 2076, 5814}
+      }
+    },
+    {
+      /*Cr  qi=40  INTRA*/
+      {
+        {   14,    7},{  114,  389},{  193,  771},{  283, 1156},
+        {  399, 1488},{  523, 1768},{  643, 2018},{  752, 2245},
+        {  865, 2450},{  984, 2626},{ 1139, 2763},{ 1290, 2887},
+        { 1413, 3014},{ 1550, 3128},{ 1711, 3211},{ 1865, 3268},
+        { 1981, 3334},{ 2103, 3415},{ 2237, 3486},{ 2365, 3543},
+        { 2529, 3610},{ 2666, 3700},{ 2775, 3779},{ 2929, 3803}
+      },
+      /*Cr  qi=40  INTER*/
+      {
+        {   89,   -8},{   95,  353},{   90,  681},{  107, 1053},
+        {  124, 1430},{  139, 1787},{  156, 2136},{  177, 2477},
+        {  203, 2803},{  237, 3112},{  276, 3406},{  329, 3683},
+        {  395, 3942},{  475, 4182},{  567, 4407},{  665, 4624},
+        {  767, 4834},{  879, 5032},{ 1011, 5213},{ 1169, 5375},
+        { 1348, 5525},{ 1547, 5654},{ 1785, 5743},{ 2066, 5787}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=41  INTRA*/
+      {
+        {   98,   71},{  495, 1272},{ 1040, 2090},{ 1675, 2533},
+        { 2302, 2842},{ 2953, 3132},{ 3631, 3381},{ 4309, 3574},
+        { 4966, 3726},{ 5593, 3846},{ 6189, 3934},{ 6738, 3972},
+        { 7256, 3991},{ 7754, 4036},{ 8250, 4099},{ 8747, 4150},
+        { 9207, 4185},{ 9650, 4222},{10057, 4242},{10411, 4237},
+        {10771, 4230},{11127, 4244},{11486, 4254},{11933, 4252}
+      },
+      /*Y'  qi=41  INTER*/
+      {
+        {   65,  -25},{  125, 1331},{  260, 2633},{  457, 3782},
+        {  807, 4740},{ 1499, 5397},{ 2562, 5693},{ 3766, 5743},
+        { 4859, 5695},{ 5776, 5638},{ 6556, 5590},{ 7243, 5554},
+        { 7859, 5529},{ 8417, 5506},{ 8935, 5486},{ 9419, 5473},
+        { 9869, 5460},{10296, 5446},{10711, 5436},{11089, 5430},
+        {11445, 5421},{11802, 5412},{12129, 5404},{12465, 5393}
+      }
+    },
+    {
+      /*Cb  qi=41  INTRA*/
+      {
+        {    8,    3},{  108,  371},{  189,  743},{  267, 1126},
+        {  374, 1471},{  504, 1760},{  635, 2011},{  758, 2241},
+        {  881, 2447},{ 1013, 2621},{ 1147, 2773},{ 1293, 2906},
+        { 1441, 3023},{ 1580, 3131},{ 1712, 3243},{ 1844, 3360},
+        { 1985, 3451},{ 2114, 3532},{ 2240, 3613},{ 2390, 3680},
+        { 2550, 3740},{ 2687, 3800},{ 2825, 3862},{ 3052, 3944}
+      },
+      /*Cb  qi=41  INTER*/
+      {
+        {  104,   39},{  100,  350},{   95,  667},{  105, 1040},
+        {  121, 1422},{  137, 1787},{  159, 2129},{  185, 2459},
+        {  216, 2778},{  249, 3083},{  287, 3374},{  335, 3653},
+        {  393, 3920},{  462, 4175},{  549, 4414},{  660, 4636},
+        {  791, 4839},{  952, 5014},{ 1135, 5166},{ 1337, 5297},
+        { 1552, 5411},{ 1752, 5530},{ 1972, 5634},{ 2224, 5724}
+      }
+    },
+    {
+      /*Cr  qi=41  INTRA*/
+      {
+        {   15,    7},{  115,  389},{  193,  770},{  284, 1154},
+        {  401, 1484},{  528, 1761},{  652, 2005},{  764, 2228},
+        {  882, 2427},{ 1008, 2599},{ 1167, 2734},{ 1320, 2859},
+        { 1443, 2990},{ 1580, 3103},{ 1743, 3181},{ 1894, 3241},
+        { 2012, 3309},{ 2141, 3385},{ 2272, 3459},{ 2398, 3519},
+        { 2566, 3584},{ 2707, 3680},{ 2816, 3762},{ 2991, 3770}
+      },
+      /*Cr  qi=41  INTER*/
+      {
+        {   92,   -9},{   98,  354},{   90,  682},{  107, 1052},
+        {  124, 1429},{  139, 1786},{  156, 2132},{  178, 2471},
+        {  207, 2794},{  241, 3100},{  285, 3391},{  345, 3662},
+        {  417, 3915},{  503, 4151},{  600, 4375},{  703, 4589},
+        {  815, 4791},{  942, 4981},{ 1088, 5155},{ 1250, 5316},
+        { 1432, 5462},{ 1653, 5575},{ 1930, 5639},{ 2250, 5655}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=42  INTRA*/
+      {
+        {  109,   75},{  534, 1257},{ 1114, 2047},{ 1793, 2456},
+        { 2461, 2735},{ 3157, 2994},{ 3879, 3221},{ 4595, 3396},
+        { 5282, 3531},{ 5931, 3638},{ 6546, 3714},{ 7105, 3749},
+        { 7633, 3766},{ 8147, 3803},{ 8652, 3865},{ 9148, 3915},
+        { 9613, 3946},{10075, 3976},{10489, 3997},{10835, 3994},
+        {11195, 3985},{11553, 3997},{11909, 4004},{12369, 3990}
+      },
+      /*Y'  qi=42  INTER*/
+      {
+        {   69,  -23},{  134, 1332},{  287, 2611},{  521, 3730},
+        {  970, 4624},{ 1827, 5176},{ 3028, 5382},{ 4262, 5389},
+        { 5325, 5338},{ 6214, 5291},{ 6976, 5255},{ 7651, 5228},
+        { 8260, 5206},{ 8821, 5190},{ 9343, 5177},{ 9823, 5165},
+        {10273, 5152},{10709, 5143},{11121, 5136},{11502, 5129},
+        {11857, 5125},{12193, 5115},{12520, 5107},{12802, 5097}
+      }
+    },
+    {
+      /*Cb  qi=42  INTRA*/
+      {
+        {    9,    3},{  113,  371},{  199,  743},{  279, 1123},
+        {  390, 1462},{  525, 1743},{  662, 1986},{  789, 2208},
+        {  916, 2406},{ 1057, 2571},{ 1204, 2712},{ 1362, 2835},
+        { 1524, 2943},{ 1676, 3040},{ 1815, 3145},{ 1959, 3249},
+        { 2117, 3325},{ 2249, 3406},{ 2377, 3488},{ 2537, 3547},
+        { 2706, 3597},{ 2854, 3646},{ 2999, 3705},{ 3236, 3759}
+      },
+      /*Cb  qi=42  INTER*/
+      {
+        {  114,   44},{  107,  353},{  101,  670},{  111, 1041},
+        {  129, 1418},{  148, 1775},{  174, 2110},{  208, 2432},
+        {  244, 2746},{  283, 3046},{  330, 3330},{  388, 3602},
+        {  460, 3858},{  546, 4101},{  655, 4326},{  793, 4530},
+        {  966, 4703},{ 1165, 4851},{ 1388, 4980},{ 1630, 5088},
+        { 1869, 5189},{ 2122, 5268},{ 2403, 5328},{ 2667, 5417}
+      }
+    },
+    {
+      /*Cr  qi=42  INTRA*/
+      {
+        {   15,    7},{  120,  390},{  202,  771},{  298, 1150},
+        {  421, 1473},{  553, 1743},{  681, 1982},{  796, 2199},
+        {  923, 2388},{ 1062, 2547},{ 1225, 2678},{ 1392, 2792},
+        { 1531, 2907},{ 1682, 3007},{ 1856, 3074},{ 2009, 3134},
+        { 2138, 3192},{ 2274, 3257},{ 2407, 3333},{ 2536, 3393},
+        { 2711, 3455},{ 2875, 3531},{ 3000, 3598},{ 3186, 3599}
+      },
+      /*Cr  qi=42  INTER*/
+      {
+        {   87,   -4},{   95,  358},{   97,  683},{  113, 1052},
+        {  131, 1423},{  148, 1774},{  170, 2116},{  198, 2448},
+        {  234, 2762},{  276, 3062},{  331, 3343},{  404, 3603},
+        {  494, 3844},{  598, 4067},{  715, 4276},{  842, 4471},
+        {  977, 4661},{ 1128, 4840},{ 1311, 4991},{ 1516, 5127},
+        { 1759, 5233},{ 2050, 5300},{ 2377, 5323},{ 2710, 5304}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=43  INTRA*/
+      {
+        {   99,   79},{  557, 1244},{ 1175, 2016},{ 1882, 2408},
+        { 2570, 2677},{ 3288, 2926},{ 4030, 3141},{ 4760, 3307},
+        { 5458, 3435},{ 6115, 3537},{ 6743, 3608},{ 7312, 3636},
+        { 7841, 3652},{ 8357, 3687},{ 8870, 3742},{ 9376, 3788},
+        { 9850, 3821},{10315, 3853},{10734, 3873},{11084, 3870},
+        {11442, 3862},{11800, 3874},{12160, 3879},{12618, 3876}
+      },
+      /*Y'  qi=43  INTER*/
+      {
+        {   69,  -22},{  134, 1331},{  294, 2601},{  551, 3703},
+        { 1056, 4563},{ 2003, 5061},{ 3276, 5215},{ 4534, 5194},
+        { 5599, 5133},{ 6488, 5083},{ 7257, 5044},{ 7938, 5014},
+        { 8556, 4992},{ 9124, 4975},{ 9648, 4960},{10138, 4948},
+        {10594, 4939},{11039, 4926},{11462, 4919},{11847, 4912},
+        {12216, 4904},{12570, 4896},{12883, 4889},{13189, 4879}
+      }
+    },
+    {
+      /*Cb  qi=43  INTRA*/
+      {
+        {    9,    3},{  114,  371},{  202,  740},{  294, 1110},
+        {  417, 1440},{  558, 1716},{  700, 1956},{  833, 2172},
+        {  966, 2365},{ 1116, 2524},{ 1269, 2661},{ 1431, 2781},
+        { 1599, 2885},{ 1756, 2980},{ 1902, 3082},{ 2051, 3185},
+        { 2209, 3261},{ 2337, 3342},{ 2464, 3420},{ 2633, 3475},
+        { 2809, 3525},{ 2948, 3579},{ 3094, 3633},{ 3347, 3678}
+      },
+      /*Cb  qi=43  INTER*/
+      {
+        {  111,   44},{  106,  353},{  102,  670},{  112, 1040},
+        {  128, 1416},{  148, 1771},{  176, 2104},{  211, 2424},
+        {  250, 2734},{  293, 3030},{  347, 3309},{  411, 3575},
+        {  490, 3828},{  589, 4064},{  716, 4278},{  869, 4472},
+        { 1050, 4640},{ 1264, 4781},{ 1512, 4895},{ 1775, 4991},
+        { 2042, 5069},{ 2310, 5141},{ 2593, 5207},{ 2912, 5239}
+      }
+    },
+    {
+      /*Cr  qi=43  INTRA*/
+      {
+        {   15,    7},{  121,  390},{  208,  767},{  315, 1135},
+        {  449, 1449},{  586, 1715},{  718, 1950},{  843, 2158},
+        {  977, 2342},{ 1120, 2501},{ 1290, 2632},{ 1466, 2739},
+        { 1613, 2845},{ 1763, 2945},{ 1937, 3015},{ 2093, 3070},
+        { 2225, 3126},{ 2366, 3194},{ 2501, 3267},{ 2634, 3324},
+        { 2815, 3385},{ 2964, 3466},{ 3087, 3538},{ 3263, 3555}
+      },
+      /*Cr  qi=43  INTER*/
+      {
+        {   84,   -4},{   93,  358},{   95,  683},{  113, 1052},
+        {  131, 1421},{  148, 1770},{  171, 2110},{  201, 2439},
+        {  240, 2750},{  287, 3046},{  348, 3322},{  429, 3576},
+        {  527, 3811},{  641, 4029},{  767, 4230},{  904, 4422},
+        { 1053, 4603},{ 1225, 4765},{ 1433, 4903},{ 1661, 5030},
+        { 1928, 5121},{ 2252, 5160},{ 2604, 5164},{ 2979, 5125}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=44  INTRA*/
+      {
+        {  103,   80},{  560, 1244},{ 1183, 2009},{ 1891, 2391},
+        { 2586, 2649},{ 3324, 2884},{ 4093, 3089},{ 4850, 3243},
+        { 5575, 3358},{ 6252, 3452},{ 6886, 3518},{ 7459, 3546},
+        { 7993, 3562},{ 8515, 3594},{ 9030, 3645},{ 9534, 3691},
+        {10004, 3723},{10469, 3750},{10887, 3765},{11236, 3766},
+        {11596, 3762},{11960, 3775},{12317, 3784},{12766, 3789}
+      },
+      /*Y'  qi=44  INTER*/
+      {
+        {   77,  -24},{  145, 1332},{  332, 2580},{  642, 3649},
+        { 1270, 4438},{ 2360, 4860},{ 3685, 4982},{ 4910, 4966},
+        { 5929, 4928},{ 6785, 4900},{ 7529, 4880},{ 8198, 4863},
+        { 8804, 4850},{ 9361, 4842},{ 9882, 4836},{10371, 4830},
+        {10827, 4822},{11262, 4816},{11672, 4811},{12052, 4807},
+        {12431, 4806},{12780, 4798},{13095, 4792},{13401, 4791}
+      }
+    },
+    {
+      /*Cb  qi=44  INTRA*/
+      {
+        {    9,    2},{  122,  371},{  214,  741},{  307, 1109},
+        {  433, 1432},{  576, 1704},{  718, 1939},{  855, 2152},
+        {  991, 2340},{ 1141, 2497},{ 1298, 2632},{ 1463, 2749},
+        { 1636, 2851},{ 1796, 2944},{ 1947, 3041},{ 2101, 3140},
+        { 2260, 3219},{ 2392, 3297},{ 2527, 3366},{ 2693, 3424},
+        { 2872, 3477},{ 3025, 3525},{ 3175, 3584},{ 3451, 3626}
+      },
+      /*Cb  qi=44  INTER*/
+      {
+        {  111,   14},{  110,  339},{  109,  671},{  120, 1040},
+        {  139, 1410},{  162, 1758},{  197, 2084},{  243, 2397},
+        {  291, 2702},{  342, 2992},{  405, 3265},{  484, 3521},
+        {  584, 3760},{  705, 3983},{  855, 4185},{ 1048, 4356},
+        { 1274, 4500},{ 1531, 4617},{ 1816, 4707},{ 2111, 4783},
+        { 2409, 4846},{ 2720, 4901},{ 3044, 4957},{ 3391, 4985}
+      }
+    },
+    {
+      /*Cr  qi=44  INTRA*/
+      {
+        {   17,    7},{  128,  392},{  219,  770},{  329, 1135},
+        {  465, 1442},{  601, 1703},{  734, 1935},{  862, 2142},
+        {  998, 2325},{ 1147, 2482},{ 1321, 2606},{ 1496, 2710},
+        { 1649, 2813},{ 1809, 2908},{ 1984, 2977},{ 2143, 3032},
+        { 2279, 3087},{ 2423, 3152},{ 2559, 3225},{ 2684, 3288},
+        { 2866, 3351},{ 3025, 3426},{ 3161, 3492},{ 3372, 3500}
+      },
+      /*Cr  qi=44  INTER*/
+      {
+        {   89,    0},{  101,  352},{  104,  683},{  121, 1051},
+        {  141, 1414},{  163, 1757},{  192, 2092},{  231, 2415},
+        {  278, 2720},{  336, 3007},{  412, 3273},{  510, 3516},
+        {  633, 3733},{  769, 3936},{  914, 4130},{ 1076, 4307},
+        { 1256, 4472},{ 1469, 4617},{ 1723, 4732},{ 2012, 4822},
+        { 2347, 4871},{ 2716, 4875},{ 3082, 4866},{ 3422, 4826}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=45  INTRA*/
+      {
+        {  119,   78},{  610, 1226},{ 1271, 1965},{ 2026, 2319},
+        { 2768, 2550},{ 3556, 2757},{ 4369, 2938},{ 5157, 3076},
+        { 5901, 3182},{ 6598, 3268},{ 7253, 3326},{ 7844, 3343},
+        { 8392, 3356},{ 8922, 3386},{ 9453, 3433},{ 9973, 3474},
+        {10457, 3503},{10929, 3530},{11351, 3543},{11709, 3541},
+        {12068, 3537},{12434, 3547},{12805, 3555},{13268, 3563}
+      },
+      /*Y'  qi=45  INTER*/
+      {
+        {   77,  -20},{  146, 1330},{  342, 2566},{  699, 3604},
+        { 1439, 4332},{ 2669, 4672},{ 4075, 4727},{ 5318, 4679},
+        { 6345, 4630},{ 7209, 4595},{ 7963, 4570},{ 8644, 4551},
+        { 9262, 4535},{ 9831, 4525},{10370, 4515},{10872, 4506},
+        {11334, 4500},{11783, 4492},{12219, 4489},{12617, 4483},
+        {12995, 4477},{13350, 4472},{13674, 4466},{13968, 4468}
+      }
+    },
+    {
+      /*Cb  qi=45  INTRA*/
+      {
+        {    9,    2},{  122,  370},{  219,  735},{  324, 1096},
+        {  465, 1414},{  619, 1679},{  771, 1905},{  920, 2103},
+        { 1070, 2276},{ 1236, 2419},{ 1410, 2539},{ 1595, 2644},
+        { 1784, 2736},{ 1949, 2831},{ 2104, 2931},{ 2275, 3021},
+        { 2443, 3092},{ 2586, 3166},{ 2735, 3234},{ 2904, 3288},
+        { 3093, 3338},{ 3262, 3382},{ 3419, 3427},{ 3708, 3456}
+      },
+      /*Cb  qi=45  INTER*/
+      {
+        {  103,    0},{  109,  339},{  109,  670},{  119, 1039},
+        {  137, 1408},{  162, 1754},{  199, 2076},{  248, 2386},
+        {  301, 2684},{  360, 2967},{  433, 3234},{  525, 3481},
+        {  640, 3713},{  780, 3924},{  956, 4110},{ 1176, 4266},
+        { 1438, 4390},{ 1736, 4481},{ 2057, 4553},{ 2385, 4613},
+        { 2718, 4656},{ 3056, 4698},{ 3416, 4733},{ 3799, 4755}
+      }
+    },
+    {
+      /*Cr  qi=45  INTRA*/
+      {
+        {   16,    7},{  128,  391},{  225,  763},{  350, 1120},
+        {  500, 1420},{  649, 1673},{  792, 1893},{  929, 2089},
+        { 1084, 2257},{ 1250, 2401},{ 1440, 2518},{ 1633, 2614},
+        { 1799, 2708},{ 1968, 2798},{ 2151, 2863},{ 2314, 2914},
+        { 2453, 2968},{ 2611, 3025},{ 2759, 3095},{ 2887, 3160},
+        { 3082, 3210},{ 3259, 3278},{ 3403, 3342},{ 3593, 3354}
+      },
+      /*Cr  qi=45  INTER*/
+      {
+        {   92,    0},{  101,  352},{  103,  682},{  120, 1049},
+        {  140, 1412},{  163, 1752},{  193, 2083},{  234, 2402},
+        {  287, 2702},{  353, 2983},{  442, 3240},{  557, 3471},
+        {  694, 3680},{  846, 3873},{ 1014, 4056},{ 1200, 4224},
+        { 1414, 4369},{ 1664, 4495},{ 1946, 4595},{ 2278, 4654},
+        { 2654, 4673},{ 3047, 4658},{ 3438, 4627},{ 3825, 4585}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=46  INTRA*/
+      {
+        {  119,   78},{  610, 1227},{ 1277, 1960},{ 2043, 2309},
+        { 2805, 2529},{ 3618, 2719},{ 4452, 2887},{ 5257, 3016},
+        { 6017, 3115},{ 6727, 3195},{ 7392, 3248},{ 7984, 3267},
+        { 8528, 3281},{ 9059, 3310},{ 9593, 3354},{10119, 3395},
+        {10599, 3425},{11064, 3450},{11493, 3464},{11850, 3466},
+        {12207, 3462},{12578, 3471},{12948, 3480},{13407, 3487}
+      },
+      /*Y'  qi=46  INTER*/
+      {
+        {   74,  -14},{  149, 1326},{  382, 2538},{  807, 3541},
+        { 1670, 4211},{ 3000, 4499},{ 4416, 4533},{ 5628, 4490},
+        { 6628, 4453},{ 7479, 4425},{ 8228, 4406},{ 8902, 4393},
+        { 9521, 4380},{10090, 4371},{10623, 4364},{11124, 4356},
+        {11586, 4351},{12043, 4344},{12476, 4341},{12863, 4340},
+        {13244, 4337},{13610, 4329},{13936, 4324},{14246, 4329}
+      }
+    },
+    {
+      /*Cb  qi=46  INTRA*/
+      {
+        {   11,    2},{  132,  371},{  234,  737},{  340, 1094},
+        {  481, 1405},{  637, 1667},{  791, 1891},{  944, 2084},
+        { 1099, 2253},{ 1268, 2392},{ 1444, 2507},{ 1633, 2610},
+        { 1825, 2700},{ 1990, 2794},{ 2147, 2895},{ 2321, 2984},
+        { 2493, 3053},{ 2640, 3126},{ 2787, 3198},{ 2954, 3253},
+        { 3146, 3297},{ 3313, 3344},{ 3473, 3393},{ 3757, 3434}
+      },
+      /*Cb  qi=46  INTER*/
+      {
+        {   97,    0},{  109,  339},{  108,  669},{  120, 1035},
+        {  142, 1398},{  173, 1737},{  221, 2052},{  281, 2353},
+        {  345, 2646},{  415, 2924},{  504, 3183},{  616, 3421},
+        {  749, 3643},{  914, 3842},{ 1123, 4012},{ 1379, 4150},
+        { 1685, 4250},{ 2014, 4327},{ 2366, 4382},{ 2731, 4426},
+        { 3083, 4470},{ 3445, 4490},{ 3805, 4511},{ 4146, 4539}
+      }
+    },
+    {
+      /*Cr  qi=46  INTRA*/
+      {
+        {   19,    7},{  137,  393},{  237,  765},{  364, 1116},
+        {  516, 1411},{  665, 1662},{  809, 1880},{  951, 2072},
+        { 1109, 2236},{ 1278, 2378},{ 1474, 2491},{ 1669, 2584},
+        { 1835, 2678},{ 2014, 2766},{ 2203, 2828},{ 2366, 2880},
+        { 2506, 2933},{ 2661, 2988},{ 2810, 3053},{ 2941, 3116},
+        { 3131, 3175},{ 3310, 3243},{ 3461, 3303},{ 3656, 3321}
+      },
+      /*Cr  qi=46  INTER*/
+      {
+        {   91,    1},{  103,  351},{  104,  681},{  121, 1046},
+        {  144, 1401},{  173, 1736},{  213, 2060},{  265, 2373},
+        {  330, 2666},{  410, 2938},{  517, 3185},{  655, 3404},
+        {  815, 3601},{  989, 3784},{ 1183, 3951},{ 1400, 4104},
+        { 1649, 4241},{ 1933, 4352},{ 2261, 4427},{ 2646, 4458},
+        { 3057, 4446},{ 3453, 4418},{ 3820, 4385},{ 4171, 4352}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=47  INTRA*/
+      {
+        {  117,   83},{  670, 1205},{ 1408, 1904},{ 2239, 2219},
+        { 3049, 2414},{ 3905, 2584},{ 4775, 2734},{ 5610, 2852},
+        { 6393, 2944},{ 7121, 3017},{ 7804, 3066},{ 8407, 3081},
+        { 8957, 3093},{ 9498, 3119},{10043, 3160},{10582, 3199},
+        {11083, 3226},{11561, 3250},{11993, 3263},{12352, 3264},
+        {12711, 3259},{13092, 3266},{13463, 3271},{13918, 3275}
+      },
+      /*Y'  qi=47  INTER*/
+      {
+        {   74,  -11},{  148, 1325},{  404, 2518},{  910, 3478},
+        { 1916, 4080},{ 3369, 4298},{ 4823, 4292},{ 6035, 4238},
+        { 7037, 4197},{ 7894, 4168},{ 8650, 4146},{ 9337, 4129},
+        { 9968, 4116},{10549, 4105},{11096, 4096},{11605, 4089},
+        {12081, 4083},{12547, 4076},{12990, 4070},{13399, 4070},
+        {13776, 4065},{14133, 4059},{14486, 4057},{14842, 4053}
+      }
+    },
+    {
+      /*Cb  qi=47  INTRA*/
+      {
+        {   11,    2},{  133,  370},{  242,  731},{  367, 1077},
+        {  524, 1378},{  692, 1630},{  860, 1844},{ 1028, 2024},
+        { 1203, 2178},{ 1393, 2305},{ 1582, 2413},{ 1787, 2507},
+        { 1992, 2590},{ 2175, 2676},{ 2351, 2767},{ 2534, 2851},
+        { 2707, 2923},{ 2862, 2994},{ 3021, 3060},{ 3193, 3111},
+        { 3396, 3147},{ 3573, 3184},{ 3752, 3220},{ 4038, 3255}
+      },
+      /*Cb  qi=47  INTER*/
+      {
+        {  101,    0},{  107,  339},{  108,  667},{  120, 1033},
+        {  142, 1394},{  175, 1729},{  227, 2040},{  295, 2335},
+        {  369, 2619},{  452, 2888},{  556, 3138},{  686, 3368},
+        {  850, 3574},{ 1050, 3758},{ 1299, 3910},{ 1605, 4024},
+        { 1950, 4104},{ 2317, 4163},{ 2689, 4210},{ 3077, 4239},
+        { 3466, 4258},{ 3840, 4278},{ 4205, 4298},{ 4515, 4340}
+      }
+    },
+    {
+      /*Cr  qi=47  INTRA*/
+      {
+        {   19,    7},{  138,  392},{  248,  758},{  396, 1094},
+        {  563, 1378},{  723, 1621},{  881, 1829},{ 1037, 2011},
+        { 1214, 2165},{ 1410, 2290},{ 1623, 2393},{ 1834, 2480},
+        { 2016, 2564},{ 2203, 2647},{ 2405, 2707},{ 2569, 2757},
+        { 2709, 2810},{ 2871, 2860},{ 3027, 2924},{ 3178, 2980},
+        { 3375, 3034},{ 3563, 3097},{ 3724, 3151},{ 3952, 3153}
+      },
+      /*Cr  qi=47  INTER*/
+      {
+        {   91,    1},{  100,  351},{  102,  681},{  120, 1043},
+        {  144, 1397},{  175, 1729},{  219, 2049},{  277, 2356},
+        {  353, 2640},{  451, 2902},{  579, 3136},{  739, 3342},
+        {  926, 3525},{ 1125, 3698},{ 1343, 3859},{ 1595, 3998},
+        { 1881, 4113},{ 2208, 4205},{ 2589, 4253},{ 3014, 4250},
+        { 3444, 4220},{ 3838, 4183},{ 4196, 4147},{ 4521, 4116}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=48  INTRA*/
+      {
+        {  107,   87},{  681, 1200},{ 1456, 1883},{ 2306, 2193},
+        { 3122, 2386},{ 3984, 2548},{ 4862, 2693},{ 5704, 2808},
+        { 6495, 2899},{ 7232, 2970},{ 7915, 3018},{ 8524, 3034},
+        { 9085, 3043},{ 9635, 3068},{10192, 3108},{10735, 3145},
+        {11237, 3171},{11719, 3194},{12153, 3207},{12516, 3206},
+        {12888, 3202},{13266, 3210},{13637, 3218},{14101, 3219}
+      },
+      /*Y'  qi=48  INTER*/
+      {
+        {   83,  -18},{  147, 1328},{  398, 2519},{  923, 3468},
+        { 1979, 4047},{ 3472, 4246},{ 4936, 4232},{ 6148, 4178},
+        { 7150, 4139},{ 8007, 4111},{ 8765, 4091},{ 9458, 4076},
+        {10090, 4063},{10676, 4054},{11226, 4045},{11742, 4038},
+        {12223, 4033},{12686, 4029},{13127, 4022},{13527, 4015},
+        {13915, 4012},{14277, 4007},{14619, 4004},{14966, 4001}
+      }
+    },
+    {
+      /*Cb  qi=48  INTRA*/
+      {
+        {   11,    2},{  134,  369},{  245,  730},{  373, 1075},
+        {  531, 1374},{  698, 1625},{  865, 1839},{ 1033, 2019},
+        { 1207, 2173},{ 1397, 2300},{ 1588, 2408},{ 1795, 2501},
+        { 2003, 2581},{ 2187, 2666},{ 2362, 2757},{ 2548, 2841},
+        { 2719, 2912},{ 2876, 2983},{ 3034, 3047},{ 3209, 3097},
+        { 3409, 3137},{ 3589, 3178},{ 3762, 3216},{ 4004, 3252}
+      },
+      /*Cb  qi=48  INTER*/
+      {
+        {  113,   26},{  112,  344},{  111,  668},{  120, 1032},
+        {  141, 1392},{  173, 1727},{  224, 2036},{  290, 2330},
+        {  363, 2612},{  447, 2880},{  551, 3130},{  685, 3358},
+        {  852, 3563},{ 1061, 3742},{ 1332, 3884},{ 1654, 3993},
+        { 2011, 4068},{ 2394, 4120},{ 2782, 4160},{ 3172, 4186},
+        { 3557, 4209},{ 3932, 4228},{ 4306, 4237},{ 4675, 4236}
+      }
+    },
+    {
+      /*Cr  qi=48  INTRA*/
+      {
+        {   18,    7},{  139,  389},{  252,  755},{  404, 1090},
+        {  573, 1372},{  732, 1615},{  889, 1823},{ 1045, 2005},
+        { 1222, 2159},{ 1417, 2285},{ 1631, 2387},{ 1843, 2474},
+        { 2027, 2558},{ 2212, 2639},{ 2413, 2697},{ 2578, 2746},
+        { 2720, 2798},{ 2887, 2852},{ 3040, 2913},{ 3181, 2970},
+        { 3381, 3024},{ 3581, 3081},{ 3743, 3130},{ 3948, 3133}
+      },
+      /*Cr  qi=48  INTER*/
+      {
+        {   89,    0},{  106,  352},{  105,  682},{  120, 1044},
+        {  144, 1395},{  174, 1724},{  215, 2044},{  270, 2350},
+        {  343, 2635},{  441, 2895},{  571, 3129},{  735, 3334},
+        {  926, 3518},{ 1139, 3684},{ 1371, 3836},{ 1628, 3977},
+        { 1933, 4089},{ 2279, 4164},{ 2672, 4204},{ 3105, 4205},
+        { 3533, 4176},{ 3931, 4135},{ 4290, 4089},{ 4624, 4057}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=49  INTRA*/
+      {
+        {  120,   85},{  706, 1194},{ 1485, 1875},{ 2348, 2187},
+        { 3190, 2372},{ 4076, 2521},{ 4967, 2658},{ 5819, 2771},
+        { 6611, 2861},{ 7345, 2936},{ 8026, 2990},{ 8626, 3013},
+        { 9182, 3030},{ 9723, 3059},{10266, 3100},{10802, 3143},
+        {11293, 3179},{11768, 3206},{12201, 3221},{12556, 3225},
+        {12914, 3226},{13281, 3237},{13639, 3247},{14089, 3257}
+      },
+      /*Y'  qi=49  INTER*/
+      {
+        {   72,  -11},{  155, 1320},{  458, 2485},{ 1090, 3386},
+        { 2284, 3907},{ 3835, 4075},{ 5272, 4064},{ 6449, 4026},
+        { 7426, 4003},{ 8267, 3987},{ 9017, 3976},{ 9698, 3967},
+        {10328, 3962},{10913, 3959},{11452, 3954},{11961, 3950},
+        {12442, 3947},{12904, 3946},{13347, 3945},{13749, 3943},
+        {14123, 3941},{14490, 3941},{14826, 3939},{15153, 3937}
+      }
+    },
+    {
+      /*Cb  qi=49  INTRA*/
+      {
+        {   11,    2},{  145,  369},{  262,  729},{  393, 1070},
+        {  557, 1363},{  731, 1607},{  907, 1811},{ 1085, 1983},
+        { 1268, 2130},{ 1465, 2251},{ 1658, 2359},{ 1868, 2454},
+        { 2079, 2534},{ 2264, 2621},{ 2440, 2717},{ 2625, 2802},
+        { 2792, 2878},{ 2945, 2954},{ 3106, 3021},{ 3277, 3075},
+        { 3466, 3119},{ 3638, 3170},{ 3824, 3213},{ 4100, 3243}
+      },
+      /*Cb  qi=49  INTER*/
+      {
+        {   98,   -6},{  113,  343},{  110,  669},{  122, 1029},
+        {  149, 1380},{  192, 1706},{  258, 2007},{  340, 2293},
+        {  426, 2569},{  525, 2831},{  653, 3071},{  814, 3287},
+        { 1013, 3478},{ 1262, 3637},{ 1575, 3761},{ 1936, 3851},
+        { 2328, 3910},{ 2741, 3949},{ 3163, 3970},{ 3559, 3994},
+        { 3936, 4025},{ 4300, 4050},{ 4655, 4060},{ 4962, 4062}
+      }
+    },
+    {
+      /*Cr  qi=49  INTRA*/
+      {
+        {   19,    7},{  151,  389},{  270,  753},{  427, 1084},
+        {  602, 1360},{  767, 1595},{  933, 1794},{ 1098, 1968},
+        { 1285, 2115},{ 1489, 2237},{ 1699, 2342},{ 1912, 2435},
+        { 2101, 2519},{ 2288, 2601},{ 2486, 2663},{ 2651, 2715},
+        { 2799, 2769},{ 2958, 2825},{ 3106, 2890},{ 3257, 2948},
+        { 3452, 3007},{ 3634, 3075},{ 3786, 3136},{ 3959, 3164}
+      },
+      /*Cr  qi=49  INTER*/
+      {
+        {   85,    1},{  103,  352},{  104,  681},{  121, 1039},
+        {  152, 1382},{  195, 1702},{  248, 2015},{  316, 2316},
+        {  403, 2595},{  520, 2847},{  676, 3068},{  870, 3258},
+        { 1091, 3429},{ 1329, 3585},{ 1597, 3725},{ 1894, 3849},
+        { 2242, 3940},{ 2656, 3984},{ 3098, 3992},{ 3531, 3981},
+        { 3936, 3950},{ 4304, 3915},{ 4646, 3879},{ 4915, 3861}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=50  INTRA*/
+      {
+        {  122,   89},{  798, 1170},{ 1682, 1812},{ 2613, 2096},
+        { 3501, 2260},{ 4430, 2388},{ 5352, 2510},{ 6228, 2613},
+        { 7043, 2698},{ 7793, 2770},{ 8486, 2823},{ 9092, 2846},
+        { 9652, 2865},{10210, 2895},{10773, 2936},{11315, 2979},
+        {11817, 3014},{12297, 3041},{12734, 3057},{13097, 3064},
+        {13443, 3067},{13813, 3078},{14190, 3088},{14646, 3103}
+      },
+      /*Y'  qi=50  INTER*/
+      {
+        {   73,  -11},{  154, 1318},{  501, 2457},{ 1281, 3291},
+        { 2685, 3719},{ 4356, 3810},{ 5811, 3769},{ 6988, 3726},
+        { 7976, 3700},{ 8835, 3682},{ 9606, 3669},{10307, 3659},
+        {10953, 3652},{11556, 3645},{12115, 3643},{12641, 3640},
+        {13138, 3636},{13613, 3634},{14068, 3629},{14488, 3627},
+        {14876, 3625},{15237, 3621},{15585, 3623},{15922, 3629}
+      }
+    },
+    {
+      /*Cb  qi=50  INTRA*/
+      {
+        {   11,    2},{  148,  368},{  278,  724},{  431, 1052},
+        {  613, 1334},{  806, 1567},{ 1004, 1756},{ 1203, 1915},
+        { 1405, 2051},{ 1621, 2163},{ 1833, 2262},{ 2059, 2347},
+        { 2280, 2424},{ 2476, 2512},{ 2670, 2598},{ 2864, 2679},
+        { 3037, 2754},{ 3201, 2826},{ 3376, 2887},{ 3562, 2936},
+        { 3756, 2976},{ 3932, 3022},{ 4117, 3065},{ 4385, 3094}
+      },
+      /*Cb  qi=50  INTER*/
+      {
+        {   92,   -3},{  112,  343},{  109,  669},{  121, 1027},
+        {  149, 1375},{  196, 1697},{  270, 1992},{  366, 2267},
+        {  471, 2532},{  594, 2782},{  747, 3011},{  942, 3212},
+        { 1189, 3384},{ 1497, 3521},{ 1875, 3613},{ 2297, 3673},
+        { 2739, 3710},{ 3195, 3725},{ 3644, 3737},{ 4057, 3751},
+        { 4445, 3763},{ 4841, 3769},{ 5211, 3779},{ 5568, 3769}
+      }
+    },
+    {
+      /*Cr  qi=50  INTRA*/
+      {
+        {   19,    7},{  155,  388},{  290,  744},{  474, 1060},
+        {  666, 1324},{  847, 1549},{ 1033, 1737},{ 1219, 1898},
+        { 1428, 2034},{ 1653, 2147},{ 1885, 2245},{ 2115, 2329},
+        { 2316, 2410},{ 2517, 2486},{ 2730, 2539},{ 2901, 2586},
+        { 3042, 2638},{ 3199, 2693},{ 3366, 2755},{ 3534, 2805},
+        { 3738, 2858},{ 3934, 2916},{ 4079, 2975},{ 4257, 2992}
+      },
+      /*Cr  qi=50  INTER*/
+      {
+        {   87,    1},{  102,  353},{  103,  680},{  121, 1036},
+        {  153, 1377},{  199, 1694},{  260, 1999},{  339, 2291},
+        {  446, 2559},{  590, 2797},{  780, 3003},{ 1010, 3176},
+        { 1267, 3331},{ 1547, 3474},{ 1874, 3594},{ 2245, 3688},
+        { 2666, 3742},{ 3130, 3758},{ 3594, 3748},{ 4028, 3711},
+        { 4415, 3674},{ 4771, 3641},{ 5122, 3605},{ 5482, 3569}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=51  INTRA*/
+      {
+        {  115,   93},{  819, 1164},{ 1739, 1806},{ 2695, 2101},
+        { 3612, 2257},{ 4552, 2374},{ 5479, 2490},{ 6352, 2593},
+        { 7158, 2683},{ 7898, 2761},{ 8580, 2823},{ 9177, 2854},
+        { 9728, 2880},{10268, 2917},{10816, 2966},{11350, 3016},
+        {11834, 3058},{12311, 3089},{12741, 3109},{13092, 3119},
+        {13434, 3126},{13791, 3142},{14156, 3155},{14590, 3171}
+      },
+      /*Y'  qi=51  INTER*/
+      {
+        {   58,    0},{  171, 1307},{  610, 2407},{ 1563, 3175},
+        { 3116, 3545},{ 4789, 3624},{ 6185, 3602},{ 7320, 3583},
+        { 8282, 3574},{ 9124, 3569},{ 9878, 3567},{10569, 3565},
+        {11207, 3563},{11801, 3564},{12359, 3566},{12884, 3567},
+        {13373, 3568},{13841, 3567},{14289, 3566},{14699, 3568},
+        {15086, 3568},{15446, 3566},{15788, 3564},{16103, 3568}
+      }
+    },
+    {
+      /*Cb  qi=51  INTRA*/
+      {
+        {   14,    3},{  161,  369},{  297,  722},{  454, 1047},
+        {  639, 1325},{  833, 1554},{ 1033, 1742},{ 1236, 1897},
+        { 1440, 2032},{ 1653, 2148},{ 1860, 2253},{ 2077, 2347},
+        { 2288, 2432},{ 2476, 2525},{ 2661, 2621},{ 2841, 2714},
+        { 3010, 2797},{ 3170, 2876},{ 3333, 2945},{ 3510, 3000},
+        { 3696, 3054},{ 3865, 3114},{ 4046, 3164},{ 4317, 3200}
+      },
+      /*Cb  qi=51  INTER*/
+      {
+        {   88,  -11},{  109,  341},{  109,  668},{  126, 1019},
+        {  168, 1358},{  233, 1670},{  329, 1955},{  451, 2219},
+        {  584, 2472},{  736, 2711},{  931, 2923},{ 1179, 3104},
+        { 1480, 3254},{ 1846, 3368},{ 2265, 3448},{ 2714, 3501},
+        { 3180, 3524},{ 3638, 3529},{ 4074, 3543},{ 4485, 3560},
+        { 4868, 3571},{ 5238, 3581},{ 5597, 3594},{ 5953, 3591}
+      }
+    },
+    {
+      /*Cr  qi=51  INTRA*/
+      {
+        {   24,    7},{  168,  388},{  309,  742},{  496, 1054},
+        {  688, 1316},{  873, 1538},{ 1063, 1723},{ 1252, 1882},
+        { 1460, 2018},{ 1682, 2134},{ 1907, 2238},{ 2125, 2332},
+        { 2317, 2422},{ 2507, 2510},{ 2705, 2575},{ 2869, 2630},
+        { 3015, 2684},{ 3178, 2744},{ 3329, 2815},{ 3477, 2878},
+        { 3667, 2945},{ 3848, 3016},{ 3997, 3082},{ 4174, 3121}
+      },
+      /*Cr  qi=51  INTER*/
+      {
+        {   83,   -2},{  102,  351},{  102,  680},{  126, 1029},
+        {  172, 1359},{  238, 1665},{  321, 1962},{  422, 2246},
+        {  552, 2505},{  733, 2728},{  970, 2912},{ 1247, 3069},
+        { 1552, 3209},{ 1876, 3338},{ 2251, 3440},{ 2692, 3502},
+        { 3161, 3529},{ 3637, 3525},{ 4084, 3509},{ 4487, 3479},
+        { 4850, 3444},{ 5181, 3419},{ 5507, 3406},{ 5786, 3398}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=52  INTRA*/
+      {
+        {  117,   93},{  814, 1168},{ 1729, 1822},{ 2706, 2119},
+        { 3655, 2262},{ 4604, 2374},{ 5528, 2490},{ 6394, 2596},
+        { 7189, 2691},{ 7921, 2777},{ 8596, 2846},{ 9184, 2885},
+        { 9728, 2918},{10260, 2961},{10796, 3014},{11316, 3069},
+        {11793, 3115},{12267, 3150},{12692, 3172},{13037, 3185},
+        {13367, 3196},{13717, 3214},{14087, 3227},{14521, 3249}
+      },
+      /*Y'  qi=52  INTER*/
+      {
+        {   52,    0},{  169, 1308},{  668, 2382},{ 1735, 3112},
+        { 3384, 3451},{ 5077, 3519},{ 6461, 3506},{ 7587, 3496},
+        { 8545, 3494},{ 9384, 3494},{10142, 3498},{10838, 3501},
+        {11475, 3503},{12078, 3508},{12640, 3511},{13162, 3513},
+        {13654, 3517},{14130, 3521},{14576, 3522},{14980, 3523},
+        {15369, 3523},{15737, 3522},{16071, 3521},{16382, 3516}
+      }
+    },
+    {
+      /*Cb  qi=52  INTRA*/
+      {
+        {   14,    3},{  163,  369},{  299,  722},{  457, 1044},
+        {  645, 1319},{  843, 1545},{ 1050, 1728},{ 1261, 1879},
+        { 1468, 2013},{ 1678, 2132},{ 1883, 2240},{ 2093, 2338},
+        { 2301, 2428},{ 2488, 2523},{ 2667, 2619},{ 2843, 2718},
+        { 3010, 2805},{ 3163, 2887},{ 3323, 2963},{ 3490, 3028},
+        { 3665, 3087},{ 3841, 3145},{ 4011, 3197},{ 4289, 3230}
+      },
+      /*Cb  qi=52  INTER*/
+      {
+        {   98,   -7},{  109,  342},{  109,  668},{  126, 1018},
+        {  170, 1355},{  242, 1663},{  352, 1941},{  490, 2195},
+        {  642, 2439},{  823, 2666},{ 1052, 2868},{ 1333, 3039},
+        { 1670, 3178},{ 2074, 3280},{ 2524, 3348},{ 2996, 3390},
+        { 3469, 3410},{ 3923, 3420},{ 4355, 3434},{ 4771, 3451},
+        { 5166, 3468},{ 5532, 3483},{ 5885, 3499},{ 6263, 3501}
+      }
+    },
+    {
+      /*Cr  qi=52  INTRA*/
+      {
+        {   25,    7},{  170,  388},{  312,  741},{  500, 1051},
+        {  694, 1310},{  883, 1529},{ 1082, 1709},{ 1280, 1864},
+        { 1491, 1998},{ 1710, 2117},{ 1932, 2225},{ 2143, 2324},
+        { 2328, 2418},{ 2516, 2506},{ 2708, 2578},{ 2870, 2637},
+        { 3017, 2693},{ 3170, 2758},{ 3312, 2835},{ 3455, 2901},
+        { 3644, 2972},{ 3827, 3049},{ 3968, 3121},{ 4115, 3166}
+      },
+      /*Cr  qi=52  INTER*/
+      {
+        {   86,   -2},{  101,  352},{  100,  680},{  126, 1028},
+        {  175, 1356},{  247, 1657},{  341, 1948},{  458, 2224},
+        {  615, 2471},{  828, 2681},{ 1091, 2857},{ 1395, 3008},
+        { 1732, 3140},{ 2095, 3257},{ 2502, 3348},{ 2968, 3402},
+        { 3457, 3420},{ 3926, 3413},{ 4360, 3388},{ 4759, 3357},
+        { 5128, 3329},{ 5449, 3306},{ 5741, 3295},{ 6071, 3296}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=53  INTRA*/
+      {
+        {  138,   93},{  850, 1161},{ 1773, 1810},{ 2763, 2103},
+        { 3722, 2245},{ 4675, 2360},{ 5600, 2483},{ 6464, 2597},
+        { 7255, 2700},{ 7982, 2792},{ 8652, 2867},{ 9237, 2913},
+        { 9775, 2950},{10302, 2998},{10834, 3058},{11347, 3121},
+        {11826, 3169},{12299, 3207},{12713, 3235},{13054, 3250},
+        {13387, 3265},{13744, 3286},{14110, 3302},{14515, 3323}
+      },
+      /*Y'  qi=53  INTER*/
+      {
+        {   52,    2},{  169, 1308},{  680, 2377},{ 1763, 3103},
+        { 3410, 3450},{ 5094, 3531},{ 6469, 3526},{ 7590, 3525},
+        { 8547, 3530},{ 9385, 3534},{10139, 3540},{10835, 3548},
+        {11479, 3553},{12075, 3559},{12634, 3565},{13159, 3570},
+        {13650, 3573},{14124, 3576},{14575, 3580},{14993, 3583},
+        {15375, 3584},{15744, 3584},{16091, 3583},{16421, 3586}
+      }
+    },
+    {
+      /*Cb  qi=53  INTRA*/
+      {
+        {   14,    3},{  167,  367},{  317,  717},{  492, 1033},
+        {  687, 1306},{  887, 1531},{ 1095, 1715},{ 1309, 1866},
+        { 1517, 2000},{ 1729, 2119},{ 1932, 2227},{ 2146, 2325},
+        { 2358, 2414},{ 2544, 2511},{ 2724, 2611},{ 2902, 2711},
+        { 3070, 2800},{ 3227, 2878},{ 3381, 2954},{ 3548, 3021},
+        { 3724, 3077},{ 3888, 3140},{ 4065, 3196},{ 4359, 3225}
+      },
+      /*Cb  qi=53  INTER*/
+      {
+        {   93,   -8},{  110,  342},{  108,  668},{  125, 1018},
+        {  170, 1355},{  242, 1663},{  353, 1939},{  494, 2192},
+        {  651, 2433},{  838, 2658},{ 1076, 2856},{ 1368, 3022},
+        { 1716, 3158},{ 2123, 3260},{ 2575, 3330},{ 3042, 3373},
+        { 3507, 3396},{ 3962, 3413},{ 4394, 3430},{ 4797, 3452},
+        { 5169, 3476},{ 5547, 3496},{ 5914, 3510},{ 6235, 3525}
+      }
+    },
+    {
+      /*Cr  qi=53  INTRA*/
+      {
+        {   25,    7},{  175,  386},{  335,  734},{  541, 1037},
+        {  737, 1296},{  926, 1516},{ 1125, 1696},{ 1324, 1851},
+        { 1540, 1984},{ 1763, 2102},{ 1989, 2210},{ 2202, 2310},
+        { 2386, 2404},{ 2572, 2495},{ 2768, 2569},{ 2929, 2627},
+        { 3071, 2684},{ 3231, 2749},{ 3374, 2825},{ 3514, 2894},
+        { 3703, 2963},{ 3882, 3040},{ 4024, 3111},{ 4190, 3150}
+      },
+      /*Cr  qi=53  INTER*/
+      {
+        {   87,   -1},{   99,  352},{  100,  680},{  125, 1027},
+        {  175, 1355},{  249, 1657},{  343, 1946},{  462, 2220},
+        {  624, 2465},{  844, 2671},{ 1122, 2841},{ 1435, 2989},
+        { 1768, 3125},{ 2134, 3243},{ 2545, 3334},{ 3002, 3393},
+        { 3490, 3412},{ 3965, 3405},{ 4401, 3384},{ 4797, 3359},
+        { 5156, 3328},{ 5482, 3297},{ 5800, 3292},{ 6135, 3293}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=54  INTRA*/
+      {
+        {  184,   94},{  902, 1151},{ 1876, 1776},{ 2881, 2057},
+        { 3832, 2200},{ 4785, 2315},{ 5709, 2442},{ 6570, 2562},
+        { 7362, 2672},{ 8092, 2771},{ 8760, 2852},{ 9337, 2901},
+        { 9874, 2943},{10402, 2995},{10928, 3059},{11443, 3126},
+        {11926, 3178},{12396, 3220},{12805, 3251},{13139, 3266},
+        {13466, 3280},{13822, 3304},{14184, 3322},{14585, 3342}
+      },
+      /*Y'  qi=54  INTER*/
+      {
+        {   60,    5},{  169, 1308},{  683, 2375},{ 1791, 3090},
+        { 3478, 3412},{ 5184, 3470},{ 6568, 3455},{ 7697, 3446},
+        { 8659, 3446},{ 9503, 3447},{10266, 3450},{10971, 3454},
+        {11619, 3458},{12223, 3462},{12789, 3467},{13315, 3471},
+        {13811, 3475},{14291, 3479},{14743, 3479},{15148, 3481},
+        {15535, 3483},{15913, 3481},{16252, 3479},{16569, 3472}
+      }
+    },
+    {
+      /*Cb  qi=54  INTRA*/
+      {
+        {   13,    2},{  165,  367},{  318,  715},{  498, 1030},
+        {  698, 1301},{  906, 1523},{ 1121, 1703},{ 1336, 1853},
+        { 1549, 1984},{ 1765, 2100},{ 1974, 2207},{ 2192, 2306},
+        { 2402, 2396},{ 2587, 2493},{ 2773, 2591},{ 2953, 2691},
+        { 3119, 2778},{ 3277, 2858},{ 3430, 2940},{ 3603, 3004},
+        { 3788, 3059},{ 3950, 3121},{ 4128, 3173},{ 4398, 3215}
+      },
+      /*Cb  qi=54  INTER*/
+      {
+        {  100,   -3},{  109,  343},{  107,  668},{  125, 1018},
+        {  169, 1354},{  241, 1662},{  353, 1938},{  496, 2190},
+        {  655, 2431},{  843, 2655},{ 1082, 2851},{ 1381, 3015},
+        { 1739, 3146},{ 2154, 3243},{ 2610, 3310},{ 3094, 3344},
+        { 3581, 3358},{ 4034, 3371},{ 4457, 3384},{ 4867, 3399},
+        { 5255, 3413},{ 5630, 3425},{ 6003, 3440},{ 6346, 3440}
+      }
+    },
+    {
+      /*Cr  qi=54  INTRA*/
+      {
+        {   23,    7},{  174,  386},{  338,  732},{  549, 1034},
+        {  751, 1289},{  947, 1506},{ 1150, 1685},{ 1353, 1837},
+        { 1572, 1969},{ 1800, 2087},{ 2031, 2192},{ 2248, 2291},
+        { 2434, 2387},{ 2622, 2477},{ 2815, 2549},{ 2976, 2607},
+        { 3126, 2663},{ 3286, 2727},{ 3427, 2807},{ 3569, 2877},
+        { 3761, 2941},{ 3942, 3016},{ 4084, 3093},{ 4226, 3131}
+      },
+      /*Cr  qi=54  INTER*/
+      {
+        {   88,   -2},{   99,  351},{  100,  680},{  125, 1027},
+        {  175, 1354},{  248, 1656},{  343, 1945},{  463, 2219},
+        {  626, 2463},{  850, 2668},{ 1128, 2837},{ 1445, 2983},
+        { 1791, 3111},{ 2168, 3224},{ 2597, 3309},{ 3075, 3351},
+        { 3560, 3364},{ 4029, 3356},{ 4464, 3335},{ 4858, 3307},
+        { 5218, 3275},{ 5547, 3256},{ 5850, 3247},{ 6171, 3214}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=55  INTRA*/
+      {
+        {  178,   95},{  968, 1137},{ 2000, 1747},{ 3013, 2027},
+        { 3966, 2173},{ 4920, 2294},{ 5842, 2427},{ 6702, 2553},
+        { 7489, 2668},{ 8213, 2773},{ 8875, 2858},{ 9452, 2913},
+        { 9986, 2959},{10504, 3016},{11023, 3085},{11530, 3157},
+        {12011, 3213},{12480, 3257},{12882, 3291},{13214, 3310},
+        {13542, 3325},{13890, 3350},{14248, 3371},{14671, 3398}
+      },
+      /*Y'  qi=55  INTER*/
+      {
+        {   59,    5},{  170, 1307},{  725, 2358},{ 1886, 3058},
+        { 3589, 3385},{ 5284, 3459},{ 6654, 3458},{ 7771, 3461},
+        { 8727, 3470},{ 9564, 3478},{10322, 3488},{11019, 3497},
+        {11658, 3505},{12258, 3513},{12819, 3520},{13344, 3527},
+        {13840, 3533},{14314, 3537},{14755, 3541},{15161, 3544},
+        {15552, 3548},{15916, 3548},{16257, 3548},{16576, 3540}
+      }
+    },
+    {
+      /*Cb  qi=55  INTRA*/
+      {
+        {   13,    2},{  167,  366},{  322,  714},{  508, 1026},
+        {  716, 1292},{  930, 1511},{ 1148, 1690},{ 1366, 1839},
+        { 1578, 1972},{ 1793, 2090},{ 2001, 2199},{ 2217, 2300},
+        { 2427, 2393},{ 2609, 2495},{ 2784, 2600},{ 2961, 2704},
+        { 3121, 2797},{ 3268, 2884},{ 3423, 2965},{ 3590, 3032},
+        { 3764, 3096},{ 3926, 3165},{ 4101, 3223},{ 4405, 3258}
+      },
+      /*Cb  qi=55  INTER*/
+      {
+        {   90,   -4},{  109,  344},{  107,  668},{  126, 1017},
+        {  172, 1351},{  249, 1657},{  370, 1928},{  527, 2174},
+        {  702, 2407},{  909, 2624},{ 1170, 2814},{ 1493, 2970},
+        { 1869, 3097},{ 2292, 3192},{ 2752, 3258},{ 3232, 3295},
+        { 3709, 3314},{ 4156, 3335},{ 4592, 3355},{ 5004, 3373},
+        { 5377, 3389},{ 5737, 3411},{ 6092, 3432},{ 6473, 3423}
+      }
+    },
+    {
+      /*Cr  qi=55  INTRA*/
+      {
+        {   23,    7},{  175,  385},{  342,  730},{  561, 1028},
+        {  771, 1279},{  973, 1493},{ 1181, 1669},{ 1384, 1822},
+        { 1602, 1956},{ 1830, 2076},{ 2057, 2184},{ 2270, 2288},
+        { 2452, 2389},{ 2637, 2484},{ 2823, 2559},{ 2983, 2621},
+        { 3129, 2682},{ 3280, 2753},{ 3417, 2833},{ 3554, 2904},
+        { 3743, 2977},{ 3921, 3060},{ 4055, 3137},{ 4185, 3186}
+      },
+      /*Cr  qi=55  INTER*/
+      {
+        {   85,    0},{   99,  352},{  100,  679},{  126, 1025},
+        {  178, 1351},{  256, 1650},{  359, 1935},{  493, 2202},
+        {  675, 2439},{  921, 2636},{ 1220, 2799},{ 1552, 2941},
+        { 1910, 3068},{ 2303, 3177},{ 2735, 3262},{ 3206, 3311},
+        { 3689, 3333},{ 4152, 3327},{ 4588, 3299},{ 4978, 3272},
+        { 5325, 3243},{ 5651, 3221},{ 5969, 3210},{ 6218, 3185}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=56  INTRA*/
+      {
+        {  137,  104},{ 1048, 1128},{ 2147, 1760},{ 3261, 2029},
+        { 4319, 2131},{ 5310, 2234},{ 6245, 2351},{ 7101, 2464},
+        { 7886, 2572},{ 8610, 2675},{ 9270, 2762},{ 9840, 2818},
+        {10365, 2869},{10875, 2928},{11393, 2997},{11900, 3071},
+        {12371, 3128},{12834, 3172},{13233, 3208},{13562, 3228},
+        {13878, 3245},{14221, 3271},{14584, 3292},{15008, 3320}
+      },
+      /*Y'  qi=56  INTER*/
+      {
+        {   19,   21},{  207, 1292},{ 1031, 2252},{ 2553, 2846},
+        { 4463, 3085},{ 6137, 3131},{ 7441, 3151},{ 8526, 3172},
+        { 9468, 3193},{10301, 3209},{11059, 3224},{11760, 3237},
+        {12405, 3249},{13008, 3261},{13570, 3270},{14100, 3278},
+        {14597, 3284},{15074, 3289},{15524, 3297},{15929, 3302},
+        {16314, 3306},{16675, 3307},{17004, 3305},{17288, 3301}
+      }
+    },
+    {
+      /*Cb  qi=56  INTRA*/
+      {
+        {   16,    3},{  188,  367},{  353,  712},{  546, 1017},
+        {  765, 1275},{  989, 1484},{ 1221, 1653},{ 1459, 1791},
+        { 1681, 1920},{ 1893, 2046},{ 2102, 2160},{ 2323, 2257},
+        { 2534, 2347},{ 2720, 2447},{ 2902, 2549},{ 3075, 2654},
+        { 3239, 2749},{ 3392, 2835},{ 3544, 2920},{ 3712, 2988},
+        { 3882, 3052},{ 4052, 3123},{ 4227, 3181},{ 4483, 3213}
+      },
+      /*Cb  qi=56  INTER*/
+      {
+        {   92,   -1},{  111,  343},{  114,  665},{  148, 1003},
+        {  224, 1321},{  345, 1609},{  526, 1858},{  754, 2077},
+        { 1009, 2281},{ 1319, 2464},{ 1702, 2614},{ 2145, 2732},
+        { 2625, 2824},{ 3123, 2890},{ 3634, 2933},{ 4137, 2954},
+        { 4614, 2965},{ 5052, 2988},{ 5468, 3015},{ 5852, 3035},
+        { 6213, 3060},{ 6557, 3081},{ 6906, 3094},{ 7243, 3112}
+      }
+    },
+    {
+      /*Cr  qi=56  INTRA*/
+      {
+        {   28,    8},{  195,  385},{  373,  727},{  598, 1019},
+        {  816, 1263},{ 1033, 1465},{ 1260, 1630},{ 1482, 1773},
+        { 1717, 1900},{ 1949, 2018},{ 2178, 2128},{ 2393, 2233},
+        { 2570, 2338},{ 2749, 2435},{ 2937, 2514},{ 3097, 2577},
+        { 3240, 2638},{ 3398, 2709},{ 3540, 2791},{ 3673, 2865},
+        { 3869, 2938},{ 4049, 3019},{ 4179, 3095},{ 4330, 3137}
+      },
+      /*Cr  qi=56  INTER*/
+      {
+        {   83,    0},{   99,  353},{  103,  676},{  146, 1010},
+        {  232, 1320},{  355, 1601},{  512, 1866},{  713, 2109},
+        {  988, 2312},{ 1344, 2471},{ 1750, 2602},{ 2180, 2719},
+        { 2642, 2819},{ 3141, 2892},{ 3653, 2939},{ 4159, 2961},
+        { 4636, 2961},{ 5072, 2945},{ 5464, 2917},{ 5813, 2895},
+        { 6134, 2890},{ 6458, 2883},{ 6735, 2881},{ 6953, 2902}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=57  INTRA*/
+      {
+        {  170,  106},{ 1106, 1120},{ 2246, 1740},{ 3399, 1993},
+        { 4482, 2077},{ 5492, 2167},{ 6446, 2273},{ 7324, 2379},
+        { 8130, 2482},{ 8866, 2578},{ 9537, 2661},{10119, 2715},
+        {10646, 2762},{11161, 2820},{11694, 2886},{12214, 2957},
+        {12693, 3013},{13166, 3053},{13569, 3087},{13897, 3106},
+        {14224, 3122},{14568, 3148},{14931, 3167},{15390, 3192}
+      },
+      /*Y'  qi=57  INTER*/
+      {
+        {   19,   20},{  205, 1292},{ 1096, 2229},{ 2775, 2766},
+        { 4811, 2943},{ 6512, 2964},{ 7832, 2976},{ 8940, 2990},
+        { 9903, 3004},{10755, 3017},{11532, 3029},{12243, 3039},
+        {12891, 3047},{13502, 3058},{14073, 3065},{14603, 3071},
+        {15097, 3078},{15581, 3083},{16036, 3086},{16452, 3090},
+        {16855, 3093},{17222, 3094},{17552, 3092},{17851, 3098}
+      }
+    },
+    {
+      /*Cb  qi=57  INTRA*/
+      {
+        {   16,    3},{  197,  365},{  384,  704},{  603, 1001},
+        {  837, 1252},{ 1077, 1455},{ 1326, 1618},{ 1581, 1748},
+        { 1819, 1871},{ 2042, 1993},{ 2264, 2104},{ 2500, 2196},
+        { 2722, 2280},{ 2916, 2375},{ 3103, 2473},{ 3290, 2575},
+        { 3456, 2667},{ 3612, 2748},{ 3775, 2829},{ 3958, 2896},
+        { 4145, 2947},{ 4307, 3012},{ 4476, 3070},{ 4733, 3110}
+      },
+      /*Cb  qi=57  INTER*/
+      {
+        {   94,   -1},{  111,  344},{  112,  665},{  147, 1002},
+        {  227, 1319},{  353, 1604},{  543, 1849},{  785, 2062},
+        { 1066, 2257},{ 1408, 2430},{ 1827, 2568},{ 2320, 2670},
+        { 2848, 2743},{ 3386, 2791},{ 3934, 2812},{ 4453, 2820},
+        { 4929, 2830},{ 5368, 2842},{ 5787, 2856},{ 6190, 2875},
+        { 6554, 2896},{ 6895, 2913},{ 7229, 2927},{ 7572, 2932}
+      }
+    },
+    {
+      /*Cr  qi=57  INTRA*/
+      {
+        {   28,    8},{  207,  383},{  413,  716},{  661,  999},
+        {  889, 1237},{ 1123, 1433},{ 1365, 1592},{ 1603, 1731},
+        { 1853, 1852},{ 2103, 1965},{ 2345, 2072},{ 2571, 2173},
+        { 2763, 2271},{ 2949, 2364},{ 3146, 2438},{ 3315, 2497},
+        { 3459, 2552},{ 3618, 2616},{ 3767, 2697},{ 3906, 2773},
+        { 4099, 2841},{ 4281, 2916},{ 4429, 2987},{ 4569, 3030}
+      },
+      /*Cr  qi=57  INTER*/
+      {
+        {   85,    0},{   99,  352},{  102,  675},{  147, 1008},
+        {  235, 1317},{  363, 1597},{  529, 1858},{  748, 2094},
+        { 1050, 2287},{ 1439, 2436},{ 1877, 2557},{ 2352, 2660},
+        { 2869, 2740},{ 3413, 2791},{ 3962, 2815},{ 4485, 2819},
+        { 4955, 2816},{ 5382, 2800},{ 5769, 2772},{ 6107, 2748},
+        { 6443, 2740},{ 6754, 2739},{ 7029, 2737},{ 7284, 2745}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=58  INTRA*/
+      {
+        {  164,  109},{ 1198, 1111},{ 2396, 1737},{ 3606, 1978},
+        { 4727, 2048},{ 5749, 2138},{ 6708, 2243},{ 7584, 2347},
+        { 8388, 2449},{ 9122, 2549},{ 9784, 2635},{10354, 2691},
+        {10876, 2740},{11385, 2800},{11912, 2869},{12429, 2941},
+        {12902, 2997},{13375, 3040},{13779, 3075},{14103, 3096},
+        {14435, 3112},{14783, 3140},{15141, 3160},{15599, 3186}
+      },
+      /*Y'  qi=58  INTER*/
+      {
+        {   14,   23},{  210, 1290},{ 1277, 2178},{ 3118, 2677},
+        { 5207, 2834},{ 6902, 2857},{ 8218, 2878},{ 9323, 2900},
+        {10285, 2919},{11132, 2934},{11899, 2949},{12599, 2961},
+        {13235, 2971},{13835, 2982},{14394, 2991},{14917, 2997},
+        {15412, 3005},{15882, 3009},{16325, 3013},{16735, 3016},
+        {17131, 3018},{17501, 3021},{17824, 3021},{18125, 3016}
+      }
+    },
+    {
+      /*Cb  qi=58  INTRA*/
+      {
+        {   17,    3},{  200,  365},{  389,  703},{  613,  996},
+        {  853, 1243},{ 1095, 1445},{ 1349, 1604},{ 1613, 1731},
+        { 1853, 1853},{ 2074, 1978},{ 2292, 2091},{ 2526, 2184},
+        { 2750, 2266},{ 2945, 2360},{ 3134, 2458},{ 3320, 2561},
+        { 3482, 2654},{ 3641, 2737},{ 3804, 2818},{ 3985, 2881},
+        { 4168, 2935},{ 4331, 3003},{ 4499, 3060},{ 4751, 3100}
+      },
+      /*Cb  qi=58  INTER*/
+      {
+        {   94,   -1},{  112,  345},{  112,  665},{  152,  998},
+        {  247, 1307},{  406, 1580},{  644, 1810},{  938, 2007},
+        { 1271, 2189},{ 1668, 2348},{ 2151, 2470},{ 2691, 2558},
+        { 3249, 2619},{ 3798, 2659},{ 4334, 2682},{ 4849, 2692},
+        { 5314, 2700},{ 5747, 2721},{ 6167, 2742},{ 6547, 2765},
+        { 6902, 2790},{ 7251, 2804},{ 7583, 2819},{ 7924, 2833}
+      }
+    },
+    {
+      /*Cr  qi=58  INTRA*/
+      {
+        {   29,    8},{  210,  382},{  419,  714},{  671,  993},
+        {  903, 1229},{ 1141, 1422},{ 1390, 1578},{ 1635, 1713},
+        { 1889, 1833},{ 2140, 1946},{ 2379, 2055},{ 2604, 2157},
+        { 2794, 2256},{ 2977, 2349},{ 3174, 2422},{ 3339, 2482},
+        { 3483, 2537},{ 3643, 2604},{ 3790, 2684},{ 3927, 2757},
+        { 4112, 2826},{ 4294, 2900},{ 4451, 2975},{ 4600, 3011}
+      },
+      /*Cr  qi=58  INTER*/
+      {
+        {   86,    0},{   99,  352},{  103,  675},{  151, 1004},
+        {  256, 1306},{  417, 1573},{  628, 1819},{  901, 2040},
+        { 1262, 2217},{ 1705, 2353},{ 2191, 2466},{ 2713, 2556},
+        { 3268, 2622},{ 3831, 2664},{ 4374, 2682},{ 4881, 2686},
+        { 5339, 2685},{ 5747, 2668},{ 6123, 2646},{ 6465, 2630},
+        { 6783, 2618},{ 7082, 2623},{ 7366, 2632},{ 7673, 2654}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=59  INTRA*/
+      {
+        {  142,  112},{ 1259, 1100},{ 2552, 1711},{ 3815, 1933},
+        { 4955, 1987},{ 5983, 2068},{ 6949, 2165},{ 7832, 2263},
+        { 8645, 2359},{ 9392, 2454},{10066, 2536},{10643, 2589},
+        {11174, 2636},{11696, 2693},{12230, 2758},{12752, 2826},
+        {13239, 2883},{13721, 2926},{14139, 2959},{14479, 2978},
+        {14811, 2993},{15166, 3020},{15532, 3039},{16000, 3062}
+      },
+      /*Y'  qi=59  INTER*/
+      {
+        {    8,   25},{  211, 1289},{ 1394, 2144},{ 3421, 2580},
+        { 5611, 2689},{ 7316, 2701},{ 8643, 2717},{ 9762, 2734},
+        {10735, 2750},{11587, 2763},{12353, 2775},{13056, 2785},
+        {13693, 2793},{14288, 2805},{14843, 2814},{15361, 2821},
+        {15857, 2827},{16328, 2831},{16763, 2834},{17171, 2838},
+        {17568, 2840},{17941, 2842},{18285, 2843},{18586, 2839}
+      }
+    },
+    {
+      /*Cb  qi=59  INTRA*/
+      {
+        {   17,    3},{  224,  363},{  441,  696},{  689,  982},
+        {  945, 1222},{ 1204, 1416},{ 1474, 1571},{ 1751, 1695},
+        { 2001, 1816},{ 2228, 1941},{ 2453, 2055},{ 2693, 2147},
+        { 2924, 2227},{ 3125, 2321},{ 3321, 2416},{ 3510, 2520},
+        { 3676, 2616},{ 3839, 2699},{ 4008, 2778},{ 4193, 2842},
+        { 4371, 2898},{ 4535, 2965},{ 4710, 3023},{ 4921, 3068}
+      },
+      /*Cb  qi=59  INTER*/
+      {
+        {   95,   -5},{  111,  343},{  112,  664},{  157,  995},
+        {  258, 1302},{  429, 1569},{  691, 1790},{ 1017, 1977},
+        { 1387, 2148},{ 1832, 2294},{ 2368, 2401},{ 2961, 2472},
+        { 3553, 2518},{ 4133, 2545},{ 4688, 2557},{ 5198, 2563},
+        { 5663, 2574},{ 6100, 2590},{ 6511, 2608},{ 6898, 2621},
+        { 7274, 2634},{ 7631, 2655},{ 7984, 2669},{ 8361, 2669}
+      }
+    },
+    {
+      /*Cr  qi=59  INTRA*/
+      {
+        {   31,    8},{  240,  379},{  480,  706},{  748,  978},
+        {  993, 1208},{ 1250, 1394},{ 1519, 1543},{ 1779, 1674},
+        { 2047, 1792},{ 2307, 1904},{ 2552, 2013},{ 2780, 2116},
+        { 2973, 2216},{ 3165, 2309},{ 3362, 2383},{ 3528, 2444},
+        { 3677, 2499},{ 3841, 2566},{ 3995, 2646},{ 4139, 2720},
+        { 4324, 2793},{ 4504, 2867},{ 4658, 2939},{ 4806, 2975}
+      },
+      /*Cr  qi=59  INTER*/
+      {
+        {   89,   -3},{   98,  352},{  103,  674},{  156, 1002},
+        {  268, 1300},{  441, 1562},{  673, 1801},{  980, 2010},
+        { 1385, 2175},{ 1868, 2301},{ 2401, 2402},{ 2984, 2474},
+        { 3591, 2520},{ 4179, 2545},{ 4729, 2555},{ 5232, 2553},
+        { 5679, 2545},{ 6081, 2530},{ 6447, 2510},{ 6791, 2496},
+        { 7101, 2487},{ 7393, 2489},{ 7684, 2499},{ 7950, 2501}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=60  INTRA*/
+      {
+        {   92,  116},{ 1361, 1085},{ 2746, 1686},{ 4050, 1895},
+        { 5209, 1939},{ 6244, 2012},{ 7213, 2103},{ 8105, 2197},
+        { 8928, 2290},{ 9685, 2381},{10371, 2460},{10952, 2511},
+        {11487, 2556},{12026, 2611},{12574, 2674},{13102, 2739},
+        {13597, 2793},{14092, 2831},{14523, 2862},{14862, 2881},
+        {15198, 2897},{15568, 2923},{15949, 2941},{16416, 2964}
+      },
+      /*Y'  qi=60  INTER*/
+      {
+        {    4,   30},{  215, 1287},{ 1547, 2104},{ 3729, 2491},
+        { 5973, 2568},{ 7672, 2577},{ 9001, 2591},{10123, 2606},
+        {11094, 2620},{11943, 2632},{12709, 2643},{13409, 2652},
+        {14044, 2660},{14641, 2669},{15193, 2677},{15709, 2684},
+        {16201, 2689},{16675, 2693},{17118, 2696},{17522, 2701},
+        {17920, 2704},{18293, 2706},{18620, 2702},{18923, 2700}
+      }
+    },
+    {
+      /*Cb  qi=60  INTRA*/
+      {
+        {   18,    3},{  227,  362},{  447,  694},{  708,  974},
+        {  981, 1207},{ 1252, 1397},{ 1532, 1547},{ 1822, 1663},
+        { 2082, 1780},{ 2316, 1903},{ 2548, 2013},{ 2794, 2101},
+        { 3029, 2178},{ 3242, 2266},{ 3445, 2360},{ 3638, 2459},
+        { 3816, 2547},{ 3980, 2628},{ 4146, 2708},{ 4344, 2766},
+        { 4546, 2812},{ 4725, 2872},{ 4880, 2930},{ 5054, 2966}
+      },
+      /*Cb  qi=60  INTER*/
+      {
+        {   97,   -4},{  112,  343},{  114,  664},{  162,  993},
+        {  273, 1294},{  472, 1553},{  774, 1762},{ 1138, 1939},
+        { 1543, 2102},{ 2034, 2236},{ 2620, 2329},{ 3244, 2389},
+        { 3860, 2423},{ 4443, 2440},{ 4997, 2449},{ 5502, 2455},
+        { 5962, 2458},{ 6413, 2466},{ 6836, 2485},{ 7217, 2506},
+        { 7592, 2518},{ 7957, 2533},{ 8291, 2543},{ 8574, 2545}
+      }
+    },
+    {
+      /*Cr  qi=60  INTRA*/
+      {
+        {   32,    8},{  243,  379},{  488,  702},{  771,  968},
+        { 1030, 1192},{ 1300, 1373},{ 1581, 1517},{ 1854, 1643},
+        { 2127, 1757},{ 2393, 1864},{ 2645, 1968},{ 2879, 2068},
+        { 3078, 2166},{ 3277, 2256},{ 3484, 2325},{ 3660, 2381},
+        { 3808, 2433},{ 3970, 2496},{ 4138, 2571},{ 4288, 2643},
+        { 4475, 2710},{ 4655, 2778},{ 4810, 2843},{ 4959, 2879}
+      },
+      /*Cr  qi=60  INTER*/
+      {
+        {   86,   -2},{   99,  352},{  103,  673},{  160,  998},
+        {  284, 1292},{  484, 1546},{  753, 1774},{ 1100, 1973},
+        { 1546, 2129},{ 2072, 2246},{ 2652, 2334},{ 3279, 2392},
+        { 3911, 2425},{ 4504, 2440},{ 5044, 2443},{ 5536, 2440},
+        { 5979, 2430},{ 6381, 2413},{ 6735, 2397},{ 7062, 2382},
+        { 7383, 2376},{ 7680, 2375},{ 7962, 2373},{ 8203, 2379}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=61  INTRA*/
+      {
+        {   54,  121},{ 1477, 1069},{ 3061, 1638},{ 4465, 1808},
+        { 5649, 1827},{ 6710, 1884},{ 7716, 1958},{ 8648, 2037},
+        { 9514, 2116},{10311, 2192},{11033, 2261},{11641, 2305},
+        {12202, 2342},{12771, 2387},{13356, 2440},{13924, 2493},
+        {14444, 2541},{14951, 2576},{15409, 2600},{15779, 2615},
+        {16131, 2626},{16521, 2648},{16921, 2663},{17409, 2694}
+      },
+      /*Y'  qi=61  INTER*/
+      {
+        {   -1,   32},{  216, 1286},{ 1806, 2036},{ 4279, 2327},
+        { 6629, 2352},{ 8347, 2352},{ 9707, 2357},{10860, 2364},
+        {11857, 2372},{12726, 2377},{13508, 2382},{14225, 2387},
+        {14877, 2392},{15484, 2398},{16048, 2401},{16581, 2405},
+        {17092, 2409},{17573, 2409},{18016, 2410},{18427, 2413},
+        {18829, 2415},{19221, 2415},{19578, 2415},{19980, 2413}
+      }
+    },
+    {
+      /*Cb  qi=61  INTRA*/
+      {
+        {   19,    3},{  231,  362},{  456,  693},{  733,  965},
+        { 1032, 1188},{ 1330, 1369},{ 1637, 1508},{ 1956, 1612},
+        { 2241, 1718},{ 2496, 1832},{ 2750, 1932},{ 3019, 2007},
+        { 3274, 2074},{ 3505, 2154},{ 3725, 2236},{ 3943, 2323},
+        { 4138, 2403},{ 4323, 2476},{ 4505, 2543},{ 4706, 2592},
+        { 4909, 2630},{ 5109, 2675},{ 5292, 2724},{ 5495, 2768}
+      },
+      /*Cb  qi=61  INTER*/
+      {
+        {   91,   -2},{  111,  344},{  114,  663},{  166,  989},
+        {  291, 1285},{  522, 1534},{  875, 1729},{ 1302, 1889},
+        { 1786, 2031},{ 2368, 2141},{ 3042, 2207},{ 3734, 2243},
+        { 4388, 2259},{ 4982, 2264},{ 5533, 2265},{ 6043, 2262},
+        { 6524, 2264},{ 6982, 2274},{ 7422, 2283},{ 7831, 2295},
+        { 8198, 2308},{ 8593, 2319},{ 8965, 2329},{ 9258, 2340}
+      }
+    },
+    {
+      /*Cr  qi=61  INTRA*/
+      {
+        {   33,    9},{  245,  378},{  497,  699},{  801,  958},
+        { 1087, 1171},{ 1384, 1342},{ 1692, 1474},{ 1992, 1589},
+        { 2290, 1692},{ 2576, 1789},{ 2852, 1884},{ 3109, 1973},
+        { 3324, 2061},{ 3544, 2142},{ 3763, 2199},{ 3945, 2244},
+        { 4103, 2292},{ 4283, 2349},{ 4469, 2413},{ 4635, 2476},
+        { 4836, 2534},{ 5038, 2592},{ 5210, 2649},{ 5358, 2682}
+      },
+      /*Cr  qi=61  INTER*/
+      {
+        {   82,    0},{   97,  353},{  104,  672},{  165,  995},
+        {  303, 1284},{  532, 1529},{  852, 1742},{ 1273, 1921},
+        { 1798, 2057},{ 2409, 2154},{ 3090, 2212},{ 3794, 2240},
+        { 4460, 2251},{ 5057, 2249},{ 5596, 2249},{ 6085, 2245},
+        { 6519, 2234},{ 6908, 2220},{ 7269, 2203},{ 7618, 2196},
+        { 7949, 2198},{ 8269, 2195},{ 8554, 2196},{ 8928, 2217}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=62  INTRA*/
+      {
+        {   29,  124},{ 1527, 1067},{ 3221, 1618},{ 4703, 1751},
+        { 5909, 1744},{ 7001, 1779},{ 8057, 1829},{ 9049, 1885},
+        { 9968, 1943},{10813, 1999},{11572, 2050},{12206, 2082},
+        {12801, 2107},{13402, 2140},{14020, 2180},{14625, 2223},
+        {15179, 2260},{15718, 2288},{16196, 2305},{16581, 2313},
+        {16963, 2324},{17382, 2341},{17800, 2351},{18318, 2376}
+      },
+      /*Y'  qi=62  INTER*/
+      {
+        {   -8,   36},{  218, 1284},{ 2073, 1965},{ 4814, 2159},
+        { 7237, 2138},{ 8979, 2124},{10378, 2115},{11570, 2109},
+        {12601, 2106},{13503, 2103},{14320, 2103},{15064, 2103},
+        {15746, 2103},{16384, 2104},{16975, 2105},{17534, 2105},
+        {18062, 2106},{18564, 2107},{19035, 2106},{19471, 2107},
+        {19890, 2107},{20288, 2107},{20651, 2107},{21012, 2108}
+      }
+    },
+    {
+      /*Cb  qi=62  INTRA*/
+      {
+        {   21,    3},{  283,  360},{  565,  683},{  907,  938},
+        { 1269, 1143},{ 1611, 1311},{ 1949, 1441},{ 2290, 1535},
+        { 2596, 1632},{ 2877, 1738},{ 3162, 1828},{ 3458, 1893},
+        { 3745, 1948},{ 4011, 2016},{ 4253, 2089},{ 4506, 2164},
+        { 4734, 2233},{ 4943, 2294},{ 5162, 2353},{ 5381, 2393},
+        { 5593, 2420},{ 5807, 2454},{ 6003, 2496},{ 6210, 2543}
+      },
+      /*Cb  qi=62  INTER*/
+      {
+        {   91,   -1},{  110,  344},{  113,  663},{  169,  987},
+        {  306, 1279},{  562, 1519},{  961, 1701},{ 1450, 1845},
+        { 2013, 1967},{ 2686, 2053},{ 3437, 2095},{ 4171, 2109},
+        { 4841, 2109},{ 5441, 2105},{ 6002, 2097},{ 6542, 2089},
+        { 7028, 2087},{ 7491, 2088},{ 7949, 2090},{ 8377, 2089},
+        { 8789, 2095},{ 9195, 2103},{ 9569, 2104},{ 9937, 2102}
+      }
+    },
+    {
+      /*Cr  qi=62  INTRA*/
+      {
+        {   38,    8},{  308,  374},{  619,  685},{  984,  925},
+        { 1326, 1126},{ 1662, 1285},{ 1999, 1407},{ 2328, 1512},
+        { 2659, 1604},{ 2976, 1691},{ 3285, 1774},{ 3570, 1853},
+        { 3815, 1931},{ 4068, 1998},{ 4304, 2044},{ 4491, 2082},
+        { 4666, 2124},{ 4870, 2174},{ 5078, 2231},{ 5262, 2285},
+        { 5480, 2335},{ 5703, 2378},{ 5905, 2423},{ 6075, 2454}
+      },
+      /*Cr  qi=62  INTER*/
+      {
+        {   79,    1},{   95,  353},{  102,  671},{  169,  992},
+        {  318, 1277},{  569, 1515},{  936, 1716},{ 1428, 1876},
+        { 2034, 1993},{ 2738, 2067},{ 3511, 2095},{ 4268, 2094},
+        { 4943, 2087},{ 5543, 2079},{ 6074, 2074},{ 6552, 2069},
+        { 6985, 2057},{ 7366, 2043},{ 7728, 2030},{ 8086, 2021},
+        { 8423, 2017},{ 8752, 2016},{ 9057, 2014},{ 9376, 2008}
+      }
+    }
+  },
+  {
+    {
+      /*Y'  qi=63  INTRA*/
+      {
+        {  -59,  134},{ 1734, 1036},{ 3743, 1521},{ 5309, 1618},
+        { 6520, 1597},{ 7664, 1609},{ 8809, 1630},{ 9894, 1657},
+        {10907, 1687},{11838, 1717},{12673, 1744},{13379, 1758},
+        {14038, 1767},{14698, 1784},{15379, 1806},{16062, 1831},
+        {16694, 1852},{17300, 1867},{17827, 1878},{18250, 1881},
+        {18702, 1884},{19199, 1892},{19665, 1896},{20273, 1908}
+      },
+      /*Y'  qi=63  INTER*/
+      {
+        {   -7,   33},{  209, 1285},{ 2309, 1904},{ 5274, 2025},
+        { 7801, 1966},{ 9637, 1924},{11126, 1892},{12403, 1868},
+        {13515, 1849},{14491, 1834},{15380, 1822},{16197, 1814},
+        {16944, 1806},{17645, 1799},{18303, 1794},{18916, 1789},
+        {19494, 1785},{20056, 1782},{20568, 1779},{21047, 1776},
+        {21508, 1775},{21925, 1772},{22327, 1770},{22678, 1771}
+      }
+    },
+    {
+      /*Cb  qi=63  INTRA*/
+      {
+        {   20,    3},{  294,  357},{  608,  673},{ 1047,  908},
+        { 1501, 1090},{ 1898, 1240},{ 2275, 1353},{ 2654, 1427},
+        { 3014, 1502},{ 3366, 1579},{ 3726, 1637},{ 4084, 1674},
+        { 4425, 1703},{ 4752, 1743},{ 5058, 1791},{ 5377, 1838},
+        { 5676, 1877},{ 5946, 1912},{ 6213, 1945},{ 6458, 1969},
+        { 6704, 1982},{ 6969, 1997},{ 7210, 2017},{ 7439, 2037}
+      },
+      /*Cb  qi=63  INTER*/
+      {
+        {   86,    1},{  108,  345},{  111,  663},{  168,  985},
+        {  307, 1276},{  577, 1513},{ 1007, 1688},{ 1550, 1819},
+        { 2189, 1921},{ 2938, 1981},{ 3744, 2002},{ 4512, 2002},
+        { 5199, 1996},{ 5824, 1986},{ 6419, 1971},{ 6978, 1954},
+        { 7507, 1940},{ 8015, 1932},{ 8502, 1928},{ 8978, 1920},
+        { 9410, 1915},{ 9842, 1910},{10262, 1901},{10634, 1896}
+      }
+    },
+    {
+      /*Cr  qi=63  INTRA*/
+      {
+        {   38,    7},{  324,  367},{  677,  670},{ 1136,  892},
+        { 1562, 1070},{ 1951, 1209},{ 2326, 1313},{ 2694, 1399},
+        { 3074, 1471},{ 3460, 1531},{ 3850, 1575},{ 4214, 1622},
+        { 4522, 1679},{ 4819, 1723},{ 5089, 1749},{ 5315, 1769},
+        { 5530, 1792},{ 5756, 1825},{ 6006, 1860},{ 6244, 1889},
+        { 6514, 1924},{ 6792, 1946},{ 7026, 1962},{ 7191, 1971}
+      },
+      /*Cr  qi=63  INTER*/
+      {
+        {   80,    2},{   95,  354},{  101,  671},{  167,  990},
+        {  321, 1274},{  585, 1509},{  984, 1702},{ 1534, 1849},
+        { 2217, 1947},{ 3005, 1995},{ 3839, 1999},{ 4619, 1986},
+        { 5310, 1973},{ 5933, 1961},{ 6486, 1952},{ 6988, 1942},
+        { 7435, 1927},{ 7817, 1911},{ 8198, 1900},{ 8552, 1895},
+        { 8881, 1890},{ 9253, 1883},{ 9598, 1876},{ 9923, 1859}
+      }
+    }
+  }
+};
+
+#endif
diff --git a/lib/ocintrin.h b/lib/ocintrin.h
new file mode 100644
index 0000000..d49ebb2
--- /dev/null
+++ b/lib/ocintrin.h
@@ -0,0 +1,128 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: ocintrin.h 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+/*Some common macros for potential platform-specific optimization.*/
+#include <math.h>
+#if !defined(_ocintrin_H)
+# define _ocintrin_H (1)
+
+/*Some specific platforms may have optimized intrinsic or inline assembly
+   versions of these functions which can substantially improve performance.
+  We define macros for them to allow easy incorporation of these non-ANSI
+   features.*/
+
+/*Note that we do not provide a macro for abs(), because it is provided as a
+   library function, which we assume is translated into an intrinsic to avoid
+   the function call overhead and then implemented in the smartest way for the
+   target platform.
+  With modern gcc (4.x), this is true: it uses cmov instructions if the
+   architecture supports it and branchless bit-twiddling if it does not (the
+   speed difference between the two approaches is not measurable).
+  Interestingly, the bit-twiddling method was patented in 2000 (US 6,073,150)
+   by Sun Microsystems, despite prior art dating back to at least 1996:
+   http://web.archive.org/web/19961201174141/www.x86.org/ftp/articles/pentopt/PENTOPT.TXT
+  On gcc 3.x, however, our assumption is not true, as abs() is translated to a
+   conditional jump, which is horrible on deeply pipelined architectures (e.g.,
+   all consumer architectures for the past decade or more).
+  Also be warned that -C*abs(x) where C is a constant is mis-optimized as
+   abs(C*x) on every gcc release before 4.2.3.
+  See bug http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34130 */
+
+/*Modern gcc (4.x) can compile the naive versions of min and max with cmov if
+   given an appropriate architecture, but the branchless bit-twiddling versions
+   are just as fast, and do not require any special target architecture.
+  Earlier gcc versions (3.x) compiled both forms to the same assembly
+   instructions, because of the way they represented ((_b)>(_a)) internally.*/
+#define OC_MAXI(_a,_b)      ((_a)-((_a)-(_b)&-((_b)>(_a))))
+#define OC_MINI(_a,_b)      ((_a)+((_b)-(_a)&-((_b)<(_a))))
+/*OC_CLAMPI clamps an integer into the given range (OC_CLAMP255: to [0,255]).
+  If _a>_c, then the lower bound _a is respected over the upper bound _c (this
+   behavior is required to meet our documented API behavior).
+  _a: The lower bound.
+  _b: The value to clamp.
+  _c: The upper bound.*/
+#define OC_CLAMPI(_a,_b,_c) (OC_MAXI(_a,OC_MINI(_b,_c)))
+#define OC_CLAMP255(_x)     ((unsigned char)((((_x)<0)-1)&((_x)|-((_x)>255))))
+/*Returns the sign of _a (-1, 0, or 1).
+  This may compile branchless, and is just as fast as the bit-twiddling method,
+   which is slightly less portable, as it relies on a sign-extended right-shift
+   (not guaranteed by ANSI, but present on every relevant platform).*/
+#define OC_SIGNI(_a)        (((_a)>0)-((_a)<0))
+/*Yields -1 if _a is negative and 0 otherwise, for use as a bit mask.
+  Slightly more portable than a sign-extended right-shift (not guaranteed by
+   ANSI), and just as fast: gcc (3.x and 4.x both) emits the shift anyway.*/
+#define OC_SIGNMASK(_a)     (-((_a)<0))
+/*Divides an integer by a power of two, truncating towards 0.
+  _dividend: The integer to divide (expanded twice; avoid side effects).
+  _shift:    The base-2 logarithm of the divisor, which must be non-negative.
+  _rmask:    (1<<_shift)-1*/
+#define OC_DIV_POW2(_dividend,_shift,_rmask)\
+  ((_dividend)+(OC_SIGNMASK(_dividend)&(_rmask))>>(_shift))
+/*Divides _x by 65536, truncating towards 0.*/
+#define OC_DIV2_16(_x) OC_DIV_POW2(_x,16,0xFFFF)
+/*Divides _x by 2, truncating towards 0.*/
+#define OC_DIV2(_x) OC_DIV_POW2(_x,1,0x1)
+/*Divides _x by 8, truncating towards 0.*/
+#define OC_DIV8(_x) OC_DIV_POW2(_x,3,0x7)
+/*Divides _x by 16, truncating towards 0.*/
+#define OC_DIV16(_x) OC_DIV_POW2(_x,4,0xF)
+/*Right shifts _dividend by _shift, adding _rval, and subtracting one for
+   negative dividends first.
+  When _rval is 1<<(_shift-1), this is equivalent to division with rounding
+   ties away from zero.*/
+#define OC_DIV_ROUND_POW2(_dividend,_shift,_rval)\
+  ((_dividend)+OC_SIGNMASK(_dividend)+(_rval)>>(_shift))
+/*Divides _x by 2, rounding ties towards even numbers.*/
+#define OC_DIV2_RE(_x) ((_x)+((_x)>>1&1)>>1)
+/*Divides _x by (1<<(_shift)), rounding ties to even; requires _shift>0.*/
+#define OC_DIV_POW2_RE(_x,_shift) \
+  ((_x)+((_x)>>(_shift)&1)+((1<<(_shift))-1>>1)>>(_shift))
+/*Swaps two int lvalues _a and _b if _a>_b, using a masked XOR swap.*/
+#define OC_SORT2I(_a,_b) \
+  do{ \
+    int t__; \
+    t__=((_a)^(_b))&-((_b)<(_a)); \
+    (_a)^=t__; \
+    (_b)^=t__; \
+  } \
+  while(0)
+
+/*Accesses one of four (signed) bytes given an index _i (0 selects _a, 3 _d).
+  This can be used to avoid small lookup tables.*/
+#define OC_BYTE_TABLE32(_a,_b,_c,_d,_i) \
+  ((signed char) \
+   (((_a)&0xFF|((_b)&0xFF)<<8|((_c)&0xFF)<<16|((_d)&0xFF)<<24)>>(_i)*8))
+/*Accesses one of eight (unsigned) nibbles given an index _i (0 selects _a).
+  This can be used to avoid small lookup tables.*/
+#define OC_UNIBBLE_TABLE32(_a,_b,_c,_d,_e,_f,_g,_h,_i) \
+  ((((_a)&0xF|((_b)&0xF)<<4|((_c)&0xF)<<8|((_d)&0xF)<<12| \
+   ((_e)&0xF)<<16|((_f)&0xF)<<20|((_g)&0xF)<<24|((_h)&0xF)<<28)>>(_i)*4)&0xF)
+
+
+
+/*These macros expect floats; note OC_CLAMPF's order is (upper,value,lower).*/
+#define OC_MAXF(_a,_b)      ((_a)<(_b)?(_b):(_a))
+#define OC_MINF(_a,_b)      ((_a)>(_b)?(_b):(_a))
+#define OC_CLAMPF(_a,_b,_c) (OC_MINF(_a,OC_MAXF(_b,_c)))
+#define OC_FABSF(_f)        ((float)fabs(_f))
+#define OC_SQRTF(_f)        ((float)sqrt(_f))
+#define OC_POWF(_b,_e)      ((float)pow(_b,_e))
+#define OC_LOGF(_f)         ((float)log(_f))
+#define OC_IFLOORF(_f)      ((int)floor(_f))
+#define OC_ICEILF(_f)       ((int)ceil(_f))
+
+#endif
diff --git a/lib/quant.c b/lib/quant.c
new file mode 100644
index 0000000..8359f5a
--- /dev/null
+++ b/lib/quant.c
@@ -0,0 +1,119 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: quant.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <ogg/ogg.h>
+#include "quant.h"
+#include "decint.h"
+
+static const unsigned OC_DC_QUANT_MIN[2]={4<<2,8<<2}; /*DC floor, by qti.*/
+static const unsigned OC_AC_QUANT_MIN[2]={2<<2,4<<2}; /*AC floor, by qti.*/
+
+/*Initializes the dequantization tables from a set of quantizer info.
+  Currently the dequantizer (and elsewhere enquantizer) tables are expected to
+   be initialized as pointing to the storage reserved for them in the
+   oc_theora_state (resp. oc_enc_ctx) structure.
+  If some tables are duplicates of others, the pointers will be adjusted to
+   point to a single copy of the tables, but the storage for them will not be
+   freed.
+  If you're concerned about the memory footprint, the obvious thing to do is
+   to move the storage out of its fixed place in the structures and allocate
+   it on demand.
+  However, a much, much better option is to only store the quantization
+   matrices being used for the current frame, and to recalculate these as the
+   qi values change between frames (this is what VP3 did).*/
+void oc_dequant_tables_init(ogg_uint16_t *_dequant[64][3][2],
+ int _pp_dc_scale[64],const th_quant_info *_qinfo){
+  /*Coding mode: intra or inter.*/
+  int          qti;
+  /*Y', C_b, C_r*/
+  int          pli;
+  for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
+    /*Quality index.*/
+    int qi;
+    /*Range iterator (nranges+1 base matrices are visited).*/
+    int qri;
+    for(qi=0,qri=0;qri<=_qinfo->qi_ranges[qti][pli].nranges;qri++){
+      th_quant_base base;
+      ogg_uint32_t  q;
+      int           qi_start;
+      int           qi_end;
+      memcpy(base,_qinfo->qi_ranges[qti][pli].base_matrices[qri],
+       sizeof(base));
+      qi_start=qi;
+      if(qri==_qinfo->qi_ranges[qti][pli].nranges)qi_end=qi+1;
+      else qi_end=qi+_qinfo->qi_ranges[qti][pli].sizes[qri];
+      /*Iterate over quality indices in this range.*/
+      for(;;){
+        ogg_uint32_t qfac;
+        int          zzi;
+        int          ci;
+        /*In the original VP3.2 code, the rounding offset and the size of the
+           dead zone around 0 were controlled by a "sharpness" parameter.
+          The size of our dead zone is now controlled by the per-coefficient
+           quality thresholds returned by our HVS module.
+          We round down from a more accurate value when the quality of the
+           reconstruction does not fall below our threshold and it saves bits.
+          Hence, all of that VP3.2 code is gone from here, and the remaining
+           floating point code has been implemented as equivalent integer code
+           with exact precision.*/
+        qfac=(ogg_uint32_t)_qinfo->dc_scale[qi]*base[0];
+        /*For postprocessing, not dequantization; _pp_dc_scale may be NULL.*/
+        if(_pp_dc_scale!=NULL)_pp_dc_scale[qi]=(int)(qfac/160);
+        /*Scale the DC coefficient from the proper table.*/
+        q=(qfac/100)<<2;
+        q=OC_CLAMPI(OC_DC_QUANT_MIN[qti],q,OC_QUANT_MAX);
+        _dequant[qi][pli][qti][0]=(ogg_uint16_t)q;
+        /*Now scale AC coefficients from the proper table.*/
+        for(zzi=1;zzi<64;zzi++){
+          q=((ogg_uint32_t)_qinfo->ac_scale[qi]*base[OC_FZIG_ZAG[zzi]]/100)<<2;
+          q=OC_CLAMPI(OC_AC_QUANT_MIN[qti],q,OC_QUANT_MAX);
+          _dequant[qi][pli][qti][zzi]=(ogg_uint16_t)q;
+        }
+        /*If this is a duplicate of a previous matrix, use that instead.
+          This simple check helps us improve cache coherency later.*/
+        {
+          int dupe;
+          int qtj;
+          int plj;
+          dupe=0;
+          for(qtj=0;qtj<=qti;qtj++){
+            for(plj=0;plj<(qtj<qti?3:pli);plj++){
+              if(!memcmp(_dequant[qi][pli][qti],_dequant[qi][plj][qtj],
+               sizeof(oc_quant_table))){
+                dupe=1;
+                break;
+              }
+            }
+            if(dupe)break;
+          }
+          if(dupe)_dequant[qi][pli][qti]=_dequant[qi][plj][qtj];
+        }
+        if(++qi>=qi_end)break;
+        /*Interpolate the next base matrix, rounding to the nearest integer.*/
+        for(ci=0;ci<64;ci++){
+          base[ci]=(unsigned char)(
+           (2*((qi_end-qi)*_qinfo->qi_ranges[qti][pli].base_matrices[qri][ci]+
+           (qi-qi_start)*_qinfo->qi_ranges[qti][pli].base_matrices[qri+1][ci])
+           +_qinfo->qi_ranges[qti][pli].sizes[qri])/
+           (2*_qinfo->qi_ranges[qti][pli].sizes[qri]));
+        }
+      }
+    }
+  }
+}
diff --git a/lib/quant.h b/lib/quant.h
new file mode 100644
index 0000000..49ce13a
--- /dev/null
+++ b/lib/quant.h
@@ -0,0 +1,33 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: quant.h 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#if !defined(_quant_H)
+# define _quant_H (1)
+# include "theora/codec.h"
+# include "ocintrin.h"
+
+typedef ogg_uint16_t   oc_quant_table[64]; /*One table of 64 quantizers.*/
+
+
+/*Maximum scaled quantizer value (quantizers are stored scaled by 4).*/
+#define OC_QUANT_MAX          (1024<<2)
+
+/*Fills in the dequantization tables for every qi, plane and coding mode.*/
+void oc_dequant_tables_init(ogg_uint16_t *_dequant[64][3][2],
+ int _pp_dc_scale[64],const th_quant_info *_qinfo);
+
+#endif
diff --git a/lib/rate.c b/lib/rate.c
new file mode 100644
index 0000000..4f43bb2
--- /dev/null
+++ b/lib/rate.c
@@ -0,0 +1,1137 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: rate.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+#include <stdlib.h>
+#include <string.h>
+#include "encint.h"
+
+/*A rough lookup table for tan(x), 0<=x<pi/2.
+  The values are Q12 fixed-point and spaced at 5 degree intervals.
+  These decisions are somewhat arbitrary, but sufficient for the 2nd order
+   Bessel follower below.
+  Values of x larger than 85 degrees are extrapolated from the last interval,
+   which is way off, but "good enough".*/
+static const unsigned short OC_ROUGH_TAN_LOOKUP[18]={
+      0,  358,  722, 1098, 1491, 1910,
+   2365, 2868, 3437, 4096, 4881, 5850,
+   7094, 8784,11254,15286,23230,46817
+};
+
+/*Linearly interpolates between adjacent OC_ROUGH_TAN_LOOKUP entries.
+  _alpha is Q24 in the range [0,0.5); the return value is 5.12.*/
+static int oc_warp_alpha(int _alpha){
+  int idx;
+  int frac;
+  int lo;
+  int hi;
+  idx=_alpha*36>>24;
+  if(idx>=17)idx=16;
+  lo=OC_ROUGH_TAN_LOOKUP[idx];
+  hi=OC_ROUGH_TAN_LOOKUP[idx+1];
+  frac=_alpha*36-(idx<<24);
+  return (int)(((ogg_int64_t)lo<<32)+(hi-lo<<8)*(ogg_int64_t)frac>>32);
+}
+
+/*Re-initialize the Bessel filter coefficients with the specified delay.
+  This does not alter the x/y state, but changes the reaction time of the
+   filter.
+  Altering the time constant of a reactive filter without altering internal
+   state is something that has to be done carefully, but our design operates at
+   high enough delays and with small enough time constant changes to make it
+   safe.*/
+static void oc_iir_filter_reinit(oc_iir_filter *_f,int _delay){
+  int         alpha;
+  ogg_int64_t one48;
+  ogg_int64_t warp;
+  ogg_int64_t k1;
+  ogg_int64_t k2;
+  ogg_int64_t d;
+  ogg_int64_t a;
+  ogg_int64_t ik2;
+  ogg_int64_t b1;
+  ogg_int64_t b2;
+  /*This borrows some code from an unreleased version of Postfish.
+    See the recipe at http://unicorn.us.com/alex/2polefilters.html for details
+     on deriving the filter coefficients.*/
+  /*alpha is Q24; _delay must be nonzero.*/
+  alpha=(1<<24)/_delay;
+  one48=(ogg_int64_t)1<<48;
+  /*warp is 7.12; it is kept at 1 or more so the division by k2 is safe.*/
+  warp=OC_MAXI(oc_warp_alpha(alpha),1);
+  /*k1 is 9.12*/
+  k1=3*warp;
+  /*k2 is 16.24.*/
+  k2=k1*warp;
+  /*d is 16.15.*/
+  d=((1<<12)+k1<<12)+k2+256>>9;
+  /*a is 0.32, since d is larger than both 1.0 and k2.*/
+  a=(k2<<23)/d;
+  /*ik2 is 25.24.*/
+  ik2=one48/k2;
+  /*b1 is Q56; in practice, the integer ranges between -2 and 2.*/
+  b1=2*a*(ik2-(1<<24));
+  /*b2 is Q56; in practice, the integer ranges between -2 and 2.*/
+  b2=(one48<<8)-(4*a<<24)-b1;
+  /*All of the filter parameters are Q24 (rounded from Q56 and 0.32).*/
+  _f->c[0]=(ogg_int32_t)(b1+((ogg_int64_t)1<<31)>>32);
+  _f->c[1]=(ogg_int32_t)(b2+((ogg_int64_t)1<<31)>>32);
+  _f->g=(ogg_int32_t)(a+128>>8);
+}
+
+/*Sets up a 2nd order low-pass Bessel filter for the given delay and seeds
+   all of its input and output history with the initial value.
+  _value is Q24.*/
+static void oc_iir_filter_init(oc_iir_filter *_f,int _delay,ogg_int32_t _value){
+  oc_iir_filter_reinit(_f,_delay);
+  _f->x[0]=_f->x[1]=_f->y[0]=_f->y[1]=_value;
+}
+
+/*Runs one input sample through the filter and returns the new output.*/
+static ogg_int64_t oc_iir_filter_update(oc_iir_filter *_f,ogg_int32_t _x){
+  ogg_int64_t in1;
+  ogg_int64_t in2;
+  ogg_int64_t out1;
+  ogg_int64_t out2;
+  ogg_int64_t out;
+  in1=_f->x[0];
+  in2=_f->x[1];
+  out1=_f->y[0];
+  out2=_f->y[1];
+  /*Sum the input and feedback terms in 64 bits, then round off 24 bits.*/
+  out=(_x+in1*2+in2)*(ogg_int64_t)_f->g;
+  out+=out1*(ogg_int64_t)_f->c[0];
+  out+=out2*(ogg_int64_t)_f->c[1];
+  out=out+(1<<23)>>24;
+  /*Age the history by one sample.*/
+  _f->x[1]=(ogg_int32_t)in1;
+  _f->x[0]=_x;
+  _f->y[1]=(ogg_int32_t)out1;
+  _f->y[0]=(ogg_int32_t)out;
+  return out;
+}
+
+
+
+/*Scans every qi from _qi_min up to 63 for the one whose log_qavg is nearest
+   to _log_qtarget.
+  No ordering is assumed; on ties the qi closest to _qi_old wins.*/
+static int oc_enc_find_qi_for_target(oc_enc_ctx *_enc,int _qti,int _qi_old,
+ int _qi_min,ogg_int64_t _log_qtarget){
+  ogg_int64_t min_err;
+  int         min_qi;
+  int         cand;
+  min_qi=_qi_min;
+  min_err=_enc->log_qavg[_qti][min_qi]-_log_qtarget;
+  min_err=min_err+OC_SIGNMASK(min_err)^OC_SIGNMASK(min_err);
+  for(cand=_qi_min+1;cand<64;cand++){
+    ogg_int64_t err;
+    err=_enc->log_qavg[_qti][cand]-_log_qtarget;
+    err=err+OC_SIGNMASK(err)^OC_SIGNMASK(err);
+    /*Skip anything worse, and ties that are no closer to the old qi.*/
+    if(err>min_err)continue;
+    if(err==min_err&&abs(cand-_qi_old)>=abs(min_qi-_qi_old))continue;
+    min_qi=cand;
+    min_err=err;
+  }
+  return min_qi;
+}
+
+void oc_enc_calc_lambda(oc_enc_ctx *_enc,int _qti){
+  ogg_int64_t lq;
+  int         qi;
+  int         qi1;
+  int         nqis;
+  /*Sets _enc->lambda and the list of quantizers in _enc->state.qis[].
+    For now, lambda is fixed depending on the qi value and frame type:
+      lambda=qscale*(qavg[qti][qi]**2), where qscale=0.2125.
+    This was derived by exhaustively searching for the optimal quantizer for
+     the AC coefficients in each block from a number of test sequences for a
+     number of fixed lambda values and fitting the peaks of the resulting
+     histograms (on the log(qavg) scale).
+    The same model applies to both inter and intra frames.
+    A more adaptive scheme might perform better.*/
+  qi=_enc->state.qis[0];
+  /*If rate control is active, use the lambda for the _target_ quantizer.
+    This allows us to scale to rates slightly lower than we'd normally be able
+     to reach, and give the rate control a semblance of "fractional qi"
+     precision.
+    TODO: Add API for changing QI, and allow extra precision.*/
+  if(_enc->state.info.target_bitrate>0)lq=_enc->rc.log_qtarget;
+  else lq=_enc->log_qavg[_qti][qi];
+  /*The resulting lambda value is less than 0x500000.*/
+  _enc->lambda=(int)oc_bexp64(2*lq-0x4780BD468D6B62BLL);
+  /*Select additional quantizers.
+    The R-D optimal block AC quantizer statistics suggest that the distribution
+     is roughly Gaussian-like with a slight positive skew.
+    K-means clustering on log_qavg to select 3 quantizers produces cluster
+     centers of {log_qavg-0.6,log_qavg,log_qavg+0.7}.
+    Experiments confirm these are relatively good choices.
+
+    Although we do greedy R-D optimization of the qii flags to avoid switching
+     too frequently, this becomes ineffective at low rates, either because we
+     do a poor job of predicting the actual R-D cost, or the greedy
+     optimization is not sufficient.
+    Therefore adaptive quantization is disabled above an (experimentally
+     suggested) threshold of log_qavg=7.00 (e.g., below INTRA qi=12 or
+     INTER qi=20 with current matrices).
+    This may need to be revised if the R-D cost estimation or qii flag
+     optimization strategies change.*/
+  nqis=1; /*qis[0] is always kept; extra entries are appended after it.*/
+  if(lq<(OC_Q57(56)>>3)&&!_enc->vp3_compatible){
+    qi1=oc_enc_find_qi_for_target(_enc,_qti,OC_MAXI(qi-1,0),0,
+     lq+(OC_Q57(7)+5)/10);
+    if(qi1!=qi)_enc->state.qis[nqis++]=qi1;
+    qi1=oc_enc_find_qi_for_target(_enc,_qti,OC_MINI(qi+1,63),0,
+     lq-(OC_Q57(6)+5)/10);
+    if(qi1!=qi&&qi1!=_enc->state.qis[nqis-1])_enc->state.qis[nqis++]=qi1;
+  }
+  _enc->state.nqis=nqis;
+}
+
+/*Computes 2**_log_scale in Q24 with saturation.
+  _log_scale: A binary logarithm in Q24 format.
+  Return: The binary exponential in Q24 format; results that would exceed
+   2**47-1 (including any _log_scale of 23.0 or more) return 2**47-1
+   instead.*/
+static ogg_int64_t oc_bexp_q24(ogg_int32_t _log_scale){
+  ogg_int64_t val;
+  /*Inputs this large always saturate, so skip the exponential entirely.*/
+  if(_log_scale>=(ogg_int32_t)23<<24)return 0x7FFFFFFFFFFFLL;
+  val=oc_bexp64(((ogg_int64_t)_log_scale<<33)+OC_Q57(24));
+  if(val>=0x7FFFFFFFFFFFLL)val=0x7FFFFFFFFFFFLL;
+  return val;
+}
+
+/*Convenience function: converts a Q57 value to a clamped 32-bit Q24 value.
+  _in: Input in Q57 format.
+  Return: The same number, rounded to Q24.*/
+static ogg_int32_t oc_q57_to_q24(ogg_int64_t _in){
+  ogg_int64_t q24;
+  q24=_in+((ogg_int64_t)1<<32)>>33;
+  /*The lower bound is written as -0x7FFFFFFF-1 because a literal 0x80000000
+     is converted to unsigned on 32-bit systems, which would "promote" the
+     whole clamp expression to unsigned.*/
+  return (ogg_int32_t)OC_CLAMPI(-0x7FFFFFFF-1,q24,0x7FFFFFFF);
+}
+
+/*Computes 2**_log_scale in Q24 with saturation.
+  _log_scale: A binary logarithm in Q57 format.
+  Return: The binary exponential in Q24 format; results that would exceed
+   2**31-1 (including any _log_scale of 8.0 or more) return 2**31-1
+   instead.*/
+static ogg_int32_t oc_bexp64_q24(ogg_int64_t _log_scale){
+  ogg_int64_t val;
+  /*Inputs this large always saturate, so skip the exponential entirely.*/
+  if(_log_scale>=OC_Q57(8))return 0x7FFFFFFF;
+  val=oc_bexp64(_log_scale+OC_Q57(24));
+  if(val>=0x7FFFFFFF)return 0x7FFFFFFF;
+  return (ogg_int32_t)val;
+}
+
+/*Resets the buffer model and scale filters from the current settings.*/
+static void oc_enc_rc_reset(oc_enc_ctx *_enc){
+  ogg_int64_t npixels;
+  ogg_int64_t ibpp; /*Pixels per bit: npixels/bits_per_frame.*/
+  int         inter_delay;
+  /*TODO: These parameters should be exposed in a th_encode_ctl() API.*/
+  _enc->rc.bits_per_frame=(_enc->state.info.target_bitrate*
+   (ogg_int64_t)_enc->state.info.fps_denominator)/
+   _enc->state.info.fps_numerator;
+  /*Insane framerates or frame sizes mean insane bitrates.
+    Let's not get carried away.*/
+  if(_enc->rc.bits_per_frame>0x400000000000LL){
+    _enc->rc.bits_per_frame=(ogg_int64_t)0x400000000000LL;
+  }
+  else if(_enc->rc.bits_per_frame<32)_enc->rc.bits_per_frame=32;
+  _enc->rc.buf_delay=OC_MAXI(_enc->rc.buf_delay,12);
+  _enc->rc.max=_enc->rc.bits_per_frame*_enc->rc.buf_delay;
+  /*Start with a buffer fullness of 50% plus 25% of the amount we plan to spend
+     on a single keyframe interval.
+    We can require fully half the bits in an interval for a keyframe, so this
+     initial level gives us maximum flexibility for over/under-shooting in
+     subsequent frames.*/
+  _enc->rc.target=(_enc->rc.max+1>>1)+(_enc->rc.bits_per_frame+2>>2)*
+   OC_MINI(_enc->keyframe_frequency_force,_enc->rc.buf_delay);
+  _enc->rc.fullness=_enc->rc.target;
+  /*Pick exponents and initial scales for quantizer selection.*/
+  npixels=_enc->state.info.frame_width*
+   (ogg_int64_t)_enc->state.info.frame_height;
+  _enc->rc.log_npixels=oc_blog64(npixels);
+  ibpp=npixels/_enc->rc.bits_per_frame;
+  if(ibpp<1){
+    _enc->rc.exp[0]=59;
+    _enc->rc.log_scale[0]=oc_blog64(1997)-OC_Q57(8);
+  }
+  else if(ibpp<2){
+    _enc->rc.exp[0]=55;
+    _enc->rc.log_scale[0]=oc_blog64(1604)-OC_Q57(8);
+  }
+  else{
+    _enc->rc.exp[0]=48;
+    _enc->rc.log_scale[0]=oc_blog64(834)-OC_Q57(8);
+  }
+  if(ibpp<4){
+    _enc->rc.exp[1]=100;
+    _enc->rc.log_scale[1]=oc_blog64(2249)-OC_Q57(8);
+  }
+  else if(ibpp<8){
+    _enc->rc.exp[1]=95;
+    _enc->rc.log_scale[1]=oc_blog64(1751)-OC_Q57(8);
+  }
+  else{
+    _enc->rc.exp[1]=73;
+    _enc->rc.log_scale[1]=oc_blog64(1260)-OC_Q57(8);
+  }
+  _enc->rc.prev_drop_count=0;
+  _enc->rc.log_drop_scale=OC_Q57(0);
+  /*Set up second order followers, initialized according to corresponding
+     time constants.*/
+  oc_iir_filter_init(&_enc->rc.scalefilter[0],4,
+   oc_q57_to_q24(_enc->rc.log_scale[0]));
+  inter_delay=(_enc->rc.twopass?
+   OC_MAXI(_enc->keyframe_frequency_force,12):_enc->rc.buf_delay)>>1;
+  _enc->rc.inter_count=0;
+  /*We clamp the actual inter_delay to a minimum of 10 to work within the range
+     of values where later incrementing the delay works as designed.
+    10 is not an exact choice, but rather a good working trade-off.*/
+  _enc->rc.inter_delay=10;
+  _enc->rc.inter_delay_target=inter_delay; /*NOTE(review): may be below 10.*/
+  oc_iir_filter_init(&_enc->rc.scalefilter[1],_enc->rc.inter_delay,
+   oc_q57_to_q24(_enc->rc.log_scale[1]));
+  oc_iir_filter_init(&_enc->rc.vfrfilter,4,
+   oc_bexp64_q24(_enc->rc.log_drop_scale));
+}
+
+/*Initializes _rc with the default rate control settings.*/
+void oc_rc_state_init(oc_rc_state *_rc,oc_enc_ctx *_enc){
+  /*Start with two-pass mode off and no frame metrics buffer.*/
+  _rc->twopass=0;
+  _rc->twopass_buffer_bytes=0;
+  _rc->twopass_force_kf=0;
+  _rc->frame_metrics=NULL;
+  _rc->rate_bias=0;
+  /*Nothing else is set up unless a target bitrate was given.*/
+  if(_enc->state.info.target_bitrate<=0)return;
+  /*The buffer size follows the keyframe interval, capped here at 256 frames
+     (8-10 seconds of pre-buffering at 24-30 fps, which is not unreasonable).
+    The 12 frame minimum, which gives us some chance to distribute bit
+     estimation errors, is applied to _enc->rc by oc_enc_rc_reset() below.*/
+  if(_enc->keyframe_frequency_force>256)_rc->buf_delay=256;
+  else _rc->buf_delay=_enc->keyframe_frequency_force;
+  /*By default, enforce all buffer constraints.*/
+  _rc->drop_frames=1;
+  _rc->cap_overflow=1;
+  _rc->cap_underflow=0;
+  oc_enc_rc_reset(_enc);
+}
+
+void oc_rc_state_clear(oc_rc_state *_rc){
+  _ogg_free(_rc->frame_metrics); /*NULL, or set by oc_enc_rc_resize().*/
+}
+
+void oc_enc_rc_resize(oc_enc_ctx *_enc){
+  /*If encoding has not yet begun, reset the buffer state.*/
+  if(_enc->state.curframe_num<0)oc_enc_rc_reset(_enc);
+  else{
+    int idt;
+    /*Otherwise, update the bounds on the buffer, but not the current
+       fullness.*/
+    _enc->rc.bits_per_frame=(_enc->state.info.target_bitrate*
+     (ogg_int64_t)_enc->state.info.fps_denominator)/
+     _enc->state.info.fps_numerator;
+    /*Insane framerates or frame sizes mean insane bitrates.
+      Let's not get carried away.*/
+    if(_enc->rc.bits_per_frame>0x400000000000LL){
+      _enc->rc.bits_per_frame=(ogg_int64_t)0x400000000000LL;
+    }
+    else if(_enc->rc.bits_per_frame<32)_enc->rc.bits_per_frame=32;
+    _enc->rc.buf_delay=OC_MAXI(_enc->rc.buf_delay,12);
+    _enc->rc.max=_enc->rc.bits_per_frame*_enc->rc.buf_delay;
+    _enc->rc.target=(_enc->rc.max+1>>1)+(_enc->rc.bits_per_frame+2>>2)*
+     OC_MINI(_enc->keyframe_frequency_force,_enc->rc.buf_delay);
+    /*Update the INTER-frame scale filter delay.
+      We jump to it immediately if we've already seen enough frames; otherwise
+       it is simply set as the new target.*/
+    _enc->rc.inter_delay_target=idt=OC_MAXI(_enc->rc.buf_delay>>1,10);
+    if(idt<OC_MINI(_enc->rc.inter_delay,_enc->rc.inter_count)){
+      oc_iir_filter_init(&_enc->rc.scalefilter[1],idt,
+       _enc->rc.scalefilter[1].y[0]);
+      _enc->rc.inter_delay=idt;
+    }
+  }
+  /*If we're in pass-2 mode, make sure the frame metrics array is big enough
+     to hold frame statistics for the full buffer.*/
+  if(_enc->rc.twopass==2){
+    int cfm; /*Capacity of the frame metrics array before any resize.*/
+    int buf_delay;
+    int reset_window; /*Set when leaving whole-file buffering.*/
+    buf_delay=_enc->rc.buf_delay;
+    reset_window=_enc->rc.frame_metrics==NULL&&(_enc->rc.frames_total[0]==0||
+     buf_delay<_enc->rc.frames_total[0]+_enc->rc.frames_total[1]
+     +_enc->rc.frames_total[2]);
+    cfm=_enc->rc.cframe_metrics;
+    /*Only try to resize the frame metrics buffer if a) it's too small and
+       b) we were using a finite buffer, or are about to start.*/
+    if(cfm<buf_delay&&(_enc->rc.frame_metrics!=NULL||reset_window)){
+      oc_frame_metrics *fm;
+      int               nfm;
+      int               fmh;
+      fm=(oc_frame_metrics *)_ogg_realloc(_enc->rc.frame_metrics,
+       buf_delay*sizeof(*_enc->rc.frame_metrics));
+      if(fm==NULL){
+        /*We failed to allocate a finite buffer.*/
+        /*If we don't have a valid 2-pass header yet, just return; we'll reset
+           the buffer size when we read the header.*/
+        if(_enc->rc.frames_total[0]==0)return;
+        /*Otherwise revert to the largest finite buffer previously set, or to
+           whole-file buffering if we were still using that.*/
+        _enc->rc.buf_delay=_enc->rc.frame_metrics!=NULL?
+         cfm:_enc->rc.frames_total[0]+_enc->rc.frames_total[1]
+         +_enc->rc.frames_total[2];
+        oc_enc_rc_resize(_enc);
+        return;
+      }
+      _enc->rc.frame_metrics=fm;
+      _enc->rc.cframe_metrics=buf_delay;
+      /*Re-organize the circular buffer, unwrapping it into the new space.*/
+      fmh=_enc->rc.frame_metrics_head;
+      nfm=_enc->rc.nframe_metrics;
+      if(fmh+nfm>cfm){
+        int shift;
+        shift=OC_MINI(fmh+nfm-cfm,buf_delay-cfm);
+        memcpy(fm+cfm,fm,shift*sizeof(*fm));
+        if(fmh+nfm>buf_delay)memmove(fm,fm+shift,
+         (fmh+nfm-buf_delay)*sizeof(*fm));
+      }
+    }
+    /*We were using whole-file buffering; now we're not.*/
+    if(reset_window){
+      _enc->rc.nframes[0]=_enc->rc.nframes[1]=_enc->rc.nframes[2]=0;
+      _enc->rc.scale_sum[0]=_enc->rc.scale_sum[1]=0;
+      _enc->rc.scale_window_end=_enc->rc.scale_window0=
+       _enc->state.curframe_num+_enc->prev_dup_count+1;
+      if(_enc->rc.twopass_buffer_bytes){
+        int qti;
+        /*We already read the metrics for the first frame in the window.*/
+        *(_enc->rc.frame_metrics)=*&_enc->rc.cur_metrics;
+        _enc->rc.nframe_metrics++;
+        qti=_enc->rc.cur_metrics.frame_type;
+        _enc->rc.nframes[qti]++;
+        _enc->rc.nframes[2]+=_enc->rc.cur_metrics.dup_count;
+        _enc->rc.scale_sum[qti]+=oc_bexp_q24(_enc->rc.cur_metrics.log_scale);
+        _enc->rc.scale_window_end+=_enc->rc.cur_metrics.dup_count+1;
+        if(_enc->rc.scale_window_end-_enc->rc.scale_window0<buf_delay){
+          /*We need more frame data.*/
+          _enc->rc.twopass_buffer_bytes=0;
+        }
+      }
+    }
+    /*Otherwise, we could shrink the current window, if necessary, but leaving
+       it as is lets us adapt to the new buffer size more gracefully.*/
+  }
+}
+
+/*Rescales a frame count to account for the expected drops/duplicates.*/
+static int oc_rc_scale_drop(oc_rc_state *_rc,int _nframes){
+  ogg_int64_t scale_q8;
+  int         div;
+  /*With no drop history, the count is used as-is.*/
+  if(_rc->prev_drop_count<=0&&_rc->log_drop_scale<=OC_Q57(0))return _nframes;
+  scale_q8=oc_bexp64((_rc->log_drop_scale
+   +oc_blog64(_rc->prev_drop_count+1)>>1)+OC_Q57(8));
+  /*A scale at least as large as the count leaves at most one frame.*/
+  if(scale_q8>=_nframes<<8)return !!_nframes;
+  div=(int)scale_q8;
+  /*Divide by the scale, rounding up.*/
+  if(div>0)_nframes=((_nframes<<8)+div-1)/div;
+  return _nframes;
+}
+
+int oc_enc_select_qi(oc_enc_ctx *_enc,int _qti,int _clamp){
+  ogg_int64_t  rate_total;
+  ogg_int64_t  rate_bias;
+  int          nframes[2];
+  int          buf_delay;
+  int          buf_pad;
+  ogg_int64_t  log_qtarget;
+  ogg_int64_t  log_scale0;
+  ogg_int64_t  log_cur_scale;
+  ogg_int64_t  log_qexp;
+  int          exp0;
+  int          old_qi;
+  int          qi;
+  /*Figure out how to re-distribute bits so that we hit our fullness target
+     before the last keyframe in our current buffer window (after the current
+     frame), or the end of the buffer window, whichever comes first.*/
+  log_cur_scale=(ogg_int64_t)_enc->rc.scalefilter[_qti].y[0]<<33;
+  buf_pad=0;
+  switch(_enc->rc.twopass){
+    default:{
+      ogg_uint32_t next_key_frame;
+      /*Single pass mode: assume only forced keyframes and attempt to estimate
+         the drop count for VFR content.*/
+      next_key_frame=_qti?_enc->keyframe_frequency_force
+       -(_enc->state.curframe_num-_enc->state.keyframe_num):0;
+      nframes[0]=(_enc->rc.buf_delay-OC_MINI(next_key_frame,_enc->rc.buf_delay)
+       +_enc->keyframe_frequency_force-1)/_enc->keyframe_frequency_force;
+      if(nframes[0]+_qti>1){
+        nframes[0]--;
+        buf_delay=next_key_frame+nframes[0]*_enc->keyframe_frequency_force;
+      }
+      else buf_delay=_enc->rc.buf_delay;
+      nframes[1]=buf_delay-nframes[0];
+      /*Downgrade the delta frame rate to correspond to the recent drop count
+         history.*/
+      nframes[1]=oc_rc_scale_drop(&_enc->rc,nframes[1]);
+    }break;
+    case 1:{
+      /*Pass 1 mode: use a fixed qi value.*/
+      qi=_enc->state.qis[0];
+      _enc->rc.log_qtarget=_enc->log_qavg[_qti][qi];
+      return qi;
+    }break;
+    case 2:{
+      ogg_int64_t scale_sum[2];
+      int         qti;
+      /*Pass 2 mode: we know exactly how much of each frame type there is in
+         the current buffer window, and have estimates for the scales.*/
+      nframes[0]=_enc->rc.nframes[0];
+      nframes[1]=_enc->rc.nframes[1];
+      scale_sum[0]=_enc->rc.scale_sum[0];
+      scale_sum[1]=_enc->rc.scale_sum[1];
+      /*The window size can be slightly larger than the buffer window for VFR
+         content; clamp it down, if appropriate (the excess will all be dup
+         frames).*/
+      buf_delay=OC_MINI(_enc->rc.scale_window_end-_enc->rc.scale_window0,
+       _enc->rc.buf_delay);
+      /*If we're approaching the end of the file, add some slack to keep us
+         from slamming into a rail.
+        Our rate accuracy goes down, but it keeps the result sensible.
+        We position the target where the first forced keyframe beyond the end
+         of the file would be (for consistency with 1-pass mode).*/
+      buf_pad=OC_MINI(_enc->rc.buf_delay,_enc->state.keyframe_num
+       +_enc->keyframe_frequency_force-_enc->rc.scale_window0);
+      if(buf_delay<buf_pad)buf_pad-=buf_delay;
+      else{
+        /*Otherwise, search for the last keyframe in the buffer window and
+           target that.*/
+        buf_pad=0;
+        /*TODO: Currently we only do this when using a finite buffer; we could
+           save the position of the last keyframe in the summary data and do it
+           with a whole-file buffer as well, but it isn't likely to make a
+           difference.*/
+        if(_enc->rc.frame_metrics!=NULL){
+          int fmi;
+          int fm_tail;
+          fm_tail=_enc->rc.frame_metrics_head+_enc->rc.nframe_metrics;
+          if(fm_tail>=_enc->rc.cframe_metrics)fm_tail-=_enc->rc.cframe_metrics;
+          for(fmi=fm_tail;;){
+            oc_frame_metrics *m;
+            fmi--;
+            if(fmi<0)fmi+=_enc->rc.cframe_metrics;
+            /*Stop before we remove the first frame.*/
+            if(fmi==_enc->rc.frame_metrics_head)break;
+            m=_enc->rc.frame_metrics+fmi;
+            /*If we find a keyframe, remove it and everything past it.*/
+            if(m->frame_type==OC_INTRA_FRAME){
+              do{
+                qti=m->frame_type;
+                nframes[qti]--;
+                scale_sum[qti]-=oc_bexp_q24(m->log_scale);
+                buf_delay-=m->dup_count+1;
+                fmi++;
+                if(fmi>=_enc->rc.cframe_metrics)fmi=0;
+                m=_enc->rc.frame_metrics+fmi;
+              }
+              while(fmi!=fm_tail);
+              /*And stop scanning backwards.*/
+              break;
+            }
+          }
+        }
+      }
+      /*If we're not using the same frame type as in pass 1 (because someone
+         changed the keyframe interval), remove that scale estimate.
+        We'll add in a replacement for the correct frame type below.*/
+      qti=_enc->rc.cur_metrics.frame_type;
+      if(qti!=_qti){
+        nframes[qti]--;
+        scale_sum[qti]-=oc_bexp_q24(_enc->rc.cur_metrics.log_scale);
+      }
+      /*Compute log_scale estimates for each frame type from the pass-1 scales
+         we measured in the current window.*/
+      for(qti=0;qti<2;qti++){
+        _enc->rc.log_scale[qti]=nframes[qti]>0?
+         oc_blog64(scale_sum[qti])-oc_blog64(nframes[qti])-OC_Q57(24):
+         -_enc->rc.log_npixels;
+      }
+      /*If we're not using the same frame type as in pass 1, add a scale
+         estimate for the corresponding frame using the current low-pass
+         filter value.
+        This is mostly to ensure we have a valid estimate even when pass 1 had
+         no frames of this type in the buffer window.
+        TODO: We could also plan ahead and figure out how many keyframes we'll
+         be forced to add in the current buffer window.*/
+      qti=_enc->rc.cur_metrics.frame_type;
+      if(qti!=_qti){
+        ogg_int64_t scale;
+        scale=_enc->rc.log_scale[_qti]<OC_Q57(23)?
+         oc_bexp64(_enc->rc.log_scale[_qti]+OC_Q57(24)):0x7FFFFFFFFFFFLL;
+        scale*=nframes[_qti];
+        nframes[_qti]++;
+        scale+=oc_bexp_q24(log_cur_scale>>33);
+        _enc->rc.log_scale[_qti]=oc_blog64(scale)
+         -oc_blog64(nframes[qti])-OC_Q57(24);
+      }
+      else log_cur_scale=(ogg_int64_t)_enc->rc.cur_metrics.log_scale<<33;
+      /*Add the padding from above.
+        This basically reverts to 1-pass estimations in the last keyframe
+         interval.*/
+      if(buf_pad>0){
+        ogg_int64_t scale;
+        int         nextra_frames;
+        /*Extend the buffer.*/
+        buf_delay+=buf_pad;
+        /*Add virtual delta frames according to the estimated drop count.*/
+        nextra_frames=oc_rc_scale_drop(&_enc->rc,buf_pad);
+        /*And blend in the low-pass filtered scale according to how many frames
+           we added.*/
+        scale=
+         oc_bexp64(_enc->rc.log_scale[1]+OC_Q57(24))*(ogg_int64_t)nframes[1]
+         +oc_bexp_q24(_enc->rc.scalefilter[1].y[0])*(ogg_int64_t)nextra_frames;
+        nframes[1]+=nextra_frames;
+        _enc->rc.log_scale[1]=oc_blog64(scale)-oc_blog64(nframes[1])-OC_Q57(24);
+      }
+    }break;
+  }
+  /*If we've been missing our target, add a penalty term.*/
+  rate_bias=(_enc->rc.rate_bias/(_enc->state.curframe_num+1000))*
+   (buf_delay-buf_pad);
+  /*rate_total is the total bits available over the next buf_delay frames.*/
+  rate_total=_enc->rc.fullness-_enc->rc.target+rate_bias
+   +buf_delay*_enc->rc.bits_per_frame;
+  log_scale0=_enc->rc.log_scale[_qti]+_enc->rc.log_npixels;
+  /*If there aren't enough bits to achieve our desired fullness level, use the
+     minimum quality permitted.*/
+  if(rate_total<=buf_delay)log_qtarget=OC_QUANT_MAX_LOG;
+  else{
+    static const ogg_int64_t LOG_KEY_RATIO=0x0137222BB70747BALL;
+    ogg_int64_t log_scale1;
+    ogg_int64_t rlo;
+    ogg_int64_t rhi;
+    log_scale1=_enc->rc.log_scale[1-_qti]+_enc->rc.log_npixels;
+    rlo=0;
+    rhi=(rate_total+nframes[_qti]-1)/nframes[_qti];
+    while(rlo<rhi){
+      ogg_int64_t curr;
+      ogg_int64_t rdiff;
+      ogg_int64_t log_rpow;
+      ogg_int64_t rscale;
+      curr=rlo+rhi>>1;
+      log_rpow=oc_blog64(curr)-log_scale0;
+      log_rpow=(log_rpow+(_enc->rc.exp[_qti]>>1))/_enc->rc.exp[_qti];
+      if(_qti)log_rpow+=LOG_KEY_RATIO>>6;
+      else log_rpow-=LOG_KEY_RATIO>>6;
+      log_rpow*=_enc->rc.exp[1-_qti];
+      rscale=nframes[1-_qti]*oc_bexp64(log_scale1+log_rpow);
+      rdiff=nframes[_qti]*curr+rscale-rate_total;
+      if(rdiff<0)rlo=curr+1;
+      else if(rdiff>0)rhi=curr-1;
+      else break;
+    }
+    log_qtarget=OC_Q57(2)-((oc_blog64(rlo)-log_scale0+(_enc->rc.exp[_qti]>>1))/
+     _enc->rc.exp[_qti]<<6);
+    log_qtarget=OC_MINI(log_qtarget,OC_QUANT_MAX_LOG);
+  }
+  /*The above allocation looks only at the total rate we'll accumulate in the
+     next buf_delay frames.
+    However, we could overflow the buffer on the very next frame, so check for
+     that here, if we're not using a soft target.*/
+  exp0=_enc->rc.exp[_qti];
+  if(_enc->rc.cap_overflow){
+    ogg_int64_t margin;
+    ogg_int64_t soft_limit;
+    ogg_int64_t log_soft_limit;
+    /*Allow 3% of the buffer for prediction error.
+      This should be plenty, and we don't mind if we go a bit over; we only
+       want to keep these bits from being completely wasted.*/
+    margin=_enc->rc.max+31>>5;
+    /*We want to use at least this many bits next frame.*/
+    soft_limit=_enc->rc.fullness+_enc->rc.bits_per_frame-(_enc->rc.max-margin);
+    log_soft_limit=oc_blog64(soft_limit);
+    /*If we're predicting we won't use that many...*/
+    log_qexp=(log_qtarget-OC_Q57(2)>>6)*exp0;
+    if(log_scale0-log_qexp<log_soft_limit){
+      /*Scale the adjustment based on how far into the margin we are.*/
+      log_qexp+=(log_scale0-log_soft_limit-log_qexp>>32)*
+       ((OC_MINI(margin,soft_limit)<<32)/margin);
+      log_qtarget=((log_qexp+(exp0>>1))/exp0<<6)+OC_Q57(2);
+    }
+  }
+  /*If this was not one of the initial frames, limit the change in quality.*/
+  old_qi=_enc->state.qis[0];
+  if(_clamp){
+    ogg_int64_t log_qmin;
+    ogg_int64_t log_qmax;
+    /*Clamp the target quantizer to within [0.8*Q,1.2*Q], where Q is the
+       current quantizer.
+      TODO: With user-specified quant matrices, we need to enlarge these limits
+       if they don't actually let us change qi values.*/
+    log_qmin=_enc->log_qavg[_qti][old_qi]-0x00A4D3C25E68DC58LL;
+    log_qmax=_enc->log_qavg[_qti][old_qi]+0x00A4D3C25E68DC58LL;
+    log_qtarget=OC_CLAMPI(log_qmin,log_qtarget,log_qmax);
+  }
+  /*The above allocation looks only at the total rate we'll accumulate in the
+     next buf_delay frames.
+    However, we could bust the budget on the very next frame, so check for that
+     here, if we're not using a soft target.*/
+  /* Disabled when our minimum qi > 0; if we saturate log_qtarget to
+     the maximum possible size when we have a minimum qi, the
+     resulting lambda will interact very strangely with SKIP.  The
+     resulting artifacts look like waterfalls. */
+  if(_enc->state.info.quality==0){
+    ogg_int64_t log_hard_limit;
+    /*Compute the maximum number of bits we can use in the next frame.
+      Allow 50% of the rate for a single frame for prediction error.
+      This may not be enough for keyframes or sudden changes in complexity.*/
+    log_hard_limit=oc_blog64(_enc->rc.fullness+(_enc->rc.bits_per_frame>>1));
+    /*If we're predicting we'll use more than this...*/
+    log_qexp=(log_qtarget-OC_Q57(2)>>6)*exp0;
+    if(log_scale0-log_qexp>log_hard_limit){
+      /*Force the target to hit our limit exactly.*/
+      log_qexp=log_scale0-log_hard_limit;
+      log_qtarget=((log_qexp+(exp0>>1))/exp0<<6)+OC_Q57(2);
+      /*If that target is unreasonable, oh well; we'll have to drop.*/
+      log_qtarget=OC_MINI(log_qtarget,OC_QUANT_MAX_LOG);
+    }
+  }
+  /*Compute a final estimate of the number of bits we plan to use.*/
+  log_qexp=(log_qtarget-OC_Q57(2)>>6)*_enc->rc.exp[_qti];
+  _enc->rc.rate_bias+=oc_bexp64(log_cur_scale+_enc->rc.log_npixels-log_qexp);
+  qi=oc_enc_find_qi_for_target(_enc,_qti,old_qi,
+   _enc->state.info.quality,log_qtarget);
+  /*Save the quantizer target for lambda calculations.*/
+  _enc->rc.log_qtarget=log_qtarget;
+  return qi;
+}
+
+int oc_enc_update_rc_state(oc_enc_ctx *_enc,
+ long _bits,int _qti,int _qi,int _trial,int _droppable){
+  ogg_int64_t buf_delta; /*Bits credited to the buffer for this frame.*/
+  ogg_int64_t log_scale; /*Scale estimate for this frame, in Q57.*/
+  int         dropped;   /*Return value: 1 if this frame must be dropped.*/
+  dropped=0; /*NOTE(review): _qi is not referenced anywhere in this body.*/
+  /*Dropping is off unless drop_frames is set; it is also disabled for now in
+     two-pass mode when there is no frame_metrics buffer (infinite buffer).*/
+  if(!_enc->rc.drop_frames||_enc->rc.twopass&&_enc->rc.frame_metrics==NULL){
+    _droppable=0;
+  }
+  buf_delta=_enc->rc.bits_per_frame*(1+_enc->dup_count);
+  if(_bits<=0){
+    /*We didn't code any blocks in this frame.*/
+    log_scale=OC_Q57(-64);
+    _bits=0;
+  }
+  else{
+    ogg_int64_t log_bits;
+    ogg_int64_t log_qexp;
+    /*Compute the estimated scale factor for this frame type (at most 2**16).*/
+    log_bits=oc_blog64(_bits);
+    log_qexp=_enc->rc.log_qtarget-OC_Q57(2);
+    log_qexp=(log_qexp>>6)*(_enc->rc.exp[_qti]);
+    log_scale=OC_MINI(log_bits-_enc->rc.log_npixels+log_qexp,OC_Q57(16));
+  }
+  /*Special two-pass processing.*/
+  switch(_enc->rc.twopass){
+    case 1:{
+      /*Pass 1 mode: save the metrics for this frame.*/
+      _enc->rc.cur_metrics.log_scale=oc_q57_to_q24(log_scale);
+      _enc->rc.cur_metrics.dup_count=_enc->dup_count;
+      _enc->rc.cur_metrics.frame_type=_enc->state.frame_type;
+      _enc->rc.twopass_buffer_bytes=0;
+    }break;
+    case 2:{
+      /*Pass 2 mode: advance the sliding window, but not for trial encodes.*/
+      if(!_trial){
+        ogg_int64_t next_frame_num;
+        int         qti;
+        /*Move the current metrics back one frame.*/
+        *&_enc->rc.prev_metrics=*&_enc->rc.cur_metrics;
+        next_frame_num=_enc->state.curframe_num+_enc->dup_count+1;
+        /*Back out the last frame's statistics from the sliding window.*/
+        qti=_enc->rc.prev_metrics.frame_type;
+        _enc->rc.frames_left[qti]--;
+        _enc->rc.frames_left[2]-=_enc->rc.prev_metrics.dup_count;
+        _enc->rc.nframes[qti]--;
+        _enc->rc.nframes[2]-=_enc->rc.prev_metrics.dup_count;
+        _enc->rc.scale_sum[qti]-=oc_bexp_q24(_enc->rc.prev_metrics.log_scale);
+        _enc->rc.scale_window0=(int)next_frame_num;
+        /*Free the corresponding entry in the circular buffer, if there is one.*/
+        if(_enc->rc.frame_metrics!=NULL){
+          _enc->rc.nframe_metrics--;
+          _enc->rc.frame_metrics_head++;
+          if(_enc->rc.frame_metrics_head>=_enc->rc.cframe_metrics){
+            _enc->rc.frame_metrics_head=0;
+          }
+        }
+        /*Mark us ready for the next 2-pass packet.*/
+        _enc->rc.twopass_buffer_bytes=0;
+        /*Update state, so the user doesn't have to keep calling 2pass_in after
+           they've fed in all the data when we're using a finite buffer.*/
+        _enc->prev_dup_count=_enc->dup_count;
+        oc_enc_rc_2pass_in(_enc,NULL,0);
+      }
+    }break;
+  }
+  /*Common to all passes:*/
+  if(_bits>0){
+    if(_trial){
+      oc_iir_filter *f;
+      /*Use the estimated scale factor directly if this was a trial.*/
+      f=_enc->rc.scalefilter+_qti;
+      f->y[1]=f->y[0]=f->x[1]=f->x[0]=oc_q57_to_q24(log_scale);
+      _enc->rc.log_scale[_qti]=log_scale;
+    }
+    else{
+      /*Lengthen the time constant for the INTER filter as we collect more
+         frame statistics, until we reach our target.*/
+      if(_enc->rc.inter_delay<_enc->rc.inter_delay_target&&
+       _enc->rc.inter_count>=_enc->rc.inter_delay&&_qti==OC_INTER_FRAME){
+        oc_iir_filter_reinit(&_enc->rc.scalefilter[1],++_enc->rc.inter_delay);
+      }
+      /*Otherwise update the low-pass scale filter for this frame type,
+         regardless of whether or not we dropped this frame.*/
+      _enc->rc.log_scale[_qti]=oc_iir_filter_update(
+       _enc->rc.scalefilter+_qti,oc_q57_to_q24(log_scale))<<33;
+      /*If this frame busts our budget, it must be dropped.*/
+      if(_droppable&&_enc->rc.fullness+buf_delta<_bits){
+        _enc->rc.prev_drop_count+=1+_enc->dup_count;
+        _bits=0;
+        dropped=1;
+      }
+      else{
+        ogg_uint32_t drop_count;
+        /*Update a low-pass filter to estimate the "real" frame rate taking
+           drops and duplicates into account.
+          This is only done if the frame is coded, as it needs the final
+           count of dropped frames.*/
+        drop_count=_enc->rc.prev_drop_count+1;
+        if(drop_count>0x7F)drop_count=0x7FFFFFFF; /*Saturate, don't overflow.*/
+        else drop_count<<=24;
+        _enc->rc.log_drop_scale=oc_blog64(oc_iir_filter_update(
+         &_enc->rc.vfrfilter,drop_count))-OC_Q57(24);
+        /*Initialize the drop count for this frame to the user-requested dup
+           count.
+          It will be increased if we drop more frames.*/
+        _enc->rc.prev_drop_count=_enc->dup_count;
+      }
+    }
+    /*Increment the INTER frame count, for filter adaptation purposes.*/
+    if(_enc->rc.inter_count<INT_MAX)_enc->rc.inter_count+=_qti;
+  }
+  /*Increase the drop count.*/
+  else _enc->rc.prev_drop_count+=1+_enc->dup_count;
+  /*And update the buffer fullness level (skipped for trial encodes).*/
+  if(!_trial){
+    _enc->rc.fullness+=buf_delta-_bits;
+    /*If we're too quick filling the buffer and overflow is capped,
+      that rate is lost forever.*/
+    if(_enc->rc.cap_overflow&&_enc->rc.fullness>_enc->rc.max){
+      _enc->rc.fullness=_enc->rc.max;
+    }
+    /*If we're too quick draining the buffer and underflow is capped,
+      don't try to make up that rate later.*/
+    if(_enc->rc.cap_underflow&&_enc->rc.fullness<0){
+      _enc->rc.fullness=0;
+    }
+    /*Adjust the bias for the real bits we've used.*/
+    _enc->rc.rate_bias-=_bits;
+  }
+  return dropped;
+}
+
+#define OC_RC_2PASS_VERSION   (1) /*Version number stored in the header.*/
+#define OC_RC_2PASS_HDR_SZ    (38) /*Header bytes: 4+4+3*4+2*1+2*8.*/
+#define OC_RC_2PASS_PACKET_SZ (8) /*Per-frame bytes: two 4-byte values.*/
+
+static void oc_rc_buffer_val(oc_rc_state *_rc,ogg_int64_t _val,int _bytes){
+  /*Append the low _bytes bytes of _val to the 2-pass buffer, LSB first.*/
+  for(;_bytes>0;_bytes--,_val>>=8){
+    _rc->twopass_buffer[_rc->twopass_buffer_bytes++]=(unsigned char)(_val&0xFF);
+  }
+}
+
+/*Emits the next block of pass 1 data via *_buf; returns its size in bytes.*/
+int oc_enc_rc_2pass_out(oc_enc_ctx *_enc,unsigned char **_buf){
+  if(_enc->rc.twopass_buffer_bytes==0){
+    if(_enc->rc.twopass==0){
+      /*First call: pick first-pass qi for scale calculations.*/
+      _enc->state.qis[0]=oc_enc_select_qi(_enc,0,0);
+      _enc->state.nqis=1;
+      _enc->rc.twopass=1;
+      _enc->rc.frames_total[0]=_enc->rc.frames_total[1]=
+       _enc->rc.frames_total[2]=0;
+      _enc->rc.scale_sum[0]=_enc->rc.scale_sum[1]=0;
+      /*Fill in a dummy summary: magic (bytes "OT2P"), version, then zeros.*/
+      oc_rc_buffer_val(&_enc->rc,0x5032544F,4);
+      oc_rc_buffer_val(&_enc->rc,OC_RC_2PASS_VERSION,4);
+      oc_rc_buffer_val(&_enc->rc,0,OC_RC_2PASS_HDR_SZ-8);
+    }
+    else{
+      int qti; /*Frame type of the frame whose metrics are being emitted.*/
+      qti=_enc->rc.cur_metrics.frame_type;
+      _enc->rc.scale_sum[qti]+=oc_bexp_q24(_enc->rc.cur_metrics.log_scale);
+      _enc->rc.frames_total[qti]++;
+      _enc->rc.frames_total[2]+=_enc->rc.cur_metrics.dup_count;
+      /*Cast before shifting: shifting a 1 into the sign bit of an int is UB.*/
+      oc_rc_buffer_val(&_enc->rc,
+       _enc->rc.cur_metrics.dup_count|(ogg_uint32_t)qti<<31,4);
+      oc_rc_buffer_val(&_enc->rc,_enc->rc.cur_metrics.log_scale,4);
+    }
+  }
+  else if(_enc->packet_state==OC_PACKET_DONE&&
+   _enc->rc.twopass_buffer_bytes!=OC_RC_2PASS_HDR_SZ){
+    _enc->rc.twopass_buffer_bytes=0; /*Replace the buffer with the summary.*/
+    oc_rc_buffer_val(&_enc->rc,0x5032544F,4);
+    oc_rc_buffer_val(&_enc->rc,OC_RC_2PASS_VERSION,4);
+    oc_rc_buffer_val(&_enc->rc,_enc->rc.frames_total[0],4);
+    oc_rc_buffer_val(&_enc->rc,_enc->rc.frames_total[1],4);
+    oc_rc_buffer_val(&_enc->rc,_enc->rc.frames_total[2],4);
+    oc_rc_buffer_val(&_enc->rc,_enc->rc.exp[0],1);
+    oc_rc_buffer_val(&_enc->rc,_enc->rc.exp[1],1);
+    oc_rc_buffer_val(&_enc->rc,_enc->rc.scale_sum[0],8);
+    oc_rc_buffer_val(&_enc->rc,_enc->rc.scale_sum[1],8);
+  }
+  else{
+    /*The data for this frame has already been retrieved.*/
+    *_buf=NULL;
+    return 0;
+  }
+  *_buf=_enc->rc.twopass_buffer;
+  return _enc->rc.twopass_buffer_bytes;
+}
+
+static size_t oc_rc_buffer_fill(oc_rc_state *_rc,
+ unsigned char *_buf,size_t _bytes,size_t _consumed,size_t _goal){
+  /*Copy input until the buffer holds _goal bytes or the input runs dry.*/
+  for(;_consumed<_bytes&&_rc->twopass_buffer_fill<_goal;_consumed++)
+    _rc->twopass_buffer[_rc->twopass_buffer_fill++]=_buf[_consumed];
+  return _consumed; /*The updated count of bytes taken from _buf.*/
+}
+
+static ogg_int64_t oc_rc_unbuffer_val(oc_rc_state *_rc,int _bytes){
+  ogg_int64_t val;
+  int         bi;
+  /*Reassemble a little-endian value from the next _bytes buffered bytes:
+     byte number bi supplies bits 8*bi through 8*bi+7.*/
+  val=0;
+  for(bi=0;bi<_bytes;bi++){
+    val|=(ogg_int64_t)_rc->twopass_buffer[_rc->twopass_buffer_bytes++]<<(bi<<3);
+  }
+  return val;
+}
+
+int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes){
+  size_t consumed; /*Bytes of _buf used so far; the normal return value.*/
+  consumed=0;
+  /*Enable pass 2 mode if this is the first call.*/
+  if(_enc->rc.twopass==0){
+    _enc->rc.twopass=2;
+    _enc->rc.twopass_buffer_fill=0;
+    _enc->rc.frames_total[0]=0;
+    _enc->rc.nframe_metrics=0;
+    _enc->rc.cframe_metrics=0;
+    _enc->rc.frame_metrics_head=0;
+    _enc->rc.scale_window0=0;
+    _enc->rc.scale_window_end=0;
+  }
+  /*If we haven't got a valid summary header yet, try to parse one.*/
+  if(_enc->rc.frames_total[0]==0){
+    if(!_buf){
+      int frames_needed;
+      /*No input: report the bytes still wanted; a whole-file buffer needs
+         just the first frame, otherwise as many as one per buffer slot.*/
+      frames_needed=_enc->rc.frame_metrics==NULL?1:_enc->rc.buf_delay;
+      return OC_RC_2PASS_HDR_SZ+frames_needed*OC_RC_2PASS_PACKET_SZ
+       -_enc->rc.twopass_buffer_fill;
+    }
+    consumed=oc_rc_buffer_fill(&_enc->rc,
+     _buf,_bytes,consumed,OC_RC_2PASS_HDR_SZ);
+    if(_enc->rc.twopass_buffer_fill>=OC_RC_2PASS_HDR_SZ){
+      ogg_int64_t scale_sum[2];
+      int         exp[2];
+      int         buf_delay;
+      /*Read the summary header data.*/
+      /*Check the magic value and version number.*/
+      if(oc_rc_unbuffer_val(&_enc->rc,4)!=0x5032544F||
+       oc_rc_unbuffer_val(&_enc->rc,4)!=OC_RC_2PASS_VERSION){
+        _enc->rc.twopass_buffer_bytes=0;
+        return TH_ENOTFORMAT; /*NOTE(review): twopass_buffer_fill is not reset.*/
+      }
+      _enc->rc.frames_total[0]=(ogg_uint32_t)oc_rc_unbuffer_val(&_enc->rc,4);
+      _enc->rc.frames_total[1]=(ogg_uint32_t)oc_rc_unbuffer_val(&_enc->rc,4);
+      _enc->rc.frames_total[2]=(ogg_uint32_t)oc_rc_unbuffer_val(&_enc->rc,4);
+      exp[0]=(int)oc_rc_unbuffer_val(&_enc->rc,1);
+      exp[1]=(int)oc_rc_unbuffer_val(&_enc->rc,1);
+      scale_sum[0]=oc_rc_unbuffer_val(&_enc->rc,8);
+      scale_sum[1]=oc_rc_unbuffer_val(&_enc->rc,8);
+      /*Make sure the file claims to have at least one frame.
+        Otherwise we probably got the placeholder data from an aborted pass 1.
+        Also make sure the total frame count doesn't overflow an integer.*/
+      buf_delay=_enc->rc.frames_total[0]+_enc->rc.frames_total[1]
+       +_enc->rc.frames_total[2];
+      if(_enc->rc.frames_total[0]==0||buf_delay<0||
+       (ogg_uint32_t)buf_delay<_enc->rc.frames_total[0]||
+       (ogg_uint32_t)buf_delay<_enc->rc.frames_total[1]){
+        _enc->rc.frames_total[0]=0;
+        _enc->rc.twopass_buffer_bytes=0;
+        return TH_EBADHEADER;
+      }
+      /*Got a valid header; set up pass 2.*/
+      _enc->rc.frames_left[0]=_enc->rc.frames_total[0];
+      _enc->rc.frames_left[1]=_enc->rc.frames_total[1];
+      _enc->rc.frames_left[2]=_enc->rc.frames_total[2];
+      /*If the user hasn't specified a buffer size, use the whole file.*/
+      if(_enc->rc.frame_metrics==NULL){
+        _enc->rc.buf_delay=buf_delay;
+        _enc->rc.nframes[0]=_enc->rc.frames_total[0];
+        _enc->rc.nframes[1]=_enc->rc.frames_total[1];
+        _enc->rc.nframes[2]=_enc->rc.frames_total[2];
+        _enc->rc.scale_sum[0]=scale_sum[0];
+        _enc->rc.scale_sum[1]=scale_sum[1];
+        _enc->rc.scale_window_end=buf_delay;
+        oc_enc_rc_reset(_enc);
+      }
+      _enc->rc.exp[0]=exp[0];
+      _enc->rc.exp[1]=exp[1];
+      /*Clear the header data from the buffer to make room for packet data.*/
+      _enc->rc.twopass_buffer_fill=0;
+      _enc->rc.twopass_buffer_bytes=0;
+    }
+  }
+  if(_enc->rc.frames_total[0]!=0){
+    ogg_int64_t curframe_num;
+    int         nframes_total;
+    curframe_num=_enc->state.curframe_num;
+    if(curframe_num>=0){
+      /*We just encoded a frame; make sure things matched.*/
+      if(_enc->rc.prev_metrics.dup_count!=_enc->prev_dup_count){
+        _enc->rc.twopass_buffer_bytes=0;
+        return TH_EINVAL;
+      }
+    }
+    curframe_num+=_enc->prev_dup_count+1;
+    nframes_total=_enc->rc.frames_total[0]+_enc->rc.frames_total[1]
+     +_enc->rc.frames_total[2];
+    if(curframe_num>=nframes_total){
+      /*We don't want any more data after the last frame, and we don't want to
+         allow any more frames to be encoded.*/
+      _enc->rc.twopass_buffer_bytes=0;
+    }
+    else if(_enc->rc.twopass_buffer_bytes==0){
+      if(_enc->rc.frame_metrics==NULL){
+        /*We're using a whole-file buffer: one packet per call is enough.*/
+        if(!_buf)return OC_RC_2PASS_PACKET_SZ-_enc->rc.twopass_buffer_fill;
+        consumed=oc_rc_buffer_fill(&_enc->rc,
+         _buf,_bytes,consumed,OC_RC_2PASS_PACKET_SZ);
+        if(_enc->rc.twopass_buffer_fill>=OC_RC_2PASS_PACKET_SZ){
+          ogg_uint32_t dup_count;
+          ogg_int32_t  log_scale;
+          int          qti;
+          int          arg;
+          /*Read the metrics for the next frame (bit 31 holds the frame type).*/
+          dup_count=oc_rc_unbuffer_val(&_enc->rc,4);
+          log_scale=oc_rc_unbuffer_val(&_enc->rc,4);
+          _enc->rc.cur_metrics.log_scale=log_scale;
+          qti=(dup_count&0x80000000)>>31;
+          _enc->rc.cur_metrics.dup_count=dup_count&0x7FFFFFFF;
+          _enc->rc.cur_metrics.frame_type=qti;
+          _enc->rc.twopass_force_kf=qti==OC_INTRA_FRAME;
+          /*"Helpfully" set the dup count back to what it was in pass 1.*/
+          arg=_enc->rc.cur_metrics.dup_count;
+          th_encode_ctl(_enc,TH_ENCCTL_SET_DUP_COUNT,&arg,sizeof(arg));
+          /*Clear the buffer for the next frame.*/
+          _enc->rc.twopass_buffer_fill=0;
+        }
+      }
+      else{
+        int frames_needed;
+        /*We're using a finite buffer: top up the window of frame metrics.*/
+        frames_needed=OC_CLAMPI(0,_enc->rc.buf_delay
+         -(_enc->rc.scale_window_end-_enc->rc.scale_window0),
+         _enc->rc.frames_left[0]+_enc->rc.frames_left[1]
+         -_enc->rc.nframes[0]-_enc->rc.nframes[1]);
+        while(frames_needed>0){
+          if(!_buf){
+            return OC_RC_2PASS_PACKET_SZ*frames_needed
+           -_enc->rc.twopass_buffer_fill;
+          }
+          consumed=oc_rc_buffer_fill(&_enc->rc,
+           _buf,_bytes,consumed,OC_RC_2PASS_PACKET_SZ);
+          if(_enc->rc.twopass_buffer_fill>=OC_RC_2PASS_PACKET_SZ){
+            oc_frame_metrics *m;
+            int               fmi;
+            ogg_uint32_t      dup_count;
+            ogg_int32_t       log_scale;
+            int               qti;
+            /*Read the metrics for the next frame.*/
+            dup_count=oc_rc_unbuffer_val(&_enc->rc,4);
+            log_scale=oc_rc_unbuffer_val(&_enc->rc,4);
+            /*Add them to the circular buffer (NOTE(review): no capacity check).*/
+            fmi=_enc->rc.frame_metrics_head+_enc->rc.nframe_metrics++;
+            if(fmi>=_enc->rc.cframe_metrics)fmi-=_enc->rc.cframe_metrics;
+            m=_enc->rc.frame_metrics+fmi;
+            m->log_scale=log_scale;
+            qti=(dup_count&0x80000000)>>31;
+            m->dup_count=dup_count&0x7FFFFFFF;
+            m->frame_type=qti;
+            /*And accumulate the statistics over the window.*/
+            _enc->rc.nframes[qti]++;
+            _enc->rc.nframes[2]+=m->dup_count;
+            _enc->rc.scale_sum[qti]+=oc_bexp_q24(m->log_scale);
+            _enc->rc.scale_window_end+=m->dup_count+1;
+            /*Compute an upper bound on the number of remaining packets needed
+               for the current window.*/
+            frames_needed=OC_CLAMPI(0,_enc->rc.buf_delay
+             -(_enc->rc.scale_window_end-_enc->rc.scale_window0),
+             _enc->rc.frames_left[0]+_enc->rc.frames_left[1]
+             -_enc->rc.nframes[0]-_enc->rc.nframes[1]);
+            /*Clear the buffer for the next frame.*/
+            _enc->rc.twopass_buffer_fill=0;
+            _enc->rc.twopass_buffer_bytes=0;
+          }
+          /*Go back for more data.*/
+          else break;
+        }
+        /*If we've got all the frames we need, fill in the current metrics.
+          We're ready to go.*/
+        if(frames_needed<=0){
+          int arg;
+          *&_enc->rc.cur_metrics=
+           *(_enc->rc.frame_metrics+_enc->rc.frame_metrics_head);
+          _enc->rc.twopass_force_kf=
+           _enc->rc.cur_metrics.frame_type==OC_INTRA_FRAME;
+          /*"Helpfully" set the dup count back to what it was in pass 1.*/
+          arg=_enc->rc.cur_metrics.dup_count;
+          th_encode_ctl(_enc,TH_ENCCTL_SET_DUP_COUNT,&arg,sizeof(arg));
+          /*Mark us ready for the next frame.*/
+          _enc->rc.twopass_buffer_bytes=1;
+        }
+      }
+    }
+  }
+  return (int)consumed;
+}
diff --git a/lib/state.c b/lib/state.c
new file mode 100644
index 0000000..42ed33a
--- /dev/null
+++ b/lib/state.c
@@ -0,0 +1,1227 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: state.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include "internal.h"
+#if defined(OC_X86_ASM)
+#if defined(_MSC_VER)
+# include "x86_vc/x86int.h"
+#else
+# include "x86/x86int.h"
+#endif
+#endif
+#if defined(OC_DUMP_IMAGES)
+# include <stdio.h>
+# include "png.h"
+#endif
+
+/*Looks up the fragment index of the top-left block of a macro block.
+  Callers can use the result to test whether the whole macro block is valid.
+  _sb_map: The super block map.
+  _quadi:  The quadrant number.
+  Return: That block's fragment index, or -1 if it is outside the coded frame.*/
+static ptrdiff_t oc_sb_quad_top_left_frag(oc_sb_map_quad _sb_map[4],int _quadi){
+  int bi;
+  /*Under the Hilbert curve ordering the top-left block is block 0 in
+     quadrants 0, 1, and 2, and block 2 in quadrant 3.*/
+  bi=_quadi&(_quadi<<1);
+  return _sb_map[_quadi][bi];
+}
+
+/*Builds the super block to fragment index maps for a single color plane.
+  Each super block covers a 4x4 group of fragments; positions that fall
+   outside the plane keep a fragment index of -1.
+  The "valid" flag of each quadrant in the super block flags is set as well.
+  _sb_maps:  The array of super block maps for the color plane.
+  _sb_flags: The array of super block flags for the color plane.
+  _frag0:    The index of the first fragment in the plane.
+  _hfrags:   The number of horizontal fragments in a coded frame.
+  _vfrags:   The number of vertical fragments in a coded frame.*/
+static void oc_sb_create_plane_mapping(oc_sb_map _sb_maps[],
+ oc_sb_flags _sb_flags[],ptrdiff_t _frag0,int _hfrags,int _vfrags){
+  /*Gives the (macro_block,block) indices for each position in a 4x4 grid of
+     fragments, indexed by row and then column.
+    The pattern is a 4x4 Hilbert space-filling curve.
+    A Hilbert curve has the nice property that as the curve grows larger, its
+     fractal dimension approaches 2.
+    The intuition is that nearby blocks in the curve are also close spatially,
+     with the previous element always an immediate neighbor, so that runs of
+     blocks should be well correlated.*/
+  static const int SB_MAP[4][4][2]={
+    {{0,0},{0,1},{3,2},{3,3}},
+    {{0,3},{0,2},{3,1},{3,0}},
+    {{1,0},{1,3},{2,0},{2,3}},
+    {{1,1},{1,2},{2,1},{2,2}}
+  };
+  ptrdiff_t  row_frag0;
+  unsigned   sbi;
+  int        sby;
+  sbi=0;
+  row_frag0=_frag0;
+  for(sby=0;sby<_vfrags;sby+=4){
+    int nrows;
+    int sbx;
+    /*Figure out how many rows of blocks in this super block lie within the
+       image; the loop condition guarantees there is at least one.*/
+    nrows=_vfrags-sby;
+    if(nrows>4)nrows=4;
+    for(sbx=0;sbx<_hfrags;sbx+=4,sbi++){
+      ptrdiff_t fragi;
+      int       ncols;
+      int       quadi;
+      int       i;
+      /*Figure out how many columns of blocks in this super block lie within
+         the image.*/
+      ncols=_hfrags-sbx;
+      if(ncols>4)ncols=4;
+      /*By default, set all fragment indices to -1.*/
+      memset(_sb_maps[sbi][0],0xFF,sizeof(_sb_maps[sbi]));
+      /*Fill in the fragment map for this super block.*/
+      fragi=row_frag0+sbx;
+      for(i=0;i<nrows;i++){
+        int j;
+        for(j=0;j<ncols;j++){
+          _sb_maps[sbi][SB_MAP[i][j][0]][SB_MAP[i][j][1]]=fragi+j;
+        }
+        fragi+=_hfrags;
+      }
+      /*Mark which quadrants of this super block lie within the image.
+        Bits are OR'd in; any that are already set are left alone.*/
+      for(quadi=0;quadi<4;quadi++){
+        _sb_flags[sbi].quad_valid|=
+         (oc_sb_quad_top_left_frag(_sb_maps[sbi],quadi)>=0)<<quadi;
+      }
+    }
+    /*Advance to the first fragment of the next row of super blocks.*/
+    row_frag0+=_hfrags<<2;
+  }
+}
+
+/*Computes the four luma (plane 0) fragment indices of a macro block from the
+   fragment coordinates of its upper-left hand corner.
+  _mb_map: The macro block map to fill.
+  _fplane: The description of the Y plane.
+  _xfrag0: The X location of the upper-left hand fragment in the luma plane.
+  _yfrag0: The Y location of the upper-left hand fragment in the luma plane.*/
+static void oc_mb_fill_ymapping(oc_mb_map_plane _mb_map[3],
+ const oc_fragment_plane *_fplane,int _xfrag0,int _yfrag0){
+  int bi;
+  /*Block bi lies in row bi>>1 and column bi&1 of the macro block.*/
+  for(bi=0;bi<4;bi++){
+    _mb_map[0][bi]=(_yfrag0+(bi>>1))*(ptrdiff_t)_fplane->nhfrags+_xfrag0+(bi&1);
+  }
+}
+
+/*Fills in the chroma plane fragment maps for a macro block.
+  This version is for use with chroma decimated in the X and Y directions
+   (4:2:0); it writes a single fragment index per chroma plane.
+  _mb_map:  The macro block map to fill.
+  _fplanes: The descriptions of the fragment planes.
+  _xfrag0:  The X location of the upper-left hand fragment in the luma plane.
+  _yfrag0:  The Y location of the upper-left hand fragment in the luma plane.*/
+static void oc_mb_fill_cmapping00(oc_mb_map_plane _mb_map[3],
+ const oc_fragment_plane _fplanes[3],int _xfrag0,int _yfrag0){
+  ptrdiff_t cfragi;
+  int       pli;
+  /*Halve both luma coordinates to get the plane-relative chroma fragment.*/
+  cfragi=(_yfrag0>>1)*(ptrdiff_t)_fplanes[1].nhfrags+(_xfrag0>>1);
+  /*Both chroma planes use plane 1's width, offset by their own froffset.*/
+  for(pli=1;pli<3;pli++)_mb_map[pli][0]=cfragi+_fplanes[pli].froffset;
+}
+
+/*Fills in the chroma plane fragment maps for a macro block.
+  This version is for use with chroma decimated in the Y direction only.
+  _mb_map:  The macro block map to fill.
+  _fplanes: The descriptions of the fragment planes.
+  _xfrag0:  The X location of the upper-left hand fragment in the luma plane.
+  _yfrag0:  The Y location of the upper-left hand fragment in the luma plane.*/
+static void oc_mb_fill_cmapping01(oc_mb_map_plane _mb_map[3],
+ const oc_fragment_plane _fplanes[3],int _xfrag0,int _yfrag0){
+  ptrdiff_t cfragi;
+  int       pli;
+  /*Only the row is halved; both columns of the macro block are kept.*/
+  cfragi=(_yfrag0>>1)*(ptrdiff_t)_fplanes[1].nhfrags+_xfrag0;
+  for(pli=1;pli<3;pli++){
+    /*Blocks 0 and 1 are horizontally adjacent fragments.*/
+    _mb_map[pli][0]=cfragi+_fplanes[pli].froffset;
+    _mb_map[pli][1]=cfragi+1+_fplanes[pli].froffset;
+  }
+}
+
+/*Fills in the chroma plane fragment maps for a macro block.
+  This version is for use with chroma decimated in the X direction (4:2:2).
+  _mb_map:  The macro block map to fill.
+  _fplanes: The descriptions of the fragment planes.
+  _xfrag0:  The X location of the upper-left hand fragment in the luma plane.
+  _yfrag0:  The Y location of the upper-left hand fragment in the luma plane.*/
+static void oc_mb_fill_cmapping10(oc_mb_map_plane _mb_map[3],
+ const oc_fragment_plane _fplanes[3],int _xfrag0,int _yfrag0){
+  ptrdiff_t cfragi;
+  int       pli;
+  /*Only the column is halved; both rows of the macro block are kept.*/
+  cfragi=_yfrag0*(ptrdiff_t)_fplanes[1].nhfrags+(_xfrag0>>1);
+  for(pli=1;pli<3;pli++){
+    /*Blocks 0 and 2 are vertically adjacent: one chroma row apart.*/
+    _mb_map[pli][0]=cfragi+_fplanes[pli].froffset;
+    _mb_map[pli][2]=cfragi+_fplanes[1].nhfrags+_fplanes[pli].froffset;
+  }
+}
+
+/*Fills in the chroma plane fragment maps for a macro block.
+  This version is for use with no chroma decimation (4:4:4).
+  This uses the already filled-in luma plane values.
+  _mb_map:  The macro block map to fill.
+  _fplanes: The descriptions of the fragment planes.
+  _xfrag0, _yfrag0: Ignored; taken only so that the signature matches
+   oc_mb_fill_cmapping_func and the table below needs no function cast.*/
+static void oc_mb_fill_cmapping11(oc_mb_map_plane _mb_map[3],
+ const oc_fragment_plane _fplanes[3],int _xfrag0,int _yfrag0){
+  int k;
+  (void)_xfrag0;(void)_yfrag0;
+  for(k=0;k<4;k++){
+    _mb_map[1][k]=_mb_map[0][k]+_fplanes[1].froffset;
+    _mb_map[2][k]=_mb_map[0][k]+_fplanes[2].froffset;
+  }
+}
+
+/*The function type used to fill in the chroma plane fragment maps for a
+   macro block.
+  _mb_map:  The macro block map to fill.
+  _fplanes: The descriptions of the fragment planes.
+  _xfrag0:  The X location of the upper-left hand fragment in the luma plane.
+  _yfrag0:  The Y location of the upper-left hand fragment in the luma plane.*/
+typedef void (*oc_mb_fill_cmapping_func)(oc_mb_map_plane _mb_map[3],
+ const oc_fragment_plane _fplanes[3],int _xfrag0,int _yfrag0);
+
+/*A table of functions used to fill in the chroma plane fragment maps for a
+   macro block for each type of chrominance decimation.*/
+static const oc_mb_fill_cmapping_func OC_MB_FILL_CMAPPING_TABLE[4]={
+  oc_mb_fill_cmapping00,oc_mb_fill_cmapping01,
+  oc_mb_fill_cmapping10,oc_mb_fill_cmapping11
+};
+
+/*Builds, for every macro block, the table of fragment indices it covers in
+   each of the three planes.
+  _mb_maps:   The list of macro block maps.
+  _mb_modes:  The list of macro block modes; entries for macro blocks that
+               start outside the coded region are set to OC_MODE_INVALID.
+  _fplanes:   The descriptions of the fragment planes.
+  _pixel_fmt: The chroma decimation type; indexes OC_MB_FILL_CMAPPING_TABLE.*/
+static void oc_mb_create_mapping(oc_mb_map _mb_maps[],
+ signed char _mb_modes[],const oc_fragment_plane _fplanes[3],int _pixel_fmt){
+  oc_mb_fill_cmapping_func  fill_chroma;
+  unsigned                  sbi;
+  int                       sby;
+  /*Select the chroma mapping routine matching the pixel format.*/
+  fill_chroma=OC_MB_FILL_CMAPPING_TABLE[_pixel_fmt];
+  sbi=0;
+  /*Loop through the luma plane super blocks.*/
+  for(sby=0;sby<_fplanes[0].nvfrags;sby+=4){
+    int sbx;
+    for(sbx=0;sbx<_fplanes[0].nhfrags;sbx+=4,sbi++){
+      int quadi;
+      /*Loop through the macro blocks in each super block in display order:
+         quadi>>1 is the macro block row, quadi&1 the column.*/
+      for(quadi=0;quadi<4;quadi++){
+        unsigned mbi;
+        int      mbx;
+        int      mby;
+        /*Combine the super block index with the OC_MB_MAP entry.*/
+        mbi=sbi<<2|OC_MB_MAP[quadi>>1][quadi&1];
+        /*The fragment coordinates of the macro block's upper-left corner.*/
+        mbx=sbx|(quadi&1)<<1;
+        mby=sby|(quadi>>1)<<1;
+        /*Initialize fragment indices to -1.*/
+        memset(_mb_maps[mbi],0xFF,sizeof(_mb_maps[mbi]));
+        if(mbx<_fplanes[0].nhfrags&&mby<_fplanes[0].nvfrags){
+          /*Inside the coded region: fill in luma, then chroma indices.*/
+          oc_mb_fill_ymapping(_mb_maps[mbi],_fplanes,mbx,mby);
+          (*fill_chroma)(_mb_maps[mbi],_fplanes,mbx,mby);
+        }
+        /*Otherwise mark the macro block as unusable.*/
+        else _mb_modes[mbi]=OC_MODE_INVALID;
+      }
+    }
+  }
+}
+
+/*Classifies every fragment against the displayable region of the frame:
+   those entirely outside it are flagged invalid, and those cut by its edge
+   are given a border mask recording which of their pixels are displayable.
+  _state: The Theora state containing the fragments to be marked.*/
+static void oc_state_border_init(oc_theora_state *_state){
+  oc_fragment       *frag;
+  oc_fragment       *plane_end;
+  oc_fragment       *row_end;
+  oc_fragment_plane *fplane;
+  int                x0;
+  int                y0;
+  int                x1;
+  int                y1;
+  int                pli;
+  int                fy;
+  int                fx;
+  /*This is a brute-force scan, but it is trivially correct for every special
+     case and only ever has to run once.*/
+  _state->nborders=0;
+  /*frag advances continuously through the fragments of all three planes.*/
+  frag=_state->frags;
+  plane_end=frag;
+  for(pli=0;pli<3;pli++){
+    fplane=_state->fplanes+pli;
+    /*The cropping rectangle [x0,x1) x [y0,y1), in this plane's pixels.*/
+    x0=_state->info.pic_x;
+    x1=_state->info.pic_x+_state->info.pic_width;
+    y0=_state->info.pic_y;
+    y1=_state->info.pic_y+_state->info.pic_height;
+    /*Decimated chroma directions halve the rectangle, rounding outwards.*/
+    if(pli>0&&!(_state->info.pixel_fmt&1)){
+      x0>>=1;
+      x1=x1+1>>1;
+    }
+    if(pli>0&&!(_state->info.pixel_fmt&2)){
+      y0>>=1;
+      y1=y1+1>>1;
+    }
+    /*(fx,fy) is the pixel position of the current fragment's first pixel;
+       fragments are 8x8 and are visited in raster order.*/
+    plane_end+=fplane->nfrags;
+    for(fy=0;frag<plane_end;fy+=8){
+      row_end=frag+fplane->nhfrags;
+      for(fx=0;frag<row_end;frag++,fx+=8){
+        int outside;
+        int straddles;
+        /*A fragment is outside when it shares no pixel with the rectangle.
+          An empty rectangle puts every fragment outside, which guarantees
+           that one counted as straddling below has a displayable pixel.*/
+        outside=fx+8<=x0||x1<=fx||fy+8<=y0||y1<=fy||x0>=x1||y0>=y1;
+        /*It straddles the border when an edge falls strictly inside it.*/
+        straddles=fx<x0&&x0<fx+8||fx<x1&&x1<fx+8||
+         fy<y0&&y0<fy+8||fy<y1&&y1<fy+8;
+        if(outside)frag->invalid=1;
+        else if(!straddles)frag->borderi=-1;
+        else{
+          ogg_int64_t mask;
+          int         npixels;
+          int         bi;
+          int         px;
+          int         py;
+          /*Mask bit 8*row+column is set for each displayable pixel.*/
+          mask=0;
+          npixels=0;
+          for(py=0;py<8;py++){
+            if(fy+py<y0||fy+py>=y1)continue;
+            for(px=0;px<8;px++){
+              if(fx+px<x0||fx+px>=x1)continue;
+              mask|=(ogg_int64_t)1<<(py<<3|px);
+              npixels++;
+            }
+          }
+          /*Reuse the border info with the same pattern if there is one, and
+             otherwise append a new entry.
+            In general, there will be at most 8 different patterns (per
+             plane).*/
+          bi=0;
+          while(bi<_state->nborders&&_state->borders[bi].mask!=mask)bi++;
+          if(bi>=_state->nborders){
+            _state->borders[bi].mask=mask;
+            _state->borders[bi].npixels=npixels;
+            _state->nborders++;
+          }
+          frag->borderi=bi;
+        }
+      }
+    }
+  }
+}
+
+/*Allocates the fragment, super block and macro block arrays of the state and
+   fills in the block mappings and border info.
+  Return: 0 on success, TH_EIMPL on size overflow, or TH_EFAULT if allocation
+   fails (partial allocations are left for oc_state_frarray_clear()).*/
+static int oc_state_frarray_init(oc_theora_state *_state){
+  oc_fragment_plane *fplane;
+  ptrdiff_t          yfrags;
+  ptrdiff_t          cfrags;
+  ptrdiff_t          nfrags;
+  ptrdiff_t          froffset;
+  size_t             nmbs;
+  unsigned           yhsbs;
+  unsigned           yvsbs;
+  unsigned           chsbs;
+  unsigned           cvsbs;
+  unsigned           ysbs;
+  unsigned           csbs;
+  unsigned           nsbs;
+  unsigned           sboffset;
+  int                yhfrags;
+  int                yvfrags;
+  int                chfrags;
+  int                cvfrags;
+  int                hdec;
+  int                vdec;
+  int                pli;
+  /*Count the fragments in each plane; the frame size has already been
+     validated to be a multiple of 16.*/
+  hdec=!(_state->info.pixel_fmt&1);
+  vdec=!(_state->info.pixel_fmt&2);
+  yhfrags=_state->info.frame_width>>3;
+  yvfrags=_state->info.frame_height>>3;
+  chfrags=yhfrags+hdec>>hdec;
+  cvfrags=yvfrags+vdec>>vdec;
+  yfrags=yhfrags*(ptrdiff_t)yvfrags;
+  cfrags=chfrags*(ptrdiff_t)cvfrags;
+  nfrags=yfrags+2*cfrags;
+  /*Count the super blocks in each plane, and the (luma) macro blocks.*/
+  yhsbs=yhfrags+3>>2;
+  yvsbs=yvfrags+3>>2;
+  chsbs=chfrags+3>>2;
+  cvsbs=cvfrags+3>>2;
+  ysbs=yhsbs*yvsbs;
+  csbs=chsbs*cvsbs;
+  nsbs=ysbs+2*csbs;
+  nmbs=(size_t)ysbs<<2;
+  /*Check for overflow.
+    The specification's upper limits (1048560 by 1048560, or 3 TB frames) are
+     supported with 64-bit pointers, but not with 32-bit (or smaller) ones.
+    Callers wanting a saner cap on allocation sizes (to prevent denial of
+     service) must impose it themselves; we cannot pick one portably.*/
+  if(yfrags/yhfrags!=yvfrags||2*cfrags<cfrags||nfrags<yfrags||
+   ysbs/yhsbs!=yvsbs||2*csbs<csbs||nsbs<ysbs||nmbs>>2!=ysbs){
+    return TH_EIMPL;
+  }
+  /*Describe the three fragment planes, laid out back to back.*/
+  froffset=0;
+  sboffset=0;
+  for(pli=0;pli<3;pli++){
+    fplane=_state->fplanes+pli;
+    fplane->nhfrags=pli>0?chfrags:yhfrags;
+    fplane->nvfrags=pli>0?cvfrags:yvfrags;
+    fplane->froffset=froffset;
+    fplane->nfrags=pli>0?cfrags:yfrags;
+    fplane->nhsbs=pli>0?chsbs:yhsbs;
+    fplane->nvsbs=pli>0?cvsbs:yvsbs;
+    fplane->sboffset=sboffset;
+    fplane->nsbs=pli>0?csbs:ysbs;
+    froffset+=pli>0?cfrags:yfrags;
+    sboffset+=pli>0?csbs:ysbs;
+  }
+  _state->nfrags=nfrags;
+  _state->nsbs=nsbs;
+  _state->nmbs=nmbs;
+  _state->nhmbs=yhsbs<<1;
+  _state->nvmbs=yvsbs<<1;
+  /*Allocate every array up front and test them all at once.*/
+  _state->frags=_ogg_calloc(nfrags,sizeof(*_state->frags));
+  _state->frag_mvs=_ogg_malloc(nfrags*sizeof(*_state->frag_mvs));
+  _state->sb_maps=_ogg_malloc(nsbs*sizeof(*_state->sb_maps));
+  _state->sb_flags=_ogg_calloc(nsbs,sizeof(*_state->sb_flags));
+  _state->mb_maps=_ogg_calloc(nmbs,sizeof(*_state->mb_maps));
+  _state->mb_modes=_ogg_calloc(nmbs,sizeof(*_state->mb_modes));
+  _state->coded_fragis=_ogg_malloc(nfrags*sizeof(*_state->coded_fragis));
+  if(!_state->frags||!_state->frag_mvs||!_state->sb_maps||
+   !_state->sb_flags||!_state->mb_maps||!_state->mb_modes||
+   !_state->coded_fragis){
+    return TH_EFAULT;
+  }
+  /*Create the mapping from super blocks to fragments.*/
+  for(pli=0;pli<3;pli++){
+    fplane=_state->fplanes+pli;
+    oc_sb_create_plane_mapping(_state->sb_maps+fplane->sboffset,
+     _state->sb_flags+fplane->sboffset,fplane->froffset,
+     fplane->nhfrags,fplane->nvfrags);
+  }
+  /*Create the mapping from macro blocks to fragments.*/
+  oc_mb_create_mapping(_state->mb_maps,_state->mb_modes,
+   _state->fplanes,_state->info.pixel_fmt);
+  /*Initialize the invalid and borderi fields of each fragment.*/
+  oc_state_border_init(_state);
+  return 0;
+}
+
+/*Frees the arrays allocated by oc_state_frarray_init().*/
+static void oc_state_frarray_clear(oc_theora_state *_state){
+  _ogg_free(_state->coded_fragis);
+  _ogg_free(_state->mb_modes);
+  _ogg_free(_state->mb_maps);
+  _ogg_free(_state->sb_flags);
+  _ogg_free(_state->sb_maps);
+  _ogg_free(_state->frag_mvs);
+  _ogg_free(_state->frags);
+}
+
+/*Initializes the buffers used for reconstructed frames.
+  These buffers are padded with 16 extra pixels on each side, to allow
+   unrestricted motion vectors without special casing the boundary.
+  If chroma is decimated in either direction, the padding is reduced by a
+   factor of 2 on the appropriate sides.
+  _nrefs: The number of reference buffers to init; must be 3 or 4.
+  Return: 0 on success, TH_EINVAL for a bad _nrefs, TH_EIMPL on size overflow,
+   or TH_EFAULT if allocation fails; on failure _state owns no new memory.*/
+static int oc_state_ref_bufs_init(oc_theora_state *_state,int _nrefs){
+  th_info       *info;
+  unsigned char *ref_frame_data;
+  size_t         ref_frame_data_sz;
+  size_t         ref_frame_sz;
+  size_t         yplane_sz;
+  size_t         cplane_sz;
+  int            yhstride;
+  int            yheight;
+  int            chstride;
+  int            cheight;
+  ptrdiff_t      yoffset;
+  ptrdiff_t      coffset;
+  ptrdiff_t     *frag_buf_offs;
+  ptrdiff_t      fragi;
+  int            hdec;
+  int            vdec;
+  int            rfi;
+  int            pli;
+  if(_nrefs<3||_nrefs>4)return TH_EINVAL;
+  info=&_state->info;
+  /*Compute the image buffer parameters for each plane.*/
+  hdec=!(info->pixel_fmt&1);
+  vdec=!(info->pixel_fmt&2);
+  yhstride=info->frame_width+2*OC_UMV_PADDING;
+  yheight=info->frame_height+2*OC_UMV_PADDING;
+  chstride=yhstride>>hdec;
+  cheight=yheight>>vdec;
+  yplane_sz=yhstride*(size_t)yheight;
+  cplane_sz=chstride*(size_t)cheight;
+  yoffset=OC_UMV_PADDING+OC_UMV_PADDING*(ptrdiff_t)yhstride;
+  coffset=(OC_UMV_PADDING>>hdec)+(OC_UMV_PADDING>>vdec)*(ptrdiff_t)chstride;
+  ref_frame_sz=yplane_sz+2*cplane_sz;
+  ref_frame_data_sz=_nrefs*ref_frame_sz;
+  /*Check for overflow; see the caveats in oc_state_frarray_init().*/
+  if(yplane_sz/yhstride!=yheight||2*cplane_sz<cplane_sz||
+   ref_frame_sz<yplane_sz||ref_frame_data_sz/_nrefs!=ref_frame_sz){
+    return TH_EIMPL;
+  }
+  /*Store into _state only on success, so it never holds a freed pointer.*/
+  ref_frame_data=_ogg_malloc(ref_frame_data_sz);
+  frag_buf_offs=_ogg_malloc(_state->nfrags*sizeof(*frag_buf_offs));
+  if(ref_frame_data==NULL||frag_buf_offs==NULL){
+    _ogg_free(frag_buf_offs);
+    _ogg_free(ref_frame_data);
+    return TH_EFAULT;
+  }
+  _state->frag_buf_offs=frag_buf_offs;
+  /*Set up the width, height and stride for the image buffers.*/
+  _state->ref_frame_bufs[0][0].width=info->frame_width;
+  _state->ref_frame_bufs[0][0].height=info->frame_height;
+  _state->ref_frame_bufs[0][0].stride=yhstride;
+  _state->ref_frame_bufs[0][1].width=_state->ref_frame_bufs[0][2].width=
+   info->frame_width>>hdec;
+  _state->ref_frame_bufs[0][1].height=_state->ref_frame_bufs[0][2].height=
+   info->frame_height>>vdec;
+  _state->ref_frame_bufs[0][1].stride=_state->ref_frame_bufs[0][2].stride=
+   chstride;
+  for(rfi=1;rfi<_nrefs;rfi++){
+    memcpy(_state->ref_frame_bufs[rfi],_state->ref_frame_bufs[0],
+     sizeof(_state->ref_frame_bufs[0]));
+  }
+  /*Set up the data pointers for the image buffers.*/
+  for(rfi=0;rfi<_nrefs;rfi++){
+    _state->ref_frame_data[rfi]=ref_frame_data;
+    _state->ref_frame_bufs[rfi][0].data=ref_frame_data+yoffset;
+    ref_frame_data+=yplane_sz;
+    _state->ref_frame_bufs[rfi][1].data=ref_frame_data+coffset;
+    ref_frame_data+=cplane_sz;
+    _state->ref_frame_bufs[rfi][2].data=ref_frame_data+coffset;
+    ref_frame_data+=cplane_sz;
+    /*Flip the buffer upside down.
+      This allows us to decode Theora's bottom-up frames in their natural
+       order, yet return a top-down buffer with a positive stride to the user.*/
+    oc_ycbcr_buffer_flip(_state->ref_frame_bufs[rfi],
+     _state->ref_frame_bufs[rfi]);
+  }
+  _state->ref_ystride[0]=-yhstride;
+  _state->ref_ystride[1]=_state->ref_ystride[2]=-chstride;
+  /*Initialize the fragment buffer offsets.*/
+  ref_frame_data=_state->ref_frame_data[0];
+  fragi=0;
+  for(pli=0;pli<3;pli++){
+    th_img_plane      *iplane;
+    oc_fragment_plane *fplane;
+    unsigned char     *vpix;
+    ptrdiff_t          stride;
+    ptrdiff_t          vfragi_end;
+    iplane=_state->ref_frame_bufs[0]+pli;
+    fplane=_state->fplanes+pli;
+    vpix=iplane->data;
+    vfragi_end=fplane->froffset+fplane->nfrags;
+    stride=iplane->stride;
+    while(fragi<vfragi_end){
+      ptrdiff_t      hfragi_end;
+      unsigned char *hpix;
+      hpix=vpix;
+      for(hfragi_end=fragi+fplane->nhfrags;fragi<hfragi_end;fragi++){
+        frag_buf_offs[fragi]=hpix-ref_frame_data;
+        hpix+=8;
+      }
+      vpix+=stride<<3;
+    }
+  }
+  /*Initialize the reference frame indices.*/
+  _state->ref_frame_idx[OC_FRAME_GOLD]=
+   _state->ref_frame_idx[OC_FRAME_PREV]=
+   _state->ref_frame_idx[OC_FRAME_SELF]=-1;
+  _state->ref_frame_idx[OC_FRAME_IO]=_nrefs>3?3:-1;
+  return 0;
+}
+
+static void oc_state_ref_bufs_clear(oc_theora_state *_state){
+  _ogg_free(_state->frag_buf_offs);/*The fragment buffer offset table.*/
+  _ogg_free(_state->ref_frame_data[0]);/*The one block holding all frames.*/
+}
+
+/*Installs the C versions of the routines that have accelerated variants.*/
+void oc_state_vtable_init_c(oc_theora_state *_state){
+  _state->opt_vtable.frag_copy=oc_frag_copy_c;
+  _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
+  _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
+  _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c;
+  _state->opt_vtable.idct8x8=oc_idct8x8_c;
+  _state->opt_vtable.state_frag_recon=oc_state_frag_recon_c;
+  _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_c;
+  _state->opt_vtable.state_loop_filter_frag_rows=
+   oc_state_loop_filter_frag_rows_c;
+  _state->opt_vtable.restore_fpu=oc_restore_fpu_c;
+  _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG;
+}
+
+/*Initialize the accelerated function pointers: the x86 versions when built
+   with OC_X86_ASM defined, and the C versions otherwise.*/
+void oc_state_vtable_init(oc_theora_state *_state){
+#if defined(OC_X86_ASM)
+  oc_state_vtable_init_x86(_state);
+#else
+  oc_state_vtable_init_c(_state);
+#endif
+}
+
+/*Initializes a codec state from the given stream parameters.
+  _info:  The stream description; it is validated and copied into the state.
+  _nrefs: The number of reference buffers to allocate; must be 3 or 4.
+  Return: 0 on success, TH_EFAULT if _info is NULL or allocation fails,
+   TH_EINVAL for invalid parameters, or TH_EIMPL if the frame is too large.*/
+int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs){
+  int ret;
+  /*First validate the parameters.*/
+  if(_info==NULL)return TH_EFAULT;
+  /*The encoded frame size must be a non-zero multiple of 16 that, divided by
+     16, fits in 16 bits, and the picture region must lie inside the frame.
+    The region tests use subtraction so that oversized values cannot wrap
+     around and pass, as they could if the offset and size were added.
+    The picture offsets must fit in 8 bits; the API measures the Y offset
+     from the opposite side to the bitstream, whose Y axis is flipped.
+    The color space and pixel format must be ones known to the library.*/
+  if((_info->frame_width&0xF)||(_info->frame_height&0xF)||
+   _info->frame_width<=0||_info->frame_width>=0x100000||
+   _info->frame_height<=0||_info->frame_height>=0x100000||
+   _info->pic_width>_info->frame_width||
+   _info->pic_x>_info->frame_width-_info->pic_width||
+   _info->pic_height>_info->frame_height||
+   _info->pic_y>_info->frame_height-_info->pic_height||
+   _info->pic_x>255||_info->frame_height-_info->pic_height-_info->pic_y>255||
+   /*Note: the <0 comparisons below may draw spurious warnings where enums
+      are unsigned, but casting to unsigned and relying on the >= test alone
+      is mis-optimized by a number of compilers.*/
+   _info->colorspace<0||_info->colorspace>=TH_CS_NSPACES||
+   _info->pixel_fmt<0||_info->pixel_fmt>=TH_PF_NFORMATS){
+    return TH_EINVAL;
+  }
+  memset(_state,0,sizeof(*_state));
+  memcpy(&_state->info,_info,sizeof(*_info));
+  /*Flip pic_y to match Theora's right-handed coordinate system.*/
+  _state->info.pic_y=_info->frame_height-_info->pic_height-_info->pic_y;
+  _state->frame_type=OC_UNKWN_FRAME;
+  oc_state_vtable_init(_state);
+  ret=oc_state_frarray_init(_state);
+  if(ret>=0)ret=oc_state_ref_bufs_init(_state,_nrefs);
+  if(ret<0){
+    oc_state_frarray_clear(_state);
+    return ret;
+  }
+  /*Clamp an out-of-range keyframe_granule_shift to the maximum allowed.*/
+  if(_info->keyframe_granule_shift<0||_info->keyframe_granule_shift>31){
+    _state->info.keyframe_granule_shift=31;
+  }
+  _state->keyframe_num=0;
+  _state->curframe_num=-1;
+  /*3.2.0 streams mark the frame index instead of the frame count.
+    This changed with stream version 3.2.1 to conform to other Ogg codecs, so
+     an extra bias is added when computing granule positions for new streams.*/
+  _state->granpos_bias=TH_VERSION_CHECK(_info,3,2,1);
+  return 0;
+}
+
+/*Frees the reference frame buffers and the fragment arrays of a state.*/
+void oc_state_clear(oc_theora_state *_state){
+  oc_state_ref_bufs_clear(_state);
+  oc_state_frarray_clear(_state);
+}
+
+/*Extends the left and right edges of an image plane out into the
+   surrounding padding for use by unrestricted motion vectors.
+  Only the rows in the range given are padded, and the padding above and
+   below the plane is not touched.
+  _refi: The index of the reference buffer to pad.
+  _pli:  The color plane.
+  _y0:   The Y coordinate of the first row to pad.
+  _yend: The Y coordinate of the row to stop padding at.*/
+void oc_state_borders_fill_rows(oc_theora_state *_state,int _refi,int _pli,
+ int _y0,int _yend){
+  th_img_plane  *plane;
+  unsigned char *row;
+  unsigned char *row_end;
+  ptrdiff_t      stride;
+  int            pad;
+  plane=_state->ref_frame_bufs[_refi]+_pli;
+  stride=plane->stride;
+  /*Horizontally decimated chroma planes carry half the padding.*/
+  pad=OC_UMV_PADDING>>(_pli!=0&&!(_state->info.pixel_fmt&1));
+  row_end=plane->data+_yend*stride;
+  /*Note the use of != instead of <, which allows the stride to be negative.*/
+  for(row=plane->data+_y0*stride;row!=row_end;row+=stride){
+    unsigned char *last;
+    last=row+plane->width-1;
+    /*Replicate the first pixel of the row out to its left...*/
+    memset(row-pad,row[0],pad);
+    /*...and the last pixel of the row out to its right.*/
+    memset(last+1,last[0],pad);
+  }
+}
+
+/*Extends the top and bottom rows of an image plane out into the surrounding
+   padding for use by unrestricted motion vectors.
+  The rows copied include the left and right padding, so this must be called
+   after the left and right borders are added.
+  _refi:      The index of the reference buffer to pad.
+  _pli:       The color plane.*/
+void oc_state_borders_fill_caps(oc_theora_state *_state,int _refi,int _pli){
+  th_img_plane  *plane;
+  unsigned char *top;
+  unsigned char *bot;
+  unsigned char *top_end;
+  ptrdiff_t      stride;
+  int            hpad;
+  int            vpad;
+  int            row_sz;
+  plane=_state->ref_frame_bufs[_refi]+_pli;
+  stride=plane->stride;
+  /*Decimated chroma directions carry half the padding.*/
+  hpad=OC_UMV_PADDING>>(_pli!=0&&!(_state->info.pixel_fmt&1));
+  vpad=OC_UMV_PADDING>>(_pli!=0&&!(_state->info.pixel_fmt&2));
+  row_sz=plane->width+(hpad<<1);
+  top=plane->data-hpad;
+  bot=top+(plane->height-1)*stride;
+  top_end=top-stride*vpad;
+  /*Each pass copies the outermost rows one row further out.*/
+  for(;top!=top_end;top-=stride,bot+=stride){
+    memcpy(top-stride,top,row_sz);
+    memcpy(bot+stride,bot,row_sz);
+  }
+}
+
+/*Fills in the padding on all four sides of every plane of a reference frame.
+  _state: The context containing the reference buffers.
+  _refi:  The index of the reference buffer to pad.*/
+void oc_state_borders_fill(oc_theora_state *_state,int _refi){
+  th_img_plane *planes;
+  int           pli;
+  planes=_state->ref_frame_bufs[_refi];
+  for(pli=0;pli<3;pli++){
+    oc_state_borders_fill_rows(_state,_refi,pli,0,planes[pli].height);
+    oc_state_borders_fill_caps(_state,_refi,pli);
+  }
+}
+
+/*Determines the offsets in an image buffer to use for motion compensation.
+  _state:   The Theora state the offsets are to be computed with.
+  _offsets: Returns the offset for the buffer(s).
+            _offsets[0] is always set.
+            _offsets[1] is set if the motion vector has non-zero fractional
+             components.
+  _pli:     The color plane index.
+  _dx:      The X component of the motion vector, in the range -31...31.
+  _dy:      The Y component of the motion vector, in the range -31...31.
+  Return: The number of offsets returned: 1 or 2.*/
+int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
+ int _pli,int _dx,int _dy){
+  /*Here is a brief description of how Theora handles motion vectors:
+    Motion vector components are specified to half-pixel accuracy in
+     undecimated directions of each plane, and quarter-pixel accuracy in
+     decimated directions.
+    Integer parts are extracted by dividing (not shifting) by the
+     appropriate amount, with truncation towards zero.
+    These integer values are used to calculate the first offset.
+
+    If either of the fractional parts are non-zero, then a second offset is
+     computed.
+    No third or fourth offsets are computed, even if both components have
+     non-zero fractional parts.
+    The second offset is computed by dividing (not shifting) by the
+     appropriate amount, always truncating _away_ from zero.*/
+#if 0
+  /*This version of the code doesn't use any tables, but is slower.*/
+  int ystride;
+  int xprec;
+  int yprec;
+  int xfrac;
+  int yfrac;
+  int offs;
+  ystride=_state->ref_ystride[_pli];
+  /*These two variables decide whether we are in half- or quarter-pixel
+     precision in each component.*/
+  xprec=1+(_pli!=0&&!(_state->info.pixel_fmt&1));
+  yprec=1+(_pli!=0&&!(_state->info.pixel_fmt&2));
+  /*These two variables are either 0 if all the fractional bits are zero or -1
+     if any of them are non-zero.*/
+  xfrac=OC_SIGNMASK(-(_dx&(xprec|1)));
+  yfrac=OC_SIGNMASK(-(_dy&(yprec|1)));
+  offs=(_dx>>xprec)+(_dy>>yprec)*ystride;
+  if(xfrac||yfrac){
+    int xmask;
+    int ymask;
+    xmask=OC_SIGNMASK(_dx);
+    ymask=OC_SIGNMASK(_dy);
+    yfrac&=ystride;
+    _offsets[0]=offs-(xfrac&xmask)+(yfrac&ymask);
+    _offsets[1]=offs-(xfrac&~xmask)+(yfrac&~ymask);
+    return 2;
+  }
+  else{
+    _offsets[0]=offs;
+    return 1;
+  }
+#else
+  /*Using tables ([precision][component+31]) simplifies the code, and there's
+     enough arithmetic to hide the latencies of the memory references.*/
+  static const signed char OC_MVMAP[2][64]={
+    {
+          -15,-15,-14,-14,-13,-13,-12,-12,-11,-11,-10,-10, -9, -9, -8,
+       -8, -7, -7, -6, -6, -5, -5, -4, -4, -3, -3, -2, -2, -1, -1,  0,
+        0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,
+        8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15
+    },
+    {
+           -7, -7, -7, -7, -6, -6, -6, -6, -5, -5, -5, -5, -4, -4, -4,
+       -4, -3, -3, -3, -3, -2, -2, -2, -2, -1, -1, -1, -1,  0,  0,  0,
+        0,  0,  0,  0,  1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  3,  3,
+        4,  4,  4,  4,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7
+    }
+  };
+  static const signed char OC_MVMAP2[2][64]={
+    {
+        -1, 0,-1,  0,-1, 0,-1,  0,-1, 0,-1,  0,-1, 0,-1,
+      0,-1, 0,-1,  0,-1, 0,-1,  0,-1, 0,-1,  0,-1, 0,-1,
+      0, 1, 0, 1,  0, 1, 0, 1,  0, 1, 0, 1,  0, 1, 0, 1,
+      0, 1, 0, 1,  0, 1, 0, 1,  0, 1, 0, 1,  0, 1, 0, 1
+    },
+    {
+        -1,-1,-1,  0,-1,-1,-1,  0,-1,-1,-1,  0,-1,-1,-1,
+      0,-1,-1,-1,  0,-1,-1,-1,  0,-1,-1,-1,  0,-1,-1,-1,
+      0, 1, 1, 1,  0, 1, 1, 1,  0, 1, 1, 1,  0, 1, 1, 1,
+      0, 1, 1, 1,  0, 1, 1, 1,  0, 1, 1, 1,  0, 1, 1, 1
+    }
+  };
+  int ystride;
+  int qpx;
+  int qpy;
+  int mx;
+  int my;
+  int mx2;
+  int my2;
+  int offs;
+  ystride=_state->ref_ystride[_pli];
+  qpy=_pli!=0&&!(_state->info.pixel_fmt&2);
+  my=OC_MVMAP[qpy][_dy+31];
+  my2=OC_MVMAP2[qpy][_dy+31];
+  qpx=_pli!=0&&!(_state->info.pixel_fmt&1);
+  mx=OC_MVMAP[qpx][_dx+31];
+  mx2=OC_MVMAP2[qpx][_dx+31];
+  offs=my*ystride+mx;
+  if(mx2||my2){
+    _offsets[1]=offs+my2*ystride+mx2;
+    _offsets[0]=offs;
+    return 2;
+  }
+  _offsets[0]=offs;
+  return 1;
+#endif
+}
+
+void oc_state_frag_recon(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+  _state->opt_vtable.state_frag_recon(_state,_fragi,_pli,_dct_coeffs,
+   _last_zzi,_dc_quant);/*Calls the implementation set in opt_vtable.*/
+}
+
+/*The C implementation of fragment reconstruction: dequantizes the DC
+   coefficient, inverse transforms _dct_coeffs in place, and passes the result
+   to the intra or inter reconstruction routine for the fragment's mode.*/
+void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+  const unsigned char *ref;
+  unsigned char       *dst;
+  ptrdiff_t            frag_buf_off;
+  int                  mvoffsets[2];
+  int                  ystride;
+  int                  mb_mode;
+  /*Special case only having a DC component: every output is the same.*/
+  if(_last_zzi<2){
+    ogg_int16_t dc;
+    int         ci;
+    /*Round this dequant product (only): there is no iDCT to round it.*/
+    dc=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    /*LOOP VECTORIZES.*/
+    for(ci=0;ci<64;ci++)_dct_coeffs[ci]=dc;
+  }
+  else{
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8(_state,_dct_coeffs,_last_zzi);
+  }
+  /*Find the fragment in the frame being reconstructed.*/
+  ystride=_state->ref_ystride[_pli];
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
+  mb_mode=_state->frags[_fragi].mb_mode;
+  if(mb_mode!=OC_MODE_INTRA){
+    ref=
+     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
+     +frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
+      oc_frag_recon_inter2(_state,
+       dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,_dct_coeffs);
+    }
+    else oc_frag_recon_inter(_state,dst,ref+mvoffsets[0],ystride,_dct_coeffs);
+  }
+  else oc_frag_recon_intra(_state,dst,ystride,_dct_coeffs);
+}
+
+/*Copies the fragments specified by the lists of fragment indices from one
+   frame to another, using the implementation installed in opt_vtable.
+  _fragis:    A pointer to a list of fragment indices.
+  _nfragis:   The number of fragment indices to copy.
+  _dst_frame: The reference frame to copy to.
+  _src_frame: The reference frame to copy from.
+  _pli:       The color plane the fragments lie in.*/
+void oc_state_frag_copy_list(const oc_theora_state *_state,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
+ int _dst_frame,int _src_frame,int _pli){
+  _state->opt_vtable.state_frag_copy_list(_state,_fragis,_nfragis,_dst_frame,
+   _src_frame,_pli);
+}
+
+/*The C implementation of oc_state_frag_copy_list().
+  _dst_frame and _src_frame are looked up in ref_frame_idx to find the
+   buffers; each fragment is copied between them at the same buffer offset.*/
+void oc_state_frag_copy_list_c(const oc_theora_state *_state,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
+ int _dst_frame,int _src_frame,int _pli){
+  const unsigned char *src;
+  unsigned char       *dst;
+  ptrdiff_t            i;
+  int                  ystride;
+  dst=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
+  src=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
+  ystride=_state->ref_ystride[_pli];
+  for(i=0;i<_nfragis;i++){
+    ptrdiff_t off;
+    off=_state->frag_buf_offs[_fragis[i]];
+    oc_frag_copy(_state,dst+off,src+off,ystride);
+  }
+}
+
+/*Filters across the vertical block edge at _pix (8 rows, 2 pixels each).
+  _bv: The bounding values table, offset so that negative indices are valid.*/
+static void loop_filter_h(unsigned char *_pix,int _ystride,int *_bv){
+  int row;
+  for(row=0;row<8;row++,_pix+=_ystride){
+    int delta;
+    delta=_pix[-2]-_pix[1]+3*(_pix[0]-_pix[-1]);
+    /*The _bv array is used to compute the function
+      f=OC_CLAMPI(OC_MINI(-_2flimit-f,0),f,OC_MAXI(_2flimit-f,0));
+      where _2flimit=_state->loop_filter_limits[_state->qis[0]]<<1;*/
+    delta=_bv[delta+4>>3];
+    _pix[-1]=OC_CLAMP255(_pix[-1]+delta);
+    _pix[0]=OC_CLAMP255(_pix[0]-delta);
+  }
+}
+
+/*Filters across the horizontal block edge at _pix (8 columns, 2 pixels each).
+  _bv: The bounding values table, offset so that negative indices are valid.*/
+static void loop_filter_v(unsigned char *_pix,int _ystride,int *_bv){
+  int c;
+  for(c=0;c<8;c++){
+    int adj;
+    adj=_pix[c-2*_ystride]-_pix[c+_ystride]+3*(_pix[c]-_pix[c-_ystride]);
+    /*The _bv array computes OC_CLAMPI(OC_MINI(-L-f,0),f,OC_MAXI(L-f,0)),
+       where L=_state->loop_filter_limits[_state->qis[0]]<<1.*/
+    adj=_bv[adj+4>>3];
+    _pix[c-_ystride]=OC_CLAMP255(_pix[c-_ystride]+adj);
+    _pix[c]=OC_CLAMP255(_pix[c]-adj);
+  }
+}
+
+/*Initialize the bounding values array used by the loop filter.
+  _bv: Storage for the array; the entry for a filter value of 0 is _bv[127].
+  Return: 0 on success, or a non-zero value if no filtering need be applied.*/
+int oc_state_loop_filter_init(oc_theora_state *_state,int _bv[256]){
+  int limit;
+  int k;
+  limit=_state->loop_filter_limits[_state->qis[0]];
+  if(limit==0)return 1;
+  memset(_bv,0,sizeof(_bv[0])*256);
+  for(k=0;k<limit;k++){
+    _bv[127+k]=k;
+    _bv[127-k]=-k;
+    if(127+limit+k<256)_bv[127+limit+k]=limit-k;
+    if(127-limit-k>=0)_bv[127-limit-k]=k-limit;
+  }
+  return 0;
+}
+
+/*Apply the loop filter to a given set of fragment rows in the given plane,
+   using the implementation installed in opt_vtable.
+  The filter may run on the bottom edge, changing pixels in the next row of
+   fragments, so that row also needs to be available.
+  _bv:        The bounding values array.
+  _refi:      The index of the frame buffer to filter.
+  _pli:       The color plane to filter.
+  _fragy0:    The Y coordinate of the first fragment row to filter.
+  _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
+void oc_state_loop_filter_frag_rows(const oc_theora_state *_state,int _bv[256],
+ int _refi,int _pli,int _fragy0,int _fragy_end){
+  _state->opt_vtable.state_loop_filter_frag_rows(_state,_bv,_refi,_pli,
+   _fragy0,_fragy_end);
+}
+
+/*The C implementation of oc_state_loop_filter_frag_rows().
+  _bv: The start of the bounding values array (it is offset by 127 below).*/
+void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,int *_bv,
+ int _refi,int _pli,int _fragy0,int _fragy_end){
+  const oc_fragment_plane *fplane;
+  const oc_fragment       *frags;
+  const ptrdiff_t         *frag_buf_offs;
+  unsigned char           *ref_frame_data;
+  ptrdiff_t                fragi_top;
+  ptrdiff_t                fragi_bot;
+  ptrdiff_t                fragi0;
+  ptrdiff_t                fragi0_end;
+  int                      ystride;
+  int                      nhfrags;
+  _bv+=127;
+  fplane=_state->fplanes+_pli;
+  nhfrags=fplane->nhfrags;
+  fragi_top=fplane->froffset;
+  fragi_bot=fragi_top+fplane->nfrags;
+  fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
+  fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
+  ystride=_state->ref_ystride[_pli];
+  frags=_state->frags;
+  frag_buf_offs=_state->frag_buf_offs;
+  ref_frame_data=_state->ref_frame_data[_refi];
+  /*A block boundary is filtered if at least one fragment on it is coded.
+    The order the filters are applied in matters, and the somewhat strange
+     ordering below is the one VP3 chose; do not rearrange these loops.*/
+  while(fragi0<fragi0_end){
+    ptrdiff_t fragi;
+    ptrdiff_t fragi_end;
+    fragi=fragi0;
+    fragi_end=fragi+nhfrags;
+    while(fragi<fragi_end){
+      if(frags[fragi].coded){
+        unsigned char *ref;
+        ref=ref_frame_data+frag_buf_offs[fragi];
+        if(fragi>fragi0)loop_filter_h(ref,ystride,_bv);
+        if(fragi0>fragi_top)loop_filter_v(ref,ystride,_bv);
+        if(fragi+1<fragi_end&&!frags[fragi+1].coded){
+          loop_filter_h(ref+8,ystride,_bv);
+        }
+        if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
+          loop_filter_v(ref+(ystride<<3),ystride,_bv);
+        }
+      }
+      fragi++;
+    }
+    fragi0+=nhfrags;
+  }
+}
+
+#if defined(OC_DUMP_IMAGES)
+int oc_state_dump_frame(const oc_theora_state *_state,int _frame,
+ const char *_suf){
+  /*Dumps the frame buffer selected by ref_frame_idx[_frame] to a 16-bit RGB
+     PNG named <frame number><_suf>.png; returns 0, or TH_EFAULT on failure.*/
+  png_structp    png;
+  png_infop      info;
+  png_bytep     *image;
+  FILE          *fp;
+  char           fname[64]; /*Fits an int (<=11), 40 of _suf, ".png", NUL.*/
+  unsigned char *y_row;
+  unsigned char *u_row;
+  unsigned char *v_row;
+  unsigned char *y;
+  unsigned char *u;
+  unsigned char *v;
+  ogg_int64_t    iframe;
+  ogg_int64_t    pframe;
+  int            y_stride;
+  int            u_stride;
+  int            v_stride;
+  int            framei;
+  int            width;
+  int            height;
+  int            imgi;
+  int            imgj;
+  width=_state->info.frame_width;
+  height=_state->info.frame_height;
+  iframe=_state->granpos>>_state->info.keyframe_granule_shift;
+  pframe=_state->granpos-(iframe<<_state->info.keyframe_granule_shift);
+  sprintf(fname,"%08i%.40s.png",(int)(iframe+pframe),_suf);
+  fp=fopen(fname,"wb");
+  if(fp==NULL)return TH_EFAULT;
+  image=(png_bytep *)oc_malloc_2d(height,6*width,sizeof(**image));
+  if(image==NULL){
+    fclose(fp);
+    return TH_EFAULT;
+  }
+  png=png_create_write_struct(PNG_LIBPNG_VER_STRING,NULL,NULL,NULL);
+  if(png==NULL){
+    oc_free_2d(image);
+    fclose(fp);
+    return TH_EFAULT;
+  }
+  info=png_create_info_struct(png);
+  if(info==NULL){
+    png_destroy_write_struct(&png,NULL);
+    oc_free_2d(image);
+    fclose(fp);
+    return TH_EFAULT;
+  }
+  if(setjmp(png_jmpbuf(png))){ /*libpng errors longjmp() back to here.*/
+    png_destroy_write_struct(&png,&info);
+    oc_free_2d(image);
+    fclose(fp);
+    return TH_EFAULT;
+  }
+  framei=_state->ref_frame_idx[_frame];
+  y_row=_state->ref_frame_bufs[framei][0].data;
+  u_row=_state->ref_frame_bufs[framei][1].data;
+  v_row=_state->ref_frame_bufs[framei][2].data;
+  y_stride=_state->ref_frame_bufs[framei][0].stride;
+  u_stride=_state->ref_frame_bufs[framei][1].stride;
+  v_stride=_state->ref_frame_bufs[framei][2].stride;
+  /*Chroma up-sampling is just done with a box filter.
+    This is very likely what will actually be used in practice on a real
+     display, and also removes one more layer to search in for the source of
+     artifacts; as an added bonus, it's dead simple.*/
+  for(imgi=height;imgi-->0;){ /*Buffer rows fill image[] from last to first.*/
+    int dc;
+    y=y_row;
+    u=u_row;
+    v=v_row;
+    for(imgj=0;imgj<6*width;){ /*6 bytes per pixel: big-endian R, G, B.*/
+      float    yval;
+      float    uval;
+      float    vval;
+      unsigned rval;
+      unsigned gval;
+      unsigned bval;
+      /*This is intentionally slow and very accurate.*/
+      yval=(*y-16)*(1.0F/219);
+      uval=(*u-128)*(2*(1-0.114F)/224);
+      vval=(*v-128)*(2*(1-0.299F)/224);
+      rval=OC_CLAMPI(0,(int)(65535*(yval+vval)+0.5F),65535);
+      gval=OC_CLAMPI(0,(int)(65535*(
+       yval-uval*(0.114F/0.587F)-vval*(0.299F/0.587F))+0.5F),65535);
+      bval=OC_CLAMPI(0,(int)(65535*(yval+uval)+0.5F),65535);
+      image[imgi][imgj++]=(unsigned char)(rval>>8);
+      image[imgi][imgj++]=(unsigned char)(rval&0xFF);
+      image[imgi][imgj++]=(unsigned char)(gval>>8);
+      image[imgi][imgj++]=(unsigned char)(gval&0xFF);
+      image[imgi][imgj++]=(unsigned char)(bval>>8);
+      image[imgi][imgj++]=(unsigned char)(bval&0xFF);
+      dc=(y-y_row&1)|(_state->info.pixel_fmt&1); /*1: step chroma this pixel.*/
+      y++;
+      u+=dc;
+      v+=dc;
+    }
+    dc=-((height-1-imgi&1)|_state->info.pixel_fmt>>1); /*Mask: step chroma row.*/
+    y_row+=y_stride;
+    u_row+=dc&u_stride;
+    v_row+=dc&v_stride;
+  }
+  png_init_io(png,fp);
+  png_set_compression_level(png,Z_BEST_COMPRESSION);
+  png_set_IHDR(png,info,width,height,16,PNG_COLOR_TYPE_RGB,
+   PNG_INTERLACE_NONE,PNG_COMPRESSION_TYPE_DEFAULT,PNG_FILTER_TYPE_DEFAULT);
+  switch(_state->info.colorspace){
+    case TH_CS_ITU_REC_470M:{
+      png_set_gAMA(png,info,2.2); /*NOTE(review): file gamma (1/2.2)? Confirm.*/
+      png_set_cHRM_fixed(png,info,31006,31616,
+       67000,32000,21000,71000,14000,8000);
+    }break;
+    case TH_CS_ITU_REC_470BG:{
+      png_set_gAMA(png,info,2.67);
+      png_set_cHRM_fixed(png,info,31271,32902,
+       64000,33000,29000,60000,15000,6000);
+    }break;
+    default:break;
+  }
+  png_set_pHYs(png,info,_state->info.aspect_numerator,
+   _state->info.aspect_denominator,0);
+  png_set_rows(png,info,image);
+  png_write_png(png,info,PNG_TRANSFORM_IDENTITY,NULL);
+  png_write_end(png,info);
+  png_destroy_write_struct(&png,&info);
+  oc_free_2d(image);
+  fclose(fp);
+  return 0;
+}
+#endif
+
+
+
+ogg_int64_t th_granule_frame(void *_encdec,ogg_int64_t _granpos){
+  oc_theora_state *ctx;
+  ogg_int64_t      hi;
+  ogg_int64_t      lo;
+  /*Returns the frame index for _granpos, or -1 if _granpos is negative.*/
+  if(_granpos<0)return -1;
+  ctx=(oc_theora_state *)_encdec;
+  /*Split the position at keyframe_granule_shift into its upper part and
+     the remaining lower bits.*/
+  hi=_granpos>>ctx->info.keyframe_granule_shift;
+  lo=_granpos-(hi<<ctx->info.keyframe_granule_shift);
+  /*3.2.0 streams store the frame index in the granule position, while 3.2.1
+     and later store the frame count.
+    The index is what gets returned, so newer streams are adjusted down.*/
+  return hi+lo-TH_VERSION_CHECK(&ctx->info,3,2,1);
+}
+
+double th_granule_time(void *_encdec,ogg_int64_t _granpos){
+  oc_theora_state *ctx;
+  double           period;
+  /*Negative granule positions map to -1, as in th_granule_frame().*/
+  if(_granpos<0)return -1;
+  ctx=(oc_theora_state *)_encdec;
+  period=(double)ctx->info.fps_denominator/ctx->info.fps_numerator;
+  return (th_granule_frame(_encdec,_granpos)+1)*period; /*index+1 periods.*/
+}
diff --git a/lib/theora.exp b/lib/theora.exp
new file mode 100644
index 0000000..b4e0225
--- /dev/null
+++ b/lib/theora.exp
@@ -0,0 +1,55 @@
+# export list for libtheora
+_theora_version_string
+_theora_version_number
+_theora_encode_init
+_theora_encode_YUVin
+_theora_encode_packetout
+_theora_encode_header
+_theora_encode_comment
+_theora_encode_tables
+_theora_decode_header
+_theora_decode_init
+_theora_decode_packetin
+_theora_decode_YUVout
+_theora_control
+_theora_packet_isheader
+_theora_packet_iskeyframe
+_theora_granule_shift
+_theora_granule_frame
+_theora_granule_time
+_theora_info_init
+_theora_info_clear
+_theora_clear
+_theora_comment_init
+_theora_comment_add
+_theora_comment_add_tag
+_theora_comment_query
+_theora_comment_query_count
+_theora_comment_clear
+_th_version_string
+_th_version_number
+_th_decode_headerin
+_th_decode_alloc
+_th_setup_free
+_th_decode_ctl
+_th_decode_packetin
+_th_decode_ycbcr_out
+_th_decode_free
+_th_packet_isheader
+_th_packet_iskeyframe
+_th_granule_frame
+_th_granule_time
+_th_info_init
+_th_info_clear
+_th_comment_init
+_th_comment_add
+_th_comment_add_tag
+_th_comment_query
+_th_comment_query_count
+_th_comment_clear
+_th_encode_alloc
+_th_encode_ctl
+_th_encode_flushheader
+_th_encode_packetout
+_th_encode_ycbcr_in
+_th_encode_free
diff --git a/lib/theoradec.exp b/lib/theoradec.exp
new file mode 100644
index 0000000..41dec1c
--- /dev/null
+++ b/lib/theoradec.exp
@@ -0,0 +1,43 @@
+# export list for theoradec
+_th_version_string
+_th_version_number
+_th_decode_headerin
+_th_decode_alloc
+_th_setup_free
+_th_decode_ctl
+_th_decode_packetin
+_th_decode_ycbcr_out
+_th_decode_free
+_th_packet_isheader
+_th_packet_iskeyframe
+_th_granule_frame
+_th_granule_time
+_th_info_init
+_th_info_clear
+_th_comment_init
+_th_comment_add
+_th_comment_add_tag
+_th_comment_query
+_th_comment_query_count
+_th_comment_clear
+_theora_version_string
+_theora_version_number
+_theora_decode_header
+_theora_decode_init
+_theora_decode_packetin
+_theora_decode_YUVout
+_theora_control
+_theora_packet_isheader
+_theora_packet_iskeyframe
+_theora_granule_shift
+_theora_granule_frame
+_theora_granule_time
+_theora_info_init
+_theora_info_clear
+_theora_clear
+_theora_comment_init
+_theora_comment_add
+_theora_comment_add_tag
+_theora_comment_query
+_theora_comment_query_count
+_theora_comment_clear
diff --git a/lib/theoraenc.exp b/lib/theoraenc.exp
new file mode 100644
index 0000000..d278455
--- /dev/null
+++ b/lib/theoraenc.exp
@@ -0,0 +1,15 @@
+# export list for theoraenc
+_th_encode_alloc
+_th_encode_ctl
+_th_encode_flushheader
+_th_encode_ycbcr_in
+_th_encode_packetout
+_th_encode_free
+_TH_VP31_QUANT_INFO
+_TH_VP31_HUFF_CODES
+_theora_encode_init
+_theora_encode_YUVin
+_theora_encode_packetout
+_theora_encode_header
+_theora_encode_comment
+_theora_encode_tables
diff --git a/lib/tokenize.c b/lib/tokenize.c
new file mode 100644
index 0000000..60574c3
--- /dev/null
+++ b/lib/tokenize.c
@@ -0,0 +1,1072 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: tokenize.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+#include <stdlib.h>
+#include <string.h>
+#include "encint.h"
+
+
+
+static int oc_make_eob_token(int _run_count){
+  int cat;
+  /*Returns the token for an EOB run of _run_count blocks, without computing
+     its extra bits; runs of fewer than 4 each have a token of their own.*/
+  if(_run_count<4)return OC_DCT_EOB1_TOKEN+_run_count-1;
+  /*Longer runs share the repeat tokens, whose category is capped at 3.*/
+  cat=OC_ILOGNZ_32(_run_count)-3;
+  return OC_DCT_REPEAT_RUN0_TOKEN+OC_MINI(cat,3);
+}
+
+static int oc_make_eob_token_full(int _run_count,int *_eb){
+  int cat;
+  /*Returns the token for an EOB run of _run_count blocks and stores the
+     matching extra bits in *_eb (0 for runs of fewer than 4).*/
+  if(_run_count>=4){
+    cat=OC_ILOGNZ_32(_run_count)-3;
+    cat=OC_MINI(cat,3);
+    *_eb=_run_count-OC_BYTE_TABLE32(4,8,16,0,cat); /*Less the category base.*/
+    return OC_DCT_REPEAT_RUN0_TOKEN+cat;
+  }
+  *_eb=0;
+  return OC_DCT_EOB1_TOKEN+_run_count-1;
+}
+
+/*Gives the count of blocks an EOB token ends (bases 1,2,3,4,8,16,0 + _eb).*/
+static int oc_decode_eob_token(int _token,int _eb){
+  return _eb+((0x20820C41U>>5*_token)&0x1F); /*5-bit base per token, 0...6.*/
+}
+
+/*TODO: This is now only used during DCT tokenization, and never for runs; it
+   should be simplified.*/
+static int oc_make_dct_token_full(int _zzi,int _zzj,int _val,int *_eb){
+  int sign;
+  int mag;
+  int run;
+  /*Returns the token for value _val at zig-zag index _zzj when coding resumes
+     from index _zzi, i.e., after _zzj-_zzi zeros, and stores the extra bits
+     that accompany the token in *_eb.*/
+  sign=_val<0;
+  mag=abs(_val);
+  run=_zzj-_zzi;
+  if(run>0){
+    int adj;
+    /*Implement a minor restriction on stack 1 so that we know during DC
+       fixups that extending a dctrun token from stack 1 will never overflow.*/
+    adj=_zzi!=1;
+    if(mag<2&&run<17+adj){
+      /*A combined run/value token for a magnitude below 2.*/
+      if(run<6){
+        /*Runs of 1...5 each have their own token; only the sign is extra.*/
+        *_eb=sign;
+        return OC_DCT_RUN_CAT1A+run-1;
+      }
+      if(run<10){
+        *_eb=run-6+(sign<<2);
+        return OC_DCT_RUN_CAT1B;
+      }
+      *_eb=run-10+(sign<<3);
+      return OC_DCT_RUN_CAT1C;
+    }
+    if(mag<4&&run<3+adj){
+      /*A combined run/value token for a short run and a magnitude below 4.*/
+      if(run<2){
+        *_eb=mag-2+(sign<<1);
+        return OC_DCT_RUN_CAT2A;
+      }
+      *_eb=run-2+(mag-2<<1)+(sign<<2);
+      return OC_DCT_RUN_CAT2B;
+    }
+    /*No combined token applies: emit a token for the zero run alone.*/
+    *_eb=run-1;
+    return run<9?OC_DCT_SHORT_ZRL_TOKEN:OC_DCT_ZRL_TOKEN;
+  }
+  /*No zeros are skipped: pick the value token by magnitude range.
+    The sign is stored above the magnitude offset in the extra bits.*/
+  if(mag<3){
+    /*The token itself encodes both magnitude and sign, so no extra bits.*/
+    *_eb=0;
+    return OC_ONE_TOKEN+(mag-1<<1)+sign;
+  }
+  if(mag<7){
+    /*Magnitudes 3...6 have one token each, with the sign as the extra bit.*/
+    *_eb=sign;
+    return OC_DCT_VAL_CAT2+mag-3;
+  }
+  if(mag<9){
+    *_eb=mag-7+(sign<<1);
+    return OC_DCT_VAL_CAT3;
+  }
+  if(mag<13){
+    *_eb=mag-9+(sign<<2);
+    return OC_DCT_VAL_CAT4;
+  }
+  if(mag<21){
+    *_eb=mag-13+(sign<<3);
+    return OC_DCT_VAL_CAT5;
+  }
+  if(mag<37){
+    *_eb=mag-21+(sign<<4);
+    return OC_DCT_VAL_CAT6;
+  }
+  if(mag<69){
+    *_eb=mag-37+(sign<<5);
+    return OC_DCT_VAL_CAT7;
+  }
+  /*The last category takes every remaining magnitude.*/
+  *_eb=mag-69+(sign<<9);
+  return OC_DCT_VAL_CAT8;
+}
+
+/*Token logging, to allow efficient rollback of a few fragments' tokens.
+  Late SKIP analysis is tied up in the tokenization process, so we need to be
+   able to undo a fragment's tokens on a whim.*/
+
+/*The offset added to a Huffman table index for each zig-zag index: 0 for
+   index 0, then 16, 32, 48, and 64 for indices 1...5, 6...14, 15...27, and
+   28...63; all 64 entries must be listed, or the tail would default to 0.*/
+static const unsigned char OC_ZZI_HUFF_OFFSET[64]={
+   0,16,16,16,16,16,32,32,32,32,32,32,32,32,32,48,
+  48,48,48,48,48,48,48,48,48,48,48,48,64,64,64,64,
+  64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64
+};
+
+static int oc_token_bits(oc_enc_ctx *_enc,int _huffi,int _zzi,int _token){
+  int hti=_huffi+OC_ZZI_HUFF_OFFSET[_zzi]; /*Cost=code length+extra bits.*/
+  return OC_DCT_TOKEN_EXTRA_BITS[_token]+_enc->huff_codes[hti][_token].nbits;
+}
+
+static void oc_enc_tokenlog_checkpoint(oc_enc_ctx *_enc,
+ oc_token_checkpoint *_cp,int _pli,int _zzi){
+  _cp->ndct_tokens=_enc->ndct_tokens[_pli][_zzi]; /*Tokens logged so far.*/
+  _cp->eob_run=_enc->eob_run[_pli][_zzi]; /*EOB run pending on the stack.*/
+  _cp->zzi=_zzi; /*With pli, identifies the stack to restore on rollback.*/
+  _cp->pli=_pli;
+}
+
+void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc,
+ const oc_token_checkpoint *_stack,int _n){
+  int cpi;
+  /*Undo the _n checkpoints in _stack from last to first, so that a token
+     stack saved more than once ends up in its earliest saved state.*/
+  for(cpi=_n-1;cpi>=0;cpi--){
+    const oc_token_checkpoint *cp;
+    cp=_stack+cpi;
+    _enc->ndct_tokens[cp->pli][cp->zzi]=cp->ndct_tokens;
+    _enc->eob_run[cp->pli][cp->zzi]=cp->eob_run;
+  }
+}
+
+static void oc_enc_token_log(oc_enc_ctx *_enc,
+ int _pli,int _zzi,int _token,int _eb){
+  ptrdiff_t slot;
+  slot=_enc->ndct_tokens[_pli][_zzi]++; /*Claim the stack's next free entry.*/
+  _enc->extra_bits[_pli][_zzi][slot]=(ogg_uint16_t)_eb;
+  _enc->dct_tokens[_pli][_zzi][slot]=(unsigned char)_token;
+}
+
+static void oc_enc_eob_log(oc_enc_ctx *_enc,
+ int _pli,int _zzi,int _run_count){
+  int extra; /*Extra bits that accompany the EOB token.*/
+  int tok;
+  tok=oc_make_eob_token_full(_run_count,&extra); /*Code the whole run...*/
+  oc_enc_token_log(_enc,_pli,_zzi,tok,extra); /*...and append it to the log.*/
+}
+
+
+void oc_enc_tokenize_start(oc_enc_ctx *_enc){ /*Zeroes tokenization state.*/
+  memset(_enc->dc_pred_last,0,sizeof(_enc->dc_pred_last));
+  memset(_enc->dct_token_offs,0,sizeof(_enc->dct_token_offs));
+  memset(_enc->eob_run,0,sizeof(_enc->eob_run)); /*No EOB runs pending.*/
+  memset(_enc->ndct_tokens,0,sizeof(_enc->ndct_tokens)); /*Empty stacks.*/
+}
+
+typedef struct oc_quant_token oc_quant_token;
+
+/*A single node in the Viterbi trellis.
+  We maintain up to 2 of these per coefficient:
+    - A token to code if the value is zero (EOB, zero run, or combo token).
+    - A token to code if the value is not zero (DCT value token).*/
+struct oc_quant_token{
+  unsigned char next; /*Following node: (zig-zag index<<1)+node slot; 0: none.*/
+  signed char   token; /*The token to emit at this node.*/
+  ogg_int16_t   eb; /*The extra bits that go with the token.*/
+  ogg_uint32_t  cost; /*Distortion+lambda*bits from here to the block's end.*/
+  int           bits; /*Bits spent from this node to the end of the block.*/
+  int           qc; /*Signed quantized value this token codes (0 if none).*/
+};
+
+/*Tokenizes the AC coefficients in _qdct (zig-zag order), possibly adjusting
+   the quantization, then stores the dequantized, de-zig-zagged result there.
+  The DC coefficient is not preserved; it should be restored by the caller.*/
+int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
+ ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
+ int _zzi,oc_token_checkpoint **_stack,int _acmin){
+  oc_token_checkpoint *stack; /*Write cursor into *_stack; stored back at exit.*/
+  ogg_int64_t          zflags; /*Bit zzi set: tokens[zzi][0] has been filled.*/
+  ogg_int64_t          nzflags; /*Bit zzi set: tokens[zzi][1] has been filled.*/
+  ogg_int64_t          best_flags; /*Bit zzi set: node [1] is preferred there.*/
+  ogg_uint32_t         d2_accum[64]; /*Distortion summed along a zero run.*/
+  oc_quant_token       tokens[64][2]; /*Trellis: [zzi][0] zero, [1] nonzero.*/
+  ogg_uint16_t        *eob_run; /*This plane's pending EOB runs, by index.*/
+  const unsigned char *dct_fzig_zag;
+  ogg_uint32_t         cost;
+  int                  bits;
+  int                  eob;
+  int                  token;
+  int                  eb;
+  int                  next;
+  int                  huffi;
+  int                  zzi;
+  int                  ti;
+  int                  zzj;
+  int                  qc;
+  huffi=_enc->huff_idxs[_enc->state.frame_type][1][_pli+1>>1];
+  eob_run=_enc->eob_run[_pli];
+  memset(tokens[0],0,sizeof(tokens[0])); /*Index 0 acts as the end of block.*/
+  best_flags=nzflags=0;
+  zflags=1;
+  d2_accum[0]=0;
+  zzj=64;
+  for(zzi=OC_MINI(_zzi,63);zzi>0;zzi--){ /*Build the trellis back to front.*/
+    ogg_int32_t  lambda;
+    ogg_uint32_t best_cost;
+    int          best_bits=best_bits; /*Self-init: presumably quiets warnings.*/
+    int          best_next=best_next;
+    int          best_token=best_token;
+    int          best_eb=best_eb;
+    int          best_qc=best_qc;
+    int          flush_bits; /*Cost of first flushing the pending EOB run.*/
+    ogg_uint32_t d2;
+    int          dq;
+    int          e;
+    int          c;
+    int          s;
+    int          tj;
+    lambda=_enc->lambda;
+    qc=_qdct[zzi];
+    s=-(qc<0); /*-1 for a negative value, otherwise 0.*/
+    qc=qc+s^s; /*qc=abs(qc).*/
+    c=_dct[OC_FZIG_ZAG[zzi]];
+    if(qc<=1){
+      ogg_uint32_t sum_d2;
+      int          nzeros;
+      int          dc_reserve;
+      /*The hard case: try a zero run.*/
+      if(!qc){
+        /*Skip runs that are already quantized to zeros.
+          If we considered each zero coefficient in turn, we might
+           theoretically find a better way to partition long zero runs (e.g.,
+           a run of > 17 zeros followed by a 1 might be better coded as a short
+           zero run followed by a combo token, rather than the longer zero
+           token followed by a 1 value token), but zeros are so common that
+           this becomes very computationally expensive (quadratic instead of
+           linear in the number of coefficients), for a marginal gain.*/
+        while(zzi>1&&!_qdct[zzi-1])zzi--;
+        /*The distortion of coefficients originally quantized to zero is
+           treated as zero (since we'll never quantize them to anything else).*/
+        d2=0;
+      }
+      else{
+        c=c+s^s; /*Negate c when the quantized value was negative.*/
+        d2=c*(ogg_int32_t)c;
+      }
+      eob=eob_run[zzi];
+      nzeros=zzj-zzi;
+      zzj&=63;
+      sum_d2=d2+d2_accum[zzj];
+      d2_accum[zzi]=sum_d2;
+      flush_bits=eob>0?oc_token_bits(_enc,huffi,zzi,oc_make_eob_token(eob)):0;
+      /*We reserve 1 spot for combo run tokens that start in the 1st AC stack
+         to ensure they can be extended to include the DC coefficient if
+         necessary; this greatly simplifies stack-rewriting later on.*/
+      dc_reserve=zzi+62>>6; /*0 when zzi==1, otherwise 1.*/
+      best_cost=0xFFFFFFFF;
+      for(;;){
+        if(nzflags>>zzj&1){
+          int cat;
+          int val;
+          int val_s;
+          int zzk;
+          int tk;
+          next=tokens[zzj][1].next;
+          tk=next&1;
+          zzk=next>>1;
+          /*Try a pure zero run to this point.*/
+          cat=nzeros+55>>6; /*1 when nzeros>=9, otherwise 0.*/
+          token=OC_DCT_SHORT_ZRL_TOKEN+cat;
+          bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
+          d2=sum_d2-d2_accum[zzj];
+          cost=d2+lambda*bits+tokens[zzj][1].cost;
+          if(cost<=best_cost){
+            best_next=(zzj<<1)+1;
+            best_token=token;
+            best_eb=nzeros-1;
+            best_cost=cost;
+            best_bits=bits+tokens[zzj][1].bits;
+            best_qc=0;
+          }
+          if(nzeros<16+dc_reserve){
+            val=_qdct[zzj];
+            val_s=-(val<0);
+            val=val+val_s^val_s; /*val=abs(val).*/
+            if(val<=2){
+              /*Try a +/- 1 combo token.*/
+              if(nzeros<6){
+                token=OC_DCT_RUN_CAT1A+nzeros-1;
+                eb=-val_s;
+              }
+              else{
+                cat=nzeros+54>>6; /*1 when nzeros>=10, otherwise 0.*/
+                token=OC_DCT_RUN_CAT1B+cat;
+                eb=(-val_s<<cat+2)+nzeros-6-(cat<<2);
+              }
+              e=(_dct[OC_FZIG_ZAG[zzj]]+val_s^val_s)-_dequant[zzj];
+              d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
+              bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
+              cost=d2+lambda*bits+tokens[zzk][tk].cost;
+              if(cost<=best_cost){
+                best_next=next;
+                best_token=token;
+                best_eb=eb;
+                best_cost=cost;
+                best_bits=bits+tokens[zzk][tk].bits;
+                best_qc=1+val_s^val_s;
+              }
+            }
+            if(nzeros<2+dc_reserve&&2<=val&&val<=4){
+              /*Try a +/- 2/3 combo token.*/
+              cat=nzeros>>1;
+              token=OC_DCT_RUN_CAT2A+cat;
+              bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
+              val=2+((val+val_s^val_s)>2);
+              e=(_dct[OC_FZIG_ZAG[zzj]]+val_s^val_s)-_dequant[zzj]*val;
+              d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
+              cost=d2+lambda*bits+tokens[zzk][tk].cost;
+              if(cost<=best_cost){
+                best_cost=cost;
+                best_bits=bits+tokens[zzk][tk].bits;
+                best_next=next;
+                best_token=token;
+                best_eb=(-val_s<<1+cat)+(val-2<<cat)+(nzeros-1>>1);
+                best_qc=val+val_s^val_s;
+              }
+            }
+          }
+          /*zzj can't be coded as a zero, so stop trying to extend the run.*/
+          if(!(zflags>>zzj&1))break;
+        }
+        /*We could try to consider _all_ potentially non-zero coefficients, but
+           if we already found a bunch of them not worth coding, it's fairly
+           unlikely they would now be worth coding from this position; skipping
+           them saves a lot of work.*/
+        zzj=(tokens[zzj][0].next>>1)-(tokens[zzj][0].qc!=0)&63;
+        if(zzj==0){
+          /*We made it all the way to the end of the block; try an EOB token.*/
+          if(eob<4095){
+            bits=oc_token_bits(_enc,huffi,zzi,oc_make_eob_token(eob+1))
+             -flush_bits;
+          }
+          else bits=oc_token_bits(_enc,huffi,zzi,OC_DCT_EOB1_TOKEN);
+          cost=sum_d2+bits*lambda;
+          /*If the best route so far is still a pure zero run to the end of the
+             block, force coding it as an EOB.
+            Even if it's not optimal for this block, it has a good chance of
+             getting combined with an EOB token from subsequent blocks, saving
+             bits overall.*/
+          if(cost<=best_cost||best_token<=OC_DCT_ZRL_TOKEN&&zzi+best_eb==63){
+            best_next=0;
+            /*This token is just a marker; in reality we may not emit any
+               tokens, but update eob_run[] instead.*/
+            best_token=OC_DCT_EOB1_TOKEN;
+            best_eb=0;
+            best_cost=cost;
+            best_bits=bits;
+            best_qc=0;
+          }
+          break;
+        }
+        nzeros=zzj-zzi;
+      }
+      tokens[zzi][0].next=(unsigned char)best_next;
+      tokens[zzi][0].token=(signed char)best_token;
+      tokens[zzi][0].eb=(ogg_int16_t)best_eb;
+      tokens[zzi][0].cost=best_cost;
+      tokens[zzi][0].bits=best_bits;
+      tokens[zzi][0].qc=best_qc;
+      zflags|=(ogg_int64_t)1<<zzi;
+      if(qc){
+        /*The value was +/-1: also build the node that keeps it non-zero.*/
+        dq=_dequant[zzi];
+        if(zzi<_acmin)lambda=0; /*Below _acmin the bit cost is not weighed.*/
+        e=dq-c;
+        d2=e*(ogg_int32_t)e;
+        token=OC_ONE_TOKEN-s;
+        bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
+        zzj=zzi+1&63;
+        tj=best_flags>>zzj&1;
+        next=(zzj<<1)+tj;
+        tokens[zzi][1].next=(unsigned char)next;
+        tokens[zzi][1].token=(signed char)token;
+        tokens[zzi][1].eb=0;
+        tokens[zzi][1].cost=d2+lambda*bits+tokens[zzj][tj].cost;
+        tokens[zzi][1].bits=bits+tokens[zzj][tj].bits;
+        tokens[zzi][1].qc=1+s^s;
+        nzflags|=(ogg_int64_t)1<<zzi;
+        best_flags|=
+         (ogg_int64_t)(tokens[zzi][1].cost<tokens[zzi][0].cost)<<zzi;
+      }
+    }
+    else{
+      /*Magnitude of at least 2: only a value token is considered, comparing
+         the current magnitude against a smaller alternative in each range.*/
+      eob=eob_run[zzi];
+      if(zzi<_acmin)lambda=0; /*Below _acmin the bit cost is not weighed.*/
+      c=c+s^s; /*Negate c when the quantized value was negative.*/
+      dq=_dequant[zzi];
+      /*No zero run can extend past this point.*/
+      d2_accum[zzi]=0;
+      flush_bits=eob>0?oc_token_bits(_enc,huffi,zzi,oc_make_eob_token(eob)):0;
+      if(qc<=2){
+        e=2*dq-c;
+        d2=e*(ogg_int32_t)e;
+        best_token=OC_TWO_TOKEN-s;
+        best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
+        best_cost=d2+lambda*best_bits;
+        e-=dq;
+        d2=e*(ogg_int32_t)e;
+        token=OC_ONE_TOKEN-s;
+        bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
+        cost=d2+lambda*bits;
+        if(cost<=best_cost){
+          best_token=token;
+          best_bits=bits;
+          best_cost=cost;
+          qc--;
+        }
+        best_eb=0;
+      }
+      else if(qc<=3){
+        e=3*dq-c;
+        d2=e*(ogg_int32_t)e;
+        best_token=OC_DCT_VAL_CAT2;
+        best_eb=-s;
+        best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
+        best_cost=d2+lambda*best_bits;
+        e-=dq;
+        d2=e*(ogg_int32_t)e;
+        token=OC_TWO_TOKEN-s;
+        bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
+        cost=d2+lambda*bits;
+        if(cost<=best_cost){
+          best_token=token;
+          best_eb=0;
+          best_bits=bits;
+          best_cost=cost;
+          qc--;
+        }
+      }
+      else if(qc<=6){
+        e=qc*dq-c;
+        d2=e*(ogg_int32_t)e;
+        best_token=OC_DCT_VAL_CAT2+qc-3;
+        best_eb=-s;
+        best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
+        best_cost=d2+lambda*best_bits;
+        e-=dq;
+        d2=e*(ogg_int32_t)e;
+        token=best_token-1;
+        bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
+        cost=d2+lambda*bits;
+        if(cost<=best_cost){
+          best_token=token;
+          best_bits=bits;
+          best_cost=cost;
+          qc--;
+        }
+      }
+      else if(qc<=8){
+        e=qc*dq-c;
+        d2=e*(ogg_int32_t)e;
+        best_token=OC_DCT_VAL_CAT3;
+        best_eb=(-s<<1)+qc-7;
+        best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
+        best_cost=d2+lambda*best_bits;
+        e=6*dq-c;
+        d2=e*(ogg_int32_t)e;
+        token=OC_DCT_VAL_CAT2+3;
+        bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
+        cost=d2+lambda*bits;
+        if(cost<=best_cost){
+          best_token=token;
+          best_eb=-s;
+          best_bits=bits;
+          best_cost=cost;
+          qc=6;
+        }
+      }
+      else if(qc<=12){
+        e=qc*dq-c;
+        d2=e*(ogg_int32_t)e;
+        best_token=OC_DCT_VAL_CAT4;
+        best_eb=(-s<<2)+qc-9;
+        best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
+        best_cost=d2+lambda*best_bits;
+        e=8*dq-c;
+        d2=e*(ogg_int32_t)e;
+        token=best_token-1;
+        bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
+        cost=d2+lambda*bits;
+        if(cost<=best_cost){
+          best_token=token;
+          best_eb=(-s<<1)+1;
+          best_bits=bits;
+          best_cost=cost;
+          qc=8;
+        }
+      }
+      else if(qc<=20){
+        e=qc*dq-c;
+        d2=e*(ogg_int32_t)e;
+        best_token=OC_DCT_VAL_CAT5;
+        best_eb=(-s<<3)+qc-13;
+        best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
+        best_cost=d2+lambda*best_bits;
+        e=12*dq-c;
+        d2=e*(ogg_int32_t)e;
+        token=best_token-1;
+        bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
+        cost=d2+lambda*bits;
+        if(cost<=best_cost){
+          best_token=token;
+          best_eb=(-s<<2)+3;
+          best_bits=bits;
+          best_cost=cost;
+          qc=12;
+        }
+      }
+      else if(qc<=36){
+        e=qc*dq-c;
+        d2=e*(ogg_int32_t)e;
+        best_token=OC_DCT_VAL_CAT6;
+        best_eb=(-s<<4)+qc-21;
+        best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
+        best_cost=d2+lambda*best_bits;
+        e=20*dq-c;
+        d2=e*(ogg_int32_t)e;
+        token=best_token-1;
+        bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
+        cost=d2+lambda*bits;
+        if(cost<=best_cost){
+          best_token=token;
+          best_eb=(-s<<3)+7;
+          best_bits=bits;
+          best_cost=cost;
+          qc=20;
+        }
+      }
+      else if(qc<=68){
+        e=qc*dq-c;
+        d2=e*(ogg_int32_t)e;
+        best_token=OC_DCT_VAL_CAT7;
+        best_eb=(-s<<5)+qc-37;
+        best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
+        best_cost=d2+lambda*best_bits;
+        e=36*dq-c;
+        d2=e*(ogg_int32_t)e;
+        token=best_token-1;
+        bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
+        cost=d2+lambda*bits;
+        if(cost<best_cost){ /*NOTE(review): strict <, unlike the <= above.*/
+          best_token=token;
+          best_eb=(-s<<4)+15;
+          best_bits=bits;
+          best_cost=cost;
+          qc=36;
+        }
+      }
+      else{
+        e=qc*dq-c;
+        d2=e*(ogg_int32_t)e;
+        best_token=OC_DCT_VAL_CAT8;
+        best_eb=(-s<<9)+qc-69;
+        best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
+        best_cost=d2+lambda*best_bits;
+        e=68*dq-c;
+        d2=e*(ogg_int32_t)e;
+        token=best_token-1;
+        bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
+        cost=d2+lambda*bits;
+        if(cost<best_cost){
+          best_token=token;
+          best_eb=(-s<<5)+31;
+          best_bits=bits;
+          best_cost=cost;
+          qc=68;
+        }
+      }
+      zzj=zzi+1&63;
+      tj=best_flags>>zzj&1;
+      next=(zzj<<1)+tj;
+      tokens[zzi][1].next=(unsigned char)next;
+      tokens[zzi][1].token=(signed char)best_token;
+      tokens[zzi][1].eb=best_eb;
+      tokens[zzi][1].cost=best_cost+tokens[zzj][tj].cost;
+      tokens[zzi][1].bits=best_bits+tokens[zzj][tj].bits;
+      tokens[zzi][1].qc=qc+s^s; /*Put the sign back on the chosen magnitude.*/
+      nzflags|=(ogg_int64_t)1<<zzi;
+      best_flags|=(ogg_int64_t)1<<zzi;
+    }
+    zzj=zzi;
+  }
+  /*Emit the tokens from the best path through the trellis.*/
+  stack=*_stack;
+  /*We blow away the first entry here so that things vectorize better.
+    The DC coefficient is not actually stored in the array yet.*/
+  for(zzi=0;zzi<64;zzi++)_qdct[zzi]=0;
+  dct_fzig_zag=_enc->state.opt_data.dct_fzig_zag;
+  zzi=1;
+  ti=best_flags>>1&1;
+  bits=tokens[zzi][ti].bits; /*Bit total of the whole best path.*/
+  do{
+    oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzi); /*Save for rollback.*/
+    eob=eob_run[zzi];
+    if(tokens[zzi][ti].token<OC_NDCT_EOB_TOKEN_MAX){
+      if(++eob>=4095){ /*A run this long is logged at once and restarted.*/
+        oc_enc_eob_log(_enc,_pli,zzi,eob);
+        eob=0;
+      }
+      eob_run[zzi]=eob;
+      /*We don't include the actual EOB cost for this block in the return value.
+        It will be paid for by the fragment that terminates the EOB run.*/
+      bits-=tokens[zzi][ti].bits;
+      zzi=_zzi;
+      break;
+    }
+    /*Emit pending EOB run if any.*/
+    if(eob>0){
+      oc_enc_eob_log(_enc,_pli,zzi,eob);
+      eob_run[zzi]=0;
+    }
+    oc_enc_token_log(_enc,_pli,zzi,tokens[zzi][ti].token,tokens[zzi][ti].eb);
+    next=tokens[zzi][ti].next;
+    qc=tokens[zzi][ti].qc;
+    zzj=(next>>1)-1&63; /*The index just before the next node.*/
+    /*TODO: It may be worth saving the dequantized coefficient in the trellis
+       above; we had to compute it to measure the error anyway.*/
+    _qdct[dct_fzig_zag[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
+    zzi=next>>1;
+    ti=next&1;
+  }
+  while(zzi);
+  *_stack=stack;
+  return bits; /*Bits for the tokens chosen, less any final EOB.*/
+}
+
+void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
+ int _pli,int _fragy0,int _frag_yend){
+  /*Stores in frag_dc[] the DC prediction residual of each coded fragment in
+     rows _fragy0 (inclusive) to _frag_yend (exclusive) of plane _pli, and
+     tracks the last DC value per reference frame in dc_pred_last[_pli].*/
+  const oc_fragment_plane *fplane;
+  const oc_fragment       *frags;
+  ogg_int16_t             *frag_dc;
+  ptrdiff_t                fragi;
+  int                     *pred_last;
+  int                      nhfrags;
+  int                      fragx;
+  int                      fragy;
+  fplane=_enc->state.fplanes+_pli;
+  frags=_enc->state.frags;
+  frag_dc=_enc->frag_dc;
+  pred_last=_enc->dc_pred_last[_pli];
+  nhfrags=fplane->nhfrags;
+  fragi=fplane->froffset+_fragy0*(ptrdiff_t)nhfrags; /*Widen before multiply.*/
+  for(fragy=_fragy0;fragy<_frag_yend;fragy++){
+    if(fragy==0){
+      /*For the first row, all of the cases reduce to just using the previous
+         predictor for the same reference frame.*/
+      for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+        if(frags[fragi].coded){
+          int ref;
+          ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+          frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred_last[ref]);
+          pred_last[ref]=frags[fragi].dc;
+        }
+      }
+    }
+    else{
+      const oc_fragment *u_frags;
+      int                l_ref;
+      int                ul_ref;
+      int                u_ref;
+      u_frags=frags-nhfrags; /*u_frags[i] is the fragment one row before i.*/
+      l_ref=-1; /*-1 marks a neighbor that is absent or not coded.*/
+      ul_ref=-1;
+      u_ref=u_frags[fragi].coded?OC_FRAME_FOR_MODE(u_frags[fragi].mb_mode):-1;
+      for(fragx=0;fragx<nhfrags;fragx++,fragi++){
+        int ur_ref;
+        if(fragx+1>=nhfrags)ur_ref=-1;
+        else{
+          ur_ref=u_frags[fragi+1].coded?
+           OC_FRAME_FOR_MODE(u_frags[fragi+1].mb_mode):-1;
+        }
+        if(frags[fragi].coded){
+          int pred;
+          int ref;
+          ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
+          /*We break out a separate case based on which of our neighbors use
+             the same reference frames.
+            This is somewhat faster than trying to make a generic case which
+             handles all of them, since it reduces lots of poorly predicted
+             jumps to one switch statement, and also lets a number of the
+             multiplications be optimized out by strength reduction.*/
+          switch((l_ref==ref)|(ul_ref==ref)<<1|
+           (u_ref==ref)<<2|(ur_ref==ref)<<3){
+            default:pred=pred_last[ref];break;
+            case  1:
+            case  3:pred=frags[fragi-1].dc;break;
+            case  2:pred=u_frags[fragi-1].dc;break;
+            case  4:
+            case  6:
+            case 12:pred=u_frags[fragi].dc;break;
+            case  5:pred=(frags[fragi-1].dc+u_frags[fragi].dc)/2;break;
+            case  8:pred=u_frags[fragi+1].dc;break;
+            case  9:
+            case 11:
+            case 13:{
+              pred=(75*frags[fragi-1].dc+53*u_frags[fragi+1].dc)/128;
+            }break;
+            case 10:pred=(u_frags[fragi-1].dc+u_frags[fragi+1].dc)/2;break;
+            case 14:{
+              pred=(3*(u_frags[fragi-1].dc+u_frags[fragi+1].dc)
+               +10*u_frags[fragi].dc)/16;
+            }break;
+            case  7:
+            case 15:{
+              int p0=frags[fragi-1].dc;
+              int p1=u_frags[fragi-1].dc;
+              int p2=u_frags[fragi].dc;
+              pred=(29*(p0+p2)-26*p1)/32;
+              if(abs(pred-p2)>128)pred=p2; /*Fall back if far from a neighbor.*/
+              else if(abs(pred-p0)>128)pred=p0;
+              else if(abs(pred-p1)>128)pred=p1;
+            }break;
+          }
+          frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred);
+          pred_last[ref]=frags[fragi].dc;
+          l_ref=ref;
+        }
+        else l_ref=-1;
+        ul_ref=u_ref; /*Slide the neighbor window one fragment to the right.*/
+        u_ref=ur_ref;
+      }
+    }
+  }
+}
+
+/*Codes the DC tokens (stack 0) of plane _pli for the listed coded fragments,
+   regenerating the 1st AC token list (stack 1) in place from index
+   _prev_ndct_tokens1; _prev_eob_run1 is the part of its EOB run already used.*/
+void oc_enc_tokenize_dc_frag_list(oc_enc_ctx *_enc,int _pli,
+ const ptrdiff_t *_coded_fragis,ptrdiff_t _ncoded_fragis,
+ int _prev_ndct_tokens1,int _prev_eob_run1){
+  const ogg_int16_t *frag_dc;
+  ptrdiff_t          fragii;
+  unsigned char     *dct_tokens0;
+  unsigned char     *dct_tokens1;
+  ogg_uint16_t      *extra_bits0;
+  ogg_uint16_t      *extra_bits1;
+  ptrdiff_t          ti0;
+  ptrdiff_t          ti1r;
+  ptrdiff_t          ti1w;
+  int                eob_run0;
+  int                eob_run1;
+  int                neobs1;
+  int                token;
+  int                eb;
+  int                token1=0;
+  int                eb1=0;
+  /*Return immediately if there are no coded fragments; otherwise we'd flush
+     any trailing EOB run into the AC 1 list and never read it back out.*/
+  if(_ncoded_fragis<=0)return;
+  frag_dc=_enc->frag_dc;
+  dct_tokens0=_enc->dct_tokens[_pli][0];
+  dct_tokens1=_enc->dct_tokens[_pli][1];
+  extra_bits0=_enc->extra_bits[_pli][0];
+  extra_bits1=_enc->extra_bits[_pli][1];
+  ti0=_enc->ndct_tokens[_pli][0];
+  ti1w=ti1r=_prev_ndct_tokens1;
+  eob_run0=_enc->eob_run[_pli][0];
+  /*Flush stack 1's trailing EOB run so its tokens can be tracked to the end.*/
+  eob_run1=_enc->eob_run[_pli][1];
+  if(eob_run1>0)oc_enc_eob_log(_enc,_pli,1,eob_run1);
+  /*If an EOB run was active at the start of stack 1, read and decode it.*/
+  if(_prev_eob_run1>0){
+    token1=dct_tokens1[ti1r];
+    eb1=extra_bits1[ti1r];
+    ti1r++;
+    eob_run1=oc_decode_eob_token(token1,eb1);
+    /*Consume the portion of the run that came before these fragments.*/
+    neobs1=eob_run1-_prev_eob_run1;
+  }
+  else eob_run1=neobs1=0;
+  for(fragii=0;fragii<_ncoded_fragis;fragii++){
+    int val;
+    /*All tokens in the 1st AC coefficient stack are regenerated as the DC
+       coefficients are produced.
+      This can be done in-place; stack 1 cannot get larger.*/
+    if(!neobs1){
+      /*There's no active EOB run in stack 1; read the next token.*/
+      token1=dct_tokens1[ti1r];
+      eb1=extra_bits1[ti1r];
+      ti1r++;
+      if(token1<OC_NDCT_EOB_TOKEN_MAX){
+        neobs1=oc_decode_eob_token(token1,eb1);
+        /*It's an EOB run; add it to the current (inactive) one.
+          Because we may have moved entries to stack 0, we may have an
+           opportunity to merge two EOB runs in stack 1.*/
+        eob_run1+=neobs1;
+      }
+    }
+    val=frag_dc[_coded_fragis[fragii]];
+    if(val){
+      /*There was a non-zero DC value, so there's no alteration to stack 1
+         for this fragment; just code the stack 0 token.*/
+      /*Flush any pending EOB run.*/
+      if(eob_run0>0){
+        token=oc_make_eob_token_full(eob_run0,&eb);
+        dct_tokens0[ti0]=(unsigned char)token;
+        extra_bits0[ti0]=(ogg_uint16_t)eb;
+        ti0++;
+        eob_run0=0;
+      }
+      token=oc_make_dct_token_full(0,0,val,&eb);
+      dct_tokens0[ti0]=(unsigned char)token;
+      extra_bits0[ti0]=(ogg_uint16_t)eb;
+      ti0++;
+    }
+    else{
+      /*Zero DC value; that means the entry in stack 1 might need to be coded
+         from stack 0.
+        This requires a stack 1 fixup.*/
+      if(neobs1>0){
+        /*We're in the middle of an active EOB run in stack 1.
+          Move it to stack 0.*/
+        if(++eob_run0>=4095){
+          token=oc_make_eob_token_full(eob_run0,&eb);
+          dct_tokens0[ti0]=(unsigned char)token;
+          extra_bits0[ti0]=(ogg_uint16_t)eb;
+          ti0++;
+          eob_run0=0;
+        }
+        eob_run1--;
+      }
+      else{
+        /*No active EOB run in stack 1 to extend; flush stack 0's, if any.*/
+        if(eob_run0>0){
+          token=oc_make_eob_token_full(eob_run0,&eb);
+          dct_tokens0[ti0]=(unsigned char)token;
+          extra_bits0[ti0]=(ogg_uint16_t)eb;
+          ti0++;
+          eob_run0=0;
+        }
+        /*Stack 1 token is one of: a pure zero run token, a single
+           coefficient token, or a zero run/coefficient combo token.
+          A zero run token is expanded and moved to token stack 0, and the
+           stack 1 entry dropped.
+          A single coefficient value may be transformed into combo token that
+           is moved to stack 0, or if it cannot be combined, it is left alone
+           and a single length-1 zero run is emitted in stack 0.
+          A combo token is extended and moved to stack 0.
+          During AC coding, we restrict the run lengths on combo tokens for
+           stack 1 to guarantee we can extend them.*/
+        switch(token1){
+          case OC_DCT_SHORT_ZRL_TOKEN:{
+            if(eb1<7){
+              dct_tokens0[ti0]=OC_DCT_SHORT_ZRL_TOKEN;
+              extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
+              ti0++;
+              /*Don't write the AC coefficient back out.*/
+              continue;
+            }
+            /*Fall through.*/
+          }
+          case OC_DCT_ZRL_TOKEN:{
+            dct_tokens0[ti0]=OC_DCT_ZRL_TOKEN;
+            extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
+            ti0++;
+            /*Don't write the AC coefficient back out.*/
+          }continue;
+          case OC_ONE_TOKEN:
+          case OC_MINUS_ONE_TOKEN:{
+            dct_tokens0[ti0]=OC_DCT_RUN_CAT1A;
+            extra_bits0[ti0]=(ogg_uint16_t)(token1-OC_ONE_TOKEN);
+            ti0++;
+            /*Don't write the AC coefficient back out.*/
+          }continue;
+          case OC_TWO_TOKEN:
+          case OC_MINUS_TWO_TOKEN:{
+            dct_tokens0[ti0]=OC_DCT_RUN_CAT2A;
+            extra_bits0[ti0]=(ogg_uint16_t)((token1-OC_TWO_TOKEN)<<1);
+            ti0++;
+            /*Don't write the AC coefficient back out.*/
+          }continue;
+          case OC_DCT_VAL_CAT2:{
+            dct_tokens0[ti0]=OC_DCT_RUN_CAT2A;
+            extra_bits0[ti0]=(ogg_uint16_t)((eb1<<1)+1);
+            ti0++;
+            /*Don't write the AC coefficient back out.*/
+          }continue;
+          case OC_DCT_RUN_CAT1A:
+          case OC_DCT_RUN_CAT1A+1:
+          case OC_DCT_RUN_CAT1A+2:
+          case OC_DCT_RUN_CAT1A+3:{
+            dct_tokens0[ti0]=(unsigned char)(token1+1);
+            extra_bits0[ti0]=(ogg_uint16_t)eb1;
+            ti0++;
+            /*Don't write the AC coefficient back out.*/
+          }continue;
+          case OC_DCT_RUN_CAT1A+4:{
+            dct_tokens0[ti0]=OC_DCT_RUN_CAT1B;
+            extra_bits0[ti0]=(ogg_uint16_t)(eb1<<2);
+            ti0++;
+            /*Don't write the AC coefficient back out.*/
+          }continue;
+          case OC_DCT_RUN_CAT1B:{
+            if((eb1&3)<3){
+              dct_tokens0[ti0]=OC_DCT_RUN_CAT1B;
+              extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
+              ti0++;
+              /*Don't write the AC coefficient back out.*/
+              continue;
+            }
+            eb1=((eb1&4)<<1)-1;
+            /*Fall through.*/
+          }
+          case OC_DCT_RUN_CAT1C:{
+            dct_tokens0[ti0]=OC_DCT_RUN_CAT1C;
+            extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
+            ti0++;
+            /*Don't write the AC coefficient back out.*/
+          }continue;
+          case OC_DCT_RUN_CAT2A:{
+            eb1=(eb1<<1)-1;
+            /*Fall through.*/
+          }
+          case OC_DCT_RUN_CAT2B:{
+            dct_tokens0[ti0]=OC_DCT_RUN_CAT2B;
+            extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
+            ti0++;
+            /*Don't write the AC coefficient back out.*/
+          }continue;
+        }
+        /*We can't merge tokens, write a short zero run and keep going.*/
+        dct_tokens0[ti0]=OC_DCT_SHORT_ZRL_TOKEN;
+        extra_bits0[ti0]=0;
+        ti0++;
+      }
+    }
+    if(!neobs1){
+      /*Flush any (inactive) EOB run.*/
+      if(eob_run1>0){
+        token=oc_make_eob_token_full(eob_run1,&eb);
+        dct_tokens1[ti1w]=(unsigned char)token;
+        extra_bits1[ti1w]=(ogg_uint16_t)eb;
+        ti1w++;
+        eob_run1=0;
+      }
+      /*There's no active EOB run, so log the current token.*/
+      dct_tokens1[ti1w]=(unsigned char)token1;
+      extra_bits1[ti1w]=(ogg_uint16_t)eb1;
+      ti1w++;
+    }
+    else{
+      /*Otherwise consume one EOB from the current run.*/
+      neobs1--;
+      /*If we have more than 4095 EOBs outstanding in stack1, flush the run.*/
+      if(eob_run1-neobs1>=4095){
+        token=oc_make_eob_token_full(4095,&eb);
+        dct_tokens1[ti1w]=(unsigned char)token;
+        extra_bits1[ti1w]=(ogg_uint16_t)eb;
+        ti1w++;
+        eob_run1-=4095;
+      }
+    }
+  }
+  /*Save the current state.*/
+  _enc->ndct_tokens[_pli][0]=ti0;
+  _enc->ndct_tokens[_pli][1]=ti1w;
+  _enc->eob_run[_pli][0]=eob_run0;
+  _enc->eob_run[_pli][1]=eob_run1;
+}
+
+/*Final EOB run welding: flushes every pending EOB run, then merges an EOB run
+   at the start of a token list into one ending the previous non-empty list.*/
+void oc_enc_tokenize_finish(oc_enc_ctx *_enc){
+  int pli;
+  int zzi;
+  /*Emit final EOB runs.*/
+  for(pli=0;pli<3;pli++)for(zzi=0;zzi<64;zzi++){
+    int eob_run;
+    eob_run=_enc->eob_run[pli][zzi];
+    if(eob_run>0)oc_enc_eob_log(_enc,pli,zzi,eob_run);
+  }
+  /*Merge the final EOB run of one token list into the start of the next.*/
+  for(zzi=0;zzi<64;zzi++)for(pli=0;pli<3;pli++){
+    int       old_tok1;
+    int       old_tok2;
+    int       old_eb1;
+    int       old_eb2;
+    int       new_tok;
+    int       new_eb;
+    int       zzj;
+    int       plj;
+    ptrdiff_t ti=0;
+    int       run_count;
+    /*Make sure this coefficient has tokens at all.*/
+    if(_enc->ndct_tokens[pli][zzi]<=0)continue;
+    /*Ensure the first token is an EOB run.*/
+    old_tok2=_enc->dct_tokens[pli][zzi][0];
+    if(old_tok2>=OC_NDCT_EOB_TOKEN_MAX)continue;
+    /*Search for a previous coefficient that has any tokens at all.*/
+    old_tok1=OC_NDCT_EOB_TOKEN_MAX;
+    for(zzj=zzi,plj=pli;zzj>=0;zzj--){
+      while(plj-->0){
+        ti=_enc->ndct_tokens[plj][zzj]-1;
+        if(ti>=_enc->dct_token_offs[plj][zzj]){
+          old_tok1=_enc->dct_tokens[plj][zzj][ti];
+          break;
+        }
+      }
+      if(plj>=0)break;
+      plj=3;
+    }
+    /*Ensure its last token was an EOB run.*/
+    if(old_tok1>=OC_NDCT_EOB_TOKEN_MAX)continue;
+    /*Pull off the associated extra bits, if any, and decode the runs.*/
+    old_eb1=_enc->extra_bits[plj][zzj][ti];
+    old_eb2=_enc->extra_bits[pli][zzi][0];
+    run_count=oc_decode_eob_token(old_tok1,old_eb1)
+     +oc_decode_eob_token(old_tok2,old_eb2);
+    /*We can't possibly combine these into one run.
+      It might be possible to split them more optimally, but we'll just leave
+       them as-is.*/
+    if(run_count>=4096)continue;
+    /*We CAN combine them into one run.*/
+    new_tok=oc_make_eob_token_full(run_count,&new_eb);
+    _enc->dct_tokens[plj][zzj][ti]=(unsigned char)new_tok;
+    _enc->extra_bits[plj][zzj][ti]=(ogg_uint16_t)new_eb;
+    _enc->dct_token_offs[pli][zzi]++; /*Skip the token we just absorbed.*/
+  }
+}
diff --git a/lib/x86/mmxencfrag.c b/lib/x86/mmxencfrag.c
new file mode 100644
index 0000000..c79ff01
--- /dev/null
+++ b/lib/x86/mmxencfrag.c
@@ -0,0 +1,900 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
+
+ ********************************************************************/
+#include <stddef.h>
+#include "x86enc.h"
+
+#if defined(OC_X86_ASM)
+
+/*Returns the sum of absolute differences of two 8x8 blocks sharing _ystride.*/
+unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  ptrdiff_t ystride3;
+  ptrdiff_t ret;
+  __asm__ __volatile__(
+    /*Load the first 4 rows of each block.*/
+    "movq (%[src]),%%mm0\n\t"
+    "movq (%[ref]),%%mm1\n\t"
+    "movq (%[src],%[ystride]),%%mm2\n\t"
+    "movq (%[ref],%[ystride]),%%mm3\n\t"
+    "lea (%[ystride],%[ystride],2),%[ystride3]\n\t"
+    "movq (%[src],%[ystride],2),%%mm4\n\t"
+    "movq (%[ref],%[ystride],2),%%mm5\n\t"
+    "movq (%[src],%[ystride3]),%%mm6\n\t"
+    "movq (%[ref],%[ystride3]),%%mm7\n\t"
+    /*Compute their SADs and add them in %%mm0*/
+    "psadbw %%mm1,%%mm0\n\t"
+    "psadbw %%mm3,%%mm2\n\t"
+    "lea (%[src],%[ystride],4),%[src]\n\t"
+    "paddw %%mm2,%%mm0\n\t"
+    "lea (%[ref],%[ystride],4),%[ref]\n\t"
+    /*Load the next 3 rows as registers become available.*/
+    "movq (%[src]),%%mm2\n\t"
+    "movq (%[ref]),%%mm3\n\t"
+    "psadbw %%mm5,%%mm4\n\t"
+    "psadbw %%mm7,%%mm6\n\t"
+    "paddw %%mm4,%%mm0\n\t"
+    "movq (%[ref],%[ystride]),%%mm5\n\t"
+    "movq (%[src],%[ystride]),%%mm4\n\t"
+    "paddw %%mm6,%%mm0\n\t"
+    "movq (%[ref],%[ystride],2),%%mm7\n\t"
+    "movq (%[src],%[ystride],2),%%mm6\n\t"
+    /*Start adding their SADs to %%mm0*/
+    "psadbw %%mm3,%%mm2\n\t"
+    "psadbw %%mm5,%%mm4\n\t"
+    "paddw %%mm2,%%mm0\n\t"
+    "psadbw %%mm7,%%mm6\n\t"
+    /*Load the last row as registers free up, then finish adding up the SADs.*/
+    "movq (%[src],%[ystride3]),%%mm2\n\t"
+    "movq (%[ref],%[ystride3]),%%mm3\n\t"
+    "paddw %%mm4,%%mm0\n\t"
+    "psadbw %%mm3,%%mm2\n\t"
+    "paddw %%mm6,%%mm0\n\t"
+    "paddw %%mm2,%%mm0\n\t"
+    "movd %%mm0,%[ret]\n\t"
+    :[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref),[ystride3]"=&r"(ystride3)
+    :[ystride]"r"((ptrdiff_t)_ystride)
+  );
+  return (unsigned)ret;
+}
+
+unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh){
+  /*Early termination is for suckers: _thresh is ignored, full SAD returned.*/
+  return oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
+}
+
+/*Adds to %[ret] the SAD of two %[src] rows (%%mm4,%%mm5) against the average
+   of the matching %[ref1],%[ref2] rows (%%mm0...%%mm3); needs {1}x8 in %%mm7.
+  We pre-load the next two rows of data as registers become available.*/
+#define OC_SAD2_LOOP \
+ "#OC_SAD2_LOOP\n\t" \
+ /*We want to compute (%%mm0+%%mm1>>1) on unsigned bytes without overflow, but \
+    pavgb computes (%%mm0+%%mm1+1>>1). \
+   The latter is exactly 1 too large when the low bit of two corresponding \
+    bytes is only set in one of them. \
+   Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
+    correct the output of pavgb.*/ \
+ "movq %%mm0,%%mm6\n\t" \
+ "lea (%[ref1],%[ystride],2),%[ref1]\n\t" \
+ "pxor %%mm1,%%mm0\n\t" \
+ "pavgb %%mm1,%%mm6\n\t" \
+ "lea (%[ref2],%[ystride],2),%[ref2]\n\t" \
+ "movq %%mm2,%%mm1\n\t" \
+ "pand %%mm7,%%mm0\n\t" \
+ "pavgb %%mm3,%%mm2\n\t" \
+ "pxor %%mm3,%%mm1\n\t" \
+ "movq (%[ref2],%[ystride]),%%mm3\n\t" \
+ "psubb %%mm0,%%mm6\n\t" \
+ "movq (%[ref1]),%%mm0\n\t" \
+ "pand %%mm7,%%mm1\n\t" \
+ "psadbw %%mm6,%%mm4\n\t" \
+ "movd %[ret],%%mm6\n\t" \
+ "psubb %%mm1,%%mm2\n\t" \
+ "movq (%[ref2]),%%mm1\n\t" \
+ "lea (%[src],%[ystride],2),%[src]\n\t" \
+ "psadbw %%mm2,%%mm5\n\t" \
+ "movq (%[ref1],%[ystride]),%%mm2\n\t" \
+ "paddw %%mm4,%%mm5\n\t" \
+ "movq (%[src]),%%mm4\n\t" \
+ "paddw %%mm5,%%mm6\n\t" \
+ "movq (%[src],%[ystride]),%%mm5\n\t" \
+ "movd %%mm6,%[ret]\n\t" \
+
+/*Same as OC_SAD2_LOOP, but does not pre-load the next two rows or advance.*/
+#define OC_SAD2_TAIL \
+ "#OC_SAD2_TAIL\n\t" \
+ "movq %%mm0,%%mm6\n\t" \
+ "pavgb %%mm1,%%mm0\n\t" \
+ "pxor %%mm1,%%mm6\n\t" \
+ "movq %%mm2,%%mm1\n\t" \
+ "pand %%mm7,%%mm6\n\t" \
+ "pavgb %%mm3,%%mm2\n\t" \
+ "pxor %%mm3,%%mm1\n\t" \
+ "psubb %%mm6,%%mm0\n\t" \
+ "pand %%mm7,%%mm1\n\t" \
+ "psadbw %%mm0,%%mm4\n\t" \
+ "psubb %%mm1,%%mm2\n\t" \
+ "movd %[ret],%%mm6\n\t" \
+ "psadbw %%mm2,%%mm5\n\t" \
+ "paddw %%mm4,%%mm5\n\t" \
+ "paddw %%mm5,%%mm6\n\t" \
+ "movd %%mm6,%[ret]\n\t" \
+
+unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh){/*8x8 SAD vs. the average of _ref1,_ref2; _thresh unused.*/
+  ptrdiff_t ret;
+  __asm__ __volatile__(
+    "movq (%[ref1]),%%mm0\n\t"
+    "movq (%[ref2]),%%mm1\n\t"
+    "movq (%[ref1],%[ystride]),%%mm2\n\t"
+    "movq (%[ref2],%[ystride]),%%mm3\n\t"
+    "xor %[ret],%[ret]\n\t"
+    "movq (%[src]),%%mm4\n\t"
+    "pxor %%mm7,%%mm7\n\t"
+    "pcmpeqb %%mm6,%%mm6\n\t"
+    "movq (%[src],%[ystride]),%%mm5\n\t"
+    "psubb %%mm6,%%mm7\n\t" /*mm7=0-(-1)={1}x8, as the macros below expect.*/
+    OC_SAD2_LOOP
+    OC_SAD2_LOOP
+    OC_SAD2_LOOP
+    OC_SAD2_TAIL
+    :[ret]"=&a"(ret),[src]"+r"(_src),[ref1]"+r"(_ref1),[ref2]"+r"(_ref2)
+    :[ystride]"r"((ptrdiff_t)_ystride):"cc" /*xor modifies the flags.*/
+  );
+  return (unsigned)ret;
+}
+
+/*Load 8 rows of 4 pixels at offset _off from %[src] and %[ref] and compute
+   their 16-bit differences in %%mm0...%%mm7 (_off*2(%[buf]) is scratch).*/
+#define OC_LOAD_SUB_8x4(_off) \
+ "#OC_LOAD_SUB_8x4\n\t" \
+ "movd "_off"(%[src]),%%mm0\n\t" \
+ "movd "_off"(%[ref]),%%mm4\n\t" \
+ "movd "_off"(%[src],%[src_ystride]),%%mm1\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "movd "_off"(%[ref],%[ref_ystride]),%%mm5\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "movd "_off"(%[src]),%%mm2\n\t" \
+ "movd "_off"(%[ref]),%%mm7\n\t" \
+ "movd "_off"(%[src],%[src_ystride]),%%mm3\n\t" \
+ "movd "_off"(%[ref],%[ref_ystride]),%%mm6\n\t" \
+ "punpcklbw %%mm4,%%mm0\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "punpcklbw %%mm4,%%mm4\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "psubw %%mm4,%%mm0\n\t" \
+ "movd "_off"(%[src]),%%mm4\n\t" \
+ "movq %%mm0,"_off"*2(%[buf])\n\t" \
+ "movd "_off"(%[ref]),%%mm0\n\t" \
+ "punpcklbw %%mm5,%%mm1\n\t" \
+ "punpcklbw %%mm5,%%mm5\n\t" \
+ "psubw %%mm5,%%mm1\n\t" \
+ "movd "_off"(%[src],%[src_ystride]),%%mm5\n\t" \
+ "punpcklbw %%mm7,%%mm2\n\t" \
+ "punpcklbw %%mm7,%%mm7\n\t" \
+ "psubw %%mm7,%%mm2\n\t" \
+ "movd "_off"(%[ref],%[ref_ystride]),%%mm7\n\t" \
+ "punpcklbw %%mm6,%%mm3\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "punpcklbw %%mm6,%%mm6\n\t" \
+ "psubw %%mm6,%%mm3\n\t" \
+ "movd "_off"(%[src]),%%mm6\n\t" \
+ "punpcklbw %%mm0,%%mm4\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "punpcklbw %%mm0,%%mm0\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "psubw %%mm0,%%mm4\n\t" \
+ "movd "_off"(%[ref]),%%mm0\n\t" \
+ "punpcklbw %%mm7,%%mm5\n\t" \
+ "neg %[src_ystride]\n\t" \
+ "punpcklbw %%mm7,%%mm7\n\t" \
+ "psubw %%mm7,%%mm5\n\t" \
+ "movd "_off"(%[src],%[src_ystride]),%%mm7\n\t" \
+ "punpcklbw %%mm0,%%mm6\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "punpcklbw %%mm0,%%mm0\n\t" \
+ "neg %[ref_ystride]\n\t" \
+ "psubw %%mm0,%%mm6\n\t" \
+ "movd "_off"(%[ref],%[ref_ystride]),%%mm0\n\t" \
+ "lea (%[src],%[src_ystride],8),%[src]\n\t" \
+ "punpcklbw %%mm0,%%mm7\n\t" \
+ "neg %[src_ystride]\n\t" \
+ "punpcklbw %%mm0,%%mm0\n\t" \
+ "lea (%[ref],%[ref_ystride],8),%[ref]\n\t" \
+ "psubw %%mm0,%%mm7\n\t" \
+ "neg %[ref_ystride]\n\t" \
+ "movq "_off"*2(%[buf]),%%mm0\n\t" \
+
+/*Load 8 rows of 4 pixels from %[src],%[src4] into %%mm0...%%mm7 as words.*/
+#define OC_LOAD_8x4(_off) \
+ "#OC_LOAD_8x4\n\t" \
+ "movd "_off"(%[src]),%%mm0\n\t" \
+ "movd "_off"(%[src],%[ystride]),%%mm1\n\t" \
+ "movd "_off"(%[src],%[ystride],2),%%mm2\n\t" \
+ "pxor %%mm7,%%mm7\n\t" \
+ "movd "_off"(%[src],%[ystride3]),%%mm3\n\t" \
+ "punpcklbw %%mm7,%%mm0\n\t" \
+ "movd "_off"(%[src4]),%%mm4\n\t" \
+ "punpcklbw %%mm7,%%mm1\n\t" \
+ "movd "_off"(%[src4],%[ystride]),%%mm5\n\t" \
+ "punpcklbw %%mm7,%%mm2\n\t" \
+ "movd "_off"(%[src4],%[ystride],2),%%mm6\n\t" \
+ "punpcklbw %%mm7,%%mm3\n\t" \
+ "movd "_off"(%[src4],%[ystride3]),%%mm7\n\t" \
+ "punpcklbw %%mm4,%%mm4\n\t" \
+ "punpcklbw %%mm5,%%mm5\n\t" \
+ "psrlw $8,%%mm4\n\t" \
+ "psrlw $8,%%mm5\n\t" \
+ "punpcklbw %%mm6,%%mm6\n\t" \
+ "punpcklbw %%mm7,%%mm7\n\t" \
+ "psrlw $8,%%mm6\n\t" \
+ "psrlw $8,%%mm7\n\t" \
+
+/*Performs the first two stages of an 8-point 1-D Hadamard transform on
+   %%mm0...%%mm7.
+  The transform is in place, except outputs 0-3 are swapped with outputs 4-7.
+  Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to
+   perform this stage in place with no temporary registers).*/
+#define OC_HADAMARD_AB_8x4 \
+ "#OC_HADAMARD_AB_8x4\n\t" \
+ /*Stage A: \
+   Outputs 0-3 are swapped with 4-7 here.*/ \
+ "paddw %%mm1,%%mm5\n\t" \
+ "paddw %%mm2,%%mm6\n\t" \
+ "paddw %%mm1,%%mm1\n\t" \
+ "paddw %%mm2,%%mm2\n\t" \
+ "psubw %%mm5,%%mm1\n\t" \
+ "psubw %%mm6,%%mm2\n\t" \
+ "paddw %%mm3,%%mm7\n\t" \
+ "paddw %%mm0,%%mm4\n\t" \
+ "paddw %%mm3,%%mm3\n\t" \
+ "paddw %%mm0,%%mm0\n\t" \
+ "psubw %%mm7,%%mm3\n\t" \
+ "psubw %%mm4,%%mm0\n\t" \
+ /*Stage B:*/ \
+ "paddw %%mm2,%%mm0\n\t" \
+ "paddw %%mm3,%%mm1\n\t" \
+ "paddw %%mm6,%%mm4\n\t" \
+ "paddw %%mm7,%%mm5\n\t" \
+ "paddw %%mm2,%%mm2\n\t" \
+ "paddw %%mm3,%%mm3\n\t" \
+ "paddw %%mm6,%%mm6\n\t" \
+ "paddw %%mm7,%%mm7\n\t" \
+ "psubw %%mm0,%%mm2\n\t" \
+ "psubw %%mm1,%%mm3\n\t" \
+ "psubw %%mm4,%%mm6\n\t" \
+ "psubw %%mm5,%%mm7\n\t" \
+
+/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
+  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
+   place with no temporary registers).*/
+#define OC_HADAMARD_C_8x4 \
+ "#OC_HADAMARD_C_8x4\n\t" \
+ /*Stage C:*/ \
+ "paddw %%mm1,%%mm0\n\t" \
+ "paddw %%mm3,%%mm2\n\t" \
+ "paddw %%mm5,%%mm4\n\t" \
+ "paddw %%mm7,%%mm6\n\t" \
+ "paddw %%mm1,%%mm1\n\t" \
+ "paddw %%mm3,%%mm3\n\t" \
+ "paddw %%mm5,%%mm5\n\t" \
+ "paddw %%mm7,%%mm7\n\t" \
+ "psubw %%mm0,%%mm1\n\t" \
+ "psubw %%mm2,%%mm3\n\t" \
+ "psubw %%mm4,%%mm5\n\t" \
+ "psubw %%mm6,%%mm7\n\t" \
+
+/*Performs an 8-point 1-D Hadamard transform (stages A and B, then stage C).
+  The transform is performed in place, except that outputs 0-3 are swapped with
+   outputs 4-7.
+  Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform
+   in place with no temporary registers).*/
+#define OC_HADAMARD_8x4 \
+ OC_HADAMARD_AB_8x4 \
+ OC_HADAMARD_C_8x4 \
+
+/*Performs the first part of the final stage of the Hadamard transform and
+   summing of absolute values, spilling %%mm6,%%mm7 to _r6,_r7(%[buf]).
+  At the end of this part, %%mm1 will contain the DC coefficient of the
+   transform.*/
+#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
+ /*We use the fact that \
+     (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
+    to merge the final butterfly with the abs and the first stage of \
+    accumulation. \
+   Thus we can avoid using pabsw, which is not available until SSSE3. \
+   Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
+    implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
+    registers). \
+   Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
+   This implementation is only 26 (+4 for spilling registers).*/ \
+ "#OC_HADAMARD_C_ABS_ACCUM_A_8x4\n\t" \
+ "movq %%mm7,"_r7"(%[buf])\n\t" \
+ "movq %%mm6,"_r6"(%[buf])\n\t" \
+ /*mm7={0x7FFF}x4 \
+   mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
+ "pcmpeqb %%mm7,%%mm7\n\t" \
+ "movq %%mm0,%%mm6\n\t" \
+ "psrlw $1,%%mm7\n\t" \
+ "paddw %%mm1,%%mm6\n\t" \
+ "pmaxsw %%mm1,%%mm0\n\t" \
+ "paddsw %%mm7,%%mm6\n\t" \
+ "psubw %%mm6,%%mm0\n\t" \
+ /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
+   mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
+ "movq %%mm2,%%mm6\n\t" \
+ "movq %%mm4,%%mm1\n\t" \
+ "pmaxsw %%mm3,%%mm2\n\t" \
+ "pmaxsw %%mm5,%%mm4\n\t" \
+ "paddw %%mm3,%%mm6\n\t" \
+ "paddw %%mm5,%%mm1\n\t" \
+ "movq "_r7"(%[buf]),%%mm3\n\t" \
+
+/*Performs the second part of the final stage of the Hadamard transform and
+   summing of absolute values; leaves the sums in %%mm0 and {1}x4 in %%mm7.*/
+#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
+ "#OC_HADAMARD_C_ABS_ACCUM_B_8x4\n\t" \
+ "paddsw %%mm7,%%mm6\n\t" \
+ "movq "_r6"(%[buf]),%%mm5\n\t" \
+ "paddsw %%mm7,%%mm1\n\t" \
+ "psubw %%mm6,%%mm2\n\t" \
+ "psubw %%mm1,%%mm4\n\t" \
+ /*mm7={1}x4 (needed for the horizontal add that follows) \
+   mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
+ "movq %%mm3,%%mm6\n\t" \
+ "pmaxsw %%mm5,%%mm3\n\t" \
+ "paddw %%mm2,%%mm0\n\t" \
+ "paddw %%mm5,%%mm6\n\t" \
+ "paddw %%mm4,%%mm0\n\t" \
+ "paddsw %%mm7,%%mm6\n\t" \
+ "paddw %%mm3,%%mm0\n\t" \
+ "psrlw $14,%%mm7\n\t" \
+ "psubw %%mm6,%%mm0\n\t" \
+
+/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
+   absolute value of each component, and accumulates everything into %%mm0.
+  This is the only portion of SATD which requires MMXEXT (we could use plain
+   MMX, but it takes 4 instructions and an extra register to work around the
+   lack of a pmaxsw, which is a pretty serious penalty).*/
+#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
+ OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
+ OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
+
+/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
+   component, and accumulates everything into %%mm0 (_r6,_r7: spill offsets).
+  Note that %%mm0 will have an extra 4 added to each column, and that after
+   removing this value, the remainder will be half the conventional value.*/
+#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) \
+ OC_HADAMARD_AB_8x4 \
+ OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7)
+
+/*Performs two 4x4 transposes (mostly) in place; 0x10+_off(%[buf]) is scratch.
+  On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}
+   contains rows {a,b,c,d}.
+  On output, {0x40,0x50,0x60,0x70}+_off(%[buf]) contains {e,f,g,h}^T, and
+   {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/
+#define OC_TRANSPOSE_4x4x2(_off) \
+ "#OC_TRANSPOSE_4x4x2\n\t" \
+ /*First 4x4 transpose:*/ \
+ "movq %%mm5,0x10+"_off"(%[buf])\n\t" \
+ /*mm0 = e3 e2 e1 e0 \
+   mm1 = f3 f2 f1 f0 \
+   mm2 = g3 g2 g1 g0 \
+   mm3 = h3 h2 h1 h0*/ \
+ "movq %%mm2,%%mm5\n\t" \
+ "punpcklwd %%mm3,%%mm2\n\t" \
+ "punpckhwd %%mm3,%%mm5\n\t" \
+ "movq %%mm0,%%mm3\n\t" \
+ "punpcklwd %%mm1,%%mm0\n\t" \
+ "punpckhwd %%mm1,%%mm3\n\t" \
+ /*mm0 = f1 e1 f0 e0 \
+   mm3 = f3 e3 f2 e2 \
+   mm2 = h1 g1 h0 g0 \
+   mm5 = h3 g3 h2 g2*/ \
+ "movq %%mm0,%%mm1\n\t" \
+ "punpckldq %%mm2,%%mm0\n\t" \
+ "punpckhdq %%mm2,%%mm1\n\t" \
+ "movq %%mm3,%%mm2\n\t" \
+ "punpckhdq %%mm5,%%mm3\n\t" \
+ "movq %%mm0,0x40+"_off"(%[buf])\n\t" \
+ "punpckldq %%mm5,%%mm2\n\t" \
+ /*mm0 = h0 g0 f0 e0 \
+   mm1 = h1 g1 f1 e1 \
+   mm2 = h2 g2 f2 e2 \
+   mm3 = h3 g3 f3 e3*/ \
+ "movq 0x10+"_off"(%[buf]),%%mm5\n\t" \
+ /*Second 4x4 transpose:*/ \
+ /*mm4 = a3 a2 a1 a0 \
+   mm5 = b3 b2 b1 b0 \
+   mm6 = c3 c2 c1 c0 \
+   mm7 = d3 d2 d1 d0*/ \
+ "movq %%mm6,%%mm0\n\t" \
+ "punpcklwd %%mm7,%%mm6\n\t" \
+ "movq %%mm1,0x50+"_off"(%[buf])\n\t" \
+ "punpckhwd %%mm7,%%mm0\n\t" \
+ "movq %%mm4,%%mm7\n\t" \
+ "punpcklwd %%mm5,%%mm4\n\t" \
+ "movq %%mm2,0x60+"_off"(%[buf])\n\t" \
+ "punpckhwd %%mm5,%%mm7\n\t" \
+ /*mm4 = b1 a1 b0 a0 \
+   mm7 = b3 a3 b2 a2 \
+   mm6 = d1 c1 d0 c0 \
+   mm0 = d3 c3 d2 c2*/ \
+ "movq %%mm4,%%mm5\n\t" \
+ "punpckldq %%mm6,%%mm4\n\t" \
+ "movq %%mm3,0x70+"_off"(%[buf])\n\t" \
+ "punpckhdq %%mm6,%%mm5\n\t" \
+ "movq %%mm7,%%mm6\n\t" \
+ "punpckhdq %%mm0,%%mm7\n\t" \
+ "punpckldq %%mm0,%%mm6\n\t" \
+ /*mm4 = d0 c0 b0 a0 \
+   mm5 = d1 c1 b1 a1 \
+   mm6 = d2 c2 b2 a2 \
+   mm7 = d3 c3 b3 a3*/ \
+
+static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
+ int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){
+  OC_ALIGN8(ogg_int16_t  buf[64]); /*Spill and transpose scratch for the asm.*/
+  ogg_int16_t *bufp;
+  unsigned     ret;  /*Result; first half only if that already reaches _thresh.*/
+  unsigned     ret2; /*Holds _thresh for the compare, then the second half sum.*/
+  bufp=buf;
+  __asm__ __volatile__(
+    OC_LOAD_SUB_8x4("0x00")
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2("0x00")
+    /*Finish swapping out this 8x4 block to make room for the next one.
+      mm0...mm3 have been swapped out already.*/
+    "movq %%mm4,0x00(%[buf])\n\t"
+    "movq %%mm5,0x10(%[buf])\n\t"
+    "movq %%mm6,0x20(%[buf])\n\t"
+    "movq %%mm7,0x30(%[buf])\n\t"
+    OC_LOAD_SUB_8x4("0x04")
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2("0x08")
+    /*Here the first 4x4 block of output from the last transpose is the second
+       4x4 block of input for the next transform.
+      We have cleverly arranged that it already be in the appropriate place, so
+       we only have to do half the loads.*/
+    "movq 0x10(%[buf]),%%mm1\n\t"
+    "movq 0x20(%[buf]),%%mm2\n\t"
+    "movq 0x30(%[buf]),%%mm3\n\t"
+    "movq 0x00(%[buf]),%%mm0\n\t"
+    OC_HADAMARD_ABS_ACCUM_8x4("0x28","0x38")
+    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
+       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
+       for the factor of two we dropped + 3 for the vertical accumulation).
+      Now we finally have to promote things to dwords.
+      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
+       latency of pmaddwd by starting the next series of loads now.*/
+    "mov %[thresh],%[ret2]\n\t"
+    "pmaddwd %%mm7,%%mm0\n\t"
+    "movq 0x50(%[buf]),%%mm1\n\t"
+    "movq 0x58(%[buf]),%%mm5\n\t"
+    "movq %%mm0,%%mm4\n\t"
+    "movq 0x60(%[buf]),%%mm2\n\t"
+    "punpckhdq %%mm0,%%mm0\n\t"
+    "movq 0x68(%[buf]),%%mm6\n\t"
+    "paddd %%mm0,%%mm4\n\t"
+    "movq 0x70(%[buf]),%%mm3\n\t"
+    "movd %%mm4,%[ret]\n\t"
+    "movq 0x78(%[buf]),%%mm7\n\t"
+    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
+       added to them, and a factor of two removed; correct the final sum here.*/
+    "lea -32(%[ret],%[ret]),%[ret]\n\t"
+    "movq 0x40(%[buf]),%%mm0\n\t"
+    "cmp %[ret2],%[ret]\n\t"
+    "movq 0x48(%[buf]),%%mm4\n\t"
+    "jae 1f\n\t" /*Early out: skip the second half once ret>=_thresh.*/
+    OC_HADAMARD_ABS_ACCUM_8x4("0x68","0x78")
+    "pmaddwd %%mm7,%%mm0\n\t"
+    /*There isn't much to stick in here to hide the latency this time, but the
+       alternative to pmaddwd is movq->punpcklwd->punpckhwd->paddd, whose
+       latency is even worse.*/
+    "sub $32,%[ret]\n\t"
+    "movq %%mm0,%%mm4\n\t"
+    "punpckhdq %%mm0,%%mm0\n\t"
+    "paddd %%mm0,%%mm4\n\t"
+    "movd %%mm4,%[ret2]\n\t"
+    "lea (%[ret],%[ret2],2),%[ret]\n\t"
+    ".p2align 4,,15\n\t"
+    "1:\n\t"
+    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
+       and %[ret2] with some of the inputs, since for once we don't write to
+       them until after we're done using everything but %[buf] (which is also
+       listed as an output to ensure gcc _doesn't_ alias them against it).*/
+    /*Note that _src_ystride and _ref_ystride must be given non-overlapping
+       constraints, otherwise if gcc can prove they're equal it will allocate
+       them to the same register (which is bad); _src and _ref face a similar
+       problem, though those are never actually the same.*/
+    :[ret]"=a"(ret),[ret2]"=r"(ret2),[buf]"+r"(bufp)
+    :[src]"r"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
+     [ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride),
+     [thresh]"m"(_thresh)
+    /*We have to use neg, so we actually clobber the condition codes for once
+       (not to mention cmp and sub).*/
+    :"cc"
+  );
+  return ret;
+}
+
+unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh){
+  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh);
+}
+
+/*Writes the rounded-down average of two 8x8 blocks to _dst; separate strides
+   for _dst and the sources let oc_enc_frag_satd2_thresh_mmxext() share it.*/
+static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
+  __asm__ __volatile__(
+    /*Load the first 3 rows.*/
+    "movq (%[src1]),%%mm0\n\t"
+    "movq (%[src2]),%%mm1\n\t"
+    "movq (%[src1],%[src_ystride]),%%mm2\n\t"
+    "lea (%[src1],%[src_ystride],2),%[src1]\n\t"
+    "movq (%[src2],%[src_ystride]),%%mm3\n\t"
+    "lea (%[src2],%[src_ystride],2),%[src2]\n\t"
+    "pxor %%mm7,%%mm7\n\t"
+    "movq (%[src1]),%%mm4\n\t"
+    "pcmpeqb %%mm6,%%mm6\n\t"
+    "movq (%[src2]),%%mm5\n\t"
+    /*mm7={1}x8.*/
+    "psubb %%mm6,%%mm7\n\t"
+    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
+    "movq %%mm0,%%mm6\n\t"
+    "pxor %%mm1,%%mm0\n\t"
+    "pavgb %%mm1,%%mm6\n\t"
+    /*%%mm1 is free, start averaging %%mm3 into %%mm2 using %%mm1.*/
+    "movq %%mm2,%%mm1\n\t"
+    "pand %%mm7,%%mm0\n\t"
+    "pavgb %%mm3,%%mm2\n\t"
+    "pxor %%mm3,%%mm1\n\t"
+    /*%%mm3 is free.*/
+    "psubb %%mm0,%%mm6\n\t"
+    /*%%mm0 is free, start loading the next row.*/
+    "movq (%[src1],%[src_ystride]),%%mm0\n\t"
+    /*Start averaging %%mm5 and %%mm4 using %%mm3.*/
+    "movq %%mm4,%%mm3\n\t"
+    /*%%mm6 (row 0) is done; write it out.*/
+    "movq %%mm6,(%[dst])\n\t"
+    "pand %%mm7,%%mm1\n\t"
+    "pavgb %%mm5,%%mm4\n\t"
+    "psubb %%mm1,%%mm2\n\t"
+    /*%%mm1 is free, continue loading the next row.*/
+    "movq (%[src2],%[src_ystride]),%%mm1\n\t"
+    "pxor %%mm5,%%mm3\n\t"
+    "lea (%[src1],%[src_ystride],2),%[src1]\n\t"
+    /*%%mm2 (row 1) is done; write it out.*/
+    "movq %%mm2,(%[dst],%[dst_ystride])\n\t"
+    "pand %%mm7,%%mm3\n\t"
+    /*Start loading the next row.*/
+    "movq (%[src1]),%%mm2\n\t"
+    "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
+    "psubb %%mm3,%%mm4\n\t"
+    "lea (%[src2],%[src_ystride],2),%[src2]\n\t"
+    /*%%mm4 (row 2) is done; write it out.*/
+    "movq %%mm4,(%[dst])\n\t"
+    /*Continue loading the next row.*/
+    "movq (%[src2]),%%mm3\n\t"
+    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
+    "movq %%mm0,%%mm6\n\t"
+    "pxor %%mm1,%%mm0\n\t"
+    /*Start loading the next row.*/
+    "movq (%[src1],%[src_ystride]),%%mm4\n\t"
+    "pavgb %%mm1,%%mm6\n\t"
+    /*%%mm1 is free; start averaging %%mm3 into %%mm2 using %%mm1.*/
+    "movq %%mm2,%%mm1\n\t"
+    "pand %%mm7,%%mm0\n\t"
+    /*Continue loading the next row.*/
+    "movq (%[src2],%[src_ystride]),%%mm5\n\t"
+    "pavgb %%mm3,%%mm2\n\t"
+    "lea (%[src1],%[src_ystride],2),%[src1]\n\t"
+    "pxor %%mm3,%%mm1\n\t"
+    /*%%mm3 is free.*/
+    "psubb %%mm0,%%mm6\n\t"
+    /*%%mm0 is free, start loading the next row.*/
+    "movq (%[src1]),%%mm0\n\t"
+    /*Start averaging %%mm5 into %%mm4 using %%mm3.*/
+    "movq %%mm4,%%mm3\n\t"
+    /*%%mm6 (row 3) is done; write it out.*/
+    "movq %%mm6,(%[dst],%[dst_ystride])\n\t"
+    "pand %%mm7,%%mm1\n\t"
+    "lea (%[src2],%[src_ystride],2),%[src2]\n\t"
+    "pavgb %%mm5,%%mm4\n\t"
+    "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
+    "psubb %%mm1,%%mm2\n\t"
+    /*%%mm1 is free; continue loading the next row.*/
+    "movq (%[src2]),%%mm1\n\t"
+    "pxor %%mm5,%%mm3\n\t"
+    /*%%mm2 (row 4) is done; write it out.*/
+    "movq %%mm2,(%[dst])\n\t"
+    "pand %%mm7,%%mm3\n\t"
+    /*Start loading the next row.*/
+    "movq (%[src1],%[src_ystride]),%%mm2\n\t"
+    "psubb %%mm3,%%mm4\n\t"
+    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
+    "movq %%mm0,%%mm6\n\t"
+    /*Continue loading the next row.*/
+    "movq (%[src2],%[src_ystride]),%%mm3\n\t"
+    /*%%mm4 (row 5) is done; write it out.*/
+    "movq %%mm4,(%[dst],%[dst_ystride])\n\t"
+    "pxor %%mm1,%%mm0\n\t"
+    "pavgb %%mm1,%%mm6\n\t"
+    /*%%mm4 is free; start averaging %%mm3 into %%mm2 using %%mm4.*/
+    "movq %%mm2,%%mm4\n\t"
+    "pand %%mm7,%%mm0\n\t"
+    "pavgb %%mm3,%%mm2\n\t"
+    "pxor %%mm3,%%mm4\n\t"
+    "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
+    "psubb %%mm0,%%mm6\n\t"
+    "pand %%mm7,%%mm4\n\t"
+    /*%%mm6 (row 6) is done, write it out.*/
+    "movq %%mm6,(%[dst])\n\t"
+    "psubb %%mm4,%%mm2\n\t"
+    /*%%mm2 (row 7) is done, write it out.*/
+    "movq %%mm2,(%[dst],%[dst_ystride])\n\t"
+    :[dst]"+r"(_dst),[src1]"+r"(_src1),[src2]"+r"(_src2)
+    :[dst_ystride]"r"((ptrdiff_t)_dst_ystride),
+     [src_ystride]"r"((ptrdiff_t)_src_ystride)
+    :"memory"
+  );
+}
+
+unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh){
+  OC_ALIGN8(unsigned char ref[64]);
+  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
+  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,ref,8,_thresh);
+}
+
+unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
+ int _ystride){
+  OC_ALIGN8(ogg_int16_t  buf[64]);
+  ogg_int16_t *bufp;
+  unsigned     ret;
+  unsigned     ret2;
+  bufp=buf;
+  __asm__ __volatile__(
+    OC_LOAD_8x4("0x00")
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2("0x00")
+    /*Finish swapping out this 8x4 block to make room for the next one.
+      mm0...mm3 have been swapped out already.*/
+    "movq %%mm4,0x00(%[buf])\n\t"
+    "movq %%mm5,0x10(%[buf])\n\t"
+    "movq %%mm6,0x20(%[buf])\n\t"
+    "movq %%mm7,0x30(%[buf])\n\t"
+    OC_LOAD_8x4("0x04")
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2("0x08")
+    /*Here the first 4x4 block of output from the last transpose is the second
+       4x4 block of input for the next transform.
+      We have cleverly arranged that it already be in the appropriate place, so
+       we only have to do half the loads.*/
+    "movq 0x10(%[buf]),%%mm1\n\t"
+    "movq 0x20(%[buf]),%%mm2\n\t"
+    "movq 0x30(%[buf]),%%mm3\n\t"
+    "movq 0x00(%[buf]),%%mm0\n\t"
+    /*We split out the stages here so we can save the DC coefficient in the
+       middle.*/
+    OC_HADAMARD_AB_8x4
+    OC_HADAMARD_C_ABS_ACCUM_A_8x4("0x28","0x38")
+    "movd %%mm1,%[ret]\n\t"
+    OC_HADAMARD_C_ABS_ACCUM_B_8x4("0x28","0x38")
+    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
+       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
+       for the factor of two we dropped + 3 for the vertical accumulation).
+      Now we finally have to promote things to dwords.
+      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
+       latency of pmaddwd by starting the next series of loads now.*/
+    "pmaddwd %%mm7,%%mm0\n\t"
+    "movq 0x50(%[buf]),%%mm1\n\t"
+    "movq 0x58(%[buf]),%%mm5\n\t"
+    "movq 0x60(%[buf]),%%mm2\n\t"
+    "movq %%mm0,%%mm4\n\t"
+    "movq 0x68(%[buf]),%%mm6\n\t"
+    "punpckhdq %%mm0,%%mm0\n\t"
+    "movq 0x70(%[buf]),%%mm3\n\t"
+    "paddd %%mm0,%%mm4\n\t"
+    "movq 0x78(%[buf]),%%mm7\n\t"
+    "movd %%mm4,%[ret2]\n\t"
+    "movq 0x40(%[buf]),%%mm0\n\t"
+    "movq 0x48(%[buf]),%%mm4\n\t"
+    OC_HADAMARD_ABS_ACCUM_8x4("0x68","0x78")
+    "pmaddwd %%mm7,%%mm0\n\t"
+    /*We assume that the DC coefficient is always positive (which is true,
+       because the input to the INTRA transform was not a difference).*/
+    "movzx %w[ret],%[ret]\n\t"
+    "add %[ret2],%[ret2]\n\t"
+    "sub %[ret],%[ret2]\n\t"
+    "movq %%mm0,%%mm4\n\t"
+    "punpckhdq %%mm0,%%mm0\n\t"
+    "paddd %%mm0,%%mm4\n\t"
+    "movd %%mm4,%[ret]\n\t"
+    "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
+    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
+       and %[ret2] with some of the inputs, since for once we don't write to
+       them until after we're done using everything but %[buf] (which is also
+       listed as an output to ensure gcc _doesn't_ alias them against it).*/
+    :[ret]"=a"(ret),[ret2]"=r"(ret2),[buf]"+r"(bufp)
+    :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
+     [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
+    /*We have to use sub, so we actually clobber the condition codes for once
+       (not to mention add).*/
+    :"cc"
+  );
+  return ret;
+}
+
+void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
+ const unsigned char *_src,const unsigned char *_ref,int _ystride){
+  int i;
+  __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::);
+  for(i=4;i-->0;){
+    __asm__ __volatile__(
+      /*mm0=[src]*/
+      "movq (%[src]),%%mm0\n\t"
+      /*mm1=[ref]*/
+      "movq (%[ref]),%%mm1\n\t"
+      /*mm4=[src+ystride]*/
+      "movq (%[src],%[ystride]),%%mm4\n\t"
+      /*mm5=[ref+ystride]*/
+      "movq (%[ref],%[ystride]),%%mm5\n\t"
+      /*Compute [src]-[ref]; mm7=0 was set by the asm before the loop.*/
+      "movq %%mm0,%%mm2\n\t"
+      "punpcklbw %%mm7,%%mm0\n\t"
+      "movq %%mm1,%%mm3\n\t"
+      "punpckhbw %%mm7,%%mm2\n\t"
+      "punpcklbw %%mm7,%%mm1\n\t"
+      "punpckhbw %%mm7,%%mm3\n\t"
+      "psubw %%mm1,%%mm0\n\t"
+      "psubw %%mm3,%%mm2\n\t"
+      /*Compute [src+ystride]-[ref+ystride].*/
+      "movq %%mm4,%%mm1\n\t"
+      "punpcklbw %%mm7,%%mm4\n\t"
+      "movq %%mm5,%%mm3\n\t"
+      "punpckhbw %%mm7,%%mm1\n\t"
+      "lea (%[src],%[ystride],2),%[src]\n\t"
+      "punpcklbw %%mm7,%%mm5\n\t"
+      "lea (%[ref],%[ystride],2),%[ref]\n\t"
+      "punpckhbw %%mm7,%%mm3\n\t"
+      "psubw %%mm5,%%mm4\n\t"
+      "psubw %%mm3,%%mm1\n\t"
+      /*Write the two rows of 16-bit differences out.*/
+      "movq %%mm0,0x00(%[residue])\n\t"
+      "movq %%mm2,0x08(%[residue])\n\t"
+      "movq %%mm4,0x10(%[residue])\n\t"
+      "movq %%mm1,0x18(%[residue])\n\t"
+      "lea 0x20(%[residue]),%[residue]\n\t"
+      :[residue]"+r"(_residue),[src]"+r"(_src),[ref]"+r"(_ref)
+      :[ystride]"r"((ptrdiff_t)_ystride)
+      :"memory"
+    );
+  }
+}
+
+void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
+ const unsigned char *_src,int _ystride){
+  ptrdiff_t ystride3;
+  __asm__ __volatile__(
+    /*mm0=[src]*/
+    "movq (%[src]),%%mm0\n\t"
+    /*mm1=[src+ystride]*/
+    "movq (%[src],%[ystride]),%%mm1\n\t"
+    /*mm6={-1}x4*/
+    "pcmpeqw %%mm6,%%mm6\n\t"
+    /*mm2=[src+2*ystride]*/
+    "movq (%[src],%[ystride],2),%%mm2\n\t"
+    /*[ystride3]=3*[ystride]*/
+    "lea (%[ystride],%[ystride],2),%[ystride3]\n\t"
+    /*mm6={0x8000}x4*/
+    "psllw $15,%%mm6\n\t"
+    /*mm3=[src+3*ystride]*/
+    "movq (%[src],%[ystride3]),%%mm3\n\t"
+    /*mm6={128}x4*/
+    "psrlw $8,%%mm6\n\t"
+    /*mm7=0*/
+    "pxor %%mm7,%%mm7\n\t"
+    /*[src]=[src]+4*[ystride]*/
+    "lea (%[src],%[ystride],4),%[src]\n\t"
+    /*Compute [src]-128 and [src+ystride]-128*/
+    "movq %%mm0,%%mm4\n\t"
+    "punpcklbw %%mm7,%%mm0\n\t"
+    "movq %%mm1,%%mm5\n\t"
+    "punpckhbw %%mm7,%%mm4\n\t"
+    "psubw %%mm6,%%mm0\n\t"
+    "punpcklbw %%mm7,%%mm1\n\t"
+    "psubw %%mm6,%%mm4\n\t"
+    "punpckhbw %%mm7,%%mm5\n\t"
+    "psubw %%mm6,%%mm1\n\t"
+    "psubw %%mm6,%%mm5\n\t"
+    /*Write the answer out.*/
+    "movq %%mm0,0x00(%[residue])\n\t"
+    "movq %%mm4,0x08(%[residue])\n\t"
+    "movq %%mm1,0x10(%[residue])\n\t"
+    "movq %%mm5,0x18(%[residue])\n\t"
+    /*mm0=[src+4*ystride]*/
+    "movq (%[src]),%%mm0\n\t"
+    /*mm1=[src+5*ystride]*/
+    "movq (%[src],%[ystride]),%%mm1\n\t"
+    /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/
+    "movq %%mm2,%%mm4\n\t"
+    "punpcklbw %%mm7,%%mm2\n\t"
+    "movq %%mm3,%%mm5\n\t"
+    "punpckhbw %%mm7,%%mm4\n\t"
+    "psubw %%mm6,%%mm2\n\t"
+    "punpcklbw %%mm7,%%mm3\n\t"
+    "psubw %%mm6,%%mm4\n\t"
+    "punpckhbw %%mm7,%%mm5\n\t"
+    "psubw %%mm6,%%mm3\n\t"
+    "psubw %%mm6,%%mm5\n\t"
+    /*Write the answer out.*/
+    "movq %%mm2,0x20(%[residue])\n\t"
+    "movq %%mm4,0x28(%[residue])\n\t"
+    "movq %%mm3,0x30(%[residue])\n\t"
+    "movq %%mm5,0x38(%[residue])\n\t"
+    /*mm2=[src+6*ystride]*/
+    "movq (%[src],%[ystride],2),%%mm2\n\t"
+    /*mm3=[src+7*ystride]*/
+    "movq (%[src],%[ystride3]),%%mm3\n\t"
+    /*Compute [src+4*ystride]-128 and [src+5*ystride]-128*/
+    "movq %%mm0,%%mm4\n\t"
+    "punpcklbw %%mm7,%%mm0\n\t"
+    "movq %%mm1,%%mm5\n\t"
+    "punpckhbw %%mm7,%%mm4\n\t"
+    "psubw %%mm6,%%mm0\n\t"
+    "punpcklbw %%mm7,%%mm1\n\t"
+    "psubw %%mm6,%%mm4\n\t"
+    "punpckhbw %%mm7,%%mm5\n\t"
+    "psubw %%mm6,%%mm1\n\t"
+    "psubw %%mm6,%%mm5\n\t"
+    /*Write the answer out.*/
+    "movq %%mm0,0x40(%[residue])\n\t"
+    "movq %%mm4,0x48(%[residue])\n\t"
+    "movq %%mm1,0x50(%[residue])\n\t"
+    "movq %%mm5,0x58(%[residue])\n\t"
+    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
+    "movq %%mm2,%%mm4\n\t"
+    "punpcklbw %%mm7,%%mm2\n\t"
+    "movq %%mm3,%%mm5\n\t"
+    "punpckhbw %%mm7,%%mm4\n\t"
+    "psubw %%mm6,%%mm2\n\t"
+    "punpcklbw %%mm7,%%mm3\n\t"
+    "psubw %%mm6,%%mm4\n\t"
+    "punpckhbw %%mm7,%%mm5\n\t"
+    "psubw %%mm6,%%mm3\n\t"
+    "psubw %%mm6,%%mm5\n\t"
+    /*Write the answer out.*/
+    "movq %%mm2,0x60(%[residue])\n\t"
+    "movq %%mm4,0x68(%[residue])\n\t"
+    "movq %%mm3,0x70(%[residue])\n\t"
+    "movq %%mm5,0x78(%[residue])\n\t"
+    :[src]"+r"(_src),[ystride3]"=&r"(ystride3)
+    :[residue]"r"(_residue),[ystride]"r"((ptrdiff_t)_ystride)
+    :"memory"
+  );
+}
+
+void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride){
+  oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride);
+}
+
+#endif
diff --git a/lib/x86/mmxfdct.c b/lib/x86/mmxfdct.c
new file mode 100644
index 0000000..2118752
--- /dev/null
+++ b/lib/x86/mmxfdct.c
@@ -0,0 +1,665 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************/
+/*MMX fDCT implementation for x86_32*/
+/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
+#include "x86enc.h"
+
+#if defined(OC_X86_ASM)
+
+# define OC_FDCT_STAGE1_8x4 \
+ "#OC_FDCT_STAGE1_8x4\n\t" \
+ /*Stage 1:*/ \
+ /*mm0=t7'=t0-t7*/ \
+ "psubw %%mm7,%%mm0\n\t" \
+ "paddw %%mm7,%%mm7\n\t" \
+ /*mm1=t6'=t1-t6*/ \
+ "psubw %%mm6,%%mm1\n\t" \
+ "paddw %%mm6,%%mm6\n\t" \
+ /*mm2=t5'=t2-t5*/ \
+ "psubw %%mm5,%%mm2\n\t" \
+ "paddw %%mm5,%%mm5\n\t" \
+ /*mm3=t4'=t3-t4*/ \
+ "psubw %%mm4,%%mm3\n\t" \
+ "paddw %%mm4,%%mm4\n\t" \
+ /*mm7=t0'=t0+t7*/ \
+ "paddw %%mm0,%%mm7\n\t" \
+ /*mm6=t1'=t1+t6*/ \
+ "paddw %%mm1,%%mm6\n\t" \
+ /*mm5=t2'=t2+t5*/ \
+ "paddw %%mm2,%%mm5\n\t" \
+ /*mm4=t3'=t3+t4*/ \
+ "paddw %%mm3,%%mm4\n\t" \
+
+# define OC_FDCT8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) \
+ "#OC_FDCT8x4\n\t" \
+ /*Stage 2:*/ \
+ /*mm7=t3''=t0'-t3'*/ \
+ "psubw %%mm4,%%mm7\n\t" \
+ "paddw %%mm4,%%mm4\n\t" \
+ /*mm6=t2''=t1'-t2'*/ \
+ "psubw %%mm5,%%mm6\n\t" \
+ "movq %%mm7,"_r6"(%[y])\n\t" \
+ "paddw %%mm5,%%mm5\n\t" \
+ /*mm1=t5''=t6'-t5'*/ \
+ "psubw %%mm2,%%mm1\n\t" \
+ "movq %%mm6,"_r2"(%[y])\n\t" \
+ /*mm4=t0''=t0'+t3'*/ \
+ "paddw %%mm7,%%mm4\n\t" \
+ "paddw %%mm2,%%mm2\n\t" \
+ /*mm5=t1''=t1'+t2'*/ \
+ "movq %%mm4,"_r0"(%[y])\n\t" \
+ "paddw %%mm6,%%mm5\n\t" \
+ /*mm2=t6''=t6'+t5'*/ \
+ "paddw %%mm1,%%mm2\n\t" \
+ "movq %%mm5,"_r4"(%[y])\n\t" \
+ /*mm0=t7', mm1=t5'', mm2=t6'', mm3=t4'.*/ \
+ /*mm4, mm5, mm6, mm7 are free.*/ \
+ /*Stage 3:*/ \
+ /*mm6={2}x4, mm7={27146,0xB500>>1}x2*/ \
+ "mov $0x5A806A0A,%[a]\n\t" \
+ "pcmpeqb %%mm6,%%mm6\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "psrlw $15,%%mm6\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "paddw %%mm6,%%mm6\n\t" \
+ /*mm0=0, mm2={-1}x4 \
+   mm5:mm4=t5''*27146+0xB500*/ \
+ "movq %%mm1,%%mm4\n\t" \
+ "movq %%mm1,%%mm5\n\t" \
+ "punpcklwd %%mm6,%%mm4\n\t" \
+ "movq %%mm2,"_r3"(%[y])\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "movq %%mm0,"_r7"(%[y])\n\t" \
+ "punpckhwd %%mm6,%%mm5\n\t" \
+ "pxor %%mm0,%%mm0\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "pcmpeqb %%mm2,%%mm2\n\t" \
+ /*mm2=t6'', mm1=t5''+(t5''!=0) \
+   mm4=(t5''*27146+0xB500>>16)*/ \
+ "pcmpeqw %%mm1,%%mm0\n\t" \
+ "psrad $16,%%mm4\n\t" \
+ "psubw %%mm2,%%mm0\n\t" \
+ "movq "_r3"(%[y]),%%mm2\n\t" \
+ "psrad $16,%%mm5\n\t" \
+ "paddw %%mm0,%%mm1\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ /*mm4=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
+ "paddw %%mm1,%%mm4\n\t" \
+ "movq "_r7"(%[y]),%%mm0\n\t" \
+ "psraw $1,%%mm4\n\t" \
+ "movq %%mm3,%%mm1\n\t" \
+ /*mm3=t4''=t4'+s*/ \
+ "paddw %%mm4,%%mm3\n\t" \
+ /*mm1=t5'''=t4'-s*/ \
+ "psubw %%mm4,%%mm1\n\t" \
+ /*mm1=0, mm3={-1}x4 \
+   mm5:mm4=t6''*27146+0xB500*/ \
+ "movq %%mm2,%%mm4\n\t" \
+ "movq %%mm2,%%mm5\n\t" \
+ "punpcklwd %%mm6,%%mm4\n\t" \
+ "movq %%mm1,"_r5"(%[y])\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "movq %%mm3,"_r1"(%[y])\n\t" \
+ "punpckhwd %%mm6,%%mm5\n\t" \
+ "pxor %%mm1,%%mm1\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "pcmpeqb %%mm3,%%mm3\n\t" \
+ /*mm2=t6''+(t6''!=0), mm4=(t6''*27146+0xB500>>16)*/ \
+ "psrad $16,%%mm4\n\t" \
+ "pcmpeqw %%mm2,%%mm1\n\t" \
+ "psrad $16,%%mm5\n\t" \
+ "psubw %%mm3,%%mm1\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ "paddw %%mm1,%%mm2\n\t" \
+ /*mm1=t1'' \
+   mm4=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
+ "paddw %%mm2,%%mm4\n\t" \
+ "movq "_r4"(%[y]),%%mm1\n\t" \
+ "psraw $1,%%mm4\n\t" \
+ "movq %%mm0,%%mm2\n\t" \
+ /*mm7 is still {27146,0xB500>>1}x2 \
+   mm0=t7''=t7'+s*/ \
+ "paddw %%mm4,%%mm0\n\t" \
+ /*mm2=t6'''=t7'-s*/ \
+ "psubw %%mm4,%%mm2\n\t" \
+ /*Stage 4:*/ \
+ /*mm0=0, mm2=t0'' \
+   mm5:mm4=t1''*27146+0xB500*/ \
+ "movq %%mm1,%%mm4\n\t" \
+ "movq %%mm1,%%mm5\n\t" \
+ "punpcklwd %%mm6,%%mm4\n\t" \
+ "movq %%mm2,"_r3"(%[y])\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "movq "_r0"(%[y]),%%mm2\n\t" \
+ "punpckhwd %%mm6,%%mm5\n\t" \
+ "movq %%mm0,"_r7"(%[y])\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "pxor %%mm0,%%mm0\n\t" \
+ /*mm7={27146,0x4000>>1}x2 \
+   mm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
+ "psrad $16,%%mm4\n\t" \
+ "mov $0x20006A0A,%[a]\n\t" \
+ "pcmpeqw %%mm1,%%mm0\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "psrad $16,%%mm5\n\t" \
+ "psubw %%mm3,%%mm0\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ "paddw %%mm1,%%mm0\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "paddw %%mm4,%%mm0\n\t" \
+ /*mm6={0x00000E3D}x2 \
+   mm1=-(t0''==0), mm5:mm4=t0''*27146+0x4000*/ \
+ "movq %%mm2,%%mm4\n\t" \
+ "movq %%mm2,%%mm5\n\t" \
+ "punpcklwd %%mm6,%%mm4\n\t" \
+ "mov $0x0E3D,%[a]\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "punpckhwd %%mm6,%%mm5\n\t" \
+ "movd %[a],%%mm6\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "pxor %%mm1,%%mm1\n\t" \
+ "punpckldq %%mm6,%%mm6\n\t" \
+ "pcmpeqw %%mm2,%%mm1\n\t" \
+ /*mm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
+ "psrad $16,%%mm4\n\t" \
+ "psubw %%mm3,%%mm1\n\t" \
+ "psrad $16,%%mm5\n\t" \
+ "paddw %%mm1,%%mm2\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ "movq "_r5"(%[y]),%%mm1\n\t" \
+ "paddw %%mm2,%%mm4\n\t" \
+ /*mm2=t6''', mm0=_y[0]=u=r+s>>1 \
+   The naive implementation could cause overflow, so we use \
+    u=(r&s)+((r^s)>>1).*/ \
+ "movq "_r3"(%[y]),%%mm2\n\t" \
+ "movq %%mm0,%%mm7\n\t" \
+ "pxor %%mm4,%%mm0\n\t" \
+ "pand %%mm4,%%mm7\n\t" \
+ "psraw $1,%%mm0\n\t" \
+ "mov $0x7FFF54DC,%[a]\n\t" \
+ "paddw %%mm7,%%mm0\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ /*mm7={54491-0x7FFF,0x7FFF}x2 \
+   mm4=_y[4]=v=r-u*/ \
+ "psubw %%mm0,%%mm4\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "movq %%mm4,"_r4"(%[y])\n\t" \
+ /*mm0=0, mm7={36410}x4 \
+   mm1=(t5'''!=0), mm5:mm4=54491*t5'''+0x0E3D*/ \
+ "movq %%mm1,%%mm4\n\t" \
+ "movq %%mm1,%%mm5\n\t" \
+ "punpcklwd %%mm1,%%mm4\n\t" \
+ "mov $0x8E3A8E3A,%[a]\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "movq %%mm0,"_r0"(%[y])\n\t" \
+ "punpckhwd %%mm1,%%mm5\n\t" \
+ "pxor %%mm0,%%mm0\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "pcmpeqw %%mm0,%%mm1\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "psubw %%mm3,%%mm1\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "paddd %%mm6,%%mm4\n\t" \
+ "paddd %%mm6,%%mm5\n\t" \
+ /*mm0=0 \
+   mm3:mm1=36410*t6'''+((t5'''!=0)<<16)*/ \
+ "movq %%mm2,%%mm6\n\t" \
+ "movq %%mm2,%%mm3\n\t" \
+ "pmulhw %%mm7,%%mm6\n\t" \
+ "paddw %%mm2,%%mm1\n\t" \
+ "pmullw %%mm7,%%mm3\n\t" \
+ "pxor %%mm0,%%mm0\n\t" \
+ "paddw %%mm1,%%mm6\n\t" \
+ "movq %%mm3,%%mm1\n\t" \
+ "punpckhwd %%mm6,%%mm3\n\t" \
+ "punpcklwd %%mm6,%%mm1\n\t" \
+ /*mm3={-1}x4, mm6={1}x4 \
+   mm4=_y[5]=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
+ "paddd %%mm3,%%mm5\n\t" \
+ "paddd %%mm1,%%mm4\n\t" \
+ "psrad $16,%%mm5\n\t" \
+ "pxor %%mm6,%%mm6\n\t" \
+ "psrad $16,%%mm4\n\t" \
+ "pcmpeqb %%mm3,%%mm3\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ "psubw %%mm3,%%mm6\n\t" \
+ /*mm1=t7'', mm7={26568,0x3400}x2 \
+   mm2=s=t6'''-(36410*u>>16)*/ \
+ "movq %%mm4,%%mm1\n\t" \
+ "mov $0x340067C8,%[a]\n\t" \
+ "pmulhw %%mm7,%%mm4\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "movq %%mm1,"_r5"(%[y])\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "paddw %%mm1,%%mm4\n\t" \
+ "movq "_r7"(%[y]),%%mm1\n\t" \
+ "psubw %%mm4,%%mm2\n\t" \
+ /*mm6={0x00007B1B}x2 \
+   mm0=(s!=0), mm5:mm4=s*26568+0x3400*/ \
+ "movq %%mm2,%%mm4\n\t" \
+ "movq %%mm2,%%mm5\n\t" \
+ "punpcklwd %%mm6,%%mm4\n\t" \
+ "pcmpeqw %%mm2,%%mm0\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "mov $0x7B1B,%[a]\n\t" \
+ "punpckhwd %%mm6,%%mm5\n\t" \
+ "movd %[a],%%mm6\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "psubw %%mm3,%%mm0\n\t" \
+ "punpckldq %%mm6,%%mm6\n\t" \
+ /*mm7={64277-0x7FFF,0x7FFF}x2 \
+   mm2=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
+ "psrad $17,%%mm4\n\t" \
+ "paddw %%mm0,%%mm2\n\t" \
+ "psrad $17,%%mm5\n\t" \
+ "mov $0x7FFF7B16,%[a]\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "paddw %%mm4,%%mm2\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ /*mm0=0, mm7={12785}x4 \
+   mm1=(t7''!=0), mm2=t4'', mm5:mm4=64277*t7''+0x7B1B*/ \
+ "movq %%mm1,%%mm4\n\t" \
+ "movq %%mm1,%%mm5\n\t" \
+ "movq %%mm2,"_r3"(%[y])\n\t" \
+ "punpcklwd %%mm1,%%mm4\n\t" \
+ "movq "_r1"(%[y]),%%mm2\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "mov $0x31F131F1,%[a]\n\t" \
+ "punpckhwd %%mm1,%%mm5\n\t" \
+ "pxor %%mm0,%%mm0\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "pcmpeqw %%mm0,%%mm1\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "psubw %%mm3,%%mm1\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "paddd %%mm6,%%mm4\n\t" \
+ "paddd %%mm6,%%mm5\n\t" \
+ /*mm3:mm1=12785*t4''+((t7''!=0)<<16)*/ \
+ "movq %%mm2,%%mm6\n\t" \
+ "movq %%mm2,%%mm3\n\t" \
+ "pmulhw %%mm7,%%mm6\n\t" \
+ "pmullw %%mm7,%%mm3\n\t" \
+ "paddw %%mm1,%%mm6\n\t" \
+ "movq %%mm3,%%mm1\n\t" \
+ "punpckhwd %%mm6,%%mm3\n\t" \
+ "punpcklwd %%mm6,%%mm1\n\t" \
+ /*mm3={-1}x4, mm6={1}x4 \
+   mm4=_y[1]=u=(12785*t4''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
+ "paddd %%mm3,%%mm5\n\t" \
+ "paddd %%mm1,%%mm4\n\t" \
+ "psrad $16,%%mm5\n\t" \
+ "pxor %%mm6,%%mm6\n\t" \
+ "psrad $16,%%mm4\n\t" \
+ "pcmpeqb %%mm3,%%mm3\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ "psubw %%mm3,%%mm6\n\t" \
+ /*mm1=t3'', mm7={20539,0x3000}x2 \
+   mm4=s=(12785*u>>16)-t4''*/ \
+ "movq %%mm4,"_r1"(%[y])\n\t" \
+ "pmulhw %%mm7,%%mm4\n\t" \
+ "mov $0x3000503B,%[a]\n\t" \
+ "movq "_r6"(%[y]),%%mm1\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "psubw %%mm2,%%mm4\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ /*mm6={0x00006CB7}x2 \
+   mm0=(s!=0), mm5:mm4=s*20539+0x3000*/ \
+ "movq %%mm4,%%mm5\n\t" \
+ "movq %%mm4,%%mm2\n\t" \
+ "punpcklwd %%mm6,%%mm4\n\t" \
+ "pcmpeqw %%mm2,%%mm0\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "mov $0x6CB7,%[a]\n\t" \
+ "punpckhwd %%mm6,%%mm5\n\t" \
+ "movd %[a],%%mm6\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "psubw %%mm3,%%mm0\n\t" \
+ "punpckldq %%mm6,%%mm6\n\t" \
+ /*mm7={60547-0x7FFF,0x7FFF}x2 \
+   mm2=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
+ "psrad $20,%%mm4\n\t" \
+ "paddw %%mm0,%%mm2\n\t" \
+ "psrad $20,%%mm5\n\t" \
+ "mov $0x7FFF6C84,%[a]\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "paddw %%mm4,%%mm2\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ /*mm0=0, mm7={25080}x4 \
+   mm2=t2'', mm5:mm4=60547*t3''+0x6CB7*/ \
+ "movq %%mm1,%%mm4\n\t" \
+ "movq %%mm1,%%mm5\n\t" \
+ "movq %%mm2,"_r7"(%[y])\n\t" \
+ "punpcklwd %%mm1,%%mm4\n\t" \
+ "movq "_r2"(%[y]),%%mm2\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "mov $0x61F861F8,%[a]\n\t" \
+ "punpckhwd %%mm1,%%mm5\n\t" \
+ "pxor %%mm0,%%mm0\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "pcmpeqw %%mm0,%%mm1\n\t" \
+ "psubw %%mm3,%%mm1\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "paddd %%mm6,%%mm4\n\t" \
+ "paddd %%mm6,%%mm5\n\t" \
+ /*mm3:mm1=25080*t2''+((t3''!=0)<<16)*/ \
+ "movq %%mm2,%%mm6\n\t" \
+ "movq %%mm2,%%mm3\n\t" \
+ "pmulhw %%mm7,%%mm6\n\t" \
+ "pmullw %%mm7,%%mm3\n\t" \
+ "paddw %%mm1,%%mm6\n\t" \
+ "movq %%mm3,%%mm1\n\t" \
+ "punpckhwd %%mm6,%%mm3\n\t" \
+ "punpcklwd %%mm6,%%mm1\n\t" \
+ /*mm1={-1}x4 \
+   mm4=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
+ "paddd %%mm3,%%mm5\n\t" \
+ "paddd %%mm1,%%mm4\n\t" \
+ "psrad $16,%%mm5\n\t" \
+ "mov $0x28005460,%[a]\n\t" \
+ "psrad $16,%%mm4\n\t" \
+ "pcmpeqb %%mm1,%%mm1\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ /*mm5={1}x4, mm6=_y[2]=u, mm7={21600,0x2800}x2 \
+   mm4=s=(25080*u>>16)-t2''*/ \
+ "movq %%mm4,%%mm6\n\t" \
+ "pmulhw %%mm7,%%mm4\n\t" \
+ "pxor %%mm5,%%mm5\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "psubw %%mm1,%%mm5\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "psubw %%mm2,%%mm4\n\t" \
+ /*mm2=s+(s!=0) \
+   mm4:mm3=s*21600+0x2800*/ \
+ "movq %%mm4,%%mm3\n\t" \
+ "movq %%mm4,%%mm2\n\t" \
+ "punpckhwd %%mm5,%%mm4\n\t" \
+ "pcmpeqw %%mm2,%%mm0\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "psubw %%mm1,%%mm0\n\t" \
+ "punpcklwd %%mm5,%%mm3\n\t" \
+ "paddw %%mm0,%%mm2\n\t" \
+ "pmaddwd %%mm7,%%mm3\n\t" \
+ /*mm0=_y[4], mm1=_y[7], mm4=_y[0], mm5=_y[5] \
+   mm3=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
+ "movq "_r4"(%[y]),%%mm0\n\t" \
+ "psrad $18,%%mm4\n\t" \
+ "movq "_r5"(%[y]),%%mm5\n\t" \
+ "psrad $18,%%mm3\n\t" \
+ "movq "_r7"(%[y]),%%mm1\n\t" \
+ "packssdw %%mm4,%%mm3\n\t" \
+ "movq "_r0"(%[y]),%%mm4\n\t" \
+ "paddw %%mm2,%%mm3\n\t" \
+
+/*On input, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7];
+   _y[1] and _y[3] are read from memory.  On output, {_y[4],mm1,mm2,mm3} is the
+   transpose of _y[4...7] and {mm4,mm5,mm6,mm7} is that of _y[0...3].*/
+# define OC_TRANSPOSE8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) \
+ "#OC_TRANSPOSE8x4\n\t" \
+ /*First 4x4 transpose:*/ \
+ /*mm0 = e3 e2 e1 e0 \
+   mm5 = f3 f2 f1 f0 \
+   mm3 = g3 g2 g1 g0 \
+   mm1 = h3 h2 h1 h0*/ \
+ "movq %%mm0,%%mm2\n\t" \
+ "punpcklwd %%mm5,%%mm0\n\t" \
+ "punpckhwd %%mm5,%%mm2\n\t" \
+ "movq %%mm3,%%mm5\n\t" \
+ "punpcklwd %%mm1,%%mm3\n\t" \
+ "punpckhwd %%mm1,%%mm5\n\t" \
+ /*mm0 = f1 e1 f0 e0 \
+   mm2 = f3 e3 f2 e2 \
+   mm3 = h1 g1 h0 g0 \
+   mm5 = h3 g3 h2 g2*/ \
+ "movq %%mm0,%%mm1\n\t" \
+ "punpckldq %%mm3,%%mm0\n\t" \
+ "movq %%mm0,"_r4"(%[y])\n\t" \
+ "punpckhdq %%mm3,%%mm1\n\t" \
+ "movq "_r1"(%[y]),%%mm0\n\t" \
+ "movq %%mm2,%%mm3\n\t" \
+ "punpckldq %%mm5,%%mm2\n\t" \
+ "punpckhdq %%mm5,%%mm3\n\t" \
+ "movq "_r3"(%[y]),%%mm5\n\t" \
+ /*_y[4] = h0 g0 f0 e0 \
+    mm1  = h1 g1 f1 e1 \
+    mm2  = h2 g2 f2 e2 \
+    mm3  = h3 g3 f3 e3*/ \
+ /*Second 4x4 transpose (b and d were just loaded from _r1 and _r3):*/ \
+ /*mm4 = a3 a2 a1 a0 \
+   mm0 = b3 b2 b1 b0 \
+   mm6 = c3 c2 c1 c0 \
+   mm5 = d3 d2 d1 d0*/ \
+ "movq %%mm4,%%mm7\n\t" \
+ "punpcklwd %%mm0,%%mm4\n\t" \
+ "punpckhwd %%mm0,%%mm7\n\t" \
+ "movq %%mm6,%%mm0\n\t" \
+ "punpcklwd %%mm5,%%mm6\n\t" \
+ "punpckhwd %%mm5,%%mm0\n\t" \
+ /*mm4 = b1 a1 b0 a0 \
+   mm7 = b3 a3 b2 a2 \
+   mm6 = d1 c1 d0 c0 \
+   mm0 = d3 c3 d2 c2*/ \
+ "movq %%mm4,%%mm5\n\t" \
+ "punpckldq %%mm6,%%mm4\n\t" \
+ "punpckhdq %%mm6,%%mm5\n\t" \
+ "movq %%mm7,%%mm6\n\t" \
+ "punpckhdq %%mm0,%%mm7\n\t" \
+ "punpckldq %%mm0,%%mm6\n\t" \
+ /*mm4 = d0 c0 b0 a0 \
+   mm5 = d1 c1 b1 a1 \
+   mm6 = d2 c2 b2 a2 \
+   mm7 = d3 c3 b3 a3*/ \
+
+/*MMX implementation of the 8x8 fDCT of _x into _y (_y doubles as scratch).*/
+void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+  ptrdiff_t a;
+  __asm__ __volatile__(
+    /*Add two extra bits of working precision to improve accuracy; any more and
+       we could overflow.*/
+    /*We also add biases to correct for some systematic error that remains in
+       the full fDCT->iDCT round trip.*/
+    "movq 0x00(%[x]),%%mm0\n\t"
+    "movq 0x10(%[x]),%%mm1\n\t"
+    "movq 0x20(%[x]),%%mm2\n\t"
+    "movq 0x30(%[x]),%%mm3\n\t"
+    "pcmpeqb %%mm4,%%mm4\n\t"
+    "pxor %%mm7,%%mm7\n\t"
+    "movq %%mm0,%%mm5\n\t"
+    "psllw $2,%%mm0\n\t"
+    "pcmpeqw %%mm7,%%mm5\n\t"
+    "movq 0x70(%[x]),%%mm7\n\t"
+    "psllw $2,%%mm1\n\t"
+    "psubw %%mm4,%%mm5\n\t"
+    "psllw $2,%%mm2\n\t"
+    "mov $1,%[a]\n\t"
+    "pslld $16,%%mm5\n\t"
+    "movd %[a],%%mm6\n\t"
+    "psllq $16,%%mm5\n\t"
+    "mov $0x10001,%[a]\n\t"
+    "psllw $2,%%mm3\n\t"
+    "movd %[a],%%mm4\n\t"
+    "punpckhwd %%mm6,%%mm5\n\t"
+    "psubw %%mm6,%%mm1\n\t"
+    "movq 0x60(%[x]),%%mm6\n\t"
+    "paddw %%mm5,%%mm0\n\t"
+    "movq 0x50(%[x]),%%mm5\n\t"
+    "paddw %%mm4,%%mm0\n\t"
+    "movq 0x40(%[x]),%%mm4\n\t"
+    /*We inline stage1 of the transform here so we can get better instruction
+       scheduling with the shifts.*/
+    /*mm0=t7'=t0-t7*/
+    "psllw $2,%%mm7\n\t"
+    "psubw %%mm7,%%mm0\n\t"
+    "psllw $2,%%mm6\n\t"
+    "paddw %%mm7,%%mm7\n\t"
+    /*mm1=t6'=t1-t6*/
+    "psllw $2,%%mm5\n\t"
+    "psubw %%mm6,%%mm1\n\t"
+    "psllw $2,%%mm4\n\t"
+    "paddw %%mm6,%%mm6\n\t"
+    /*mm2=t5'=t2-t5*/
+    "psubw %%mm5,%%mm2\n\t"
+    "paddw %%mm5,%%mm5\n\t"
+    /*mm3=t4'=t3-t4*/
+    "psubw %%mm4,%%mm3\n\t"
+    "paddw %%mm4,%%mm4\n\t"
+    /*mm7=t0'=t0+t7*/
+    "paddw %%mm0,%%mm7\n\t"
+    /*mm6=t1'=t1+t6*/
+    "paddw %%mm1,%%mm6\n\t"
+    /*mm5=t2'=t2+t5*/
+    "paddw %%mm2,%%mm5\n\t"
+    /*mm4=t3'=t3+t4*/
+    "paddw %%mm3,%%mm4\n\t"
+    OC_FDCT8x4("0x00","0x10","0x20","0x30","0x40","0x50","0x60","0x70")
+    OC_TRANSPOSE8x4("0x00","0x10","0x20","0x30","0x40","0x50","0x60","0x70")
+    /*Swap out this 8x4 block for the next one.*/
+    "movq 0x08(%[x]),%%mm0\n\t"
+    "movq %%mm7,0x30(%[y])\n\t"
+    "movq 0x78(%[x]),%%mm7\n\t"
+    "movq %%mm1,0x50(%[y])\n\t"
+    "movq 0x18(%[x]),%%mm1\n\t"
+    "movq %%mm6,0x20(%[y])\n\t"
+    "movq 0x68(%[x]),%%mm6\n\t"
+    "movq %%mm2,0x60(%[y])\n\t"
+    "movq 0x28(%[x]),%%mm2\n\t"
+    "movq %%mm5,0x10(%[y])\n\t"
+    "movq 0x58(%[x]),%%mm5\n\t"
+    "movq %%mm3,0x70(%[y])\n\t"
+    "movq 0x38(%[x]),%%mm3\n\t"
+    /*And increase its working precision, too.*/
+    "psllw $2,%%mm0\n\t"
+    "movq %%mm4,0x00(%[y])\n\t"
+    "psllw $2,%%mm7\n\t"
+    "movq 0x48(%[x]),%%mm4\n\t"
+    /*We inline stage1 of the transform here so we can get better instruction
+       scheduling with the shifts.*/
+    /*mm0=t7'=t0-t7*/
+    "psubw %%mm7,%%mm0\n\t"
+    "psllw $2,%%mm1\n\t"
+    "paddw %%mm7,%%mm7\n\t"
+    "psllw $2,%%mm6\n\t"
+    /*mm1=t6'=t1-t6*/
+    "psubw %%mm6,%%mm1\n\t"
+    "psllw $2,%%mm2\n\t"
+    "paddw %%mm6,%%mm6\n\t"
+    "psllw $2,%%mm5\n\t"
+    /*mm2=t5'=t2-t5*/
+    "psubw %%mm5,%%mm2\n\t"
+    "psllw $2,%%mm3\n\t"
+    "paddw %%mm5,%%mm5\n\t"
+    "psllw $2,%%mm4\n\t"
+    /*mm3=t4'=t3-t4*/
+    "psubw %%mm4,%%mm3\n\t"
+    "paddw %%mm4,%%mm4\n\t"
+    /*mm7=t0'=t0+t7*/
+    "paddw %%mm0,%%mm7\n\t"
+    /*mm6=t1'=t1+t6*/
+    "paddw %%mm1,%%mm6\n\t"
+    /*mm5=t2'=t2+t5*/
+    "paddw %%mm2,%%mm5\n\t"
+    /*mm4=t3'=t3+t4*/
+    "paddw %%mm3,%%mm4\n\t"
+    OC_FDCT8x4("0x08","0x18","0x28","0x38","0x48","0x58","0x68","0x78")
+    OC_TRANSPOSE8x4("0x08","0x18","0x28","0x38","0x48","0x58","0x68","0x78")
+    /*Here the first 4x4 block of output from the last transpose is the second
+       4x4 block of input for the next transform.
+      We have cleverly arranged that it already be in the appropriate place,
+       so we only have to do half the stores and loads.*/
+    "movq 0x00(%[y]),%%mm0\n\t"
+    "movq %%mm1,0x58(%[y])\n\t"
+    "movq 0x10(%[y]),%%mm1\n\t"
+    "movq %%mm2,0x68(%[y])\n\t"
+    "movq 0x20(%[y]),%%mm2\n\t"
+    "movq %%mm3,0x78(%[y])\n\t"
+    "movq 0x30(%[y]),%%mm3\n\t"
+    OC_FDCT_STAGE1_8x4
+    OC_FDCT8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
+    OC_TRANSPOSE8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
+    /*mm0={-2}x4*/
+    "pcmpeqw %%mm0,%%mm0\n\t"
+    "paddw %%mm0,%%mm0\n\t"
+    /*Round the results: add 2, then drop the two extra bits of precision.*/
+    "psubw %%mm0,%%mm1\n\t"
+    "psubw %%mm0,%%mm2\n\t"
+    "psraw $2,%%mm1\n\t"
+    "psubw %%mm0,%%mm3\n\t"
+    "movq %%mm1,0x18(%[y])\n\t"
+    "psraw $2,%%mm2\n\t"
+    "psubw %%mm0,%%mm4\n\t"
+    "movq 0x08(%[y]),%%mm1\n\t"
+    "psraw $2,%%mm3\n\t"
+    "psubw %%mm0,%%mm5\n\t"
+    "psraw $2,%%mm4\n\t"
+    "psubw %%mm0,%%mm6\n\t"
+    "psraw $2,%%mm5\n\t"
+    "psubw %%mm0,%%mm7\n\t"
+    "psraw $2,%%mm6\n\t"
+    "psubw %%mm0,%%mm1\n\t"
+    "psraw $2,%%mm7\n\t"
+    "movq 0x40(%[y]),%%mm0\n\t"
+    "psraw $2,%%mm1\n\t"
+    "movq %%mm7,0x30(%[y])\n\t"
+    "movq 0x78(%[y]),%%mm7\n\t"
+    "movq %%mm1,0x08(%[y])\n\t"
+    "movq 0x50(%[y]),%%mm1\n\t"
+    "movq %%mm6,0x20(%[y])\n\t"
+    "movq 0x68(%[y]),%%mm6\n\t"
+    "movq %%mm2,0x28(%[y])\n\t"
+    "movq 0x60(%[y]),%%mm2\n\t"
+    "movq %%mm5,0x10(%[y])\n\t"
+    "movq 0x58(%[y]),%%mm5\n\t"
+    "movq %%mm3,0x38(%[y])\n\t"
+    "movq 0x70(%[y]),%%mm3\n\t"
+    "movq %%mm4,0x00(%[y])\n\t"
+    "movq 0x48(%[y]),%%mm4\n\t"
+    OC_FDCT_STAGE1_8x4
+    OC_FDCT8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
+    OC_TRANSPOSE8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
+    /*mm0={-2}x4*/
+    "pcmpeqw %%mm0,%%mm0\n\t"
+    "paddw %%mm0,%%mm0\n\t"
+    /*Round the results: add 2, then drop the two extra bits of precision.*/
+    "psubw %%mm0,%%mm1\n\t"
+    "psubw %%mm0,%%mm2\n\t"
+    "psraw $2,%%mm1\n\t"
+    "psubw %%mm0,%%mm3\n\t"
+    "movq %%mm1,0x58(%[y])\n\t"
+    "psraw $2,%%mm2\n\t"
+    "psubw %%mm0,%%mm4\n\t"
+    "movq 0x48(%[y]),%%mm1\n\t"
+    "psraw $2,%%mm3\n\t"
+    "psubw %%mm0,%%mm5\n\t"
+    "movq %%mm2,0x68(%[y])\n\t"
+    "psraw $2,%%mm4\n\t"
+    "psubw %%mm0,%%mm6\n\t"
+    "movq %%mm3,0x78(%[y])\n\t"
+    "psraw $2,%%mm5\n\t"
+    "psubw %%mm0,%%mm7\n\t"
+    "movq %%mm4,0x40(%[y])\n\t"
+    "psraw $2,%%mm6\n\t"
+    "psubw %%mm0,%%mm1\n\t"
+    "movq %%mm5,0x50(%[y])\n\t"
+    "psraw $2,%%mm7\n\t"
+    "movq %%mm6,0x60(%[y])\n\t"
+    "psraw $2,%%mm1\n\t"
+    "movq %%mm7,0x70(%[y])\n\t"
+    "movq %%mm1,0x48(%[y])\n\t"
+    :[a]"=&r"(a)
+    :[y]"r"(_y),[x]"r"(_x)
+    :"memory"
+  );
+}
+
+#endif
diff --git a/lib/x86/mmxfrag.c b/lib/x86/mmxfrag.c
new file mode 100644
index 0000000..2c73293
--- /dev/null
+++ b/lib/x86/mmxfrag.c
@@ -0,0 +1,293 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: mmxfrag.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+/*MMX acceleration of fragment reconstruction for motion compensation.
+  Originally written by Rudolf Marek.
+  Additional optimization by Nils Pipenbrinck.
+  Note: Loops are unrolled for best performance.
+  The iteration each instruction belongs to is marked in the comments as #i.*/
+#include <stddef.h>
+#include "x86int.h"
+#include "mmxfrag.h"
+
+#if defined(OC_X86_ASM)
+
+/*Copies an 8x8 block of pixels from _src to _dst, where both buffers have
+   _ystride bytes between rows; out-of-line wrapper for OC_FRAG_COPY_MMX.*/
+void oc_frag_copy_mmx(unsigned char *_dst,
+ const unsigned char *_src,int _ystride){
+  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
+}
+
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue){
+  __asm__ __volatile__(
+    /*Stores clamp(_residue[i]+128,0,255) as an 8x8 block; mm0=all ones.*/
+    "pcmpeqw %%mm0,%%mm0\n\t"
+    /*#0 Load low residue.*/
+    "movq 0*8(%[residue]),%%mm1\n\t"
+    /*#0 Load high residue.*/
+    "movq 1*8(%[residue]),%%mm2\n\t"
+    /*Set mm0 to 0x8000800080008000.*/
+    "psllw $15,%%mm0\n\t"
+    /*#1 Load low residue.*/
+    "movq 2*8(%[residue]),%%mm3\n\t"
+    /*#1 Load high residue.*/
+    "movq 3*8(%[residue]),%%mm4\n\t"
+    /*Set mm0 to 0x0080008000800080 (a bias of 128 in each word).*/
+    "psrlw $8,%%mm0\n\t"
+    /*#2 Load low residue.*/
+    "movq 4*8(%[residue]),%%mm5\n\t"
+    /*#2 Load high residue.*/
+    "movq 5*8(%[residue]),%%mm6\n\t"
+    /*#0 Bias low  residue.*/
+    "paddsw %%mm0,%%mm1\n\t"
+    /*#0 Bias high residue.*/
+    "paddsw %%mm0,%%mm2\n\t"
+    /*#0 Pack to byte (unsigned saturation clamps to [0,255]).*/
+    "packuswb %%mm2,%%mm1\n\t"
+    /*#1 Bias low  residue.*/
+    "paddsw %%mm0,%%mm3\n\t"
+    /*#1 Bias high residue.*/
+    "paddsw %%mm0,%%mm4\n\t"
+    /*#1 Pack to byte.*/
+    "packuswb %%mm4,%%mm3\n\t"
+    /*#2 Bias low  residue.*/
+    "paddsw %%mm0,%%mm5\n\t"
+    /*#2 Bias high residue.*/
+    "paddsw %%mm0,%%mm6\n\t"
+    /*#2 Pack to byte.*/
+    "packuswb %%mm6,%%mm5\n\t"
+    /*#0 Write row.*/
+    "movq %%mm1,(%[dst])\n\t"
+    /*#1 Write row.*/
+    "movq %%mm3,(%[dst],%[ystride])\n\t"
+    /*#2 Write row.*/
+    "movq %%mm5,(%[dst],%[ystride],2)\n\t"
+    /*#3 Load low residue.*/
+    "movq 6*8(%[residue]),%%mm1\n\t"
+    /*#3 Load high residue.*/
+    "movq 7*8(%[residue]),%%mm2\n\t"
+    /*#4 Load low residue.*/
+    "movq 8*8(%[residue]),%%mm3\n\t"
+    /*#4 Load high residue.*/
+    "movq 9*8(%[residue]),%%mm4\n\t"
+    /*#5 Load low residue.*/
+    "movq 10*8(%[residue]),%%mm5\n\t"
+    /*#5 Load high residue.*/
+    "movq 11*8(%[residue]),%%mm6\n\t"
+    /*#3 Bias low  residue.*/
+    "paddsw %%mm0,%%mm1\n\t"
+    /*#3 Bias high residue.*/
+    "paddsw %%mm0,%%mm2\n\t"
+    /*#3 Pack to byte.*/
+    "packuswb %%mm2,%%mm1\n\t"
+    /*#4 Bias low  residue.*/
+    "paddsw %%mm0,%%mm3\n\t"
+    /*#4 Bias high residue.*/
+    "paddsw %%mm0,%%mm4\n\t"
+    /*#4 Pack to byte.*/
+    "packuswb %%mm4,%%mm3\n\t"
+    /*#5 Bias low  residue.*/
+    "paddsw %%mm0,%%mm5\n\t"
+    /*#5 Bias high residue.*/
+    "paddsw %%mm0,%%mm6\n\t"
+    /*#5 Pack to byte.*/
+    "packuswb %%mm6,%%mm5\n\t"
+    /*#3 Write row.*/
+    "movq %%mm1,(%[dst],%[ystride3])\n\t"
+    /*#4 Write row.*/
+    "movq %%mm3,(%[dst4])\n\t"
+    /*#5 Write row.*/
+    "movq %%mm5,(%[dst4],%[ystride])\n\t"
+    /*#6 Load low residue.*/
+    "movq 12*8(%[residue]),%%mm1\n\t"
+    /*#6 Load high residue.*/
+    "movq 13*8(%[residue]),%%mm2\n\t"
+    /*#7 Load low residue.*/
+    "movq 14*8(%[residue]),%%mm3\n\t"
+    /*#7 Load high residue.*/
+    "movq 15*8(%[residue]),%%mm4\n\t"
+    /*#6 Bias low  residue.*/
+    "paddsw %%mm0,%%mm1\n\t"
+    /*#6 Bias high residue.*/
+    "paddsw %%mm0,%%mm2\n\t"
+    /*#6 Pack to byte.*/
+    "packuswb %%mm2,%%mm1\n\t"
+    /*#7 Bias low  residue.*/
+    "paddsw %%mm0,%%mm3\n\t"
+    /*#7 Bias high residue.*/
+    "paddsw %%mm0,%%mm4\n\t"
+    /*#7 Pack to byte.*/
+    "packuswb %%mm4,%%mm3\n\t"
+    /*#6 Write row.*/
+    "movq %%mm1,(%[dst4],%[ystride],2)\n\t"
+    /*#7 Write row.*/
+    "movq %%mm3,(%[dst4],%[ystride3])\n\t"
+    :
+    :[residue]"r"(_residue),
+     [dst]"r"(_dst),
+     [dst4]"r"(_dst+(ptrdiff_t)_ystride*4),/*Multiply: <<2 is UB if negative.*/
+     [ystride]"r"((ptrdiff_t)_ystride),
+     [ystride3]"r"((ptrdiff_t)_ystride*3)
+    :"memory"
+  );
+}
+
+void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue){
+  int i;
+  /*dst=clamp(src+residue,0,255), two rows per pass; mm0=0 for unpacking.*/
+  __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
+  for(i=4;i-->0;){
+    __asm__ __volatile__(
+      /*#0 Load source.*/
+      "movq (%[src]),%%mm3\n\t"
+      /*#1 Load source.*/
+      "movq (%[src],%[ystride]),%%mm7\n\t"
+      /*#0 Get copy of src.*/
+      "movq %%mm3,%%mm4\n\t"
+      /*#0 Expand high source.*/
+      "punpckhbw %%mm0,%%mm4\n\t"
+      /*#0 Expand low  source.*/
+      "punpcklbw %%mm0,%%mm3\n\t"
+      /*#0 Add residue high.*/
+      "paddsw 8(%[residue]),%%mm4\n\t"
+      /*#1 Get copy of src.*/
+      "movq %%mm7,%%mm2\n\t"
+      /*#0 Add residue low.*/
+      "paddsw (%[residue]), %%mm3\n\t"
+      /*#1 Expand high source.*/
+      "punpckhbw %%mm0,%%mm2\n\t"
+      /*#0 Pack final row pixels.*/
+      "packuswb %%mm4,%%mm3\n\t"
+      /*#1 Expand low  source.*/
+      "punpcklbw %%mm0,%%mm7\n\t"
+      /*#1 Add residue low.*/
+      "paddsw 16(%[residue]),%%mm7\n\t"
+      /*#1 Add residue high.*/
+      "paddsw 24(%[residue]),%%mm2\n\t"
+      /*Advance residue by the two rows (32 bytes) just consumed.*/
+      "lea 32(%[residue]),%[residue]\n\t"
+      /*#1 Pack final row pixels.*/
+      "packuswb %%mm2,%%mm7\n\t"
+      /*Advance src by two rows.*/
+      "lea (%[src],%[ystride],2),%[src]\n\t"
+      /*#0 Write row.*/
+      "movq %%mm3,(%[dst])\n\t"
+      /*#1 Write row.*/
+      "movq %%mm7,(%[dst],%[ystride])\n\t"
+      /*Advance dst by two rows.*/
+      "lea (%[dst],%[ystride],2),%[dst]\n\t"
+      :[residue]"+r"(_residue),[dst]"+r"(_dst),[src]"+r"(_src)
+      :[ystride]"r"((ptrdiff_t)_ystride)
+      :"memory"
+    );
+  }
+}
+
+void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
+  int i;
+  /*dst=clamp(((src1+src2)>>1)+residue,0,255), 2 rows per pass; mm7=0.*/
+  __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::);
+  for(i=4;i-->0;){
+    __asm__ __volatile__(
+      /*#0 Load src1.*/
+      "movq (%[src1]),%%mm0\n\t"
+      /*#0 Load src2.*/
+      "movq (%[src2]),%%mm2\n\t"
+      /*#0 Copy src1.*/
+      "movq %%mm0,%%mm1\n\t"
+      /*#0 Copy src2.*/
+      "movq %%mm2,%%mm3\n\t"
+      /*#1 Load src1.*/
+      "movq (%[src1],%[ystride]),%%mm4\n\t"
+      /*#0 Unpack lower src1.*/
+      "punpcklbw %%mm7,%%mm0\n\t"
+      /*#1 Load src2.*/
+      "movq (%[src2],%[ystride]),%%mm5\n\t"
+      /*#0 Unpack higher src1.*/
+      "punpckhbw %%mm7,%%mm1\n\t"
+      /*#0 Unpack lower src2.*/
+      "punpcklbw %%mm7,%%mm2\n\t"
+      /*#0 Unpack higher src2.*/
+      "punpckhbw %%mm7,%%mm3\n\t"
+      /*Advance src1 ptr.*/
+      "lea (%[src1],%[ystride],2),%[src1]\n\t"
+      /*Advance src2 ptr.*/
+      "lea (%[src2],%[ystride],2),%[src2]\n\t"
+      /*#0 Lower src1+src2.*/
+      "paddsw %%mm2,%%mm0\n\t"
+      /*#0 Higher src1+src2.*/
+      "paddsw %%mm3,%%mm1\n\t"
+      /*#1 Copy src1.*/
+      "movq %%mm4,%%mm2\n\t"
+      /*#0 Build lo average.*/
+      "psraw $1,%%mm0\n\t"
+      /*#1 Copy src2.*/
+      "movq %%mm5,%%mm3\n\t"
+      /*#1 Unpack lower src1.*/
+      "punpcklbw %%mm7,%%mm4\n\t"
+      /*#0 Build hi average.*/
+      "psraw $1,%%mm1\n\t"
+      /*#1 Unpack higher src1.*/
+      "punpckhbw %%mm7,%%mm2\n\t"
+      /*#0 low+=residue.*/
+      "paddsw (%[residue]),%%mm0\n\t"
+      /*#1 Unpack lower src2.*/
+      "punpcklbw %%mm7,%%mm5\n\t"
+      /*#0 high+=residue.*/
+      "paddsw 8(%[residue]),%%mm1\n\t"
+      /*#1 Unpack higher src2.*/
+      "punpckhbw %%mm7,%%mm3\n\t"
+      /*#1 Lower src1+src2.*/
+      "paddsw %%mm4,%%mm5\n\t"
+      /*#0 Pack and saturate.*/
+      "packuswb %%mm1,%%mm0\n\t"
+      /*#1 Higher src1+src2.*/
+      "paddsw %%mm2,%%mm3\n\t"
+      /*#0 Write row.*/
+      "movq %%mm0,(%[dst])\n\t"
+      /*#1 Build lo average.*/
+      "psraw $1,%%mm5\n\t"
+      /*#1 Build hi average.*/
+      "psraw $1,%%mm3\n\t"
+      /*#1 low+=residue.*/
+      "paddsw 16(%[residue]),%%mm5\n\t"
+      /*#1 high+=residue.*/
+      "paddsw 24(%[residue]),%%mm3\n\t"
+      /*#1 Pack and saturate.*/
+      "packuswb  %%mm3,%%mm5\n\t"
+      /*#1 Write row.*/
+      "movq %%mm5,(%[dst],%[ystride])\n\t"
+      /*Advance residue ptr.*/
+      "add $32,%[residue]\n\t"
+      /*Advance dest ptr.*/
+      "lea (%[dst],%[ystride],2),%[dst]\n\t"
+     :[dst]"+r"(_dst),[residue]"+r"(_residue),
+      [src1]"+r"(_src1),[src2]"+r"(_src2)
+     :[ystride]"r"((ptrdiff_t)_ystride)
+     :"memory"
+    );
+  }
+}
+
+void oc_restore_fpu_mmx(void){/*EMMS: mark all x87/MMX registers empty.*/
+  __asm__ __volatile__("emms\n\t");
+}
+#endif
diff --git a/lib/x86/mmxfrag.h b/lib/x86/mmxfrag.h
new file mode 100644
index 0000000..a398427
--- /dev/null
+++ b/lib/x86/mmxfrag.h
@@ -0,0 +1,64 @@
+#if !defined(_x86_mmxfrag_H)
+# define _x86_mmxfrag_H (1)
+# include <stddef.h>
+# include "x86int.h"
+
+#if defined(OC_X86_ASM)
+
+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+   between rows of both; each macro argument is evaluated exactly once.*/
+#define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
+  do{ \
+    const unsigned char *src; \
+    unsigned char       *dst; \
+    ptrdiff_t            ystride3; \
+    src=(_src); \
+    dst=(_dst); \
+    __asm__ __volatile__( \
+      /*src+0*ystride*/ \
+      "movq (%[src]),%%mm0\n\t" \
+      /*src+1*ystride*/ \
+      "movq (%[src],%[ystride]),%%mm1\n\t" \
+      /*ystride3=ystride*3*/ \
+      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
+      /*src+2*ystride*/ \
+      "movq (%[src],%[ystride],2),%%mm2\n\t" \
+      /*src+3*ystride*/ \
+      "movq (%[src],%[ystride3]),%%mm3\n\t" \
+      /*dst+0*ystride*/ \
+      "movq %%mm0,(%[dst])\n\t" \
+      /*dst+1*ystride*/ \
+      "movq %%mm1,(%[dst],%[ystride])\n\t" \
+      /*Advance src to the next 4 rows.*/ \
+      "lea (%[src],%[ystride],4),%[src]\n\t" \
+      /*dst+2*ystride*/ \
+      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
+      /*dst+3*ystride*/ \
+      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
+      /*Advance dst to the next 4 rows.*/ \
+      "lea (%[dst],%[ystride],4),%[dst]\n\t" \
+      /*src+0*ystride*/ \
+      "movq (%[src]),%%mm0\n\t" \
+      /*src+1*ystride*/ \
+      "movq (%[src],%[ystride]),%%mm1\n\t" \
+      /*src+2*ystride*/ \
+      "movq (%[src],%[ystride],2),%%mm2\n\t" \
+      /*src+3*ystride*/ \
+      "movq (%[src],%[ystride3]),%%mm3\n\t" \
+      /*dst+0*ystride*/ \
+      "movq %%mm0,(%[dst])\n\t" \
+      /*dst+1*ystride*/ \
+      "movq %%mm1,(%[dst],%[ystride])\n\t" \
+      /*dst+2*ystride*/ \
+      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
+      /*dst+3*ystride*/ \
+      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
+      :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
+      :[ystride]"r"((ptrdiff_t)(_ystride)) \
+      :"memory" \
+    ); \
+  } \
+  while(0)
+
+# endif
+#endif
diff --git a/lib/x86/mmxidct.c b/lib/x86/mmxidct.c
new file mode 100644
index 0000000..76424e6
--- /dev/null
+++ b/lib/x86/mmxidct.c
@@ -0,0 +1,564 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+/*MMX acceleration of Theora's iDCT.
+  Originally written by Rudolf Marek, based on code from On2's VP3.*/
+#include "x86int.h"
+#include "../dct.h"
+
+#if defined(OC_X86_ASM)
+
+/*These are byte offsets into the table of constants below.*/
+/*7 rows of cosines, in order: pi/16 * (1 ... 7), 8 bytes per row.*/
+#define OC_COSINE_OFFSET (0)
+/*A row of 8's, which follows the 7*8=56 bytes of cosines.*/
+#define OC_EIGHT_OFFSET  (56)
+
+
+
+/*Constants for the MMX routines: 8 rows, each one value repeated 4 times.*/
+static const ogg_uint16_t __attribute__((aligned(8),used))
+ OC_IDCT_CONSTS[(7+1)*4]={
+  (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
+  (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
+  (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
+  (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
+  (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
+  (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
+  (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
+  (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
+  (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
+  (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
+  (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
+  (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
+  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
+  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
+      8,    8,    8,    8
+};
+
+/*Converts the argument expression to a string literal, unevaluated.*/
+#define OC_M2STR(_s) #_s
+
+/*Shared first stage of OC_ROW_IDCT and OC_COLUMN_IDCT; 38 cycles.*/
+#define OC_IDCT_BEGIN \
+  "#OC_IDCT_BEGIN\n\t" \
+  "movq "OC_I(3)",%%mm2\n\t" \
+  "movq "OC_C(3)",%%mm6\n\t" \
+  "movq %%mm2,%%mm4\n\t" \
+  "movq "OC_J(5)",%%mm7\n\t" \
+  "pmulhw %%mm6,%%mm4\n\t" \
+  "movq "OC_C(5)",%%mm1\n\t" \
+  "pmulhw %%mm7,%%mm6\n\t" \
+  "movq %%mm1,%%mm5\n\t" \
+  "pmulhw %%mm2,%%mm1\n\t" \
+  "movq "OC_I(1)",%%mm3\n\t" \
+  "pmulhw %%mm7,%%mm5\n\t" \
+  "movq "OC_C(1)",%%mm0\n\t" \
+  "paddw %%mm2,%%mm4\n\t" \
+  "paddw %%mm7,%%mm6\n\t" \
+  "paddw %%mm1,%%mm2\n\t" \
+  "movq "OC_J(7)",%%mm1\n\t" \
+  "paddw %%mm5,%%mm7\n\t" \
+  "movq %%mm0,%%mm5\n\t" \
+  "pmulhw %%mm3,%%mm0\n\t" \
+  "paddw %%mm7,%%mm4\n\t" \
+  "pmulhw %%mm1,%%mm5\n\t" \
+  "movq "OC_C(7)",%%mm7\n\t" \
+  "psubw %%mm2,%%mm6\n\t" \
+  "paddw %%mm3,%%mm0\n\t" \
+  "pmulhw %%mm7,%%mm3\n\t" \
+  "movq "OC_I(2)",%%mm2\n\t" \
+  "pmulhw %%mm1,%%mm7\n\t" \
+  "paddw %%mm1,%%mm5\n\t" \
+  "movq %%mm2,%%mm1\n\t" \
+  "pmulhw "OC_C(2)",%%mm2\n\t" \
+  "psubw %%mm5,%%mm3\n\t" \
+  "movq "OC_J(6)",%%mm5\n\t" \
+  "paddw %%mm7,%%mm0\n\t" \
+  "movq %%mm5,%%mm7\n\t" \
+  "psubw %%mm4,%%mm0\n\t" \
+  "pmulhw "OC_C(2)",%%mm5\n\t" \
+  "paddw %%mm1,%%mm2\n\t" \
+  "pmulhw "OC_C(6)",%%mm1\n\t" \
+  "paddw %%mm4,%%mm4\n\t" \
+  "paddw %%mm0,%%mm4\n\t" \
+  "psubw %%mm6,%%mm3\n\t" \
+  "paddw %%mm7,%%mm5\n\t" \
+  "paddw %%mm6,%%mm6\n\t" \
+  "pmulhw "OC_C(6)",%%mm7\n\t" \
+  "paddw %%mm3,%%mm6\n\t" \
+  "movq %%mm4,"OC_I(1)"\n\t" /*Spill mm4 to I(1); reloaded into mm0 below.*/ \
+  "psubw %%mm5,%%mm1\n\t" \
+  "movq "OC_C(4)",%%mm4\n\t" \
+  "movq %%mm3,%%mm5\n\t" \
+  "pmulhw %%mm4,%%mm3\n\t" \
+  "paddw %%mm2,%%mm7\n\t" \
+  "movq %%mm6,"OC_I(2)"\n\t" /*Spill mm6 to I(2); users reload it as D'.*/ \
+  "movq %%mm0,%%mm2\n\t" \
+  "movq "OC_I(0)",%%mm6\n\t" \
+  "pmulhw %%mm4,%%mm0\n\t" \
+  "paddw %%mm3,%%mm5\n\t" \
+  "movq "OC_J(4)",%%mm3\n\t" \
+  "psubw %%mm1,%%mm5\n\t" \
+  "paddw %%mm0,%%mm2\n\t" \
+  "psubw %%mm3,%%mm6\n\t" \
+  "movq %%mm6,%%mm0\n\t" \
+  "pmulhw %%mm4,%%mm6\n\t" \
+  "paddw %%mm3,%%mm3\n\t" \
+  "paddw %%mm1,%%mm1\n\t" \
+  "paddw %%mm0,%%mm3\n\t" \
+  "paddw %%mm5,%%mm1\n\t" \
+  "pmulhw %%mm3,%%mm4\n\t" \
+  "paddw %%mm0,%%mm6\n\t" \
+  "psubw %%mm2,%%mm6\n\t" \
+  "paddw %%mm2,%%mm2\n\t" \
+  "movq "OC_I(1)",%%mm0\n\t" /*Reload the value spilled to I(1) above.*/ \
+  "paddw %%mm6,%%mm2\n\t" \
+  "paddw %%mm3,%%mm4\n\t" \
+  "psubw %%mm1,%%mm2\n\t" \
+  "#end OC_IDCT_BEGIN\n\t" \
+
+/*Row pass: results stay in r0..r7 except R1, saved to I(1). 38+8=46 cycles.*/
+#define OC_ROW_IDCT \
+  "#OC_ROW_IDCT\n" \
+  OC_IDCT_BEGIN \
+  /*r3=D'*/ \
+  "movq "OC_I(2)",%%mm3\n\t" \
+  /*r4=E'=E-G*/ \
+  "psubw %%mm7,%%mm4\n\t" \
+  /*r1=H'+H'*/ \
+  "paddw %%mm1,%%mm1\n\t" \
+  /*r7=G+G*/ \
+  "paddw %%mm7,%%mm7\n\t" \
+  /*r1=R1=A''+H'*/ \
+  "paddw %%mm2,%%mm1\n\t" \
+  /*r7=G'=E+G*/ \
+  "paddw %%mm4,%%mm7\n\t" \
+  /*r4=R4=E'-D'*/ \
+  "psubw %%mm3,%%mm4\n\t" \
+  "paddw %%mm3,%%mm3\n\t" \
+  /*r6=R6=F'-B''*/ \
+  "psubw %%mm5,%%mm6\n\t" \
+  "paddw %%mm5,%%mm5\n\t" \
+  /*r3=R3=E'+D'*/ \
+  "paddw %%mm4,%%mm3\n\t" \
+  /*r5=R5=F'+B''*/ \
+  "paddw %%mm6,%%mm5\n\t" \
+  /*r7=R7=G'-C'*/ \
+  "psubw %%mm0,%%mm7\n\t" \
+  "paddw %%mm0,%%mm0\n\t" \
+  /*Save R1.*/ \
+  "movq %%mm1,"OC_I(1)"\n\t" \
+  /*r0=R0=G'+C'*/ \
+  "paddw %%mm7,%%mm0\n\t" \
+  "#end OC_ROW_IDCT\n\t" \
+
+/*The following macro does two 4x4 transposes in place.
+  At entry, we assume:
+    r0 = a3 a2 a1 a0
+  I(1) = b3 b2 b1 b0
+    r2 = c3 c2 c1 c0
+    r3 = d3 d2 d1 d0
+
+    r4 = e3 e2 e1 e0
+    r5 = f3 f2 f1 f0
+    r6 = g3 g2 g1 g0
+    r7 = h3 h2 h1 h0
+
+  At exit, we have:
+  I(0) = d0 c0 b0 a0
+  I(1) = d1 c1 b1 a1
+  I(2) = d2 c2 b2 a2
+  I(3) = d3 c3 b3 a3
+
+  J(4) = h0 g0 f0 e0
+  J(5) = h1 g1 f1 e1
+  J(6) = h2 g2 f2 e2
+  J(7) = h3 g3 f3 e3
+
+  I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
+  J(4) J(5) J(6) J(7) is the transpose of r4  r5  r6 r7.
+
+  Since r1 is free at entry, we calculate the Js first, parking r0 in I(0).*/
+/*19 cycles.*/
+#define OC_TRANSPOSE \
+  "#OC_TRANSPOSE\n\t" \
+  "movq %%mm4,%%mm1\n\t" \
+  "punpcklwd %%mm5,%%mm4\n\t" \
+  "movq %%mm0,"OC_I(0)"\n\t" \
+  "punpckhwd %%mm5,%%mm1\n\t" \
+  "movq %%mm6,%%mm0\n\t" \
+  "punpcklwd %%mm7,%%mm6\n\t" \
+  "movq %%mm4,%%mm5\n\t" \
+  "punpckldq %%mm6,%%mm4\n\t" \
+  "punpckhdq %%mm6,%%mm5\n\t" \
+  "movq %%mm1,%%mm6\n\t" \
+  "movq %%mm4,"OC_J(4)"\n\t" \
+  "punpckhwd %%mm7,%%mm0\n\t" \
+  "movq %%mm5,"OC_J(5)"\n\t" \
+  "punpckhdq %%mm0,%%mm6\n\t" \
+  "movq "OC_I(0)",%%mm4\n\t" \
+  "punpckldq %%mm0,%%mm1\n\t" \
+  "movq "OC_I(1)",%%mm5\n\t" \
+  "movq %%mm4,%%mm0\n\t" \
+  "movq %%mm6,"OC_J(7)"\n\t" \
+  "punpcklwd %%mm5,%%mm0\n\t" \
+  "movq %%mm1,"OC_J(6)"\n\t" \
+  "punpckhwd %%mm5,%%mm4\n\t" \
+  "movq %%mm2,%%mm5\n\t" \
+  "punpcklwd %%mm3,%%mm2\n\t" \
+  "movq %%mm0,%%mm1\n\t" \
+  "punpckldq %%mm2,%%mm0\n\t" \
+  "punpckhdq %%mm2,%%mm1\n\t" \
+  "movq %%mm4,%%mm2\n\t" \
+  "movq %%mm0,"OC_I(0)"\n\t" \
+  "punpckhwd %%mm3,%%mm5\n\t" \
+  "movq %%mm1,"OC_I(1)"\n\t" \
+  "punpckhdq %%mm5,%%mm4\n\t" \
+  "punpckldq %%mm5,%%mm2\n\t" \
+  "movq %%mm4,"OC_I(3)"\n\t" \
+  "movq %%mm2,"OC_I(2)"\n\t" \
+  "#end OC_TRANSPOSE\n\t" \
+
+/*Column pass: stores each result as (x+8)>>4 to I()/J(). 38+19=57 cycles.*/
+#define OC_COLUMN_IDCT \
+  "#OC_COLUMN_IDCT\n" \
+  OC_IDCT_BEGIN \
+  "paddw "OC_8",%%mm2\n\t" \
+  /*r1=H'+H'*/ \
+  "paddw %%mm1,%%mm1\n\t" \
+  /*r1=R1=A''+H'*/ \
+  "paddw %%mm2,%%mm1\n\t" \
+  /*r2=NR2*/ \
+  "psraw $4,%%mm2\n\t" \
+  /*r4=E'=E-G*/ \
+  "psubw %%mm7,%%mm4\n\t" \
+  /*r1=NR1*/ \
+  "psraw $4,%%mm1\n\t" \
+  /*r3=D'*/ \
+  "movq "OC_I(2)",%%mm3\n\t" \
+  /*r7=G+G*/ \
+  "paddw %%mm7,%%mm7\n\t" \
+  /*Store NR2 at I(2).*/ \
+  "movq %%mm2,"OC_I(2)"\n\t" \
+  /*r7=G'=E+G*/ \
+  "paddw %%mm4,%%mm7\n\t" \
+  /*Store NR1 at I(1).*/ \
+  "movq %%mm1,"OC_I(1)"\n\t" \
+  /*r4=R4=E'-D'*/ \
+  "psubw %%mm3,%%mm4\n\t" \
+  "paddw "OC_8",%%mm4\n\t" \
+  /*r3=D'+D'*/ \
+  "paddw %%mm3,%%mm3\n\t" \
+  /*r3=R3=E'+D'*/ \
+  "paddw %%mm4,%%mm3\n\t" \
+  /*r4=NR4*/ \
+  "psraw $4,%%mm4\n\t" \
+  /*r6=R6=F'-B''*/ \
+  "psubw %%mm5,%%mm6\n\t" \
+  /*r3=NR3*/ \
+  "psraw $4,%%mm3\n\t" \
+  "paddw "OC_8",%%mm6\n\t" \
+  /*r5=B''+B''*/ \
+  "paddw %%mm5,%%mm5\n\t" \
+  /*r5=R5=F'+B''*/ \
+  "paddw %%mm6,%%mm5\n\t" \
+  /*r6=NR6*/ \
+  "psraw $4,%%mm6\n\t" \
+  /*Store NR4 at J(4).*/ \
+  "movq %%mm4,"OC_J(4)"\n\t" \
+  /*r5=NR5*/ \
+  "psraw $4,%%mm5\n\t" \
+  /*Store NR3 at I(3).*/ \
+  "movq %%mm3,"OC_I(3)"\n\t" \
+  /*r7=R7=G'-C'*/ \
+  "psubw %%mm0,%%mm7\n\t" \
+  "paddw "OC_8",%%mm7\n\t" \
+  /*r0=C'+C'*/ \
+  "paddw %%mm0,%%mm0\n\t" \
+  /*r0=R0=G'+C'*/ \
+  "paddw %%mm7,%%mm0\n\t" \
+  /*r7=NR7*/ \
+  "psraw $4,%%mm7\n\t" \
+  /*Store NR6 at J(6).*/ \
+  "movq %%mm6,"OC_J(6)"\n\t" \
+  /*r0=NR0*/ \
+  "psraw $4,%%mm0\n\t" \
+  /*Store NR5 at J(5).*/ \
+  "movq %%mm5,"OC_J(5)"\n\t" \
+  /*Store NR7 at J(7).*/ \
+  "movq %%mm7,"OC_J(7)"\n\t" \
+  /*Store NR0 at I(0).*/ \
+  "movq %%mm0,"OC_I(0)"\n\t" \
+  "#end OC_COLUMN_IDCT\n\t" \
+
+#define OC_MID(_m,_i) OC_M2STR(_m+(_i)*8)"(%[c])" /*Row _i from byte offset _m.*/
+#define OC_C(_i)      OC_MID(OC_COSINE_OFFSET,_i-1) /*Cosine row _i (1..7).*/
+#define OC_8          OC_MID(OC_EIGHT_OFFSET,0) /*The row of 8's.*/
+
+static void oc_idct8x8_slow(ogg_int16_t _y[64]){
+  /*In-place 8x8 iDCT of _y, which arrives in partially transposed form
+     (every 4x4 block is transposed): 2 row passes, then 2 column passes.*/
+  __asm__ __volatile__(
+#define OC_I(_k)      OC_M2STR((_k*16))"(%[y])"
+#define OC_J(_k)      OC_M2STR(((_k-4)*16)+8)"(%[y])"
+    OC_ROW_IDCT
+    OC_TRANSPOSE
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k)      OC_M2STR((_k*16)+64)"(%[y])"
+#define OC_J(_k)      OC_M2STR(((_k-4)*16)+72)"(%[y])"
+    OC_ROW_IDCT
+    OC_TRANSPOSE
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k)      OC_M2STR((_k*16))"(%[y])"
+#define OC_J(_k)      OC_I(_k)
+    OC_COLUMN_IDCT
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k)      OC_M2STR((_k*16)+8)"(%[y])"
+#define OC_J(_k)      OC_I(_k)
+    OC_COLUMN_IDCT
+#undef  OC_I
+#undef  OC_J
+    :/*No outputs: _y is rewritten in place, hence the "memory" clobber.*/
+    :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS):"memory"
+  );
+}
+
+/*Variant of OC_IDCT_BEGIN that reads only I(0)..I(3), no J(); 25 cycles.*/
+#define OC_IDCT_BEGIN_10 \
+ "#OC_IDCT_BEGIN_10\n\t" \
+ "movq "OC_I(3)",%%mm2\n\t" \
+ "nop\n\t" \
+ "movq "OC_C(3)",%%mm6\n\t" \
+ "movq %%mm2,%%mm4\n\t" \
+ "movq "OC_C(5)",%%mm1\n\t" \
+ "pmulhw %%mm6,%%mm4\n\t" \
+ "movq "OC_I(1)",%%mm3\n\t" \
+ "pmulhw %%mm2,%%mm1\n\t" \
+ "movq "OC_C(1)",%%mm0\n\t" \
+ "paddw %%mm2,%%mm4\n\t" \
+ "pxor %%mm6,%%mm6\n\t" \
+ "paddw %%mm1,%%mm2\n\t" \
+ "movq "OC_I(2)",%%mm5\n\t" \
+ "pmulhw %%mm3,%%mm0\n\t" \
+ "movq %%mm5,%%mm1\n\t" \
+ "paddw %%mm3,%%mm0\n\t" \
+ "pmulhw "OC_C(7)",%%mm3\n\t" \
+ "psubw %%mm2,%%mm6\n\t" \
+ "pmulhw "OC_C(2)",%%mm5\n\t" \
+ "psubw %%mm4,%%mm0\n\t" \
+ "movq "OC_I(2)",%%mm7\n\t" \
+ "paddw %%mm4,%%mm4\n\t" \
+ "paddw %%mm5,%%mm7\n\t" \
+ "paddw %%mm0,%%mm4\n\t" \
+ "pmulhw "OC_C(6)",%%mm1\n\t" \
+ "psubw %%mm6,%%mm3\n\t" \
+ "movq %%mm4,"OC_I(1)"\n\t" \
+ "paddw %%mm6,%%mm6\n\t" \
+ "movq "OC_C(4)",%%mm4\n\t" \
+ "paddw %%mm3,%%mm6\n\t" \
+ "movq %%mm3,%%mm5\n\t" \
+ "pmulhw %%mm4,%%mm3\n\t" \
+ "movq %%mm6,"OC_I(2)"\n\t" \
+ "movq %%mm0,%%mm2\n\t" \
+ "movq "OC_I(0)",%%mm6\n\t" \
+ "pmulhw %%mm4,%%mm0\n\t" \
+ "paddw %%mm3,%%mm5\n\t" \
+ "paddw %%mm0,%%mm2\n\t" \
+ "psubw %%mm1,%%mm5\n\t" \
+ "pmulhw %%mm4,%%mm6\n\t" \
+ "paddw "OC_I(0)",%%mm6\n\t" \
+ "paddw %%mm1,%%mm1\n\t" \
+ "movq %%mm6,%%mm4\n\t" \
+ "paddw %%mm5,%%mm1\n\t" \
+ "psubw %%mm2,%%mm6\n\t" \
+ "paddw %%mm2,%%mm2\n\t" \
+ "movq "OC_I(1)",%%mm0\n\t" \
+ "paddw %%mm6,%%mm2\n\t" \
+ "psubw %%mm1,%%mm2\n\t" \
+ "nop\n\t" \
+ "#end OC_IDCT_BEGIN_10\n\t" \
+
+/*OC_ROW_IDCT's tail on top of OC_IDCT_BEGIN_10; 25+8=33 cycles.*/
+#define OC_ROW_IDCT_10 \
+ "#OC_ROW_IDCT_10\n\t" \
+ OC_IDCT_BEGIN_10 \
+ /*r3=D'*/ \
+ "movq "OC_I(2)",%%mm3\n\t" \
+ /*r4=E'=E-G*/ \
+ "psubw %%mm7,%%mm4\n\t" \
+ /*r1=H'+H'*/ \
+ "paddw %%mm1,%%mm1\n\t" \
+ /*r7=G+G*/ \
+ "paddw %%mm7,%%mm7\n\t" \
+ /*r1=R1=A''+H'*/ \
+ "paddw %%mm2,%%mm1\n\t" \
+ /*r7=G'=E+G*/ \
+ "paddw %%mm4,%%mm7\n\t" \
+ /*r4=R4=E'-D'*/ \
+ "psubw %%mm3,%%mm4\n\t" \
+ "paddw %%mm3,%%mm3\n\t" \
+ /*r6=R6=F'-B''*/ \
+ "psubw %%mm5,%%mm6\n\t" \
+ "paddw %%mm5,%%mm5\n\t" \
+ /*r3=R3=E'+D'*/ \
+ "paddw %%mm4,%%mm3\n\t" \
+ /*r5=R5=F'+B''*/ \
+ "paddw %%mm6,%%mm5\n\t" \
+ /*r7=R7=G'-C'*/ \
+ "psubw %%mm0,%%mm7\n\t" \
+ "paddw %%mm0,%%mm0\n\t" \
+ /*Save R1 to I(1); every other result stays in a register.*/ \
+ "movq %%mm1,"OC_I(1)"\n\t" \
+ /*r0=R0=G'+C'*/ \
+ "paddw %%mm7,%%mm0\n\t" \
+ "#end OC_ROW_IDCT_10\n\t" \
+
+/*OC_COLUMN_IDCT's tail on top of OC_IDCT_BEGIN_10; 25+19=44 cycles.*/
+#define OC_COLUMN_IDCT_10 \
+ "#OC_COLUMN_IDCT_10\n\t" \
+ OC_IDCT_BEGIN_10 \
+ "paddw "OC_8",%%mm2\n\t" \
+ /*r1=H'+H'*/ \
+ "paddw %%mm1,%%mm1\n\t" \
+ /*r1=R1=A''+H'*/ \
+ "paddw %%mm2,%%mm1\n\t" \
+ /*r2=NR2*/ \
+ "psraw $4,%%mm2\n\t" \
+ /*r4=E'=E-G*/ \
+ "psubw %%mm7,%%mm4\n\t" \
+ /*r1=NR1*/ \
+ "psraw $4,%%mm1\n\t" \
+ /*r3=D'*/ \
+ "movq "OC_I(2)",%%mm3\n\t" \
+ /*r7=G+G*/ \
+ "paddw %%mm7,%%mm7\n\t" \
+ /*Store NR2 at I(2).*/ \
+ "movq %%mm2,"OC_I(2)"\n\t" \
+ /*r7=G'=E+G*/ \
+ "paddw %%mm4,%%mm7\n\t" \
+ /*Store NR1 at I(1).*/ \
+ "movq %%mm1,"OC_I(1)"\n\t" \
+ /*r4=R4=E'-D'*/ \
+ "psubw %%mm3,%%mm4\n\t" \
+ "paddw "OC_8",%%mm4\n\t" \
+ /*r3=D'+D'*/ \
+ "paddw %%mm3,%%mm3\n\t" \
+ /*r3=R3=E'+D'*/ \
+ "paddw %%mm4,%%mm3\n\t" \
+ /*r4=NR4*/ \
+ "psraw $4,%%mm4\n\t" \
+ /*r6=R6=F'-B''*/ \
+ "psubw %%mm5,%%mm6\n\t" \
+ /*r3=NR3*/ \
+ "psraw $4,%%mm3\n\t" \
+ "paddw "OC_8",%%mm6\n\t" \
+ /*r5=B''+B''*/ \
+ "paddw %%mm5,%%mm5\n\t" \
+ /*r5=R5=F'+B''*/ \
+ "paddw %%mm6,%%mm5\n\t" \
+ /*r6=NR6*/ \
+ "psraw $4,%%mm6\n\t" \
+ /*Store NR4 at J(4).*/ \
+ "movq %%mm4,"OC_J(4)"\n\t" \
+ /*r5=NR5*/ \
+ "psraw $4,%%mm5\n\t" \
+ /*Store NR3 at I(3).*/ \
+ "movq %%mm3,"OC_I(3)"\n\t" \
+ /*r7=R7=G'-C'*/ \
+ "psubw %%mm0,%%mm7\n\t" \
+ "paddw "OC_8",%%mm7\n\t" \
+ /*r0=C'+C'*/ \
+ "paddw %%mm0,%%mm0\n\t" \
+ /*r0=R0=G'+C'*/ \
+ "paddw %%mm7,%%mm0\n\t" \
+ /*r7=NR7*/ \
+ "psraw $4,%%mm7\n\t" \
+ /*Store NR6 at J(6).*/ \
+ "movq %%mm6,"OC_J(6)"\n\t" \
+ /*r0=NR0*/ \
+ "psraw $4,%%mm0\n\t" \
+ /*Store NR5 at J(5).*/ \
+ "movq %%mm5,"OC_J(5)"\n\t" \
+ /*Store NR7 at J(7).*/ \
+ "movq %%mm7,"OC_J(7)"\n\t" \
+ /*Store NR0 at I(0).*/ \
+ "movq %%mm0,"OC_I(0)"\n\t" \
+ "#end OC_COLUMN_IDCT_10\n\t" \
+
+static void oc_idct8x8_10(ogg_int16_t _y[64]){
+  __asm__ __volatile__(
+#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
+#define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"(%[y])"
+    /*Reduced in-place iDCT: the only input read is the first 4 words of
+       rows 0..3 of _y.  One row pass, then the two column passes.*/
+    OC_ROW_IDCT_10
+    OC_TRANSPOSE
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
+#define OC_J(_k) OC_I(_k)
+    OC_COLUMN_IDCT_10
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k) OC_M2STR((_k*16)+8)"(%[y])"
+#define OC_J(_k) OC_I(_k)
+    OC_COLUMN_IDCT_10
+#undef  OC_I
+#undef  OC_J
+    :/*No outputs: _y is rewritten in place, hence the "memory" clobber.*/
+    :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS):"memory"
+  );
+}
+
+/*Performs an inverse 8x8 Type-II DCT transform, in place, on _y.
+  The input is assumed to be scaled by a factor of 4 relative to the
+   orthonormal version of the transform.*/
+void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
+  /*_last_zzi is subtly different from an actual count of the number of
+     coefficients we decoded for this block.
+    It contains the value of zzi BEFORE the final token in the block was
+     decoded.
+    In most cases this is an EOB token (the continuation of an EOB run from a
+     previous block counts), and so this is the same as the coefficient count.
+    However, in the case that the last token was NOT an EOB token, but filled
+     the block up with exactly 64 coefficients, _last_zzi will be less than 64.
+    Provided the last token was not a pure zero run, the minimum value it can
+     be is 46, and so that doesn't affect any of the cases in this routine.
+    However, if the last token WAS a pure zero run of length 63, then _last_zzi
+     will be 1 while the number of coefficients decoded is 64.
+    Thus, we will trigger the following special case, where the real
+     coefficient count would not.
+    Note also that a zero run of length 64 will give _last_zzi a value of 0,
+     but we still process the DC coefficient, which might have a non-zero value
+     due to DC prediction.
+    Although convoluted, this is arguably the correct behavior: it allows us to
+     use a smaller transform when the block ends with a long zero run instead
+     of a normal EOB token.
+    It could be smarter... multiple separate zero runs at the end of a block
+     will fool it, but an encoder that generates these really deserves what it
+     gets.
+    Needless to say we inherited this approach from VP3.*/
+  /*Use the full transform unless _last_zzi is below 10.*/
+  if(_last_zzi>=10)oc_idct8x8_slow(_y);
+  else oc_idct8x8_10(_y);
+}
+
+#endif
diff --git a/lib/x86/mmxloop.h b/lib/x86/mmxloop.h
new file mode 100644
index 0000000..2e870c7
--- /dev/null
+++ b/lib/x86/mmxloop.h
@@ -0,0 +1,215 @@
+#if !defined(_x86_mmxloop_H)
+# define _x86_mmxloop_H (1)
+# include <stddef.h>
+# include "x86int.h"
+
+#if defined(OC_X86_ASM)
+
+/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...,d7}.
+  On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
+   mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/
+#define OC_LOOP_FILTER8_MMX \
+ "#OC_LOOP_FILTER8_MMX\n\t" \
+ /*mm7=0*/ \
+ "pxor %%mm7,%%mm7\n\t" \
+ /*mm6:mm0={a0,...,a7}*/ \
+ "movq %%mm0,%%mm6\n\t" \
+ "punpcklbw %%mm7,%%mm0\n\t" \
+ "punpckhbw %%mm7,%%mm6\n\t" \
+ /*mm5:mm3={d0,...,d7}*/ \
+ "movq %%mm3,%%mm5\n\t" \
+ "punpcklbw %%mm7,%%mm3\n\t" \
+ "punpckhbw %%mm7,%%mm5\n\t" \
+ /*mm6:mm0={a0-d0,...,a7-d7}*/ \
+ "psubw %%mm3,%%mm0\n\t" \
+ "psubw %%mm5,%%mm6\n\t" \
+ /*mm3:mm1={b0,...,b7}*/ \
+ "movq %%mm1,%%mm3\n\t" \
+ "punpcklbw %%mm7,%%mm1\n\t" \
+ "movq %%mm2,%%mm4\n\t" \
+ "punpckhbw %%mm7,%%mm3\n\t" \
+ /*mm5:mm4={c0,...,c7}*/ \
+ "movq %%mm2,%%mm5\n\t" \
+ "punpcklbw %%mm7,%%mm4\n\t" \
+ "punpckhbw %%mm7,%%mm5\n\t" \
+ /*mm7={3}x4 \
+   mm5:mm4={c0-b0,...,c7-b7}*/ \
+ "pcmpeqw %%mm7,%%mm7\n\t" \
+ "psubw %%mm1,%%mm4\n\t" \
+ "psrlw $14,%%mm7\n\t" \
+ "psubw %%mm3,%%mm5\n\t" \
+ /*Scale by 3.*/ \
+ "pmullw %%mm7,%%mm4\n\t" \
+ "pmullw %%mm7,%%mm5\n\t" \
+ /*mm7={4}x4 \
+   mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
+ "psrlw $1,%%mm7\n\t" \
+ "paddw %%mm0,%%mm4\n\t" \
+ "psllw $2,%%mm7\n\t" \
+ "movq (%[ll]),%%mm0\n\t" \
+ "paddw %%mm6,%%mm5\n\t" \
+ /*R_i has the range [-127,128], so we compute -R_i instead. \
+   mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
+ "psubw %%mm7,%%mm4\n\t" \
+ "psubw %%mm7,%%mm5\n\t" \
+ "psraw $3,%%mm4\n\t" \
+ "psraw $3,%%mm5\n\t" \
+ "pcmpeqb %%mm7,%%mm7\n\t" \
+ "packsswb %%mm5,%%mm4\n\t" \
+ "pxor %%mm6,%%mm6\n\t" \
+ "pxor %%mm7,%%mm4\n\t" \
+ "packuswb %%mm3,%%mm1\n\t" \
+ /*Now compute lflim of -mm4 cf. Section 7.10 of the spec.*/ \
+ /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
+    we have to split things by sign (the other option is to work in 16 bits, \
+    but working in 8 bits gives much better parallelism). \
+   We compute abs(R_i), but save a mask of which terms were negative in mm6. \
+   Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
+   Finally, we split mm4 into positive and negative pieces using the mask in \
+    mm6, and add and subtract them as appropriate.*/ \
+ /*mm4=abs(-R_i)*/ \
+ /*mm7=255-2*L*/ \
+ "pcmpgtb %%mm4,%%mm6\n\t" \
+ "psubb %%mm0,%%mm7\n\t" \
+ "pxor %%mm6,%%mm4\n\t" \
+ "psubb %%mm0,%%mm7\n\t" \
+ "psubb %%mm6,%%mm4\n\t" \
+ /*mm7=255-max(2*L-abs(R_i),0)*/ \
+ "paddusb %%mm4,%%mm7\n\t" \
+ /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
+ "paddusb %%mm7,%%mm4\n\t" \
+ "psubusb %%mm7,%%mm4\n\t" \
+ /*Now split mm4 by the original sign of -R_i.*/ \
+ "movq %%mm4,%%mm5\n\t" \
+ "pand %%mm6,%%mm4\n\t" \
+ "pandn %%mm5,%%mm6\n\t" \
+ /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
+ /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
+ "paddusb %%mm4,%%mm1\n\t" \
+ "psubusb %%mm4,%%mm2\n\t" \
+ "psubusb %%mm6,%%mm1\n\t" \
+ "paddusb %%mm6,%%mm2\n\t" \
+
+#define OC_LOOP_FILTER_V_MMX(_pix,_ystride,_ll) \
+  do{ \
+    ptrdiff_t ystride3__; \
+    __asm__ __volatile__( \
+      /*mm0={a0,...,a7}*/ \
+      "movq (%[pix]),%%mm0\n\t" \
+      /*ystride3=_ystride*3*/ \
+      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
+      /*mm3={d0,...,d7}*/ \
+      "movq (%[pix],%[ystride3]),%%mm3\n\t" \
+      /*mm1={b0,...,b7}*/ \
+      "movq (%[pix],%[ystride]),%%mm1\n\t" \
+      /*mm2={c0,...,c7}*/ \
+      "movq (%[pix],%[ystride],2),%%mm2\n\t" \
+      OC_LOOP_FILTER8_MMX \
+      /*Write it back out.*/ \
+      "movq %%mm1,(%[pix],%[ystride])\n\t" \
+      "movq %%mm2,(%[pix],%[ystride],2)\n\t" \
+      :[ystride3]"=&r"(ystride3__) \
+      :[pix]"r"(_pix-_ystride*2),[ystride]"r"((ptrdiff_t)(_ystride)), \
+       [ll]"r"(_ll) \
+      :"memory" \
+    ); \
+  } \
+  while(0)
+
+#define OC_LOOP_FILTER_H_MMX(_pix,_ystride,_ll) \
+  do{ \
+    unsigned char *pix__; \
+    ptrdiff_t      ystride3__; \
+    ptrdiff_t      d__; \
+    pix__=(_pix)-2; \
+    __asm__ __volatile__( \
+      /*x x x x d0 c0 b0 a0*/ \
+      "movd (%[pix]),%%mm0\n\t" \
+      /*x x x x d1 c1 b1 a1*/ \
+      "movd (%[pix],%[ystride]),%%mm1\n\t" \
+      /*ystride3=_ystride*3*/ \
+      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
+      /*x x x x d2 c2 b2 a2*/ \
+      "movd (%[pix],%[ystride],2),%%mm2\n\t" \
+      /*x x x x d3 c3 b3 a3*/ \
+      "lea (%[pix],%[ystride],4),%[d]\n\t" \
+      "movd (%[pix],%[ystride3]),%%mm3\n\t" \
+      /*x x x x d4 c4 b4 a4*/ \
+      "movd (%[d]),%%mm4\n\t" \
+      /*x x x x d5 c5 b5 a5*/ \
+      "movd (%[d],%[ystride]),%%mm5\n\t" \
+      /*x x x x d6 c6 b6 a6*/ \
+      "movd (%[d],%[ystride],2),%%mm6\n\t" \
+      /*x x x x d7 c7 b7 a7*/ \
+      "movd (%[d],%[ystride3]),%%mm7\n\t" \
+      /*mm0=d1 d0 c1 c0 b1 b0 a1 a0*/ \
+      "punpcklbw %%mm1,%%mm0\n\t" \
+      /*mm2=d3 d2 c3 c2 b3 b2 a3 a2*/ \
+      "punpcklbw %%mm3,%%mm2\n\t" \
+      /*mm3=d1 d0 c1 c0 b1 b0 a1 a0*/ \
+      "movq %%mm0,%%mm3\n\t" \
+      /*mm0=b3 b2 b1 b0 a3 a2 a1 a0*/ \
+      "punpcklwd %%mm2,%%mm0\n\t" \
+      /*mm3=d3 d2 d1 d0 c3 c2 c1 c0*/ \
+      "punpckhwd %%mm2,%%mm3\n\t" \
+      /*mm1=b3 b2 b1 b0 a3 a2 a1 a0*/ \
+      "movq %%mm0,%%mm1\n\t" \
+      /*mm4=d5 d4 c5 c4 b5 b4 a5 a4*/ \
+      "punpcklbw %%mm5,%%mm4\n\t" \
+      /*mm6=d7 d6 c7 c6 b7 b6 a7 a6*/ \
+      "punpcklbw %%mm7,%%mm6\n\t" \
+      /*mm5=d5 d4 c5 c4 b5 b4 a5 a4*/ \
+      "movq %%mm4,%%mm5\n\t" \
+      /*mm4=b7 b6 b5 b4 a7 a6 a5 a4*/ \
+      "punpcklwd %%mm6,%%mm4\n\t" \
+      /*mm5=d7 d6 d5 d4 c7 c6 c5 c4*/ \
+      "punpckhwd %%mm6,%%mm5\n\t" \
+      /*mm2=d3 d2 d1 d0 c3 c2 c1 c0*/ \
+      "movq %%mm3,%%mm2\n\t" \
+      /*mm0=a7 a6 a5 a4 a3 a2 a1 a0*/ \
+      "punpckldq %%mm4,%%mm0\n\t" \
+      /*mm1=b7 b6 b5 b4 b3 b2 b1 b0*/ \
+      "punpckhdq %%mm4,%%mm1\n\t" \
+      /*mm2=c7 c6 c5 c4 c3 c2 c1 c0*/ \
+      "punpckldq %%mm5,%%mm2\n\t" \
+      /*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \
+      "punpckhdq %%mm5,%%mm3\n\t" \
+      OC_LOOP_FILTER8_MMX \
+      /*mm0={b0+R_0'',...,b7+R_7''}*/ \
+      "movq %%mm1,%%mm0\n\t" \
+      /*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \
+      "punpcklbw %%mm2,%%mm1\n\t" \
+      /*mm0={b4+R_4'',c4-R_4'',...,b7+R_7'',c7-R_7''}*/ \
+      "punpckhbw %%mm2,%%mm0\n\t" \
+      /*[d]=c1 b1 c0 b0*/ \
+      "movd %%mm1,%[d]\n\t" \
+      "movw %w[d],1(%[pix])\n\t" \
+      "psrlq $32,%%mm1\n\t" \
+      "shr $16,%[d]\n\t" \
+      "movw %w[d],1(%[pix],%[ystride])\n\t" \
+      /*[d]=c3 b3 c2 b2*/ \
+      "movd %%mm1,%[d]\n\t" \
+      "movw %w[d],1(%[pix],%[ystride],2)\n\t" \
+      "shr $16,%[d]\n\t" \
+      "movw %w[d],1(%[pix],%[ystride3])\n\t" \
+      "lea (%[pix],%[ystride],4),%[pix]\n\t" \
+      /*[d]=c5 b5 c4 b4*/ \
+      "movd %%mm0,%[d]\n\t" \
+      "movw %w[d],1(%[pix])\n\t" \
+      "psrlq $32,%%mm0\n\t" \
+      "shr $16,%[d]\n\t" \
+      "movw %w[d],1(%[pix],%[ystride])\n\t" \
+      /*[d]=c7 b7 c6 b6*/ \
+      "movd %%mm0,%[d]\n\t" \
+      "movw %w[d],1(%[pix],%[ystride],2)\n\t" \
+      "shr $16,%[d]\n\t" \
+      "movw %w[d],1(%[pix],%[ystride3])\n\t" \
+      :[pix]"+r"(pix__),[ystride3]"=&r"(ystride3__),[d]"=&r"(d__) \
+      :[ystride]"r"((ptrdiff_t)(_ystride)),[ll]"r"(_ll) \
+      :"memory" \
+    ); \
+  } \
+  while(0)
+
+# endif
+#endif
diff --git a/lib/x86/mmxstate.c b/lib/x86/mmxstate.c
new file mode 100644
index 0000000..808b0a7
--- /dev/null
+++ b/lib/x86/mmxstate.c
@@ -0,0 +1,188 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: mmxstate.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+/*MMX acceleration of complete fragment reconstruction algorithm.
+  Originally written by Rudolf Marek.*/
+#include <string.h>
+#include "x86int.h"
+#include "mmxfrag.h"
+#include "mmxloop.h"
+
+#if defined(OC_X86_ASM)
+
+void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst_pix;
+  ptrdiff_t      buf_off;
+  int            stride;
+  int            mode;
+  /*Stage 1: turn the coefficients into a spatial-domain residual.*/
+  /*The general case goes through the real iDCT.*/
+  if(_last_zzi>=2){
+    /*The DC coefficient is dequantized here, just before the transform.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_mmx(_dct_coeffs,_last_zzi);
+  }
+  else{
+    /*DC-only block: this must be an unsigned type, so that the __asm__ block
+       does not sign-extend it when it is placed in a register.*/
+    ogg_uint16_t dc;
+    /*No iDCT runs on this path, so the rounding is folded into this dequant
+       product (and only this one).*/
+    dc=(ogg_int16_t)((_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15)>>5);
+    /*Replicate dc across all 64 entries of _dct_coeffs.*/
+    __asm__ __volatile__(
+      /*mm0=0000 0000 0000 AAAA*/
+      "movd %[p],%%mm0\n\t"
+      /*mm0=0000 0000 AAAA AAAA*/
+      "punpcklwd %%mm0,%%mm0\n\t"
+      /*mm0=AAAA AAAA AAAA AAAA*/
+      "punpckldq %%mm0,%%mm0\n\t"
+      "movq %%mm0,(%[y])\n\t"
+      "movq %%mm0,8(%[y])\n\t"
+      "movq %%mm0,16(%[y])\n\t"
+      "movq %%mm0,24(%[y])\n\t"
+      "movq %%mm0,32(%[y])\n\t"
+      "movq %%mm0,40(%[y])\n\t"
+      "movq %%mm0,48(%[y])\n\t"
+      "movq %%mm0,56(%[y])\n\t"
+      "movq %%mm0,64(%[y])\n\t"
+      "movq %%mm0,72(%[y])\n\t"
+      "movq %%mm0,80(%[y])\n\t"
+      "movq %%mm0,88(%[y])\n\t"
+      "movq %%mm0,96(%[y])\n\t"
+      "movq %%mm0,104(%[y])\n\t"
+      "movq %%mm0,112(%[y])\n\t"
+      "movq %%mm0,120(%[y])\n\t"
+      :
+      :[y]"r"(_dct_coeffs),[p]"r"((unsigned)dc)
+      :"memory"
+    );
+  }
+  /*Stage 2: write the reconstructed pixels into the current frame.*/
+  buf_off=_state->frag_buf_offs[_fragi];
+  mode=_state->frags[_fragi].mb_mode;
+  stride=_state->ref_ystride[_pli];
+  dst_pix=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+buf_off;
+  if(mode!=OC_MODE_INTRA){
+    const unsigned char *ref_pix;
+    int                  mv_offs[2];
+    ref_pix=
+     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mode)]]
+     +buf_off;
+    if(oc_state_get_mv_offsets(_state,mv_offs,_pli,
+     _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])<=1){
+      oc_frag_recon_inter_mmx(dst_pix,ref_pix+mv_offs[0],stride,_dct_coeffs);
+    }
+    else oc_frag_recon_inter2_mmx(dst_pix,
+     ref_pix+mv_offs[0],ref_pix+mv_offs[1],stride,_dct_coeffs);
+  }
+  else oc_frag_recon_intra_mmx(dst_pix,stride,_dct_coeffs);
+}
+
+/*We copy this entire function to inline the actual MMX routines so that we
+   use only a single indirect call.*/
+
+/*Copies the fragments specified by the lists of fragment indices from one
+   frame to another.
+  _fragis:    A pointer to a list of fragment indices.
+  _nfragis:   The number of fragment indices to copy.
+  _dst_frame: The reference frame to copy to.
+  _src_frame: The reference frame to copy from.
+  _pli:       The color plane the fragments lie in.*/
+void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
+ int _dst_frame,int _src_frame,int _pli){
+  const unsigned char *src_base;
+  unsigned char       *dst_base;
+  const ptrdiff_t     *buf_offs;
+  ptrdiff_t            fi;
+  int                  line_stride;
+  /*Look up everything that stays fixed for the whole list up front.*/
+  buf_offs=_state->frag_buf_offs;
+  line_stride=_state->ref_ystride[_pli];
+  src_base=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
+  dst_base=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
+  /*Each fragment is copied at the same buffer offset in both frames.*/
+  for(fi=0;fi<_nfragis;fi++){
+    OC_FRAG_COPY_MMX(dst_base+buf_offs[_fragis[fi]],
+     src_base+buf_offs[_fragis[fi]],line_stride);
+  }
+}
+
+/*Apply the loop filter to a given set of fragment rows in the given plane.
+  The filter may be run on the bottom edge, affecting pixels in the next row of
+   fragments, so this row also needs to be available.
+  _bv:        The bounding values array.
+  _refi:      The index of the frame buffer to filter.
+  _pli:       The color plane to filter.
+  _fragy0:    The Y coordinate of the first fragment row to filter.
+  _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
+void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
+ int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
+  OC_ALIGN8(unsigned char   limits[8]);
+  const oc_fragment_plane *plane;
+  const oc_fragment       *frags;
+  const ptrdiff_t         *buf_offs;
+  unsigned char           *frame_data;
+  ptrdiff_t                plane_first;
+  ptrdiff_t                plane_end;
+  ptrdiff_t                row_start;
+  ptrdiff_t                rows_end;
+  int                      stride;
+  int                      row_len;
+  /*Replicate the filter limit for _state->qis[0] into all 8 bytes.*/
+  memset(limits,_state->loop_filter_limits[_state->qis[0]],sizeof(limits));
+  frags=_state->frags;
+  buf_offs=_state->frag_buf_offs;
+  frame_data=_state->ref_frame_data[_refi];
+  stride=_state->ref_ystride[_pli];
+  plane=_state->fplanes+_pli;
+  row_len=plane->nhfrags;
+  plane_first=plane->froffset;
+  plane_end=plane_first+plane->nfrags;
+  row_start=plane_first+_fragy0*(ptrdiff_t)row_len;
+  rows_end=row_start+(_fragy_end-_fragy0)*(ptrdiff_t)row_len;
+  /*A block boundary is filtered whenever at least one of the two fragments
+     that share it is coded.
+    The order in which the edges are visited matters, and the somewhat
+     strange ordering below is the one VP3 chose; do not rearrange it.*/
+  for(;row_start<rows_end;row_start+=row_len){
+    ptrdiff_t row_end;
+    ptrdiff_t fi;
+    row_end=row_start+row_len;
+    for(fi=row_start;fi<row_end;fi++){
+      unsigned char *pix;
+      if(!frags[fi].coded)continue;
+      pix=frame_data+buf_offs[fi];
+      /*Left edge: skipped for the first fragment of a row.*/
+      if(fi>row_start)OC_LOOP_FILTER_H_MMX(pix,stride,limits);
+      /*Top edge: skipped for the first fragment row of the plane.*/
+      if(row_start>plane_first)OC_LOOP_FILTER_V_MMX(pix,stride,limits);
+      /*Right edge: only when the right-hand neighbor is not coded.*/
+      if(fi+1<row_end&&!frags[fi+1].coded){
+        OC_LOOP_FILTER_H_MMX(pix+8,stride,limits);
+      }
+      /*Bottom edge: only when the fragment below is not coded.*/
+      if(fi+row_len<plane_end&&!frags[fi+row_len].coded){
+        OC_LOOP_FILTER_V_MMX(pix+(stride<<3),stride,limits);
+      }
+    }
+  }
+}
+
+#endif
diff --git a/lib/x86/sse2fdct.c b/lib/x86/sse2fdct.c
new file mode 100644
index 0000000..86c17d6
--- /dev/null
+++ b/lib/x86/sse2fdct.c
@@ -0,0 +1,523 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************/
+/*SSE2 fDCT implementation for x86_64.*/
+/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
+#include <stddef.h>
+#include "x86enc.h"
+
+#if defined(OC_X86_64_ASM)
+
+# define OC_FDCT8x8 \
+ /*Needs xmm15={0}x8, xmm14={-1}x8; xmm0...7 in, _y[0...7] out in xmm0...7.*/ \
+ "#OC_FDCT8x8\n\t" \
+ /*Stage 1:*/ \
+ "movdqa %%xmm0,%%xmm11\n\t" \
+ "movdqa %%xmm1,%%xmm10\n\t" \
+ "movdqa %%xmm2,%%xmm9\n\t" \
+ "movdqa %%xmm3,%%xmm8\n\t" \
+ /*xmm11=t7'=t0-t7*/ \
+ "psubw %%xmm7,%%xmm11\n\t" \
+ /*xmm10=t6'=t1-t6*/ \
+ "psubw %%xmm6,%%xmm10\n\t" \
+ /*xmm9=t5'=t2-t5*/ \
+ "psubw %%xmm5,%%xmm9\n\t" \
+ /*xmm8=t4'=t3-t4*/ \
+ "psubw %%xmm4,%%xmm8\n\t" \
+ /*xmm0=t0'=t0+t7*/ \
+ "paddw %%xmm7,%%xmm0\n\t" \
+ /*xmm1=t1'=t1+t6*/ \
+ "paddw %%xmm6,%%xmm1\n\t" \
+ /*xmm5=t2'=t2+t5*/ \
+ "paddw %%xmm2,%%xmm5\n\t" \
+ /*xmm4=t3'=t3+t4*/ \
+ "paddw %%xmm3,%%xmm4\n\t" \
+ /*xmm2,3,6,7 are now free.*/ \
+ /*Stage 2:*/ \
+ "movdqa %%xmm0,%%xmm3\n\t" \
+ "mov $0x5A806A0A,%[a]\n\t" \
+ "movdqa %%xmm1,%%xmm2\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "movdqa %%xmm10,%%xmm6\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ /*xmm2=t2''=t1'-t2'*/ \
+ "psubw %%xmm5,%%xmm2\n\t" \
+ "pxor %%xmm12,%%xmm12\n\t" \
+ /*xmm3=t3''=t0'-t3'*/ \
+ "psubw %%xmm4,%%xmm3\n\t" \
+ "psubw %%xmm14,%%xmm12\n\t" \
+ /*xmm10=t5''=t6'-t5'*/ \
+ "psubw %%xmm9,%%xmm10\n\t" \
+ "paddw %%xmm12,%%xmm12\n\t" \
+ /*xmm4=t0''=t0'+t3'*/ \
+ "paddw %%xmm0,%%xmm4\n\t" \
+ /*xmm1=t1''=t1'+t2'*/ \
+ "paddw %%xmm5,%%xmm1\n\t" \
+ /*xmm6=t6''=t6'+t5'*/ \
+ "paddw %%xmm9,%%xmm6\n\t" \
+ /*xmm0,xmm5,xmm9 are now free.*/ \
+ /*Stage 3 (xmm12={2}x8 and xmm13={0x5A80,0x6A0A}x4 were set up above):*/ \
+ /*xmm10:xmm5=t5''*27146+0xB500 \
+   xmm0=t5''*/ \
+ "movdqa %%xmm10,%%xmm5\n\t" \
+ "movdqa %%xmm10,%%xmm0\n\t" \
+ "punpckhwd %%xmm12,%%xmm10\n\t" \
+ "pmaddwd %%xmm13,%%xmm10\n\t" \
+ "punpcklwd %%xmm12,%%xmm5\n\t" \
+ "pmaddwd %%xmm13,%%xmm5\n\t" \
+ /*xmm5=(t5''*27146+0xB500>>16)+t5''*/ \
+ "psrad $16,%%xmm10\n\t" \
+ "psrad $16,%%xmm5\n\t" \
+ "packssdw %%xmm10,%%xmm5\n\t" \
+ "paddw %%xmm0,%%xmm5\n\t" \
+ /*xmm0=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
+ "pcmpeqw %%xmm15,%%xmm0\n\t" \
+ "psubw %%xmm14,%%xmm0\n\t" \
+ "paddw %%xmm5,%%xmm0\n\t" \
+ "movdqa %%xmm8,%%xmm5\n\t" \
+ "psraw $1,%%xmm0\n\t" \
+ /*xmm5=t5'''=t4'-s*/ \
+ "psubw %%xmm0,%%xmm5\n\t" \
+ /*xmm8=t4''=t4'+s*/ \
+ "paddw %%xmm0,%%xmm8\n\t" \
+ /*xmm0,xmm7,xmm9,xmm10 are free.*/ \
+ /*xmm7:xmm9=t6''*27146+0xB500*/ \
+ "movdqa %%xmm6,%%xmm7\n\t" \
+ "movdqa %%xmm6,%%xmm9\n\t" \
+ "punpckhwd %%xmm12,%%xmm7\n\t" \
+ "pmaddwd %%xmm13,%%xmm7\n\t" \
+ "punpcklwd %%xmm12,%%xmm9\n\t" \
+ "pmaddwd %%xmm13,%%xmm9\n\t" \
+ /*xmm9=(t6''*27146+0xB500>>16)+t6''*/ \
+ "psrad $16,%%xmm7\n\t" \
+ "psrad $16,%%xmm9\n\t" \
+ "packssdw %%xmm7,%%xmm9\n\t" \
+ "paddw %%xmm6,%%xmm9\n\t" \
+ /*xmm9=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
+ "pcmpeqw %%xmm15,%%xmm6\n\t" \
+ "psubw %%xmm14,%%xmm6\n\t" \
+ "paddw %%xmm6,%%xmm9\n\t" \
+ "movdqa %%xmm11,%%xmm7\n\t" \
+ "psraw $1,%%xmm9\n\t" \
+ /*xmm7=t6'''=t7'-s*/ \
+ "psubw %%xmm9,%%xmm7\n\t" \
+ /*xmm9=t7''=t7'+s*/ \
+ "paddw %%xmm11,%%xmm9\n\t" \
+ /*xmm0,xmm6,xmm10,xmm11 are free.*/ \
+ /*Stage 4:*/ \
+ /*xmm10:xmm0=t1''*27146+0xB500*/ \
+ "movdqa %%xmm1,%%xmm0\n\t" \
+ "movdqa %%xmm1,%%xmm10\n\t" \
+ "punpcklwd %%xmm12,%%xmm0\n\t" \
+ "pmaddwd %%xmm13,%%xmm0\n\t" \
+ "punpckhwd %%xmm12,%%xmm10\n\t" \
+ "pmaddwd %%xmm13,%%xmm10\n\t" \
+ /*xmm0=(t1''*27146+0xB500>>16)+t1''*/ \
+ "psrad $16,%%xmm0\n\t" \
+ "psrad $16,%%xmm10\n\t" \
+ "mov $0x20006A0A,%[a]\n\t" \
+ "packssdw %%xmm10,%%xmm0\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "paddw %%xmm1,%%xmm0\n\t" \
+ /*xmm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
+ "pcmpeqw %%xmm15,%%xmm1\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "psubw %%xmm14,%%xmm1\n\t" \
+ "paddw %%xmm1,%%xmm0\n\t" \
+ /*xmm10:xmm4=t0''*27146+0x4000*/ \
+ "movdqa %%xmm4,%%xmm1\n\t" \
+ "movdqa %%xmm4,%%xmm10\n\t" \
+ "punpcklwd %%xmm12,%%xmm4\n\t" \
+ "pmaddwd %%xmm13,%%xmm4\n\t" \
+ "punpckhwd %%xmm12,%%xmm10\n\t" \
+ "pmaddwd %%xmm13,%%xmm10\n\t" \
+ /*xmm4=(t0''*27146+0x4000>>16)+t0''*/ \
+ "psrad $16,%%xmm4\n\t" \
+ "psrad $16,%%xmm10\n\t" \
+ "mov $0x6CB7,%[a]\n\t" \
+ "packssdw %%xmm10,%%xmm4\n\t" \
+ "movd %[a],%%xmm12\n\t" \
+ "paddw %%xmm1,%%xmm4\n\t" \
+ /*xmm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
+ "pcmpeqw %%xmm15,%%xmm1\n\t" \
+ "pshufd $00,%%xmm12,%%xmm12\n\t" \
+ "psubw %%xmm14,%%xmm1\n\t" \
+ "mov $0x7FFF6C84,%[a]\n\t" \
+ "paddw %%xmm1,%%xmm4\n\t" \
+ /*xmm0=_y[0]=u=r+s>>1 \
+   The naive implementation could cause overflow, so we use \
+    u=(r&s)+((r^s)>>1).*/ \
+ "movdqa %%xmm0,%%xmm6\n\t" \
+ "pxor %%xmm4,%%xmm0\n\t" \
+ "pand %%xmm4,%%xmm6\n\t" \
+ "psraw $1,%%xmm0\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "paddw %%xmm6,%%xmm0\n\t" \
+ /*xmm4=_y[4]=v=r-u*/ \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "psubw %%xmm0,%%xmm4\n\t" \
+ /*xmm1,xmm6,xmm10,xmm11 are free.*/ \
+ /*xmm6:xmm10=60547*t3''+0x6CB7*/ \
+ "movdqa %%xmm3,%%xmm10\n\t" \
+ "movdqa %%xmm3,%%xmm6\n\t" \
+ "punpcklwd %%xmm3,%%xmm10\n\t" \
+ "pmaddwd %%xmm13,%%xmm10\n\t" \
+ "mov $0x61F861F8,%[a]\n\t" \
+ "punpckhwd %%xmm3,%%xmm6\n\t" \
+ "pmaddwd %%xmm13,%%xmm6\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "paddd %%xmm12,%%xmm10\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "paddd %%xmm12,%%xmm6\n\t" \
+ /*xmm1:xmm2=25080*t2'' \
+   xmm12=t2''*/ \
+ "movdqa %%xmm2,%%xmm11\n\t" \
+ "movdqa %%xmm2,%%xmm12\n\t" \
+ "pmullw %%xmm13,%%xmm2\n\t" \
+ "pmulhw %%xmm13,%%xmm11\n\t" \
+ "movdqa %%xmm2,%%xmm1\n\t" \
+ "punpcklwd %%xmm11,%%xmm2\n\t" \
+ "punpckhwd %%xmm11,%%xmm1\n\t" \
+ /*xmm10=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
+ "paddd %%xmm2,%%xmm10\n\t" \
+ "paddd %%xmm1,%%xmm6\n\t" \
+ "psrad $16,%%xmm10\n\t" \
+ "pcmpeqw %%xmm15,%%xmm3\n\t" \
+ "psrad $16,%%xmm6\n\t" \
+ "psubw %%xmm14,%%xmm3\n\t" \
+ "packssdw %%xmm6,%%xmm10\n\t" \
+ "paddw %%xmm3,%%xmm10\n\t" \
+ /*xmm2=_y[2]=u \
+   xmm10=s=(25080*u>>16)-t2''*/ \
+ "movdqa %%xmm10,%%xmm2\n\t" \
+ "pmulhw %%xmm13,%%xmm10\n\t" \
+ "psubw %%xmm12,%%xmm10\n\t" \
+ /*xmm1:xmm6=s*21600+0x2800*/ \
+ "pxor %%xmm12,%%xmm12\n\t" \
+ "psubw %%xmm14,%%xmm12\n\t" \
+ "mov $0x28005460,%[a]\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "movdqa %%xmm10,%%xmm6\n\t" \
+ "movdqa %%xmm10,%%xmm1\n\t" \
+ "punpcklwd %%xmm12,%%xmm6\n\t" \
+ "pmaddwd %%xmm13,%%xmm6\n\t" \
+ "mov $0x0E3D,%[a]\n\t" \
+ "punpckhwd %%xmm12,%%xmm1\n\t" \
+ "pmaddwd %%xmm13,%%xmm1\n\t" \
+ /*xmm6=(s*21600+0x2800>>18)+s*/ \
+ "psrad $18,%%xmm6\n\t" \
+ "psrad $18,%%xmm1\n\t" \
+ "movd %[a],%%xmm12\n\t" \
+ "packssdw %%xmm1,%%xmm6\n\t" \
+ "pshufd $00,%%xmm12,%%xmm12\n\t" \
+ "paddw %%xmm10,%%xmm6\n\t" \
+ /*xmm6=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
+ "mov $0x7FFF54DC,%[a]\n\t" \
+ "pcmpeqw %%xmm15,%%xmm10\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "psubw %%xmm14,%%xmm10\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "paddw %%xmm10,%%xmm6\n\t " \
+ /*xmm1,xmm3,xmm10,xmm11 are free.*/ \
+ /*xmm11:xmm10=54491*t5'''+0x0E3D*/ \
+ "movdqa %%xmm5,%%xmm10\n\t" \
+ "movdqa %%xmm5,%%xmm11\n\t" \
+ "punpcklwd %%xmm5,%%xmm10\n\t" \
+ "pmaddwd %%xmm13,%%xmm10\n\t" \
+ "mov $0x8E3A8E3A,%[a]\n\t" \
+ "punpckhwd %%xmm5,%%xmm11\n\t" \
+ "pmaddwd %%xmm13,%%xmm11\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "paddd %%xmm12,%%xmm10\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "paddd %%xmm12,%%xmm11\n\t" \
+ /*xmm7:xmm12=36410*t6''' \
+   xmm1=t6'''*/ \
+ "movdqa %%xmm7,%%xmm3\n\t" \
+ "movdqa %%xmm7,%%xmm1\n\t" \
+ "pmulhw %%xmm13,%%xmm3\n\t" \
+ "pmullw %%xmm13,%%xmm7\n\t" \
+ "paddw %%xmm1,%%xmm3\n\t" \
+ "movdqa %%xmm7,%%xmm12\n\t" \
+ "punpckhwd %%xmm3,%%xmm7\n\t" \
+ "punpcklwd %%xmm3,%%xmm12\n\t" \
+ /*xmm10=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
+ "paddd %%xmm12,%%xmm10\n\t" \
+ "paddd %%xmm7,%%xmm11\n\t" \
+ "psrad $16,%%xmm10\n\t" \
+ "pcmpeqw %%xmm15,%%xmm5\n\t" \
+ "psrad $16,%%xmm11\n\t" \
+ "psubw %%xmm14,%%xmm5\n\t" \
+ "packssdw %%xmm11,%%xmm10\n\t" \
+ "pxor %%xmm12,%%xmm12\n\t" \
+ "paddw %%xmm5,%%xmm10\n\t" \
+ /*xmm5=_y[5]=u \
+   xmm1=s=t6'''-(36410*u>>16)*/ \
+ "psubw %%xmm14,%%xmm12\n\t" \
+ "movdqa %%xmm10,%%xmm5\n\t" \
+ "mov $0x340067C8,%[a]\n\t" \
+ "pmulhw %%xmm13,%%xmm10\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "paddw %%xmm5,%%xmm10\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "psubw %%xmm10,%%xmm1\n\t" \
+ /*xmm11:xmm3=s*26568+0x3400*/ \
+ "movdqa %%xmm1,%%xmm3\n\t" \
+ "movdqa %%xmm1,%%xmm11\n\t" \
+ "punpcklwd %%xmm12,%%xmm3\n\t" \
+ "pmaddwd %%xmm13,%%xmm3\n\t" \
+ "mov $0x7B1B,%[a]\n\t" \
+ "punpckhwd %%xmm12,%%xmm11\n\t" \
+ "pmaddwd %%xmm13,%%xmm11\n\t" \
+ /*xmm3=(s*26568+0x3400>>17)+s*/ \
+ "psrad $17,%%xmm3\n\t" \
+ "psrad $17,%%xmm11\n\t" \
+ "movd %[a],%%xmm12\n\t" \
+ "packssdw %%xmm11,%%xmm3\n\t" \
+ "pshufd $00,%%xmm12,%%xmm12\n\t" \
+ "paddw %%xmm1,%%xmm3\n\t" \
+ /*xmm3=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
+ "mov $0x7FFF7B16,%[a]\n\t" \
+ "pcmpeqw %%xmm15,%%xmm1\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "psubw %%xmm14,%%xmm1\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "paddw %%xmm1,%%xmm3\n\t " \
+ /*xmm1,xmm7,xmm10,xmm11 are free.*/ \
+ /*xmm11:xmm10=64277*t7''+0x7B1B*/ \
+ "movdqa %%xmm9,%%xmm10\n\t" \
+ "movdqa %%xmm9,%%xmm11\n\t" \
+ "punpcklwd %%xmm9,%%xmm10\n\t" \
+ "pmaddwd %%xmm13,%%xmm10\n\t" \
+ "mov $0x31F131F1,%[a]\n\t" \
+ "punpckhwd %%xmm9,%%xmm11\n\t" \
+ "pmaddwd %%xmm13,%%xmm11\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "paddd %%xmm12,%%xmm10\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "paddd %%xmm12,%%xmm11\n\t" \
+ /*xmm12:xmm7=12785*t4''*/ \
+ "movdqa %%xmm8,%%xmm7\n\t" \
+ "movdqa %%xmm8,%%xmm1\n\t" \
+ "pmullw %%xmm13,%%xmm7\n\t" \
+ "pmulhw %%xmm13,%%xmm1\n\t" \
+ "movdqa %%xmm7,%%xmm12\n\t" \
+ "punpcklwd %%xmm1,%%xmm7\n\t" \
+ "punpckhwd %%xmm1,%%xmm12\n\t" \
+ /*xmm10=u=(12785*t4''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
+ "paddd %%xmm7,%%xmm10\n\t" \
+ "paddd %%xmm12,%%xmm11\n\t" \
+ "psrad $16,%%xmm10\n\t" \
+ "pcmpeqw %%xmm15,%%xmm9\n\t" \
+ "psrad $16,%%xmm11\n\t" \
+ "psubw %%xmm14,%%xmm9\n\t" \
+ "packssdw %%xmm11,%%xmm10\n\t" \
+ "pxor %%xmm12,%%xmm12\n\t" \
+ "paddw %%xmm9,%%xmm10\n\t" \
+ /*xmm1=_y[1]=u \
+   xmm10=s=(12785*u>>16)-t4''*/ \
+ "psubw %%xmm14,%%xmm12\n\t" \
+ "movdqa %%xmm10,%%xmm1\n\t" \
+ "mov $0x3000503B,%[a]\n\t" \
+ "pmulhw %%xmm13,%%xmm10\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "psubw %%xmm8,%%xmm10\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ /*xmm8:xmm7=s*20539+0x3000*/ \
+ "movdqa %%xmm10,%%xmm7\n\t" \
+ "movdqa %%xmm10,%%xmm8\n\t" \
+ "punpcklwd %%xmm12,%%xmm7\n\t" \
+ "pmaddwd %%xmm13,%%xmm7\n\t" \
+ "punpckhwd %%xmm12,%%xmm8\n\t" \
+ "pmaddwd %%xmm13,%%xmm8\n\t" \
+ /*xmm7=(s*20539+0x3000>>20)+s*/ \
+ "psrad $20,%%xmm7\n\t" \
+ "psrad $20,%%xmm8\n\t" \
+ "packssdw %%xmm8,%%xmm7\n\t" \
+ "paddw %%xmm10,%%xmm7\n\t" \
+ /*xmm7=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
+ "pcmpeqw %%xmm15,%%xmm10\n\t" \
+ "psubw %%xmm14,%%xmm10\n\t" \
+ "paddw %%xmm10,%%xmm7\n\t " \
+
+# define OC_TRANSPOSE8x8 \
+ "#OC_TRANSPOSE8x8\n\t" \
+ "movdqa %%xmm4,%%xmm8\n\t" \
+ /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
+ "punpcklwd %%xmm5,%%xmm4\n\t" \
+ /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
+ "punpckhwd %%xmm5,%%xmm8\n\t" \
+ /*xmm5 is free.*/ \
+ "movdqa %%xmm0,%%xmm5\n\t" \
+ /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
+ "punpcklwd %%xmm1,%%xmm0\n\t" \
+ /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
+ "punpckhwd %%xmm1,%%xmm5\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm6,%%xmm1\n\t" \
+ /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
+ "punpcklwd %%xmm7,%%xmm6\n\t" \
+ /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
+ "punpckhwd %%xmm7,%%xmm1\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm2,%%xmm7\n\t" \
+ /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
+ "punpcklwd %%xmm3,%%xmm7\n\t" \
+ /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "punpckhwd %%xmm3,%%xmm2\n\t" \
+ /*xmm3 is free.*/ \
+ "movdqa %%xmm0,%%xmm3\n\t" \
+ /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
+ "punpckldq %%xmm7,%%xmm0\n\t" \
+ /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
+ "punpckhdq %%xmm7,%%xmm3\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm5,%%xmm7\n\t" \
+ /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
+ "punpckldq %%xmm2,%%xmm5\n\t" \
+ /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
+ "punpckhdq %%xmm2,%%xmm7\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm4,%%xmm2\n\t" \
+ /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
+ "punpckldq %%xmm6,%%xmm2\n\t" \
+ /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
+ "punpckhdq %%xmm6,%%xmm4\n\t" \
+ /*xmm6 is free.*/ \
+ "movdqa %%xmm8,%%xmm6\n\t" \
+ /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
+ "punpckldq %%xmm1,%%xmm6\n\t" \
+ /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "punpckhdq %%xmm1,%%xmm8\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm0,%%xmm1\n\t" \
+ /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "punpcklqdq %%xmm2,%%xmm0\n\t" \
+ /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
+ "punpckhqdq %%xmm2,%%xmm1\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm3,%%xmm2\n\t" \
+ /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
+ "punpcklqdq %%xmm4,%%xmm2\n\t" \
+ /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
+ "punpckhqdq %%xmm4,%%xmm3\n\t" \
+ /*xmm4 is free.*/ \
+ "movdqa %%xmm5,%%xmm4\n\t" \
+ /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
+ "punpcklqdq %%xmm6,%%xmm4\n\t" \
+ /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
+ "punpckhqdq %%xmm6,%%xmm5\n\t" \
+ /*xmm6 is free.*/ \
+ "movdqa %%xmm7,%%xmm6\n\t" \
+ /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
+ "punpcklqdq %%xmm8,%%xmm6\n\t" \
+ /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
+ "punpckhqdq %%xmm8,%%xmm7\n\t" \
+ /*xmm8 is free; xmm0...xmm7 now hold the transposed rows.*/ \
+
+/*SSE2 implementation of the fDCT for x86-64 only.
+  Because of the 8 extra XMM registers on x86-64, this version can operate
+   without any temporary stack access at all.*/
+void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+  ptrdiff_t a;
+  __asm__ __volatile__(
+    /*Load the input.*/
+    "movdqa 0x00(%[x]),%%xmm0\n\t"
+    "movdqa 0x10(%[x]),%%xmm1\n\t"
+    "movdqa 0x20(%[x]),%%xmm2\n\t"
+    "movdqa 0x30(%[x]),%%xmm3\n\t"
+    "movdqa 0x40(%[x]),%%xmm4\n\t"
+    "movdqa 0x50(%[x]),%%xmm5\n\t"
+    "movdqa 0x60(%[x]),%%xmm6\n\t"
+    "movdqa 0x70(%[x]),%%xmm7\n\t"
+    /*Add two extra bits of working precision to improve accuracy; any more and
+       we could overflow.*/
+    /*We also add a few biases to correct for some systematic error that
+       remains in the full fDCT->iDCT round trip.*/
+    /*xmm15={0}x8*/
+    "pxor %%xmm15,%%xmm15\n\t"
+    /*xmm14={-1}x8*/
+    "pcmpeqb %%xmm14,%%xmm14\n\t"
+    "psllw $2,%%xmm0\n\t"
+    /*xmm8=xmm0*/
+    "movdqa %%xmm0,%%xmm8\n\t"
+    "psllw $2,%%xmm1\n\t"
+    /*xmm8={_x[7...0]==0}*/
+    "pcmpeqw %%xmm15,%%xmm8\n\t"
+    "psllw $2,%%xmm2\n\t"
+    /*xmm8={_x[7...0]!=0}*/
+    "psubw %%xmm14,%%xmm8\n\t"
+    "psllw $2,%%xmm3\n\t"
+    /*%[a]=1*/
+    "mov $1,%[a]\n\t"
+    /*xmm8={_x[6]!=0,0,_x[4]!=0,0,_x[2]!=0,0,_x[0]!=0,0}*/
+    "pslld $16,%%xmm8\n\t"
+    "psllw $2,%%xmm4\n\t"
+    /*xmm9={0,0,0,0,0,0,0,1}*/
+    "movd %[a],%%xmm9\n\t"
+    /*xmm8={0,0,0,0,_x[2]!=0,0,_x[0]!=0,0}*/
+    "pshufhw $0x00,%%xmm8,%%xmm8\n\t"
+    "psllw $2,%%xmm5\n\t"
+    /*%[a]={1}x2*/
+    "mov $0x10001,%[a]\n\t"
+    /*xmm8={0,0,0,0,0,0,0,_x[0]!=0}*/
+    "pshuflw $0x01,%%xmm8,%%xmm8\n\t"
+    "psllw $2,%%xmm6\n\t"
+    /*xmm10={0,0,0,0,0,0,1,1}*/
+    "movd %[a],%%xmm10\n\t"
+    /*xmm0=_x[7...0]+{0,0,0,0,0,0,0,_x[0]!=0}*/
+    "paddw %%xmm8,%%xmm0\n\t"
+    "psllw $2,%%xmm7\n\t"
+    /*xmm0=_x[7...0]+{0,0,0,0,0,0,1,(_x[0]!=0)+1}*/
+    "paddw %%xmm10,%%xmm0\n\t"
+    /*xmm1=_x[15...8]-{0,0,0,0,0,0,0,1}*/
+    "psubw %%xmm9,%%xmm1\n\t"
+    /*Transform columns.*/
+    OC_FDCT8x8
+    /*Transform rows.*/
+    OC_TRANSPOSE8x8
+    OC_FDCT8x8
+    /*TODO: zig-zag ordering?*/
+    OC_TRANSPOSE8x8
+    /*xmm14={-2}x8; subtracting it adds the rounding bias for the >>2 below.*/
+    "paddw %%xmm14,%%xmm14\n\t"
+    "psubw %%xmm14,%%xmm0\n\t"
+    "psubw %%xmm14,%%xmm1\n\t"
+    "psraw $2,%%xmm0\n\t"
+    "psubw %%xmm14,%%xmm2\n\t"
+    "psraw $2,%%xmm1\n\t"
+    "psubw %%xmm14,%%xmm3\n\t"
+    "psraw $2,%%xmm2\n\t"
+    "psubw %%xmm14,%%xmm4\n\t"
+    "psraw $2,%%xmm3\n\t"
+    "psubw %%xmm14,%%xmm5\n\t"
+    "psraw $2,%%xmm4\n\t"
+    "psubw %%xmm14,%%xmm6\n\t"
+    "psraw $2,%%xmm5\n\t"
+    "psubw %%xmm14,%%xmm7\n\t"
+    "psraw $2,%%xmm6\n\t"
+    "psraw $2,%%xmm7\n\t"
+    /*Store the result.*/
+    "movdqa %%xmm0,0x00(%[y])\n\t"
+    "movdqa %%xmm1,0x10(%[y])\n\t"
+    "movdqa %%xmm2,0x20(%[y])\n\t"
+    "movdqa %%xmm3,0x30(%[y])\n\t"
+    "movdqa %%xmm4,0x40(%[y])\n\t"
+    "movdqa %%xmm5,0x50(%[y])\n\t"
+    "movdqa %%xmm6,0x60(%[y])\n\t"
+    "movdqa %%xmm7,0x70(%[y])\n\t"
+    :[a]"=&r"(a)
+    :[y]"r"(_y),[x]"r"(_x)
+    :"memory"
+  );
+}
+#endif
diff --git a/lib/x86/x86enc.c b/lib/x86/x86enc.c
new file mode 100644
index 0000000..43b7be3
--- /dev/null
+++ b/lib/x86/x86enc.c
@@ -0,0 +1,49 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86state.c 15675 2009-02-06 09:43:27Z tterribe $
+
+ ********************************************************************/
+#include "x86enc.h"
+
+#if defined(OC_X86_ASM)
+
+#include "../cpu.c"
+
+void oc_enc_vtable_init_x86(oc_enc_ctx *_enc){
+  /*The C table is installed first; each CPU feature then overrides entries.*/
+  const ogg_uint32_t flags=oc_cpu_flags_get();
+  oc_enc_vtable_init_c(_enc);
+  if((flags&OC_CPU_X86_MMX)!=0){
+    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmx;
+    _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
+    _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
+    _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
+    _enc->opt_vtable.frag_sub=oc_enc_frag_sub_mmx;
+  }
+  if((flags&OC_CPU_X86_MMXEXT)!=0){
+    _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
+    _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext;
+    _enc->opt_vtable.frag_satd2_thresh=oc_enc_frag_satd2_thresh_mmxext;
+    _enc->opt_vtable.frag_satd_thresh=oc_enc_frag_satd_thresh_mmxext;
+    _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext;
+    _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext;
+    _enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
+  }
+  if((flags&OC_CPU_X86_SSE2)!=0){
+# if defined(OC_X86_64_ASM)
+    /*Disabled: _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;*/
+# endif
+  }
+}
+#endif
diff --git a/lib/x86/x86enc.h b/lib/x86/x86enc.h
new file mode 100644
index 0000000..06c3908
--- /dev/null
+++ b/lib/x86/x86enc.h
@@ -0,0 +1,47 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 15675 2009-02-06 09:43:27Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_x86_x86enc_H)
+# define _x86_x86enc_H (1)
+# include "../encint.h"
+# include "x86int.h"
+
+void oc_enc_vtable_init_x86(oc_enc_ctx *_enc);
+/*x86 encoder kernels (see oc_enc_vtable_init_x86() for which get installed).*/
+unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh);
+unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh);
+unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh);
+unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh);
+unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,int _ystride);
+void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64],
+ const unsigned char *_x,const unsigned char *_y,int _stride);
+void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
+ const unsigned char *_x,int _stride);
+void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride);
+void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+
+#endif
diff --git a/lib/x86/x86int.h b/lib/x86/x86int.h
new file mode 100644
index 0000000..ede724f
--- /dev/null
+++ b/lib/x86/x86int.h
@@ -0,0 +1,42 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#if !defined(_x86_x86int_H)
+# define _x86_x86int_H (1)
+# include "../internal.h"
+
+void oc_state_vtable_init_x86(oc_theora_state *_state);
+/*MMX kernels; oc_state_vtable_init_x86() installs them when MMX is present.*/
+void oc_frag_copy_mmx(unsigned char *_dst,
+ const unsigned char *_src,int _ystride);
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_mmx(unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi);
+void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
+ int _dst_frame,int _src_frame,int _pli);
+void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
+ int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+void oc_restore_fpu_mmx(void);
+
+#endif
diff --git a/lib/x86/x86state.c b/lib/x86/x86state.c
new file mode 100644
index 0000000..a786bec
--- /dev/null
+++ b/lib/x86/x86state.c
@@ -0,0 +1,62 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86state.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#include "x86int.h"
+
+#if defined(OC_X86_ASM)
+
+#include "../cpu.c"
+
+/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
+   each quadrant of the destination; entries 64...127 all hold the value 64.*/
+static const unsigned char OC_FZIG_ZAG_MMX[128]={ /*Installed as dct_fzig_zag.*/
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3,32,11,18,25, 4,12,
+   5,26,19,40,33,34,41,48,
+  27, 6,13,20,28,21,14, 7,
+  56,49,42,35,43,50,57,36,
+  15,22,29,30,23,44,37,58,
+  51,59,38,45,52,31,60,53,
+  46,39,47,54,61,62,55,63,
+  64,64,64,64,64,64,64,64, /*NOTE(review): presumably a dummy slot - confirm.*/
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+};
+
+void oc_state_vtable_init_x86(oc_theora_state *_state){
+  _state->cpu_flags=oc_cpu_flags_get();
+  if(!(_state->cpu_flags&OC_CPU_X86_MMX))oc_state_vtable_init_c(_state);
+  else{/*MMX available: install the MMX kernels and their zig-zag table.*/
+    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
+    _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     oc_state_loop_filter_frag_rows_mmx;
+    _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx;
+    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
+    _state->opt_vtable.idct8x8=oc_idct8x8_mmx;
+    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
+    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
+    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
+    _state->opt_vtable.frag_copy=oc_frag_copy_mmx;
+  }
+}
+#endif
diff --git a/lib/x86_vc/mmxencfrag.c b/lib/x86_vc/mmxencfrag.c
new file mode 100644
index 0000000..ac9dacf
--- /dev/null
+++ b/lib/x86_vc/mmxencfrag.c
@@ -0,0 +1,969 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
+
+ ********************************************************************/
+#include <stddef.h>
+#include "x86enc.h"
+
+#if defined(OC_X86_ASM)
+
+unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  ptrdiff_t ret; /*Written by the final movd from mm0.*/
+  __asm{ /*Sum of absolute differences over 8 rows of 8 bytes each.*/
+#define SRC esi
+#define REF edx
+#define YSTRIDE ecx
+#define YSTRIDE3 edi
+    mov YSTRIDE,_ystride
+    mov SRC,_src
+    mov REF,_ref
+    /*Load the first 4 rows of each block (YSTRIDE3 becomes 3*YSTRIDE).*/
+    movq mm0,[SRC]
+    movq mm1,[REF]
+    movq mm2,[SRC][YSTRIDE]
+    movq mm3,[REF][YSTRIDE]
+    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
+    movq mm4,[SRC+YSTRIDE*2]
+    movq mm5,[REF+YSTRIDE*2]
+    movq mm6,[SRC+YSTRIDE3]
+    movq mm7,[REF+YSTRIDE3]
+    /*Compute their SADs and accumulate them in mm0; advance to row 4.*/
+    psadbw mm0,mm1
+    psadbw mm2,mm3
+    lea SRC,[SRC+YSTRIDE*4]
+    paddw mm0,mm2
+    lea REF,[REF+YSTRIDE*4]
+    /*Load the next 3 rows as registers become available.*/
+    movq mm2,[SRC]
+    movq mm3,[REF]
+    psadbw mm4,mm5
+    psadbw mm6,mm7
+    paddw mm0,mm4
+    movq mm5,[REF+YSTRIDE]
+    movq mm4,[SRC+YSTRIDE]
+    paddw mm0,mm6
+    movq mm7,[REF+YSTRIDE*2]
+    movq mm6,[SRC+YSTRIDE*2]
+    /*Start adding their SADs to mm0.*/
+    psadbw mm2,mm3
+    psadbw mm4,mm5
+    paddw mm0,mm2
+    psadbw mm6,mm7
+    /*Load last row as registers become available.*/
+    movq mm2,[SRC+YSTRIDE3]
+    movq mm3,[REF+YSTRIDE3]
+    /*And finish adding up their SADs.*/
+    paddw mm0,mm4
+    psadbw mm2,mm3
+    paddw mm0,mm6
+    paddw mm0,mm2
+    movd [ret],mm0
+#undef SRC
+#undef REF
+#undef YSTRIDE
+#undef YSTRIDE3
+  } /*NOTE(review): no emms here; presumably restore_fpu is called later.*/
+  return (unsigned)ret;
+}
+
+unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh){
+  /*Early termination is for suckers: _thresh is ignored, the full SAD is used.*/
+  return oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
+}
+
+#define OC_SAD2_LOOP __asm{ /*Uses SRC,REF1,REF2,YSTRIDE,RET; mask in mm7.*/ \
+  /*We want to compute (mm0+mm1>>1) on unsigned bytes without overflow, but \
+     pavgb computes (mm0+mm1+1>>1). \
+   The latter is exactly 1 too large when the low bit of two corresponding \
+    bytes is only set in one of them. \
+   Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
+    correct the output of pavgb.*/ \
+  __asm  movq mm6,mm0 \
+  __asm  lea REF1,[REF1+YSTRIDE*2] \
+  __asm  pxor mm0,mm1 \
+  __asm  pavgb mm6,mm1 \
+  __asm  lea REF2,[REF2+YSTRIDE*2] \
+  __asm  movq mm1,mm2 \
+  __asm  pand mm0,mm7 \
+  __asm  pavgb mm2,mm3 \
+  __asm  pxor mm1,mm3 \
+  __asm  movq mm3,[REF2+YSTRIDE] \
+  __asm  psubb mm6,mm0 \
+  __asm  movq mm0,[REF1] \
+  __asm  pand mm1,mm7 \
+  __asm  psadbw mm4,mm6 \
+  __asm  movd mm6,RET /*Fetch the running total.*/ \
+  __asm  psubb mm2,mm1 \
+  __asm  movq mm1,[REF2] \
+  __asm  lea SRC,[SRC+YSTRIDE*2] \
+  __asm  psadbw mm5,mm2 \
+  __asm  movq mm2,[REF1+YSTRIDE] \
+  __asm  paddw mm5,mm4 \
+  __asm  movq mm4,[SRC] \
+  __asm  paddw mm6,mm5 \
+  __asm  movq mm5,[SRC+YSTRIDE] \
+  __asm  movd RET,mm6 /*RET now includes the SADs of these two rows.*/ \
+}
+
+/*Same as OC_SAD2_LOOP, but does not pre-load the next two rows.*/
+#define OC_SAD2_TAIL __asm{ \
+  __asm  movq mm6,mm0 \
+  __asm  pavgb mm0,mm1 \
+  __asm  pxor mm6,mm1 \
+  __asm  movq mm1,mm2 \
+  __asm  pand mm6,mm7 \
+  __asm  pavgb mm2,mm3 \
+  __asm  pxor mm1,mm3 \
+  __asm  psubb mm0,mm6 \
+  __asm  pand mm1,mm7 \
+  __asm  psadbw mm4,mm0 \
+  __asm  psubb mm2,mm1 \
+  __asm  movd mm6,RET \
+  __asm  psadbw mm5,mm2 \
+  __asm  paddw mm5,mm4 \
+  __asm  paddw mm6,mm5 \
+  __asm  movd RET,mm6 \
+}
+
+unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh){ /*_thresh is not referenced: there is no early exit.*/
+  ptrdiff_t ret; /*Receives the total accumulated in RET.*/
+  __asm{ /*SAD of _src against the average of _ref1 and _ref2, 8 rows.*/
+#define REF1 ecx
+#define REF2 edi
+#define YSTRIDE esi
+#define SRC edx
+#define RET eax
+    mov YSTRIDE,_ystride
+    mov SRC,_src
+    mov REF1,_ref1
+    mov REF2,_ref2
+    movq mm0,[REF1]
+    movq mm1,[REF2]
+    movq mm2,[REF1+YSTRIDE]
+    movq mm3,[REF2+YSTRIDE]
+    xor RET,RET
+    movq mm4,[SRC]
+    pxor mm7,mm7
+    pcmpeqb mm6,mm6
+    movq mm5,[SRC+YSTRIDE]
+    psubb mm7,mm6 /*mm7=0-(-1) per byte, i.e. {1}x8: the low-bit mask.*/
+    OC_SAD2_LOOP
+    OC_SAD2_LOOP
+    OC_SAD2_LOOP
+    OC_SAD2_TAIL /*Four invocations of two rows each cover all 8 rows.*/
+    mov [ret],RET
+#undef REF1
+#undef REF2
+#undef YSTRIDE
+#undef SRC
+#undef RET
+  }
+  return (unsigned)ret;
+}
+
+/*Load 8 rows of 4 pixels at byte offset _off from SRC and REF and compute their
+  16-bit differences (src-ref) in mm0...mm7; SRC and REF are restored on exit.*/
+#define OC_LOAD_SUB_8x4(_off) __asm{ \
+  __asm  movd mm0,[_off+SRC] \
+  __asm  movd mm4,[_off+REF] \
+  __asm  movd mm1,[_off+SRC+SRC_YSTRIDE] \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
+  __asm  movd mm5,[_off+REF+REF_YSTRIDE] \
+  __asm  lea REF,[REF+REF_YSTRIDE*2] \
+  __asm  movd mm2,[_off+SRC] \
+  __asm  movd mm7,[_off+REF] \
+  __asm  movd mm3,[_off+SRC+SRC_YSTRIDE] \
+  __asm  movd mm6,[_off+REF+REF_YSTRIDE] \
+  __asm  punpcklbw mm0,mm4 \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
+  __asm  punpcklbw mm4,mm4 \
+  __asm  lea REF,[REF+REF_YSTRIDE*2] \
+  __asm  psubw mm0,mm4 /*(src|ref<<8)-(ref|ref<<8)=src-ref per word.*/ \
+  __asm  movd mm4,[_off+SRC] \
+  __asm  movq [_off*2+BUF],mm0 /*Spill row 0; mm0 is reused as a temporary.*/ \
+  __asm  movd mm0,[_off+REF] \
+  __asm  punpcklbw mm1,mm5 \
+  __asm  punpcklbw mm5,mm5 \
+  __asm  psubw mm1,mm5 \
+  __asm  movd mm5,[_off+SRC+SRC_YSTRIDE] \
+  __asm  punpcklbw mm2,mm7 \
+  __asm  punpcklbw mm7,mm7 \
+  __asm  psubw mm2,mm7 \
+  __asm  movd mm7,[_off+REF+REF_YSTRIDE] \
+  __asm  punpcklbw mm3,mm6 \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
+  __asm  punpcklbw mm6,mm6 \
+  __asm  psubw mm3,mm6 \
+  __asm  movd mm6,[_off+SRC] \
+  __asm  punpcklbw mm4,mm0 \
+  __asm  lea REF,[REF+REF_YSTRIDE*2] \
+  __asm  punpcklbw mm0,mm0 \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
+  __asm  psubw mm4,mm0 \
+  __asm  movd mm0,[_off+REF] \
+  __asm  punpcklbw mm5,mm7 \
+  __asm  neg SRC_YSTRIDE \
+  __asm  punpcklbw mm7,mm7 \
+  __asm  psubw mm5,mm7 \
+  __asm  movd mm7,[_off+SRC+SRC_YSTRIDE] /*Row 7: SRC is at row 8, stride negated.*/ \
+  __asm  punpcklbw mm6,mm0 \
+  __asm  lea REF,[REF+REF_YSTRIDE*2] \
+  __asm  punpcklbw mm0,mm0 \
+  __asm  neg REF_YSTRIDE \
+  __asm  psubw mm6,mm0 \
+  __asm  movd mm0,[_off+REF+REF_YSTRIDE] \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*8] \
+  __asm  punpcklbw mm7,mm0 \
+  __asm  neg SRC_YSTRIDE \
+  __asm  punpcklbw mm0,mm0 \
+  __asm  lea REF,[REF+REF_YSTRIDE*8] \
+  __asm  psubw mm7,mm0 \
+  __asm  neg REF_YSTRIDE \
+  __asm  movq mm0,[_off*2+BUF] /*Reload the row 0 differences.*/ \
+}
+
+/*Load 8 rows of 4 pixels from SRC (rows 0-3) and SRC4 (rows 4-7) into mm0...mm7.*/
+#define OC_LOAD_8x4(_off) __asm{ \
+  __asm  movd mm0,[_off+SRC] \
+  __asm  movd mm1,[_off+SRC+YSTRIDE] \
+  __asm  movd mm2,[_off+SRC+YSTRIDE*2] \
+  __asm  pxor mm7,mm7 \
+  __asm  movd mm3,[_off+SRC+YSTRIDE3] \
+  __asm  punpcklbw mm0,mm7 /*Rows 0-3: widen to words by unpacking with zero.*/ \
+  __asm  movd mm4,[_off+SRC4] \
+  __asm  punpcklbw mm1,mm7 \
+  __asm  movd mm5,[_off+SRC4+YSTRIDE] \
+  __asm  punpcklbw mm2,mm7 \
+  __asm  movd mm6,[_off+SRC4+YSTRIDE*2] \
+  __asm  punpcklbw mm3,mm7 \
+  __asm  movd mm7,[_off+SRC4+YSTRIDE3] \
+  __asm  punpcklbw mm4,mm4 /*Rows 4-7: duplicate each byte, then shift right 8.*/ \
+  __asm  punpcklbw mm5,mm5 \
+  __asm  psrlw mm4,8 \
+  __asm  psrlw mm5,8 \
+  __asm  punpcklbw mm6,mm6 \
+  __asm  punpcklbw mm7,mm7 \
+  __asm  psrlw mm6,8 \
+  __asm  psrlw mm7,8 \
+}
+
+/*Performs the first two stages of an 8-point 1-D Hadamard transform.
+  The transform is performed in place, except that outputs 0-3 are swapped with
+   outputs 4-7.
+  Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to
+   perform this stage in place with no temporary registers).*/
+#define OC_HADAMARD_AB_8x4 __asm{ \
+  /*Stage A: \
+    Outputs 0-3 are swapped with 4-7 here.*/ \
+  __asm  paddw mm5,mm1 /*mm5=mm5+mm1*/ \
+  __asm  paddw mm6,mm2 \
+  __asm  paddw mm1,mm1 \
+  __asm  paddw mm2,mm2 \
+  __asm  psubw mm1,mm5 /*mm1=2*mm1-(mm5+mm1), i.e. the old mm1-mm5*/ \
+  __asm  psubw mm2,mm6 \
+  __asm  paddw mm7,mm3 \
+  __asm  paddw mm4,mm0 \
+  __asm  paddw mm3,mm3 \
+  __asm  paddw mm0,mm0 \
+  __asm  psubw mm3,mm7 \
+  __asm  psubw mm0,mm4 \
+   /*Stage B:*/ \
+  __asm  paddw mm0,mm2 \
+  __asm  paddw mm1,mm3 \
+  __asm  paddw mm4,mm6 \
+  __asm  paddw mm5,mm7 \
+  __asm  paddw mm2,mm2 \
+  __asm  paddw mm3,mm3 \
+  __asm  paddw mm6,mm6 \
+  __asm  paddw mm7,mm7 \
+  __asm  psubw mm2,mm0 \
+  __asm  psubw mm3,mm1 \
+  __asm  psubw mm6,mm4 \
+  __asm  psubw mm7,mm5 \
+}
+
+/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
+  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
+   place with no temporary registers).*/
+#define OC_HADAMARD_C_8x4 __asm{ \
+  /*Stage C:*/ \
+  __asm  paddw mm0,mm1 \
+  __asm  paddw mm2,mm3 \
+  __asm  paddw mm4,mm5 \
+  __asm  paddw mm6,mm7 \
+  __asm  paddw mm1,mm1 \
+  __asm  paddw mm3,mm3 \
+  __asm  paddw mm5,mm5 \
+  __asm  paddw mm7,mm7 \
+  __asm  psubw mm1,mm0 \
+  __asm  psubw mm3,mm2 \
+  __asm  psubw mm5,mm4 \
+  __asm  psubw mm7,mm6 \
+}
+
+/*Performs an 8-point 1-D Hadamard transform.
+  The transform is performed in place, except that outputs 0-3 are swapped with
+   outputs 4-7.
+  Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform
+   in place with no temporary registers).*/
+#define OC_HADAMARD_8x4 __asm{ \
+  OC_HADAMARD_AB_8x4 /*Stages A and B.*/ \
+  OC_HADAMARD_C_8x4 /*Stage C.*/ \
+}
+
+/*Performs the first part of the final stage of the Hadamard transform and
+   summing of absolute values.
+  At the end of this part, mm1 will contain the DC coefficient of the
+   transform.*/
+#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) __asm{ \
+  /*We use the fact that \
+      (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
+     to merge the final butterfly with the abs and the first stage of \
+     accumulation. \
+    Thus we can avoid using pabsw, which is not available until SSSE3. \
+    Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
+     implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
+     registers). \
+    Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
+    This implementation is only 26 (+4 for spilling registers).*/ \
+  __asm  movq [_r7+BUF],mm7 /*Spill mm7 and mm6 to free two registers.*/ \
+  __asm  movq [_r6+BUF],mm6 \
+  /*mm7={0x7FFF}x4 \
+    mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
+  __asm  pcmpeqb mm7,mm7 \
+  __asm  movq mm6,mm0 \
+  __asm  psrlw mm7,1 \
+  __asm  paddw mm6,mm1 \
+  __asm  pmaxsw mm0,mm1 \
+  __asm  paddsw mm6,mm7 \
+  __asm  psubw mm0,mm6 \
+  /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
+    mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
+  __asm  movq mm6,mm2 \
+  __asm  movq mm1,mm4 \
+  __asm  pmaxsw mm2,mm3 \
+  __asm  pmaxsw mm4,mm5 \
+  __asm  paddw mm6,mm3 \
+  __asm  paddw mm1,mm5 \
+  __asm  movq mm3,[_r7+BUF] /*Reload the spilled mm7 into mm3.*/ \
+}
+
+/*Performs the second part of the final stage of the Hadamard transform and
+   summing of absolute values.*/
+#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) __asm{ \
+  __asm  paddsw mm6,mm7 \
+  __asm  movq mm5,[_r6+BUF] /*Load the value stored at _r6+BUF into mm5.*/ \
+  __asm  paddsw mm1,mm7 \
+  __asm  psubw mm2,mm6 \
+  __asm  psubw mm4,mm1 \
+  /*mm7={1}x4 (needed for the horizontal add that follows) \
+    mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
+  __asm  movq mm6,mm3 \
+  __asm  pmaxsw mm3,mm5 \
+  __asm  paddw mm0,mm2 \
+  __asm  paddw mm6,mm5 \
+  __asm  paddw mm0,mm4 \
+  __asm  paddsw mm6,mm7 \
+  __asm  paddw mm0,mm3 \
+  __asm  psrlw mm7,14 /*0x7FFF>>14 leaves 1 in each word.*/ \
+  __asm  psubw mm0,mm6 \
+}
+
+/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
+   absolute value of each component, and accumulates everything into mm0.
+  This is the only portion of SATD which requires MMXEXT (we could use plain
+   MMX, but it takes 4 instructions and an extra register to work around the
+   lack of a pmaxsw, which is a pretty serious penalty).*/
+#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
+  OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) /*Spills mm6/mm7 to _r6/_r7+BUF.*/ \
+  OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) /*Leaves mm7={1}x4 on exit.*/ \
+}
+
+/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
+   component, and accumulates everything into mm0.
+  Note that mm0 will have an extra 4 added to each column, and that after
+   removing this value, the remainder will be half the conventional value.*/
+#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
+  OC_HADAMARD_AB_8x4 /*Stages A and B.*/ \
+  OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) /*Stage C merged with abs and sum.*/ \
+}
+
+/*Performs two 4x4 transposes (mostly) in place.
+  On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}
+   contains rows {a,b,c,d}.
+  On output, {0x40,0x50,0x60,0x70}+_off+BUF contains {e,f,g,h}^T, and
+   {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/
+#define OC_TRANSPOSE_4x4x2(_off) __asm{ \
+  /*First 4x4 transpose:*/ \
+  __asm  movq [0x10+_off+BUF],mm5 /*Park mm5 (row b); it is used as a temp.*/ \
+  /*mm0 = e3 e2 e1 e0 \
+    mm1 = f3 f2 f1 f0 \
+    mm2 = g3 g2 g1 g0 \
+    mm3 = h3 h2 h1 h0*/ \
+  __asm  movq mm5,mm2 \
+  __asm  punpcklwd mm2,mm3 \
+  __asm  punpckhwd mm5,mm3 \
+  __asm  movq mm3,mm0 \
+  __asm  punpcklwd mm0,mm1 \
+  __asm  punpckhwd mm3,mm1 \
+  /*mm0 = f1 e1 f0 e0 \
+    mm3 = f3 e3 f2 e2 \
+    mm2 = h1 g1 h0 g0 \
+    mm5 = h3 g3 h2 g2*/ \
+  __asm  movq mm1,mm0 \
+  __asm  punpckldq mm0,mm2 \
+  __asm  punpckhdq mm1,mm2 \
+  __asm  movq mm2,mm3 \
+  __asm  punpckhdq mm3,mm5 \
+  __asm  movq [0x40+_off+BUF],mm0 \
+  __asm  punpckldq mm2,mm5 \
+  /*mm0 = h0 g0 f0 e0 \
+    mm1 = h1 g1 f1 e1 \
+    mm2 = h2 g2 f2 e2 \
+    mm3 = h3 g3 f3 e3*/ \
+  __asm  movq mm5,[0x10+_off+BUF] /*Restore row b.*/ \
+  /*Second 4x4 transpose:*/ \
+  /*mm4 = a3 a2 a1 a0 \
+    mm5 = b3 b2 b1 b0 \
+    mm6 = c3 c2 c1 c0 \
+    mm7 = d3 d2 d1 d0*/ \
+  __asm  movq mm0,mm6 \
+  __asm  punpcklwd mm6,mm7 \
+  __asm  movq [0x50+_off+BUF],mm1 \
+  __asm  punpckhwd mm0,mm7 \
+  __asm  movq mm7,mm4 \
+  __asm  punpcklwd mm4,mm5 \
+  __asm  movq [0x60+_off+BUF],mm2 \
+  __asm  punpckhwd mm7,mm5 \
+  /*mm4 = b1 a1 b0 a0 \
+    mm7 = b3 a3 b2 a2 \
+    mm6 = d1 c1 d0 c0 \
+    mm0 = d3 c3 d2 c2*/ \
+  __asm  movq mm5,mm4 \
+  __asm  punpckldq mm4,mm6 \
+  __asm  movq [0x70+_off+BUF],mm3 \
+  __asm  punpckhdq mm5,mm6 \
+  __asm  movq mm6,mm7 \
+  __asm  punpckhdq mm7,mm0 \
+  __asm  punpckldq mm6,mm0 \
+  /*mm4 = d0 c0 b0 a0 \
+    mm5 = d1 c1 b1 a1 \
+    mm6 = d2 c2 b2 a2 \
+    mm7 = d3 c3 b3 a3*/ \
+}
+
+static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
+ int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){
+  OC_ALIGN8(ogg_int16_t  buf[64]);
+  ogg_int16_t           *bufp;
+  unsigned               ret1;
+  /*bufp lets the asm fetch the scratch buffer address with a plain mov.*/
+  bufp=buf;
+  __asm{ /*SATD of the 8x8 difference _src-_ref, with an early exit at _thresh.*/
+#define SRC esi
+#define REF eax
+#define SRC_YSTRIDE ecx
+#define REF_YSTRIDE edx
+#define BUF edi
+#define RET eax /*Shares eax with REF, which is dead after the last load.*/
+#define RET2 edx /*Shares edx with REF_YSTRIDE, likewise dead by then.*/
+    mov SRC,_src
+    mov SRC_YSTRIDE,_src_ystride
+    mov REF,_ref
+    mov REF_YSTRIDE,_ref_ystride
+    mov BUF,bufp
+    OC_LOAD_SUB_8x4(0x00)
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2(0x00)
+    /*Finish swapping out this 8x4 block to make room for the next one.
+      mm0...mm3 have been swapped out already.*/
+    movq [0x00+BUF],mm4
+    movq [0x10+BUF],mm5
+    movq [0x20+BUF],mm6
+    movq [0x30+BUF],mm7
+    OC_LOAD_SUB_8x4(0x04)
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2(0x08)
+    /*Here the first 4x4 block of output from the last transpose is the second
+       4x4 block of input for the next transform.
+      We have cleverly arranged that it already be in the appropriate place, so
+       we only have to do half the loads.*/
+    movq mm1,[0x10+BUF]
+    movq mm2,[0x20+BUF]
+    movq mm3,[0x30+BUF]
+    movq mm0,[0x00+BUF]
+    OC_HADAMARD_ABS_ACCUM_8x4(0x28,0x38)
+    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
+       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
+       for the factor of two we dropped + 3 for the vertical accumulation).
+      Now we finally have to promote things to dwords.
+      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
+       latency of pmaddwd by starting the next series of loads now.*/
+    mov RET2,_thresh
+    pmaddwd mm0,mm7
+    movq mm1,[0x50+BUF]
+    movq mm5,[0x58+BUF]
+    movq mm4,mm0
+    movq mm2,[0x60+BUF]
+    punpckhdq mm0,mm0
+    movq mm6,[0x68+BUF]
+    paddd mm4,mm0
+    movq mm3,[0x70+BUF]
+    movd RET,mm4
+    movq mm7,[0x78+BUF]
+    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
+       added to them, and a factor of two removed; correct the final sum here.*/
+    lea RET,[RET+RET-32]
+    movq mm0,[0x40+BUF]
+    cmp RET,RET2
+    movq mm4,[0x48+BUF]
+    jae at_end /*Early out: the first half alone already reaches _thresh.*/
+    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
+    pmaddwd mm0,mm7
+    /*There isn't much to stick in here to hide the latency this time, but the
+       alternative to pmaddwd is movq->punpcklwd->punpckhwd->paddd, whose
+       latency is even worse.*/
+    sub RET,32
+    movq mm4,mm0
+    punpckhdq mm0,mm0
+    paddd mm4,mm0
+    movd RET2,mm4
+    lea RET,[RET+RET2*2]
+    align 16
+at_end:
+    mov ret1,RET
+#undef SRC
+#undef REF
+#undef SRC_YSTRIDE
+#undef REF_YSTRIDE
+#undef BUF
+#undef RET
+#undef RET2
+  }
+  return ret1;
+}
+
+unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh){
+  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh);
+} /*_src and _ref share the single stride _ystride.*/
+
+
+/*Writes the per-byte average (_src1+_src2>>1) of two 8x8 blocks to _dst.
+  The extra _dst_ystride lets oc_enc_frag_satd2_thresh_mmxext() share this code.*/
+static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
+  __asm{
+    /*Load the first 3 rows.*/
+#define DST_YSTRIDE edi
+#define SRC_YSTRIDE esi
+#define DST eax
+#define SRC1 edx
+#define SRC2 ecx
+    mov DST_YSTRIDE,_dst_ystride
+    mov SRC_YSTRIDE,_src_ystride
+    mov DST,_dst
+    mov SRC1,_src1
+    mov SRC2,_src2
+    movq mm0,[SRC1]
+    movq mm1,[SRC2]
+    movq mm2,[SRC1+SRC_YSTRIDE]
+    lea SRC1,[SRC1+SRC_YSTRIDE*2]
+    movq mm3,[SRC2+SRC_YSTRIDE]
+    lea SRC2,[SRC2+SRC_YSTRIDE*2]
+    pxor mm7,mm7
+    movq mm4,[SRC1]
+    pcmpeqb mm6,mm6
+    movq mm5,[SRC2]
+    /*mm7={1}x8.*/
+    psubb mm7,mm6
+    /*Start averaging mm0 and mm1 into mm6.*/
+    movq mm6,mm0
+    pxor mm0,mm1
+    pavgb mm6,mm1
+    /*mm1 is free, start averaging mm3 into mm2 using mm1.*/
+    movq mm1,mm2
+    pand mm0,mm7
+    pavgb mm2,mm3
+    pxor mm1,mm3
+    /*mm3 is free.*/
+    psubb mm6,mm0
+    /*mm0 is free, start loading the next row.*/
+    movq mm0,[SRC1+SRC_YSTRIDE]
+    /*Start averaging mm5 and mm4 using mm3.*/
+    movq mm3,mm4
+    /*mm6 [row 0] is done; write it out.*/
+    movq [DST],mm6
+    pand mm1,mm7
+    pavgb mm4,mm5
+    psubb mm2,mm1
+    /*mm1 is free, continue loading the next row.*/
+    movq mm1,[SRC2+SRC_YSTRIDE]
+    pxor mm3,mm5
+    lea SRC1,[SRC1+SRC_YSTRIDE*2]
+    /*mm2 [row 1] is done; write it out.*/
+    movq [DST+DST_YSTRIDE],mm2
+    pand mm3,mm7
+    /*Start loading the next row.*/
+    movq mm2,[SRC1]
+    lea DST,[DST+DST_YSTRIDE*2]
+    psubb mm4,mm3
+    lea SRC2,[SRC2+SRC_YSTRIDE*2]
+    /*mm4 [row 2] is done; write it out.*/
+    movq [DST],mm4
+    /*Continue loading the next row.*/
+    movq mm3,[SRC2]
+    /*Start averaging mm0 and mm1 into mm6.*/
+    movq mm6,mm0
+    pxor mm0,mm1
+    /*Start loading the next row.*/
+    movq mm4,[SRC1+SRC_YSTRIDE]
+    pavgb mm6,mm1
+    /*mm1 is free; start averaging mm3 into mm2 using mm1.*/
+    movq mm1,mm2
+    pand mm0,mm7
+    /*Continue loading the next row.*/
+    movq mm5,[SRC2+SRC_YSTRIDE]
+    pavgb mm2,mm3
+    lea SRC1,[SRC1+SRC_YSTRIDE*2]
+    pxor mm1,mm3
+    /*mm3 is free.*/
+    psubb mm6,mm0
+    /*mm0 is free, start loading the next row.*/
+    movq mm0,[SRC1]
+    /*Start averaging mm5 into mm4 using mm3.*/
+    movq mm3,mm4
+    /*mm6 [row 3] is done; write it out.*/
+    movq [DST+DST_YSTRIDE],mm6
+    pand mm1,mm7
+    lea SRC2,[SRC2+SRC_YSTRIDE*2]
+    pavgb mm4,mm5
+    lea DST,[DST+DST_YSTRIDE*2]
+    psubb mm2,mm1
+    /*mm1 is free; continue loading the next row.*/
+    movq mm1,[SRC2]
+    pxor mm3,mm5
+    /*mm2 [row 4] is done; write it out.*/
+    movq [DST],mm2
+    pand mm3,mm7
+    /*Start loading the next row.*/
+    movq mm2,[SRC1+SRC_YSTRIDE]
+    psubb mm4,mm3
+    /*Start averaging mm0 and mm1 into mm6.*/
+    movq mm6,mm0
+    /*Continue loading the next row.*/
+    movq mm3,[SRC2+SRC_YSTRIDE]
+    /*mm4 [row 5] is done; write it out.*/
+    movq [DST+DST_YSTRIDE],mm4
+    pxor mm0,mm1
+    pavgb mm6,mm1
+    /*mm4 is free; start averaging mm3 into mm2 using mm4.*/
+    movq mm4,mm2
+    pand mm0,mm7
+    pavgb mm2,mm3
+    pxor mm4,mm3
+    lea DST,[DST+DST_YSTRIDE*2]
+    psubb mm6,mm0
+    pand mm4,mm7
+    /*mm6 [row 6] is done, write it out.*/
+    movq [DST],mm6
+    psubb mm2,mm4
+    /*mm2 [row 7] is done, write it out.*/
+    movq [DST+DST_YSTRIDE],mm2
+#undef SRC1
+#undef SRC2
+#undef SRC_YSTRIDE
+#undef DST_YSTRIDE
+#undef DST
+  }
+}
+
+unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh){
+  OC_ALIGN8(unsigned char avg[64]); /*Scratch 8x8 block with a row stride of 8.*/
+  oc_int_frag_copy2_mmxext(avg,8,_ref1,_ref2,_ystride);
+  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,avg,8,_thresh);
+}
+
+unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
+ int _ystride){
+  OC_ALIGN8(ogg_int16_t  buf[64]);
+  ogg_int16_t           *bufp;
+  unsigned               ret1;
+  /*RET/RET2 below are registers; only ret1 carries the result back to C.*/
+  bufp=buf;
+  __asm{
+#define SRC eax
+#define SRC4 esi
+#define BUF edi
+#define RET eax
+#define RET_WORD ax
+#define RET2 ecx
+#define YSTRIDE edx
+#define YSTRIDE3 ecx
+    mov SRC,_src
+    mov BUF,bufp
+    mov YSTRIDE,_ystride
+    /* src4 = src+4*ystride */
+    lea SRC4,[SRC+YSTRIDE*4]
+    /* ystride3 = 3*ystride */
+    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
+    OC_LOAD_8x4(0x00)
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2(0x00)
+    /*Finish swapping out this 8x4 block to make room for the next one.
+      mm0...mm3 have been swapped out already.*/
+    movq [0x00+BUF],mm4
+    movq [0x10+BUF],mm5
+    movq [0x20+BUF],mm6
+    movq [0x30+BUF],mm7
+    OC_LOAD_8x4(0x04)
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2(0x08)
+    /*Here the first 4x4 block of output from the last transpose is the second
+      4x4 block of input for the next transform.
+      We have cleverly arranged that it already be in the appropriate place, so
+      we only have to do half the loads.*/
+    movq mm1,[0x10+BUF]
+    movq mm2,[0x20+BUF]
+    movq mm3,[0x30+BUF]
+    movq mm0,[0x00+BUF]
+    /*We split out the stages here so we can save the DC coefficient in the
+      middle.*/
+    OC_HADAMARD_AB_8x4
+    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
+    movd RET,mm1
+    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
+    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
+      difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
+      for the factor of two we dropped + 3 for the vertical accumulation).
+      Now we finally have to promote things to dwords.
+      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
+      latency of pmaddwd by starting the next series of loads now.*/
+    pmaddwd mm0,mm7
+    movq mm1,[0x50+BUF]
+    movq mm5,[0x58+BUF]
+    movq mm2,[0x60+BUF]
+    movq mm4,mm0
+    movq mm6,[0x68+BUF]
+    punpckhdq mm0,mm0
+    movq mm3,[0x70+BUF]
+    paddd mm4,mm0
+    movq mm7,[0x78+BUF]
+    movd RET2,mm4
+    movq mm0,[0x40+BUF]
+    movq mm4,[0x48+BUF]
+    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
+    pmaddwd mm0,mm7
+    /*We assume that the DC coefficient is always positive (which is true,
+    because the input to the INTRA transform was not a difference).*/
+    movzx RET,RET_WORD
+    add RET2,RET2
+    sub RET2,RET
+    movq mm4,mm0
+    punpckhdq mm0,mm0
+    paddd mm4,mm0
+    movd RET,mm4
+    lea RET,[-64+RET2+RET*2]
+    mov [ret1],RET
+#undef SRC
+#undef SRC4
+#undef BUF
+#undef RET
+#undef RET_WORD
+#undef RET2
+#undef YSTRIDE
+#undef YSTRIDE3
+  }
+  return ret1;
+}
+
+void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
+ const unsigned char *_src, const unsigned char *_ref,int _ystride){
+  int i;
+  __asm  pxor mm7,mm7
+  for(i=4;i-->0;){ /*Two rows per pass; pointers are stored back each pass.*/
+    __asm{
+#define SRC edx
+#define YSTRIDE esi
+#define RESIDUE eax
+#define REF ecx
+      mov YSTRIDE,_ystride
+      mov RESIDUE,_residue
+      mov SRC,_src
+      mov REF,_ref
+      /*mm0=[src]*/
+      movq mm0,[SRC]
+      /*mm1=[ref]*/
+      movq mm1,[REF]
+      /*mm4=[src+ystride]*/
+      movq mm4,[SRC+YSTRIDE]
+      /*mm5=[ref+ystride]*/
+      movq mm5,[REF+YSTRIDE]
+      /*Compute [src]-[ref], widening bytes to words against mm7=0.*/
+      movq mm2,mm0
+      punpcklbw mm0,mm7
+      movq mm3,mm1
+      punpckhbw mm2,mm7
+      punpcklbw mm1,mm7
+      punpckhbw mm3,mm7
+      psubw mm0,mm1
+      psubw mm2,mm3
+      /*Compute [src+ystride]-[ref+ystride].*/
+      movq mm1,mm4
+      punpcklbw mm4,mm7
+      movq mm3,mm5
+      punpckhbw mm1,mm7
+      lea SRC,[SRC+YSTRIDE*2]
+      punpcklbw mm5,mm7
+      lea REF,[REF+YSTRIDE*2]
+      punpckhbw mm3,mm7
+      psubw mm4,mm5
+      psubw mm1,mm3
+      /*Write the two rows of 16-bit differences out (32 bytes).*/
+      movq [RESIDUE+0x00],mm0
+      movq [RESIDUE+0x08],mm2
+      movq [RESIDUE+0x10],mm4
+      movq [RESIDUE+0x18],mm1
+      lea RESIDUE,[RESIDUE+0x20]
+      mov _residue,RESIDUE
+      mov _src,SRC
+      mov _ref,REF
+#undef SRC
+#undef YSTRIDE
+#undef RESIDUE
+#undef REF
+    }
+  }
+}
+
+void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
+ const unsigned char *_src,int _ystride){
+   __asm{
+#define YSTRIDE edx
+#define YSTRIDE3 edi
+#define RESIDUE ecx
+#define SRC eax
+    mov YSTRIDE,_ystride
+    mov RESIDUE,_residue
+    mov SRC,_src
+    /*mm0=[src]*/
+    movq mm0,[SRC]
+    /*mm1=[src+ystride]*/
+    movq mm1,[SRC+YSTRIDE]
+    /*mm6={-1}x4*/
+    pcmpeqw mm6,mm6
+    /*mm2=[src+2*ystride]*/
+    movq mm2,[SRC+YSTRIDE*2]
+    /*[ystride3]=3*[ystride]*/
+    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
+    /*mm6={0x8000}x4*/
+    psllw mm6,15
+    /*mm3=[src+3*ystride]*/
+    movq mm3,[SRC+YSTRIDE3]
+    /*mm6={128}x4*/
+    psrlw mm6,8
+    /*mm7=0*/ 
+    pxor mm7,mm7
+    /*[src]=[src]+4*[ystride]*/
+    lea SRC,[SRC+YSTRIDE*4]
+    /*Compute [src]-128 and [src+ystride]-128*/
+    movq mm4,mm0
+    punpcklbw mm0,mm7
+    movq mm5,mm1
+    punpckhbw mm4,mm7
+    psubw mm0,mm6
+    punpcklbw mm1,mm7
+    psubw mm4,mm6
+    punpckhbw mm5,mm7
+    psubw mm1,mm6
+    psubw mm5,mm6
+    /*Write the answer out.*/
+    movq [RESIDUE+0x00],mm0
+    movq [RESIDUE+0x08],mm4
+    movq [RESIDUE+0x10],mm1
+    movq [RESIDUE+0x18],mm5
+    /*mm0=[src+4*ystride]*/
+    movq mm0,[SRC]
+    /*mm1=[src+5*ystride]*/
+    movq mm1,[SRC+YSTRIDE]
+    /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/
+    movq mm4,mm2
+    punpcklbw mm2,mm7
+    movq mm5,mm3
+    punpckhbw mm4,mm7
+    psubw mm2,mm6
+    punpcklbw mm3,mm7
+    psubw mm4,mm6
+    punpckhbw mm5,mm7
+    psubw mm3,mm6
+    psubw mm5,mm6
+    /*Write the answer out.*/
+    movq [RESIDUE+0x20],mm2
+    movq [RESIDUE+0x28],mm4
+    movq [RESIDUE+0x30],mm3
+    movq [RESIDUE+0x38],mm5
+    /*Load rows 6 and 7; compute [src+4*ystride]-128 and [src+5*ystride]-128*/
+    movq mm2,[SRC+YSTRIDE*2]
+    movq mm3,[SRC+YSTRIDE3]
+    movq mm4,mm0
+    punpcklbw mm0,mm7
+    movq mm5,mm1
+    punpckhbw mm4,mm7
+    psubw mm0,mm6
+    punpcklbw mm1,mm7
+    psubw mm4,mm6
+    punpckhbw mm5,mm7
+    psubw mm1,mm6
+    psubw mm5,mm6
+    /*Write the answer out.*/
+    movq [RESIDUE+0x40],mm0
+    movq [RESIDUE+0x48],mm4
+    movq [RESIDUE+0x50],mm1
+    movq [RESIDUE+0x58],mm5
+    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
+    movq mm4,mm2
+    punpcklbw mm2,mm7
+    movq mm5,mm3
+    punpckhbw mm4,mm7
+    psubw mm2,mm6
+    punpcklbw mm3,mm7
+    psubw mm4,mm6
+    punpckhbw mm5,mm7
+    psubw mm3,mm6
+    psubw mm5,mm6
+    /*Write the answer out.*/
+    movq [RESIDUE+0x60],mm2
+    movq [RESIDUE+0x68],mm4
+    movq [RESIDUE+0x70],mm3
+    movq [RESIDUE+0x78],mm5
+#undef YSTRIDE
+#undef YSTRIDE3
+#undef RESIDUE
+#undef SRC
+  }
+}
+
+void oc_enc_frag_copy2_mmxext(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride){ /*_dst shares the source stride.*/
+  oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride);
+}
+
+#endif
diff --git a/lib/x86_vc/mmxfdct.c b/lib/x86_vc/mmxfdct.c
new file mode 100644
index 0000000..dcf17c9
--- /dev/null
+++ b/lib/x86_vc/mmxfdct.c
@@ -0,0 +1,670 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************/ 
+ /*MMX fDCT implementation for x86_32*/
+/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
+#include "x86enc.h"
+
+#if defined(OC_X86_ASM)
+
+#define OC_FDCT_STAGE1_8x4  __asm{ \
+  /*Stage 1: butterflies on mm0...mm7 (t0...t7), results left in place.*/ \
+  /*mm0=t7'=t0-t7*/ \
+  __asm  psubw mm0,mm7 \
+  __asm  paddw mm7,mm7 \
+  /*mm1=t6'=t1-t6*/ \
+  __asm  psubw mm1, mm6 \
+  __asm  paddw mm6,mm6 \
+  /*mm2=t5'=t2-t5*/ \
+  __asm  psubw mm2,mm5 \
+  __asm  paddw mm5,mm5 \
+  /*mm3=t4'=t3-t4*/ \
+  __asm  psubw mm3,mm4 \
+  __asm  paddw mm4,mm4 \
+  /*mm7=t0'=t0+t7*/ \
+  __asm  paddw mm7,mm0 \
+  /*mm6=t1'=t1+t6*/  \
+  __asm  paddw mm6,mm1 \
+  /*mm5=t2'=t2+t5*/ \
+  __asm  paddw mm5,mm2 \
+  /*mm4=t3'=t3+t4*/ \
+  __asm  paddw mm4,mm3\
+}
+
+#define OC_FDCT8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm{ \
+  /*Stage 2 (the [Y+_rN] slots double as spill space until the end):*/ \
+  /*mm7=t3''=t0'-t3'*/ \
+  __asm  psubw mm7,mm4 \
+  __asm  paddw mm4,mm4 \
+  /*mm6=t2''=t1'-t2'*/ \
+  __asm  psubw mm6,mm5 \
+  __asm  movq [Y+_r6],mm7 \
+  __asm  paddw mm5,mm5 \
+  /*mm1=t5''=t6'-t5'*/ \
+  __asm  psubw mm1,mm2 \
+  __asm  movq [Y+_r2],mm6 \
+  /*mm4=t0''=t0'+t3'*/ \
+  __asm  paddw mm4,mm7 \
+  __asm  paddw mm2,mm2 \
+  /*mm5=t1''=t1'+t2'*/ \
+  __asm  movq [Y+_r0],mm4 \
+  __asm  paddw mm5,mm6 \
+  /*mm2=t6''=t6'+t5'*/ \
+  __asm  paddw mm2,mm1 \
+  __asm  movq [Y+_r4],mm5 \
+  /*mm0=t7', mm1=t5'', mm2=t6'', mm3=t4'.*/ \
+  /*mm4, mm5, mm6, mm7 are free.*/ \
+  /*Stage 3:*/ \
+  /*mm6={2}x4, mm7={27146,0xB500>>1}x2*/ \
+  __asm  mov A,0x5A806A0A \
+  __asm  pcmpeqb mm6,mm6 \
+  __asm  movd mm7,A \
+  __asm  psrlw mm6,15 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddw mm6,mm6 \
+  /*mm0=0, mm2={-1}x4 \
+    mm5:mm4=t5''*27146+0xB500*/ \
+  __asm  movq mm4,mm1 \
+  __asm  movq mm5,mm1 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  movq [Y+_r3],mm2 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  movq [Y+_r7],mm0 \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  pxor mm0,mm0 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pcmpeqb mm2,mm2 \
+  /*mm2=t6'', mm1=t5''+(t5''!=0) \
+    mm4=(t5''*27146+0xB500>>16)*/ \
+  __asm  pcmpeqw mm0,mm1 \
+  __asm  psrad mm4,16 \
+  __asm  psubw mm0,mm2 \
+  __asm  movq mm2, [Y+_r3] \
+  __asm  psrad mm5,16 \
+  __asm  paddw mm1,mm0 \
+  __asm  packssdw mm4,mm5 \
+  /*mm4=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
+  __asm  paddw mm4,mm1 \
+  __asm  movq mm0, [Y+_r7] \
+  __asm  psraw mm4,1 \
+  __asm  movq mm1,mm3 \
+  /*mm3=t4''=t4'+s*/ \
+  __asm  paddw mm3,mm4 \
+  /*mm1=t5'''=t4'-s*/ \
+  __asm  psubw mm1,mm4 \
+  /*mm1=0, mm3={-1}x4 \
+    mm5:mm4=t6''*27146+0xB500*/ \
+  __asm  movq mm4,mm2 \
+  __asm  movq mm5,mm2 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  movq [Y+_r5],mm1 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  movq [Y+_r1],mm3 \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  pxor mm1,mm1 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pcmpeqb mm3,mm3 \
+  /*mm2=t6''+(t6''!=0), mm4=(t6''*27146+0xB500>>16)*/ \
+  __asm  psrad mm4,16 \
+  __asm  pcmpeqw mm1,mm2 \
+  __asm  psrad mm5,16 \
+  __asm  psubw mm1,mm3 \
+  __asm  packssdw mm4,mm5 \
+  __asm  paddw mm2,mm1 \
+  /*mm1=t1'' \
+    mm4=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
+  __asm  paddw mm4,mm2 \
+  __asm  movq mm1,[Y+_r4] \
+  __asm  psraw mm4,1 \
+  __asm  movq mm2,mm0 \
+  /*mm7 is still {27146,0xB500>>1}x2 \
+    mm0=t7''=t7'+s*/ \
+  __asm  paddw mm0,mm4 \
+  /*mm2=t6'''=t7'-s*/ \
+  __asm  psubw mm2,mm4 \
+  /*Stage 4:*/ \
+  /*mm0=0, mm2=t0'' \
+    mm5:mm4=t1''*27146+0xB500*/ \
+  __asm  movq mm4,mm1 \
+  __asm  movq mm5,mm1 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  movq [Y+_r3],mm2 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  movq mm2,[Y+_r0] \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  movq [Y+_r7],mm0 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pxor mm0,mm0 \
+  /*mm7={27146,0x4000>>1}x2 \
+    mm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
+  __asm  psrad mm4,16 \
+  __asm  mov A,0x20006A0A \
+  __asm  pcmpeqw mm0,mm1 \
+  __asm  movd mm7,A \
+  __asm  psrad mm5,16 \
+  __asm  psubw mm0,mm3 \
+  __asm  packssdw mm4,mm5 \
+  __asm  paddw mm0,mm1 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddw mm0,mm4 \
+  /*mm6={0x00000E3D}x2 \
+    mm1=-(t0''==0), mm5:mm4=t0''*27146+0x4000*/ \
+  __asm  movq mm4,mm2 \
+  __asm  movq mm5,mm2 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  mov A,0x0E3D \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  movd mm6,A \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pxor mm1,mm1 \
+  __asm  punpckldq mm6,mm6 \
+  __asm  pcmpeqw mm1,mm2 \
+  /*mm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
+  __asm  psrad mm4,16 \
+  __asm  psubw mm1,mm3 \
+  __asm  psrad mm5,16 \
+  __asm  paddw mm2,mm1 \
+  __asm  packssdw mm4,mm5 \
+  __asm  movq mm1,[Y+_r5] \
+  __asm  paddw mm4,mm2 \
+  /*mm2=t6'', mm0=_y[0]=u=r+s>>1 \
+    The naive implementation could cause overflow, so we use \
+     u=(r&s)+((r^s)>>1).*/ \
+  __asm  movq mm2,[Y+_r3] \
+  __asm  movq mm7,mm0 \
+  __asm  pxor mm0,mm4 \
+  __asm  pand mm7,mm4 \
+  __asm  psraw mm0,1 \
+  __asm  mov A,0x7FFF54DC \
+  __asm  paddw mm0,mm7 \
+  __asm  movd mm7,A \
+  /*mm7={54491-0x7FFF,0x7FFF}x2 \
+    mm4=_y[4]=v=r-u*/ \
+  __asm  psubw mm4,mm0 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  movq [Y+_r4],mm4 \
+  /*mm0=0, mm7={36410}x4 \
+    mm1=(t5'''!=0), mm5:mm4=54491*t5'''+0x0E3D*/ \
+  __asm  movq mm4,mm1 \
+  __asm  movq mm5,mm1 \
+  __asm  punpcklwd mm4,mm1 \
+  __asm  mov A,0x8E3A8E3A \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  movq [Y+_r0],mm0 \
+  __asm  punpckhwd mm5,mm1 \
+  __asm  pxor mm0,mm0 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pcmpeqw mm1,mm0 \
+  __asm  movd mm7,A \
+  __asm  psubw mm1,mm3 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddd mm4,mm6 \
+  __asm  paddd mm5,mm6 \
+  /*mm0=0 \
+    mm3:mm1=36410*t6'''+((t5'''!=0)<<16)*/ \
+  __asm  movq mm6,mm2 \
+  __asm  movq mm3,mm2 \
+  __asm  pmulhw mm6,mm7 \
+  __asm  paddw mm1,mm2 \
+  __asm  pmullw mm3,mm7 \
+  __asm  pxor mm0,mm0 \
+  __asm  paddw mm6,mm1 \
+  __asm  movq mm1,mm3 \
+  __asm  punpckhwd mm3,mm6 \
+  __asm  punpcklwd mm1,mm6 \
+  /*mm3={-1}x4, mm6={1}x4 \
+    mm4=_y[5]=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
+  __asm  paddd mm5,mm3 \
+  __asm  paddd mm4,mm1 \
+  __asm  psrad mm5,16 \
+  __asm  pxor mm6,mm6 \
+  __asm  psrad mm4,16 \
+  __asm  pcmpeqb mm3,mm3 \
+  __asm  packssdw mm4,mm5 \
+  __asm  psubw mm6,mm3 \
+  /*mm1=t7'', mm7={26568,0x3400}x2 \
+    mm2=s=t6'''-(36410*u>>16)*/ \
+  __asm  movq mm1,mm4 \
+  __asm  mov A,0x340067C8 \
+  __asm  pmulhw mm4,mm7 \
+  __asm  movd mm7,A \
+  __asm  movq [Y+_r5],mm1 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddw mm4,mm1 \
+  __asm  movq mm1,[Y+_r7] \
+  __asm  psubw mm2,mm4 \
+  /*mm6={0x00007B1B}x2 \
+    mm0=(s!=0), mm5:mm4=s*26568+0x3400*/ \
+  __asm  movq mm4,mm2 \
+  __asm  movq mm5,mm2 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  pcmpeqw mm0,mm2 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  mov A,0x7B1B \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  movd mm6,A \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  psubw mm0,mm3 \
+  __asm  punpckldq mm6,mm6 \
+  /*mm7={64277-0x7FFF,0x7FFF}x2 \
+    mm2=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
+  __asm  psrad mm4,17 \
+  __asm  paddw mm2,mm0 \
+  __asm  psrad mm5,17 \
+  __asm  mov A,0x7FFF7B16 \
+  __asm  packssdw mm4,mm5 \
+  __asm  movd mm7,A \
+  __asm  paddw mm2,mm4 \
+  __asm  punpckldq mm7,mm7 \
+  /*mm0=0, mm7={12785}x4 \
+    mm1=(t7''!=0), mm2=t4'', mm5:mm4=64277*t7''+0x7B1B*/ \
+  __asm  movq mm4,mm1 \
+  __asm  movq mm5,mm1 \
+  __asm  movq [Y+_r3],mm2 \
+  __asm  punpcklwd mm4,mm1 \
+  __asm  movq mm2,[Y+_r1] \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  mov A,0x31F131F1 \
+  __asm  punpckhwd mm5,mm1 \
+  __asm  pxor mm0,mm0 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pcmpeqw mm1,mm0 \
+  __asm  movd mm7,A \
+  __asm  psubw mm1,mm3 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddd mm4,mm6 \
+  __asm  paddd mm5,mm6 \
+  /*mm3:mm1=12785*t4''+((t7''!=0)<<16)*/ \
+  __asm  movq mm6,mm2 \
+  __asm  movq mm3,mm2 \
+  __asm  pmulhw mm6,mm7 \
+  __asm  pmullw mm3,mm7 \
+  __asm  paddw mm6,mm1 \
+  __asm  movq mm1,mm3 \
+  __asm  punpckhwd mm3,mm6 \
+  __asm  punpcklwd mm1,mm6 \
+  /*mm3={-1}x4, mm6={1}x4 \
+    mm4=_y[1]=u=(12785*t4''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
+  __asm  paddd mm5,mm3 \
+  __asm  paddd mm4,mm1 \
+  __asm  psrad mm5,16 \
+  __asm  pxor mm6,mm6 \
+  __asm  psrad mm4,16 \
+  __asm  pcmpeqb mm3,mm3 \
+  __asm  packssdw mm4,mm5 \
+  __asm  psubw mm6,mm3 \
+  /*mm1=t3'', mm7={20539,0x3000}x2 \
+    mm4=s=(12785*u>>16)-t4''*/ \
+  __asm  movq [Y+_r1],mm4 \
+  __asm  pmulhw mm4,mm7 \
+  __asm  mov A,0x3000503B \
+  __asm  movq mm1,[Y+_r6] \
+  __asm  movd mm7,A \
+  __asm  psubw mm4,mm2 \
+  __asm  punpckldq mm7,mm7 \
+  /*mm6={0x00006CB7}x2 \
+    mm0=(s!=0), mm5:mm4=s*20539+0x3000*/ \
+  __asm  movq mm5,mm4 \
+  __asm  movq mm2,mm4 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  pcmpeqw mm0,mm2 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  mov A,0x6CB7 \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  movd mm6,A \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  psubw mm0,mm3 \
+  __asm  punpckldq mm6,mm6 \
+  /*mm7={60547-0x7FFF,0x7FFF}x2 \
+    mm2=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
+  __asm  psrad mm4,20 \
+  __asm  paddw mm2,mm0 \
+  __asm  psrad mm5,20 \
+  __asm  mov A,0x7FFF6C84 \
+  __asm  packssdw mm4,mm5 \
+  __asm  movd mm7,A \
+  __asm  paddw mm2,mm4 \
+  __asm  punpckldq mm7,mm7 \
+  /*mm0=0, mm7={25080}x4 \
+    mm2=t2'', mm5:mm4=60547*t3''+0x6CB7*/ \
+  __asm  movq mm4,mm1 \
+  __asm  movq mm5,mm1 \
+  __asm  movq [Y+_r7],mm2 \
+  __asm  punpcklwd mm4,mm1 \
+  __asm  movq mm2,[Y+_r2] \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  mov A,0x61F861F8 \
+  __asm  punpckhwd mm5,mm1 \
+  __asm  pxor mm0,mm0 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  movd mm7,A \
+  __asm  pcmpeqw mm1,mm0 \
+  __asm  psubw mm1,mm3 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddd mm4,mm6 \
+  __asm  paddd mm5,mm6 \
+  /*mm3:mm1=25080*t2''+((t3''!=0)<<16)*/ \
+  __asm  movq mm6,mm2 \
+  __asm  movq mm3,mm2 \
+  __asm  pmulhw mm6,mm7 \
+  __asm  pmullw mm3,mm7 \
+  __asm  paddw mm6,mm1 \
+  __asm  movq mm1,mm3 \
+  __asm  punpckhwd mm3,mm6 \
+  __asm  punpcklwd mm1,mm6 \
+  /*mm1={-1}x4 \
+    mm4=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
+  __asm  paddd mm5,mm3 \
+  __asm  paddd mm4,mm1 \
+  __asm  psrad mm5,16 \
+  __asm  mov A,0x28005460 \
+  __asm  psrad mm4,16 \
+  __asm  pcmpeqb mm1,mm1 \
+  __asm  packssdw mm4,mm5 \
+  /*mm5={1}x4, mm6=_y[2]=u, mm7={21600,0x2800}x2 \
+    mm4=s=(25080*u>>16)-t2''*/ \
+  __asm  movq mm6,mm4 \
+  __asm  pmulhw mm4,mm7 \
+  __asm  pxor mm5,mm5 \
+  __asm  movd mm7,A \
+  __asm  psubw mm5,mm1 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  psubw mm4,mm2 \
+  /*mm2=s+(s!=0) \
+    mm4:mm3=s*21600+0x2800*/ \
+  __asm  movq mm3,mm4 \
+  __asm  movq mm2,mm4 \
+  __asm  punpckhwd mm4,mm5 \
+  __asm  pcmpeqw mm0,mm2 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  psubw mm0,mm1 \
+  __asm  punpcklwd mm3,mm5 \
+  __asm  paddw mm2,mm0 \
+  __asm  pmaddwd mm3,mm7 \
+  /*mm0=_y[4], mm1=_y[7], mm4=_y[0], mm5=_y[5] \
+    mm3=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
+  __asm  movq mm0,[Y+_r4] \
+  __asm  psrad mm4,18 \
+  __asm  movq mm5,[Y+_r5] \
+  __asm  psrad mm3,18 \
+  __asm  movq mm1,[Y+_r7] \
+  __asm  packssdw mm3,mm4 \
+  __asm  movq mm4,[Y+_r0] \
+  __asm  paddw mm3,mm2 \
+}
+
+/*On input, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7];
+   _y[1] and _y[3] are reloaded from [Y+_r1] and [Y+_r3]; ^T below=transposed.
+  Output: {[Y+_r4],mm1,mm2,mm3}=_y[4...7]^T, {mm4,mm5,mm6,mm7}=_y[0...3]^T.*/
+#define OC_TRANSPOSE8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm{ \
+  /*First 4x4 transpose:*/ \
+  /*mm0 = e3 e2 e1 e0 \
+    mm5 = f3 f2 f1 f0 \
+    mm3 = g3 g2 g1 g0 \
+    mm1 = h3 h2 h1 h0*/ \
+  __asm  movq mm2,mm0 \
+  __asm  punpcklwd mm0,mm5 \
+  __asm  punpckhwd mm2,mm5 \
+  __asm  movq mm5,mm3 \
+  __asm  punpcklwd mm3,mm1 \
+  __asm  punpckhwd mm5,mm1 \
+  /*mm0 = f1 e1 f0 e0 \
+    mm2 = f3 e3 f2 e2 \
+    mm3 = h1 g1 h0 g0 \
+    mm5 = h3 g3 h2 g2*/ \
+  __asm  movq mm1,mm0 \
+  __asm  punpckldq mm0,mm3 \
+  __asm  movq [Y+_r4],mm0 \
+  __asm  punpckhdq mm1,mm3 \
+  __asm  movq mm0,[Y+_r1] \
+  __asm  movq mm3,mm2 \
+  __asm  punpckldq mm2,mm5 \
+  __asm  punpckhdq mm3,mm5 \
+  __asm  movq mm5,[Y+_r3] \
+  /*_y[4] = h0 g0 f0 e0 \
+   mm1  = h1 g1 f1 e1 \
+   mm2  = h2 g2 f2 e2 \
+   mm3  = h3 g3 f3 e3*/ \
+  /*Second 4x4 transpose:*/ \
+  /*mm4 = a3 a2 a1 a0 \
+    mm0 = b3 b2 b1 b0 \
+    mm6 = c3 c2 c1 c0 \
+    mm5 = d3 d2 d1 d0*/ \
+  __asm  movq mm7,mm4 \
+  __asm  punpcklwd mm4,mm0 \
+  __asm  punpckhwd mm7,mm0 \
+  __asm  movq mm0,mm6 \
+  __asm  punpcklwd mm6,mm5 \
+  __asm  punpckhwd mm0,mm5 \
+  /*mm4 = b1 a1 b0 a0 \
+    mm7 = b3 a3 b2 a2 \
+    mm6 = d1 c1 d0 c0 \
+    mm0 = d3 c3 d2 c2*/ \
+  __asm  movq mm5,mm4 \
+  __asm  punpckldq mm4,mm6 \
+  __asm  punpckhdq mm5,mm6 \
+  __asm  movq mm6,mm7 \
+  __asm  punpckhdq mm7,mm0 \
+  __asm  punpckldq mm6,mm0 \
+  /*mm4 = d0 c0 b0 a0 \
+    mm5 = d1 c1 b1 a1 \
+    mm6 = d2 c2 b2 a2 \
+    mm7 = d3 c3 b3 a3*/ \
+}
+
+/*MMX implementation of the fDCT.*/
+void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+  /*No C locals are needed: A (ecx) is the scratch register for constants.*/
+  __asm{
+#define Y eax
+#define A ecx
+#define X edx
+    /*Add two extra bits of working precision to improve accuracy; any more and
+       we could overflow.*/
+    /*We also add biases to correct for some systematic error that remains in
+       the full fDCT->iDCT round trip.*/
+    mov X, _x
+    mov Y, _y
+    movq mm0,[0x00+X]
+    movq mm1,[0x10+X]
+    movq mm2,[0x20+X]
+    movq mm3,[0x30+X]
+    pcmpeqb mm4,mm4
+    pxor mm7,mm7
+    movq mm5,mm0
+    psllw mm0,2
+    pcmpeqw mm5,mm7
+    movq mm7,[0x70+X]
+    psllw mm1,2
+    psubw mm5,mm4
+    psllw mm2,2
+    mov A,1
+    pslld mm5,16
+    movd mm6,A
+    psllq mm5,16
+    mov A,0x10001
+    psllw mm3,2
+    movd mm4,A
+    punpckhwd mm5,mm6
+    psubw mm1,mm6
+    movq mm6,[0x60+X]
+    paddw mm0,mm5
+    movq mm5,[0x50+X]
+    paddw mm0,mm4
+    movq mm4,[0x40+X]
+    /*We inline stage1 of the transform here so we can get better instruction
+       scheduling with the shifts.*/
+    /*mm0=t7'=t0-t7*/
+    psllw mm7,2
+    psubw mm0,mm7
+    psllw mm6,2
+    paddw mm7,mm7
+    /*mm1=t6'=t1-t6*/
+    psllw mm5,2
+    psubw mm1,mm6
+    psllw mm4,2
+    paddw mm6,mm6
+    /*mm2=t5'=t2-t5*/
+    psubw mm2,mm5
+    paddw mm5,mm5
+    /*mm3=t4'=t3-t4*/
+    psubw mm3,mm4
+    paddw mm4,mm4
+    /*mm7=t0'=t0+t7*/
+    paddw mm7,mm0
+    /*mm6=t1'=t1+t6*/
+    paddw mm6,mm1
+    /*mm5=t2'=t2+t5*/
+    paddw mm5,mm2
+    /*mm4=t3'=t3+t4*/
+    paddw mm4,mm3
+    OC_FDCT8x4(0x00,0x10,0x20,0x30,0x40,0x50,0x60,0x70)
+    OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x40,0x50,0x60,0x70)
+    /*Swap out this 8x4 block for the next one.*/
+    movq mm0,[0x08+X]
+    movq [0x30+Y],mm7
+    movq mm7,[0x78+X]
+    movq [0x50+Y],mm1
+    movq mm1,[0x18+X]
+    movq [0x20+Y],mm6
+    movq mm6,[0x68+X]
+    movq [0x60+Y],mm2
+    movq mm2,[0x28+X]
+    movq [0x10+Y],mm5
+    movq mm5,[0x58+X]
+    movq [0x70+Y],mm3
+    movq mm3,[0x38+X]
+    /*And increase its working precision, too.*/
+    psllw mm0,2
+    movq [0x00+Y],mm4
+    psllw mm7,2
+    movq mm4,[0x48+X]
+    /*We inline stage1 of the transform here so we can get better instruction
+       scheduling with the shifts.*/
+    /*mm0=t7'=t0-t7*/
+    psubw mm0,mm7
+    psllw mm1,2
+    paddw mm7,mm7
+    psllw mm6,2
+    /*mm1=t6'=t1-t6*/
+    psubw mm1,mm6
+    psllw mm2,2
+    paddw mm6,mm6
+    psllw mm5,2
+    /*mm2=t5'=t2-t5*/
+    psubw mm2,mm5
+    psllw mm3,2
+    paddw mm5,mm5
+    psllw mm4,2
+    /*mm3=t4'=t3-t4*/
+    psubw mm3,mm4
+    paddw mm4,mm4
+    /*mm7=t0'=t0+t7*/
+    paddw mm7,mm0
+    /*mm6=t1'=t1+t6*/
+    paddw mm6,mm1
+    /*mm5=t2'=t2+t5*/
+    paddw mm5,mm2
+    /*mm4=t3'=t3+t4*/
+    paddw mm4,mm3
+    OC_FDCT8x4(0x08,0x18,0x28,0x38,0x48,0x58,0x68,0x78)
+    OC_TRANSPOSE8x4(0x08,0x18,0x28,0x38,0x48,0x58,0x68,0x78)
+    /*Here the first 4x4 block of output from the last transpose is the second
+       4x4 block of input for the next transform.
+      We have cleverly arranged that it already be in the appropriate place,
+       so we only have to do half the stores and loads.*/
+    movq mm0,[0x00+Y]
+    movq [0x58+Y],mm1
+    movq mm1,[0x10+Y]
+    movq [0x68+Y],mm2
+    movq mm2,[0x20+Y]
+    movq [0x78+Y],mm3
+    movq mm3,[0x30+Y]
+    OC_FDCT_STAGE1_8x4
+    OC_FDCT8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
+    OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
+    /*mm0={-2}x4*/
+    pcmpeqw mm0,mm0
+    paddw mm0,mm0
+    /*Round the results.*/
+    psubw mm1,mm0
+    psubw mm2,mm0
+    psraw mm1,2
+    psubw mm3,mm0
+    movq [0x18+Y],mm1
+    psraw mm2,2
+    psubw mm4,mm0
+    movq mm1,[0x08+Y]
+    psraw mm3,2
+    psubw mm5,mm0
+    psraw mm4,2
+    psubw mm6,mm0
+    psraw mm5,2
+    psubw mm7,mm0
+    psraw mm6,2
+    psubw mm1,mm0
+    psraw mm7,2
+    movq mm0,[0x40+Y]
+    psraw mm1,2
+    movq [0x30+Y],mm7
+    movq mm7,[0x78+Y]
+    movq [0x08+Y],mm1
+    movq mm1,[0x50+Y]
+    movq [0x20+Y],mm6
+    movq mm6,[0x68+Y]
+    movq [0x28+Y],mm2
+    movq mm2,[0x60+Y]
+    movq [0x10+Y],mm5
+    movq mm5,[0x58+Y]
+    movq [0x38+Y],mm3
+    movq mm3,[0x70+Y]
+    movq [0x00+Y],mm4
+    movq mm4,[0x48+Y]
+    OC_FDCT_STAGE1_8x4
+    OC_FDCT8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
+    OC_TRANSPOSE8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
+    /*mm0={-2}x4*/
+    pcmpeqw mm0,mm0
+    paddw mm0,mm0
+    /*Round the results.*/
+    psubw mm1,mm0
+    psubw mm2,mm0
+    psraw mm1,2
+    psubw mm3,mm0
+    movq [0x58+Y],mm1
+    psraw mm2,2
+    psubw mm4,mm0
+    movq mm1,[0x48+Y]
+    psraw mm3,2
+    psubw mm5,mm0
+    movq [0x68+Y],mm2
+    psraw mm4,2
+    psubw mm6,mm0
+    movq [0x78+Y],mm3
+    psraw mm5,2
+    psubw mm7,mm0
+    movq [0x40+Y],mm4
+    psraw mm6,2
+    psubw mm1,mm0
+    movq [0x50+Y],mm5
+    psraw mm7,2
+    movq [0x60+Y],mm6
+    psraw mm1,2
+    movq [0x70+Y],mm7
+    movq [0x48+Y],mm1
+#undef Y
+#undef A
+#undef X
+  }
+}
+
+#endif
diff --git a/lib/x86_vc/mmxfrag.c b/lib/x86_vc/mmxfrag.c
new file mode 100644
index 0000000..4eb2084
--- /dev/null
+++ b/lib/x86_vc/mmxfrag.c
@@ -0,0 +1,337 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: mmxfrag.c 16578 2009-09-25 19:50:48Z cristianadam $
+
+ ********************************************************************/
+
+/*MMX acceleration of fragment reconstruction for motion compensation.
+  Originally written by Rudolf Marek.
+  Additional optimization by Nils Pipenbrinck.
+  Note: Loops are unrolled for best performance.
+  The iteration each instruction belongs to is marked in the comments as #i.*/
+#include <stddef.h>
+#include "x86int.h"
+#include "mmxfrag.h"
+
+#if defined(OC_X86_ASM)
+
+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+   between rows; the #defines pick the registers OC_FRAG_COPY_MMX uses.*/
+void oc_frag_copy_mmx(unsigned char *_dst,
+ const unsigned char *_src,int _ystride){
+#define SRC edx
+#define DST eax
+#define YSTRIDE ecx
+#define YSTRIDE3 esi
+  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
+#undef SRC
+#undef DST
+#undef YSTRIDE
+#undef YSTRIDE3
+}
+
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue){
+  __asm{
+#define DST edx
+#define DST4 esi
+#define YSTRIDE eax
+#define YSTRIDE3 edi
+#define RESIDUE ecx
+    mov DST,_dst
+    mov YSTRIDE,_ystride
+    mov RESIDUE,_residue
+    lea DST4,[DST+YSTRIDE*4]
+    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
+    /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
+    pcmpeqw mm0,mm0
+    /*#0 Load low residue.*/
+    movq mm1,[0*8+RESIDUE]
+    /*#0 Load high residue.*/
+    movq mm2,[1*8+RESIDUE]
+    /*Set mm0 to 0x8000800080008000.*/
+    psllw mm0,15
+    /*#1 Load low residue.*/
+    movq mm3,[2*8+RESIDUE]
+    /*#1 Load high residue.*/
+    movq mm4,[3*8+RESIDUE]
+    /*Set mm0 to 0x0080008000800080.*/
+    psrlw mm0,8
+    /*#2 Load low residue.*/
+    movq mm5,[4*8+RESIDUE]
+    /*#2 Load high residue.*/
+    movq mm6,[5*8+RESIDUE]
+    /*#0 Bias low  residue.*/
+    paddsw mm1,mm0
+    /*#0 Bias high residue.*/
+    paddsw mm2,mm0
+    /*#0 Pack to byte.*/
+    packuswb mm1,mm2
+    /*#1 Bias low  residue.*/
+    paddsw mm3,mm0
+    /*#1 Bias high residue.*/
+    paddsw mm4,mm0
+    /*#1 Pack to byte.*/
+    packuswb mm3,mm4
+    /*#2 Bias low  residue.*/
+    paddsw mm5,mm0
+    /*#2 Bias high residue.*/
+    paddsw mm6,mm0
+    /*#2 Pack to byte.*/
+    packuswb mm5,mm6
+    /*#0 Write row.*/
+    movq [DST],mm1
+    /*#1 Write row.*/
+    movq [DST+YSTRIDE],mm3
+    /*#2 Write row.*/
+    movq [DST+YSTRIDE*2],mm5
+    /*#3 Load low residue.*/
+    movq mm1,[6*8+RESIDUE]
+    /*#3 Load high residue.*/
+    movq mm2,[7*8+RESIDUE]
+    /*#4 Load low residue.*/
+    movq mm3,[8*8+RESIDUE]
+    /*#4 Load high residue.*/
+    movq mm4,[9*8+RESIDUE]
+    /*#5 Load low residue.*/
+    movq mm5,[10*8+RESIDUE]
+    /*#5 Load high residue.*/
+    movq mm6,[11*8+RESIDUE]
+    /*#3 Bias low  residue.*/
+    paddsw mm1,mm0
+    /*#3 Bias high residue.*/
+    paddsw mm2,mm0
+    /*#3 Pack to byte.*/
+    packuswb mm1,mm2
+    /*#4 Bias low  residue.*/
+    paddsw mm3,mm0
+    /*#4 Bias high residue.*/
+    paddsw mm4,mm0
+    /*#4 Pack to byte.*/
+    packuswb mm3,mm4
+    /*#5 Bias low  residue.*/
+    paddsw mm5,mm0
+    /*#5 Bias high residue.*/
+    paddsw mm6,mm0
+    /*#5 Pack to byte.*/
+    packuswb mm5,mm6
+    /*#3 Write row.*/
+    movq [DST+YSTRIDE3],mm1
+    /*#4 Write row.*/
+    movq [DST4],mm3
+    /*#5 Write row.*/
+    movq [DST4+YSTRIDE],mm5
+    /*#6 Load low residue.*/
+    movq mm1,[12*8+RESIDUE]
+    /*#6 Load high residue.*/
+    movq mm2,[13*8+RESIDUE]
+    /*#7 Load low residue.*/
+    movq mm3,[14*8+RESIDUE]
+    /*#7 Load high residue.*/
+    movq mm4,[15*8+RESIDUE]
+    /*#6 Bias low  residue.*/
+    paddsw mm1,mm0
+    /*#6 Bias high residue.*/
+    paddsw mm2,mm0
+    /*#6 Pack to byte.*/
+    packuswb mm1,mm2
+    /*#7 Bias low  residue.*/
+    paddsw mm3,mm0
+    /*#7 Bias high residue.*/
+    paddsw mm4,mm0
+    /*#7 Pack to byte.*/
+    packuswb mm3,mm4
+    /*#6 Write row.*/
+    movq [DST4+YSTRIDE*2],mm1
+    /*#7 Write row.*/
+    movq [DST4+YSTRIDE3],mm3
+#undef DST
+#undef DST4
+#undef YSTRIDE
+#undef YSTRIDE3
+#undef RESIDUE
+  }
+}
+
+void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue){
+  int i;
+  /*Zero mm0; it supplies the high bytes when unpacking pixels to words.*/
+  __asm pxor mm0,mm0;
+  for(i=4;i-->0;){
+    __asm{
+#define DST edx
+#define SRC ecx
+#define YSTRIDE edi
+#define RESIDUE eax
+      mov DST,_dst
+      mov SRC,_src
+      mov YSTRIDE,_ystride
+      mov RESIDUE,_residue
+      /*#0 Load source.*/
+      movq mm3,[SRC]
+      /*#1 Load source.*/
+      movq mm7,[SRC+YSTRIDE]
+      /*#0 Get copy of src.*/
+      movq mm4,mm3
+      /*#0 Expand high source.*/
+      punpckhbw mm4,mm0
+      /*#0 Expand low  source.*/
+      punpcklbw mm3,mm0
+      /*#0 Add residue high.*/
+      paddsw mm4,[8+RESIDUE]
+      /*#1 Get copy of src.*/
+      movq mm2,mm7
+      /*#0 Add residue low.*/
+      paddsw  mm3,[RESIDUE]
+      /*#1 Expand high source.*/
+      punpckhbw mm2,mm0
+      /*#0 Pack final row pixels.*/
+      packuswb mm3,mm4
+      /*#1 Expand low  source.*/
+      punpcklbw mm7,mm0
+      /*#1 Add residue low.*/
+      paddsw mm7,[16+RESIDUE]
+      /*#1 Add residue high.*/
+      paddsw mm2,[24+RESIDUE]
+      /*Advance residue.*/
+      lea RESIDUE,[32+RESIDUE]
+      /*#1 Pack final row pixels.*/
+      packuswb mm7,mm2
+      /*Advance src.*/
+      lea SRC,[SRC+YSTRIDE*2]
+      /*#0 Write row.*/
+      movq [DST],mm3
+      /*#1 Write row.*/
+      movq [DST+YSTRIDE],mm7
+      /*Advance dst, then save the pointers for the next pass of the C loop.*/
+      lea DST,[DST+YSTRIDE*2]
+      mov _residue,RESIDUE
+      mov _dst,DST
+      mov _src,SRC
+#undef DST
+#undef SRC
+#undef YSTRIDE
+#undef RESIDUE
+    }
+  }
+}
+
+void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
+  int i;
+  /*Zero mm7; it supplies the high bytes when unpacking pixels to words.*/
+  __asm pxor mm7,mm7;
+  for(i=4;i-->0;){
+    __asm{
+#define SRC1 ecx
+#define SRC2 edi
+#define YSTRIDE esi
+#define RESIDUE edx
+#define DST eax
+      mov YSTRIDE,_ystride
+      mov DST,_dst
+      mov RESIDUE,_residue
+      mov SRC1,_src1
+      mov SRC2,_src2
+      /*#0 Load src1.*/
+      movq mm0,[SRC1]
+      /*#0 Load src2.*/
+      movq mm2,[SRC2]
+      /*#0 Copy src1.*/
+      movq mm1,mm0
+      /*#0 Copy src2.*/
+      movq mm3,mm2
+      /*#1 Load src1.*/
+      movq mm4,[SRC1+YSTRIDE]
+      /*#0 Unpack lower src1.*/
+      punpcklbw mm0,mm7
+      /*#1 Load src2.*/
+      movq mm5,[SRC2+YSTRIDE]
+      /*#0 Unpack higher src1.*/
+      punpckhbw mm1,mm7
+      /*#0 Unpack lower src2.*/
+      punpcklbw mm2,mm7
+      /*#0 Unpack higher src2.*/
+      punpckhbw mm3,mm7
+      /*Advance src1 ptr.*/
+      lea SRC1,[SRC1+YSTRIDE*2]
+      /*Advance src2 ptr.*/
+      lea SRC2,[SRC2+YSTRIDE*2]
+      /*#0 Lower src1+src2.*/
+      paddsw mm0,mm2
+      /*#0 Higher src1+src2.*/
+      paddsw mm1,mm3
+      /*#1 Copy src1.*/
+      movq mm2,mm4
+      /*#0 Build lo average.*/
+      psraw mm0,1
+      /*#1 Copy src2.*/
+      movq mm3,mm5
+      /*#1 Unpack lower src1.*/
+      punpcklbw mm4,mm7
+      /*#0 Build hi average.*/
+      psraw mm1,1
+      /*#1 Unpack higher src1.*/
+      punpckhbw mm2,mm7
+      /*#0 low+=residue.*/
+      paddsw mm0,[RESIDUE]
+      /*#1 Unpack lower src2.*/
+      punpcklbw mm5,mm7
+      /*#0 high+=residue.*/
+      paddsw mm1,[8+RESIDUE]
+      /*#1 Unpack higher src2.*/
+      punpckhbw mm3,mm7
+      /*#1 Lower src1+src2.*/
+      paddsw mm5,mm4
+      /*#0 Pack and saturate.*/
+      packuswb mm0,mm1
+      /*#1 Higher src1+src2.*/
+      paddsw mm3,mm2
+      /*#0 Write row.*/
+      movq [DST],mm0
+      /*#1 Build lo average.*/
+      psraw mm5,1
+      /*#1 Build hi average.*/
+      psraw mm3,1
+      /*#1 low+=residue.*/
+      paddsw mm5,[16+RESIDUE]
+      /*#1 high+=residue.*/
+      paddsw mm3,[24+RESIDUE]
+      /*#1 Pack and saturate.*/
+      packuswb  mm5,mm3
+      /*#1 Write row.*/
+      movq [DST+YSTRIDE],mm5
+      /*Advance residue ptr.*/
+      add RESIDUE,32
+      /*Advance dest ptr, then save the pointers for the next C loop pass.*/
+      lea DST,[DST+YSTRIDE*2]
+      mov _dst,DST
+      mov _residue,RESIDUE
+      mov _src1,SRC1
+      mov _src2,SRC2
+#undef SRC1
+#undef SRC2
+#undef YSTRIDE
+#undef RESIDUE
+#undef DST
+    }
+  }
+}
+
+void oc_restore_fpu_mmx(void){ /*EMMS empties the MMX state for x87 use.*/
+  __asm emms;
+}
+
+#endif
diff --git a/lib/x86_vc/mmxfrag.h b/lib/x86_vc/mmxfrag.h
new file mode 100644
index 0000000..45ee93e
--- /dev/null
+++ b/lib/x86_vc/mmxfrag.h
@@ -0,0 +1,61 @@
+#if !defined(_x86_vc_mmxfrag_H)
+# define _x86_vc_mmxfrag_H (1)
+# include <stddef.h>
+# include "x86int.h"
+
+#if defined(OC_X86_ASM)
+
+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+   between rows; SRC, DST, YSTRIDE and YSTRIDE3 must be #defined as registers.*/
+#define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
+  do{ \
+    const unsigned char *src; \
+    unsigned char       *dst; \
+    src=(_src); \
+    dst=(_dst); \
+    __asm  mov SRC,src \
+    __asm  mov DST,dst \
+    __asm  mov YSTRIDE,_ystride \
+    /*src+0*ystride*/ \
+    __asm  movq mm0,[SRC] \
+    /*src+1*ystride*/ \
+    __asm  movq mm1,[SRC+YSTRIDE] \
+    /*ystride3=ystride*3*/ \
+    __asm  lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
+    /*src+2*ystride*/ \
+    __asm  movq mm2,[SRC+YSTRIDE*2] \
+    /*src+3*ystride*/ \
+    __asm  movq mm3,[SRC+YSTRIDE3] \
+    /*dst+0*ystride*/ \
+    __asm  movq [DST],mm0 \
+    /*dst+1*ystride*/ \
+    __asm  movq [DST+YSTRIDE],mm1 \
+    /*Advance the source pointer to the next 4 rows.*/ \
+    __asm  lea SRC,[SRC+YSTRIDE*4] \
+    /*dst+2*ystride*/ \
+    __asm  movq [DST+YSTRIDE*2],mm2 \
+    /*dst+3*ystride*/ \
+    __asm  movq [DST+YSTRIDE3],mm3 \
+    /*Advance the destination pointer to the next 4 rows.*/ \
+    __asm  lea DST,[DST+YSTRIDE*4] \
+    /*src+0*ystride*/ \
+    __asm  movq mm0,[SRC] \
+    /*src+1*ystride*/ \
+    __asm  movq mm1,[SRC+YSTRIDE] \
+    /*src+2*ystride*/ \
+    __asm  movq mm2,[SRC+YSTRIDE*2] \
+    /*src+3*ystride*/ \
+    __asm  movq mm3,[SRC+YSTRIDE3] \
+    /*dst+0*ystride*/ \
+    __asm  movq [DST],mm0 \
+    /*dst+1*ystride*/ \
+    __asm  movq [DST+YSTRIDE],mm1 \
+    /*dst+2*ystride*/ \
+    __asm  movq [DST+YSTRIDE*2],mm2 \
+    /*dst+3*ystride*/ \
+    __asm  movq [DST+YSTRIDE3],mm3 \
+  } \
+  while(0)
+
+# endif
+#endif
diff --git a/lib/x86_vc/mmxidct.c b/lib/x86_vc/mmxidct.c
new file mode 100644
index 0000000..8f5ff68
--- /dev/null
+++ b/lib/x86_vc/mmxidct.c
@@ -0,0 +1,562 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+/*MMX acceleration of Theora's iDCT.
+  Originally written by Rudolf Marek, based on code from On2's VP3.*/
+#include "x86int.h"
+#include "../dct.h"
+
+#if defined(OC_X86_ASM)
+
+/*These are byte offsets into the table of constants below.*/
+/*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
+#define OC_COSINE_OFFSET (0)
+/*A row of 8's, located after the 7 cosine rows (7*8 bytes).*/
+#define OC_EIGHT_OFFSET  (56)
+
+
+
+/*A table of constants used by the MMX routines; each row is one value x4.*/
+static const __declspec(align(16))ogg_uint16_t
+ OC_IDCT_CONSTS[(7+1)*4]={
+  (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
+  (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
+  (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
+  (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
+  (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
+  (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
+  (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
+  (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
+  (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
+  (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
+  (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
+  (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
+  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
+  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
+      8,    8,    8,    8
+};
+
+/*Common first stage of OC_ROW_IDCT and OC_COLUMN_IDCT; 38 cycles.*/
+#define OC_IDCT_BEGIN __asm{ \
+  __asm movq mm2,OC_I(3) \
+  __asm movq mm6,OC_C(3) \
+  __asm movq mm4,mm2 \
+  __asm movq mm7,OC_J(5) \
+  __asm pmulhw mm4,mm6 \
+  __asm movq mm1,OC_C(5) \
+  __asm pmulhw mm6,mm7 \
+  __asm movq mm5,mm1 \
+  __asm pmulhw mm1,mm2 \
+  __asm movq mm3,OC_I(1) \
+  __asm pmulhw mm5,mm7 \
+  __asm movq mm0,OC_C(1) \
+  __asm paddw mm4,mm2 \
+  __asm paddw mm6,mm7 \
+  __asm paddw mm2,mm1 \
+  __asm movq mm1,OC_J(7) \
+  __asm paddw mm7,mm5 \
+  __asm movq mm5,mm0 \
+  __asm pmulhw mm0,mm3 \
+  __asm paddw mm4,mm7 \
+  __asm pmulhw mm5,mm1 \
+  __asm movq mm7,OC_C(7) \
+  __asm psubw mm6,mm2 \
+  __asm paddw mm0,mm3 \
+  __asm pmulhw mm3,mm7 \
+  __asm movq mm2,OC_I(2) \
+  __asm pmulhw mm7,mm1 \
+  __asm paddw mm5,mm1 \
+  __asm movq mm1,mm2 \
+  __asm pmulhw mm2,OC_C(2) \
+  __asm psubw mm3,mm5 \
+  __asm movq mm5,OC_J(6) \
+  __asm paddw mm0,mm7 \
+  __asm movq mm7,mm5 \
+  __asm psubw mm0,mm4 \
+  __asm pmulhw mm5,OC_C(2) \
+  __asm paddw mm2,mm1 \
+  __asm pmulhw mm1,OC_C(6) \
+  __asm paddw mm4,mm4 \
+  __asm paddw mm4,mm0 \
+  __asm psubw mm3,mm6 \
+  __asm paddw mm5,mm7 \
+  __asm paddw mm6,mm6 \
+  __asm pmulhw mm7,OC_C(6) \
+  __asm paddw mm6,mm3 \
+  __asm movq OC_I(1),mm4 \
+  __asm psubw mm1,mm5 \
+  __asm movq mm4,OC_C(4) \
+  __asm movq mm5,mm3 \
+  __asm pmulhw mm3,mm4 \
+  __asm paddw mm7,mm2 \
+  __asm movq OC_I(2),mm6 \
+  __asm movq mm2,mm0 \
+  __asm movq mm6,OC_I(0) \
+  __asm pmulhw mm0,mm4 \
+  __asm paddw mm5,mm3 \
+  __asm movq mm3,OC_J(4) \
+  __asm psubw mm5,mm1 \
+  __asm paddw mm2,mm0 \
+  __asm psubw mm6,mm3 \
+  __asm movq mm0,mm6 \
+  __asm pmulhw mm6,mm4 \
+  __asm paddw mm3,mm3 \
+  __asm paddw mm1,mm1 \
+  __asm paddw mm3,mm0 \
+  __asm paddw mm1,mm5 \
+  __asm pmulhw mm4,mm3 \
+  __asm paddw mm6,mm0 \
+  __asm psubw mm6,mm2 \
+  __asm paddw mm2,mm2 \
+  __asm movq mm0,OC_I(1) \
+  __asm paddw mm2,mm6 \
+  __asm paddw mm4,mm3 \
+  __asm psubw mm2,mm1 \
+}
+
+/*Row pass: OC_IDCT_BEGIN, then the last add/subtract stage; 38+8=46 cycles.*/
+#define OC_ROW_IDCT __asm{ \
+  OC_IDCT_BEGIN \
+  /*r3=D'*/ \
+  __asm  movq mm3,OC_I(2) \
+  /*r4=E'=E-G*/ \
+  __asm  psubw mm4,mm7 \
+  /*r1=H'+H'*/ \
+  __asm  paddw mm1,mm1 \
+  /*r7=G+G*/ \
+  __asm  paddw mm7,mm7 \
+  /*r1=R1=A''+H'*/ \
+  __asm  paddw mm1,mm2 \
+  /*r7=G'=E+G*/ \
+  __asm  paddw mm7,mm4 \
+  /*r4=R4=E'-D'*/ \
+  __asm  psubw mm4,mm3 \
+  __asm  paddw mm3,mm3 \
+  /*r6=R6=F'-B''*/ \
+  __asm  psubw mm6,mm5 \
+  __asm  paddw mm5,mm5 \
+  /*r3=R3=E'+D'*/ \
+  __asm  paddw mm3,mm4 \
+  /*r5=R5=F'+B''*/ \
+  __asm  paddw mm5,mm6 \
+  /*r7=R7=G'-C'*/ \
+  __asm  psubw mm7,mm0 \
+  __asm  paddw mm0,mm0 \
+  /*Save R1 to I(1); the other results stay in registers.*/ \
+  __asm  movq OC_I(1),mm1 \
+  /*r0=R0=G'+C'*/ \
+  __asm  paddw mm0,mm7 \
+}
+
+/*The following macro does two 4x4 transposes in place.
+  At entry, we assume:
+    r0 = a3 a2 a1 a0
+  I(1) = b3 b2 b1 b0
+    r2 = c3 c2 c1 c0
+    r3 = d3 d2 d1 d0
+
+    r4 = e3 e2 e1 e0
+    r5 = f3 f2 f1 f0
+    r6 = g3 g2 g1 g0
+    r7 = h3 h2 h1 h0
+
+  At exit, we have:
+  I(0) = d0 c0 b0 a0
+  I(1) = d1 c1 b1 a1
+  I(2) = d2 c2 b2 a2
+  I(3) = d3 c3 b3 a3
+
+  J(4) = h0 g0 f0 e0
+  J(5) = h1 g1 f1 e1
+  J(6) = h2 g2 f2 e2
+  J(7) = h3 g3 f3 e3
+
+  I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
+  J(4) J(5) J(6) J(7) is the transpose of r4  r5  r6 r7.
+
+  Since r1 is free at entry (row 1 lives in I(1)), we calculate the Js first.*/
+/*19 cycles.*/
+#define OC_TRANSPOSE __asm{ \
+  __asm movq mm1,mm4 \
+  __asm punpcklwd mm4,mm5 \
+  __asm movq OC_I(0),mm0 \
+  __asm punpckhwd mm1,mm5 \
+  __asm movq mm0,mm6 \
+  __asm punpcklwd mm6,mm7 \
+  __asm movq mm5,mm4 \
+  __asm punpckldq mm4,mm6 \
+  __asm punpckhdq mm5,mm6 \
+  __asm movq mm6,mm1 \
+  __asm movq OC_J(4),mm4 \
+  __asm punpckhwd mm0,mm7 \
+  __asm movq OC_J(5),mm5 \
+  __asm punpckhdq mm6,mm0 \
+  __asm movq mm4,OC_I(0) \
+  __asm punpckldq mm1,mm0 \
+  __asm movq mm5,OC_I(1) \
+  __asm movq mm0,mm4 \
+  __asm movq OC_J(7),mm6 \
+  __asm punpcklwd mm0,mm5 \
+  __asm movq OC_J(6),mm1 \
+  __asm punpckhwd mm4,mm5 \
+  __asm movq mm5,mm2 \
+  __asm punpcklwd mm2,mm3 \
+  __asm movq mm1,mm0 \
+  __asm punpckldq mm0,mm2 \
+  __asm punpckhdq mm1,mm2 \
+  __asm movq mm2,mm4 \
+  __asm movq OC_I(0),mm0 \
+  __asm punpckhwd mm5,mm3 \
+  __asm movq OC_I(1),mm1 \
+  __asm punpckhdq mm4,mm5 \
+  __asm punpckldq mm2,mm5 \
+  __asm movq OC_I(3),mm4 \
+  __asm movq OC_I(2),mm2 \
+}
+
+/*Column pass: results are biased by 8, shifted >>4, stored; 38+19=57 cycles.*/
+#define OC_COLUMN_IDCT __asm{ \
+  OC_IDCT_BEGIN \
+  __asm paddw mm2,OC_8 \
+  /*r1=H'+H'*/ \
+  __asm paddw mm1,mm1 \
+  /*r1=R1=A''+H'*/ \
+  __asm paddw mm1,mm2 \
+  /*r2=NR2*/ \
+  __asm psraw mm2,4 \
+  /*r4=E'=E-G*/ \
+  __asm psubw mm4,mm7 \
+  /*r1=NR1*/ \
+  __asm psraw mm1,4 \
+  /*r3=D'*/ \
+  __asm movq mm3,OC_I(2) \
+  /*r7=G+G*/ \
+  __asm paddw mm7,mm7 \
+  /*Store NR2 at I(2).*/ \
+  __asm movq OC_I(2),mm2 \
+  /*r7=G'=E+G*/ \
+  __asm paddw mm7,mm4 \
+  /*Store NR1 at I(1).*/ \
+  __asm movq OC_I(1),mm1 \
+  /*r4=R4=E'-D'*/ \
+  __asm psubw mm4,mm3 \
+  __asm paddw mm4,OC_8 \
+  /*r3=D'+D'*/ \
+  __asm paddw mm3,mm3 \
+  /*r3=R3=E'+D'*/ \
+  __asm paddw mm3,mm4 \
+  /*r4=NR4*/ \
+  __asm psraw mm4,4 \
+  /*r6=R6=F'-B''*/ \
+  __asm psubw mm6,mm5 \
+  /*r3=NR3*/ \
+  __asm psraw mm3,4 \
+  __asm paddw mm6,OC_8 \
+  /*r5=B''+B''*/ \
+  __asm paddw mm5,mm5 \
+  /*r5=R5=F'+B''*/ \
+  __asm paddw mm5,mm6 \
+  /*r6=NR6*/ \
+  __asm psraw mm6,4 \
+  /*Store NR4 at J(4).*/ \
+  __asm movq OC_J(4),mm4 \
+  /*r5=NR5*/ \
+  __asm psraw mm5,4 \
+  /*Store NR3 at I(3).*/ \
+  __asm movq OC_I(3),mm3 \
+  /*r7=R7=G'-C'*/ \
+  __asm psubw mm7,mm0 \
+  __asm paddw mm7,OC_8 \
+  /*r0=C'+C'*/ \
+  __asm paddw mm0,mm0 \
+  /*r0=R0=G'+C'*/ \
+  __asm paddw mm0,mm7 \
+  /*r7=NR7*/ \
+  __asm psraw mm7,4 \
+  /*Store NR6 at J(6).*/ \
+  __asm movq OC_J(6),mm6 \
+  /*r0=NR0*/ \
+  __asm psraw mm0,4 \
+  /*Store NR5 at J(5).*/ \
+  __asm movq OC_J(5),mm5 \
+  /*Store NR7 at J(7).*/ \
+  __asm movq OC_J(7),mm7 \
+  /*Store NR0 at I(0).*/ \
+  __asm movq OC_I(0),mm0 \
+}
+
+#define OC_MID(_m,_i) [CONSTS+_m+(_i)*8]
+#define OC_C(_i)      OC_MID(OC_COSINE_OFFSET,_i-1)
+#define OC_8          OC_MID(OC_EIGHT_OFFSET,0)
+
+static void oc_idct8x8_slow(ogg_int16_t _y[64]){
+  /*Full iDCT: a row pass plus transpose for each 64-byte half of _y, then two
+    column passes; the input is partially transposed (each 4x4 block is).*/
+  __asm{
+#define CONSTS eax
+#define Y edx
+    mov CONSTS,offset OC_IDCT_CONSTS
+    mov Y,_y
+#define OC_I(_k)      [Y+_k*16]
+#define OC_J(_k)      [Y+(_k-4)*16+8]
+    OC_ROW_IDCT
+    OC_TRANSPOSE
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k)      [Y+(_k*16)+64]
+#define OC_J(_k)      [Y+(_k-4)*16+72]
+    OC_ROW_IDCT
+    OC_TRANSPOSE
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k)      [Y+_k*16]
+#define OC_J(_k)      OC_I(_k)
+    OC_COLUMN_IDCT
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k)      [Y+_k*16+8]
+#define OC_J(_k)      OC_I(_k)
+    OC_COLUMN_IDCT
+#undef  OC_I
+#undef  OC_J
+#undef  CONSTS
+#undef  Y
+  }
+}
+
+/*First stage of the reduced iDCT; reads only OC_I(0)..OC_I(3); 25 cycles.*/
+#define OC_IDCT_BEGIN_10 __asm{ \
+  __asm movq mm2,OC_I(3) \
+  __asm nop \
+  __asm movq mm6,OC_C(3) \
+  __asm movq mm4,mm2 \
+  __asm movq mm1,OC_C(5) \
+  __asm pmulhw mm4,mm6 \
+  __asm movq mm3,OC_I(1) \
+  __asm pmulhw mm1,mm2 \
+  __asm movq mm0,OC_C(1) \
+  __asm paddw mm4,mm2 \
+  __asm pxor mm6,mm6 \
+  __asm paddw mm2,mm1 \
+  __asm movq mm5,OC_I(2) \
+  __asm pmulhw mm0,mm3 \
+  __asm movq mm1,mm5 \
+  __asm paddw mm0,mm3 \
+  __asm pmulhw mm3,OC_C(7) \
+  __asm psubw mm6,mm2 \
+  __asm pmulhw mm5,OC_C(2) \
+  __asm psubw mm0,mm4 \
+  __asm movq mm7,OC_I(2) \
+  __asm paddw mm4,mm4 \
+  __asm paddw mm7,mm5 \
+  __asm paddw mm4,mm0 \
+  __asm pmulhw mm1,OC_C(6) \
+  __asm psubw mm3,mm6 \
+  __asm movq OC_I(1),mm4 \
+  __asm paddw mm6,mm6 \
+  __asm movq mm4,OC_C(4) \
+  __asm paddw mm6,mm3 \
+  __asm movq mm5,mm3 \
+  __asm pmulhw mm3,mm4 \
+  __asm movq OC_I(2),mm6 \
+  __asm movq mm2,mm0 \
+  __asm movq mm6,OC_I(0) \
+  __asm pmulhw mm0,mm4 \
+  __asm paddw mm5,mm3 \
+  __asm paddw mm2,mm0 \
+  __asm psubw mm5,mm1 \
+  __asm pmulhw mm6,mm4 \
+  __asm paddw mm6,OC_I(0) \
+  __asm paddw mm1,mm1 \
+  __asm movq mm4,mm6 \
+  __asm paddw mm1,mm5 \
+  __asm psubw mm6,mm2 \
+  __asm paddw mm2,mm2 \
+  __asm movq mm0,OC_I(1) \
+  __asm paddw mm2,mm6 \
+  __asm psubw mm2,mm1 \
+  __asm nop \
+}
+
+/*Row pass of the reduced iDCT (built on OC_IDCT_BEGIN_10); 25+8=33 cycles.*/
+#define OC_ROW_IDCT_10 __asm{ \
+  OC_IDCT_BEGIN_10 \
+  /*r3=D'*/ \
+   __asm movq mm3,OC_I(2) \
+  /*r4=E'=E-G*/ \
+   __asm psubw mm4,mm7 \
+  /*r1=H'+H'*/ \
+   __asm paddw mm1,mm1 \
+  /*r7=G+G*/ \
+   __asm paddw mm7,mm7 \
+  /*r1=R1=A''+H'*/ \
+   __asm paddw mm1,mm2 \
+  /*r7=G'=E+G*/ \
+   __asm paddw mm7,mm4 \
+  /*r4=R4=E'-D'*/ \
+   __asm psubw mm4,mm3 \
+   __asm paddw mm3,mm3 \
+  /*r6=R6=F'-B''*/ \
+   __asm psubw mm6,mm5 \
+   __asm paddw mm5,mm5 \
+  /*r3=R3=E'+D'*/ \
+   __asm paddw mm3,mm4 \
+  /*r5=R5=F'+B''*/ \
+   __asm paddw mm5,mm6 \
+  /*r7=R7=G'-C'*/ \
+   __asm psubw mm7,mm0 \
+   __asm paddw mm0,mm0 \
+  /*Save R1 to I(1); the other results stay in registers.*/ \
+   __asm movq OC_I(1),mm1 \
+  /*r0=R0=G'+C'*/ \
+   __asm paddw mm0,mm7 \
+}
+
+/*Column pass of the reduced iDCT: bias by 8, >>4, store; 25+19=44 cycles.*/
+#define OC_COLUMN_IDCT_10 __asm{ \
+  OC_IDCT_BEGIN_10 \
+  __asm paddw mm2,OC_8 \
+  /*r1=H'+H'*/ \
+  __asm paddw mm1,mm1 \
+  /*r1=R1=A''+H'*/ \
+  __asm paddw mm1,mm2 \
+  /*r2=NR2*/ \
+  __asm psraw mm2,4 \
+  /*r4=E'=E-G*/ \
+  __asm psubw mm4,mm7 \
+  /*r1=NR1*/ \
+  __asm psraw mm1,4 \
+  /*r3=D'*/ \
+  __asm movq mm3,OC_I(2) \
+  /*r7=G+G*/ \
+  __asm paddw mm7,mm7 \
+  /*Store NR2 at I(2).*/ \
+  __asm movq OC_I(2),mm2 \
+  /*r7=G'=E+G*/ \
+  __asm paddw mm7,mm4 \
+  /*Store NR1 at I(1).*/ \
+  __asm movq OC_I(1),mm1 \
+  /*r4=R4=E'-D'*/ \
+  __asm psubw mm4,mm3 \
+  __asm paddw mm4,OC_8 \
+  /*r3=D'+D'*/ \
+  __asm paddw mm3,mm3 \
+  /*r3=R3=E'+D'*/ \
+  __asm paddw mm3,mm4 \
+  /*r4=NR4*/ \
+  __asm psraw mm4,4 \
+  /*r6=R6=F'-B''*/ \
+  __asm psubw mm6,mm5 \
+  /*r3=NR3*/ \
+  __asm psraw mm3,4 \
+  __asm paddw mm6,OC_8 \
+  /*r5=B''+B''*/ \
+  __asm paddw mm5,mm5 \
+  /*r5=R5=F'+B''*/ \
+  __asm paddw mm5,mm6 \
+  /*r6=NR6*/ \
+  __asm psraw mm6,4 \
+  /*Store NR4 at J(4).*/ \
+  __asm movq OC_J(4),mm4 \
+  /*r5=NR5*/ \
+  __asm psraw mm5,4 \
+  /*Store NR3 at I(3).*/ \
+  __asm movq OC_I(3),mm3 \
+  /*r7=R7=G'-C'*/ \
+  __asm psubw mm7,mm0 \
+  __asm paddw mm7,OC_8 \
+  /*r0=C'+C'*/ \
+  __asm paddw mm0,mm0 \
+  /*r0=R0=G'+C'*/ \
+  __asm paddw mm0,mm7 \
+  /*r7=NR7*/ \
+  __asm psraw mm7,4 \
+  /*Store NR6 at J(6).*/ \
+  __asm movq OC_J(6),mm6 \
+  /*r0=NR0*/ \
+  __asm psraw mm0,4 \
+  /*Store NR5 at J(5).*/ \
+  __asm movq OC_J(5),mm5 \
+  /*Store NR7 at J(7).*/ \
+  __asm movq OC_J(7),mm7 \
+  /*Store NR0 at I(0).*/ \
+  __asm movq OC_I(0),mm0 \
+}
+
+static void oc_idct8x8_10(ogg_int16_t _y[64]){
+  __asm{
+#define CONSTS eax
+#define Y edx
+    mov CONSTS,offset OC_IDCT_CONSTS
+    mov Y,_y
+#define OC_I(_k) [Y+_k*16]
+#define OC_J(_k) [Y+(_k-4)*16+8]
+    /*Done with dequant, descramble, and partial transpose.
+      Now do the iDCT itself: a single row pass, then two column passes.*/
+    OC_ROW_IDCT_10
+    OC_TRANSPOSE
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k) [Y+_k*16]
+#define OC_J(_k) OC_I(_k)
+    OC_COLUMN_IDCT_10
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k) [Y+_k*16+8]
+#define OC_J(_k) OC_I(_k)
+    OC_COLUMN_IDCT_10
+#undef  OC_I
+#undef  OC_J
+#undef  CONSTS
+#undef  Y
+  }
+}
+
+/*Computes the inverse 8x8 Type-II DCT of _y in place.
+  The input is expected to be scaled by a factor of 4 relative to the
+   orthonormal version of the transform.*/
+void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
+  /*_last_zzi is not quite a count of the coefficients decoded for this block:
+     it holds the value of zzi BEFORE the block's final token was decoded.
+    Usually that token is an EOB token (the continuation of an EOB run from a
+     previous block counts), so the two values agree.
+    When the final token was NOT an EOB token, but filled the block up to
+     exactly 64 coefficients, _last_zzi is less than 64.
+    As long as that token was not a pure zero run, _last_zzi is at least 46,
+     which does not change which case this routine selects.
+    If, however, the final token WAS a pure zero run of length 63, _last_zzi
+     is 1 even though 64 coefficients were decoded.
+    The special case below then triggers, where the true coefficient count
+     would not have triggered it.
+    Likewise, a zero run of length 64 gives _last_zzi a value of 0, yet the DC
+     coefficient is still processed, and it may be non-zero because of DC
+     prediction.
+    Convoluted as it is, this is arguably the correct behavior: a block that
+     ends in a long zero run rather than a normal EOB token still gets the
+     smaller transform.
+    It could be smarter: several separate zero runs at the end of a block
+     will fool it, but an encoder that generates these really deserves what it
+     gets.
+    Needless to say we inherited this approach from VP3.*/
+  /*Select the transform to run.
+    A _last_zzi of 10 or more requires the full 8x8 version; anything smaller
+     is handled by the reduced oc_idct8x8_10().*/
+  if(_last_zzi>=10)oc_idct8x8_slow(_y);
+  else oc_idct8x8_10(_y);
+}
+
+#endif
diff --git a/lib/x86_vc/mmxloop.h b/lib/x86_vc/mmxloop.h
new file mode 100644
index 0000000..2561fca
--- /dev/null
+++ b/lib/x86_vc/mmxloop.h
@@ -0,0 +1,219 @@
+#if !defined(_x86_vc_mmxloop_H)
+# define _x86_vc_mmxloop_H (1)
+# include <stddef.h>
+# include "x86int.h"
+
+#if defined(OC_X86_ASM)
+
+/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7},
+   and LL addresses 8 bytes of L.  On exit, mm1={b0+lflim(R_0,L),...} and
+   mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3-mm7 are clobbered.*/
+#define OC_LOOP_FILTER8_MMX __asm{ \
+  /*mm7=0*/ \
+  __asm pxor mm7,mm7 \
+  /*mm6:mm0={a0,...,a7}*/ \
+  __asm movq mm6,mm0 \
+  __asm punpcklbw mm0,mm7 \
+  __asm punpckhbw mm6,mm7 \
+  /*mm5:mm3={d0,...,d7}*/ \
+  __asm movq mm5,mm3 \
+  __asm punpcklbw mm3,mm7 \
+  __asm punpckhbw mm5,mm7 \
+  /*mm6:mm0={a0-d0,...,a7-d7}*/ \
+  __asm psubw mm0,mm3 \
+  __asm psubw mm6,mm5 \
+  /*mm3:mm1={b0,...,b7}*/ \
+  __asm movq mm3,mm1 \
+  __asm punpcklbw mm1,mm7 \
+  __asm movq mm4,mm2 \
+  __asm punpckhbw mm3,mm7 \
+  /*mm5:mm4={c0,...,c7}*/ \
+  __asm movq mm5,mm2 \
+  __asm punpcklbw mm4,mm7 \
+  __asm punpckhbw mm5,mm7 \
+  /*mm7={3}x4 \
+    mm5:mm4={c0-b0,...,c7-b7}*/ \
+  __asm pcmpeqw mm7,mm7 \
+  __asm psubw mm4,mm1 \
+  __asm psrlw mm7,14 \
+  __asm psubw mm5,mm3 \
+  /*Scale by 3.*/ \
+  __asm pmullw mm4,mm7 \
+  __asm pmullw mm5,mm7 \
+  /*mm7={4}x4 \
+    mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
+  __asm psrlw mm7,1 \
+  __asm paddw mm4,mm0 \
+  __asm psllw mm7,2 \
+  __asm movq mm0,[LL] \
+  __asm paddw mm5,mm6 \
+  /*R_i has the range [-127,128], so we compute -R_i instead. \
+    mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
+  __asm psubw mm4,mm7 \
+  __asm psubw mm5,mm7 \
+  __asm psraw mm4,3 \
+  __asm psraw mm5,3 \
+  __asm pcmpeqb mm7,mm7 \
+  __asm packsswb mm4,mm5 \
+  __asm pxor mm6,mm6 \
+  __asm pxor mm4,mm7 \
+  __asm packuswb mm1,mm3 \
+  /*Now compute lflim of -mm4 cf. Section 7.10 of the spec.*/ \
+  /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
+     we have to split things by sign (the other option is to work in 16 bits, \
+     but working in 8 bits gives much better parallelism). \
+    We compute abs(R_i), but save a mask of which terms were negative in mm6. \
+    Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
+    Finally, we split mm4 into positive and negative pieces using the mask in \
+     mm6, and add and subtract them as appropriate.*/ \
+  /*mm4=abs(-R_i)*/ \
+  /*mm7=255-2*L*/ \
+  __asm pcmpgtb mm6,mm4 \
+  __asm psubb mm7,mm0 \
+  __asm pxor mm4,mm6 \
+  __asm psubb mm7,mm0 \
+  __asm psubb mm4,mm6 \
+  /*mm7=255-max(2*L-abs(R_i),0)*/ \
+  __asm paddusb mm7,mm4 \
+  /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
+  __asm paddusb mm4,mm7 \
+  __asm psubusb mm4,mm7 \
+  /*Now split mm4 by the original sign of -R_i.*/ \
+  __asm movq mm5,mm4 \
+  __asm pand mm4,mm6 \
+  __asm pandn mm6,mm5 \
+  /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
+  /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
+  __asm paddusb mm1,mm4 \
+  __asm psubusb mm2,mm4 \
+  __asm psubusb mm1,mm6 \
+  __asm paddusb mm2,mm6 \
+}
+
+#define OC_LOOP_FILTER_V_MMX(_pix,_ystride,_ll) \
+  do{ \
+    /*Filters the edge between rows _pix-_ystride and _pix; the local pix__ \
+       avoids "error C2425: 'SHL' : non-constant expression in 'second operand'".*/ \
+    unsigned char *pix__; \
+    unsigned char *ll__; \
+    ll__=(_ll); \
+    pix__=(_pix); \
+    __asm mov YSTRIDE,_ystride \
+    __asm mov LL,ll__ \
+    __asm mov PIX,pix__ \
+    __asm sub PIX,YSTRIDE \
+    __asm sub PIX,YSTRIDE \
+    /*mm0={a0,...,a7}*/ \
+    __asm movq mm0,[PIX] \
+    /*ystride3=_ystride*3*/ \
+    __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
+    /*mm3={d0,...,d7}*/ \
+    __asm movq mm3,[PIX+YSTRIDE3] \
+    /*mm1={b0,...,b7}*/ \
+    __asm movq mm1,[PIX+YSTRIDE] \
+    /*mm2={c0,...,c7}*/ \
+    __asm movq mm2,[PIX+YSTRIDE*2] \
+    OC_LOOP_FILTER8_MMX \
+    /*Write the two filtered rows (b and c) back out.*/ \
+    __asm movq [PIX+YSTRIDE],mm1 \
+    __asm movq [PIX+YSTRIDE*2],mm2 \
+  } \
+  while(0)
+
+#define OC_LOOP_FILTER_H_MMX(_pix,_ystride,_ll) \
+  do{ \
+    /*Filters the 8-row edge between columns _pix-1 and _pix; the local ll__ \
+       avoids errors like "error C2443: operand size conflict".*/ \
+    unsigned char *ll__; \
+    unsigned char *pix__; \
+    ll__=(_ll); \
+    pix__=(_pix)-2; \
+    __asm mov PIX,pix__ \
+    __asm mov YSTRIDE,_ystride \
+    __asm mov LL,ll__ \
+    /*x x x x d0 c0 b0 a0*/ \
+    __asm movd mm0,[PIX] \
+    /*x x x x d1 c1 b1 a1*/ \
+    __asm movd mm1,[PIX+YSTRIDE] \
+    /*ystride3=_ystride*3*/ \
+    __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
+    /*x x x x d2 c2 b2 a2*/ \
+    __asm movd mm2,[PIX+YSTRIDE*2] \
+    /*x x x x d3 c3 b3 a3*/ \
+    __asm lea D,[PIX+YSTRIDE*4] \
+    __asm movd mm3,[PIX+YSTRIDE3] \
+    /*x x x x d4 c4 b4 a4*/ \
+    __asm movd mm4,[D] \
+    /*x x x x d5 c5 b5 a5*/ \
+    __asm movd mm5,[D+YSTRIDE] \
+    /*x x x x d6 c6 b6 a6*/ \
+    __asm movd mm6,[D+YSTRIDE*2] \
+    /*x x x x d7 c7 b7 a7*/ \
+    __asm movd mm7,[D+YSTRIDE3] \
+    /*mm0=d1 d0 c1 c0 b1 b0 a1 a0*/ \
+    __asm punpcklbw mm0,mm1 \
+    /*mm2=d3 d2 c3 c2 b3 b2 a3 a2*/ \
+    __asm punpcklbw mm2,mm3 \
+    /*mm3=d1 d0 c1 c0 b1 b0 a1 a0*/ \
+    __asm movq mm3,mm0 \
+    /*mm0=b3 b2 b1 b0 a3 a2 a1 a0*/ \
+    __asm punpcklwd mm0,mm2 \
+    /*mm3=d3 d2 d1 d0 c3 c2 c1 c0*/ \
+    __asm punpckhwd mm3,mm2 \
+    /*mm1=b3 b2 b1 b0 a3 a2 a1 a0*/ \
+    __asm movq mm1,mm0 \
+    /*mm4=d5 d4 c5 c4 b5 b4 a5 a4*/ \
+    __asm punpcklbw mm4,mm5 \
+    /*mm6=d7 d6 c7 c6 b7 b6 a7 a6*/ \
+    __asm punpcklbw mm6,mm7 \
+    /*mm5=d5 d4 c5 c4 b5 b4 a5 a4*/ \
+    __asm movq mm5,mm4 \
+    /*mm4=b7 b6 b5 b4 a7 a6 a5 a4*/ \
+    __asm punpcklwd mm4,mm6 \
+    /*mm5=d7 d6 d5 d4 c7 c6 c5 c4*/ \
+    __asm punpckhwd mm5,mm6 \
+    /*mm2=d3 d2 d1 d0 c3 c2 c1 c0*/ \
+    __asm movq mm2,mm3 \
+    /*mm0=a7 a6 a5 a4 a3 a2 a1 a0*/ \
+    __asm punpckldq mm0,mm4 \
+    /*mm1=b7 b6 b5 b4 b3 b2 b1 b0*/ \
+    __asm punpckhdq mm1,mm4 \
+    /*mm2=c7 c6 c5 c4 c3 c2 c1 c0*/ \
+    __asm punpckldq mm2,mm5 \
+    /*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \
+    __asm punpckhdq mm3,mm5 \
+    OC_LOOP_FILTER8_MMX \
+    /*mm0={b0+R_0'',...,b7+R_7''}*/ \
+    __asm movq mm0,mm1 \
+    /*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \
+    __asm punpcklbw mm1,mm2 \
+    /*mm0={b4+R_4'',c4-R_4'',...,b7+R_7'',c7-R_7''}*/ \
+    __asm punpckhbw mm0,mm2 \
+    /*[d]=c1 b1 c0 b0*/ \
+    __asm movd D,mm1 \
+    __asm mov [PIX+1],D_WORD \
+    __asm psrlq mm1,32 \
+    __asm shr D,16 \
+    __asm mov [PIX+YSTRIDE+1],D_WORD \
+    /*[d]=c3 b3 c2 b2*/ \
+    __asm movd D,mm1 \
+    __asm mov [PIX+YSTRIDE*2+1],D_WORD \
+    __asm shr D,16 \
+    __asm mov [PIX+YSTRIDE3+1],D_WORD \
+    __asm lea PIX,[PIX+YSTRIDE*4] \
+    /*[d]=c5 b5 c4 b4*/ \
+    __asm movd D,mm0 \
+    __asm mov [PIX+1],D_WORD \
+    __asm psrlq mm0,32 \
+    __asm shr D,16 \
+    __asm mov [PIX+YSTRIDE+1],D_WORD \
+    /*[d]=c7 b7 c6 b6*/ \
+    __asm movd D,mm0 \
+    __asm mov [PIX+YSTRIDE*2+1],D_WORD \
+    __asm shr D,16 \
+    __asm mov [PIX+YSTRIDE3+1],D_WORD \
+  } \
+  while(0)
+
+# endif
+#endif
diff --git a/lib/x86_vc/mmxstate.c b/lib/x86_vc/mmxstate.c
new file mode 100644
index 0000000..73bd198
--- /dev/null
+++ b/lib/x86_vc/mmxstate.c
@@ -0,0 +1,211 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: mmxstate.c 16584 2009-09-26 19:35:55Z tterribe $
+
+ ********************************************************************/
+
+/*MMX acceleration of complete fragment reconstruction algorithm.
+  Originally written by Rudolf Marek.*/
+#include <string.h>
+#include "x86int.h"
+#include "mmxfrag.h"
+#include "mmxloop.h"
+
+#if defined(OC_X86_ASM)
+
+void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            mb_mode;
+  /*Apply the inverse transform.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    /*Note that this value must be unsigned, and is loaded with movzx below, so
+       that it is not sign-extended when it is put in a register.*/
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    /*Fill all 64 entries of _dct_coeffs with p.*/
+    __asm{
+#define Y eax
+#define P ecx
+      mov Y,_dct_coeffs
+      movzx P,p
+      /*mm0=0000 0000 0000 AAAA*/
+      movd mm0,P
+      /*mm0=0000 0000 AAAA AAAA*/
+      punpcklwd mm0,mm0
+      /*mm0=AAAA AAAA AAAA AAAA*/
+      punpckldq mm0,mm0
+      movq [Y],mm0
+      movq [8+Y],mm0
+      movq [16+Y],mm0
+      movq [24+Y],mm0
+      movq [32+Y],mm0
+      movq [40+Y],mm0
+      movq [48+Y],mm0
+      movq [56+Y],mm0
+      movq [64+Y],mm0
+      movq [72+Y],mm0
+      movq [80+Y],mm0
+      movq [88+Y],mm0
+      movq [96+Y],mm0
+      movq [104+Y],mm0
+      movq [112+Y],mm0
+      movq [120+Y],mm0
+#undef Y
+#undef P
+    }
+  }
+  else{
+    /*Dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_mmx(_dct_coeffs,_last_zzi);
+  }
+  /*Fill in the target buffer.*/
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  mb_mode=_state->frags[_fragi].mb_mode;
+  ystride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
+  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs);
+  else{
+    const unsigned char *ref;
+    int                  mvoffsets[2];
+    ref=
+     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
+     +frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
+      oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+       _dct_coeffs);
+    }
+    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
+  }
+}
+
+/*We copy these entire functions in order to inline the actual MMX routines, so
+   that we use only a single indirect call.*/
+
+/*Copies each fragment named in a list of fragment indices from one reference
+   frame into another, 8x8 pixels at a time.
+  _fragis:    The list of fragment indices to copy.
+  _nfragis:   How many indices _fragis contains.
+  _dst_frame: Which reference frame receives the pixels.
+  _src_frame: Which reference frame supplies the pixels.
+  _pli:       The color plane containing the fragments.*/
+void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
+ int _dst_frame,int _src_frame,int _pli){
+  const unsigned char *from_data;
+  unsigned char       *to_data;
+  const ptrdiff_t     *buf_offs;
+  ptrdiff_t            i;
+  int                  ystride;
+  buf_offs=_state->frag_buf_offs;
+  ystride=_state->ref_ystride[_pli];
+  from_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
+  to_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
+  for(i=0;i<_nfragis;i++){
+    ptrdiff_t buf_off;
+    buf_off=buf_offs[_fragis[i]];
+#define YSTRIDE3 edi
+#define YSTRIDE ecx
+#define DST eax
+#define SRC edx
+    OC_FRAG_COPY_MMX(to_data+buf_off,
+     from_data+buf_off,ystride);
+#undef YSTRIDE3
+#undef YSTRIDE
+#undef DST
+#undef SRC
+  }
+}
+
+/*Runs the loop filter over a range of fragment rows in one plane.
+  Filtering the bottom edge of a row touches pixels in the next row of
+   fragments, so that row must be available as well.
+  _bv:        The bounding values array (not referenced by this version).
+  _refi:      The index of the frame buffer to filter.
+  _pli:       The color plane to filter.
+  _fragy0:    The Y coordinate of the first fragment row to filter.
+  _fragy_end: The Y coordinate of the fragment row at which filtering stops.*/
+void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
+ int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
+  OC_ALIGN8(unsigned char  ll[8]);
+  const oc_fragment_plane *plane;
+  const oc_fragment       *frags;
+  const ptrdiff_t         *buf_offs;
+  unsigned char           *frame_data;
+  ptrdiff_t                plane_first;
+  ptrdiff_t                plane_end;
+  ptrdiff_t                row_first;
+  ptrdiff_t                rows_end;
+  int                      ystride;
+  int                      row_len;
+  frags=_state->frags;
+  buf_offs=_state->frag_buf_offs;
+  frame_data=_state->ref_frame_data[_refi];
+  ystride=_state->ref_ystride[_pli];
+  plane=&_state->fplanes[_pli];
+  row_len=plane->nhfrags;
+  plane_first=plane->froffset;
+  plane_end=plane_first+plane->nfrags;
+  row_first=plane_first+_fragy0*(ptrdiff_t)row_len;
+  rows_end=row_first+(_fragy_end-_fragy0)*(ptrdiff_t)row_len;
+  memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
+  /*The loops below are deliberately arranged in a somewhat non-intuitive way.
+    The basic rule: a block boundary is filtered whenever at least one coded
+     fragment lies on it.
+    The order in which the filters run matters, though, and VP3 chose the
+     somewhat strange ordering used here.*/
+  for(;row_first<rows_end;row_first+=row_len){
+    ptrdiff_t fragi;
+    ptrdiff_t row_end;
+    row_end=row_first+row_len;
+    for(fragi=row_first;fragi<row_end;fragi++){
+      unsigned char *ref;
+      if(!frags[fragi].coded)continue;
+      ref=frame_data+buf_offs[fragi];
+#define D_WORD si
+#define D esi
+#define LL edx
+#define YSTRIDE ecx
+#define YSTRIDE3 edi
+#define PIX eax
+      /*Left edge, unless this is the first fragment in the row.*/
+      if(fragi>row_first)OC_LOOP_FILTER_H_MMX(ref,ystride,ll);
+      /*Top edge, unless this is the first row of the plane.*/
+      if(row_first>plane_first)OC_LOOP_FILTER_V_MMX(ref,ystride,ll);
+      /*Right edge, but only when the right neighbor is not coded.*/
+      if(fragi+1<row_end&&!frags[fragi+1].coded){
+        OC_LOOP_FILTER_H_MMX(ref+8,ystride,ll);
+      }
+      /*Bottom edge, but only when the fragment below is not coded.*/
+      if(fragi+row_len<plane_end&&!frags[fragi+row_len].coded){
+        OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,ll);
+      }
+#undef D_WORD
+#undef D
+#undef LL
+#undef YSTRIDE
+#undef YSTRIDE3
+#undef PIX
+    }
+  }
+}
+
+#endif
diff --git a/lib/x86_vc/x86enc.c b/lib/x86_vc/x86enc.c
new file mode 100644
index 0000000..e1960e1
--- /dev/null
+++ b/lib/x86_vc/x86enc.c
@@ -0,0 +1,49 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86state.c 15675 2009-02-06 09:43:27Z tterribe $
+
+ ********************************************************************/
+#include "x86enc.h"
+
+#if defined(OC_X86_ASM)
+
+#include "../cpu.c"
+
+/*Calls oc_enc_vtable_init_c(), then overrides entries per CPU flag.*/
+void oc_enc_vtable_init_x86(oc_enc_ctx *_enc){
+  ogg_uint32_t flags=oc_cpu_flags_get();
+  oc_enc_vtable_init_c(_enc);
+  if((flags&OC_CPU_X86_MMX)!=0){
+    _enc->opt_vtable.frag_sub=oc_enc_frag_sub_mmx;
+    _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
+    _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
+    _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
+    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmx;
+  }
+  if((flags&OC_CPU_X86_MMXEXT)!=0){
+    _enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
+    _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext;
+    _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext;
+    _enc->opt_vtable.frag_satd_thresh=oc_enc_frag_satd_thresh_mmxext;
+    _enc->opt_vtable.frag_satd2_thresh=oc_enc_frag_satd2_thresh_mmxext;
+    _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext;
+    _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
+  }
+# if defined(OC_X86_64_ASM) /*SSE2 fdct8x8 then replaces the MMX one.*/
+  if((flags&OC_CPU_X86_SSE2)!=0){
+    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;
+  }
+# endif
+}
+#endif
diff --git a/lib/x86_vc/x86enc.h b/lib/x86_vc/x86enc.h
new file mode 100644
index 0000000..5814846
--- /dev/null
+++ b/lib/x86_vc/x86enc.h
@@ -0,0 +1,47 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 15675 2009-02-06 09:43:27Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_x86_vc_x86enc_H)
+# define _x86_vc_x86enc_H (1)
+# include "../encint.h"
+# include "x86int.h"
+
+void oc_enc_vtable_init_x86(oc_enc_ctx *_enc);
+/*x86 routines that oc_enc_vtable_init_x86() installs in opt_vtable.*/
+unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh);
+unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh);
+unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh);
+unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh);
+unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,int _ystride);
+void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64],
+ const unsigned char *_x,const unsigned char *_y,int _stride);
+void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
+ const unsigned char *_x,int _stride);
+void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride);
+void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+
+#endif
diff --git a/lib/x86_vc/x86int.h b/lib/x86_vc/x86int.h
new file mode 100644
index 0000000..4cca485
--- /dev/null
+++ b/lib/x86_vc/x86int.h
@@ -0,0 +1,42 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#if !defined(_x86_vc_x86int_H)
+# define _x86_vc_x86int_H (1)
+# include "../internal.h"
+
+void oc_state_vtable_init_x86(oc_theora_state *_state);
+/*MMX routines that oc_state_vtable_init_x86() installs in opt_vtable.*/
+void oc_frag_copy_mmx(unsigned char *_dst,
+ const unsigned char *_src,int _ystride);
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_mmx(unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi);
+void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
+ int _dst_frame,int _src_frame,int _pli);
+void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
+ int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+void oc_restore_fpu_mmx(void);
+
+#endif
diff --git a/lib/x86_vc/x86state.c b/lib/x86_vc/x86state.c
new file mode 100644
index 0000000..a786bec
--- /dev/null
+++ b/lib/x86_vc/x86state.c
@@ -0,0 +1,62 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86state.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#include "x86int.h"
+
+#if defined(OC_X86_ASM)
+
+#include "../cpu.c"
+
+/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
+   each quadrant of the destination; entries 64...127 are all set to 64.*/
+static const unsigned char OC_FZIG_ZAG_MMX[128]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3,32,11,18,25, 4,12,
+   5,26,19,40,33,34,41,48,
+  27, 6,13,20,28,21,14, 7,
+  56,49,42,35,43,50,57,36,
+  15,22,29,30,23,44,37,58,
+  51,59,38,45,52,31,60,53,
+  46,39,47,54,61,62,55,63,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+};
+/*Saves the CPU flags in _state and installs MMX routines when available.*/
+void oc_state_vtable_init_x86(oc_theora_state *_state){
+  _state->cpu_flags=oc_cpu_flags_get();
+  if(!(_state->cpu_flags&OC_CPU_X86_MMX))oc_state_vtable_init_c(_state);
+  else{ /*MMX path also selects the transposed zig-zag table.*/
+    _state->opt_vtable.frag_copy=oc_frag_copy_mmx;
+    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
+    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
+    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
+    _state->opt_vtable.idct8x8=oc_idct8x8_mmx;
+    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
+    _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     oc_state_loop_filter_frag_rows_mmx;
+    _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
+    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
+  }
+}
+#endif