diff -NaHudr dosbox-0.73/src/dos/dos.cpp dosbox-0.73-patched/src/dos/dos.cpp
--- dosbox-0.73/src/dos/dos.cpp	2009-05-25 21:44:45.000000000 +0300
+++ dosbox-0.73-patched/src/dos/dos.cpp	2010-04-04 00:55:51.620788494 +0300
@@ -32,6 +32,12 @@
 #include "support.h"
 #include "serialport.h"
 
+#include <pty.h>   /* FOR BISQWIT LINUX SHELL SUPPORT */
+#include <fcntl.h> /* FOR BISQWIT LINUX SHELL SUPPORT */
+#include <signal.h> /* FOR BISQWIT LINUX SHELL SUPPORT */
+#include <sys/errno.h> /* FOR BISQWIT LINUX SHELL SUPPORT */
+#include <sys/wait.h> /* FOR BISQWIT LINUX SHELL SUPPORT */
+
 DOS_Block dos;
 DOS_InfoBlock dos_infoblock;
 
@@ -755,6 +761,77 @@
 //TODO Think hard how shit this is gonna be
 //And will any game ever use this :)
 	case 0x53:					/* Translate BIOS parameter block to drive parameter block */
+		/* BISQWIT LINUX SHELL SUPPORT */
+		{
+		    switch(reg_al)
+		    {
+		        case 0: /* open: CX=rows, DX=cols -> BX=pty fd, CX=child pid (both -1 if forkpty fails) */
+		        {
+		            int fd = -1, pid = -1;
+					struct winsize ws;
+					memset(&ws, 0, sizeof ws);
+					ws.ws_row = reg_cx;
+					ws.ws_col = reg_dx;
+					/* Fork a child attached to a new pseudo-terminal of the requested size. */
+					pid = forkpty(&fd, NULL, NULL, &ws);
+					if(!pid)
+					{
+						const char* shell = getenv("SHELL"); if(!shell) shell = "/bin/sh";
+						setenv("TERM", "linux", 1); execl(shell, shell, (char*)NULL);
+						_exit(127); /* exec failed: the child must never go on running the emulator */
+					}
+					if(pid > 0) fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_NONBLOCK); /* parent side reads are non-blocking */
+					reg_bx = fd;
+					reg_cx = pid;
+					break;
+		        }
+		        case 1: /* close: BX=pty fd, CX=child pid; pid 0 is refused because kill(0) would signal our own process group */
+		        {
+		            if(reg_cx != 0) kill(reg_cx, SIGKILL);
+		            close(reg_bx);
+		            if(reg_cx != 0) waitpid(reg_cx, NULL, 0);
+		            break;
+		        }
+		        case 2: /* send: BX=fd, CX=byte count, DS:DX=source buffer -> CX=result of write() */
+		        {
+		            int fd = reg_bx; Bit16u towrite = reg_cx;
+		            unsigned char* bisqdata = new unsigned char[towrite];
+		            MEM_BlockRead(SegPhys(ds)+reg_dx, bisqdata, towrite);
+		            int sent = write(fd, bisqdata, towrite); reg_cx = sent;
+		            fprintf(stderr, "Sent <%.*s>\n", sent > 0 ? sent : 0, bisqdata); /* clamp: a failed write must not print past the buffer */
+		            delete[] bisqdata;
+		            break;
+		        }
+		        case 3: /* read: BX=fd, CX=max bytes, DS:DX=destination -> CX=bytes read, DX=errno or 0 */
+		        {
+		            int fd = reg_bx; Bit16u toread = reg_cx;
+		            if(toread > 0) /* a zero-length request leaves every register untouched */
+		            {
+						unsigned char* bisqdata = new unsigned char[toread];
+						int r = read(fd, bisqdata, toread);
+						if(r < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
+						    r = 0; /* "no data yet" is reported as 0 bytes, not as an error */
+						if(r > 0) MEM_BlockWrite(SegPhys(ds)+reg_dx, bisqdata, r);
+						reg_cx = r;
+						reg_dx = r < 0 ? errno : 0;
+						if(r > 0) fprintf(stderr, "Read <%.*s>\n", r, bisqdata); /* debug trace of the received bytes */
+						delete[] bisqdata;
+		            }
+		            break;
+		        }
+		        case 4: /* resize: BX=pty fd, CX=rows, DX=cols */
+		        {
+		            struct winsize ws;
+		            memset(&ws, 0, sizeof(ws));
+		            ws.ws_row = reg_cx;
+		            ws.ws_col = reg_dx;
+		            ioctl(reg_bx, TIOCSWINSZ, &ws); /* the result is not checked */
+		            break;
+		        }
+		    }
+		    break;
+		}
+		/* END BISQWIT SSH SUPPORT */
 		E_Exit("Unhandled Dos 21 call %02X",reg_ah);
 		break;
 	case 0x54:					/* Get verify flag */
diff -NaHudr dosbox-0.73/src/gui/sdlmain.cpp dosbox-0.73-patched/src/gui/sdlmain.cpp
--- dosbox-0.73/src/gui/sdlmain.cpp	2009-05-25 21:44:46.000000000 +0300
+++ dosbox-0.73-patched/src/gui/sdlmain.cpp	2010-07-07 19:37:28.404438512 +0300
@@ -51,6 +51,9 @@
 #include "cross.h"
 #include "control.h"
 
+#include "../nesvideos-piece.hh"
+#include "hardware.h"
+
 //#define DISABLE_JOYSTICK
 
 #if C_OPENGL
@@ -1647,6 +1650,8 @@
 
 //extern void UI_Init(void);
 int main(int argc, char* argv[]) {
+    bool WillLaunchVideoLog = false;
+    
 	try {
 		CommandLine com_line(argc,argv);
 		Config myconf(&com_line);
@@ -1659,6 +1664,8 @@
 		if(control->cmdline->FindString("-editconf",editor,false)) launcheditor();
 		if(control->cmdline->FindString("-opencaptures",editor,true)) launchcaptures(editor);
 		if(control->cmdline->FindExist("-eraseconf")) eraseconfigfile();
+		
+		WillLaunchVideoLog = control->cmdline->FindExist("-videolog");
 
 		/* Can't disable the console with debugger enabled */
 #if defined(WIN32) && !(C_DEBUG)
@@ -1818,8 +1825,19 @@
 		MAPPER_Init();
 		if (control->cmdline->FindExist("-startmapper")) MAPPER_Run(false);
 		/* Start up main machine */
+
+        if(WillLaunchVideoLog)
+        {
+            /* -videolog: the encoder command is taken from $VIDEOLOG; an unset variable becomes "" instead of NULL */
+            const char* videocmd = getenv("VIDEOLOG"); if(!videocmd) videocmd = "";
+            fprintf(stderr, "Videolog enabled with env VIDEOLOG=%s\n", videocmd);
+            LoggingEnabled=2; CaptureState |= CAPTURE_VIDEO | CAPTURE_WAVE; NESVideoSetVideoCmd(videocmd);
+        }
+
 		control->StartUp();
 		/* Shutdown everything */
+
+
 	} catch (char * error) {
 		GFX_ShowMsg("Exit to error: %s",error);
 		fflush(NULL);
diff -NaHudr dosbox-0.73/src/hardware/hardware.cpp dosbox-0.73-patched/src/hardware/hardware.cpp
--- dosbox-0.73/src/hardware/hardware.cpp	2009-05-25 21:44:46.000000000 +0300
+++ dosbox-0.73-patched/src/hardware/hardware.cpp	2010-08-22 12:01:15.916745179 +0300
@@ -31,6 +31,8 @@
 #include "render.h"
 #include "cross.h"
 
+#include "../nesvideos-piece.hh"
+
 #if (C_SSHOT)
 #include <png.h>
 #include "../libs/zmbv/zmbv.cpp"
@@ -38,7 +40,7 @@
 
 static std::string capturedir;
 extern const char* RunningProgram;
-Bitu CaptureState;
+Bitu CaptureState = 0;
 
 #define WAVE_BUF 16*1024
 #define MIDI_BUF 4*1024
@@ -300,6 +302,202 @@
 #endif
 
 void CAPTURE_AddImage(Bitu width, Bitu height, Bitu bpp, Bitu pitch, Bitu flags, float fps, Bit8u * data, Bit8u * pal) {
+    {
+
+        static float OldFPS = 0;
+        if(fps != OldFPS)
+        {
+            fprintf(stderr, "\33[1m%d: FPS reported now as %g\33[m\n", getpid(), (double)fps); 
+            OldFPS = fps;
+        }
+        
+        Bit8u* origdata = data;
+        Bit8u* curdata  = data;
+
+        const unsigned FrameShift = 3;
+        static unsigned FrameCounter = 0;
+
+        //fprintf(stderr, "Getting %u\n", FrameCounter);fflush(stderr);
+
+        /*if(width == 320 && height == 200)
+        {
+            static Bit8u x640x400[FrameShift][640*400];
+            data = x640x400[FrameCounter];
+            for(unsigned p=0,y=0; y<200; ++y)
+            {
+                for(unsigned d=y*2*640, x=0; x<320; ++x,++p, d+=2)
+                {
+                    unsigned char c = origdata[p];
+                    data[d+0] = c;
+                    data[d+1] = c;
+                }
+                memcpy(&data[(y*2+1)*640], &data[(y*2+0)*640], 640);
+            }
+            width=640; height=400;
+        }*/
+        if(width == 320 && height == 200)
+        {
+            static Bit8u x640x200[FrameShift][640*200];
+            data = x640x200[FrameCounter];
+            for(unsigned p=0,y=0; y<200; ++y)
+            {
+                for(unsigned d=y*640, x=0; x<320; ++x,++p, d+=2)
+                {
+                    unsigned char c = origdata[p];
+                    data[d+0] = c;
+                    data[d+1] = c;
+                }
+            }
+            width=640; height=200;
+            curdata = data;
+        }
+        if(width == 640 && (height == 200 || height == 400))
+        {
+            static Bit8u x1280x400[FrameShift][1280*400];
+            data = x1280x400[FrameCounter];
+            if(height == 200)
+            {
+				for(unsigned p=0,y=0; y<200; ++y)
+				{
+					for(unsigned d=y*2*1280, x=0; x<640; ++x,++p, d+=2)
+					{
+						unsigned char c = curdata[p];
+						data[d+0] = c;
+						data[d+1] = c;
+					}
+					memcpy(&data[(y*2+1)*1280],
+						   &data[(y*2+0)*1280], 1280);
+				}
+            }
+            else if(height == 400)
+            {
+				for(unsigned p=0,y=0; y<400; ++y)
+				{
+					for(unsigned d=y*1280, x=0; x<640; ++x,++p, d+=2)
+					{
+						unsigned char c = curdata[p];
+						data[d+0] = c;
+						data[d+1] = c;
+					}
+				}
+            }
+            width=1280; height=400;
+            /*
+static char TextBuf[] =
+{
+    "........&&&&&&...&&....&&&.&&&....&&&&&&\n"
+    "..........&&.....&&....&&.&.&&....&&&...\n"
+    "..........&&.....&&....&&...&&....&&&&&&\n"
+    "................................................................\n"
+    "..AAAAA.....HHHHH.......OOOOO.....aaaaa.......hhhhh.....ooooo...\n"
+    "BB.A.A.CC.II.H.H.JJ...PP.O.O.QQ.bb.a.a.cc...ii.h.h.jj.pp.o.o.qq.\n"
+    "BBB...CCC.III...JJJ...PPP...QQQ.bbb...ccc...iii...jjj.ppp...qqq.\n"
+    "BBB...CCC.III...JJJ.#.PPP...QQQ.bbb...ccc.#.iii...jjj.ppp...qqq.\n"
+    "BB.....CC.II.....JJ...PP.....QQ.bb.....cc...ii.....jj.pp.....qq.\n"
+    ".DDDDDDD...KKKKKKK.....RRRRRRR...ddddddd.....kkkkkkk...rrrrrrr..\n"
+    "EE.....GG.LL.....NN...SS.....UU.ee.....gg...ll.....nn.ss.....uu.\n"
+    "EEE...GGG.LLL...NNN.#.SSS...UUU.eee...ggg.#.lll...nnn.sss...uuu.\n"
+    "EEE...GGG.LLL...NNN...SSS...UUU.eee...ggg...lll...nnn.sss...uuu.\n"
+    "EE.F.F.GG.LL.M.M.NN...SS.T.T.UU.ee.f.f.gg...ll.m.m.nn.ss.t.t.uu.\n"
+    "..FFFFF.....MMMMM.......TTTTT.....fffff.......mmmmm.....ttttt...\n"
+};
+            static const char DigMask[7*10] =
+            { 1,1,1,0,1,1,1,
+              0,0,1,0,0,0,1,
+              1,0,1,1,1,1,0,
+              1,0,1,1,0,1,1,
+              0,1,1,1,0,0,1,
+              1,1,0,1,0,1,1,
+              1,1,0,1,1,1,1,
+              1,1,1,0,0,0,1,
+              1,1,1,1,1,1,1,
+              1,1,1,1,0,1,1 };
+            time_t t = time(NULL);
+            struct tm* tm = localtime(&t);
+            
+            unsigned digits[6] =
+            {
+                tm->tm_hour / 10,
+                tm->tm_hour % 10,
+                tm->tm_min / 10,
+                tm->tm_min % 10,
+                tm->tm_sec / 10,
+                tm->tm_sec % 10
+            };
+            char Enabled[256] = { 0 };
+            Enabled['#'] = 2;
+            Enabled['&'] = 3;
+            for(unsigned d=0; d<6; ++d)
+            for(unsigned c=0; c<7; ++c)
+                Enabled["ABCDEFG""HIJKLMN""OPQRSTU"
+                        "abcdefg""hijklmn""opqrstu"[d*7+c]]
+                         = DigMask[c + 7*digits[d]] ? 2 : 1;
+            unsigned x = 1200;
+            unsigned y = 360;
+            for(const char* s = TextBuf; *s; ++s)
+            {
+                bool cross = (x^y)&1;
+                if(*s == '\n') { ++y; x = 1200; continue; }
+                switch(Enabled[*s])
+                {
+                    case 1: data[y*1280+x] = cross?1:0; break;
+                    case 2: data[y*1280+x] = cross?11:15; break;
+                    case 3: data[y*1280+x] = cross?6:14; break;
+                    case 0: if(cross) data[y*1280+x] = 0; break;
+                }
+                ++x;
+            }*/
+        }
+        
+        static Bit8u* Frames[FrameShift];
+        //fprintf(stderr, "Got %u: %p\n", FrameCounter, data);fflush(stderr);
+		if(data == origdata)
+		{
+			fprintf(stderr, "This, I was not prepared for: %ux%u\n",
+				(unsigned)width, (unsigned)height);
+		}
+		Frames[FrameCounter] = data;
+		
+		if(++FrameCounter < FrameShift)return;
+        FrameCounter = 0;
+
+        //fprintf(stderr, "Plushing\n");fflush(stderr);
+        
+        Bit8u* nesvdata = data;
+        if(bpp == 8)
+        {
+            nesvdata = new Bit8u[width*height*3];
+            for(unsigned p=0; p<width*height; ++p)
+            {
+                { unsigned v = 0;
+                  for(unsigned f=0; f<FrameShift; ++f)
+                      v += pal[Frames[f][p]*4+2];
+                  nesvdata[p*3+0] = v / FrameShift; }
+
+                { unsigned v = 0;
+                  for(unsigned f=0; f<FrameShift; ++f)
+                      v += pal[Frames[f][p]*4+1];
+                  nesvdata[p*3+1] = v / FrameShift; }
+
+                { unsigned v = 0;
+                  for(unsigned f=0; f<FrameShift; ++f)
+                      v += pal[Frames[f][p]*4+0];
+                  nesvdata[p*3+2] = v / FrameShift; }
+            }
+        }
+        
+        static unsigned length_frames = 0;
+        ++length_frames;
+        fprintf(stderr, "Movie length: %6.2f seconds\r",
+            length_frames*FrameShift / fps); fflush(stderr);
+
+        NESVideoLoggingVideo(nesvdata, width, height,
+            (fps/FrameShift)*(1<<24), bpp==8 ? 24 : bpp);
+
+        if(bpp == 8) delete[] nesvdata;
+    }
+    return;
+    
 #if (C_SSHOT)
 	Bitu i;
 	Bit8u doubleRow[SCALER_MAXWIDTH*4];
@@ -576,6 +774,10 @@
 };
 
 void CAPTURE_AddWave(Bit32u freq, Bit32u len, Bit16s * data) {
+    //fprintf(stderr, "AddWave %u\n", (unsigned)freq);
+    NESVideoLoggingAudio(data, freq, 16, 2, len);
+    return;
+
 #if (C_SSHOT)
 	if (CaptureState & CAPTURE_VIDEO) {
 		Bitu left = WAVE_BUF - capture.video.audioused;
@@ -735,6 +937,7 @@
 		Prop_path* proppath= section->Get_path("captures");
 		capturedir = proppath->realpath;
 		CaptureState = 0;
+		fprintf(stderr, "CaptureState set to %u\n", (unsigned)CaptureState); /* cast: %u expects unsigned, CaptureState is a Bitu */
 		MAPPER_AddHandler(CAPTURE_WaveEvent,MK_f6,MMOD1,"recwave","Rec Wave");
 		MAPPER_AddHandler(CAPTURE_MidiEvent,MK_f8,MMOD1|MMOD2,"caprawmidi","Cap MIDI");
 #if (C_SSHOT)
diff -NaHudr dosbox-0.73/src/Makefile.am dosbox-0.73-patched/src/Makefile.am
--- dosbox-0.73/src/Makefile.am	2009-04-28 10:02:37.000000000 +0300
+++ dosbox-0.73-patched/src/Makefile.am	2010-04-04 01:30:39.280785772 +0300
@@ -11,10 +11,12 @@
 .rc.o:
 	$(WINDRES) -o $@ $<
 
-dosbox_SOURCES = dosbox.cpp $(ico_stuff)
+dosbox_SOURCES = dosbox.cpp nesvideos-piece.cc rgbtorgb.cc $(ico_stuff)
 dosbox_LDADD = cpu/libcpu.a debug/libdebug.a dos/libdos.a fpu/libfpu.a  hardware/libhardware.a gui/libgui.a \
                ints/libints.a misc/libmisc.a shell/libshell.a hardware/serialport/libserial.a libs/gui_tk/libgui_tk.a
 
 EXTRA_DIST = winres.rc dosbox.ico
 
 
+dosbox_LDADD += -lutil -lgd -lx264
+
diff -NaHudr dosbox-0.73/src/nesvideos-piece.cc dosbox-0.73-patched/src/nesvideos-piece.cc
--- dosbox-0.73/src/nesvideos-piece.cc	1970-01-01 02:00:00.000000000 +0200
+++ dosbox-0.73-patched/src/nesvideos-piece.cc	2010-07-07 20:50:34.588435122 +0300
@@ -0,0 +1,1295 @@
+#define THREAD_SAFETY
+
+#include <cmath>
+#include <string>
+#include <vector>
+#include <deque>
+#include <list>
+#include <map>
+
+#include <unistd.h>   // mknod, unlink, write
+#include <stdio.h>
+#include <sys/stat.h> // S_IFIFO
+#include <fcntl.h>    // fcntl
+#include <sys/poll.h> // poll
+#include <stdlib.h>   // setenv
+#include <string.h>   // strrchr
+#include <sys/file.h> // flock
+#include <errno.h>
+#include <glob.h>
+
+#include <stdint.h>
+
+#include <gd.h>
+
+extern "C" {
+#include <x264.h>
+int x264_nal_encode( uint8_t *dst, int b_annexb, x264_nal_t *nal );
+
+}
+
+/* Note: This module assumes everyone uses BGR32 as display depth */
+
+//#define LOGO_LENGTH_HEADER  (1.2)
+//#define LOGO_LENGTH_OVERLAP (10.0-LOGO_LENGTH_HEADER)
+//#define LOGO_LENGTH_HEADER  (1.1)
+//#define LOGO_LENGTH_OVERLAP (6.00-LOGO_LENGTH_HEADER)
+//#define LOGO_LENGTH_OVERLAP (5.40-LOGO_LENGTH_HEADER)
+//#define LOGO_LENGTH_OVERLAP (3-LOGO_LENGTH_HEADER)
+//#define LOGO_LENGTH_HEADER  (1.5)
+#define LOGO_LENGTH_OVERLAP (0)
+#define LOGO_LENGTH_HEADER (0)
+
+static std::string VIDEO_CMD = "";
+/*
+-rawvideo on:fps=60:format=0x42475220:w=256:h=224:size=$[1024*224]
+-audiofile "+AUDIO_FN+"
+*/
+static std::string AUDIO_FN = "s3.log";
+
+static bool Terminate=false;
+static unsigned videonumber = 0;
+
+unsigned long long A_sent = 0;
+unsigned long long V_sent = 0;
+
+#ifdef THREAD_SAFETY
+# include <pthread.h>
+static pthread_mutex_t APIlock = PTHREAD_MUTEX_INITIALIZER; /* the one mutex every ScopedLock instance uses */
+struct ScopedLock /* locks APIlock in the constructor and unlocks it in the destructor */
+{ ScopedLock() { 
+                 pthread_mutex_lock(&APIlock);
+                 //fprintf(stderr, "audio start\n"); fflush(stderr);
+               }
+  ~ScopedLock() {
+                 //fprintf(stderr, "audio end\n"); fflush(stderr);
+                 pthread_mutex_unlock(&APIlock); }
+};
+#endif
+
+static unsigned NonblockWrite(FILE* fp, const unsigned char*buf, unsigned length)
+{
+    /* One write() on fp's descriptor per attempt; only EINTR causes another attempt.
+     * Returns the byte count written, or 0 when nothing could be written. */
+    for(;;)
+    {
+        const int written = write(fileno(fp), buf, length);
+        if(written != -1) return written;   /* success, possibly a short write */
+        if(errno == EINTR) continue;        /* interrupted by a signal: try again */
+        if(errno != EAGAIN)                 /* anything but "would block" is fatal */
+        {
+            perror("write");
+            Terminate = true;
+        }
+        return 0;
+    }
+}
+static int WaitUntilOneIsWritable(FILE*f1, FILE*f2, int whichmask = 3)
+{
+    /* Blocks until f1 and/or f2 accepts output. whichmask: 1 = watch f1 only, 2 = f2 only,
+     * anything else = both. Returns a bit mask: 1 if f1 is writable, 2 if f2 is writable. */
+    struct pollfd po[2] = { {fileno(f1),POLLOUT,0}, {fileno(f2),POLLOUT,0} };
+    pollfd* po_ptr = po;
+    unsigned po_n = 2;
+    
+    if(whichmask == 1) // f1 only
+        { po_n = 1; po[1].revents = 0; }
+    else if(whichmask == 2) // f2 only: exactly one entry starting at po[1], never past the array end
+        { po_ptr += 1; po_n = 1; po[0].revents = 0; }
+
+    poll(po_ptr, po_n, -1); /* no timeout; the return value is not checked */
+    return ((po[0].revents & POLLOUT) ? 1 : 0)
+         | ((po[1].revents & POLLOUT) ? 2 : 0);
+}
+
+#define BGR32 0x42475220  // BGR32 fourcc
+#define BGR24 0x42475218  // BGR24 fourcc
+#define BGR16 0x42475210  // BGR16 fourcc
+#define BGR15 0x4247520F  // BGR15 fourcc
+#define I420  0x30323449  // I420 fourcc
+#define YUY2  0x32595559  // YUY2 fourcc
+
+static unsigned USE_FOURCC = BGR32;
+static unsigned INPUT_BPP  = 32;
+
+#define u32(n) (n)&255,((n)>>8)&255,((n)>>16)&255,((n)>>24)&255
+#define u16(n) (n)&255,((n)>>8)&255
+#define s4(s) s[0],s[1],s[2],s[3]
+
+static const unsigned FPS_SCALE = 0x1000000;
+
+static struct Construct
+{
+    Construct()
+    {
+        /* Runs during static initialisation: prefixes AUDIO_FN with the current directory.
+         * If the directory cannot be determined, the relative name is kept as it is. */
+        char Buf[4096];
+        if(getcwd(Buf,sizeof(Buf))) AUDIO_FN = Buf + std::string("/") + AUDIO_FN;
+    }
+} Construct;
+
+namespace LogoInfo
+{
+    unsigned width;
+    unsigned height;
+
+    bool SentVideo = false;
+    bool SentAudio = false;
+    int OverlapSent = 0;
+}
+
+
+class AVI /* abstract sink for captured audio and video; implemented by NormalAVI and RerecordingAVI */
+{
+public:
+    AVI()          { }
+    virtual ~AVI() { }
+    /* NormalAVI stores r, b, c as sample rate, bits per sample and channel count. */
+    virtual void Audio
+        (unsigned r,unsigned b,unsigned c,
+         const unsigned char*d, unsigned nsamples) = 0;
+    /* NormalAVI stores w, h, f as width, height and the frame rate scaled by FPS_SCALE. */
+    virtual void Video
+        (unsigned w,unsigned h,unsigned f, const unsigned char*d) = 0;
+    
+    virtual void SaveState(const std::string&) { } /* default: does nothing */
+    virtual void LoadState(const std::string&) { } /* default: does nothing */
+};
+
+class NormalAVI: public AVI
+{
+    FILE* vidfp;
+    FILE* audfp;
+    
+    bool KnowVideo;
+    unsigned vid_width;
+    unsigned vid_height;
+    unsigned vid_fps_scaled;
+    std::list<std::vector<unsigned char> > VideoBuffer;
+    unsigned VidBufSize;
+    
+    bool KnowAudio;
+    unsigned aud_rate;
+    unsigned aud_chans;
+    unsigned aud_bits;
+    std::list<std::vector<unsigned char> > AudioBuffer;
+    unsigned AudBufSize;
+    
+public:
+    NormalAVI() :
+        vidfp(NULL),
+        audfp(NULL),
+        KnowVideo(false), VidBufSize(0),
+        KnowAudio(false), AudBufSize(0)
+    {
+    }
+    virtual ~NormalAVI()
+    {
+        /* Drain while both queues still hold data. Stop as soon as a write error has set
+         * Terminate: CheckFlushing() does nothing then and the loop would never finish. */
+        while(!Terminate && VidBufSize && AudBufSize)
+            CheckFlushing();
+        if(audfp) fclose(audfp);
+        if(vidfp) pclose(vidfp);
+        unlink(AUDIO_FN.c_str());
+    }
+    
+    virtual void Audio
+        (unsigned r,unsigned b,unsigned c,
+         const unsigned char*d, unsigned nsamples)
+    {   /* r=sample rate, b=bits per sample, c=channels; d holds nsamples sample frames */
+        if(Terminate) return;
+        if(!KnowAudio)
+        {
+            aud_rate = r;
+            aud_chans = c;
+            aud_bits = b;
+            KnowAudio = true;
+        }
+        CheckFlushing();
+        /* The format seen on the first call is latched; every call is sized with it. */
+        unsigned bytes = nsamples * aud_chans * (aud_bits / 8);
+
+#if 1
+        static FILE* ouf = 0; /* debug side channel: the raw stream is piped through ssh to a fixed host */
+        if(!ouf)
+            ouf = popen("lzop -F3 | ssh -c blowfish chii 'lzop -Fd > /mnt/gbatmp/audio_out.raw'", "w");
+        if(ouf) fwrite(d, 1, bytes, ouf); /* popen() can fail; drop the data rather than write to NULL */
+        return; /* everything below is unreachable while this #if is enabled */
+#endif
+        unsigned wrote = 0;
+        if(KnowVideo && AudioBuffer.empty())
+        {
+            //fprintf(stderr, "Writing %u of %s from %p to %p\t", bytes, "aud", (void*)d, (void*)audfp);
+            wrote = NonblockWrite(audfp, d, bytes);
+            //fprintf(stderr, "Wrote %u\n", wrote);
+            A_sent += wrote;
+        }
+        if(wrote < bytes) /* queue whatever could not be written immediately */
+        {
+            unsigned remain = bytes-wrote;
+            //fprintf(stderr, "Buffering %u of %s (%p..%p)\n", remain, "aud", d+wrote, d+bytes);
+            AudioBuffer.push_back(std::vector<unsigned char>(d+wrote, d+bytes));
+            AudBufSize += remain;
+        }
+        CheckFlushing();
+    }
+
+    virtual void Video
+        (unsigned w,unsigned h,unsigned f, const unsigned char*d)
+    {   /* w,h = frame size in pixels, f = frame rate scaled by FPS_SCALE, d = one frame of pixel data */
+        if(Terminate) return;
+        if(!KnowVideo)
+        {
+            vid_width      = w;
+            vid_height     = h;
+            vid_fps_scaled = f;
+            KnowVideo = true;
+        }
+        CheckFlushing();
+        /* Frame size comes from the geometry latched on the first call and from INPUT_BPP. */
+        unsigned bpp   = INPUT_BPP; if(bpp == 15 || bpp == 17) bpp = 16;
+        unsigned bytes = vid_width * vid_height * bpp / 8;
+        
+#if 1
+        static FILE* ouf = 0; /* debug side channel: the raw stream is piped through ssh to a fixed host */
+        if(!ouf)
+            ouf = popen("lzop -F3 | ssh -c blowfish chii 'lzop -Fd > /mnt/gbatmp/video_out.raw'", "w");
+        if(ouf) fwrite(d, 1, bytes, ouf); /* popen() can fail; drop the frame rather than write to NULL */
+        return; /* everything below is unreachable while this #if is enabled */
+#endif
+        //std::vector<unsigned char> tmp(bytes, 'k');
+        //d = &tmp[0];
+        
+        unsigned wrote = 0;
+        if(KnowAudio && VideoBuffer.empty())
+        {
+            CheckBegin();
+            //fprintf(stderr, "Writing %u of %s from %p to %p\t", bytes, "vid", (void*)d, (void*)vidfp);
+            wrote = NonblockWrite(vidfp, d, bytes);
+            //fprintf(stderr, "Wrote %u\n", wrote);
+            V_sent += wrote;
+        }
+        
+        if(wrote < bytes) /* queue whatever could not be written immediately */
+        {
+            unsigned remain = bytes-wrote;
+            //fprintf(stderr, "Buffering %u of %s (%p..%p)\n", remain, "vid", d+wrote, d+bytes);
+
+            VideoBuffer.push_back(std::vector<unsigned char>(d+wrote, d+bytes));
+            VidBufSize += remain;
+        }
+        CheckFlushing();
+    }
+
+private:
+    /* Writes the front chunk of List to fp with one non-blocking write and returns true if any
+     * bytes went out. fp is a reference because CheckBegin(), called here, may assign the stream. */
+    template<typename BufType>
+    bool FlushBufferSome(BufType& List, unsigned& Size, FILE*& fp, const char* what)
+    {
+        what=what;
+        
+    Retry:
+        if(List.empty() || Terminate) return false;
+        
+        if(List.begin()->empty()) { List.erase(List.begin()); goto Retry; } /* discard empty chunks */
+        
+        CheckBegin();
+
+        typename BufType::iterator i = List.begin();
+        std::vector<unsigned char>& buf = *i;
+        
+        unsigned bytes = buf.size();
+        
+        //fprintf(stderr, "Writing %u of %s from %p to %p\t", bytes, what, (void*)&buf[0], (void*)fp);
+        
+        unsigned ate = NonblockWrite(fp, &buf[0], bytes);
+        if(ate == 0) /* nothing written: NonblockWrite returns 0 for EAGAIN and for errors */
+            return false;
+
+        //fprintf(stderr, "Wrote %u\n", ate);
+        
+        if(what[0] == 'v') /* callers pass "vid" or "aud"; the first letter picks the counter */
+            V_sent += ate;
+        else
+            A_sent += ate;
+        
+        buf.erase(buf.begin(), buf.begin()+ate); /* keep only the unsent tail of the chunk */
+        
+        Size -= ate;
+        
+        if(buf.empty())
+        {
+            List.erase(i);
+        }
+        return true;
+    }
+
+    void CheckFlushing() /* pushes queued audio/video out; a no-op until both formats are known or once Terminate is set */
+    {
+        //AudioBuffer.clear();
+        //VideoBuffer.clear();
+        
+        if(KnowAudio && KnowVideo && !Terminate)
+        {
+        const int LogoFramesHeader  = (int)( (LOGO_LENGTH_HEADER  * 60)); /* not used in this function */
+        const int LogoFramesOverlap = (int)( (LOGO_LENGTH_OVERLAP * 60));
+        
+            unsigned nloops = 0;
+            while ((!AudioBuffer.empty() && !VideoBuffer.empty())
+                || (LogoInfo::OverlapSent >= LogoFramesOverlap
+                 && VideoBuffer.size() >= 50
+                 && nloops++ < 10))
+            {
+                /* vidfp = &1, audfp = &2 */
+                int attempt = WaitUntilOneIsWritable(vidfp, audfp,
+                    (AudioBuffer.empty() ? 0 : 2)
+                  | (VideoBuffer.empty() ? 0 : 1)
+                   );
+                
+                if(attempt <= 0) break; /* Some kind of error can cause this */
+                
+                // Flush Audio
+                if(attempt&2) FlushBufferSome(AudioBuffer, AudBufSize, audfp, "aud");
+
+                // Flush Video
+                if(attempt&1) FlushBufferSome(VideoBuffer, VidBufSize, vidfp, "vid");
+            }
+            /* Then keep writing each stream until a write makes no progress. */
+            while(FlushBufferSome(VideoBuffer, VidBufSize, vidfp, "vid")) {}
+            while(FlushBufferSome(AudioBuffer, AudBufSize, audfp, "aud")) {}
+
+            /*
+            fprintf(stderr, "Buffer Sizes: Audio %u(%u) video %u(%u) -- sent A=%llu, V=%llu\n",
+                (unsigned)AudioBuffer.size(), AudBufSize,
+                (unsigned)VideoBuffer.size(), VidBufSize,
+                A_sent, V_sent);
+            */
+        }
+    }
+    std::string GetMEncoderRawvideoParam() const /* builds the "fps=..:format=..:w=..:h=..:size=.." string used after -rawvideo */
+    {
+        char Buf[512];
+        unsigned bpp   = INPUT_BPP; if(bpp == 15 || bpp == 17) bpp = 16; /* 15 and 17 are sized as 16 bits per pixel */
+        sprintf(Buf, "fps=%g:format=0x%04X:w=%u:h=%u:size=%u",
+            vid_fps_scaled / (double)FPS_SCALE,
+            USE_FOURCC,
+            vid_width,
+            vid_height,
+            vid_width*vid_height * bpp/8); /* bytes per frame */
+        return Buf;
+    }
+    std::string GetMEncoderRawaudioParam() const /* builds the "channels=..:rate=..:samplesize=..:bitrate=.." string used after -rawaudio */
+    {
+        char Buf[512];
+        sprintf(Buf, "channels=%u:rate=%u:samplesize=%u:bitrate=%u",
+            aud_chans,
+            aud_rate,
+            aud_bits/8, /* bytes per sample */
+            aud_rate*aud_chans*(aud_bits/8) ); /* bytes per second */
+        return Buf;
+    }
+    std::string GetMEncoderCommand() const /* expands the placeholders in VIDEO_CMD and returns the command line to launch */
+    {
+        std::string mandatory = "-audiofile " + AUDIO_FN
+                              + " -audio-demuxer rawaudio"
+                              + " -demuxer rawvideo"
+                              + " -rawvideo " + GetMEncoderRawvideoParam()
+                              + " -rawaudio " + GetMEncoderRawaudioParam()
+                              ;
+        std::string cmd = VIDEO_CMD;
+
+        std::string::size_type p = cmd.find("NESV""SETTINGS"); /* only the first occurrence is replaced */
+        if(p != cmd.npos)
+            cmd = cmd.replace(p, 4+8, mandatory);
+        else
+            fprintf(stderr, "Warning: NESVSETTINGS not found in videocmd\n");
+        
+        char videonumstr[64];
+        sprintf(videonumstr, "%u", videonumber);
+        
+        for(;;) /* every occurrence of VIDEONUMBER is replaced with the decimal videonumber */
+        {
+            p = cmd.find("VIDEO""NUMBER");
+            if(p == cmd.npos) break;
+            cmd = cmd.replace(p, 5+6, videonumstr);
+        }
+        
+        fprintf(stderr, "Launch: %s\n", cmd.c_str()); fflush(stderr);
+        
+        return cmd;
+    }
+
+    void CheckBegin() /* lazily creates the audio FIFO, launches the encoder and opens both output streams */
+    {
+        if(!audfp)
+        {
+            unlink(AUDIO_FN.c_str());
+            mknod(AUDIO_FN.c_str(), S_IFIFO|0666, 0); /* recreate the FIFO the encoder reads audio from */
+        }
+        
+        if(!vidfp)
+        {
+            /* Note: popen does not accept b/t in mode param */
+            setenv("LD_PRELOAD", "", 1);
+            vidfp = popen(GetMEncoderCommand().c_str(), "w");
+            if(!vidfp)
+            {
+                perror("Launch failed");
+            }
+            else
+            {
+                fcntl(fileno(vidfp), F_SETFL, O_WRONLY | O_NONBLOCK);
+            }
+        }
+        
+        if(!audfp)
+        {
+        Retry:
+            audfp = fopen(AUDIO_FN.c_str(), "wb");
+            const int open_errno = errno; /* saved now because perror() below may overwrite errno */
+            if(!audfp)
+            {
+                perror(AUDIO_FN.c_str());
+                if(open_errno == ESTALE) goto Retry;
+            }
+            else
+            {
+                fcntl(fileno(audfp), F_SETFL, O_WRONLY | O_NONBLOCK);
+            }
+        }
+    }
+};
+
+class RerecordingAVI: public AVI
+{
+    std::map<std::string, std::pair<off_t, off_t> > FrameStates;
+    size_t aud_framesize;
+    size_t vid_framesize;
+    
+    FILE* vidfp;
+    FILE* audfp;
+    FILE* eventfp;
+    FILE* statefp;
+    /*
+    std::string vidfn;
+    std::string audfn;
+    std::string eventfn;
+    std::string statefn;
+    */
+    
+    x264_t*        x264;
+    x264_param_t   param;
+    bool           forcekey;
+    
+    class LockF /* takes an exclusive flock() on a stream's descriptor for the lifetime of the object */
+    {
+    public:
+        LockF(FILE* f) : fp(f) { flock(fileno(fp), LOCK_EX); }
+        ~LockF()               { flock(fileno(fp), LOCK_UN); }
+    private:
+        LockF(const LockF&);            /* declared but not defined here: copying is not intended */
+        LockF& operator=(const LockF&);
+        FILE* fp;
+    };
+    
+public:
+    RerecordingAVI(long FrameNumber) /* FrameNumber is not used in this constructor */
+        : aud_framesize(0),
+          vid_framesize(0),
+          vidfp(NULL), audfp(NULL), eventfp(NULL), statefp(NULL), /* the destructor tests these, so never leave them indeterminate */
+          x264(0), forcekey(true)
+    {
+        SetFn();
+    }
+    virtual ~RerecordingAVI()
+    {
+        if(eventfp && vidfp && audfp) /* the "End" record needs both stream positions; skip it if a stream is missing */
+        {
+            off_t vidpos = ftello(vidfp);
+            off_t audpos = ftello(audfp);
+            fprintf(eventfp,
+                "%llX %llX End\n",
+                (long long)vidpos, (long long)audpos);
+        }
+        if(vidfp) fclose(vidfp);
+        if(audfp) fclose(audfp);
+        if(eventfp) fclose(eventfp);
+        if(statefp) fclose(statefp);
+        
+        if(x264) x264_encoder_close(x264);
+    }
+
+    virtual void Audio /* appends raw samples to audfp; logs "AudFrameSize" to eventfp whenever the per-second byte size changes */
+        (unsigned aud_rate,unsigned aud_bits,unsigned aud_chans,
+         const unsigned char*data, unsigned nsamples)
+    {
+        size_t bytes = nsamples     * aud_chans * (aud_bits / 8); /* size of this call's data */
+        size_t framesize = aud_rate * aud_chans * (aud_bits / 8); /* bytes for one second of audio */
+        
+        if(framesize != aud_framesize)
+        {
+            aud_framesize = framesize;
+            LockF el(eventfp); /* NOTE(review): eventfp is not NULL-checked here, unlike in the destructor */
+            fprintf(eventfp, "AudFrameSize %lu\n", (unsigned long)aud_framesize);
+            fflush(eventfp);
+        }
+        
+        LockF al(audfp);
+        fwrite(data, 1, bytes, audfp);
+    }
+
+    virtual void Video
+        (unsigned vid_width,unsigned vid_height,
+         unsigned vid_fps_scaled, const unsigned char*data)
+    {
+        unsigned bpp   = INPUT_BPP; if(bpp == 15 || bpp == 17) bpp = 16;
+        size_t bytes = vid_width * vid_height * bpp / 8;
+        size_t framesize = bytes;
+
+        if(framesize != vid_framesize)
+        {
+            vid_framesize = framesize;
+            LockF el(eventfp);
+            fprintf(eventfp, "VidFrameSize %lu\n", (unsigned long)vid_framesize);
+            fflush(eventfp);
+        }
+
+        LockF vl(vidfp);
+        
+        if(bpp == 12) /* For I420, we use a local X264 encoder */
+        {
+            if(!x264)
+            {
+                x264_param_default(&param);
+                x264_param_parse(&param, "psnr", "no");
+                x264_param_parse(&param, "ssim", "no");
+                param.i_width  = vid_width;
+                param.i_height = vid_height;
+                param.i_csp    = X264_CSP_I420;
+                //param.i_scenecut_threshold = -1;
+                //param.b_bframe_adaptive     = 0;
+                //param.rc.i_rc_method      = X264_RC_CRF;
+                //param.rc.i_qp_constant    = 0;
+                x264_param_parse(&param, "me",       "dia");
+                x264_param_parse(&param, "crf",      "6");
+                x264_param_parse(&param, "frameref", "8");
+                param.i_frame_reference = 1;
+                param.analyse.i_subpel_refine = 1;
+                param.analyse.i_me_method = X264_ME_DIA;
+                /*
+                param.analyse.inter = 0;
+                param.analyse.b_transform_8x8 = 0;
+                param.analyse.b_weighted_bipred = 0;
+                param.analyse.i_trellis = 0;
+                */
+                //param.b_repeat_headers = 1; // guess this might be needed
+                
+                param.i_fps_num = vid_fps_scaled;
+                param.i_fps_den = 1 << 24;
+                
+                x264 = x264_encoder_open(&param);
+                if(!x264)
+                {
+                    fprintf(stderr, "x264_encoder_open failed.\n");
+                    goto raw_fallback;
+                }
+            }
+            
+            const size_t npixels = vid_width * vid_height;
+            x264_picture_t pic;
+            pic.i_type = forcekey ? X264_TYPE_IDR : X264_TYPE_AUTO;
+            pic.i_pts  = 0;
+            pic.i_qpplus1 = 0;
+            pic.img.i_csp = X264_CSP_I420;
+            pic.img.i_plane = 3;
+            pic.img.i_stride[0] = vid_width;
+            pic.img.i_stride[1] = vid_width / 2;
+            pic.img.i_stride[2] = vid_width / 2;
+            pic.img.plane[0] = const_cast<uint8_t*>(data) + npixels*0/4;
+            pic.img.plane[1] = const_cast<uint8_t*>(data) + npixels*4/4;
+            pic.img.plane[2] = const_cast<uint8_t*>(data) + npixels*5/4;
+            
+            x264_nal_t*    nal; int i_nal;
+            x264_picture_t pic_out;
+            if(x264_encoder_encode(x264, &nal, &i_nal, &pic, &pic_out) < 0)
+            {
+                fprintf(stderr, "x264_encoder_encode failed\n");
+                goto raw_fallback;
+            }
+            int i_size = 0;
+            for(int i=0; i<i_nal; ++i) i_size += nal[i].i_payload * 2 + 4;
+            std::vector<unsigned char> muxbuf(i_size);
+            i_size = 0;
+            for(int i=0; i<i_nal; ++i)
+            {
+                int room_required = nal[i].i_payload * 3/2 + 4;
+                if(muxbuf.size() < i_size + room_required)
+                    muxbuf.resize(i_size + room_required);
+                
+                int i_data = muxbuf.size() - i_size;
+                /*
+                i_size += x264_nal_encode(&muxbuf[i_size], &i_data, 1, &nal[i]);
+                */
+            }
+            if(i_size > 0)
+                fwrite(&muxbuf[0], 1, i_size, vidfp);
+        }
+        else
+        {
+        raw_fallback:
+            fwrite(data, 1, bytes, vidfp);
+        }
+
+        if(eventfp)
+        {
+            LockF el(eventfp);
+            off_t vidpos = ftello(vidfp);
+            off_t audpos = ftello(audfp);
+            fprintf(eventfp,
+                "%llX %llX Mark\n",
+                (long long)vidpos, (long long)audpos);
+            fflush(eventfp);
+        }
+    }
+    
+    /* Log a "Save" event with the current video/audio offsets, remember them
+     * under `slot`, persist the slot table and set forcekey. */
+    virtual void SaveState(const std::string& slot)
+    {
+        LockF el(eventfp);
+
+        const off_t vid_offset = ftello(vidfp);
+        const off_t aud_offset = ftello(audfp);
+        fprintf(eventfp, "%llX %llX Save %s\n",
+            (long long)vid_offset, (long long)aud_offset, slot.c_str());
+        fflush(eventfp);
+
+        FrameStates[slot] = std::make_pair(vid_offset, aud_offset);
+        WriteStates();
+
+        forcekey = true; /* read at the x264 encode step to request an IDR picture */
+    }
+    
+    /* Log a "Load" event: current stream offsets, then the offsets saved for
+     * `slot` (0 0 if unknown), and set forcekey.  find() is used so an unknown
+     * slot is not inserted into FrameStates and later persisted. */
+    virtual void LoadState(const std::string& slot)
+    {
+        LockF el(eventfp);
+
+        std::map<std::string, std::pair<off_t, off_t> >::const_iterator
+            it = FrameStates.find(slot);
+        const std::pair<off_t, off_t> old = (it != FrameStates.end())
+            ? it->second : std::pair<off_t, off_t>();
+        fprintf(eventfp, "%llX %llX Load %llX %llX %s\n",
+            (long long)ftello(vidfp), (long long)ftello(audfp),
+            (long long)old.first, (long long)old.second, slot.c_str());
+        fflush(eventfp);
+        forcekey = true;
+    }
+private:
+    /* Open the .vid/.aud/.log streams (append mode) and the .state slot table,
+     * all named after VIDEO_CMD; load the table and log a "Begin" event.
+     * fopen failures are reported with perror() rather than silently ignored;
+     * steps that need a missing stream are skipped. */
+    void SetFn()
+    {
+        const std::string vidfn   = VIDEO_CMD + ".vid";
+        const std::string audfn   = VIDEO_CMD + ".aud";
+        const std::string eventfn = VIDEO_CMD + ".log";
+        const std::string statefn = VIDEO_CMD + ".state";
+        if(!(vidfp   = fopen(vidfn.c_str(),   "ab+"))) perror(vidfn.c_str());
+        if(!(audfp   = fopen(audfn.c_str(),   "ab+"))) perror(audfn.c_str());
+        if(!(eventfp = fopen(eventfn.c_str(), "ab+"))) perror(eventfn.c_str());
+        if(!(statefp = fopen2(statefn.c_str(), "rb+", "wb+"))) perror(statefn.c_str());
+        if(statefp) ReadStates();
+
+        /* ftello() needs valid streams, so "Begin" is logged only when all opened. */
+        if(eventfp && vidfp && audfp)
+            fprintf(eventfp, "%llX %llX Begin\n",
+                (long long)ftello(vidfp), (long long)ftello(audfp));
+    }
+    /* fopen() with mode1, retrying with mode2 if that fails; NULL if both fail. */
+    static FILE* fopen2(const char* fn, const char* mode1, const char* mode2)
+    {
+        if(FILE* fp = fopen(fn, mode1)) return fp;
+        return fopen(fn, mode2);
+    }
+    /* Reload FrameStates from the state file: one "<vidpos> <audpos> <slot>" line
+     * (hex offsets) per slot, terminated by a line starting with '-'.  Lines that
+     * do not parse are skipped rather than stored from uninitialized buffers. */
+    void ReadStates()
+    {
+        LockF sl(statefp);
+        char Buf[4096], slotname[4096];
+        rewind(statefp);
+        FrameStates.clear();
+        while(fgets(Buf, sizeof(Buf), statefp) && *Buf != '-')
+        {
+            unsigned long long vidpos = 0, audpos = 0; /* %llX wants unsigned */
+            /* %s stops at whitespace, so the trailing "\r\n" needs no stripping. */
+            if(sscanf(Buf, "%llX %llX %4095s", &vidpos, &audpos, slotname) != 3) continue;
+            FrameStates[slotname] = std::pair<off_t,off_t> ((off_t)vidpos, (off_t)audpos);
+        }
+    }
+    /* Rewrite the state file from its start: one "<vidpos> <audpos> <slot>" hex
+     * line per FrameStates entry, then the "-" line at which ReadStates() stops. */
+    void WriteStates()
+    {
+        typedef std::map<std::string, std::pair<off_t, off_t> > StateMap;
+        LockF sl(statefp);
+        rewind(statefp);
+        for(StateMap::const_iterator it = FrameStates.begin(); it != FrameStates.end(); ++it)
+        {
+            const std::pair<off_t, off_t>& pos = it->second;
+            fprintf(statefp, "%llX %llX %s\n",
+                (long long) pos.first, (long long) pos.second, it->first.c_str());
+        }
+        fputs("-\n", statefp);
+        fflush(statefp);
+    }
+};
+
+
+static AVI* AVI = 0; /* current writer; created by GetAVIptr(), destroyed by NESVideoNextAVI() */
+
+#include "quantize.hh"
+#include "rgbtorgb.hh"
+
+static bool RerecordingMode = false; /* set by NESVideoSetRerecordingMode() */
+static long CurrentFrameNumber = 0; /* incremented by both NESVideoLoggingVideo() and NESVideoLoggingAudio() */
+
+extern "C"
+{
+    int LoggingEnabled = 0; /* 0=no, 1=yes, 2=recording! */
+
+    const char* NESVideoGetVideoCmd() /* returns VIDEO_CMD.c_str(); the storage stays owned by VIDEO_CMD */
+    {
+        return VIDEO_CMD.c_str();
+    }
+    /* Set VIDEO_CMD; NULL is treated as "" (assigning NULL to it would be UB). */
+    void NESVideoSetVideoCmd(const char *cmd)
+    {
+#ifdef THREAD_SAFETY
+        ScopedLock lock;
+#endif
+        VIDEO_CMD = cmd ? cmd : "";
+    }
+    
+    /* Enter rerecording mode at FrameNumber; when it is > 0 the logo header counts as sent. */
+    void NESVideoSetRerecordingMode(long FrameNumber)
+    {
+        const bool resuming = (FrameNumber > 0);
+        RerecordingMode       = true;
+        CurrentFrameNumber    = FrameNumber;
+        LogoInfo::SentVideo   = resuming; LogoInfo::SentAudio = resuming;
+        LogoInfo::OverlapSent = FrameNumber;
+    }
+    
+    /* Return the active AVI writer, creating it on first use: a RerecordingAVI
+     * when rerecording mode was requested, otherwise a NormalAVI. */
+    static class AVI& GetAVIptr()
+    {
+        if(AVI) return *AVI;
+        if(!RerecordingMode)
+        {
+            fprintf(stderr, "Starting new AVI (num %u)\n", videonumber);
+            AVI = new NormalAVI;
+        }
+        else
+        {
+            fprintf(stderr, "Beginning rerecording project at frame %ld\n", CurrentFrameNumber);
+            AVI = new RerecordingAVI(CurrentFrameNumber);
+        }
+        return *AVI;
+    }
+    
+    void NESVideoRerecordingSave(const char* slot) /* forwards to the active AVI's SaveState(); slot must not be NULL */
+    {
+        GetAVIptr().SaveState(slot);
+    }
+    
+    void NESVideoRerecordingLoad(const char* slot) /* forwards to the active AVI's LoadState(); slot must not be NULL */
+    {
+        GetAVIptr().LoadState(slot);
+    }
+    
+    /* Close the current AVI, if any, and bump videonumber so that the next
+     * GetAVIptr() call starts a new one. */
+    void NESVideoNextAVI()
+    {
+#ifdef THREAD_SAFETY
+        ScopedLock lock;
+#endif
+        if(!AVI) return;
+
+        fprintf(stderr, "Closing AVI (next will be started)\n");
+        delete AVI;
+        AVI = 0;
+        ++videonumber;
+    }
+
+    /* Blend the first 3 channels of source into target: alpha 0 keeps target, 255 copies source. */
+    static void Overlay32With32(unsigned char* target, const unsigned char* source, int alpha)
+    {
+        for(int ch = 0; ch < 3; ++ch)
+            target[ch] += (source[ch] - target[ch]) * alpha / 255;
+    }
+    
+    /* Alpha-blend the PNG file fn onto data, a LogoInfo::width x LogoInfo::height
+     * frame holding 3 bytes per pixel in blue, green, red order.
+     * An unreadable or non-truecolor file is reported on stderr and data is left
+     * untouched.  Only the area common to the image and the frame is blended, so
+     * an undersized PNG no longer causes reads outside the decoded image, and
+     * gdImageDestroy() is no longer reached with a NULL image. */
+    static void OverlayLogoFrom(const char* fn, std::vector<unsigned char>& data)
+    {
+        FILE*fp = fopen(fn, "rb");
+        if(!fp) { perror(fn); return; }
+
+        gdImagePtr im = gdImageCreateFromPng(fp);
+        if(!im)
+            fprintf(stderr, "'%s': Failed to open image\n", fn);
+        else if(!gdImageTrueColor(im))
+            fprintf(stderr, "'%s': Only true color images are supported\n", fn);
+        else
+        {
+            const unsigned new_width = gdImageSX(im);
+            const unsigned new_height= gdImageSY(im);
+
+            /* Up to 20 extra rows are tolerated; anything smaller is an error. */
+            if(new_width < LogoInfo::width || new_height < LogoInfo::height
+            || new_height > LogoInfo::height+20)
+            {
+                fprintf(stderr, "'%s': ERROR, expected %dx%d, got %dx%d\n", fn,
+                    LogoInfo::width, LogoInfo::height,
+                    new_width, new_height);
+            }
+
+            const unsigned use_w = std::min<unsigned>(LogoInfo::width,  new_width);
+            const unsigned use_h = std::min<unsigned>(LogoInfo::height, new_height);
+            for(unsigned y=0; y<use_h; ++y)
+            for(unsigned x=0; x<use_w; ++x)
+            {
+                const int color = gdImageTrueColorPixel(im, x,y);
+                /* gd alpha runs 0 (opaque) .. 127 (transparent): map to 255 .. 1 */
+                const int alpha = 255-gdTrueColorGetAlpha(color)*256/128;
+                const unsigned char pixbuf[4] = {
+                    (unsigned char)gdTrueColorGetBlue(color),
+                    (unsigned char)gdTrueColorGetGreen(color),
+                    (unsigned char)gdTrueColorGetRed(color), 0 };
+                Overlay32With32(&data[(y*LogoInfo::width+x)*3], pixbuf, alpha);
+            }
+        }
+
+        if(im) gdImageDestroy(im);
+        fclose(fp);
+    }
+    
+    static const std::string GetLogoFileName(unsigned frameno) /* path of the logo PNG to use for animation frame frameno */
+    {
+        std::string avdir = "/home/bisqwit/povray/nesvlogov5/"; /* NOTE(review): hard-coded, developer-specific directory */
+        //std::string avdir = "/home/bisqwit/povray/nesvlogov6/cv2/";
+        //std::string avdir = "/home/bisqwit/povray/nesvlogov6/kuros/";
+        
+        char AvName[512];
+        sprintf(AvName, "logo_%d_%d_f%03u.png",
+            LogoInfo::width,
+            LogoInfo::height,
+            frameno);
+        
+        std::string want = avdir + AvName; /* exact-size candidate; also the result when nothing better is found */
+        int ac = access(want.c_str(), R_OK);
+        if(ac != 0)
+        {
+            /* No readable file of the exact size? Look for the closest larger one. */
+            static std::map<int, std::vector<std::string> > files;
+            if(files.empty()) /* Cache the list of logo files (rescanned on each call while it stays empty). */
+            {
+                static const char GlobPat[] = "logo_*_*_f*.png";
+                glob_t globdata;
+                globdata.gl_offs = 0;
+                fprintf(stderr, "Loading list of usable logo animation files in %s...\n", avdir.c_str());
+                int globres = glob( (avdir + GlobPat).c_str(), GLOB_NOSORT, NULL, &globdata);
+                if(globres == 0)
+                {
+                    for(size_t n=0; n<globdata.gl_pathc; ++n)
+                    {
+                        const char* fn = globdata.gl_pathv[n];
+                        const char* slash = strrchr(fn, '/');
+                        if(slash) fn = slash+1; /* keep only the file name */
+                        
+                        int gotw=0, goth=0, gotf=0;
+                        sscanf(fn, "logo_%d_%d_f%d", &gotw,&goth,&gotf);
+                        files[gotf].push_back(fn); /* keyed by frame number; gotf stays 0 if the name did not parse */
+                    }
+                }
+                globfree(&globdata);
+            }
+            
+            std::map<int, std::vector<std::string> >::const_iterator
+                i = files.find(frameno);
+            if(i != files.end())
+            {
+                std::string best;
+                int bestdist = -1; /* -1 = no candidate yet */
+                
+                const std::vector<std::string>& fnames = i->second;
+                for(size_t b=fnames.size(), a=0; a<b; ++a)
+                {
+                    unsigned gotw=0, goth=0;
+                    sscanf(fnames[a].c_str(), "logo_%u_%u", &gotw,&goth);
+                    if(gotw < LogoInfo::width || goth < LogoInfo::height) continue; /* too small in either dimension */
+                    
+                    int dist = std::max(gotw - LogoInfo::width, /* larger of the two size excesses */
+                                        goth - LogoInfo::height);
+                    
+                    if(bestdist == -1 || dist < bestdist)
+                        { bestdist = dist; best = fnames[a]; }
+                }
+                
+                if(bestdist >= 0) want = avdir + best;
+            }
+        }
+        return want;
+    }
+    
+    /* Return the 24-bit frame logodata converted by Convert24To16Frame(). */
+    static const std::vector<unsigned char> NVConvert24To16Frame(const std::vector<unsigned char>& logodata)
+    {
+        std::vector<unsigned char> frame16(LogoInfo::width * LogoInfo::height * 2); /* 2 bytes per pixel */
+        Convert24To16Frame(&logodata[0], &frame16[0], LogoInfo::width * LogoInfo::height, LogoInfo::width);
+        return frame16;
+    }
+    /* Return the 24-bit frame logodata converted by Convert24ToR16Frame(). */
+    static const std::vector<unsigned char> NVConvert24ToR16Frame(const std::vector<unsigned char>& logodata)
+    {
+        std::vector<unsigned char> frame16(LogoInfo::width * LogoInfo::height * 2); /* 2 bytes per pixel */
+        Convert24ToR16Frame(&logodata[0], &frame16[0], LogoInfo::width * LogoInfo::height, LogoInfo::width);
+        return frame16;
+    }
+    /* Return the 24-bit frame logodata converted by Convert24To15Frame(). */
+    static const std::vector<unsigned char> NVConvert24To15Frame(const std::vector<unsigned char>& logodata)
+    {
+        std::vector<unsigned char> frame15(LogoInfo::width * LogoInfo::height * 2); /* 2 bytes per pixel */
+        Convert24To15Frame(&logodata[0], &frame15[0], LogoInfo::width * LogoInfo::height, LogoInfo::width);
+        return frame15;
+    }
+    
+    /* Return the 24-bit frame logodata converted by Convert24To_I420Frame(). */
+    static const std::vector<unsigned char> NVConvert24To_I420Frame(const std::vector<unsigned char>& logodata)
+    {
+        std::vector<unsigned char> planes(LogoInfo::width * LogoInfo::height * 3 / 2); /* 12 bits per pixel */
+        Convert24To_I420Frame(&logodata[0], &planes[0], LogoInfo::width * LogoInfo::height, LogoInfo::width);
+        return planes;
+    }
+    
+    /* 24-bit frame -> YUY2, a packed 4:2:2 format that needs 2 bytes per pixel. */
+    static const std::vector<unsigned char> NVConvert24To_YUY2Frame(const std::vector<unsigned char>& logodata)
+    {
+        std::vector<unsigned char> result(LogoInfo::width * LogoInfo::height * 2); /* was *3/2 (the I420 size): too small */
+        Convert24To_YUY2Frame(&logodata[0], &result[0], LogoInfo::width * LogoInfo::height, LogoInfo::width);
+        return result;
+    }
+    
+    /* Expand npixels pixels at data to 24 bits via Convert16To24Frame(..., true). */
+    static const std::vector<unsigned char> NVConvert16To24Frame(const void* data, unsigned npixels)
+    {
+        std::vector<unsigned char> frame24(npixels*3); /* zero-initialised, 3 bytes per pixel */
+        Convert16To24Frame(data, &frame24[0], npixels, true);
+        return frame24;
+    }
+    
+    /* Expand npixels pixels at data to 24 bits via Convert16To24Frame(..., false). */
+    static const std::vector<unsigned char> NVConvertR16To24Frame(const void* data, unsigned npixels)
+    {
+        std::vector<unsigned char> frame24(npixels*3); /* zero-initialised, 3 bytes per pixel */
+        Convert16To24Frame(data, &frame24[0], npixels, false);
+        return frame24;
+    }
+    
+    /* Expand npixels pixels at data to 24 bits via Convert15To24Frame(). */
+    static const std::vector<unsigned char> NVConvert15To24Frame(const void* data, unsigned npixels)
+    {
+        std::vector<unsigned char> frame24(npixels*3); /* zero-initialised, 3 bytes per pixel */
+        Convert15To24Frame(data, &frame24[0], npixels);
+        return frame24;
+    }
+    
+    /* Convert the I420 frame at data (npixels pixels, LogoInfo::width wide) to 24 bits. */
+    static const std::vector<unsigned char> NVConvert_I420To24Frame(const void* data, unsigned npixels)
+    {
+        std::vector<unsigned char> frame24(npixels*3); /* zero-initialised, 3 bytes per pixel */
+        Convert_I420To24Frame(data, &frame24[0], npixels, LogoInfo::width);
+        return frame24;
+    }
+    
+    /* Convert the YUY2 frame at data (npixels pixels, LogoInfo::width wide) to 24 bits. */
+    static const std::vector<unsigned char> NVConvert_YUY2To24Frame(const void* data, unsigned npixels)
+    {
+        std::vector<unsigned char> frame24(npixels*3); /* zero-initialised, 3 bytes per pixel */
+        Convert_YUY2To24Frame(data, &frame24[0], npixels, LogoInfo::width);
+        return frame24;
+    }
+    
+    /* While every frame so far has been one flat colour (e.g. gray on NES),
+     * point data at an all-zero frame of the same size so logo animations get a
+     * black background.  Once any frame shows two different 16-bit pixel values
+     * the substitution stops for good and the zero frame is released.
+     * data is read as LogoInfo::width * LogoInfo::height 16-bit pixels. */
+    static void SubstituteWithBlackIfNeeded(const void*& data)
+    {
+        static bool   Deviate     = false; /* latched once a non-flat frame is seen */
+        static short* Replacement = 0;     /* cached all-zero frame */
+        static unsigned wid=0, hei=0;      /* dimensions Replacement was built for */
+
+        if(Deviate)
+        {
+            /* Substitution is over; drop the cached frame (delete[] 0 is a no-op). */
+            delete[] Replacement;
+            Replacement = 0;
+            return;
+        }
+
+        const unsigned npix = LogoInfo::width * LogoInfo::height;
+        const short* const pixels = (const short*)data;
+        for(unsigned n = 1; n < npix; ++n)
+        {
+            if(pixels[n] == pixels[0]) continue;
+            Deviate = true;
+            return;
+        }
+
+        const bool size_changed = (wid != LogoInfo::width || hei != LogoInfo::height);
+        wid = LogoInfo::width;
+        hei = LogoInfo::height;
+        if(Replacement && size_changed)
+        {
+            delete[] Replacement;
+            Replacement = 0;
+        }
+        if(!Replacement)
+            Replacement = new short[npix](); /* () value-initialises every pixel to 0 */
+
+        data = (void*)Replacement;
+    }
+
+    void NESVideoLoggingVideo /* Log one video frame; the logo animation is sent before and blended over the first frames. */
+        (const void*data, unsigned width,unsigned height,
+         unsigned fps_scaled, /* frames per second, scaled by 1 << 24 */
+         unsigned bpp /* 32, 24, 16, 15, 12 (I420) or 17 (YUY2); 0 keeps the previous INPUT_BPP */
+        )
+    {
+        if(LoggingEnabled < 2) return; /* only active while recording */
+        
+        ++CurrentFrameNumber;
+        
+#ifdef THREAD_SAFETY
+        ScopedLock lock;
+#endif
+
+        if(bpp == 32) /* Convert 32 to 24 */
+        {
+            bpp = 24;
+            
+            static std::vector<unsigned char> VideoBuf; /* static: the buffer is reused across calls */
+            VideoBuf.resize(width*height * 3);
+            
+            Convert32To24Frame(data, &VideoBuf[0], width*height);
+            data = (void*)&VideoBuf[0];
+        }
+        
+        if(bpp) INPUT_BPP = bpp; /* bpp 0 keeps the format of the previous call */
+        
+        switch(INPUT_BPP)
+        {
+            case 32: USE_FOURCC = BGR32; break; /* a bpp argument of 32 has already been rewritten to 24 above */
+            case 24: USE_FOURCC = BGR24; break;
+            case 16: USE_FOURCC = BGR16; break;
+            case 15: USE_FOURCC = BGR15; break;
+            case 12: USE_FOURCC = I420; break;
+            case 17: USE_FOURCC = YUY2; break;
+        }
+        //USE_FOURCC = BGR24; // FIXME TEMPORARY
+        
+        const int LogoFramesHeader  = (int)( (LOGO_LENGTH_HEADER  * fps_scaled) / (1 << 24) ); /* logo-only frames sent before the source */
+        const int LogoFramesOverlap = (int)( (LOGO_LENGTH_OVERLAP * fps_scaled) / (1 << 24) ); /* frames with the logo blended over the source */
+        
+        LogoInfo::width  = width;
+        LogoInfo::height = height;
+        
+        if(INPUT_BPP == 16 || INPUT_BPP == 15)
+        {
+            SubstituteWithBlackIfNeeded(data); /* may repoint data at an all-zero frame */
+        }
+        else if(INPUT_BPP != 24 && INPUT_BPP != 12 && INPUT_BPP != 17)
+        { /* NOTE(review): the message below predates 15/12/17 support and prints bpp, not INPUT_BPP */
+            fprintf(stderr, "NESVIDEOS_PIECE only supports 16 and 24 bpp, you gave %u bpp\n",
+                bpp);
+            return;
+        }
+        
+        if(!LogoInfo::SentVideo)
+        {
+            /* Send animation frames that do not involve source video? */
+            LogoInfo::SentVideo=true;
+
+            if(LogoFramesHeader > 0)
+            {
+                for(int frame = 0; frame < LogoFramesHeader; ++frame)
+                {
+                    std::vector<unsigned char> logodata(width*height*3); /* filled with black. */
+                    
+                    std::string fn = GetLogoFileName(frame);
+                    /*fprintf(stderr, "wid=%d(%d), hei=%d(%d),fn=%s\n",
+                        width, LogoInfo::width,
+                        height, LogoInfo::height,
+                        fn.c_str());*/
+                    OverlayLogoFrom(fn.c_str(), logodata);
+                    
+                    //INPUT_BPP = 24; USE_FOURCC = BGR24; // FIXME TEMPORARY
+                    
+                    if(INPUT_BPP == 16) /* convert the 24-bit logo frame to the input format before sending */
+                    {
+                        std::vector<unsigned char> result = NVConvert24ToR16Frame(logodata);
+                        GetAVIptr().Video(width,height,fps_scaled, &result[0]);
+                    }
+                    else if(INPUT_BPP == 15)
+                    {
+                        std::vector<unsigned char> result = NVConvert24To15Frame(logodata);
+                        GetAVIptr().Video(width,height,fps_scaled, &result[0]);
+                    }
+                    else if(INPUT_BPP == 12)
+                    {
+                        std::vector<unsigned char> result = NVConvert24To_I420Frame(logodata);
+                        GetAVIptr().Video(width,height,fps_scaled, &result[0]);
+                    }
+                    else if(INPUT_BPP == 17)
+                    {
+                        std::vector<unsigned char> result = NVConvert24To_YUY2Frame(logodata);
+                        GetAVIptr().Video(width,height,fps_scaled, &result[0]);
+                    }
+                    else
+                    {
+                        GetAVIptr().Video(width,height,fps_scaled, &logodata[0]);
+                    }
+                }
+            }
+        }
+        
+        if(LogoInfo::OverlapSent < LogoFramesOverlap)
+        {
+            /* Send animation frames that mix source and animation? */
+
+            std::string fn = GetLogoFileName(LogoInfo::OverlapSent + LogoFramesHeader);
+            /*
+            fprintf(stderr, "wid=%d(%d), hei=%d(%d),fn=%s\n",
+                width, LogoInfo::width,
+                height, LogoInfo::height,
+                fn.c_str());*/
+
+            std::vector<unsigned char> logodata; /* the source frame brought to 24 bits for blending */
+            if(INPUT_BPP == 16)
+            {
+                logodata = NVConvertR16To24Frame(data, width*height);
+            }
+            else if(INPUT_BPP == 15)
+            {
+                logodata = NVConvert15To24Frame(data, width*height);
+            }
+            else if(INPUT_BPP == 17)
+            {
+                logodata = NVConvert_YUY2To24Frame(data, width*height);
+            }
+            else if(INPUT_BPP == 12)
+            {
+                logodata = NVConvert_I420To24Frame(data, width*height);
+            }
+            else
+            {
+                logodata.resize(width*height*3); /* 24 bpp input: copied as is below */
+                memcpy(&logodata[0], data, width*height*3);
+            }
+
+            OverlayLogoFrom(fn.c_str(), logodata);
+            
+            if(INPUT_BPP == 16) /* convert the blended frame back to the input format */
+            {
+                std::vector<unsigned char> result = NVConvert24ToR16Frame(logodata);
+                GetAVIptr().Video(width,height,fps_scaled, &result[0]);
+            }
+            else if(INPUT_BPP == 15)
+            {
+                std::vector<unsigned char> result = NVConvert24To15Frame(logodata);
+                GetAVIptr().Video(width,height,fps_scaled, &result[0]);
+            }
+            else if(INPUT_BPP == 12)
+            {
+                std::vector<unsigned char> result = NVConvert24To_I420Frame(logodata);
+                GetAVIptr().Video(width,height,fps_scaled, &result[0]);
+            }
+            else if(INPUT_BPP == 17)
+            {
+                std::vector<unsigned char> result = NVConvert24To_YUY2Frame(logodata);
+                GetAVIptr().Video(width,height,fps_scaled, &result[0]);
+            }
+            else
+            {
+                GetAVIptr().Video(width,height,fps_scaled, &logodata[0]);
+            }
+
+            ++LogoInfo::OverlapSent;
+            return; /* the blended frame replaces the plain source frame */
+        }
+        
+        GetAVIptr().Video(width,height,fps_scaled,  (const unsigned char*) data); /* normal case: frame passed through */
+    }
+
+    /* Append nsamples samples (nsamples*chans*(bits/8) bytes at data) to the
+     * recording.  Does nothing unless LoggingEnabled >= 2.
+     * rate/bits/chans describe the format: rate is multiplied by a length in
+     * seconds and bits/8 is used as the bytes per sample.
+     * Before the first real audio, LOGO_LENGTH_HEADER seconds worth of
+     * zero-filled samples are written -- presumably to keep the sound aligned
+     * with the logo header frames that NESVideoLoggingVideo() emits.
+     * NOTE(review): CurrentFrameNumber is incremented here as well as per video
+     * frame -- confirm that this is intended. */
+    void NESVideoLoggingAudio
+        (const void*data,
+         unsigned rate, unsigned bits, unsigned chans,
+         unsigned nsamples)
+    {
+        if(LoggingEnabled < 2) return;
+
+        ++CurrentFrameNumber;
+
+#ifdef THREAD_SAFETY
+        ScopedLock lock;
+#endif
+
+        /* One-time zero-filled lead-in matching the logo header length. */
+        if(!LogoInfo::SentAudio && LOGO_LENGTH_HEADER > 0)
+        {
+            LogoInfo::SentAudio=true;
+
+            const double HdrLength = LOGO_LENGTH_HEADER; // N64 workaround
+            const long lead_in = (long)(rate * HdrLength);
+            if(lead_in > 0)
+            {
+                const unsigned bytes = lead_in*chans*(bits/8);
+                unsigned char* zeros = (unsigned char*)calloc(bytes, 1); /* zeroed, like malloc + memset */
+                if(zeros)
+                {
+                    GetAVIptr().Audio(rate,bits,chans, zeros, lead_in);
+                    free(zeros);
+                }
+            }
+        }
+
+        /* The caller's samples follow. */
+        GetAVIptr().Audio(rate,bits,chans, (const unsigned char*) data, nsamples);
+    }
+} /* extern "C" */
diff -NaHudr dosbox-0.73/src/nesvideos-piece.hh dosbox-0.73-patched/src/nesvideos-piece.hh
--- dosbox-0.73/src/nesvideos-piece.hh	1970-01-01 02:00:00.000000000 +0200
+++ dosbox-0.73-patched/src/nesvideos-piece.hh	2010-02-25 23:50:10.326593232 +0200
@@ -0,0 +1,42 @@
+#ifndef NESVPIECEhh
+#define NESVPIECEhh
+
+#define NESVIDEOS_LOGGING 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Is video logging enabled? 0=no, 1=yes, 2=active. Default value: 0 */ 
+extern int LoggingEnabled; 
+
+/* Get and set the video recording command (shell command) */ 
+extern const char* NESVideoGetVideoCmd(); 
+extern void NESVideoSetVideoCmd(const char *cmd);
+
+/* Save 1 frame of video. bpp selects the format: 32/24/16/15 = RGB, 12 = I420, 17 = YUY2. */ 
+/* FPS is scaled by 24 bits (*0x1000000) */
+/* Does not do anything if LoggingEnabled<2. */ 
+extern void NESVideoLoggingVideo
+    (const void*data, unsigned width, unsigned height,
+     unsigned fps_scaled,
+     unsigned bpp); 
+
+/* Save nsamples samples of audio; rate, bits and chans describe the sample format. */ 
+/* Does not do anything if LoggingEnabled<2. */ 
+/* The interval of calling this function is not important, as long as all the audio
+ * data is eventually written without too big delay (5 seconds is too big)
+ * This function may be called multiple times per video frame, or once per a few video
+ * frames, or anything in between. Just that all audio data must be written exactly once,
+ * and in order. */ 
+extern void NESVideoLoggingAudio
+    (const void*data,
+     unsigned rate, unsigned bits, unsigned chans,
+     unsigned nsamples);
+/* nsamples*chans*(bits/8) = bytes in *data. */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff -NaHudr dosbox-0.73/src/quantize.hh dosbox-0.73-patched/src/quantize.hh
--- dosbox-0.73/src/quantize.hh	1970-01-01 02:00:00.000000000 +0200
+++ dosbox-0.73-patched/src/quantize.hh	2008-02-20 23:44:33.773495959 +0200
@@ -0,0 +1,185 @@
+/*
+ Ordered dithering methods provided for:
+   8x8 (Quantize8x8)
+   4x4 (Quantize4x4)
+   3x3 (Quantize3x3)
+   4x2 (Quantize4x2)
+   3x2 (Quantize3x2)
+   2x2 (Quantize2x2)
+ Each is a class template used like a function (QuantizeFunc = any name above):
+ 
+   template<int m, int in_max>
+   int QuantizeFunc(size_t quant_pos, double value)
+   
+      - Quantizes value, assumed to be in range 0..in_max, to range 0..m
+      - quant_pos tells the coordinate into the dithering matrix
+
+   template<int m, int in_max>
+   int QuantizeFunc(size_t quant_pos, unsigned char value)
+
+      - Quantizes value, assumed to be in range 0..in_max, to range 0..m
+      - quant_pos tells the coordinate into the dithering matrix
+
+ Copyright (C) 1992,2008 Joel Yliluoma (http://iki.fi/bisqwit/)
+*/
+
+#define OrderedDitherDecl(n) /* members shared by every OrderedDither_* table class; n = table size */ \
+    static const double flts[n]; /* thresholds used by the double constructor */ \
+    static const int ints[n]; /* thresholds used by the unsigned char constructor */ \
+    enum { mul = n+1, \
+           maxin = in_max, \
+           even = !(maxin % mul), /* nonzero when in_max is a multiple of mul */ \
+           intmul = even ? 1 : mul };
+
+/* macros for initializing dither tables */
+#define d(n) (n)/double(mul) - 0.5 /* table entry n (1..mul-1) as an offset in (-0.5, 0.5) */
+#define i(n) even ? (n*in_max/mul - (int)in_max/2) \
+                  : (n*in_max - (int)mul*in_max/2) /* integer counterpart of d(n), scaled by in_max*intmul */
+
+/* Plain rescale of v from 0..in_max to 0..m (v * m / in_max), no dithering. */
+template<int m, int in_max = 255>
+struct QuantizeNoDither
+{
+    int res;
+    template<typename IntType> QuantizeNoDither(IntType v) : res(static_cast<int>(v * m / in_max)) { }
+    operator int() const { return res; }
+};
+
+/* Quantizer core.  Base supplies the dither tables flts[]/ints[] and the
+ * constants maxin and intmul; quant_pos indexes those tables and the result
+ * is left in res. */
+template<int m, typename Base>
+struct QuantizeFuncBase: private Base
+{
+    int res;
+
+    /* Floating-point input: scale by m/maxin, add the dither offset, truncate
+     * and clamp to m.  Inputs that are not > 0 yield 0. */
+    QuantizeFuncBase(size_t quant_pos, double v) : res(0)
+    {
+        if(!(v > 0.0)) return;
+        const double scale = m / double(Base::maxin);
+        res = (int)(v * scale + Base::flts[quant_pos]);
+        if(res > m) res = m;
+    }
+
+    /* 8-bit input: unchanged when m == maxin, integer ordered dithering when
+     * reducing (m < maxin), plain rescaling when expanding (m > maxin). */
+    QuantizeFuncBase(size_t quant_pos, unsigned char v) : res(v)
+    {
+        if(m == Base::maxin) return;
+        if(m > Base::maxin)
+        {
+            res = QuantizeNoDither<m, Base::maxin> (res);
+            return;
+        }
+        const int scale = Base::intmul;
+        res = (res * (m * scale) + Base::ints[quant_pos]) / (Base::maxin * scale);
+    }
+};
+
+#define QuantizeFuncDecl(name, base) /* defines struct `name`, built on dither table `base` and convertible to int */ \
+  template<int m, int in_max=255> \
+  struct name: private QuantizeFuncBase<m, base<in_max> > \
+  { \
+      typedef QuantizeFuncBase<m, base<in_max> > Base; \
+      template<typename A, typename B> name(A a, B b) : Base(a, b) { } \
+      operator int() const { return Base::res; } \
+  }
+
+/******* Quantizing with 8x8 ordered dithering ********/
+template<int in_max> struct OrderedDither_8x8 { OrderedDitherDecl(8*8) };
+    template<int in_max>
+    const double OrderedDither_8x8<in_max>::flts[] /* A table for 8x8 ordered dithering */
+    = { d(1 ), d(49), d(13), d(61), d( 4), d(52), d(16), d(64),
+        d(33), d(17), d(45), d(29), d(36), d(20), d(48), d(32),
+        d(9 ), d(57), d( 5), d(53), d(12), d(60), d( 8), d(56),
+        d(41), d(25), d(37), d(21), d(44), d(28), d(40), d(24),
+        d(3 ), d(51), d(15), d(63), d( 2), d(50), d(14), d(62),
+        d(35), d(19), d(47), d(31), d(34), d(18), d(46), d(30),
+        d(11), d(59), d( 7), d(55), d(10), d(58), d( 6), d(54),
+        d(43), d(27), d(39), d(23), d(42), d(26), d(38), d(22) };
+    template<int in_max>
+    const int OrderedDither_8x8<in_max>::ints[] /* Same pattern as flts[], as integer thresholds */
+    = { i(1 ), i(49), i(13), i(61), i( 4), i(52), i(16), i(64),
+        i(33), i(17), i(45), i(29), i(36), i(20), i(48), i(32),
+        i(9 ), i(57), i( 5), i(53), i(12), i(60), i( 8), i(56),
+        i(41), i(25), i(37), i(21), i(44), i(28), i(40), i(24),
+        i(3 ), i(51), i(15), i(63), i( 2), i(50), i(14), i(62),
+        i(35), i(19), i(47), i(31), i(34), i(18), i(46), i(30),
+        i(11), i(59), i( 7), i(55), i(10), i(58), i( 6), i(54),
+        i(43), i(27), i(39), i(23), i(42), i(26), i(38), i(22) };
+QuantizeFuncDecl(Quantize8x8, OrderedDither_8x8);
+
+
+/******* Quantizing with 4x4 ordered dithering ********/
+template<int in_max> struct OrderedDither_4x4 { OrderedDitherDecl(4*4) };
+    template<int in_max>
+    const double OrderedDither_4x4<in_max>::flts[] /* A table for 4x4 ordered dithering */
+    = { d( 1), d( 9), d( 3), d(11),
+        d(13), d( 5), d(15), d( 7),
+        d( 4), d(12), d( 2), d(10),  
+        d(16), d( 8), d(14), d( 6) };
+    template<int in_max>
+    const int OrderedDither_4x4<in_max>::ints[] /* Same pattern as flts[], as integer thresholds */
+    = { i( 1), i( 9), i( 3), i(11),
+        i(13), i( 5), i(15), i( 7),
+        i( 4), i(12), i( 2), i(10),
+        i(16), i( 8), i(14), i( 6) };
+QuantizeFuncDecl(Quantize4x4, OrderedDither_4x4);
+
+/******* Quantizing with 3x3 ordered dithering ********/
+template<int in_max> struct OrderedDither_3x3 { OrderedDitherDecl(3*3) };
+    template<int in_max>
+    const double OrderedDither_3x3<in_max>::flts[] /* A table for 3x3 ordered dithering */
+    = { d(1), d(7), d(3),
+        d(6), d(4), d(9),
+        d(8), d(2), d(5) };
+    template<int in_max>
+    const int OrderedDither_3x3<in_max>::ints[] /* Same pattern as flts[], as integer thresholds */
+    = { i(1), i(7), i(3),
+        i(6), i(4), i(9),  
+        i(8), i(2), i(5) };
+QuantizeFuncDecl(Quantize3x3, OrderedDither_3x3);
+
+/******* Quantizing with 4x2 ordered dithering ********/
+template<int in_max> struct OrderedDither_4x2 { OrderedDitherDecl(4*2) };
+    template<int in_max>
+    const double OrderedDither_4x2<in_max>::flts[] /* A table for 4x2 ordered dithering */
+    = { d(1), d(5), d(2), d(6),
+        d(7), d(3), d(8), d(4) };
+    template<int in_max>
+    const int OrderedDither_4x2<in_max>::ints[] /* Same pattern as flts[], as integer thresholds */
+    = { i(1), i(5), i(2), i(6),
+        i(7), i(3), i(8), i(4) };
+QuantizeFuncDecl(Quantize4x2, OrderedDither_4x2);
+
+/******* Quantizing with 3x2 ordered dithering ********/
+template<int in_max> struct OrderedDither_3x2 { OrderedDitherDecl(3*2) };
+    template<int in_max>
+    const double OrderedDither_3x2<in_max>::flts[] /* A table for 3x2 ordered dithering */
+    = { d(1), d(5), d(3),
+        d(4), d(2), d(6) };
+    template<int in_max>
+    const int OrderedDither_3x2<in_max>::ints[] /* Same pattern as flts[], as integer thresholds */
+    = { i(1), i(5), i(3),
+        i(4), i(2), i(6) };
+QuantizeFuncDecl(Quantize3x2, OrderedDither_3x2);
+
+/******* Quantizing with 2x2 ordered dithering ********/
+template<int in_max> struct OrderedDither_2x2 { OrderedDitherDecl(2*2) };
+    template<int in_max>
+    const double OrderedDither_2x2<in_max>::flts[] /* A table for 2x2 ordered dithering */
+    = { d(1), d(4),
+        d(3), d(2) };
+    template<int in_max>
+    const int OrderedDither_2x2<in_max>::ints[] /* Same pattern as flts[], as integer thresholds */
+    = { i(1), i(4),
+        i(3), i(2) };
+QuantizeFuncDecl(Quantize2x2, OrderedDither_2x2);
+
+
+#undef OrderedDitherDecl
+#undef QuantizeFuncDecl
+#undef i
+#undef d
diff -NaHudr dosbox-0.73/src/rgbtorgb.cc dosbox-0.73-patched/src/rgbtorgb.cc
--- dosbox-0.73/src/rgbtorgb.cc	1970-01-01 02:00:00.000000000 +0200
+++ dosbox-0.73-patched/src/rgbtorgb.cc	2010-02-26 02:42:25.092840724 +0200
@@ -0,0 +1,1142 @@
+#include <stdint.h>
+#include <stdlib.h> // for size_t
+#include <vector>
+#include <cmath>
+
+/* RGB to RGB and RGB from/to I420 conversions written by Bisqwit
+ * Copyright (C) 1992,2008 Joel Yliluoma (http://iki.fi/bisqwit/)
+ */
+
+typedef uint_least64_t uint64_t;
+
+#include "quantize.hh"
+#include "rgbtorgb.hh"
+#include "simd.hh"
+
+/* For BPP conversions */
+
+static const uint64_t mask24l        __attribute__((aligned(8))) = 0x0000000000FFFFFFULL;
+static const uint64_t mask24h        __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL;
+static const uint64_t mask24hh       __attribute__((aligned(8))) = 0xffff000000000000ULL;
+static const uint64_t mask24hhh      __attribute__((aligned(8))) = 0xffffffff00000000ULL;
+static const uint64_t mask24hhhh     __attribute__((aligned(8))) = 0xffffffffffff0000ULL;
+
+static const uint64_t mask64h        __attribute__((aligned(8))) = 0xFF00FF00FF00FF00ULL;
+static const uint64_t mask64l        __attribute__((aligned(8))) = 0x00FF00FF00FF00FFULL;
+static const uint64_t mask64hw       __attribute__((aligned(8))) = 0xFFFF0000FFFF0000ULL;
+static const uint64_t mask64lw       __attribute__((aligned(8))) = 0x0000FFFF0000FFFFULL;
+static const uint64_t mask64hd       __attribute__((aligned(8))) = 0xFFFFFFFF00000000ULL;
+static const uint64_t mask64ld       __attribute__((aligned(8))) = 0x00000000FFFFFFFFULL;
+
+/* For RGB2YUV: */
+
+static const int RGB2YUV_SHIFT = 15; /* highest value where [RGB][YUV] fit in signed short */
+
+static const int RY = 8414;  //  ((int)(( 65.738/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
+static const int RV = 14392; //  ((int)((112.439/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
+static const int RU = -4856; //  ((int)((-37.945/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
+
+static const int GY = 16519; //  ((int)((129.057/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
+static const int GV = -12051;//  ((int)((-94.154/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
+static const int GU = -9534; //  ((int)((-74.494/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
+
+static const int BY = 3208;  //  ((int)(( 25.064/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
+static const int BV = -2339; //  ((int)((-18.285/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
+static const int BU = 14392; //  ((int)((112.439/256.0)*(1<<RGB2YUV_SHIFT)+0.5));
+
+static const int Y_ADD = 16;
+static const int U_ADD = 128;
+static const int V_ADD = 128;
+
+/* For YUV2RGB: */
+
+static const int YUV2RGB_SHIFT = 13; /* highest value where UB still fits in signed short */
+
+static const int Y_REV = 9539; // ((int)( (  255 / 219.0 )     * (1<<YUV2RGB_SHIFT)+0.5));
+static const int VR = 14688;   // ((int)( ( 117504 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));
+static const int VG = -6659;   // ((int)( ( -53279 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));
+static const int UG = -3208;   // ((int)( ( -25675 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));
+static const int UB = 16525;   // ((int)( ( 132201 / 65536.0 ) * (1<<YUV2RGB_SHIFT)+0.5));
+
+/****************/
+
+template<typename c64>
+static inline void Convert32To24_32bytes(c64 w0, c64 w1, c64 w2, c64 w3, unsigned char* dest)
+{
+    /* Per input word: drop bytes 3 and 7, leaving two 3-byte pixels in the low 6 bytes. */
+    c64 ab = ((w0 >> 8) & mask24h) | (w0 & mask24l); /* bbbaaa */
+    c64 cd = ((w1 >> 8) & mask24h) | (w1 & mask24l); /* dddccc */
+    c64 ef = ((w2 >> 8) & mask24h) | (w2 & mask24l); /* fffeee */
+    c64 gh = ((w3 >> 8) & mask24h) | (w3 & mask24l); /* hhhggg */
+    c64 out0 = ((cd << 48) & mask24hh)   | ab;         /* ccbbbaaa */
+    c64 out1 = ((ef << 32) & mask24hhh)  | (cd >> 16); /* feeedddc */
+    c64 out2 = ((gh << 16) & mask24hhhh) | (ef >> 32); /* hhhgggff */
+    out0.Put(dest+0);  /* 24 output bytes in total, written as three quadwords */
+    out1.Put(dest+8);
+    out2.Put(dest+16);
+}
+
+#if defined(__x86_64) || defined(USE_MMX)
+static void Convert32To24_32bytes(const unsigned char* src,
+                                  unsigned char* dest) // reads 32 bytes (eight 4-byte pixels) from src
+{
+    c64 w0; w0.Get(src+0);  // pixels 0,1
+    c64 w1; w1.Get(src+8);  // pixels 2,3
+    c64 w2; w2.Get(src+16); // pixels 4,5
+    c64 w3; w3.Get(src+24); // pixels 6,7
+    Convert32To24_32bytes(w0,w1,w2,w3, dest); // register overload above stores the 24 packed bytes
+}
+#endif
+
+void Convert32To24Frame(const void* data, unsigned char* dest, unsigned npixels)
+{
+    /* Drops the 4th byte of every 4-byte pixel; bytes 0..2 keep their order. */
+    const unsigned char* in = static_cast<const unsigned char*>(data);
+
+    #if defined(__x86_64) || defined(USE_MMX)
+    /* Bulk part: eight pixels (32 bytes in, 24 bytes out) per iteration. */
+    for(; npixels >= 8; npixels -= 8, in += 4*8, dest += 3*8)
+        Convert32To24_32bytes(in, dest);
+     #ifdef USE_MMX
+     MMX_clear();
+     #endif
+    #endif
+    /* Remaining pixels (all of them when the 64-bit path is compiled out): bytewise copy. */
+    unsigned char* out = dest;
+    for(unsigned left = npixels; left > 0; --left)
+    {
+        out[0] = in[0];
+        out[1] = in[1];
+        out[2] = in[2];
+        in  += 4;
+        out += 3;
+    }
+}
+
+static void Unbuild16(unsigned char* target, unsigned rgb16) /* 5-6-5 word -> target[0..2] = bits 11..15, 5..10, 0..4 */
+{
+    const unsigned b5 = rgb16 & 31, g6 = (rgb16 >> 5) & 63, r5 = (rgb16 >> 11) & 31;
+    /* Replicate the top bits into the low bits so that full scale gives 255
+     * (plain <<3 / <<2 stops at 248 / 252); same scaling as Convert_2byte_helper. */
+    target[0] = (r5 << 3) | (r5 >> 2);
+    target[1] = (g6 << 2) | (g6 >> 4);
+    target[2] = (b5 << 3) | (b5 >> 2);
+}
+
+static void Unbuild15(unsigned char* target, unsigned rgb16) /* 5-5-5 word -> target[0..2] = bits 10..14, 5..9, 0..4 */
+{
+    const unsigned b5 = rgb16 & 31, g5 = (rgb16 >> 5) & 31, r5 = (rgb16 >> 10) & 31;
+    /* Replicate the top bits into the low bits so that full scale gives 255
+     * (plain <<3 stops at 248); same scaling as Convert_2byte_helper. */
+    target[0] = (r5 << 3) | (r5 >> 2);
+    target[1] = (g5 << 3) | (g5 >> 2);
+    target[2] = (b5 << 3) | (b5 >> 2);
+}
+
+template<int basevalue_lo, int basevalue_hi>
+struct Bits16const
+{
+    /* Quadword of four 16-bit lanes holding lo,hi,lo,hi (lane 0 first);
+     * each template argument is truncated to 16 bits before being replicated. */
+    static const uint64_t static_value =
+        ((uint64_t)(unsigned short) basevalue_lo * 0x0000000100000001ULL)
+      | ((uint64_t)(unsigned short) basevalue_hi * 0x0001000000010000ULL);
+    static const uint64_t value; /* same number, defined out of class below */
+};
+template<int basevalue_lo, int basevalue_hi>
+const uint64_t Bits16const<basevalue_lo, basevalue_hi>::value =
+               Bits16const<basevalue_lo, basevalue_hi>::static_value;
+
+template<int basevalue_lo, int basevalue_hi>
+struct Bits32const // quadword of two 32-bit lanes: lo in bits 0..31, hi in bits 32..63
+{
+    static const uint64_t static_value = 
+       (( ((uint64_t)(unsigned int) basevalue_lo) << 0)
+      | ( ((uint64_t)(unsigned int) basevalue_hi) << 32));
+    static const uint64_t value = static_value; // in-class initializer only; the out-of-class definition below is commented out
+};/*
+template<int basevalue_lo, int basevalue_hi>
+const uint64_t Bits32const<basevalue_lo, basevalue_hi>::value =
+               Bits32const<basevalue_lo, basevalue_hi>::static_value;*/
+
+template<uint64_t basevalue_lo, uint64_t basevalue_hi>
+struct Bits8const // quadword of eight byte lanes alternating lo,hi,lo,hi,... (lo in byte 0)
+{
+    static const uint64_t static_value = // arguments are not masked to 8 bits: values above 255 spill into the neighbouring lanes
+       ((basevalue_lo << 0)
+      | (basevalue_hi << 8)
+      | (basevalue_lo << 16)
+      | (basevalue_hi << 24)
+      | (basevalue_lo << 32)
+      | (basevalue_hi << 40)
+      | (basevalue_lo << 48)
+      | (basevalue_hi << 56));
+    static const uint64_t value = static_value; // alias of static_value
+};
+
+
+template<int lowbitcount, int highbitcount, int leftshift>
+struct MaskBconst // per-byte bit mask: even bytes get `lowbitcount` one-bits, odd bytes `highbitcount`, then the quadword is shifted left
+{
+    static const uint64_t basevalue_lo = (1 <<  lowbitcount) - 1; // e.g. 5 -> 0x1F, 0 -> 0
+    static const uint64_t basevalue_hi = (1 << highbitcount) - 1;
+    static const uint64_t value = Bits8const<basevalue_lo,basevalue_hi>::value << leftshift; // leftshift is in bits (8 moves the pattern by one byte)
+};
+
+template<int bits>
+struct Convert_2byte_consts // masks used by Convert_2byte_helper for a colour field that is `bits` wide
+{
+    static const uint64_t mask_lo;//   low `bits` bits of bytes 0,2,4,6
+    static const uint64_t mask_hi;//   low `bits` bits of bytes 1,3,5,7
+    static const uint64_t mask_frac;// low (8-bits) bits of every byte
+};
+template<int bits>
+const uint64_t Convert_2byte_consts<bits>::mask_lo   = MaskBconst<bits, 0, 0>::value;
+template<int bits>
+const uint64_t Convert_2byte_consts<bits>::mask_hi   = MaskBconst<bits, 0, 8>::value;
+template<int bits>
+const uint64_t Convert_2byte_consts<bits>::mask_frac = MaskBconst<8-bits, 8-bits, 0>::value;
+
+template<int offs, int bits>
+struct Convert_2byte_helper // extracts one colour field (bit offset `offs`, width `bits`) from 8 two-byte pixels and widens it to 8 bits
+{
+    c64 lo, hi; // lo: field of the four pixels in p4a, hi: of the four pixels in p4b; one value per 16-bit lane
+    
+    Convert_2byte_helper(c64 p4a, c64 p4b)
+    {
+        const uint64_t& mask_lo   = Convert_2byte_consts<bits>::mask_lo;
+        const uint64_t& mask_hi   = Convert_2byte_consts<bits>::mask_hi;
+        const uint64_t& mask_frac = Convert_2byte_consts<bits>::mask_frac;
+        
+        /* STEP 1: SEPARATE THE PIXELS INTO RED, GREEN AND BLUE COMPONENTS */
+
+        /* 000BBBBB 000bbbbb  000BBBBB 000bbbbb  000BBBBB 000bbbbb  000BBBBB 000bbbbb */
+        c64 s5 = ((p4a >> offs) & mask_lo) | ((p4b << (8-offs)) & mask_hi); // NOTE(review): 8-offs is negative for offs 10/11 -- assumes c64's operator<< copes with that; confirm in simd.hh
+
+        /* STEP 2: SCALE THE COLOR COMPONENTS TO 256 RANGE */
+        
+        /* BBBBB000 bbbbb000  BBBBB000 bbbbb000  BBBBB000 bbbbb000  BBBBB000 bbbbb000 */
+        /* 00000BBB 00000bbb  00000BBB 00000bbb  00000BBB 00000bbb  00000BBB 00000bbb */
+        c64 v8 = (s5 << (8-bits)) | ((s5 >> (bits-(8-bits))) & mask_frac); // top bits replicated into the freed low bits
+        /* v8:
+         *
+         * BBBBBBBB bbbbbbbb  BBBBBBBB bbbbbbbb  BBBBBBBB bbbbbbbb  BBBBBBBB bbbbbbbb *
+         */
+        
+        /* STEP 3: DEINTERLACE THE PIXELS */
+        lo = (v8     ) & mask64l; // even bytes: values that came from p4a
+        hi = (v8 >> 8) & mask64l; // odd bytes, moved down: values that came from p4b
+    }
+};
+
+/*
+template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits>
+static void Convert_2byte_to_24Common(const unsigned char* src, unsigned char* dest)
+    __attribute((noinline));
+*/
+template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits, bool rgb24>
+static void Convert_2byte_to_24or32Common(const unsigned char* src, unsigned char* dest) // 8 two-byte pixels in; 24 (rgb24) or 32 bytes out
+{
+    c64 p4a; p4a.Get(src+0); // four pixels
+    c64 p4b; p4b.Get(src+8); // another four pixels
+    
+    /* in: both registers hold four 16-bit pixels each; every helper below pulls one colour field out of all eight */
+    
+    Convert_2byte_helper<roffs,rbits> r(p4a,p4b);
+    Convert_2byte_helper<boffs,bbits> b(p4a,p4b);
+    Convert_2byte_helper<goffs,gbits> g(p4a,p4b);
+
+    /* STEP 4: CONVERT PIXELS INTO RGB32 */
+    
+    /* Now we have:
+     *               b.lo =  0j0g0d0a
+     *               g.lo =  0k0h0e0b
+     *               r.lo =  0l0i0f0c
+     *               b.hi =  0J0G0D0A
+     *               g.hi =  0K0H0E0B
+     *               r.hi =  0L0I0F0C
+     * We want:
+     *                 w1 =  0fed0cba
+     *                 w2 =  0lkj0ihg
+     *                 w3 =  0FED0CBA
+     *                 w4 =  0LKJ0IHG
+     */
+   
+#if 0 && defined(__MMX__) /* FIXME why is this 0&&? */
+    // punpcklbw  0k0h0e0b, 0j0g0d0a -> 00ed00ba
+    // punpcklwd  0l0i0f0c, ________ -> 0f__0c__
+    c64 w1 = r.lo.unpacklwd(0) | g.lo.unpacklbw(b.lo); // pix 0,1
+    // punpckhbw  0k0h0e0b, 0j0g0d0a -> 00kj00hg
+    // punpckhwd  0l0i0f0c, ________ -> 0l__0i__
+    c64 w2 = r.lo.unpackhwd(0) | g.lo.unpackhbw(b.lo); // pix 2,3
+    
+    c64 w3 = r.hi.unpacklwd(0) | g.hi.unpacklbw(b.hi); // pix 4,5
+    c64 w4 = r.hi.unpackhwd(0) | g.hi.unpackhbw(b.hi); // pix 6,7
+    #ifndef USE_MMX
+     MMX_clear();
+    #endif
+#else
+    /* With 64-bit registers, this code is greatly simpler than
+     * the emulation of unpack opcodes. However, when the
+     * unpack opcodes is available, using them is shorter.
+     * Which way is faster? FIXME: Find out
+     */
+
+    //        mask64lw:  00**00**
+    //        mask64hw:  **00**00
+    // b.lo & mask64lw:  000g000a
+    // g.lo & mask64lw:  000h000b
+    // r.lo & mask64lw:  000i000c
+    // b.lo & mask64hw:  0j000d00
+    // g.lo & mask64hw:  0k000e00
+    // r.lo & mask64hw:  0l000f00
+    
+    c64 tlo1 = ((b.lo & mask64lw)     ) | ((g.lo & mask64lw) << 8) | ((r.lo & mask64lw) << 16);
+    c64 tlo2 = ((b.lo & mask64hw) >>16) | ((g.lo & mask64hw) >> 8) | ((r.lo & mask64hw)      );
+
+    c64 thi1 = ((b.hi & mask64lw)     ) | ((g.hi & mask64lw) << 8) | ((r.hi & mask64lw) << 16);
+    c64 thi2 = ((b.hi & mask64hw) >>16) | ((g.hi & mask64hw) >> 8) | ((r.hi & mask64hw)      );
+    /*
+     *                tlo1 =  0ihg0cba
+     *                tlo2 =  0lkj0fed
+     *                thi1 =  0IHG0CBA
+     *                thi2 =  0LKJ0FED
+     *            mask64ld =  0000****
+     *            mask64hd =  ****0000
+     */
+     
+    c64 w1 = (tlo1 & mask64ld) | ((tlo2 & mask64ld) << 32); // 00000cba | 0fed0000 = 0fed0cba
+    c64 w2 = (tlo2 & mask64hd) | ((tlo1 & mask64hd) >> 32); // 0lkj0000 | 00000ihg = 0lkj0ihg
+
+    c64 w3 = (thi1 & mask64ld) | ((thi2 & mask64ld) << 32); // same for pixels 4,5
+    c64 w4 = (thi2 & mask64hd) | ((thi1 & mask64hd) >> 32); // same for pixels 6,7
+#endif
+    
+    if(rgb24)
+    {
+        /* STEP 5A: CONVERT PIXELS INTO RGB24 */
+        Convert32To24_32bytes(w1,w2,w3,w4, dest);
+    }
+    else
+    {
+        /* STEP 5B: STORE RGB32 */
+        w1.Put(dest+0);
+        w2.Put(dest+8);
+        w3.Put(dest+16);
+        w4.Put(dest+24);
+    }
+     
+    /*
+     punpcklbw    ____ABCD, ____abcd = AaBbCcDd
+     punpcklwd    ____ABCD, ____abcd = ABabCDcd
+     punpckldq    ____ABCD, ____abcd = ABCDabcd
+     
+     punpckhbw    ABCD____, abcd____ = AaBbCcDd
+     punpckhwd    ABCD____, abcd____ = ABabCDcd
+     punpckhdq    ABCD____, abcd____ = ABCDabcd
+    */
+}
+
+void Convert15To24Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue)
+{
+    const unsigned char* src = (const unsigned char*)data;
+    /* 5-5-5 words -> 3 bytes/pixel. Output byte 0 is bits 0..4, or bits 10..14 when swap_red_blue. */
+    if(swap_red_blue)
+        for(; npixels >= 8; src += 8*2, dest += 8*3, npixels -= 8)
+            Convert_2byte_to_24or32Common<0,5, 5,5, 10,5, true> (src, dest);
+    else
+        for(; npixels >= 8; src += 8*2, dest += 8*3, npixels -= 8)
+            Convert_2byte_to_24or32Common<10,5, 5,5, 0,5, true> (src, dest);
+    #ifdef USE_MMX
+     MMX_clear();
+    #endif
+    for(unsigned a=0; a<npixels; ++a) /* tail: fewer than 8 pixels left */
+    {
+        unsigned v = ((const unsigned short*)src)[a];
+        /* Unbuild15 always emits bits 10..14 first: exchange the outer fields to match the loop above. */
+        if(!swap_red_blue) v = (v & 0x03E0) | ((v & 0x001F) << 10) | ((v >> 10) & 0x001F);
+        Unbuild15(&dest[a*3], v);
+    }
+}
+
+void Convert16To24Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue)
+{
+    const unsigned char* src = (const unsigned char*)data;
+    /* 5-6-5 words -> 3 bytes/pixel. Output byte 0 is bits 0..4, or bits 11..15 when swap_red_blue. */
+    if(swap_red_blue)
+        for(; npixels >= 8; src += 8*2, dest += 8*3, npixels -= 8)
+            Convert_2byte_to_24or32Common<0,5, 5,6, 11,5, true> (src, dest);
+    else
+        for(; npixels >= 8; src += 8*2, dest += 8*3, npixels -= 8)
+            Convert_2byte_to_24or32Common<11,5, 5,6, 0,5, true> (src, dest);
+    #ifdef USE_MMX
+     MMX_clear();
+    #endif
+    for(unsigned a=0; a<npixels; ++a) /* tail: fewer than 8 pixels left */
+    {
+        unsigned v = ((const unsigned short*)src)[a];
+        /* Unbuild16 always emits bits 11..15 first: exchange the outer fields to match the loop above. */
+        if(!swap_red_blue) v = (v & 0x07E0) | ((v & 0x001F) << 11) | ((v >> 11) & 0x001F);
+        Unbuild16(&dest[a*3], v);
+    }
+}
+
+void Convert15To32Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue)
+{
+    const unsigned char* src = (const unsigned char*)data;
+    /* 5-5-5 words -> 4 bytes/pixel. Output byte 0 is bits 0..4, or bits 10..14 when swap_red_blue; byte 3 is 0. */
+    if(swap_red_blue)
+        for(; npixels >= 8; src += 8*2, dest += 8*4, npixels -= 8)
+            Convert_2byte_to_24or32Common<0,5, 5,5, 10,5, false> (src, dest);
+    else
+        for(; npixels >= 8; src += 8*2, dest += 8*4, npixels -= 8)
+            Convert_2byte_to_24or32Common<10,5, 5,5, 0,5, false> (src, dest);
+    #ifdef USE_MMX
+     MMX_clear();
+    #endif
+    for(unsigned a=0; a<npixels; ++a) /* tail: fewer than 8 pixels left */
+    {
+        unsigned v = ((const unsigned short*)src)[a];
+        /* Unbuild15 always emits bits 10..14 first: exchange the outer fields to match the loop above. */
+        if(!swap_red_blue) v = (v & 0x03E0) | ((v & 0x001F) << 10) | ((v >> 10) & 0x001F);
+        Unbuild15(&dest[a*4], v); dest[a*4+3] = 0; /* 4th byte: zero, as the 8-pixel path stores it */
+    }
+}
+
+void Convert16To32Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue)
+{
+    const unsigned char* src = (const unsigned char*)data;
+    /* 5-6-5 words -> 4 bytes/pixel. Output byte 0 is bits 0..4, or bits 11..15 when swap_red_blue; byte 3 is 0. */
+    if(swap_red_blue)
+        for(; npixels >= 8; src += 8*2, dest += 8*4, npixels -= 8)
+            Convert_2byte_to_24or32Common<0,5, 5,6, 11,5, false> (src, dest);
+    else
+        for(; npixels >= 8; src += 8*2, dest += 8*4, npixels -= 8)
+            Convert_2byte_to_24or32Common<11,5, 5,6, 0,5, false> (src, dest);
+    #ifdef USE_MMX
+     MMX_clear();
+    #endif
+    for(unsigned a=0; a<npixels; ++a) /* tail: fewer than 8 pixels left */
+    {
+        unsigned v = ((const unsigned short*)src)[a];
+        /* Unbuild16 always emits bits 11..15 first: exchange the outer fields to match the loop above. */
+        if(!swap_red_blue) v = (v & 0x07E0) | ((v & 0x001F) << 11) | ((v >> 11) & 0x001F);
+        Unbuild16(&dest[a*4], v); dest[a*4+3] = 0; /* 4th byte: zero, as the 8-pixel path stores it */
+    }
+}
+
+static inline unsigned Build16(unsigned x,unsigned y, const unsigned char* rgbdata) // pack a 3-byte pixel: rgbdata[2] -> bits 0..4, [1] -> bits 5..10, [0] -> bits 11..15
+{
+#if 0 /* 4x4 ordered-dither variant, compiled out */
+    unsigned o16 = (x + 4*y) % 16;
+    return (Quantize4x4<31>(o16, rgbdata[2]) << 0)
+         | (Quantize4x4<63>(o16, rgbdata[1]) << 5)
+         | (Quantize4x4<31>(o16, rgbdata[0]) << 11);
+#else /* active variant: no dithering, so x and y are unused */
+    return (QuantizeNoDither<31>(rgbdata[2]) << 0)
+         | (QuantizeNoDither<63>(rgbdata[1]) << 5)
+         | (QuantizeNoDither<31>(rgbdata[0]) << 11);
+#endif
+}
+static inline unsigned BuildR16(unsigned x,unsigned y, const unsigned char* rgbdata) // like Build16 with the outer fields exchanged: rgbdata[0] -> bits 0..4, [2] -> bits 11..15
+{
+#if 0 /* 4x4 ordered-dither variant, compiled out */
+    unsigned o16 = (x + 4*y) % 16;
+    return (Quantize4x4<31>(o16, rgbdata[0]) << 0)
+         | (Quantize4x4<63>(o16, rgbdata[1]) << 5)
+         | (Quantize4x4<31>(o16, rgbdata[2]) << 11);
+#else /* active variant: no dithering, so x and y are unused */
+    return (QuantizeNoDither<31>(rgbdata[0]) << 0)
+         | (QuantizeNoDither<63>(rgbdata[1]) << 5)
+         | (QuantizeNoDither<31>(rgbdata[2]) << 11);
+#endif
+}
+static inline unsigned Build15(unsigned x,unsigned y, const unsigned char* rgbdata) // pack a 3-byte pixel into 5-5-5; unlike Build16 this one does dither
+{
+    unsigned o16 = (x + 4*y) % 16; // index 0..15 derived from the pixel position, handed to Quantize4x4
+    return (Quantize4x4<31>(o16, rgbdata[2]) << 0)   // rgbdata[2] -> bits 0..4
+         | (Quantize4x4<31>(o16, rgbdata[1]) << 5)   // rgbdata[1] -> bits 5..9
+         | (Quantize4x4<31>(o16, rgbdata[0]) << 10); // rgbdata[0] -> bits 10..14
+}
+
+void Convert24To16Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{
+    /* Pack each 3-byte pixel into one 16-bit word using Build16(col,row,pixel). */
+    const unsigned char* in = static_cast<const unsigned char*>(data);
+    unsigned short* out = reinterpret_cast<unsigned short*>(dest);
+    for(unsigned done = 0, col = 0, row = 0; done < npixels; ++done, in += 3)
+    {
+        *out++ = Build16(col, row, in);
+        if(++col >= width) { col = 0; ++row; } /* wrap to the next scanline */
+    }
+}
+
+void Convert24ToR16Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{
+    /* Pack each 3-byte pixel into one 16-bit word using BuildR16(col,row,pixel). */
+    const unsigned char* in = static_cast<const unsigned char*>(data);
+    unsigned short* out = reinterpret_cast<unsigned short*>(dest);
+    for(unsigned done = 0, col = 0, row = 0; done < npixels; ++done, in += 3)
+    {
+        *out++ = BuildR16(col, row, in);
+        if(++col >= width) { col = 0; ++row; } /* wrap to the next scanline */
+    }
+}
+
+void Convert24To15Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{
+    /* Pack each 3-byte pixel into one 16-bit word using Build15(col,row,pixel). */
+    const unsigned char* in = static_cast<const unsigned char*>(data);
+    unsigned short* out = reinterpret_cast<unsigned short*>(dest);
+    for(unsigned done = 0, col = 0, row = 0; done < npixels; ++done, in += 3)
+    {
+        *out++ = Build15(col, row, in);
+        if(++col >= width) { col = 0; ++row; } /* wrap to the next scanline */
+    }
+}
+
+#ifdef __MMX__
+static inline void Convert_I420_MMX_Common /* one 2x2 pixel cell -> four luma bytes plus one byte to each chroma pointer */
+    (c64_MMX p0_1, c64_MMX p2_3,
+     unsigned char* dest_y0,
+     unsigned char* dest_y1,
+     unsigned char* dest_u,
+     unsigned char* dest_v)
+{
+    c64_MMX p0 = c64_MMX(0).unpacklbw(p0_1); // expand to 64-bit (4*16)
+    c64_MMX p1 = c64_MMX(0).unpackhbw(p0_1);
+    c64_MMX p2 = c64_MMX(0).unpacklbw(p2_3);
+    c64_MMX p3 = c64_MMX(0).unpackhbw(p2_3);
+    
+    c64_MMX ry_gy_by; ry_gy_by.Init16(RY,GY,BY, 0); // luma weights
+    c64_MMX rgb_u;    rgb_u.Init16(RU,GU,BU, 0);    // chroma weights ("U" set)
+    c64_MMX rgb_v;    rgb_v.Init16(RV,GV,BV, 0);    // chroma weights ("V" set)
+
+    c64_MMX ctotal = p0.add16( // per-channel sum over the four pixels; divided by 4 via the +2 in the shifts below
+                     p2.add16(
+                     p1.add16(
+                     p3)));
+  
+    p0 = _mm_madd_pi16(ry_gy_by.value, p0.value); // weight * channel products; the two 32-bit halves are added together below
+    p1 = _mm_madd_pi16(ry_gy_by.value, p1.value);
+    p2 = _mm_madd_pi16(ry_gy_by.value, p2.value);
+    p3 = _mm_madd_pi16(ry_gy_by.value, p3.value);
+    
+    c64_MMX yy;
+    yy.Init16( ((p0.Extract32<0>() + p0.Extract32<1>()) >> (RGB2YUV_SHIFT)),
+               ((p1.Extract32<0>() + p1.Extract32<1>()) >> (RGB2YUV_SHIFT)),
+               ((p2.Extract32<0>() + p2.Extract32<1>()) >> (RGB2YUV_SHIFT)),
+               ((p3.Extract32<0>() + p3.Extract32<1>()) >> (RGB2YUV_SHIFT)) );
+    yy = yy.add16( Bits16const<Y_ADD,Y_ADD>::value ); // add the luma offset to all four values
+    
+    // Because we're writing to adjacent pixels, we optimize this by
+    // writing two 8-bit values at once in both cases.
+    *(short*)dest_y0 = yy.Extract88_from_1616lo(); // luma of p0,p1 (first source register)
+    *(short*)dest_y1 = yy.Extract88_from_1616hi(); // luma of p2,p3 (second source register)
+    
+    c64_MMX u_total32 = _mm_madd_pi16(rgb_u.value, ctotal.value);
+    c64_MMX v_total32 = _mm_madd_pi16(rgb_v.value, ctotal.value);
+    
+    *dest_u = U_ADD + ((u_total32.Extract32<0>() + u_total32.Extract32<1>()) >> (RGB2YUV_SHIFT+2));
+    *dest_v = V_ADD + ((v_total32.Extract32<0>() + v_total32.Extract32<1>()) >> (RGB2YUV_SHIFT+2));
+}
+
+static inline void Convert_YUY2_MMX_Common /* four pixels of one scanline -> 8 packed output bytes */
+    (c64_MMX p0_1, c64_MMX p2_3,
+     unsigned char* dest_yvyu)
+{
+    c64_MMX p0 = c64_MMX(0).unpacklbw(p0_1); // expand to 64-bit (4*16)
+    c64_MMX p1 = c64_MMX(0).unpackhbw(p0_1);
+    c64_MMX p2 = c64_MMX(0).unpacklbw(p2_3); // expand to 64-bit (4*16)
+    c64_MMX p3 = c64_MMX(0).unpackhbw(p2_3);
+    
+    c64_MMX ry_gy_by; ry_gy_by.Init16(RY,GY,BY, 0); // luma weights
+    c64_MMX rgb_u;    rgb_u.Init16(RU,GU,BU, 0);    // chroma weights ("U" set)
+    c64_MMX rgb_v;    rgb_v.Init16(RV,GV,BV, 0);    // chroma weights ("V" set)
+
+    c64_MMX ctotal0 = p0.add16(p1); // per-channel sum of the first pixel pair
+    c64_MMX ctotal2 = p2.add16(p3); // per-channel sum of the second pixel pair
+  
+    p0 = _mm_madd_pi16(ry_gy_by.value, p0.value);
+    p1 = _mm_madd_pi16(ry_gy_by.value, p1.value);
+    p2 = _mm_madd_pi16(ry_gy_by.value, p2.value);
+    p3 = _mm_madd_pi16(ry_gy_by.value, p3.value);
+    
+    c64_MMX yy;
+    yy.Init16( ((p0.Extract32<0>() + p0.Extract32<1>()) >> (RGB2YUV_SHIFT)),
+               ((p1.Extract32<0>() + p1.Extract32<1>()) >> (RGB2YUV_SHIFT)),
+               ((p2.Extract32<0>() + p2.Extract32<1>()) >> (RGB2YUV_SHIFT)),
+               ((p3.Extract32<0>() + p3.Extract32<1>()) >> (RGB2YUV_SHIFT)) );
+
+    yy = yy.add16( Bits16const<Y_ADD,Y_ADD>::value ); // add the luma offset to all four values
+    
+    c64_MMX u_total32_0 = _mm_madd_pi16(rgb_u.value, ctotal0.value);
+    c64_MMX v_total32_0 = _mm_madd_pi16(rgb_v.value, ctotal0.value);
+    c64_MMX u_total32_2 = _mm_madd_pi16(rgb_u.value, ctotal2.value);
+    c64_MMX v_total32_2 = _mm_madd_pi16(rgb_v.value, ctotal2.value);
+    
+    c64_MMX quadword = yy; // four y values: at 0, 2, 4 and 6
+    
+    c64_MMX uv; uv.Init16( // +1 in the shifts: each chroma sum covers two pixels
+        ((v_total32_0.Extract32<0>() + v_total32_0.Extract32<1>()) >> (RGB2YUV_SHIFT+1)),
+        ((u_total32_0.Extract32<0>() + u_total32_0.Extract32<1>()) >> (RGB2YUV_SHIFT+1)),
+        ((v_total32_2.Extract32<0>() + v_total32_2.Extract32<1>()) >> (RGB2YUV_SHIFT+1)),
+        ((u_total32_2.Extract32<0>() + u_total32_2.Extract32<1>()) >> (RGB2YUV_SHIFT+1)) );
+    c64_MMX uv_adds; uv_adds.Init16(V_ADD, U_ADD, V_ADD, U_ADD);
+    uv = uv.add16(uv_adds);
+    
+    quadword |= uv << 8;     // v,u,v,u go into the odd bytes: 1, 3, 5 and 7
+    quadword.Put(dest_yvyu); // store all 8 bytes: y v y u y v y u
+}
+#endif
+
+/*template<int PixStride>
+void Convert_4byte_To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+    __attribute__((noinline));*/
+
+template<int PixStride>
+void Convert_4byte_To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width) // PixStride = bytes per source pixel
+{
+    const unsigned char* src = (const unsigned char*) data;
+    unsigned height = npixels / width;
+    /* dest layout: npixels luma bytes, then two chroma planes of npixels/4 bytes each */
+    unsigned pos = 0;                    // byte offset into src
+    unsigned ypos = 0;                   // byte offset of the current luma cell in dest
+    unsigned vpos = npixels;             // first chroma plane (receives the RV/GV/BV results)
+    unsigned upos = vpos + npixels / 4;  // second chroma plane (receives the RU/GU/BU results)
+    unsigned stride = width*PixStride;   // bytes per source scanline
+
+    /*fprintf(stderr, "npixels=%u, width=%u, height=%u, ypos=%u,upos=%u,vpos=%u",
+        npixels,width,height, ypos,upos,vpos);*/
+
+    /* This function is based on code from x264 svn version 711 */
+    /* TODO: Apply MMX optimization for 24-bit pixels */
+    
+    for(unsigned y=0; y<height; y += 2) // two scanlines per pass; assumes even width and height
+    {
+        for(unsigned x=0; x<width; x += 2)
+        {
+        #ifdef __MMX__
+          if(PixStride == 4)
+          {
+            c64_MMX p0_1; p0_1.Get(&src[pos]);        // two 32-bit pixels (4*8)
+            c64_MMX p2_3; p2_3.Get(&src[pos+stride]); // two 32-bit pixels
+
+            pos += PixStride*2;
+            
+            Convert_I420_MMX_Common(p0_1, p2_3,
+                dest+ypos,
+                dest+ypos+width,
+                dest+upos++,
+                dest+vpos++);
+          }
+          else
+        #endif
+          {
+            int c[3], rgb[3][4]; // c[]: per-channel sum over the 2x2 cell; rgb[channel][pixel]
+            
+            /* luma */
+            for(int n=0; n<3; ++n) c[n]  = rgb[n][0] = src[pos + n];
+            for(int n=0; n<3; ++n) c[n] += rgb[n][1] = src[pos + n + stride];
+            pos += PixStride;
+            
+            for(int n=0; n<3; ++n) c[n] += rgb[n][2] = src[pos + n];
+            for(int n=0; n<3; ++n) c[n] += rgb[n][3] = src[pos + n + stride];
+            pos += PixStride;
+
+            unsigned destpos[4] = { ypos, ypos+width, ypos+1, ypos+width+1 }; // same order as rgb[][0..3]: left column top/bottom, then right column
+            for(int n=0; n<4; ++n)
+            {
+                dest[destpos[n]]
+                    = Y_ADD + ((RY * rgb[0][n]
+                              + GY * rgb[1][n]
+                              + BY * rgb[2][n]
+                               ) >> RGB2YUV_SHIFT);  // y
+            }
+            
+            dest[upos++] = (U_ADD + ((RU * c[0] + GU * c[1] + BU * c[2]) >> (RGB2YUV_SHIFT+2)) ); // +2: c[] is a sum of 4 pixels
+            dest[vpos++] = (V_ADD + ((RV * c[0] + GV * c[1] + BV * c[2]) >> (RGB2YUV_SHIFT+2)) ); 
+          }
+            
+            ypos += 2;
+        }
+        pos += stride;  // skip the second scanline, already consumed above
+        ypos += width;
+    }
+    
+    /*fprintf(stderr, ",yr=%u,ur=%u,vr=%u\n",
+        ypos,upos,vpos);*/
+    
+    #ifdef __MMX__
+     MMX_clear();
+    #endif
+}
+
+template<int PixStride>
+void Convert_4byte_To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width) // PixStride = bytes per source pixel; writes 2 bytes per pixel
+{
+    const unsigned char* src = (const unsigned char*) data;
+    unsigned height = npixels / width;
+    unsigned pos = 0;   // byte offset into src
+    unsigned ypos = 0;  // byte offset into dest
+    unsigned stride = width*PixStride;
+
+    /* This function is based on code from x264 svn version 711 */
+    /* TODO: Apply MMX optimization for 24-bit pixels */
+    
+    for(unsigned y=0; y<height; ++y)
+    {
+        for(unsigned x=0; x<width; x += 2)
+        {
+        #ifdef __MMX__
+          if(PixStride == 4)
+          {
+            c64_MMX p0_1; p0_1.Get(&src[pos]);        // two 32-bit pixels (4*8)
+            pos += PixStride*2;
+            
+            c64_MMX p2_3; p2_3.Get(&src[pos]);        // two 32-bit pixels (4*8)
+            pos += PixStride*2;
+            x += 2; // this branch consumes four pixels per iteration, not two
+            
+            Convert_YUY2_MMX_Common(p0_1, p2_3,
+                dest+ypos);
+          
+            ypos += 4; // plus the common += 4 below: 8 output bytes for four pixels
+          }
+          else
+        #endif
+          {
+            int c[3], rgb[3][2]; // c[]: per-channel sum of the pixel pair; rgb[channel][pixel]
+            
+            /* luma */
+            for(int n=0; n<3; ++n) c[n]  = rgb[n][0] = src[pos + n];
+            pos += PixStride;
+            
+            for(int n=0; n<3; ++n) c[n] += rgb[n][1] = src[pos + n];
+            pos += PixStride;
+
+            for(int n=0; n<2; ++n)
+            {
+                dest[ypos + n*2]
+                    = Y_ADD + ((RY * rgb[0][n]
+                              + GY * rgb[1][n]
+                              + BY * rgb[2][n]
+                               ) >> RGB2YUV_SHIFT);  // y
+            }
+            
+            dest[ypos+3] = (U_ADD + ((RU * c[0] + GU * c[1] + BU * c[2]) >> (RGB2YUV_SHIFT+1)) ); // +1: c[] is a sum of 2 pixels
+            dest[ypos+1] = (V_ADD + ((RV * c[0] + GV * c[1] + BV * c[2]) >> (RGB2YUV_SHIFT+1)) ); 
+          }
+            ypos += 4;
+        }
+    }
+    #ifdef __MMX__
+    MMX_clear();
+    #endif
+}
+
+/*template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits>
+void Convert_2byte_To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+    __attribute__((noinline));*/
+    
+template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits>
+void Convert_2byte_To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width) // 2-byte pixels in; planar output: luma, then two chroma planes
+{
+    const unsigned PixStride = 2;
+    const unsigned char* src = (const unsigned char*) data;
+    unsigned height = npixels / width;
+    unsigned pos = 0;                    // byte offset into src
+    unsigned ypos = 0;                   // byte offset of the current luma cell in dest
+    unsigned vpos = npixels;             // first chroma plane (receives the RV/GV/BV results)
+    unsigned upos = vpos + npixels / 4;  // second chroma plane (receives the RU/GU/BU results)
+    unsigned stride = width*PixStride;   // bytes per source scanline
+
+    /* This function is based on code from x264 svn version 711 */
+    
+    for(unsigned y=0; y<height; y += 2) // two scanlines per pass
+    {
+        for(unsigned x=0; x<width; x += 8) // eight pixels per step: width is assumed to be a multiple of 8
+        {
+            unsigned char Rgb2byteBuf[2][8][4]; // [scanline][pixel][byte] scratch buffer of 4-byte pixels
+            
+            /* Convert 8 pixels from two scanlines (16 in total)
+             * from RGB15 / RGB16 to RGB32
+             * (Not RGB24, because RGB32 conversion is faster)
+             */
+            Convert_2byte_to_24or32Common
+                <roffs,rbits, goffs,gbits, boffs,bbits, false>
+                (src+pos,        Rgb2byteBuf[0][0]);
+
+            Convert_2byte_to_24or32Common
+                <roffs,rbits, goffs,gbits, boffs,bbits, false>
+                (src+pos+stride, Rgb2byteBuf[1][0]);
+
+            pos += 16;
+            
+            for(int x8 = 0; x8 < 8; x8 += 2)
+            {
+              #ifdef _q_MMX__ /* NOTE(review): not the __MMX__ macro tested elsewhere, so the #else branch is what gets built -- looks like a deliberate disable, confirm */
+                c64_MMX p0_1; p0_1.Get(&Rgb2byteBuf[0][x8][0]); // two 32-bit pixels (4*8)
+                c64_MMX p2_3; p2_3.Get(&Rgb2byteBuf[1][x8][0]); // two 32-bit pixels
+
+                Convert_I420_MMX_Common(p0_1, p2_3,
+                    dest+ypos,
+                    dest+ypos+width,
+                    dest+upos++,
+                    dest+vpos++);
+              #else
+                int c[3];
+                /* TODO: Some faster means than using pointers */
+                unsigned char* rgb[4] =
+                {
+                    Rgb2byteBuf[0][x8+0],
+                    Rgb2byteBuf[0][x8+1],
+                    Rgb2byteBuf[1][x8+0],
+                    Rgb2byteBuf[1][x8+1]
+                };
+                
+                for(int m=0; m<3; ++m) c[m] = 0;
+                for(int n=0; n<4; ++n)
+                    for(int m=0; m<3; ++m)
+                        c[m] += rgb[n][m];
+                
+                unsigned destpos[4] = { ypos, ypos+1, ypos+width, ypos+width+1 }; // same order as rgb[]: top pair, then bottom pair
+                for(int n=0; n<4; ++n)
+                {
+                    dest[destpos[n]]
+                        = Y_ADD + ((RY * rgb[n][0]
+                                  + GY * rgb[n][1]
+                                  + BY * rgb[n][2]
+                                   ) >> RGB2YUV_SHIFT);  // y
+                }
+                
+                /*c[0] /= 4; c[1] /= 4; c[2] /= 4;*/
+                // Note: +2 is because c[] contains 4 values
+                dest[upos++] = U_ADD + ((RU * c[0] + GU * c[1] + BU * c[2]) >> (RGB2YUV_SHIFT+2));
+                dest[vpos++] = V_ADD + ((RV * c[0] + GV * c[1] + BV * c[2]) >> (RGB2YUV_SHIFT+2)); 
+              #endif
+                ypos += 2;
+            }
+        }
+        pos += stride;  // skip the second scanline, already consumed above
+        ypos += width;
+    }
+
+    #ifdef __MMX__
+    MMX_clear();
+    #endif
+}
+
+template<int roffs,int rbits, int goffs,int gbits, int boffs,int bbits>
+void Convert_2byte_To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width) // 2-byte pixels in; packed output, 2 bytes per pixel
+{
+    const unsigned PixStride = 2;
+    const unsigned char* src = (const unsigned char*) data;
+    unsigned height = npixels / width;
+    unsigned pos = 0;   // byte offset into src
+    unsigned ypos = 0;  // byte offset into dest
+    unsigned stride = width*PixStride; // NOTE(review): computed but not used in this function
+
+    for(unsigned y=0; y<height; ++y)
+    {
+        for(unsigned x=0; x<width; x += 8) // eight pixels per step: width is assumed to be a multiple of 8
+        {
+            unsigned char Rgb2byteBuf[8][4]; // [pixel][byte] scratch buffer of 4-byte pixels
+            
+            /* Convert 8 pixels from a scanline
+             * from RGB15 / RGB16 to RGB32
+             * (Not RGB24, because RGB32 conversion is faster)
+             */
+            Convert_2byte_to_24or32Common
+                <roffs,rbits, goffs,gbits, boffs,bbits, false>
+                (src+pos, Rgb2byteBuf[0]);
+
+            pos += 16;
+            
+            for(int x8 = 0; x8 < 8; ) // x8 is advanced inside whichever branch is compiled
+            {
+              #ifdef __MMX__
+                c64_MMX p0_1; p0_1.Get(&Rgb2byteBuf[x8  ][0]); // two 32-bit pixels (4*8)
+                c64_MMX p2_3; p2_3.Get(&Rgb2byteBuf[x8+2][0]); // two 32-bit pixels (4*8)
+                Convert_YUY2_MMX_Common(p0_1, p2_3, dest+ypos);
+                x8   += 4;
+                ypos += 8;
+              #else
+                int c[3];
+                /* TODO: Some faster means than using pointers */
+                unsigned char* rgb[2] =
+                {
+                    Rgb2byteBuf[x8+0],
+                    Rgb2byteBuf[x8+1],
+                };
+                
+                for(int m=0; m<3; ++m) c[m] = 0;
+                for(int n=0; n<2; ++n)
+                    for(int m=0; m<3; ++m)
+                        c[m] += rgb[n][m];
+                
+                for(int n=0; n<2; ++n)
+                {
+                    dest[ypos + n*2]
+                        = Y_ADD + ((RY * rgb[n][0]
+                                  + GY * rgb[n][1]
+                                  + BY * rgb[n][2]
+                                   ) >> RGB2YUV_SHIFT);  // y
+                }
+                
+                /*c[0] /= 2; c[1] /= 2; c[2] /= 2;*/
+                // Note: +1 is because c[] contains the sum of 2 values
+                dest[ypos+3] = U_ADD + ((RU * c[0] + GU * c[1] + BU * c[2]) >> (RGB2YUV_SHIFT+1));
+                dest[ypos+1] = V_ADD + ((RV * c[0] + GV * c[1] + BV * c[2]) >> (RGB2YUV_SHIFT+1)); 
+                x8   += 2;
+                ypos += 4;
+              #endif
+            }
+        }
+    }
+
+    #ifdef __MMX__
+    MMX_clear();
+    #endif
+}
+
+
+/***/
+
+void Convert_I420To24Frame(const void* data, unsigned char* dest,
+                           unsigned npixels, unsigned width, bool swap_red_blue)
+{   /* Converts a planar 4:2:0 frame in data to 3-bytes-per-pixel output in dest; swap_red_blue is accepted but not read here. */
+    const unsigned char* src = (const unsigned char*) data;
+    unsigned height = npixels / width;
+    unsigned pos = 0;                   // running byte offset into dest
+    unsigned ypos = 0;                  // running offset into the luma plane (starts at src[0])
+    unsigned vpos = npixels;            // plane read as V starts right after the luma plane
+    unsigned upos = vpos + npixels / 4; // plane read as U follows, one sample per 2x2 pixels
+    /* Rows are handled in pairs; width and height are assumed even (multiple of 8 wide on the MMX path). */
+    /*fprintf(stderr, "npixels=%u, width=%u, height=%u, ypos=%u,upos=%u,vpos=%u\n",
+        npixels,width,height, ypos,upos,vpos);*/
+    
+    #ifdef __MMX__
+    c64_MMX rgb[4], yy[4];
+    static const c64_MMX vmul/*; vmul.Init16*/(VR, VG, 0, 0);  // R,G,B,0 * vmul = V
+    static const c64_MMX umul/*; umul.Init16*/(0, UG, UB, 0);  // R,G,B,0 * umul = U
+    #endif
+    
+    /*
+        Y input: 16..235
+        U input: 16..240
+        V input: 16..240
+        
+    */
+    
+    /* Deliberately serial: pos/ypos/upos/vpos (and the MMX temporaries) carry state between rows, so "omp parallel for" would race. */
+    for(unsigned y=0; y<height; y += 2)
+    {
+        for(unsigned x=0; x<width; )
+        {
+        #ifdef __MMX__
+            rgb[0]=rgb[1]=rgb[2]=rgb[3]=yy[0]=yy[1]=yy[2]=yy[3]=c64_MMX(mask64hd)|mask64ld;
+            /* Somehow, this line above fixes an error
+             * where U&V seem to be off by 4 pixels.
+             * Probably a GCC bug? */
+            
+            /* Load 4 U and V values and subtract U_ADD and V_ADD from them. */
+            uint64_t tmp_u = *(uint32_t*)&src[upos];
+            uint64_t tmp_v = *(uint32_t*)&src[vpos];
+            c64_MMX uuq = c64_MMX(0)
+                     .unpacklbw(tmp_u) // 8-bit to 16-bit
+                     .sub16(Bits16const<U_ADD,U_ADD>::value)
+                     .shl16(16 - YUV2RGB_SHIFT); // shift them so that *13bitconst results in upper 16 bits having the actual value
+            c64_MMX vvq = c64_MMX(0)
+                     .unpacklbw(tmp_v)
+                     .sub16(Bits16const<V_ADD,V_ADD>::value)
+                     .shl16(16 - YUV2RGB_SHIFT); // shift them so that *13bitconst results in upper 16 bits having the actual value
+            
+            const short* uu = (const short*)&uuq;
+            const short* vv = (const short*)&vvq;
+            
+            /* c64_MMX rgb[4]; // four sets of 4*int16, each representing 1 rgb value */
+            for(int n=0; n<4; ++n)
+            {
+                /* vv is shifted by 3 bits, vmul is shifted by 13 bits
+                 * 16 bits in total, so mul16hi gets the 16-bit downscaled part */
+                c64_MMX v; v.Init16(vv[n]);
+                c64_MMX u; u.Init16(uu[n]);
+                rgb[n] = v.mul16hi(vmul).add16(
+                         u.mul16hi(umul)      );
+            }
+            
+            /* rgb[0] : U,V increment of RGB32 for x0,y0 - x1,y1
+             * rgb[1] : U,V increment of RGB32 for x2,y0 - x3,y1
+             * rgb[2] : U,V increment of RGB32 for x4,y0 - x5,y1
+             * rgb[3] : U,V increment of RGB32 for x6,y0 - x7,y1
+             */
+            
+            unsigned yyoffs[4] = { ypos, ypos+1, ypos+width, ypos+width+1 };
+            /* c64_MMX yy[4]; // four sets of 4*int16, each representing four Y values */
+            for(int n=0; n<4; ++n)
+            {
+                c64_MMX luma; luma.Init16(
+                    src[yyoffs[0]+n*2],  /* n(0..3): x0y0,x2y0,x4y0,x6y0 */
+                    src[yyoffs[1]+n*2],  /* n(0..3): x1y0,x3y0,x5y0,x7y0 */
+                    src[yyoffs[2]+n*2],  /* n(0..3): x0y1,x2y1,x4y1,x6y1 */
+                    src[yyoffs[3]+n*2]   /* n(0..3): x1y1,x3y1,x5y1,x7y1 */
+                );
+                luma = luma.sub16(Bits16const<Y_ADD,Y_ADD>::value);
+                luma = luma.shl16(16 - YUV2RGB_SHIFT);
+                yy[n] = luma.mul16hi(Bits16const<Y_REV,Y_REV>::value);
+            }
+            const short* const yyval = (const short*) &yy[0].value;
+            /*
+                values in order:
+                   x0y0 x1y0 x0y1 x1y1
+                   x2y0 x3y0 x2y1 x3y1
+                   x4y0 x5y0 x4y1 x5y1
+                   x6y0 x7y0 x6y1 x7y1
+            */
+            int tmppos = pos;
+            for(int ny = 0; ny < 4; ny += 2)
+            {
+                /* Note: We must use 16-bit pixels here instead of 8-bit,
+                 * because the rgb+Y addition can overflow. conv_s16_u8()
+                 * does the necessary clamping, which would not be done
+                 * if the values were 8-bit.
+                 */
+                // 8 pixels for one scanline, repeated twice
+                /* Note: C++ has no named constructors, so we
+                 * use statement blocks here as substitutes.
+                 */
+                c64_MMX r0
+                    = rgb[0].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+0]); tmp; }) )
+                           .conv_s16_u8(
+                      rgb[0].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+1]); tmp; }) ));
+                c64_MMX r1
+                    = rgb[1].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+4]); tmp; }) )
+                           .conv_s16_u8(
+                      rgb[1].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+5]); tmp; }) ));
+                c64_MMX r2
+                    = rgb[2].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+8]); tmp; }) )
+                           .conv_s16_u8(
+                      rgb[2].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+9]); tmp; }) ));
+                c64_MMX r3
+                    = rgb[3].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+12]); tmp; }) )
+                           .conv_s16_u8(
+                      rgb[3].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+13]); tmp; }) ));
+
+                Convert32To24_32bytes(r0,r1,r2,r3, &dest[tmppos]);
+                tmppos += width*3; // next line
+            }
+            upos += 4;
+            vpos += 4;
+            ypos += 8;   // eight bytes for this line (and eight from next too)
+            pos  += 8*3; // eight triplets generated on this line
+            x    += 8;   // eight yy values used on this line
+        #else /* non-MMX */
+            int u = src[upos] - U_ADD;
+            int v = src[vpos] - V_ADD;
+
+            int rgb[3] =  // chroma contribution shared by the whole 2x2 pixel group
+                {
+                   (VR * v         ) >> (YUV2RGB_SHIFT),
+                   (VG * v + UG * u) >> (YUV2RGB_SHIFT),
+                   (       + UB * u) >> (YUV2RGB_SHIFT)
+                };
+            
+            unsigned incr[4] = {0,1,width,width+1};  // pixel offsets of the 2x2 group
+
+            for(unsigned r=0; r<4; ++r)
+                for(unsigned doffs=pos + incr[r]*3, yoffs=ypos + incr[r],
+                        yy = (Y_REV * (src[yoffs] - Y_ADD)) >> YUV2RGB_SHIFT,
+                        n=0; n<3; ++n)
+                    dest[doffs+n] = c64::clamp_u8(rgb[n] + (int)yy);
+
+            upos += 1;
+            vpos += 1;
+            ypos += 2; // two bytes for this line (two from next line)
+            pos  += 2*3; // two triplets generated on this line
+            x    += 2; // two yy values used on this line
+        #endif
+        }
+        ypos += width;    // skip the second row of the pair, already consumed above
+        pos += 3*width;
+    }
+    #ifdef __MMX__
+    MMX_clear();
+    #endif
+}
+
+void Convert_YUY2To24Frame(const void* data, unsigned char* dest,
+                           unsigned npixels, unsigned width, bool swap_red_blue)
+{   /* Converts a packed 4-bytes-per-2-pixels luma/chroma frame in data to 3-bytes-per-pixel output in dest. */
+    const unsigned char* src = (const unsigned char*) data;
+    unsigned height = npixels / width;
+    /* swap_red_blue is accepted but not read here; width is assumed even (pixels are handled in pairs). */
+    /* TODO: MMX optimization */
+    /* Source and destination offsets are derived from y inside the loop, so the rows are independent of each other. */
+    /*
+        Y input: 16..235
+        U input: 16..240
+        V input: 16..240
+        
+    */
+  #pragma omp parallel for
+    for(unsigned y=0; y<height; ++y)
+    {
+        unsigned ypos = y * width * 2; // source: two bytes per pixel on this row
+        unsigned pos  = y * width * 3; // destination: three bytes per pixel on this row
+        for(unsigned x=0; x<width; )   // x advances once, at the bottom of the body (it used to advance twice)
+        {
+            /* non-MMX */
+            int u = src[ypos+1] - U_ADD; // NOTE(review): Convert_2byte_To_YUY2Frame's non-MMX path stores V at +1 and U at +3 -- confirm intended order
+            int v = src[ypos+3] - V_ADD;
+
+            int rgb[3] =  // chroma contribution shared by both pixels of the pair
+                {
+                   (VR * v         ) >> (YUV2RGB_SHIFT),
+                   (VG * v + UG * u) >> (YUV2RGB_SHIFT),
+                   (       + UB * u) >> (YUV2RGB_SHIFT)
+                };
+            
+            for(unsigned r=0; r<2; ++r)
+                for(unsigned doffs=pos + r*3, yoffs=ypos+r*2,
+                        yy = (Y_REV * (src[yoffs] - Y_ADD)) >> YUV2RGB_SHIFT,
+                        n=0; n<3; ++n)
+                    dest[doffs+n] = c64::clamp_u8(rgb[n] + (int)yy);
+
+            ypos += 4; // four bytes for this line (y,u,y,v)
+            pos  += 2*3; // two triplets generated on this line
+            x    += 2; // two yy values used on this line
+        }
+    }
+}
+
+/***/
+void Convert24To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{   /* 24-bit RGB to I420: forwards to the shared template with argument 3 (presumably bytes per source pixel -- confirm). */
+    Convert_4byte_To_I420Frame<3>(data,dest,npixels,width);
+}
+void Convert32To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{   /* 32-bit RGB to I420: forwards to the shared template with argument 4 (presumably bytes per source pixel -- confirm). */
+    Convert_4byte_To_I420Frame<4>(data,dest,npixels,width);
+}
+void Convert15To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{   /* 15-bit RGB to I420; arguments presumably are R offset/bits, G offset/bits, B offset/bits (5-5-5) -- confirm against the template. */
+    Convert_2byte_To_I420Frame<10,5, 5,5, 0,5>(data,dest,npixels,width);
+}
+void Convert16To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{   /* 16-bit RGB to I420; arguments presumably are R offset/bits, G offset/bits, B offset/bits (5-6-5) -- confirm against the template. */
+    Convert_2byte_To_I420Frame<11,5, 5,6, 0,5>(data,dest,npixels,width);
+}
+/***/
+void Convert24To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{   /* 24-bit RGB to YUY2: forwards to the shared template with argument 3 (presumably bytes per source pixel -- confirm). */
+    Convert_4byte_To_YUY2Frame<3>(data,dest,npixels,width);
+}
+void Convert32To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{   /* 32-bit RGB to YUY2: forwards to the shared template with argument 4 (presumably bytes per source pixel -- confirm). */
+    Convert_4byte_To_YUY2Frame<4>(data,dest,npixels,width);
+}
+void Convert15To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{   /* 15-bit RGB to YUY2; template arguments are roffs,rbits, goffs,gbits, boffs,bbits: R at bit 10, G at bit 5, B at bit 0, 5 bits each. */
+    Convert_2byte_To_YUY2Frame<10,5, 5,5, 0,5>(data,dest,npixels,width);
+}
+void Convert16To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width)
+{   /* 16-bit RGB to YUY2; template arguments are roffs,rbits, goffs,gbits, boffs,bbits: R at bit 11 (5 bits), G at bit 5 (6 bits), B at bit 0 (5 bits). */
+    Convert_2byte_To_YUY2Frame<11,5, 5,6, 0,5>(data,dest,npixels,width);
+}
diff -NaHudr dosbox-0.73/src/rgbtorgb.hh dosbox-0.73-patched/src/rgbtorgb.hh
--- dosbox-0.73/src/rgbtorgb.hh	1970-01-01 02:00:00.000000000 +0200
+++ dosbox-0.73-patched/src/rgbtorgb.hh	2010-02-26 01:36:14.736589962 +0200
@@ -0,0 +1,69 @@
+#ifdef __cplusplus
+extern "C" {
+  #define defaulttrue =true
+#else
+  #define defaulttrue
+  #define bool       int
+#endif
+
+/* RGB to RGB and RGB from/to YCbCr (YUV) conversions written by Bisqwit
+ * Copyright (C) 1992,2008 Joel Yliluoma (http://iki.fi/bisqwit/)
+ *
+ * Concepts:
+ *   15 = RGB15 or BGR15
+ *   16 = RGB16 or BGR16
+ *   24 = RGB24 or BGR24
+ *   32 = RGB32 or BGR32
+ * I420 = YCbCr where Y is issued for each pixel,
+ *                    followed by Cr for 2x2 pixels,
+ *                    followed by Cb for 2x2 pixels
+ * YUY2 = YCbCr where for each pixel, Y is issued,
+ *                    followed by Cr for 2x1 pixels (if even pixel)
+ *                             or Cb for 2x1 pixels (if odd pixel)
+ *
+ * Note: Not all functions honor the swap_red_blue setting.
+ */
+
+void Convert32To24Frame(const void* data, unsigned char* dest, unsigned npixels)
+    __attribute__((noinline));
+
+void Convert15To24Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue defaulttrue)
+    __attribute__((noinline));
+
+void Convert16To24Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue defaulttrue)
+    __attribute__((noinline));
+
+void Convert15To32Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue defaulttrue)
+    __attribute__((noinline));
+
+void Convert16To32Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue defaulttrue)
+    __attribute__((noinline));
+
+void Convert24To16Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+void Convert24ToR16Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+
+void Convert24To15Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+
+void Convert_I420To24Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width, bool swap_red_blue defaulttrue)
+    __attribute__((noinline));
+
+void Convert15To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+void Convert16To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+void Convert24To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+void Convert32To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+
+void Convert_YUY2To24Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width, bool swap_red_blue defaulttrue)
+    __attribute__((noinline));
+
+void Convert15To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+void Convert16To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+void Convert24To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+void Convert32To_YUY2Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width);
+
+#ifdef __cplusplus
+}
+  #undef defaulttrue
+#else
+  #undef defaulttrue
+  #undef bool
+#endif
diff -NaHudr dosbox-0.73/src/simd.hh dosbox-0.73-patched/src/simd.hh
--- dosbox-0.73/src/simd.hh	1970-01-01 02:00:00.000000000 +0200
+++ dosbox-0.73-patched/src/simd.hh	2008-04-05 17:01:09.719886860 +0300
@@ -0,0 +1,365 @@
+#if defined(__MMX__) && !defined(__x86_64)
+#define USE_MMX
+#endif
+#if defined(__SSE__)
+#define USE_SSE
+#endif
+
+/* SIMD interface (MMX) written by Bisqwit
+ * Copyright (C) 1992,2008 Joel Yliluoma (http://iki.fi/bisqwit/)
+ */
+
+#ifdef __3dNOW__
+# include <mm3dnow.h> /* Note: not available on ICC */ 
+#elif defined(__MMX__)
+# include <mmintrin.h>
+#endif
+#ifdef __SSE__
+#include <xmmintrin.h>
+ #ifdef __ICC
+ typedef __m128 __v4sf;
+ #endif
+#endif
+
+struct c64_common
+{   /* Static helpers inherited by both 64-bit wrapper structs: range clamping and bit spreading. */
+    static signed char clamp_s8(int_fast64_t v)     // limits v to -128..127
+        { return v<-128 ? -128 : (v > 127 ? 127 : v); }
+    static unsigned char clamp_u8(int_fast64_t v)   // limits v to 0..255
+        { return v<0 ? 0 : (v > 255 ? 255 : v); }
+    static short clamp_s16(int_fast64_t v)          // limits v to -32768..32767
+        { return v<-32768 ? -32768 : (v > 32767 ? 32767 : v); }
+
+    static inline uint_fast64_t expand32_8(uint_fast32_t a)  // moves the four low bytes of a to bit positions 0, 16, 32 and 48
+    {
+        // 0000abcd -> 0a0b0c0d
+        typedef uint_fast64_t v;
+        return (a&0xFFU)
+            | ((a&0xFF00U)<<8)    // base: 8+8 = 16
+            | ((v)(a&0xFF0000U)<<16) // base: 16+16 = 32
+            | ((v)(a&0xFF000000UL)<<24); // base: 24+24 = 48
+    }
+    static inline uint_fast64_t expand32_16(uint_fast32_t a)  // moves the two low 16-bit words of a to bit positions 0 and 32
+    {
+        // 0000abcd -> 00ab00cd
+        typedef uint_fast64_t v;
+        return (a&0xFFFFU)
+         | ((v)(a&0xFFFF0000UL)<<16);   // base: 16+16 = 32
+    }
+};
+
+#ifdef __MMX__
+/* 64-bit integers that use MMX / 3Dnow operations where relevant */
+struct c64_MMX: public c64_common
+{   /* Wraps one __m64 and exposes the MMX intrinsics as member functions that return new c64_MMX values. */
+    typedef c64_MMX c64;
+
+    __m64 value;  // the wrapped 64-bit quantity
+    
+    inline c64_MMX() { }  // leaves value uninitialised
+    inline c64_MMX(__m64 v) : value(v) { }
+    inline c64_MMX(const uint64_t& v) : value( *(const __m64*)& v) { }  // reinterprets the 8 bytes of v
+    inline c64_MMX(int v) : value(_m_from_int(v)) { }
+    inline c64_MMX(short a,short b,short c, short d)  // a becomes the lowest 16-bit lane (setr order)
+        : value(_mm_setr_pi16(a,b,c,d)) { }
+
+    inline c64 operator<< (int b) const { if(b < 0) return *this >> -b; return shl64(b); }  // negative counts shift the other way
+    inline c64 operator>> (int b) const { if(b < 0) return *this << -b; return shr64(b); }
+    c64& operator<<= (int n) { return *this = shl64(n); }
+    c64& operator>>= (int n) { return *this = shr64(n); }
+
+    c64 conv_s16_u8() const { return conv_s16_u8(*this); }  // packs this value with itself
+    c64 conv_s16_s8() const { return conv_s16_s8(*this); }
+
+    void Get(const unsigned char* p)      { value = *(const __m64*)p; }  // loads 8 bytes from p
+    void Put(      unsigned char* p)const { *(__m64*)p =  value; }       // stores 8 bytes to p
+    
+    void Init16(short a,short b,short c, short d)
+        { value = _mm_setr_pi16(a,b,c,d); }
+    void Init16(short a)  // broadcasts a to all four 16-bit lanes
+        { value = _mm_set1_pi16(a); }
+
+    void GetD(const unsigned char* p)      { value = *(const __m64*)p; }  // NOTE: body is identical to Get()
+    
+    template<int n>
+    short Extract16() const { return ((const short*)&value)[n]; }  // reads 16-bit lane n through a pointer cast
+    template<int n>
+    int Extract32() const { return ((const int*)&value)[n]; }      // reads 32-bit lane n through a pointer cast
+    
+    short Extract88_from_1616lo() const  // ORs byte 0 with the 16-bit word read at byte offset 1 (an unaligned read)
+    {
+        const unsigned char* data = (const unsigned char*)&value;
+        // bytes:  76543210
+        // shorts: 33221100
+        // take:        H L
+        return data[0] | *(short*)(data+1);
+        //return data[0] | ((*(const unsigned int*)data) >> 8);
+    }
+    short Extract88_from_1616hi() const  // same as above, starting at byte 4
+    {
+        const unsigned char* data = 4+(const unsigned char*)&value;
+        // bytes:  76543210
+        // shorts: 33221100
+        // take:    H L
+        return data[0] | *(short*)(data+1);
+        //return data[0] | ((*(const unsigned int*)data) >> 8);
+    }
+    
+
+    c64& operator&= (const c64& b) { value=_mm_and_si64(value,b.value); return *this; }
+    c64& operator|= (const c64& b) { value=_mm_or_si64(value,b.value); return *this; }
+    c64& operator^= (const c64& b) { value=_mm_xor_si64(value,b.value); return *this; }
+    c64& operator+= (const c64& b) { return *this = *this + b; }
+    c64& operator-= (const c64& b) { return *this = *this - b; }
+    
+    c64 operator~ () const {  // XOR against an all-ones pattern
+        static const uint_least64_t negpat = ~(uint_least64_t)0;
+        return c64(_mm_xor_si64(value, *(const __m64*)&negpat));
+    }
+    
+            /* psllqi: p = packed
+                       s = shift
+                       r = right, l = left
+                       l = shift in zero, a = shift in sign bit
+                       q = 64-bit, d = 32-bit, w = 16-bit
+                      [i = immed amount]
+             */
+    c64 operator& (const c64& b) const { return c64(_mm_and_si64(value,b.value)); }
+    c64 operator| (const c64& b) const { return c64(_mm_or_si64(value,b.value)); }
+    c64 operator^ (const c64& b) const { return c64(_mm_xor_si64(value,b.value)); }
+    
+    c64 operator- (const c64& b) const  // whole-64-bit subtraction; the intrinsic is used only when __SSE2__ is defined
+    {
+        #ifdef __SSE2__
+        return _mm_sub_si64(value, b.value);
+        #else
+        return (const uint64_t&)value - (const uint64_t&)b.value;
+        #endif
+    }
+    c64 operator+ (const c64& b) const  // whole-64-bit addition; the intrinsic is used only when __SSE2__ is defined
+    {
+        #ifdef __SSE2__
+        return _mm_add_si64(value, b.value);
+        #else
+        return (const uint64_t&)value + (const uint64_t&)b.value;
+        #endif
+    }
+    
+
+    c64 shl64(int b) const { return _mm_slli_si64(value, b); }
+    c64 shr64(int b) const { return _mm_srli_si64(value, b); }
+    c64 shl16(int b) const { return _mm_slli_pi16(value, b); }
+    c64 shr16(int b) const { return _mm_srli_pi16(value, b); }
+    c64 sar32(int b) const { return _mm_srai_pi32(value, b); }
+    c64 sar16(int b) const { return _mm_srai_pi16(value, b); }
+    c64 add32(const c64& b) const { return _mm_add_pi32(value, b.value); }
+    c64 add16(const c64& b) const { return _mm_add_pi16(value, b.value); }
+    c64 sub32(const c64& b) const { return _mm_sub_pi32(value, b.value); }
+    c64 sub16(const c64& b) const { return _mm_sub_pi16(value, b.value); }
+    c64 mul16(const c64& b) const   { return _mm_mullo_pi16(value, b.value); }
+    c64 mul16hi(const c64& b) const { return _mm_mulhi_pi16(value, b.value); }
+    //c64 mul32(const c64& b) const { return _mm_mullo_pi32(value, b.value); }
+    c64 add8(const c64& b) const { return _mm_add_pi8(value, b.value); }
+    c64 sub8(const c64& b) const { return _mm_sub_pi8(value, b.value); }
+    /* Note the operand order below: the argument b is passed as the FIRST intrinsic operand, *this as the second. */
+    c64 unpacklbw(const c64& b) const { return _mm_unpacklo_pi8(b.value,value); }
+    c64 unpacklwd(const c64& b) const { return _mm_unpacklo_pi16(b.value,value); }
+    c64 unpackhbw(const c64& b) const { return _mm_unpackhi_pi8(b.value,value); }
+    c64 unpackhwd(const c64& b) const { return _mm_unpackhi_pi16(b.value,value); }
+    c64 unpackldq(const c64& b) const { return _mm_unpacklo_pi32(b.value,value); }
+    c64 unpackldq() const { return _mm_unpacklo_pi32(value,value); }
+
+    c64 operator& (const uint64_t& v) { return c64(_mm_and_si64(value, *(const __m64*)& v)); }  // NOTE: not const-qualified, unlike the c64 overload
+    
+    c64 conv_s32_s16(const c64& b) const { return _mm_packs_pi32(value, b.value); }
+    c64 conv_s16_u8(const c64& b) const { return _mm_packs_pu16(value, b.value); }
+    c64 conv_s16_s8(const c64& b) const { return _mm_packs_pi16(value, b.value); }
+};
+#endif
+
+struct c64_nonMMX: public c64_common
+{
+    typedef c64_nonMMX c64;
+    /* Plain-integer counterpart of c64_MMX: the same member names, operating on one 64-bit integer. */
+    uint_least64_t value;  // lane 0 is the least significant part
+    
+    inline c64_nonMMX() { }  // leaves value uninitialised
+    inline c64_nonMMX(uint64_t v) : value(v) { }
+    inline c64_nonMMX(int v) : value(v) { }
+    inline c64_nonMMX(short a,short b,short c, short d)
+        { Init16(a,b,c,d); }
+
+    c64 operator<< (int b) const { if(b < 0) return *this >> -b; return shl64(b); }  // negative counts shift the other way
+    c64 operator>> (int b) const { if(b < 0) return *this << -b; return shr64(b); }
+    c64& operator<<= (int n) { return *this = shl64(n); }
+    c64& operator>>= (int n) { return *this = shr64(n); }
+
+    c64 conv_s16_u8() const { return conv_s16_u8(*this); }  // packs this value with itself
+    c64 conv_s16_s8() const { return conv_s16_s8(*this); }
+
+    void Init16(short a,short b,short c, short d)  // a becomes the lowest 16-bit lane
+        { uint_fast64_t aa = (unsigned short)a,
+                        bb = (unsigned short)b,
+                        cc = (unsigned short)c,
+                        dd = (unsigned short)d;
+          value = aa | (bb << 16) | (cc << 32) | (dd << 48); }
+    void Init16(short a)  // broadcasts a to all four 16-bit lanes
+        { Init16(a,a,a,a); }
+    void Init8(unsigned char a,unsigned char b,unsigned char c,unsigned char d,
+               unsigned char e,unsigned char f,unsigned char g,unsigned char h)
+    {   /* a is the lowest byte. d is widened before shifting: (d << 24) in int went negative for d >= 128 and set bytes 4..7. */
+        value = ((uint_fast64_t)(a | (b << 8) | (c << 16))) | (((uint_fast64_t)d) << 24)
+              | (((uint_fast64_t)e) << 32)
+              | (((uint_fast64_t)f) << 40)
+              | (((uint_fast64_t)g) << 48)
+              | (((uint_fast64_t)h) << 56);
+    }
+
+    void Get(const unsigned char* p)      { value = *(const uint_least64_t*)p; }  // loads 8 bytes from p
+    void Put(      unsigned char* p)const { *(uint_least64_t*)p =  value; }       // stores 8 bytes to p
+    
+    c64& operator&= (const c64& b) { value&=b.value; return *this; }
+    c64& operator|= (const c64& b) { value|=b.value; return *this; }
+    c64& operator^= (const c64& b) { value^=b.value; return *this; }
+    c64& operator+= (const c64& b) { value+=b.value; return *this; }
+    c64& operator-= (const c64& b) { value-=b.value; return *this; }
+    c64 operator& (const c64& b) const { return value & b.value; }
+    c64 operator| (const c64& b) const { return value | b.value; }
+    c64 operator^ (const c64& b) const { return value ^ b.value; }
+    c64 operator- (const c64& b) const { return value - b.value; }
+    c64 operator+ (const c64& b) const { return value + b.value; }
+
+    c64 operator& (uint_fast64_t b) const { return value & b; }
+
+    c64 operator~ () const { return ~value; }
+    /* Lane-wise helpers: view res.value as `count` lanes of `type`; usimdsim applies a scalar b, simdsim the matching lane of b. */
+    #define usimdsim(type, count, op) \
+        type* p = (type*)&res.value; \
+        for(int n=0; n<count; ++n) p[n] = (p[n] op b)
+
+    #define simdsim(type, count, op) \
+        type* p = (type*)&res.value; \
+        const type* o = (const type*)&b.value; \
+        for(int n=0; n<count; ++n) p[n] = (p[n] op o[n])
+    /* A 64-bit value holds FOUR 16-bit lanes; the 16-bit shifts previously touched only two of them. */
+    c64 shl64(int b) const { return value << b; }
+    c64 shr64(int b) const { return value >> b; }
+    c64 shl16(int b) const { c64 res = *this; usimdsim(unsigned short, 4, <<); return res; }
+    c64 shr16(int b) const { c64 res = *this; usimdsim(unsigned short, 4, >>); return res; }
+    c64 sar32(int b) const { c64 res = *this; usimdsim(int, 2, >>); return res; }
+    c64 sar16(int b) const { c64 res = *this; usimdsim(short, 4, >>); return res; }
+
+    c64 add16(const c64& b) const { c64 res = *this; simdsim(short, 4, +); return res; }
+    c64 sub16(const c64& b) const { c64 res = *this; simdsim(short, 4, -); return res; }
+    c64 add32(const c64& b) const { c64 res = *this; simdsim(int,   2, +); return res; }
+    c64 sub32(const c64& b) const { c64 res = *this; simdsim(int,   2, -); return res; }
+    c64 mul16(const c64& b) const { c64 res = *this; simdsim(short, 4, *); return res; }
+    c64 mul16hi(const c64& b) const { c64 res = *this; simdsim(short, 4, *) >> 16; return res; }  // the ">> 16" lands on the macro's product, keeping its high half
+    c64 add8(const c64& b) const { c64 res = *this; simdsim(unsigned char, 8, +); return res; }
+    c64 sub8(const c64& b) const { c64 res = *this; simdsim(unsigned char, 8, -); return res; }
+    
+    #undef simdsim
+    #undef usimdsim
+    /* Narrowing packs: each lane is reinterpreted as SIGNED before clamping, so that negative lanes saturate low, not high. */
+    c64 conv_s32_s16(const c64& b) const
+    {
+        c64 res; res.
+        Init16(clamp_s16((int)(value & 0xFFFFFFFFU)),
+               clamp_s16((int)(value >> 32)),
+               clamp_s16((int)(b.value & 0xFFFFFFFFU)),
+               clamp_s16((int)(b.value >> 32)));
+        return res;
+    }
+    c64 conv_s16_u8(const c64& b) const
+    {
+        c64 res; res.
+        Init8(clamp_u8((short)(value & 0xFFFF)),
+              clamp_u8((short)((value >> 16) & 0xFFFF)),
+              clamp_u8((short)((value >> 32) & 0xFFFF)),
+              clamp_u8((short)((value >> 48) & 0xFFFF)),
+              clamp_u8((short)(b.value & 0xFFFF)),
+              clamp_u8((short)((b.value >> 16) & 0xFFFF)),
+              clamp_u8((short)((b.value >> 32) & 0xFFFF)),
+              clamp_u8((short)((b.value >> 48) & 0xFFFF)));
+        return res;
+    }
+    c64 conv_s16_s8(const c64& b) const
+    {
+        c64 res; res.
+        Init8(clamp_s8((short)(value & 0xFFFF)),
+              clamp_s8((short)((value >> 16) & 0xFFFF)),
+              clamp_s8((short)((value >> 32) & 0xFFFF)),
+              clamp_s8((short)((value >> 48) & 0xFFFF)),
+              clamp_s8((short)(b.value & 0xFFFF)),
+              clamp_s8((short)((b.value >> 16) & 0xFFFF)),
+              clamp_s8((short)((b.value >> 32) & 0xFFFF)),
+              clamp_s8((short)((b.value >> 48) & 0xFFFF)));
+        return res;
+    }
+
+    /* Interleaves: as in the intrinsic branches, p supplies the low (even) lanes and *this the high (odd) lanes. */
+    c64 unpacklbw(const c64& p) const
+    {
+    #if defined(__MMX__) && !defined(__ICC)
+        /* ICC says [error: type of cast must be integral or enum]
+         * on the return value cast,
+         * so we cannot use this code on ICC. Fine for GCC. */
+        return (uint_least64_t)_m_punpcklbw(*(__m64*)&p.value, *(__m64*)&value);
+    #else
+        uint_fast64_t a=p.value, b=value;
+        return expand32_8(a) | (expand32_8(b) << 8);
+    #endif
+    }
+    c64 unpackhbw(const c64& p) const
+    {
+    #if defined(__MMX__) && !defined(__ICC)
+        return (uint_least64_t)_m_punpckhbw(*(__m64*)&p.value, *(__m64*)&value);
+    #else
+        uint_fast64_t a=p.value, b=value;
+        return expand32_8(a>>32) | (expand32_8(b>>32) << 8);
+    #endif
+    }
+    c64 unpacklwd(const c64& p) const
+    {
+    #if defined(__MMX__) && !defined(__ICC)
+        return (uint_least64_t)_m_punpcklwd(*(__m64*)&p.value, *(__m64*)&value);
+    #else
+        uint_fast64_t a=p.value, b=value;
+        return expand32_16(a) | (expand32_16(b) << 16);
+    #endif
+    }
+    c64 unpackhwd(const c64& p) const
+    {
+    #if defined(__MMX__) && !defined(__ICC)
+        return (uint_least64_t)_m_punpckhwd(*(__m64*)&p.value, *(__m64*)&value);
+    #else
+        uint_fast64_t a=p.value, b=value;
+        return expand32_16(a>>32) | (expand32_16(b>>32) << 16);
+    #endif
+    }
+    c64 unpackldq() const { return unpackldq(*this); }
+    c64 unpackldq(const c64& p) const
+    {
+    #if defined(__MMX__) && !defined(__ICC)
+        return (uint_least64_t)_m_punpckldq(*(__m64*)&p.value, *(__m64*)&value);
+    #else
+        return (p.value & 0xFFFFFFFFU) | (value << 32);  // low dword of p, then low dword of *this
+    #endif
+    }
+};
+
+#ifdef USE_MMX
+typedef c64_MMX c64;
+#else
+typedef c64_nonMMX c64;
+#endif
+
+static inline void MMX_clear()
+{   /* Called by the converters after their MMX loops: issues _m_femms() on 3DNow! builds, else _mm_empty() on MMX builds, else nothing. */
+    #ifdef __3dNOW__
+    _m_femms(); /* Note: not available on ICC or Valgrind */
+    //_mm_empty();
+    #elif defined(__MMX__)
+    _mm_empty();
+    #endif
+}