diff -NaHudr dega-1.14/doze/dam.cpp changed/doze/dam.cpp --- dega-1.14/doze/dam.cpp 2007-08-25 20:10:29.000000000 +0300 +++ changed/doze/dam.cpp 2007-12-04 19:43:29.315440000 +0200 @@ -7,7 +7,7 @@ char DamPc[]="si"; char DamCycles[]="dword [_nDozeCycles]"; -int ot(char *Format,...) +int ot(const char *Format,...) { va_list Arg; va_start(Arg,Format); if (Out!=NULL) vfprintf(Out,Format,Arg); @@ -188,7 +188,11 @@ { int i=0; ot("; Doze - Dave's Z80 Emulator - Assembler output\n\n"); +#ifdef x86_64 + ot("bits 64\n\n"); +#else ot("bits 32\n\n"); +#endif ot("section .data\n\n"); DamVariables(); @@ -237,4 +241,4 @@ DamMain(); fclose(Out); Out=NULL; return 0; -} \ No newline at end of file +} diff -NaHudr dega-1.14/doze/dam.h changed/doze/dam.h --- dega-1.14/doze/dam.h 2007-08-25 20:10:29.000000000 +0300 +++ changed/doze/dam.h 2007-12-04 19:37:09.465793000 +0200 @@ -8,7 +8,7 @@ // dam.cpp extern char DamPc[]; extern char DamCycles[]; -int ot(char *Format,...); +int ot(const char *Format,...); void DamAlign(); void DamVarToReg(); void DamRegToVar(); diff -NaHudr dega-1.14/Makefile changed/Makefile --- dega-1.14/Makefile 2007-08-25 20:10:29.000000000 +0300 +++ changed/Makefile 2008-02-18 04:54:45.642668000 +0200 @@ -2,9 +2,9 @@ #OPTFLAGS=-O3 -fomit-frame-pointer -funroll-loops -march=i686 -mcpu=i686 #OPTFLAGS=-xM -O3 -CC=gcc +CC=gcc -g #CC=icc -CXX=g++ +CXX=g++ -g #CXX=icpc NASM=nasm @@ -14,7 +14,8 @@ CFLAGS= $(OPTFLAGS) -mno-cygwin -Imast -Idoze -Imaster -Iextra -Izlib endif -CXXFLAGS= $(CFLAGS) -fno-exceptions +CXXFLAGS= $(CFLAGS) +# -fno-exceptions DOZEOBJ = doze/doze.o doze/dozea.o DAMOBJ = doze/dam.o doze/dama.o doze/damc.o doze/dame.o doze/damf.o doze/damj.o doze/damm.o doze/damo.o doze/damt.o @@ -26,12 +27,13 @@ NASM_FORMAT = elf EXEEXT = SOEXT = .so - PLATOBJ = sdl/main.o + PLATOBJ = sdl/main.o sdl/nesvideos-piece.o sdl/beauty.o sdl/rgbtorgb.o PLATPYOBJ = PLATPYOBJCXX = EXTRA_LIBS = $(shell sdl-config --libs) DOZE_FIXUP = sed -f doze/doze.cmd.sed doze/dozea.asm.new && 
mv doze/dozea.asm.new doze/dozea.asm - EXTRA_LDFLAGS = + EXTRA_LDFLAGS = -lgd + CFLAGS += -mmmx GUI_LDFLAGS = SPECS = PYTHON_CFLAGS = $(shell python-config --cflags) $(CFLAGS) @@ -84,17 +86,17 @@ endif -dega$(EXEEXT): $(PLATOBJ) $(PLATPYOBJ) $(PLATPYOBJCXX) $(DOZEOBJ) $(MASTOBJ) $(PYEMBOBJ) $(SPECS) - $(CC) $(EXTRA_LDFLAGS) $(GUI_LDFLAGS) -o dega$(EXEEXT) $(PLATOBJ) $(PLATPYOBJ) $(PLATPYOBJCXX) $(DOZEOBJ) $(MASTOBJ) $(PYEMBOBJ) $(EXTRA_LIBS) +dega$(EXEEXT): $(PLATOBJ) $(DOZEOBJ) $(MASTOBJ) $(SPECS) + $(CXX) $(EXTRA_LDFLAGS) $(GUI_LDFLAGS) -o dega$(EXEEXT) $(PLATOBJ) $(DOZEOBJ) $(MASTOBJ) $(EXTRA_LIBS) degavi$(EXEEXT): tools/degavi.o $(DOZEOBJ) $(MASTOBJ) - $(CC) $(EXTRA_LDFLAGS) -o degavi$(EXEEXT) tools/degavi.o $(DOZEOBJ) $(MASTOBJ) -lm + $(CXX) $(EXTRA_LDFLAGS) -o degavi$(EXEEXT) tools/degavi.o $(DOZEOBJ) $(MASTOBJ) -lm mmvconv$(EXEEXT): tools/mmvconv.o $(SPECS) - $(CC) $(EXTRA_LDFLAGS) -o mmvconv$(EXEEXT) tools/mmvconv.o + $(CXX) $(EXTRA_LDFLAGS) -o mmvconv$(EXEEXT) tools/mmvconv.o -pydega$(SOEXT): $(PYOBJ) $(DOZEOBJ) $(MASTOBJ) $(SPECS) - $(CC) -shared -o pydega$(SOEXT) $(PYOBJ) $(DOZEOBJ) $(MASTOBJ) $(EXTRA_LDFLAGS) $(PYTHON_LDFLAGS) +pydega$(SOEXT): $(DOZEOBJ) $(MASTOBJ) $(SPECS) + #$(CXX) -shared -o pydega$(SOEXT) $(DOZEOBJ) $(MASTOBJ) $(EXTRA_LDFLAGS) doze/dozea.o: doze/dozea.asm nasm -f $(NASM_FORMAT) -o doze/dozea.o doze/dozea.asm @@ -104,7 +106,7 @@ $(DOZE_FIXUP) doze/dam$(EXEEXT): $(DAMOBJ) - $(CC) -o doze/dam$(EXEEXT) $(DAMOBJ) + $(CXX) -o doze/dam$(EXEEXT) $(DAMOBJ) master/app.o: master/app.rc cd master && $(WINDRES) -o app.o app.rc diff -NaHudr dega-1.14/mast/doze.h changed/mast/doze.h --- dega-1.14/mast/doze.h 2007-08-25 20:10:29.000000000 +0300 +++ changed/mast/doze.h 2007-12-04 19:41:54.446033000 +0200 @@ -39,9 +39,9 @@ extern struct DozeReg Doze; extern int nDozeCycles; // Memory access: -extern unsigned int DozeMemFetch[0x100]; -extern unsigned int DozeMemRead [0x100]; -extern unsigned int DozeMemWrite[0x100]; +extern unsigned long 
DozeMemFetch[0x100]; +extern unsigned long DozeMemRead [0x100]; +extern unsigned long DozeMemWrite[0x100]; unsigned char __cdecl DozeAsmRead(unsigned short nAddr); // doze.cpp diff -NaHudr dega-1.14/mast/map.cpp changed/mast/map.cpp --- dega-1.14/mast/map.cpp 2007-08-25 20:10:29.000000000 +0300 +++ changed/mast/map.cpp 2007-12-04 19:42:50.453225000 +0200 @@ -13,14 +13,14 @@ // 0000-03ff Fixed Rom view for (i=0x00;i<0x04;i++) - { DozeMemFetch[i]=DozeMemRead[i]=(unsigned int)Mastz.Rom; DozeMemWrite[i]=0; } + { DozeMemFetch[i]=DozeMemRead[i]=(unsigned long)Mastz.Rom; DozeMemWrite[i]=0; } // c000-dfff Ram for (i=0xc0;i<0xe0;i++) - { DozeMemFetch[i]=DozeMemRead[i]=DozeMemWrite[i]=(unsigned int)pMastb->Ram-0xc000; } + { DozeMemFetch[i]=DozeMemRead[i]=DozeMemWrite[i]=(unsigned long)pMastb->Ram-0xc000; } // e000-ffff Ram mirror for (i=0xe0;i<0x100;i++) - { DozeMemFetch[i]=DozeMemRead[i]=DozeMemWrite[i]=(unsigned int)pMastb->Ram-0xe000; } + { DozeMemFetch[i]=DozeMemRead[i]=DozeMemWrite[i]=(unsigned long)pMastb->Ram-0xe000; } // For bank writes ff00-ffff callback Doze* DozeMemWrite[0xff]=0; @@ -46,16 +46,16 @@ Mastz.RomPage[n]=PageOff; // Store in the Mastz structure } -static INLINE unsigned int GetRomPage(int n) +static INLINE unsigned long GetRomPage(int n) { CalcRomPage(n); // Recalc the rom page - return (unsigned int)(Mastz.Rom+Mastz.RomPage[n]); // Get the direct memory pointer + return (unsigned long)(Mastz.Rom+Mastz.RomPage[n]); // Get the direct memory pointer } // 0400-3fff Page 0 void MastMapPage0() { - unsigned int Page; Page=GetRomPage(0); + unsigned long Page; Page=GetRomPage(0); // Map Rom Page #ifdef EMU_DOZE { @@ -72,7 +72,7 @@ // 4000-7fff Page 1 void MastMapPage1() { - unsigned int Page; Page=GetRomPage(1); + unsigned long Page; Page=GetRomPage(1); // Map Rom Page #ifdef EMU_DOZE { @@ -90,11 +90,11 @@ // 8000-bfff Page 2 void MastMapPage2() { - unsigned int Page=0; int i=0; + unsigned long Page=0; int i=0; if (Masta.Bank[0]&0x08) { // Map Battery Ram - 
Page=(unsigned int)pMastb->Sram; + Page=(unsigned long)pMastb->Sram; Page+=(Masta.Bank[0]&4)<<11; // Page -> 0000 or 2000 #ifdef EMU_DOZE Page-=0x8000; diff -NaHudr dega-1.14/sdl/beauty.cc changed/sdl/beauty.cc --- dega-1.14/sdl/beauty.cc 1970-01-01 02:00:00.000000000 +0200 +++ changed/sdl/beauty.cc 2007-12-04 20:29:18.992135000 +0200 @@ -0,0 +1,545 @@ +#include +#include +#include +#include + +#include +#include + +#include + +#include "beauty.hh" +#include "coroutine.h" + +namespace Dega +{ +//#include "mast.h" // included by mastint.h +#include "mastint.h" // for frameCount +} + +typedef std::complex complex; + +int do_beauty_analysis = 0; + +/* Analyzes beauty in the given SMS screenshot. + */ + +static const unsigned char* xbuf = 0; +static unsigned width, height; + +static int skippattern(int p) +{ + return 1 + ((p/7)%3); +} +/* Calculate average tint in given section of the screen. */ +static complex GetAverageTint(int x0,int y0, int x1,int y1) +{ + double avg_i = 0, avg_q = 0, avg_y = 0; + double count = 0; + + for(int y=y0; y<=y1; y+=skippattern(x0+y)) + for(int x=x0; x<=x1; x+=skippattern(y+x)) + { + const unsigned char* pixptr = &xbuf[(x + y * width)*3]; + unsigned rr = pixptr[2]; + unsigned gg = pixptr[1]; + unsigned bb = pixptr[0]; + + double r = rr / 255.0; + double g = gg / 255.0; + double b = bb / 255.0; + + double y = 0.299 * r + 0.587 * g + 0.114 * b; + double i = 0.5957 * r - 0.2744 * g - 0.3212 * b; + double q = 0.2114 * r - 0.5226 * g + 0.3111 * b; + + double pow = 1; + + avg_i += i*y; + avg_q += q*y; + avg_y += y; + + count += pow; + } + + avg_i /= count; + avg_q /= count; + avg_y /= count; + + complex res(avg_i, avg_q); res *= 100; + + double y = std::abs(res); + if(y >= 0.0) res *= std::log(1 + y) / (1 + y); + return res; + + //return complex ( atan2(avg_i, avg_q) , std::log(1 + avg_y)); +} + +static struct SOBJ +{ + int HPos, VPos; + int Hei, Wid; +} OBJ[128]; +static void GenSprites() +{ + // Much a duplicate of mast/draw.cpp + int i; + // 
Find sprite table + unsigned char *ps; + unsigned char *Sprite=Dega::pMastb->VRam + ((Dega::Masta.v.Reg[5]<< 7)&0x3f00); + // Find the end of the sprite list + for (i=0,ps=Sprite; i<64; i++,ps++) { if (ps[0]==0xd0) break; } // End of sprite list + --i; + for(ps=Sprite+i; i>=0; --i,--ps) + { + SOBJ tmp; + int y = ps[0]; if(y >= 0xE0) y -= 0x100; ++y; + tmp.VPos = y; + tmp.Wid = 8; + tmp.Hei = (Dega::Masta.v.Reg[1]&2) ? 16 : 8; + unsigned char* pa = Sprite+0x80+(i<<1); + tmp.HPos = pa[0]; + if (Dega::Masta.v.Reg[0]&8) tmp.HPos -= 8; // gng + } +} + +struct region +{ + int x0,x1, y0,y1; + + complex tint; + + int sprite_jitter; +}; +static bool SOBJcompare(const SOBJ& a, const SOBJ& b) +{ + return std::memcmp(&a, &b, sizeof(a)) < 0; +} +static double CalculateBeauty(unsigned w,unsigned h) +{ + std::vector regions; + regions.reserve(4*3); + for(unsigned x=0; x<4; ++x) + for(unsigned y=0; y<3; ++y) + { + region tmp; + tmp.tint=0; + tmp.sprite_jitter=0; + tmp.x0 = x*w/4; + tmp.x1 = (x+1)*w/4; + tmp.y0 = y*h/3; + tmp.y1 = (y+1)*h/3; + regions.push_back(tmp); + } +/* + regions[5] = + { + // corners: nw, ne, sw, se, middle + { w*0 /99,w*50/99, h*0 /99,h*50/99, 0, 0 }, + { w*50/99,w*99/99, h*0 /99,h*50/99, 0, 0 }, + { w*0 /99,w*50/99, h*50/99,h*99/99, 0, 0 }, + { w*50/99,w*99/99, h*50/99,h*99/99, 0, 0 }, + { w*33/99,w*55/99, h*33/99,h*55/99, 0, 0 } + }; +*/ + + GenSprites(); + static SOBJ old_sprites[128]; + SOBJ new_sprites[128]; + std::memcpy(&new_sprites, OBJ, sizeof(new_sprites)); + + std::sort(new_sprites, new_sprites+128, SOBJcompare); + + for(unsigned b=0; b<128; ++b) + { + SOBJ& spr = new_sprites[b]; + SOBJ& old = old_sprites[b]; + + /* If the sprite appears to have moved, compensate for scrolling and try again */ + if(spr.HPos != old.HPos || spr.VPos != old.VPos) + { + spr.HPos -= (-Dega::Masta.v.Reg[8])&0xff; + spr.VPos -= Dega::Masta.v.Reg[9] % 224; + } + + if(std::memcmp(&spr, &old, sizeof(spr)) != 0) + { + old = spr; + + int x = spr.HPos, y = spr.VPos & 0xFF, + 
width = spr.Wid, height = spr.Hei; + + for(unsigned a=0; a< regions.size(); ++a) + { + /* Check if this sprite affects this region */ + region& reg = regions[a]; + + if(x+width >= reg.x0 && x < reg.x1 + && y+height >= reg.y0 && y < reg.y1) + { + reg.sprite_jitter += 1; + if(spr.HPos != old.HPos || spr.VPos != old.VPos) + reg.sprite_jitter += 2; + } + } + } + } + + double totaltint = 0; + for(unsigned a=0; a< regions.size(); ++a) + { + region& reg = regions[a]; + reg.tint = GetAverageTint(reg.x0, reg.y0, reg.x1, reg.y1); + totaltint += std::abs(reg.tint); + } + totaltint /= regions.size(); + + /* + PLAN + + Analyze five sections of this screen. + + - Each four quarters of the screen + - And the middle (slightly larger than a quarter?) + + Analyze: + - Average tint + - Amount of motion (ignore global motion) + + Value: + - Non-bleak tints + - Different tint in each section of screen + - Motion appears in at least two different sections of screen + + */ + + if(do_beauty_analysis >= 3) + { + fprintf(stderr, "analysis: "); + for(unsigned a=0; a< regions.size(); ++a) + { + region& reg = regions[a]; + fprintf(stderr, "(%3.1f<%+3.1f)(%2d) ", + std::abs(reg.tint), + std::arg(reg.tint), + reg.sprite_jitter); + } + fprintf(stderr, "\n"); + } + +/* + complex t = regions[0].tint + regions[1].tint + regions[2].tint + regions[3].tint; + t /= 4; +*/ + + double interestingness = 0; + + if(totaltint >= 0.1) + { + for(unsigned a=0; a data; + size_t dataptr; + int SaveAcb(Dega::MastArea* pma) + { + const unsigned char* pmadata = (const unsigned char*) pma->Data; + data.insert(data.end(), pmadata, pmadata + pma->Len); + return 0; + } + int LoadAcb(Dega::MastArea* pma) + { + memcpy(pma->Data, &data[dataptr], pma->Len); + dataptr += pma->Len; + return 0; + } +} } +struct SaveState +{ + std::vector Data; + + void Create() + { + BeautySS::data.clear(); + Dega::MastAcb = BeautySS::SaveAcb; + Dega::MastAreaDega(); + Dega::MvidPostSaveState(); + Dega::MastAcb = Dega::MastAcbNull; + Data = 
BeautySS::data; + } + void Load() + { + BeautySS::data = Data; + BeautySS::dataptr = 0; + Dega::MastAcb = BeautySS::LoadAcb; + Dega::MastAreaDega(); + Dega::MvidPostLoadState(1); // readonly + Dega::MastAcb = Dega::MastAcbNull; + } +}; + +static std::vector interestingness_map; +static std::map some_savestates; +static std::vector review_results; + +static bool LocateBeauty(unsigned frameno) +{ + scrBegin; + if(true) /* scope */ + { + std::map::iterator + i = some_savestates.lower_bound(frameno); + + for(;;) + { + if(i != some_savestates.end()) + { + if(i->first <= frameno) + { + unsigned framecount = Dega::frameCount; + + if(framecount > frameno || i->first > framecount) + { + fprintf(stderr, "Aiming for frame %u, loading one that gives %u hopefully\n", + frameno, i->first); + i->second.Load(); + } + else + { + fprintf(stderr, "Aiming for frame %u. Playing from %d\n", + frameno, framecount); + } + + fflush(stderr); + break; + } + } + if(i == some_savestates.begin()) break; + --i; + } + } + + while(Dega::frameCount < frameno) + { + // fprintf(stderr, "Frame %u\n", Dega::frameCount); + // fflush(stderr); + // Settings.FrameAdvance = 1; + scrReturn(false); + } + scrFinish(true); +} + +extern "C" void CloseStuff(int signum); + +#include +static void CreatePNG(const char* beauty_fn) +{ + FILE*fp = fopen(beauty_fn, "wb"); + if(!fp) { perror(beauty_fn); return; } + + // xbuf, width, height + gdImagePtr im = gdImageCreateTrueColor(width,height); + for(unsigned y=0; y sect.value; } + bool operator== (const section& sect) const + { return value == sect.value; } + }; + + +static void ReviewBeauty() +{ + const unsigned framecount = Dega::frameCount; + const unsigned num_sections = 20;std::max( + std::min(50, (int)(framecount / 60)), + // not more than 50 + // not more than one per each second of movie + (int)(framecount / 600) + // but still at least one per each 10 seconds of movie + ); + const unsigned num_results = 20;std::min( + std::min(20, (int)( framecount / (3600/5) ) 
), + // not more than 15 + // not more than 5 per each minute of movie + (int)(num_sections)); + + std::vector
sects(num_sections); + + for(unsigned a=0; a sects[sect].value) sects[sect].value = val; + } + + std::sort(sects.begin(), sects.end()); + + for(unsigned a=0; a= 2) + { + std::printf( + "Beauty %u: %g\n", framecount, interestingness); + std::fflush(stdout); + } + + if(interestingness_map.size() <= framecount) + interestingness_map.resize(framecount+1); + interestingness_map[framecount] = interestingness; + + static unsigned maxframe = ~0; + if(maxframe == (unsigned)~0) + { + const char* env = getenv("MAXFRAMES"); + maxframe = env ? atoi(env) : 0; + } + + finished_recording = framecount == maxframe-2; + + if(!finished_recording) + { + return; + } + + //Settings.Paused = 1; + } +} + +void Beauty_INPUT() +{ + if(!do_beauty_analysis) return; + + if(!finished_recording) /* Record interestingness */ + { + static unsigned lastcount=0; + const unsigned framecount = Dega::frameCount; + if(framecount != 0 && (framecount == 1 || !(framecount % 100)) + && framecount != lastcount) + { + fprintf(stderr, "Creating savestate @ %u\n", framecount); + fflush(stderr); + some_savestates[framecount].Create(); + lastcount = framecount; + } + return; + } + + /* Analyze interestingness */ + if(review_results.empty()) + ReviewBeauty(); + + CaptureBeauty(); +} diff -NaHudr dega-1.14/sdl/beauty.hh changed/sdl/beauty.hh --- dega-1.14/sdl/beauty.hh 1970-01-01 02:00:00.000000000 +0200 +++ changed/sdl/beauty.hh 2007-12-04 19:35:46.345056000 +0200 @@ -0,0 +1,9 @@ +#ifdef __cplusplus +extern "C" { +#endif +extern int do_beauty_analysis; +void Beauty_VIS(const unsigned char* Xbuf, unsigned w, unsigned h); +void Beauty_INPUT(); +#ifdef __cplusplus +} +#endif diff -NaHudr dega-1.14/sdl/coroutine.h changed/sdl/coroutine.h --- dega-1.14/sdl/coroutine.h 1970-01-01 02:00:00.000000000 +0200 +++ changed/sdl/coroutine.h 2007-11-23 10:24:25.046571000 +0200 @@ -0,0 +1,249 @@ +/* coroutine.h + * + * Coroutine mechanics, implemented on top of standard ANSI C. 
See + * http://www.chiark.greenend.org.uk/~sgtatham/coroutines.html for + * a full discussion of the theory behind this. + * + * To use these macros to define a coroutine, you need to write a + * function that looks something like this. + * + * [Simple version using static variables (scr macros)] + * int ascending (void) { + * static int i; + * + * scrBegin; + * for (i=0; i<10; i++) { + * scrReturn(i); + * } + * scrFinish(-1); + * } + * + * [Re-entrant version using an explicit context structure (ccr macros)] + * int ascending (ccrContParam) { + * ccrBeginContext; + * int i; + * ccrEndContext(foo); + * + * ccrBegin(foo); + * for (foo->i=0; foo->i<10; foo->i++) { + * ccrReturn(foo->i); + * } + * ccrFinish(-1); + * } + * + * In the static version, you need only surround the function body + * with `scrBegin' and `scrFinish', and then you can do `scrReturn' + * within the function and on the next call control will resume + * just after the scrReturn statement. Any local variables you need + * to be persistent across an `scrReturn' must be declared static. + * + * In the re-entrant version, you need to declare your persistent + * variables between `ccrBeginContext' and `ccrEndContext'. These + * will be members of a structure whose name you specify in the + * parameter to `ccrEndContext'. + * + * The re-entrant macros will malloc() the state structure on first + * call, and free() it when `ccrFinish' is reached. If you want to + * abort in the middle, you can use `ccrStop' to free the state + * structure immediately (equivalent to an explicit return() in a + * caller-type routine). + * + * A coroutine returning void type may call `ccrReturnV', + * `ccrFinishV' and `ccrStopV', or `scrReturnV', to avoid having to + * specify an empty parameter to the ordinary return macros. + * + * Ground rules: + * - never put `ccrReturn' or `scrReturn' within an explicit `switch'. + * - never put two `ccrReturn' or `scrReturn' statements on the same + * source line. 
+ * + * The caller of a static coroutine calls it just as if it were an + * ordinary function: + * + * void main(void) { + * int i; + * do { + * i = ascending(); + * printf("got number %d\n", i); + * } while (i != -1); + * } + * + * The caller of a re-entrant coroutine must provide a context + * variable: + * + * void main(void) { + * ccrContext z = 0; + * do { + * printf("got number %d\n", ascending (&z)); + * } while (z); + * } + * + * Note that the context variable is set back to zero when the + * coroutine terminates (by ccrStop, or by control reaching + * ccrFinish). This can make the re-entrant coroutines more useful + * than the static ones, because you can tell when they have + * finished. + * + * If you need to dispose of a ccrContext when it is non-zero (that + * is, if you want to stop calling a coroutine without suffering a + * memory leak), the caller should call `ccrAbort(ctx)' where `ctx' + * is the context variable. + * + * This mechanism could have been better implemented using GNU C + * and its ability to store pointers to labels, but sadly this is + * not part of the ANSI C standard and so the mechanism is done by + * case statements instead. That's why you can't put a ccrReturn() + * or scrReturn() inside a switch() statement. + */ + +/* + * coroutine.h is copyright 1995,2000 Simon Tatham. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL SIMON TATHAM BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id: coroutine.h 6386 2005-10-12 09:13:42Z simon $ + */ + +#ifndef COROUTINE_H +#define COROUTINE_H + +//#include + +#ifdef __GNUC__ +//# define COROUTINES_USE_LABELGOTO +#endif + +/* + * `scr' macros for static coroutines. + */ + +/* 0 is faster, but 1 enables debugging */ + +#ifdef __cplusplus +# define scrPreamble try { +# define scrPostmble } catch(...) { scrLine=0; throw; } +# define ccrAlloc(x) new ccrContextTag +# define ccrFree(p) delete p +# define ccrPreamble try { +# define ccrPostmble } catch(...) { ccrFree( (ccrContextTag*) *ccrParam); *ccrParam=0; throw; } /* free the context object (*ccrParam), not the caller's slot, as ccrFinish/ccrStop do */ +#else +# define scrPreamble +# define scrPostmble +# define ccrAlloc(x) malloc(sizeof(*x)) +# define ccrFree(p) free(p) +# define ccrPreamble +# define ccrPostmble +#endif + +#ifndef COROUTINES_USE_LABELGOTO + +/* debugging compatible, but allows only one scrReturn per line */ + +#define scrBegin static int scrLine = 0; scrPreamble switch(scrLine) { case 0:; +#define scrFinish(z) } scrPostmble scrLine=0; return (z) +#define scrFinishV } scrPostmble scrLine=0; return + +#define scrReturnCode(code) \ + do {\ + scrLine=__LINE__;\ + code; case __LINE__:;\ + } while (0) +#define scrCallBegin() do { scrLine=__LINE__;case __LINE__:; } while(0) + +#else + +/* faster, gcc-only */ + +#define scrBegin static void*scrLabel=0; scrPreamble \ + {if(scrLabel)goto *scrLabel;} +#define scrFinish(z) scrPostmble scrLabel=0; return (z) +#define scrFinishV scrPostmble scrLabel=0; return + +#define scrReturnCode(code) \ + do { __label__ 
scrAnchor; scrLabel = &&scrAnchor; \ + code; scrAnchor: ; } while(0) + +#define scrCallBegin() do { __label__ scrAnchor; scrLabel = &&scrAnchor; scrAnchor: ; } while(0) + +#endif + +#define scrReturn(z) scrReturnCode(return(z)) +#define scrReturnV scrReturnCode(return) +/* NOTE: scrThrow does not work together with scrPreamble+scrPostmble! */ +#define scrThrow(z) scrReturnCode(throw z) + + +/* + * `ccr' macros for re-entrant coroutines. + */ + +#define ccrContParam void **ccrParam + +#ifndef COROUTINES_USE_LABELGOTO + +/* debugging compatible, but allows only one ccrReturn per line */ + +#define ccrBeginContext struct ccrContextTag { int ccrLine +#define ccrEndContext(x) } *x = (ccrContextTag*)*ccrParam + +#define ccrBegin(x) if(!x) {x=ccrAlloc(x); *ccrParam=(void*)x; x->ccrLine=0;}\ + ccrPreamble if (x) switch(x->ccrLine) { case 0:; + +#define ccrReturnCode(code) \ + do {\ + ((struct ccrContextTag *)*ccrParam)->ccrLine=__LINE__;\ + code; case __LINE__:;\ + } while (0) +#else + +/* faster, gcc-only */ + +#define ccrBeginContext struct ccrContextTag { void* ccrLabel +#define ccrEndContext(x) } *x = (ccrContextTag*)*ccrParam + +#define ccrBegin(x) if(!x) {x=ccrAlloc(x); *ccrParam=(void*)x; x->ccrLabel=0;}\ + ccrPreamble if (x && x->ccrLabel) goto* x->ccrLabel; + +#define ccrReturnCode(code) \ + do {\ + __label__ ccrAnchor; \ + ((struct ccrContextTag *)*ccrParam)->ccrLabel=&&ccrAnchor; \ + code; ccrAnchor:; \ + } while (0) + +#endif + +#define ccrReturn(z) ccrReturnCode(return z) +#define ccrReturnV ccrReturnCode(return) +/* NOTE: ccrThrow does not work together with ccrPreamble+ccrPostmble! 
*/ +#define ccrThrow(z) ccrReturnCode(throw z) + +#define ccrFinish(z) } ccrPostmble ccrFree( (ccrContextTag*) *ccrParam); *ccrParam=0; return (z) +#define ccrFinishV } ccrPostmble ccrFree( (ccrContextTag*) *ccrParam); *ccrParam=0; return + +#define ccrStop(z) do{ ccrFree( (ccrContextTag*) *ccrParam); *ccrParam=0; return (z); }while(0) +#define ccrStopV do{ ccrFree( (ccrContextTag*) *ccrParam); *ccrParam=0; return; }while(0) + +#define ccrContext void * +#define ccrAbort(ctx) do { ccrFree (/*FIXME,type?*/ctx); ctx = 0; } while (0) + +#endif /* COROUTINE_H */ diff -NaHudr dega-1.14/sdl/main.c changed/sdl/main.c --- dega-1.14/sdl/main.c 2007-08-25 20:10:29.000000000 +0300 +++ changed/sdl/main.c 2007-12-04 20:26:44.767346000 +0200 @@ -15,8 +15,11 @@ #include "../python/embed.h" -SDL_Surface *thescreen; -SDL_Color themap[256]; +#include "nesvideos-piece.hh" +#include "beauty.hh" + +static SDL_Surface *thescreen = 0; +static SDL_Color themap[256] = { { } }; int width, height; @@ -150,6 +153,7 @@ puts("Enter name of movie to begin playback:"); chompgets(buffer, sizeof(buffer), stdin); MvidStart(buffer, PLAYBACK_MODE, 0); + if(LoggingEnabled) LoggingEnabled = 2; } void HandleSetAuthor(void) { @@ -187,7 +191,7 @@ puts("Enter name of Python control script to execute:"); chompgets(buffer, sizeof(buffer), stdin); - MPyEmbed_Run(buffer); +// MPyEmbed_Run(buffer); } void HandlePythonREPL(void) { @@ -196,12 +200,12 @@ return; } - MPyEmbed_Repl(); +// MPyEmbed_Repl(); } void *PythonThreadRun(void *pbuf) { char *buffer = pbuf; - MPyEmbed_RunThread(buffer); +// MPyEmbed_RunThread(buffer); free(buffer); return 0; } @@ -299,7 +303,7 @@ #if 0 pydega_cbpostframe(mainstate); #else - MPyEmbed_CBPostFrame(); +// MPyEmbed_CBPostFrame(); #endif if (input) { @@ -320,15 +324,20 @@ printf(" -s --nosound\tdisable sound\n"); printf(" -f --fullscreen\tfullscreen display\n"); printf(" -r --readonly\tmovies are readonly\n"); + printf(" --autodemo \t automatically starts playing back the given 
movie\n"); printf("\n" APPNAME_LONG " version " VERSION " by Ulrich Hecht \n"); printf("extended by Peter Collingbourne \n"); printf("based on Win32 version by Dave \n"); exit (0); } +static unsigned char VideoData[512*512*3]; +static unsigned xlef, ytop; // xwid = width, yhei = height + int main(int argc, char** argv) { unsigned char* rom; + char* AutoLoadMovie = NULL; /* stays NULL unless --autodemo is given; it is tested before MvidStart() is called */ int romlength; int done=0; SDL_Event event; @@ -346,7 +355,7 @@ readonly = 0; - MPyEmbed_SetArgv(argc, argv); +// MPyEmbed_SetArgv(argc, argv); while(1) { @@ -363,6 +372,9 @@ {"nosound",no_argument,NULL,'s'}, {"fullscreen",no_argument,NULL,'f'}, {"readonly",no_argument,NULL,'r'}, + {"autodemo",required_argument,NULL,600}, + {"videolog",required_argument,NULL,601}, + {"beauty",no_argument,NULL,602}, {0,0,0,0} }; @@ -378,6 +390,16 @@ printf("%s",VERSION "\n"); exit(0); + case 600: + AutoLoadMovie = optarg; + break; + case 601: + NESVideoSetVideoCmd(optarg); + LoggingEnabled = 1; + break; + case 602: + do_beauty_analysis = 1; + break; case 'g': autodetect=0; MastEx |= MX_GG; @@ -434,8 +456,17 @@ width=MastEx&MX_GG?160:256; height=MastEx&MX_GG?144:192; + xlef = MastEx&MX_GG ? 64 : 16; + ytop = MastEx&MX_GG ? 
24 : 0; - thescreen=SDL_SetVideoMode(width, height, 8, SDL_SWSURFACE|vidflags); + //vidflags |= SDL_SWSURFACE; + vidflags |= SDL_HWPALETTE; + thescreen=SDL_SetVideoMode(width, height, 8, vidflags); + if(!thescreen) + { + fprintf(stderr, "Couldn't open video: %s\n", SDL_GetError()); + return -1; + } MastInit(); MastLoadRom(argv[optind], &rom, &romlength); @@ -467,8 +498,12 @@ pMsndOut=NULL; } - MPyEmbed_Init(); - python = MPyEmbed_Available(); +// MPyEmbed_Init(); + python = 0; //MPyEmbed_Available(); + + if(AutoLoadMovie && *AutoLoadMovie) MvidStart(AutoLoadMovie, PLAYBACK_MODE, 0); + + if(LoggingEnabled) LoggingEnabled = 2; MastDrawDo=1; while(!done) @@ -479,6 +514,12 @@ scrlock(); MastFrame(); scrunlock(); + + NESVideoLoggingVideo(VideoData, + width, height, + framerate << 24, + 24); + Beauty_VIS(VideoData, width, height); #if 0 clock_gettime(CLOCK_REALTIME, &t1); @@ -486,20 +527,30 @@ clock_gettime(CLOCK_REALTIME, &t2); printf("postframe took %d ns\n", t2.tv_nsec-t1.tv_nsec); #else - MPyEmbed_CBPostFrame(); +// MPyEmbed_CBPostFrame(); #endif MastInput[0]&=~0x40; + + NESVideoLoggingAudio(pMsndOut, MsndRate, 16, aspec.channels, MsndLen); + if(sound) { SDL_LockAudio(); + + if(LoggingEnabled) audio_len = 0; + memcpy(audiobuf+audio_len,pMsndOut,MsndLen*aspec.channels*2); + audio_len+=MsndLen*aspec.channels*2; //printf("audio_len %d\n",audio_len); SDL_UnlockAudio(); } } frameadvance = 0; + + Beauty_INPUT(); + if (paused) { SDL_WaitEvent(&event); @@ -575,11 +626,14 @@ } if (!paused || frameadvance) { - if(sound) while(audio_len>aspec.samples*aspec.channels*2*4) usleep(5); + if(sound && !LoggingEnabled) + { + while(audio_len>aspec.samples*aspec.channels*2*4) usleep(5); + } } } if (python) { - MPyEmbed_Fini(); +// MPyEmbed_Fini(); } return 0; } @@ -587,6 +641,7 @@ void MdrawCall() { int i,yoff=0; + static unsigned char paldata[256][3]; if(Mdraw.Data[0]) printf("MdrawCall called, line %d, first pixel %d\n",Mdraw.Line,Mdraw.Data[0]); if(Mdraw.PalChange) { @@ -594,15 
+649,28 @@ #define p(x) Mdraw.Pal[x] for(i=0;i<0x100;i++) { - themap[i].r=(p(i)&7)<<5; - themap[i].g=(p(i)&56)<<2; - themap[i].b=(p(i)&448)>>1; + themap[i].r=paldata[i][2]=(p(i)&7)<<5; + themap[i].g=paldata[i][1]=(p(i)&56)<<2; + themap[i].b=paldata[i][0]=(p(i)&448)>>1; } SDL_SetColors(thescreen, themap, 0, 256); } if(MastEx&MX_GG) {i=64; yoff=24;} else {i=16; } if(Mdraw.Line-yoff<0 || Mdraw.Line-yoff>=height) return; + memcpy(thescreen->pixels+(Mdraw.Line-yoff)*thescreen->pitch,Mdraw.Data+i,width); + + if(LoggingEnabled || do_beauty_analysis) + { + int y = Mdraw.Line-yoff; + /*printf("y=%d, width=%d\n", y, width); + fflush(stdout);*/ + for(i=0; i +#include +#include +#include +#include +#include + +#include // mknod, unlink, write +#include +#include // S_IFIFO +#include // fcntl +#include // poll +#include // setenv +#include // strrchr +#include +#include + +#include + +#ifdef THREAD_SAFETY +# include +#endif + +/* Note: This module assumes everyone uses BGR16 as display depth */ + +//#define LOGO_LENGTH_HEADER (1.2) +//#define LOGO_LENGTH_OVERLAP (10.0-LOGO_LENGTH_HEADER) +//#define LOGO_LENGTH_HEADER (1.1) +#define LOGO_LENGTH_OVERLAP (6.3-LOGO_LENGTH_HEADER) +//#define LOGO_LENGTH_HEADER (1.4) +//#define LOGO_LENGTH_OVERLAP (0) +#define LOGO_LENGTH_HEADER (0) + +static std::string VIDEO_CMD = ""; +/* +-rawvideo on:fps=60:format=0x42475220:w=256:h=224:size=$[1024*224] +-audiofile "+AUDIO_FN+" +*/ +static std::string AUDIO_FN = "s.log"; + +static bool Terminate=false; +static unsigned videonumber = 0; + +#ifdef THREAD_SAFETY +static pthread_mutex_t APIlock = PTHREAD_MUTEX_INITIALIZER; +#endif + +static unsigned NonblockWrite(FILE* fp, const unsigned char*buf, unsigned length) +{ + Retry: + int result = write(fileno(fp), buf, length); + if(result == -1 && errno==EAGAIN) + { + return 0; + } + if(result == -1 && errno==EINTR) goto Retry; + if(result == -1) + { + perror("write"); + Terminate=true; + return 0; + } + return result; +} +static int 
WaitUntilOneIsWritable(FILE*f1, FILE*f2) +{ + struct pollfd po[2] = { {fileno(f1),POLLOUT,0}, {fileno(f2),POLLOUT,0} }; + poll(po, 2, -1); + return ((po[0].revents & POLLOUT) ? 1 : 0) + | ((po[1].revents & POLLOUT) ? 2 : 0); +} + +#define BGR32 0x42475220 // BGR32 fourcc +#define BGR24 0x42475218 // BGR24 fourcc +#define BGR16 0x42475210 // BGR16 fourcc +#define BGR15 0x4247520F // BGR15 fourcc +#define I420 0x30323449 // I420 fourcc + +static unsigned USE_FOURCC = BGR16; +static unsigned INPUT_BPP = 16; + +#define u32(n) (n)&255,((n)>>8)&255,((n)>>16)&255,((n)>>24)&255 +#define u16(n) (n)&255,((n)>>8)&255 +#define s4(s) s[0],s[1],s[2],s[3] + +static const unsigned FPS_SCALE = 0x1000000; + +static struct Construct +{ + Construct() + { + char Buf[4096]; + if(!getcwd(Buf,sizeof(Buf))) return; /* cwd unavailable: leave AUDIO_FN relative rather than reading an unfilled buffer */ + Buf[sizeof(Buf)-1]=0; + AUDIO_FN = Buf + std::string("/") + AUDIO_FN; + } +} Construct; + +class AVI +{ + FILE* vidfp; + FILE* audfp; + + bool KnowVideo; + unsigned vid_width; + unsigned vid_height; + unsigned vid_fps_scaled; + std::list > VideoBuffer; + unsigned VidBufSize; + + bool KnowAudio; + unsigned aud_rate; + unsigned aud_chans; + unsigned aud_bits; + std::list > AudioBuffer; + unsigned AudBufSize; + +public: + AVI() : + vidfp(NULL), + audfp(NULL), + KnowVideo(false), VidBufSize(0), + KnowAudio(false), AudBufSize(0) + { + } + ~AVI() + { + while(VidBufSize && AudBufSize && !Terminate) /* CheckFlushing() does nothing once Terminate is set, so stop instead of spinning forever */ + { + CheckFlushing(); + } + if(audfp) fclose(audfp); + if(vidfp) pclose(vidfp); + unlink(AUDIO_FN.c_str()); + } + + void Audio(unsigned r,unsigned b,unsigned c, + const unsigned char*d, unsigned nsamples) + { + if(Terminate) return; + if(!KnowAudio) + { + aud_rate = r; + aud_chans = c; + aud_bits = b; + KnowAudio = true; + } + CheckFlushing(); + + unsigned bytes = nsamples * aud_chans * (aud_bits / 8); + + unsigned wrote = 0; + if(KnowVideo && AudioBuffer.empty()) + { + //fprintf(stderr, "Writing %u of %s from %p to %p\t", bytes, "aud", (void*)d, (void*)audfp); + wrote = NonblockWrite(audfp, d, bytes); + 
//fprintf(stderr, "Wrote %u\n", wrote); + } + if(wrote < bytes) + { + unsigned remain = bytes-wrote; + //fprintf(stderr, "Buffering %u of %s (%p..%p)\n", remain, "aud", d+wrote, d+bytes); + AudioBuffer.push_back(std::vector(d+wrote, d+bytes)); + AudBufSize += remain; + } + CheckFlushing(); + } + void Video(unsigned w,unsigned h,unsigned f, const unsigned char*d) + { + if(Terminate) return; + if(!KnowVideo) + { + vid_width = w; + vid_height = h; + vid_fps_scaled = f; + KnowVideo = true; + } + CheckFlushing(); + + unsigned bpp = INPUT_BPP; if(bpp == 15) bpp = 16; + unsigned bytes = vid_width * vid_height * bpp / 8; + + //std::vector tmp(bytes, 'k'); + //d = &tmp[0]; + + unsigned wrote = 0; + if(KnowAudio && VideoBuffer.empty()) + { + CheckBegin(); + //fprintf(stderr, "Writing %u of %s from %p to %p\t", bytes, "vid", (void*)d, (void*)vidfp); + wrote = NonblockWrite(vidfp, d, bytes); + //fprintf(stderr, "Wrote %u\n", wrote); + } + + if(wrote < bytes) + { + unsigned remain = bytes-wrote; + //fprintf(stderr, "Buffering %u of %s (%p..%p)\n", remain, "vid", d+wrote, d+bytes); + + VideoBuffer.push_back(std::vector(d+wrote, d+bytes)); + VidBufSize += remain; + } + CheckFlushing(); + } + +private: + /* fp is passed as a reference because it may be NULL + * prior to calling, and this function changes it. 
*/ + template + void FlushBufferSome(BufType& List, unsigned& Size, FILE*& fp, const char* what) + { + what=what; + + Retry: + if(List.empty() || Terminate) return; + + typename BufType::iterator i = List.begin(); + std::vector& buf = *i; + + if(buf.empty()) + { + List.erase(i); + goto Retry; + } + + unsigned bytes = buf.size(); + + CheckBegin(); + //fprintf(stderr, "Writing %u of %s from %p to %p\t", bytes, what, (void*)&buf[0], (void*)fp); + + unsigned ate = NonblockWrite(fp, &buf[0], bytes); + + //fprintf(stderr, "Wrote %u\n", ate); + + buf.erase(buf.begin(), buf.begin()+ate); + + Size -= ate; + + if(buf.empty()) + { + List.erase(i); + } + } + + void CheckFlushing() + { + //AudioBuffer.clear(); + //VideoBuffer.clear(); + + if(KnowAudio && KnowVideo && !Terminate) + { + if(!AudioBuffer.empty() && !VideoBuffer.empty()) + { + do { + /* vidfp = &1, audfp = &2 */ + int attempt = WaitUntilOneIsWritable(vidfp, audfp); + + if(attempt <= 0) break; /* Some kind of error can cause this */ + + // Flush Video + if(attempt&1) FlushBufferSome(VideoBuffer, VidBufSize, vidfp, "vid"); + + // Flush Audio + if(attempt&2) FlushBufferSome(AudioBuffer, AudBufSize, audfp, "aud"); + } while (!AudioBuffer.empty() && !VideoBuffer.empty()); + } + else + { + FlushBufferSome(VideoBuffer, VidBufSize, vidfp, "vid"); + FlushBufferSome(AudioBuffer, AudBufSize, audfp, "aud"); + } + /* + fprintf(stderr, "Buffer Sizes: Audio %u(%u) video %u(%u)\n", + (unsigned)AudioBuffer.size(), AudBufSize, + (unsigned)VideoBuffer.size(), VidBufSize); + */ + } + } + std::string GetMEncoderRawvideoParam() const + { + char Buf[512]; + unsigned bpp = INPUT_BPP; if(bpp == 15) bpp = 16; + sprintf(Buf, "fps=%g:format=0x%04X:w=%u:h=%u:size=%u", + vid_fps_scaled / (double)FPS_SCALE, + USE_FOURCC, + vid_width, + vid_height, + vid_width*vid_height * bpp/8); + return Buf; + } + std::string GetMEncoderRawaudioParam() const + { + char Buf[512]; + sprintf(Buf, "channels=%u:rate=%u:samplesize=%u:bitrate=%u", + aud_chans, + 
aud_rate, + aud_bits/8, + aud_rate*aud_chans*(aud_bits/8) ); + return Buf; + } + std::string GetMEncoderCommand() const + { + std::string mandatory = "-audiofile " + AUDIO_FN + + " -audio-demuxer rawaudio" + + " -demuxer rawvideo" + + " -rawvideo " + GetMEncoderRawvideoParam() + + " -rawaudio " + GetMEncoderRawaudioParam() + ; + std::string cmd = VIDEO_CMD; + + std::string::size_type p = cmd.find("NESV""SETTINGS"); + if(p != cmd.npos) + cmd = cmd.replace(p, 4+8, mandatory); + else + fprintf(stderr, "Warning: NESVSETTINGS not found in videocmd\n"); + + char videonumstr[64]; + sprintf(videonumstr, "%u", videonumber); + + for(;;) + { + p = cmd.find("VIDEO""NUMBER"); + if(p == cmd.npos) break; + cmd = cmd.replace(p, 5+6, videonumstr); + } + + fprintf(stderr, "Launch: %s\n", cmd.c_str()); fflush(stderr); + + return cmd; + } + + void CheckBegin() + { + if(!audfp) + { + unlink(AUDIO_FN.c_str()); + mknod(AUDIO_FN.c_str(), S_IFIFO|0666, 0); + } + + if(!vidfp) + { + /* Note: popen does not accept b/t in mode param */ + setenv("LD_PRELOAD", "", 1); + vidfp = popen(GetMEncoderCommand().c_str(), "w"); + if(!vidfp) + { + perror("Launch failed"); + } + else + { + fcntl(fileno(vidfp), F_SETFL, O_WRONLY | O_NONBLOCK); + } + } + + if(!audfp) + { + Retry: + audfp = fopen(AUDIO_FN.c_str(), "wb"); + + if(!audfp) + { + perror(AUDIO_FN.c_str()); + if(errno == ESTALE) goto Retry; + } + else + { + fcntl(fileno(audfp), F_SETFL, O_WRONLY | O_NONBLOCK); + } + } + } +}; + +static AVI* AVI = 0; + +namespace LogoInfo +{ + unsigned width; + unsigned height; +} + +#include "quantize.hh" +#include "rgbtorgb.hh" + +extern "C" +{ + int LoggingEnabled = 0; /* 0=no, 1=yes, 2=recording! 
*/ + + const char* NESVideoGetVideoCmd() + { + return VIDEO_CMD.c_str(); + } + void NESVideoSetVideoCmd(const char *cmd) + { +#ifdef THREAD_SAFETY + struct ScopedLock + { ScopedLock() { + pthread_mutex_lock(&APIlock); + //fprintf(stderr, "audio start\n"); fflush(stderr); + } + ~ScopedLock() { + //fprintf(stderr, "audio end\n"); fflush(stderr); + pthread_mutex_unlock(&APIlock); } + } ScopedLock; +#endif + + VIDEO_CMD = cmd; + } + + static class AVI& GetAVIptr() + { + if(!AVI) + { + fprintf(stderr, "Starting new AVI (num %u)\n", videonumber); + AVI = new class AVI; + } + return *AVI; + } + + void NESVideoNextAVI() + { +#ifdef THREAD_SAFETY + struct ScopedLock + { ScopedLock() { + pthread_mutex_lock(&APIlock); + //fprintf(stderr, "audio start\n"); fflush(stderr); + } + ~ScopedLock() { + //fprintf(stderr, "audio end\n"); fflush(stderr); + pthread_mutex_unlock(&APIlock); } + } ScopedLock; +#endif + + if(AVI) + { + fprintf(stderr, "Closing AVI (next will be started)\n"); + delete AVI; + AVI = 0; + ++videonumber; + } + } + + static void Overlay32With32(unsigned char* target, const unsigned char* source, int alpha) + { + target[0] += ((int)(source[0] - target[0])) * alpha / 255; + target[1] += ((int)(source[1] - target[1])) * alpha / 255; + target[2] += ((int)(source[2] - target[2])) * alpha / 255; + } + + static void OverlayLogoFrom(const char* fn, std::vector& data) + { + FILE*fp = fopen(fn, "rb"); + if(!fp) perror(fn); + if(!fp) return; /* Silently ignore missing frames */ + + gdImagePtr im = gdImageCreateFromPng(fp); + if(!gdImageTrueColor(im)) + { + fprintf(stderr, "'%s': Only true color images are supported\n", fn); + goto CloseIm; + } + {/*scope begin*/ + + unsigned new_width = gdImageSX(im); + unsigned new_height= gdImageSY(im); + + if(new_width != LogoInfo::width + || new_height != LogoInfo::height) + { + if(new_height < LogoInfo::height || new_height > LogoInfo::height+20) + fprintf(stderr, "'%s': ERROR, expected %dx%d, got %dx%d\n", fn, + LogoInfo::width, 
LogoInfo::height, + new_width, new_height); + } + + for(unsigned y=0; y > files; + if(files.empty()) /* Cache the list of logo files. */ + { + static const char GlobPat[] = "logo_*_*_f*.png"; + glob_t globdata; + globdata.gl_offs = 0; + fprintf(stderr, "Loading list of usable logo animation files in %s...\n", avdir.c_str()); + int globres = glob( (avdir + GlobPat).c_str(), GLOB_NOSORT, NULL, &globdata); + if(globres == 0) + { + for(size_t n=0; n >::const_iterator + i = files.find(frameno); + if(i != files.end()) + { + std::string best; + int bestdist = -1; + + const std::vector& fnames = i->second; + for(size_t b=fnames.size(), a=0; a= 0) want = avdir + best; + } + } + return want; + } + + static const std::vector NVConvert24To16Frame + (const std::vector& logodata) + { + std::vector result(LogoInfo::width * LogoInfo::height * 2); + Convert24To16Frame(&logodata[0], &result[0], LogoInfo::width * LogoInfo::height, LogoInfo::width); + return result; + } + static const std::vector NVConvert24To15Frame + (const std::vector& logodata) + { + std::vector result(LogoInfo::width * LogoInfo::height * 2); + Convert24To15Frame(&logodata[0], &result[0], LogoInfo::width * LogoInfo::height, LogoInfo::width); + return result; + } + + static const std::vector NVConvert24To_I420Frame + (const std::vector& logodata) + { + std::vector result(LogoInfo::width * LogoInfo::height * 3 / 2); + Convert24To_I420Frame(&logodata[0], &result[0], LogoInfo::width * LogoInfo::height, LogoInfo::width); + return result; + } + + static const std::vector NVConvert16To24Frame + (const void* data, unsigned npixels) + { + std::vector logodata(npixels*3); /* filled with black. */ + Convert16To24Frame(data, &logodata[0], npixels); + return logodata; + } + + static const std::vector NVConvert15To24Frame + (const void* data, unsigned npixels) + { + std::vector logodata(npixels*3); /* filled with black. 
*/ + Convert15To24Frame(data, &logodata[0], npixels); + return logodata; + } + + static const std::vector NVConvert_I420To24Frame + (const void* data, unsigned npixels) + { + std::vector logodata(npixels*3); /* filled with black. */ + Convert_I420To24Frame(data, &logodata[0], npixels, LogoInfo::width); + return logodata; + } + + static void SubstituteWithBlackIfNeeded(const void*& data) + { + /* If the first frames of the animation consist of a + * single color (such as gray for NES), replace them + * with black to avoid ugly backgrounds on logo animations + */ + + static bool Deviate = false; + static short* Replacement = 0; + static unsigned wid=0, hei=0; + if(Deviate) + { + if(Replacement) { delete[] Replacement; Replacement=0; } + return; + } + + unsigned dim = LogoInfo::width * LogoInfo::height; + const short* p = (const short*)data; + for(unsigned a=0; a VideoBuf; + VideoBuf.resize(width*height * 3); + + Convert32To24Frame(data, &VideoBuf[0], width*height); + data = (void*)&VideoBuf[0]; + } + + if(bpp) INPUT_BPP = bpp; + + switch(INPUT_BPP) + { + case 32: USE_FOURCC = BGR32; break; + case 24: USE_FOURCC = BGR24; break; + case 16: USE_FOURCC = BGR16; break; + case 15: USE_FOURCC = BGR15; break; + case 12: USE_FOURCC = I420; break; + } + //USE_FOURCC = BGR24; // FIXME TEMPORARY + + const int LogoFramesHeader = (int)( (LOGO_LENGTH_HEADER * fps_scaled) / (1 << 24) ); + const int LogoFramesOverlap = (int)( (LOGO_LENGTH_OVERLAP * fps_scaled) / (1 << 24) ); + + LogoInfo::width = width; + LogoInfo::height = height; + + if(INPUT_BPP == 16 || INPUT_BPP == 15) + { + SubstituteWithBlackIfNeeded(data); + } + else if(INPUT_BPP != 24 && INPUT_BPP != 12) + { + fprintf(stderr, "NESVIDEOS_PIECE only supports 16 and 24 bpp, you gave %u bpp\n", + bpp); + return; + } + + static bool LogoHeaderPartSent = false; + if(!LogoHeaderPartSent) + { + /* Send animation frames that do not involve source video? 
*/ + LogoHeaderPartSent=true; + + if(LogoFramesHeader > 0) + { + for(int frame = 0; frame < LogoFramesHeader; ++frame) + { + std::vector logodata(width*height*3); /* filled with black. */ + + std::string fn = GetLogoFileName(frame); + /*fprintf(stderr, "wid=%d(%d), hei=%d(%d),fn=%s\n", + width, LogoInfo::width, + height, LogoInfo::height, + fn.c_str());*/ + OverlayLogoFrom(fn.c_str(), logodata); + + //INPUT_BPP = 24; USE_FOURCC = BGR24; // FIXME TEMPORARY + + if(INPUT_BPP == 16) + { + std::vector result = NVConvert24To16Frame(logodata); + GetAVIptr().Video(width,height,fps_scaled, &result[0]); + } + else if(INPUT_BPP == 15) + { + std::vector result = NVConvert24To15Frame(logodata); + GetAVIptr().Video(width,height,fps_scaled, &result[0]); + } + else if(INPUT_BPP == 12) + { + std::vector result = NVConvert24To_I420Frame(logodata); + GetAVIptr().Video(width,height,fps_scaled, &result[0]); + } + else + { + GetAVIptr().Video(width,height,fps_scaled, &logodata[0]); + } + } + } + } + + static int LogoOverlapSent = 0; + if(LogoOverlapSent < LogoFramesOverlap) + { + /* Send animation frames that mix source and animation? */ + + std::string fn = GetLogoFileName(LogoOverlapSent + LogoFramesHeader); + /* + fprintf(stderr, "wid=%d(%d), hei=%d(%d),fn=%s\n", + width, LogoInfo::width, + height, LogoInfo::height, + fn.c_str());*/ + + std::vector logodata; + if(INPUT_BPP == 16) + { + logodata = NVConvert16To24Frame(data, width*height); + } + else if(INPUT_BPP == 15) + { + logodata = NVConvert15To24Frame(data, width*height); + } + else if(INPUT_BPP == 12) + { + logodata = NVConvert_I420To24Frame(data, width*height); + } + else + { + logodata.resize(width*height*3); /* filled with black. 
*/ + memcpy(&logodata[0], data, width*height*3); + } + + OverlayLogoFrom(fn.c_str(), logodata); + +// INPUT_BPP = 24; USE_FOURCC = BGR24; // FIXME TEMPORARY + + if(INPUT_BPP == 16) + { + std::vector result = NVConvert24To16Frame(logodata); + GetAVIptr().Video(width,height,fps_scaled, &result[0]); + } + else if(INPUT_BPP == 15) + { + std::vector result = NVConvert24To15Frame(logodata); + GetAVIptr().Video(width,height,fps_scaled, &result[0]); + } + else if(INPUT_BPP == 12) + { + std::vector result = NVConvert24To_I420Frame(logodata); + GetAVIptr().Video(width,height,fps_scaled, &result[0]); + } + else + { + GetAVIptr().Video(width,height,fps_scaled, &logodata[0]); + } + + ++LogoOverlapSent; + return; + } + + GetAVIptr().Video(width,height,fps_scaled, (const unsigned char*) data); + } + + void NESVideoLoggingAudio + (const void*data, + unsigned rate, unsigned bits, unsigned chans, + unsigned nsamples) + { + if(LoggingEnabled < 2) return; + +#ifdef THREAD_SAFETY + struct ScopedLock + { ScopedLock() { + pthread_mutex_lock(&APIlock); + //fprintf(stderr, "audio start\n"); fflush(stderr); + } + ~ScopedLock() { + //fprintf(stderr, "audio end\n"); fflush(stderr); + pthread_mutex_unlock(&APIlock); } + } ScopedLock; +#endif + + static bool LogoHeaderPartSent = false; + if(!LogoHeaderPartSent && LOGO_LENGTH_HEADER > 0) + { + LogoHeaderPartSent=true; + + double HdrLength = LOGO_LENGTH_HEADER; // N64 workaround + + const long n = (long)(rate * HdrLength)/* + - (rate * 0.11)*/; + + if(n > 0) { + unsigned bytes = n*chans*(bits/8); + unsigned char* buf = (unsigned char*)malloc(bytes); + if(buf) + { + memset(buf,0,bytes); + GetAVIptr().Audio(rate,bits,chans, buf, n); + free(buf); + } } + } + + /* + fprintf(stderr, "Writing %u samples (%u bits, %u chans, %u rate)\n", + nsamples, bits, chans, rate);*/ + + /* + static FILE*fp = fopen("audiodump.wav", "wb"); + fwrite(data, 1, nsamples*(bits/8)*chans, fp); + fflush(fp);*/ + + GetAVIptr().Audio(rate,bits,chans, (const unsigned char*) 
data, nsamples); + } +} /* extern "C" */ diff -NaHudr dega-1.14/sdl/nesvideos-piece.hh changed/sdl/nesvideos-piece.hh --- dega-1.14/sdl/nesvideos-piece.hh 1970-01-01 02:00:00.000000000 +0200 +++ changed/sdl/nesvideos-piece.hh 2008-02-03 16:57:50.354469000 +0200 @@ -0,0 +1,46 @@ +#ifndef NESVPIECEhh +#define NESVPIECEhh + +#define NESVIDEOS_LOGGING 1 + +#ifdef __cplusplus +extern "C" { +#endif + +/* Is video logging enabled? 0=no, 1=yes, 2=active. Default value: 0 */ +extern int LoggingEnabled; + +/* Get and set the video recording command (shell command) */ +extern const char* NESVideoGetVideoCmd(void); +extern void NESVideoSetVideoCmd(const char *cmd); + +/* Save 1 frame of video. (Assumed to be 16-bit RGB) */ +/* FPS is scaled by 24 bits (*0x1000000) */ +/* Does not do anything if LoggingEnabled<2. */ +extern void NESVideoLoggingVideo + (const void*data, unsigned width, unsigned height, + unsigned fps_scaled, + unsigned bpp); + +/* Save N bytes of audio. bytes_per_second is required on the first call. */ +/* Does not do anything if LoggingEnabled<2. */ +/* The interval of calling this function is not important, as long as all the audio + * data is eventually written without too big delay (5 seconds is too big) + * This function may be called multiple times per video frame, or once per a few video + * frames, or anything in between. Just that all audio data must be written exactly once, + * and in order. */ +extern void NESVideoLoggingAudio + (const void*data, + unsigned rate, unsigned bits, unsigned chans, + unsigned nsamples); +/* nsamples*chans*(bits/8) = bytes in *data. 
*/ + +/* Requests current AVI to be closed and new be started */ +/* Use when encoding parameters have changed */ +extern void NESVideoNextAVI(); + +#ifdef __cplusplus +} +#endif + +#endif diff -NaHudr dega-1.14/sdl/quantize.hh changed/sdl/quantize.hh --- dega-1.14/sdl/quantize.hh 1970-01-01 02:00:00.000000000 +0200 +++ changed/sdl/quantize.hh 2008-02-06 22:05:09.470769000 +0200 @@ -0,0 +1,184 @@ +/* + Ordered dithering methods provided for: + 8x8 (Quantize8x8) + 4x4 (Quantize4x4) + 3x3 (Quantize3x3) + 4x2 (Quantize4x2) + 3x2 (Quantize3x2) + 2x2 (Quantize2x2) + The functions are: + + template + int QuantizeFunc(size_t quant_pos, double value) + + - Quantizes value, assumed to be in range 0..in_max, to range 0..m + - quant_pos tells the coordinate into the dithering matrix + + template + int QuantizeFunc(size_t quant_pos, unsigned value) + + - Quantizes value, assumed to be in range 0..in_max, to range 0..m + - quant_pos tells the coordinate into the dithering matrix + + Copyright (C) 1992,2008 Joel Yliluoma (http://iki.fi/bisqwit/) +*/ + +#define OrderedDitherDecl(n) \ + static const double flts[n]; \ + static const int ints[n]; \ + enum { mul = n+1, \ + maxin = in_max, \ + even = !(maxin % mul), \ + intmul = even ? 1 : mul }; + +#define d(n) (n)/double(mul) - 0.5 +#define i(n) even ? 
(n*in_max/mul - (int)in_max/2) \ + : (n*in_max - (int)mul*in_max/2) + +template +struct QuantizeNoDither +{ + int res; + template + QuantizeNoDither(IntType v) : res(v * m / in_max) { } + operator int() const { return res; } +}; + +template +struct QuantizeFuncBase: private Base +{ + int res; + + QuantizeFuncBase(size_t quant_pos, double v) : res(0) + { + if(v > 0.0) + { + const double dither_threshold = Base::flts[quant_pos]; + res = (int)(v * (m / double(Base::maxin)) + dither_threshold); + if(res > m) res = m; + } + } + + QuantizeFuncBase(size_t quant_pos, unsigned char v) : res(v) + { + if(m == Base::maxin) return; + if(m < Base::maxin) + { + // With dithering + const int dither_threshold = Base::ints[quant_pos]; + const int intmul = Base::intmul; + res = (res * (m * intmul) + dither_threshold) / (Base::maxin * intmul); + } + else + { + // Without dithering + res = QuantizeNoDither (res); + } + } +}; + +#define QuantizeFuncDecl(name, base) \ + template \ + struct name: private QuantizeFuncBase > \ + { \ + typedef QuantizeFuncBase > Base; \ + template name(A a, B b) : Base(a, b) { } \ + operator int() const { return Base::res; } \ + } + +/******* Quantizing with 8x8 ordered dithering ********/ +template struct OrderedDither_8x8 { OrderedDitherDecl(8*8) }; + template + const double OrderedDither_8x8::flts[] /* A table for 8x8 ordered dithering */ + = { d(1 ), d(49), d(13), d(61), d( 4), d(52), d(16), d(64), + d(33), d(17), d(45), d(29), d(36), d(20), d(48), d(32), + d(9 ), d(57), d( 5), d(53), d(12), d(60), d( 8), d(56), + d(41), d(25), d(37), d(21), d(44), d(28), d(40), d(24), + d(3 ), d(51), d(15), d(63), d( 2), d(50), d(14), d(62), + d(35), d(19), d(47), d(31), d(34), d(18), d(46), d(30), + d(11), d(59), d( 7), d(55), d(10), d(58), d( 6), d(54), + d(43), d(27), d(39), d(23), d(42), d(26), d(38), d(22) }; + template + const int OrderedDither_8x8::ints[] + = { i(1 ), i(49), i(13), i(61), i( 4), i(52), i(16), i(64), + i(33), i(17), i(45), i(29), i(36), i(20), 
i(48), i(32), + i(9 ), i(57), i( 5), i(53), i(12), i(60), i( 8), i(56), + i(41), i(25), i(37), i(21), i(44), i(28), i(40), i(24), + i(3 ), i(51), i(15), i(63), i( 2), i(50), i(14), i(62), + i(35), i(19), i(47), i(31), i(34), i(18), i(46), i(30), + i(11), i(59), i( 7), i(55), i(10), i(58), i( 6), i(54), + i(43), i(27), i(39), i(23), i(42), i(26), i(38), i(22) }; +QuantizeFuncDecl(Quantize8x8, OrderedDither_8x8); + + +/******* Quantizing with 4x4 ordered dithering ********/ +template struct OrderedDither_4x4 { OrderedDitherDecl(4*4) }; + template + const double OrderedDither_4x4::flts[] /* A table for 4x4 ordered dithering */ + = { d( 1), d( 9), d( 3), d(11), + d(13), d( 5), d(15), d( 7), + d( 4), d(12), d( 2), d(10), + d(16), d( 8), d(14), d( 6) }; + template + const int OrderedDither_4x4::ints[] + = { i( 1), i( 9), i( 3), i(11), + i(13), i( 5), i(15), i( 7), + i( 4), i(12), i( 2), i(10), + i(16), i( 8), i(14), i( 6) }; +QuantizeFuncDecl(Quantize4x4, OrderedDither_4x4); + +/******* Quantizing with 3x3 ordered dithering ********/ +template struct OrderedDither_3x3 { OrderedDitherDecl(3*3) }; + template + const double OrderedDither_3x3::flts[] /* A table for 3x3 ordered dithering */ + = { d(1), d(7), d(3), + d(6), d(4), d(9), + d(8), d(2), d(5) }; + template + const int OrderedDither_3x3::ints[] + = { i(1), i(7), i(3), + i(6), i(4), i(9), + i(8), i(2), i(5) }; +QuantizeFuncDecl(Quantize3x3, OrderedDither_3x3); + +/******* Quantizing with 4x2 ordered dithering ********/ +template struct OrderedDither_4x2 { OrderedDitherDecl(4*2) }; + template + const double OrderedDither_4x2::flts[] /* A table for 4x2 ordered dithering */ + = { d(1), d(5), d(2), d(6), + d(7), d(3), d(8), d(4) }; + template + const int OrderedDither_4x2::ints[] + = { i(1), i(5), i(2), i(6), + i(7), i(3), i(8), i(4) }; +QuantizeFuncDecl(Quantize4x2, OrderedDither_4x2); + +/******* Quantizing with 3x2 ordered dithering ********/ +template struct OrderedDither_3x2 { OrderedDitherDecl(3*2) }; + template + 
const double OrderedDither_3x2::flts[] /* A table for 3x2 ordered dithering */ + = { d(1), d(5), d(3), + d(4), d(2), d(6) }; + template + const int OrderedDither_3x2::ints[] + = { i(1), i(5), i(3), + i(4), i(2), i(6) }; +QuantizeFuncDecl(Quantize3x2, OrderedDither_3x2); + +/******* Quantizing with 2x2 ordered dithering ********/ +template struct OrderedDither_2x2 { OrderedDitherDecl(2*2) }; + template + const double OrderedDither_2x2::flts[] /* A table for 2x2 ordered dithering */ + = { d(1), d(4), + d(3), d(2) }; + template + const int OrderedDither_2x2::ints[] + = { i(1), i(4), + i(3), i(2) }; +QuantizeFuncDecl(Quantize2x2, OrderedDither_2x2); + + +#undef OrderedDitherDecl +#undef QuantizeFuncDecl +#undef i +#undef d diff -NaHudr dega-1.14/sdl/rgbtorgb.cc changed/sdl/rgbtorgb.cc --- dega-1.14/sdl/rgbtorgb.cc 1970-01-01 02:00:00.000000000 +0200 +++ changed/sdl/rgbtorgb.cc 2008-02-18 23:02:53.581518000 +0200 @@ -0,0 +1,786 @@ +#include +#include // for size_t +#include +#include + +typedef uint_least64_t uint64_t; + +#include "quantize.hh" +#include "rgbtorgb.hh" +#include "simd.hh" + +static const uint64_t zero64 __attribute__((aligned(8))) = 0ULL; +static const uint64_t mask24l __attribute__((aligned(8))) = 0x0000000000FFFFFFULL; +static const uint64_t mask24h __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL; +static const uint64_t mask24hh __attribute__((aligned(8))) = 0xffff000000000000ULL; +static const uint64_t mask24hhh __attribute__((aligned(8))) = 0xffffffff00000000ULL; +static const uint64_t mask24hhhh __attribute__((aligned(8))) = 0xffffffffffff0000ULL; + +static const uint64_t mask64h __attribute__((aligned(8))) = 0xFF00FF00FF00FF00ULL; +static const uint64_t mask64l __attribute__((aligned(8))) = 0x00FF00FF00FF00FFULL; +static const uint64_t mask64hw __attribute__((aligned(8))) = 0xFFFF0000FFFF0000ULL; +static const uint64_t mask64lw __attribute__((aligned(8))) = 0x0000FFFF0000FFFFULL; +static const uint64_t mask64hd __attribute__((aligned(8))) = 
0xFFFFFFFF00000000ULL; +static const uint64_t mask64ld __attribute__((aligned(8))) = 0x00000000FFFFFFFFULL; + +#define RGB2YUV_SHIFT 16 +/* Note: With shift 8, it only uses U,V range 18..239. + * With just 9 bits, it would use whole 16..240 as it should. + */ +#if 1 + +static const int RY = ((int)(( 65.738/256.0)*(1< +static inline void Convert32To24_32bytes(c64 w0, c64 w1, c64 w2, c64 w3, unsigned char* dest) +{ + c64 r0 = (w0 & mask24l) | ((w0 >> 8) & mask24h); /* bbbaaa */ + c64 r1 = (w1 & mask24l) | ((w1 >> 8) & mask24h); /* dddccc */ + c64 r2 = (w2 & mask24l) | ((w2 >> 8) & mask24h); /* fffeee */ + c64 r3 = (w3 & mask24l) | ((w3 >> 8) & mask24h); /* hhhggg */ + + /* ccbbbaaa */ + ((r0 ) | ((r1 << 48) & mask24hh)).Put(dest+0); + /* feeedddc */ + ((r1 >> 16) | ((r2 << 32) & mask24hhh)).Put(dest+8); + /* hhhgggff */ + ((r2 >> 32) | ((r3 << 16) & mask24hhhh)).Put(dest+16); +} + +#if defined(__x86_64) || defined(USE_MMX) +static void Convert32To24_32bytes(const unsigned char* src, + unsigned char* dest) +{ + c64 w0; w0.Get(src+0); + c64 w1; w1.Get(src+8); + c64 w2; w2.Get(src+16); + c64 w3; w3.Get(src+24); + Convert32To24_32bytes(w0,w1,w2,w3, dest); +} +#endif + +void Convert32To24Frame(const void* data, unsigned char* dest, unsigned npixels) +{ + const unsigned char* src = (const unsigned char*)data; + + #if defined(__x86_64) || defined(USE_MMX) + while(npixels >= 8) + { + Convert32To24_32bytes(src, dest); + src += 4*8; + dest += 3*8; + npixels -= 8; + } + #ifdef USE_MMX + _mm_empty(); + #endif + #endif + + for(unsigned pos=0; pos +struct Bits16const +{ + static const uint64_t value; +}; +template +const uint64_t Bits16const::value = + (( ((uint64_t)(unsigned short) basevalue_lo) << 0) + | ( ((uint64_t)(unsigned short) basevalue_hi) << 16) + | ( ((uint64_t)(unsigned short) basevalue_lo) << 32) + | ( ((uint64_t)(unsigned short) basevalue_hi) << 48)); + +template +struct Bits32const +{ + static const uint64_t value; +}; +template +const uint64_t Bits32const::value = 
+ (( ((uint64_t)(unsigned int) basevalue_lo) << 0) + | ( ((uint64_t)(unsigned int) basevalue_hi) << 32)); + +template +struct Bits8const +{ + static const uint64_t value = + ((basevalue_lo << 0) + | (basevalue_hi << 8) + | (basevalue_lo << 16) + | (basevalue_hi << 24) + | (basevalue_lo << 32) + | (basevalue_hi << 40) + | (basevalue_lo << 48) + | (basevalue_hi << 56)); +}; + + +template +struct MaskBconst +{ + static const uint64_t basevalue_lo = (1 << lowbitcount) - 1; + static const uint64_t basevalue_hi = (1 << highbitcount) - 1; + static const uint64_t value = Bits8const::value << leftshift; +}; + +template +struct Convert_2byte_consts +{ + static const uint64_t mask_lo;// = MaskBconst::value; + static const uint64_t mask_hi;// = MaskBconst::value; + static const uint64_t mask_frac;// = MaskBconst<8-bits,8-bits, 0>::value; +}; +template +const uint64_t Convert_2byte_consts::mask_lo = MaskBconst::value; +template +const uint64_t Convert_2byte_consts::mask_hi = MaskBconst::value; +template +const uint64_t Convert_2byte_consts::mask_frac = MaskBconst<8-bits, 8-bits, 0>::value; + +template +struct Convert_2byte_helper +{ + c64 lo, hi; + + Convert_2byte_helper(c64 p4a, c64 p4b) + { + const uint64_t& mask_lo = Convert_2byte_consts::mask_lo; + const uint64_t& mask_hi = Convert_2byte_consts::mask_hi; + const uint64_t& mask_frac = Convert_2byte_consts::mask_frac; + + /* STEP 1: SEPARATE THE PIXELS INTO RED, GREEN AND BLUE COMPONENTS */ + + /* 000BBBBB 000bbbbb 000BBBBB 000bbbbb 000BBBBB 000bbbbb 000BBBBB 000bbbbb */ + c64 s5 = ((p4a >> offs) & mask_lo) | ((p4b << (8-offs)) & mask_hi); + + /* STEP 2: SCALE THE COLOR COMPONENTS TO 256 RANGE */ + + /* BBBBB000 bbbbb000 BBBBB000 bbbbb000 BBBBB000 bbbbb000 BBBBB000 bbbbb000 */ + /* 00000BBB 00000bbb 00000BBB 00000bbb 00000BBB 00000bbb 00000BBB 00000bbb */ + c64 v8 = (s5 << (8-bits)) | ((s5 >> (bits-(8-bits))) & mask_frac); + /* v8: + * + * BBBBBBBB bbbbbbbb BBBBBBBB bbbbbbbb BBBBBBBB bbbbbbbb BBBBBBBB bbbbbbbb * + */ + + /* 
STEP 3: DEINTERLACE THE PIXELS */ + lo = (v8 ) & mask64l; + hi = (v8 >> 8) & mask64l; + } +}; + +/* +template +static void Convert_2byte_to_24Common(const unsigned char* src, unsigned char* dest) + __attribute((noinline)); +*/ +template +static void Convert_2byte_to_24or32Common(const unsigned char* src, unsigned char* dest) +{ + c64 p4a; p4a.Get(src+0); // four pixels + c64 p4b; p4b.Get(src+8); // another four pixels + + /* in: In both registers: */ + + Convert_2byte_helper r(p4a,p4b); + Convert_2byte_helper b(p4a,p4b); + Convert_2byte_helper g(p4a,p4b); + + /* STEP 4: CONVERT PIXELS INTO RGB32 */ + + /* Now we have: + * b.lo = 0j0g0d0a + * g.lo = 0k0h0e0b + * r.lo = 0l0i0f0c + * b.hi = 0J0G0D0A + * g.hi = 0K0H0E0B + * r.hi = 0L0I0F0C + * We want: + * w1 = 0fed0cba + * w2 = 0lkj0ihg + * w3 = 0FED0CBA + * w4 = 0LKJ0IHG + */ + +#if 0 && defined(__MMX__) /* FIXME why is this 0&&? */ + // punpcklbw 0k0h0e0b, 0j0g0d0a -> 00ed00ba + // punpcklwd 0l0i0f0c, ________ -> 0f__0c__ + c64 w1 = r.lo.unpacklwd(zero64) | g.lo.unpacklbw(b.lo); // pix 0,1 + // punpckhbw 0k0h0e0b, 0j0g0d0a -> 00kj00hg + // punpckhwd 0l0i0f0c, ________ -> 0l__0i__ + c64 w2 = r.lo.unpackhwd(zero64) | g.lo.unpackhbw(b.lo); // pix 2,3 + + c64 w3 = r.hi.unpacklwd(zero64) | g.hi.unpacklbw(b.hi); // pix 4,5 + c64 w4 = r.hi.unpackhwd(zero64) | g.hi.unpackhbw(b.hi); // pix 6,7 + #ifndef USE_MMX + _mm_empty(); + #endif +#else + /* With 64-bit registers, this code is greatly simpler than + * the emulation of unpack opcodes. However, when the + * unpack opcodes is available, using them is shorter. + * Which way is faster? 
FIXME: Find out + */ + + // mask64lw: 00**00** + // mask64hw: **00**00 + // b.lo & mask64lw: 000g000a + // g.lo & mask64lw: 000h000b + // r.lo & mask64lw: 000i000c + // b.lo & mask64hw: 0j000d00 + // g.lo & mask64hw: 0k000e00 + // r.lo & mask64hw: 0l000f00 + + c64 tlo1 = ((b.lo & mask64lw) ) | ((g.lo & mask64lw) << 8) | ((r.lo & mask64lw) << 16); + c64 tlo2 = ((b.lo & mask64hw) >>16) | ((g.lo & mask64hw) >> 8) | ((r.lo & mask64hw) ); + + c64 thi1 = ((b.hi & mask64lw) ) | ((g.hi & mask64lw) << 8) | ((r.hi & mask64lw) << 16); + c64 thi2 = ((b.hi & mask64hw) >>16) | ((g.hi & mask64hw) >> 8) | ((r.hi & mask64hw) ); + /* + * tlo1 = 0ihg0cba + * tlo2 = 0lkj0fed + * thi1 = 0IHG0CBA + * thi2 = 0LKJ0FED + * mask64ld = 0000**** + * mask64hd = ****0000 + */ + + c64 w1 = (tlo1 & mask64ld) | ((tlo2 & mask64ld) << 32); // 00000cba | 00000fed = 0fed0bca + c64 w2 = (tlo2 & mask64hd) | ((tlo1 & mask64hd) >> 32); // 0lkj0000 | 0ihg0000 = 0lkj0ihg + + c64 w3 = (thi1 & mask64ld) | ((thi2 & mask64ld) << 32); + c64 w4 = (thi2 & mask64hd) | ((thi1 & mask64hd) >> 32); +#endif + + if(rgb24) + { + /* STEP 5A: CONVERT PIXELS INTO RGB24 */ + Convert32To24_32bytes(w1,w2,w3,w4, dest); + } + else + { + /* STEP 5B: STORE RGB32 */ + w1.Put(dest+0); + w2.Put(dest+8); + w3.Put(dest+16); + w4.Put(dest+24); + } + + /* + punpcklbw ____ABCD, ____abcd = AaBbCcDd + punpcklwd ____ABCD, ____abcd = ABabCDcd + punpckldq ____ABCD, ____abcd = ABCDabcd + + punpckhbw ABCD____, abcd____ = AaBbCcDd + punpckhwd ABCD____, abcd____ = ABabCDcd + punpckhdq ABCD____, abcd____ = ABCDabcd + */ +} + +void Convert15To24Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue) +{ + const unsigned char* src = (const unsigned char*)data; + + if(swap_red_blue) + for(; npixels >= 8; src += 8*2, dest += 8*3, npixels -= 8) + Convert_2byte_to_24or32Common<0,5, 5,5, 10,5, true> (src, dest); + else + for(; npixels >= 8; src += 8*2, dest += 8*3, npixels -= 8) + Convert_2byte_to_24or32Common<10,5, 5,5, 0,5, 
true> (src, dest); + + #ifdef USE_MMX + _mm_empty(); + #endif + for(unsigned a=0; a= 8; src += 8*2, dest += 8*3, npixels -= 8) + Convert_2byte_to_24or32Common<0,5, 5,6, 11,5, true> (src, dest); + else + for(; npixels >= 8; src += 8*2, dest += 8*3, npixels -= 8) + Convert_2byte_to_24or32Common<11,5, 5,6, 0,5, true> (src, dest); + + #ifdef USE_MMX + _mm_empty(); + #endif + for(unsigned a=0; a= 8; src += 8*2, dest += 8*4, npixels -= 8) + Convert_2byte_to_24or32Common<0,5, 5,5, 10,5, false> (src, dest); + else + for(; npixels >= 8; src += 8*2, dest += 8*4, npixels -= 8) + Convert_2byte_to_24or32Common<10,5, 5,5, 0,5, false> (src, dest); + + #ifdef USE_MMX + _mm_empty(); + #endif + for(unsigned a=0; a= 8; src += 8*2, dest += 8*4, npixels -= 8) + Convert_2byte_to_24or32Common<0,5, 5,6, 11,5, false> (src, dest); + else + for(; npixels >= 8; src += 8*2, dest += 8*4, npixels -= 8) + Convert_2byte_to_24or32Common<11,5, 5,6, 0,5, false> (src, dest); + + #ifdef USE_MMX + _mm_empty(); + #endif + for(unsigned a=0; a(o16, rgbdata[2]) << 0) + | (Quantize4x4<63>(o16, rgbdata[1]) << 5) + | (Quantize4x4<31>(o16, rgbdata[0]) << 11); +} +static inline unsigned Build15(unsigned x,unsigned y, const unsigned char* rgbdata) +{ + unsigned o16 = (x + 4*y) % 16; + return (Quantize4x4<31>(o16, rgbdata[2]) << 0) + | (Quantize4x4<31>(o16, rgbdata[1]) << 5) + | (Quantize4x4<31>(o16, rgbdata[0]) << 10); +} + +void Convert24To16Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width) +{ + const unsigned char* logodata = (const unsigned char*) data; + unsigned short* result = (unsigned short*) dest; + unsigned x=0,y=0; + for(unsigned pos=0; pos= width) { x=0; ++y; } + } +} + +void Convert24To15Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width) +{ + const unsigned char* logodata = (const unsigned char*) data; + unsigned short* result = (unsigned short*) dest; + unsigned x=0,y=0; + for(unsigned pos=0; pos= width) { x=0; ++y; } + } +} + +template +void 
Convert_4byte_To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width) +{ + const unsigned char* src = (const unsigned char*) data; + unsigned height = npixels / width; + unsigned pos = 0; + unsigned ypos = 0; + unsigned vpos = npixels; + unsigned upos = vpos + npixels / 4; + unsigned stride = width*PixStride; + + /* This function is based on code from x264 svn version 711 */ + /* TODO: Apply MMX optimization */ + + for(unsigned y=0; y> RGB2YUV_SHIFT); // y + } + + dest[upos++] = (U_ADD + ((RU * c[0] + GU * c[1] + BU * c[2]) >> (RGB2YUV_SHIFT+2)) ); + dest[vpos++] = (V_ADD + ((RV * c[0] + GV * c[1] + BV * c[2]) >> (RGB2YUV_SHIFT+2)) ); + + ypos += 2; + } + pos += stride; + ypos += width; + } +} + +template +void Convert_2byte_To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width) +{ + const unsigned PixStride = 2; + const unsigned char* src = (const unsigned char*) data; + unsigned height = npixels / width; + unsigned pos = 0; + unsigned ypos = 0; + unsigned vpos = npixels; + unsigned upos = vpos + npixels / 4; + unsigned stride = width*PixStride; + + /* This function is based on code from x264 svn version 711 */ + /* TODO: Apply MMX optimization */ + + for(unsigned y=0; y + (src+pos, Rgb2byteBuf[0][0]); + + Convert_2byte_to_24or32Common + + (src+pos+stride, Rgb2byteBuf[1][0]); + + pos += 16; + + for(int x8 = 0; x8 < 8; x8 += 2) + { + int c[3]; + /* TODO: Some faster means than using pointers */ + unsigned char* rgb[4] = + { + Rgb2byteBuf[0][x8+0], + Rgb2byteBuf[0][x8+1], + Rgb2byteBuf[1][x8+0], + Rgb2byteBuf[1][x8+1] + }; + + for(int m=0; m<3; ++m) c[m] = 0; + for(int n=0; n<4; ++n) + for(int m=0; m<3; ++m) + c[m] += rgb[n][m]; + + unsigned destpos[4] = { ypos, ypos+1, ypos+width, ypos+width+1 }; + for(int n=0; n<4; ++n) + { + dest[destpos[n]] + = Y_ADD + ((RY * rgb[n][0] + + GY * rgb[n][1] + + BY * rgb[n][2] + ) >> RGB2YUV_SHIFT); // y + } + + /*c[0] /= 4; c[1] /= 4; c[2] /= 4;*/ + // Note: +2 is because 
c[] contains 4 values + dest[upos++] = U_ADD + ((RU * c[0] + GU * c[1] + BU * c[2]) >> (RGB2YUV_SHIFT+2)); + dest[vpos++] = V_ADD + ((RV * c[0] + GV * c[1] + BV * c[2]) >> (RGB2YUV_SHIFT+2)); + + ypos += 2; + } + } + pos += stride; + ypos += width; + } + + #ifdef USE_MMX + /* because of Convert_2byte_to_24or32Common() */ + _mm_empty(); + #endif +} + +void Convert_I420To24Frame(const void* data, unsigned char* dest, + unsigned npixels, unsigned width, bool swap_red_blue) +{ + const unsigned char* src = (const unsigned char*) data; + unsigned height = npixels / width; + unsigned pos = 0; + unsigned ypos = 0; + unsigned vpos = npixels; + unsigned upos = vpos + npixels / 4; + + #ifdef __MMX__ + c64_MMX rgb[4], yy[4]; + #endif + + /* + Y input: 16..235 + U input: 16..240 + V input: 16..240 + + */ + + #pragma omp parallel for + for(unsigned y=0; y::value) + .shl16(16 - YUV2RGB_SHIFT); // shift them so that *13bitconst results in upper 16 bits having the actual value + c64_MMX vvq = c64_MMX(zero64) + .unpacklbw(tmp_v) + .sub16(Bits16const::value) + .shl16(16 - YUV2RGB_SHIFT); // shift them so that *13bitconst results in upper 16 bits having the actual value + + const short* uu = (const short*)&uuq; + const short* vv = (const short*)&vvq; + #if 1 + c64_MMX vmul; vmul.Init16(VR, VG, 0, 0); // R,G,B,0 * vmul = V + c64_MMX umul; umul.Init16(0, UG, UB, 0); // R,G,B,0 * umul = U + #else + // pmaddw does: A,B,C,D and E,F,G,H, A*E + B*F, C*G + D*H + + // we do: R= VR*v + 0*u, G= VG*v + UG*u + // B= 0*v + UB*u, 0*0 + 0*0 + c64_MMX vumul1; vumul1.Init16(VR, 0, VG, UG); + c64_MMX vumul2; vumul2.Init16(0, UB, 0, 0); + #endif + + /* c64_MMX rgb[4]; // four sets of 4*int16, each representing 1 rgb value */ + for(int n=0; n<4; ++n) + { + #if 1 + /* vv is shifted by 3 bits, vmul is shifted by 13 bits + * 16 bits in total, so mul16hi gets the 16-bit downscaled part */ + c64_MMX v; v.Init16(vv[n]); + c64_MMX u; u.Init16(uu[n]); + rgb[n] = v.mul16hi(vmul).add16( + u.mul16hi(umul) ); + #else 
+ c64_MMX vuvu; vuvu.Init16(vv[n], uu[n], vv[n], uu[n]); + c64_MMX madd1 = _mm_madd_pi16(vumul1.value, vuvu.value); + c64_MMX madd2 = _mm_madd_pi16(vumul2.value, vuvu.value); + rgb[n] = madd1.sar32(YUV2RGB_SHIFT) + .conv_s32_s16( + madd2.sar32(YUV2RGB_SHIFT)); + #endif + } + + /* rgb[0] : U,V increment of RGB32 for x0,y0 - x1,y1 + * rgb[1] : U,V increment of RGB32 for x2,y0 - x3,y1 + * rgb[2] : U,V increment of RGB32 for x4,y0 - x5,y1 + * rgb[3] : U,V increment of RGB32 for x6,y0 - x7,y1 + */ + + unsigned yyoffs[4] = { ypos, ypos+1, ypos+width, ypos+width+1 }; + /* c64_MMX yy[4]; // four sets of 4*int16, each representing four Y values */ + for(int n=0; n<4; ++n) + { + c64_MMX luma; luma.Init16( + src[yyoffs[0]+n*2], + src[yyoffs[1]+n*2], + src[yyoffs[2]+n*2], + src[yyoffs[3]+n*2] + ); + luma = luma.sub16(Bits16const::value); + luma = luma.shl16(16 - YUV2RGB_SHIFT); + yy[n] = luma.mul16hi(Bits16const::value); + } + const short* const yyval = (const short*) &yy[0].value; + /* + values in order: + x0y0 x1y0 x0y1 x1y1 + x2y0 x3y0 x2y1 x3y1 + x4y0 x5y0 x4y1 x5y1 + x6y0 x7y0 x6y1 x7y1 + */ + int tmppos = pos; + for(int ny = 0; ny < 4; ny += 2) + { + /* Note: We must use 16-bit pixels here instead of 8-bit, + * because the rgb+Y addition can overflow. conv_s16_u8() + * does the necessary clamping, which would not be done + * if the values were 8-bit. + */ + // 8 pixels for one scanline, repeated twice + /* Note: C++ has no named constructors, so we + * use statement blocks here as substitutes. 
+ */ + c64_MMX r0 + = rgb[0].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+0]); tmp; }) ) + .conv_s16_u8( + rgb[0].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+1]); tmp; }) )); + c64_MMX r1 + = rgb[1].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+4]); tmp; }) ) + .conv_s16_u8( + rgb[1].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+5]); tmp; }) )); + c64_MMX r2 + = rgb[2].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+8]); tmp; }) ) + .conv_s16_u8( + rgb[2].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+9]); tmp; }) )); + c64_MMX r3 + = rgb[3].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+12]); tmp; }) ) + .conv_s16_u8( + rgb[3].add16( ({ c64_MMX tmp; tmp.Init16(yyval[ny+13]); tmp; }) )); + + Convert32To24_32bytes(r0,r1,r2,r3, &dest[tmppos]); + tmppos += width*3; // next line + } + upos += 4; + vpos += 4; + ypos += 8; // eight bytes for this line (and eight from next too) + pos += 8*3; // eight triplets generated on this line + x += 8; // eight yy values used on this line + #else /* non-MMX */ + int u = src[upos] - U_ADD; + int v = src[vpos] - V_ADD; + + int rgb[3] = + { + (VR * v ) >> (YUV2RGB_SHIFT), + (VG * v + UG * u) >> (YUV2RGB_SHIFT), + ( + UB * u) >> (YUV2RGB_SHIFT) + }; + + unsigned incr[4] = {0,1,width,width+1}; + + for(unsigned r=0; r<4; ++r) + for(unsigned doffs=pos + incr[r]*3, yoffs=ypos + incr[r], + yy = (Y_REV * (src[yoffs] - Y_ADD)) >> YUV2RGB_SHIFT, + n=0; n<3; ++n) + dest[doffs+n] = c64::clamp_u8(rgb[n] + (int)yy); + + upos += 1; + vpos += 1; + ypos += 2; // two bytes for this line (two from next line) + pos += 2*3; // two triplets generated on this line + x += 2; // two yy values used on this line + #endif + } + ypos += width; + pos += 3*width; + } + #ifdef __MMX__ + _mm_empty(); + #endif +} + +void Convert24To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width) +{ + Convert_4byte_To_I420Frame<3>(data,dest,npixels,width); +} +void Convert32To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width) +{ + 
Convert_4byte_To_I420Frame<4>(data,dest,npixels,width); +} +void Convert15To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width) +{ + Convert_2byte_To_I420Frame<10,5, 5,5, 0,5>(data,dest,npixels,width); +} +void Convert16To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width) +{ + Convert_2byte_To_I420Frame<11,5, 5,6, 0,5>(data,dest,npixels,width); +} diff -NaHudr dega-1.14/sdl/rgbtorgb.hh changed/sdl/rgbtorgb.hh --- dega-1.14/sdl/rgbtorgb.hh 1970-01-01 02:00:00.000000000 +0200 +++ changed/sdl/rgbtorgb.hh 2008-02-18 22:55:25.037986000 +0200 @@ -0,0 +1,42 @@ +#ifdef __cplusplus +extern "C" { + #define defaulttrue =true +#else + #define defaulttrue + #define bool int +#endif + +void Convert32To24Frame(const void* data, unsigned char* dest, unsigned npixels) + __attribute__((noinline)); + +void Convert15To24Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue defaulttrue) + __attribute__((noinline)); + +void Convert16To24Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue defaulttrue) + __attribute__((noinline)); + +void Convert15To32Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue defaulttrue) + __attribute__((noinline)); + +void Convert16To32Frame(const void* data, unsigned char* dest, unsigned npixels, bool swap_red_blue defaulttrue) + __attribute__((noinline)); + +void Convert_I420To24Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width, bool swap_red_blue defaulttrue) + __attribute__((noinline)); + +void Convert24To16Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width); + +void Convert24To15Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width); + +void Convert15To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width); +void Convert16To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned 
width); +void Convert24To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width); +void Convert32To_I420Frame(const void* data, unsigned char* dest, unsigned npixels, unsigned width); + +#ifdef __cplusplus +} + #undef defaulttrue +#else + #undef defaulttrue + #undef bool +#endif diff -NaHudr dega-1.14/sdl/simd.hh changed/sdl/simd.hh --- dega-1.14/sdl/simd.hh 1970-01-01 02:00:00.000000000 +0200 +++ changed/sdl/simd.hh 2008-02-18 04:56:08.371971000 +0200 @@ -0,0 +1,286 @@ +#if defined(__MMX__) && !defined(__x86_64) +#define USE_MMX +#endif +#if defined(__SSE__) +#define USE_SSE +#endif + +#ifdef __MMX__ +#include +#endif +#ifdef __SSE__ +#include + #ifdef __ICC + typedef __m128 __v4sf; + #endif +#endif + +struct c64_common +{ + static signed char clamp_s8(int_fast64_t v) + { return v<-128 ? -128 : (v > 127 ? 127 : v); } + static unsigned char clamp_u8(int_fast64_t v) + { return v<0 ? 0 : (v > 255 ? 255 : v); } + static short clamp_s16(int_fast64_t v) + { return v<-32768 ? -32768 : (v > 32767 ? 
32767 : v); } + + static inline uint_fast64_t expand32_8(uint_fast32_t a) + { + // 0000abcd -> 0a0b0c0d + typedef uint_fast64_t v; + return (a&0xFFU) + | ((a&0xFF00U)<<8) // base: 8+8 = 16 + | ((v)(a&0xFF0000U)<<16) // base: 16+16 = 32 + | ((v)(a&0xFF000000UL)<<24); // base: 24+24 = 48 + } + static inline uint_fast64_t expand32_16(uint_fast32_t a) + { + // 0000abcd -> 00ab00cd + typedef uint_fast64_t v; + return (a&0xFFFFU) + | ((v)(a&0xFFFF0000UL)<<16); // base: 16+16 = 32 + } +}; + +#ifdef __MMX__ +/* 64-bit integers that use MMX / 3Dnow operations where relevant */ +struct c64_MMX: public c64_common +{ + typedef c64_MMX c64; + + __m64 value; + + inline c64_MMX() { } + inline c64_MMX(__m64 v) : value(v) { } + inline c64_MMX(const uint64_t& v) : value( *(const __m64*)& v) { } + + inline c64 operator<< (int b) const { if(b < 0) return *this >> -b; return shl64(b); } + inline c64 operator>> (int b) const { if(b < 0) return *this << -b; return shr64(b); } + c64& operator<<= (int n) { return *this = shl64(n); } + c64& operator>>= (int n) { return *this = shr64(n); } + + c64 conv_s16_u8() const { return conv_s16_u8(*this); } + c64 conv_s16_s8() const { return conv_s16_s8(*this); } + + void Get(const unsigned char* p) { value = *(const __m64*)p; } + void Put( unsigned char* p)const { *(__m64*)p = value; } + + void Init16(short a,short b,short c, short d) + { value = _mm_setr_pi16(a,b,c,d); } + void Init16(short a) + { value = _mm_set1_pi16(a); } + + void GetD(const unsigned char* p) { value = *(const __m64*)p; } + + c64& operator&= (const c64& b) { value=_mm_and_si64(value,b.value); return *this; } + c64& operator|= (const c64& b) { value=_mm_or_si64(value,b.value); return *this; } + c64& operator^= (const c64& b) { value=_mm_xor_si64(value,b.value); return *this; } + + /* psllqi: p = packed + s = shift + r = right, l = left + l = shift in zero, a = shift in sign bit + q = 64-bit, d = 32-bit, w = 16-bit + [i = immed amount] + */ + c64 operator& (const c64& b) const { 
return c64(_mm_and_si64(value,b.value)); } + c64 operator| (const c64& b) const { return c64(_mm_or_si64(value,b.value)); } + c64 operator^ (const c64& b) const { return c64(_mm_xor_si64(value,b.value)); } + + c64 shl64(int b) const { return _mm_slli_si64(value, b); } + c64 shr64(int b) const { return _mm_srli_si64(value, b); } + c64 shl16(int b) const { return _mm_slli_pi16(value, b); } + c64 shr16(int b) const { return _mm_srli_pi16(value, b); } + c64 sar32(int b) const { return _mm_srai_pi32(value, b); } + c64 sar16(int b) const { return _mm_srai_pi16(value, b); } + c64 add32(const c64& b) const { return _mm_add_pi32(value, b.value); } + c64 add16(const c64& b) const { return _mm_add_pi16(value, b.value); } + c64 sub32(const c64& b) const { return _mm_sub_pi32(value, b.value); } + c64 sub16(const c64& b) const { return _mm_sub_pi16(value, b.value); } + c64 mul16(const c64& b) const { return _mm_mullo_pi16(value, b.value); } + c64 mul16hi(const c64& b) const { return _mm_mulhi_pi16(value, b.value); } + //c64 mul32(const c64& b) const { return _mm_mullo_pi32(value, b.value); } + c64 add8(const c64& b) const { return _mm_add_pi8(value, b.value); } + c64 sub8(const c64& b) const { return _mm_sub_pi8(value, b.value); } + + c64 unpacklbw(const c64& b) const { return _mm_unpacklo_pi8(b.value,value); } + c64 unpacklwd(const c64& b) const { return _mm_unpacklo_pi16(b.value,value); } + c64 unpackhbw(const c64& b) const { return _mm_unpackhi_pi8(b.value,value); } + c64 unpackhwd(const c64& b) const { return _mm_unpackhi_pi16(b.value,value); } + c64 unpackldq(const c64& b) const { return _mm_unpacklo_pi32(b.value,value); } + c64 unpackldq() const { return _mm_unpacklo_pi32(value,value); } + + c64 operator& (const uint64_t& v) { return c64(_mm_and_si64(value, *(const __m64*)& v)); } + + c64 conv_s32_s16(const c64& b) const { return _mm_packs_pi32(value, b.value); } + c64 conv_s16_u8(const c64& b) const { return _mm_packs_pu16(value, b.value); } + c64 conv_s16_s8(const c64& 
b) const { return _mm_packs_pi16(value, b.value); } +}; +#endif + +struct c64_nonMMX: public c64_common +{ + typedef c64_nonMMX c64; + + uint_least64_t value; + + inline c64_nonMMX() { } + inline c64_nonMMX(uint64_t v) : value(v) { } + + c64 operator<< (int b) const { if(b < 0) return *this >> -b; return shl64(b); } + c64 operator>> (int b) const { if(b < 0) return *this << -b; return shr64(b); } + c64& operator<<= (int n) { return *this = shl64(n); } + c64& operator>>= (int n) { return *this = shr64(n); } + + c64 conv_s16_u8() const { return conv_s16_u8(*this); } + c64 conv_s16_s8() const { return conv_s16_s8(*this); } + + void Init16(short a,short b,short c, short d) + { uint_fast64_t aa = (unsigned short)a, + bb = (unsigned short)b, + cc = (unsigned short)c, + dd = (unsigned short)d; + value = aa | (bb << 16) | (cc << 32) | (dd << 48); } + void Init16(short a) + { Init16(a,a,a,a); } + void Init8(unsigned char a,unsigned char b,unsigned char c,unsigned char d, + unsigned char e,unsigned char f,unsigned char g,unsigned char h) + { + value = ((uint_fast64_t)(a | (b << 8) | (c << 16) | (d << 24))) + | (((uint_fast64_t)e) << 32) + | (((uint_fast64_t)f) << 40) + | (((uint_fast64_t)g) << 48) + | (((uint_fast64_t)h) << 56); + } + + void Get(const unsigned char* p) { value = *(const uint_least64_t*)p; } + void Put( unsigned char* p)const { *(uint_least64_t*)p = value; } + + c64& operator&= (const c64& b) { value&=b.value; return *this; } + c64& operator|= (const c64& b) { value|=b.value; return *this; } + c64& operator^= (const c64& b) { value^=b.value; return *this; } + c64 operator& (const c64& b) const { return value & b.value; } + c64 operator| (const c64& b) const { return value | b.value; } + c64 operator^ (const c64& b) const { return value ^ b.value; } + + c64 operator& (uint_fast64_t b) const { return value & b; } + + #define usimdsim(type, count, op) \ + type* p = (type*)&res.value; \ + for(int n=0; n> b; } + c64 shl16(int b) const { c64 res = *this; 
usimdsim(short, 2, <<); return res; } + c64 shr16(int b) const { c64 res = *this; usimdsim(unsigned short, 2, >>); return res; } + c64 sar32(int b) const { c64 res = *this; usimdsim(int, 2, >>); return res; } + c64 sar16(int b) const { c64 res = *this; usimdsim(short, 2, >>); return res; } + + c64 add16(const c64& b) const { c64 res = *this; simdsim(short, 4, +); return res; } + c64 sub16(const c64& b) const { c64 res = *this; simdsim(short, 4, -); return res; } + c64 add32(const c64& b) const { c64 res = *this; simdsim(int, 2, +); return res; } + c64 sub32(const c64& b) const { c64 res = *this; simdsim(int, 2, -); return res; } + c64 mul16(const c64& b) const { c64 res = *this; simdsim(short, 4, *); return res; } + c64 mul16hi(const c64& b) const { c64 res = *this; simdsim(short, 4, *) >> 16; return res; } + c64 add8(const c64& b) const { c64 res = *this; simdsim(unsigned char, 8, +); return res; } + c64 sub8(const c64& b) const { c64 res = *this; simdsim(unsigned char, 8, -); return res; } + + #undef simdsim + #undef usimdsim + + c64 conv_s32_s16(const c64& b) const + { + c64 res; res. + Init16(clamp_s16(value & 0xFFFFFFFFU), + clamp_s16(value >> 32), + clamp_s16(b.value & 0xFFFFFFFFU), + clamp_s16(b.value >> 32)); + return res; + } + c64 conv_s16_u8(const c64& b) const + { + c64 res; res. + Init8(clamp_u8(value & 0xFFFF), + clamp_u8((value >> 16) & 0xFFFF), + clamp_u8((value >> 32) & 0xFFFF), + clamp_u8((value >> 48) & 0xFFFF), + clamp_u8(b.value & 0xFFFF), + clamp_u8((b.value >> 16) & 0xFFFF), + clamp_u8((b.value >> 32) & 0xFFFF), + clamp_u8((b.value >> 48) & 0xFFFF)); + return res; + } + c64 conv_s16_s8(const c64& b) const + { + c64 res; res. 
+ Init8(clamp_s8(value & 0xFFFF), + clamp_s8((value >> 16) & 0xFFFF), + clamp_s8((value >> 32) & 0xFFFF), + clamp_s8((value >> 48) & 0xFFFF), + clamp_s8(b.value & 0xFFFF), + clamp_s8((b.value >> 16) & 0xFFFF), + clamp_s8((b.value >> 32) & 0xFFFF), + clamp_s8((b.value >> 48) & 0xFFFF)); + return res; + } + + /* TODO: Verify that these are correct (though they should never be used anyway) */ + c64 unpacklbw(const c64& p) const + { + #if defined(__MMX__) + /* ICC says [error: cast to type "__m64" is not allowed], + * so we cannot use this code on ICC. Fine for GCC. */ + return (uint_least64_t)_m_punpcklbw(*(__m64*)&p.value, *(__m64*)&value); + #else + uint_fast64_t a=value, b=p.value; + return expand32_8(a) | (expand32_8(b) << 8); + #endif + } + c64 unpackhbw(const c64& p) const + { + #if defined(__MMX__) + return (uint_least64_t)_m_punpckhbw(*(__m64*)&p.value, *(__m64*)&value); + #else + uint_fast64_t a=value, b=p.value; + return expand32_8(a>>32) | (expand32_8(b>>32) << 8); + #endif + } + c64 unpacklwd(const c64& p) const + { + #if defined(__MMX__) + return (uint_least64_t)_m_punpcklwd(*(__m64*)&p.value, *(__m64*)&value); + #else + uint_fast64_t a=value, b=p.value; + return expand32_16(a) | (expand32_16(b) << 16); + #endif + } + c64 unpackhwd(const c64& p) const + { + #if defined(__MMX__) + return (uint_least64_t)_m_punpckhwd(*(__m64*)&p.value, *(__m64*)&value); + #else + uint_fast64_t a=value, b=p.value; + return expand32_16(a>>32) | (expand32_16(b>>32) << 16); + #endif + } + c64 unpackldq() const { return unpackldq(*this); } + c64 unpackldq(const c64& p) const + { + #if defined(__MMX__) + return (uint_least64_t)_m_punpckldq(*(__m64*)&p.value, *(__m64*)&value); + #else + return value | (p.value << 32); + #endif + } +}; + +#ifdef USE_MMX +typedef c64_MMX c64; +#else +typedef c64_nonMMX c64; +#endif