;; Copyright (C) 2009-2010 David McPaul
;; All rights reserved. Distributed under the terms of the MIT License.
;
; A rather unoptimised set of sse yuv to rgb converters
; does 4 pixels per loop
;
; inputer:
; reads 4 pixels worth of 8 bit yuv data and puts
; the y values converted to 16 bit in mm0
; the u values converted to 16 bit and duplicated into mm1
; the v values converted to 16 bit and duplicated into mm2
;
; conversion:
; does the yuv to rgb conversion using 16 bit fixed point and the
; results are placed into the following registers as 8 bit clamped values
; r values in mm3
; g values in mm4
; b values in mm5
;
; outputer:
; writes out the rgba pixels as 8 bit values with 0 for alpha
;
; mm6 used for scratch
; mm7 used for scratch

%macro cglobal 1
global _%1
%define %1 _%1
align 16
%1:
%endmacro

; conversion code
%macro yuv2rgbsse 0
; u = u - 128
; v = v - 128
; r = y + v + v >> 2 + v >> 3 + v >> 5
; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
; b = y + u + u >> 1 + u >> 2 + u >> 6

; subtract 16 from y
    movq mm7, [Const16]     ; loads a constant using data cache (slower on first fetch but then cached)
;   psubsw mm0, mm7         ; y = y - 16
; subtract 128 from u and v
    movq mm7, [Const128]    ; loads a constant using data cache (slower on first fetch but then cached)
    psubsw mm1, mm7         ; u = u - 128
    psubsw mm2, mm7         ; v = v - 128
; load r,g,b with y
    movq mm3, mm0           ; r = y
    pshufw mm5, mm0, 0xE4   ; b = y
; r = r + v + v >> 2 + v >> 3 + v >> 5
    paddsw mm3, mm2         ; add v to r
    movq mm7, mm1           ; move u to scratch
    pshufw mm6, mm2, 0xE4   ; move v to scratch
    psraw mm6, 2            ; divide v by 4
    paddsw mm3, mm6         ; and add to r
    psraw mm6, 1            ; divide v by 2 (v >> 3 overall)
    paddsw mm3, mm6         ; and add to r
    psraw mm6, 2            ; divide v by 4 (v >> 5 overall)
    paddsw mm3, mm6         ; and add to r
; b = y + u + u >> 1 + u >> 2 + u >> 6
    paddsw mm5, mm1         ; add u to b
    psraw mm7, 1            ; divide u by 2
    paddsw mm5, mm7         ; and add to b
    psraw mm7, 1            ; divide u by 2 (u >> 2 overall)
    paddsw mm5, mm7         ; and add to b
    psraw mm7, 4            ; divide u by 16 (u >> 6 overall)
    paddsw mm5, mm7         ; and add to b
; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
    movq mm7, mm2           ; move v to scratch
    pshufw mm6, mm1, 0xE4   ; move u to scratch
    movq mm4, mm0           ; g = y
    psraw mm6, 2            ; divide u by 4
    psubsw mm4, mm6         ; subtract from g
    psraw mm6, 2            ; divide u by 4 (u >> 4 overall)
    psubsw mm4, mm6         ; subtract from g
    psraw mm6, 1            ; divide u by 2 (u >> 5 overall)
    psubsw mm4, mm6         ; subtract from g
    psraw mm7, 1            ; divide v by 2
    psubsw mm4, mm7         ; subtract from g
    psraw mm7, 2            ; divide v by 4 (v >> 3 overall)
    psubsw mm4, mm7         ; subtract from g
    psraw mm7, 1            ; divide v by 2 (v >> 4 overall)
    psubsw mm4, mm7         ; subtract from g
    psraw mm7, 1            ; divide v by 2 (v >> 5 overall)
    psubsw mm4, mm7         ; subtract from g
%endmacro

; outputer
%macro rgba32sseoutput 0
; clamp values
    pxor mm7, mm7
    packuswb mm3, mm7       ; clamp to 0,255 and pack R to 8 bit per pixel
    packuswb mm4, mm7       ; clamp to 0,255 and pack G to 8 bit per pixel
    packuswb mm5, mm7       ; clamp to 0,255 and pack B to 8 bit per pixel
; convert to bgra32 packed
    punpcklbw mm5, mm4      ; bgbgbgbg
    movq mm0, mm5           ; save bg values
    punpcklbw mm3, mm7      ; r0r0r0r0
    punpcklwd mm5, mm3      ; lower half bgr0bgr0
    punpckhwd mm0, mm3      ; upper half bgr0bgr0
; write to output ptr
    movq [edi], mm5         ; output first 2 pixels
    movq [edi+8], mm0       ; output second 2 pixels
%endmacro

SECTION .data align=16

Const16 dw 16
    dw 16
    dw 16
    dw 16
    dw 16
    dw 16
    dw 16
    dw 16

Const128 dw 128
    dw 128
    dw 128
    dw 128
    dw 128
    dw 128
    dw 128
    dw 128

; Packed Convert
; void Convert_YUV422_RGBA32_SSE(void *fromPtr, void *toPtr, int width)
%define width    ebp+16
%define toPtr    ebp+12
%define fromPtr  ebp+8

; Planar Convert
; void Convert_YUV420P_RGBA32_SSE(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
%define width1    ebp+24
%define toPtr1    ebp+20
%define fromVPtr  ebp+16
%define fromUPtr  ebp+12
%define fromYPtr  ebp+8
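
; For reference, the shift-and-add sums used in yuv2rgbsse above are close to
; the usual BT.601 YCbCr -> RGB coefficients (1.402, 0.344, 0.714, 1.772):
;   r: v * (1 + 1/4 + 1/8 + 1/32)        = 1.40625  ~ 1.402
;   g: u * (1/4 + 1/16 + 1/32)           = 0.34375  ~ 0.344
;      v * (1/2 + 1/8 + 1/16 + 1/32)     = 0.71875  ~ 0.714
;   b: u * (1 + 1/2 + 1/4 + 1/64)        = 1.765625 ~ 1.772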

SECTION .text align=16

; YUY2 FOURCC
cglobal Convert_YUV422_RGBA32_SSE
; reserve variables
    push ebp
    mov ebp, esp
    push edi
    push esi
    push ecx

    mov esi, [fromPtr]
    mov ecx, [width]
    mov edi, [toPtr]
; loop width / 4 times
    shr ecx, 2
    test ecx, ecx
    jng ENDLOOP2
REPEATLOOP2:                ; loop over width / 4
; YUV422 packed inputer
    movq mm0, [esi]         ; should have yuyv yuyv
    pshufw mm1, mm0, 0xE4   ; copy to mm1
    movq mm2, mm0           ; copy to mm2
; extract y
    pxor mm7, mm7           ; 0000000000000000
    pcmpeqb mm6, mm6        ; ffffffffffffffff
    punpckhbw mm6, mm7      ; interleave mm7 into mm6 -> ff00ff00ff00ff00
    pand mm0, mm6           ; clear all but y values leaving y0y0 etc
; extract u and duplicate so each u in yuyv becomes 0u0u
    psrld mm6, 8            ; 00ff0000 00ff0000
    pand mm1, mm6           ; clear all yv values leaving 0u00 etc
    psrld mm1, 8            ; shift u down to get u000
    pshufw mm1, mm1, 0xA0   ; copy u values to get u0u0 (pshufw is SSE, not plain MMX)
; extract v
    pslld mm6, 16           ; 000000ff 000000ff
    pand mm2, mm6           ; clear all yu values leaving 000v etc
    psrld mm2, 8            ; shift v down to get 00v0
    pshufw mm2, mm2, 0xF5   ; copy v values to get v0v0 (pshufw is SSE, not plain MMX)

    yuv2rgbsse
    rgba32sseoutput
; endloop
    add edi, 16
    add esi, 8
    sub ecx, 1              ; apparently sub is better than dec
    jnz REPEATLOOP2
ENDLOOP2:
; Cleanup
    emms                    ; empty MMX state so the FPU registers can be used again
    pop ecx
    pop esi
    pop edi
    mov esp, ebp
    pop ebp
    ret

cglobal Convert_YUV420P_RGBA32_SSE
; reserve variables
    push ebp
    mov ebp, esp
    push edi
    push esi
    push ecx
    push eax
    push ebx

    mov esi, [fromYPtr]
    mov eax, [fromUPtr]
    mov ebx, [fromVPtr]
    mov edi, [toPtr1]
    mov ecx, [width1]
; loop width / 4 times
    shr ecx, 2
    test ecx, ecx
    jng ENDLOOP3
REPEATLOOP3:                ; loop over width / 4
; YUV420 Planar inputer
    movq mm0, [esi]         ; fetch 4 y values (8 bit) yyyy0000
    movd mm1, [eax]         ; fetch 2 u values (8 bit) uu000000
    movd mm2, [ebx]         ; fetch 2 v values (8 bit) vv000000
; extract y
    pxor mm7, mm7           ; 0000000000000000
    punpcklbw mm0, mm7      ; interleave mm7 into mm0 -> y0y0y0y0
; extract u and duplicate so each becomes 0u0u
    punpcklbw mm1, mm7      ; interleave mm7 into mm1 -> u0u00000
    punpcklwd mm1, mm7      ; interleave again -> u000u000
    pshufw mm1, mm1, 0xA0   ; copy u values to get u0u0
; extract v
    punpcklbw mm2, mm7      ; interleave mm7 into mm2 -> v0v00000
    punpcklwd mm2, mm7      ; interleave again -> v000v000
    pshufw mm2, mm2, 0xA0   ; copy v values to get v0v0

    yuv2rgbsse
    rgba32sseoutput
; endloop
    add edi, 16
    add esi, 4
    add eax, 2
    add ebx, 2
    sub ecx, 1              ; apparently sub is better than dec
    jnz REPEATLOOP3
ENDLOOP3:
; Cleanup
    emms
    pop ebx
    pop eax
    pop ecx
    pop esi
    pop edi
    mov esp, ebp
    pop ebp
    ret

SECTION .note.GNU-stack noalloc noexec nowrite progbits
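
; Calling sketch (illustrative only; the buffer names below are not defined
; anywhere in this file). Both entry points follow the C prototypes given
; above and convert one row of 'width' pixels per call, e.g. for packed YUY2:
;
;     uint8_t yuyv[WIDTH * 2];   /* packed YUY2 input, 2 bytes per pixel      */
;     uint8_t out[WIDTH * 4];    /* output, 4 bytes per pixel written b,g,r,0 */
;     Convert_YUV422_RGBA32_SSE(yuyv, out, WIDTH);
;
; width is assumed to be a multiple of 4: the loops work on 4 pixels per
; iteration and any remaining 1-3 pixels are left unconverted.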