;; Copyright (C) 2009-2010 David McPaul
;; All rights reserved. Distributed under the terms of the MIT License.

; A rather unoptimised set of sse2 yuv to rgb converters
; does 8 pixels per loop

; inputer:
; reads 128 bits of yuv 8 bit data and puts
; the y values converted to 16 bit in xmm0
; the u values converted to 16 bit and duplicated into xmm1
; the v values converted to 16 bit and duplicated into xmm2

; conversion:
; does the yuv to rgb conversion using 16 bit fixed point and the
; results are placed into the following registers as 8 bit clamped values
; r values in xmm3
; g values in xmm4
; b values in xmm5

; outputer:
; writes out the rgba pixels as 8 bit values with 0 for alpha

; xmm6 used for scratch
; xmm7 used for scratch

%macro cglobal 1
global _%1
%define %1 _%1
align 16
%1:
%endmacro

SECTION .data align=16

Const16 dw 16
dw 16
dw 16
dw 16
dw 16
dw 16
dw 16
dw 16

Const128 dw 128
dw 128
dw 128
dw 128
dw 128
dw 128
dw 128
dw 128

RConst dw 0
dw 5743
dw 0
dw 5743
dw 0
dw 5743
dw 0
dw 5743

GConst dw -1409
dw -2925
dw -1409
dw -2925
dw -1409
dw -2925
dw -1409
dw -2925

BConst dw 7258
dw 0
dw 7258
dw 0
dw 7258
dw 0
dw 7258
dw 0

shuffconst db 0x00
db 0x01
db 0x00
db 0x01
db 0x04
db 0x05
db 0x04
db 0x05
db 0x08
db 0x09
db 0x08
db 0x09
db 0x0c
db 0x0d
db 0x0c
db 0x0d

YMask db 0x00
db 0x80
db 0x02
db 0x80
db 0x04
db 0x80
db 0x06
db 0x80
db 0x08
db 0x80
db 0x0a
db 0x80
db 0x0c
db 0x80
db 0x0e
db 0x80

UVMask db 0x01
db 0x80
db 0x03
db 0x80
db 0x05
db 0x80
db 0x07
db 0x80
db 0x09
db 0x80
db 0x0b
db 0x80
db 0x0d
db 0x80
db 0x0f
db 0x80

; conversion code
%macro yuv2rgbsse2 0
; u = u - 128
; v = v - 128
; r = y + 0 * u + 1.402 * v
; g = y + -0.344 * u + -0.714 * v
; b = y + 1.772 * u + 0 * v
; subtract 16 from y
; psubsw xmm0, [Const16] ; y = y - 16
; subtract 128 from u and v
psubsw xmm3, [Const128] ; u = u - 128, v = v - 128
movdqa xmm4, xmm3 ; duplicate
pshufd xmm5, xmm3, 0xE4 ; duplicate
pmaddwd xmm3, [RConst] ; multiply and add
pmaddwd xmm4, [GConst] ; to get RGB offsets to Y
pmaddwd xmm5, [BConst] ;
psrad xmm3, 12 ; Scale back to original range
psrad xmm4, 12 ;
psrad xmm5, 12 ;
pshuflw xmm3, xmm3, 0xa0 ; duplicate results
pshufhw xmm3, xmm3, 0xa0
pshuflw xmm4, xmm4, 0xa0
pshufhw xmm4, xmm4, 0xa0
pshuflw xmm5, xmm5, 0xa0
pshufhw xmm5, xmm5, 0xa0
paddsw xmm3, xmm0 ; add to y
paddsw xmm4, xmm0 ;
paddsw xmm5, xmm0 ;
%endmacro

; outputer
%macro rgba32sse2output 0
; clamp values
pxor xmm7, xmm7
packuswb xmm3, xmm7 ; clamp to 0,255 and pack R to 8 bit per pixel
packuswb xmm4, xmm7 ; clamp to 0,255 and pack G to 8 bit per pixel
packuswb xmm5, xmm7 ; clamp to 0,255 and pack B to 8 bit per pixel
; convert to bgra32 packed
punpcklbw xmm5, xmm4 ; bgbgbgbgbgbgbgbg
movdqa xmm0, xmm5 ; save bg values
punpcklbw xmm3, xmm7 ; r0r0r0r0r0r0r0r0
punpcklwd xmm5, xmm3 ; lower half bgr0bgr0bgr0bgr0
punpckhwd xmm0, xmm3 ; upper half bgr0bgr0bgr0bgr0
; write to output ptr
movntdq [edi], xmm5 ; output first 4 pixels bypassing cache
movntdq [edi+16], xmm0 ; output second 4 pixels bypassing cache
%endmacro

; void Convert_YUV422_RGBA32_SSE2(void *fromPtr, void *toPtr, int width)
%define width ebp+16
%define toPtr ebp+12
%define fromPtr ebp+8

; void Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
%define width1 ebp+24
%define toPtr1 ebp+20
%define fromVPtr ebp+16
%define fromUPtr ebp+12
%define fromYPtr ebp+8

SECTION .text align=16

cglobal Convert_YUV422_RGBA32_SSE2
; reserve variables
push ebp
mov ebp, esp
push edi
push esi
push ecx

mov esi, [fromPtr]
mov edi, [toPtr]
mov ecx, [width]
; loop width / 8 times
shr ecx, 3
test ecx, ecx
jng ENDLOOP
REPEATLOOP: ; loop over width / 8
prefetchnta [esi+256]
; YUV422 packed inputer
movdqa xmm0, [esi] ; should have yuyv yuyv yuyv yuyv
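; Packed YUV422 layout in memory is y0 u0 y1 v0 y2 u1 y3 v1 ..., so one u/v
; pair is shared by two adjacent pixels. The masking below keeps the even
; bytes to leave eight 16 bit y values in xmm0, and keeps the odd bytes
; (shifted down by 8) to leave the four u/v pairs as 16 bit values in xmm3;
; yuv2rgbsse2 then computes one R/G/B offset per u/v pair and duplicates it
; across the two pixels that share it (the pshuflw/pshufhw 0xa0 step).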
pshufd xmm3, xmm0, 0xE4 ; copy to xmm3
; extract y
pxor xmm7, xmm7 ; 00000000000000000000000000000000
pcmpeqd xmm6, xmm6 ; ffffffffffffffffffffffffffffffff
punpcklbw xmm6, xmm7 ; interleave xmm7 into xmm6 ff00ff00ff00ff00ff00ff00ff00ff00
pand xmm0, xmm6 ; clear all but y values leaving y0y0 etc
; extract u and v
psllw xmm6, 8 ; 00ff00ff00ff00ff00ff00ff00ff00ff
pand xmm3, xmm6 ; extract uv values 0u0v0u0v0u0v0u0v
psrlw xmm3, 8 ; convert to 16 bit u0v0u0v0u0v0u0v0

yuv2rgbsse2
rgba32sse2output

; endloop
add edi, 32
add esi, 16
sub ecx, 1 ; apparently sub is better than dec
jnz REPEATLOOP
ENDLOOP:
; Cleanup
pop ecx
pop esi
pop edi
mov esp, ebp
pop ebp
ret

cglobal Convert_YUV420P_RGBA32_SSE2
; reserve variables
push ebp
mov ebp, esp
push edi
push esi
push ecx
push eax
push ebx

mov esi, [fromYPtr]
mov eax, [fromUPtr]
mov ebx, [fromVPtr]
mov edi, [toPtr1]
mov ecx, [width1]
; loop width / 8 times
shr ecx, 3
test ecx, ecx
jng ENDLOOP1
REPEATLOOP1: ; loop over width / 8
; YUV420 Planar inputer
movq xmm0, [esi] ; fetch 8 y values (8 bit) yyyyyyyy00000000
movd xmm3, [eax] ; fetch 4 u values (8 bit) uuuu000000000000
movd xmm1, [ebx] ; fetch 4 v values (8 bit) vvvv000000000000
; extract y
pxor xmm7, xmm7 ; 00000000000000000000000000000000
punpcklbw xmm0, xmm7 ; interleave xmm7 into xmm0 y0y0y0y0y0y0y0y0
; combine u and v
punpcklbw xmm3, xmm1 ; uvuvuvuv00000000
punpcklbw xmm3, xmm7 ; u0v0u0v0u0v0u0v0

yuv2rgbsse2
rgba32sse2output

; endloop
add edi, 32
add esi, 8
add eax, 4
add ebx, 4
sub ecx, 1 ; apparently sub is better than dec
jnz REPEATLOOP1
ENDLOOP1:
; Cleanup
pop ebx
pop eax
pop ecx
pop esi
pop edi
mov esp, ebp
pop ebp
ret

cglobal Test_SSE2
; reserve variables
push ebp
mov ebp, esp
push edi
push esi
push ecx
push eax
push ebx

mov esi, [fromPtr]
mov edi, [toPtr]

movdqa xmm0, [esi] ; should have yuyv yuyv yuyv yuyv
pshufd xmm1, xmm0, 0xE4 ; copy to xmm1
movdqa xmm3, xmm0 ; copy to xmm3
; extract y
pxor xmm7, xmm7 ; 00000000000000000000000000000000
pcmpeqd xmm6, xmm6 ; ffffffffffffffffffffffffffffffff
punpcklbw xmm6, xmm7 ; interleave xmm7 into xmm6 ff00ff00ff00ff00ff00ff00ff00ff00
pand xmm0, xmm6 ; clear all but y values leaving y0y0 etc
; extract u and duplicate so each u in yuyv becomes 0u0u
psrld xmm6, 8 ; 00ff0000 00ff0000 00ff0000 00ff0000
pand xmm1, xmm6 ; clear all yv values leaving 0u00 etc
psrld xmm1, 8 ; rotate u to get u000
; extract v
pslld xmm6, 16 ; 000000ff000000ff 000000ff000000ff
pand xmm3, xmm6 ; clear all yu values leaving 000v etc
psrld xmm3, 8 ; rotate v to get 00v0
por xmm3, xmm1

psubsw xmm3, [Const128] ; u = u - 128, v = v - 128
pmaddwd xmm3, [RConst] ; multiply and add
psrad xmm3, 12 ; Scale back to original range
pshufb xmm3, [shuffconst] ; duplicate results
; paddsw xmm3, xmm0 ; add to y
; pxor xmm7, xmm7
; packuswb xmm3, xmm7 ; clamp to 0,255 and pack R to 8 bit per pixel
movntdq [edi], xmm3 ; output first 4 pixels bypassing cache
; Cleanup
pop ebx
pop eax
pop ecx
pop esi
pop edi
mov esp, ebp
pop ebp
ret

SECTION .note.GNU-stack noalloc noexec nowrite progbits
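; Example C usage (illustrative sketch only, not part of this file). The
; prototypes follow the comments above; convert_row_yuv422 is a hypothetical
; helper. It assumes cdecl linkage, that the C symbols resolve to the
; underscore-prefixed names exported by the cglobal macro (i.e. a toolchain
; that prepends an underscore to C symbols), 16 byte aligned buffers (movdqa
; and movntdq above require 16 byte alignment), and a width that is a
; multiple of 8 pixels (the loops convert 8 pixels per iteration and skip
; any remainder).
;
;     #include <stdint.h>
;
;     void Convert_YUV422_RGBA32_SSE2(void *fromPtr, void *toPtr, int width);
;     void Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr,
;                                      void *fromVPtr, void *toPtr, int width);
;
;     /* convert one packed YUYV row of `width` pixels to 32 bit pixels */
;     static void convert_row_yuv422(const uint8_t *yuyv, uint8_t *out, int width)
;     {
;         Convert_YUV422_RGBA32_SSE2((void *)yuyv, out, width);
;     }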