From 7a778ee4b7ec09a1f5b2185c9cceee3910dfbdf2 Mon Sep 17 00:00:00 2001 From: Thomas Vander Stichele Date: Sun, 14 Mar 2004 22:34:33 +0000 Subject: gst-indent Original commit message from CVS: gst-indent --- gst/rtjpeg/RTjpeg.c | 4434 ++++++++++++++++++++++++++------------------------- 1 file changed, 2223 insertions(+), 2211 deletions(-) (limited to 'gst/rtjpeg/RTjpeg.c') diff --git a/gst/rtjpeg/RTjpeg.c b/gst/rtjpeg/RTjpeg.c index ab87fcc7..d2e7b67d 100644 --- a/gst/rtjpeg/RTjpeg.c +++ b/gst/rtjpeg/RTjpeg.c @@ -52,38 +52,47 @@ typedef unsigned long long __u64; #include "mmx.h" #endif -static const unsigned char RTjpeg_ZZ[64]={ -0, -8, 1, -2, 9, 16, -24, 17, 10, 3, -4, 11, 18, 25, 32, -40, 33, 26, 19, 12, 5, -6, 13, 20, 27, 34, 41, 48, -56, 49, 42, 35, 28, 21, 14, 7, -15, 22, 29, 36, 43, 50, 57, -58, 51, 44, 37, 30, 23, -31, 38, 45, 52, 59, -60, 53, 46, 39, -47, 54, 61, -62, 55, -63 }; - -static const __u64 RTjpeg_aan_tab[64]={ -4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL, -5957222912ULL, 8263040512ULL, 7783580160ULL, 7005009920ULL, 5957222912ULL, 4680582144ULL, 3224107520ULL, 1643641088ULL, -5611718144ULL, 7783580160ULL, 7331904512ULL, 6598688768ULL, 5611718144ULL, 4408998912ULL, 3036936960ULL, 1548224000ULL, -5050464768ULL, 7005009920ULL, 6598688768ULL, 5938608128ULL, 5050464768ULL, 3968072960ULL, 2733115392ULL, 1393296000ULL, -4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL, -3374581504ULL, 4680582144ULL, 4408998912ULL, 3968072960ULL, 3374581504ULL, 2651326208ULL, 1826357504ULL, 931136000ULL, -2324432128ULL, 3224107520ULL, 3036936960ULL, 2733115392ULL, 2324432128ULL, 1826357504ULL, 1258030336ULL, 641204288ULL, -1184891264ULL, 1643641088ULL, 1548224000ULL, 1393296000ULL, 1184891264ULL, 931136000ULL, 641204288ULL, 326894240ULL, +static const unsigned char RTjpeg_ZZ[64] = { + 0, + 8, 1, + 2, 9, 16, + 24, 17, 10, 3, + 4, 11, 18, 25, 32, + 40, 33, 26, 19, 12, 5, + 6, 13, 20, 27, 34, 41, 48, + 56, 49, 42, 35, 28, 21, 14, 7, + 15, 22, 29, 36, 43, 50, 57, + 58, 51, 44, 37, 30, 23, + 31, 38, 45, 52, 59, + 60, 53, 46, 39, + 47, 54, 61, + 62, 55, + 63 +}; + +static const __u64 RTjpeg_aan_tab[64] = { + 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, + 3374581504ULL, 2324432128ULL, 1184891264ULL, + 5957222912ULL, 8263040512ULL, 7783580160ULL, 7005009920ULL, 5957222912ULL, + 4680582144ULL, 3224107520ULL, 1643641088ULL, + 5611718144ULL, 7783580160ULL, 7331904512ULL, 6598688768ULL, 5611718144ULL, + 4408998912ULL, 3036936960ULL, 1548224000ULL, + 5050464768ULL, 7005009920ULL, 6598688768ULL, 5938608128ULL, 5050464768ULL, + 3968072960ULL, 2733115392ULL, 1393296000ULL, + 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, + 3374581504ULL, 2324432128ULL, 1184891264ULL, + 3374581504ULL, 4680582144ULL, 4408998912ULL, 3968072960ULL, 3374581504ULL, + 2651326208ULL, 1826357504ULL, 931136000ULL, + 2324432128ULL, 3224107520ULL, 3036936960ULL, 2733115392ULL, 2324432128ULL, + 1826357504ULL, 1258030336ULL, 641204288ULL, + 1184891264ULL, 1643641088ULL, 1548224000ULL, 1393296000ULL, 1184891264ULL, + 931136000ULL, 641204288ULL, 326894240ULL, }; #ifndef HAVE_LIBMMX -static __s32 RTjpeg_ws[64+31]; +static __s32 RTjpeg_ws[64 + 31]; #endif -__u8 RTjpeg_alldata[2*64+4*64+4*64+4*64+4*64+32]; +__u8 RTjpeg_alldata[2 * 64 + 4 * 64 + 4 * 64 + 4 * 64 + 4 * 64 + 32]; __s16 *RTjpeg_block; __s32 *RTjpeg_lqt; @@ -97,7 +106,7 @@ int RTjpeg_width, RTjpeg_height; int RTjpeg_Ywidth, RTjpeg_Cwidth; int RTjpeg_Ysize, RTjpeg_Csize; -__s16 *RTjpeg_old=NULL; +__s16 *RTjpeg_old = NULL; #ifdef HAVE_LIBMMX mmx_t RTjpeg_lmask; @@ -106,173 +115,169 @@ mmx_t RTjpeg_cmask; __u16 RTjpeg_lmask; __u16 RTjpeg_cmask; #endif -int RTjpeg_mtest=0; +int RTjpeg_mtest = 0; static const unsigned char RTjpeg_lum_quant_tbl[64] = { - 16, 11, 10, 16, 24, 40, 51, 61, - 12, 12, 14, 19, 26, 58, 60, 55, - 14, 13, 16, 24, 40, 57, 69, 56, - 14, 17, 22, 29, 51, 87, 80, 62, - 18, 22, 37, 56, 68, 109, 103, 77, - 24, 35, 55, 64, 81, 104, 113, 92, - 49, 64, 78, 87, 103, 121, 120, 101, - 72, 92, 95, 98, 112, 100, 103, 99 - }; + 16, 11, 10, 16, 24, 40, 51, 61, + 12, 12, 14, 19, 26, 58, 60, 55, + 14, 13, 16, 24, 40, 57, 69, 56, + 14, 17, 22, 29, 51, 87, 80, 62, + 18, 22, 37, 56, 68, 109, 103, 77, + 24, 35, 55, 64, 81, 104, 113, 92, + 49, 64, 78, 87, 103, 121, 120, 101, + 72, 92, 95, 98, 112, 100, 103, 99 +}; static const unsigned char RTjpeg_chrom_quant_tbl[64] = { - 17, 18, 24, 47, 99, 99, 99, 99, - 18, 21, 26, 66, 99, 99, 99, 99, - 24, 26, 56, 99, 99, 99, 99, 99, - 47, 66, 99, 99, 99, 99, 99, 99, - 99, 99, 99, 99, 99, 99, 99, 99, - 99, 99, 99, 99, 99, 99, 99, 99, - 99, 99, 99, 99, 99, 99, 99, 99, - 99, 99, 99, 99, 99, 99, 99, 99 - }; - -int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8) + 17, 18, 24, 47, 99, 99, 99, 99, + 18, 21, 26, 66, 99, 99, 99, 99, + 24, 26, 56, 99, 99, 99, 99, 99, + 47, 66, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99 +}; + +int +RTjpeg_b2s (__s16 * data, __s8 * strm, __u8 bt8) { - register int ci, co=1, tmp; - register __s16 ZZvalue; - - strm[0]=(__u8)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]); - - for(ci=1; ci<=bt8; ci++) - { - ZZvalue = data[RTjpeg_ZZ[ci]]; - - if(ZZvalue>0) - { - strm[co++]=(__s8)(ZZvalue>127)?127:ZZvalue; - } - else - { - strm[co++]=(__s8)(ZZvalue<-128)?-128:ZZvalue; - } - } - - for(; ci<64; ci++) - { - ZZvalue = data[RTjpeg_ZZ[ci]]; - - if(ZZvalue>0) - { - strm[co++]=(__s8)(ZZvalue>63)?63:ZZvalue; - } - else if(ZZvalue<0) - { - strm[co++]=(__s8)(ZZvalue<-64)?-64:ZZvalue; - } - else /* compress zeros */ - { - tmp=ci; - do - { - ci++; - } - while((ci<64)&&(data[RTjpeg_ZZ[ci]]==0)); + register int ci, co = 1, tmp; + register __s16 ZZvalue; + + strm[0] = + (__u8) (data[RTjpeg_ZZ[0]] > 254) ? 254 : ((data[RTjpeg_ZZ[0]] < + 0) ? 0 : data[RTjpeg_ZZ[0]]); + + for (ci = 1; ci <= bt8; ci++) { + ZZvalue = data[RTjpeg_ZZ[ci]]; + + if (ZZvalue > 0) { + strm[co++] = (__s8) (ZZvalue > 127) ? 127 : ZZvalue; + } else { + strm[co++] = (__s8) (ZZvalue < -128) ? -128 : ZZvalue; + } + } + + for (; ci < 64; ci++) { + ZZvalue = data[RTjpeg_ZZ[ci]]; + + if (ZZvalue > 0) { + strm[co++] = (__s8) (ZZvalue > 63) ? 63 : ZZvalue; + } else if (ZZvalue < 0) { + strm[co++] = (__s8) (ZZvalue < -64) ? -64 : ZZvalue; + } else { /* compress zeros */ - strm[co++]=(__s8)(63+(ci-tmp)); - ci--; + tmp = ci; + do { + ci++; + } + while ((ci < 64) && (data[RTjpeg_ZZ[ci]] == 0)); + + strm[co++] = (__s8) (63 + (ci - tmp)); + ci--; + } } - } - return (int)co; + return (int) co; } -int RTjpeg_s2b(__s16 *data, __s8 *strm, __u8 bt8, __u32 *qtbl) +int +RTjpeg_s2b (__s16 * data, __s8 * strm, __u8 bt8, __u32 * qtbl) { - int ci=1, co=1, tmp; - register int i; - - i=RTjpeg_ZZ[0]; - data[i]=((__u8)strm[0])*qtbl[i]; - - for(co=1; co<=bt8; co++) - { - i=RTjpeg_ZZ[co]; - data[i]=strm[ci++]*qtbl[i]; - } - - for(; co<64; co++) - { - if(strm[ci]>63) - { - tmp=co+strm[ci]-63; - for(; co 63) { + tmp = co + strm[ci] - 63; + for (; co < tmp; co++) + data[RTjpeg_ZZ[co]] = 0; + co--; + } else { + i = RTjpeg_ZZ[co]; + data[i] = strm[ci] * qtbl[i]; + } + ci++; } - ci++; - } - return (int)ci; + return (int) ci; } #if defined(HAVE_LIBMMX) -void RTjpeg_quant_init(void) +void +RTjpeg_quant_init (void) { - int i; - __s16 *qtbl; - - qtbl=(__s16 *)RTjpeg_lqt; - for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_lqt[i]; - - qtbl=(__s16 *)RTjpeg_cqt; - for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_cqt[i]; + int i; + __s16 *qtbl; + + qtbl = (__s16 *) RTjpeg_lqt; + for (i = 0; i < 64; i++) + qtbl[i] = (__s16) RTjpeg_lqt[i]; + + qtbl = (__s16 *) RTjpeg_cqt; + for (i = 0; i < 64; i++) + qtbl[i] = (__s16) RTjpeg_cqt[i]; } -static mmx_t RTjpeg_ones=(mmx_t)(long long)0x0001000100010001LL; -static mmx_t RTjpeg_half=(mmx_t)(long long)0x7fff7fff7fff7fffLL; +static mmx_t RTjpeg_ones = (mmx_t) (long long) 0x0001000100010001LL; +static mmx_t RTjpeg_half = (mmx_t) (long long) 0x7fff7fff7fff7fffLL; -void RTjpeg_quant(__s16 *block, __s32 *qtbl) +void +RTjpeg_quant (__s16 * block, __s32 * qtbl) { - int i; - mmx_t *bl, *ql; - - ql=(mmx_t *)qtbl; - bl=(mmx_t *)block; - - movq_m2r(RTjpeg_ones, mm6); - movq_m2r(RTjpeg_half, mm7); - - for(i=16; i; i--) - { - movq_m2r(*(ql++), mm0); /* quant vals (4) */ - movq_m2r(*bl, mm2); /* block vals (4) */ - movq_r2r(mm0, mm1); - movq_r2r(mm2, mm3); - - punpcklwd_r2r(mm6, mm0); /* 1 qb 1 qa */ - punpckhwd_r2r(mm6, mm1); /* 1 qd 1 qc */ - - punpcklwd_r2r(mm7, mm2); /* 32767 bb 32767 ba */ - punpckhwd_r2r(mm7, mm3); /* 32767 bd 32767 bc */ - - pmaddwd_r2r(mm2, mm0); /* 32767+bb*qb 32767+ba*qa */ - pmaddwd_r2r(mm3, mm1); /* 32767+bd*qd 32767+bc*qc */ - - psrad_i2r(16, mm0); - psrad_i2r(16, mm1); - - packssdw_r2r(mm1, mm0); - - movq_r2m(mm0, *(bl++)); - - } + int i; + mmx_t *bl, *ql; + + ql = (mmx_t *) qtbl; + bl = (mmx_t *) block; + + movq_m2r (RTjpeg_ones, mm6); + movq_m2r (RTjpeg_half, mm7); + + for (i = 16; i; i--) { + movq_m2r (*(ql++), mm0); /* quant vals (4) */ + movq_m2r (*bl, mm2); /* block vals (4) */ + movq_r2r (mm0, mm1); + movq_r2r (mm2, mm3); + + punpcklwd_r2r (mm6, mm0); /* 1 qb 1 qa */ + punpckhwd_r2r (mm6, mm1); /* 1 qd 1 qc */ + + punpcklwd_r2r (mm7, mm2); /* 32767 bb 32767 ba */ + punpckhwd_r2r (mm7, mm3); /* 32767 bd 32767 bc */ + + pmaddwd_r2r (mm2, mm0); /* 32767+bb*qb 32767+ba*qa */ + pmaddwd_r2r (mm3, mm1); /* 32767+bd*qd 32767+bc*qc */ + + psrad_i2r (16, mm0); + psrad_i2r (16, mm1); + + packssdw_r2r (mm1, mm0); + + movq_r2m (mm0, *(bl++)); + + } } #else -void RTjpeg_quant_init(void) +void +RTjpeg_quant_init (void) { } -void RTjpeg_quant(__s16 *block, __s32 *qtbl) +void +RTjpeg_quant (__s16 * block, __s32 * qtbl) { - int i; - - for(i=0; i<64; i++) - block[i]=(__s16)((block[i]*qtbl[i]+32767)>>16); + int i; + + for (i = 0; i < 64; i++) + block[i] = (__s16) ((block[i] * qtbl[i] + 32767) >> 16); } #endif @@ -280,36 +285,37 @@ void RTjpeg_quant(__s16 *block, __s32 *qtbl) * Perform the forward DCT on one block of samples. */ #ifdef HAVE_LIBMMX -static mmx_t RTjpeg_C4 =(mmx_t)(long long)0x2D412D412D412D41LL; -static mmx_t RTjpeg_C6 =(mmx_t)(long long)0x187E187E187E187ELL; -static mmx_t RTjpeg_C2mC6=(mmx_t)(long long)0x22A322A322A322A3LL; -static mmx_t RTjpeg_C2pC6=(mmx_t)(long long)0x539F539F539F539FLL; -static mmx_t RTjpeg_zero =(mmx_t)(long long)0x0000000000000000LL; +static mmx_t RTjpeg_C4 = (mmx_t) (long long) 0x2D412D412D412D41LL; +static mmx_t RTjpeg_C6 = (mmx_t) (long long) 0x187E187E187E187ELL; +static mmx_t RTjpeg_C2mC6 = (mmx_t) (long long) 0x22A322A322A322A3LL; +static mmx_t RTjpeg_C2pC6 = (mmx_t) (long long) 0x539F539F539F539FLL; +static mmx_t RTjpeg_zero = (mmx_t) (long long) 0x0000000000000000LL; #else -#define FIX_0_382683433 ((__s32) 98) /* FIX(0.382683433) */ -#define FIX_0_541196100 ((__s32) 139) /* FIX(0.541196100) */ -#define FIX_0_707106781 ((__s32) 181) /* FIX(0.707106781) */ -#define FIX_1_306562965 ((__s32) 334) /* FIX(1.306562965) */ +#define FIX_0_382683433 ((__s32) 98) /* FIX(0.382683433) */ +#define FIX_0_541196100 ((__s32) 139) /* FIX(0.541196100) */ +#define FIX_0_707106781 ((__s32) 181) /* FIX(0.707106781) */ +#define FIX_1_306562965 ((__s32) 334) /* FIX(1.306562965) */ #define DESCALE10(x) (__s16)( ((x)+128) >> 8) #define DESCALE20(x) (__s16)(((x)+32768) >> 16) #define D_MULTIPLY(var,const) ((__s32) ((var) * (const))) #endif -void RTjpeg_dct_init(void) +void +RTjpeg_dct_init (void) { - int i; - - for(i=0; i<64; i++) - { - RTjpeg_lqt[i]=(((__u64)RTjpeg_lqt[i]<<32)/RTjpeg_aan_tab[i]); - RTjpeg_cqt[i]=(((__u64)RTjpeg_cqt[i]<<32)/RTjpeg_aan_tab[i]); - } + int i; + + for (i = 0; i < 64; i++) { + RTjpeg_lqt[i] = (((__u64) RTjpeg_lqt[i] << 32) / RTjpeg_aan_tab[i]); + RTjpeg_cqt[i] = (((__u64) RTjpeg_cqt[i] << 32) / RTjpeg_aan_tab[i]); + } } -void RTjpeg_dctY(__u8 *idata, __s16 *odata, int rskip) +void +RTjpeg_dctY (__u8 * idata, __s16 * odata, int rskip) { #ifndef HAVE_LIBMMX __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; @@ -331,42 +337,42 @@ void RTjpeg_dctY(__u8 *idata, __s16 *odata, int rskip) tmp5 = idataptr[2] - idataptr[5]; tmp3 = idataptr[3] + idataptr[4]; tmp4 = idataptr[3] - idataptr[4]; - + tmp10 = (tmp0 + tmp3); /* phase 2 */ tmp13 = tmp0 - tmp3; tmp11 = (tmp1 + tmp2); tmp12 = tmp1 - tmp2; - - wsptr[0] = (tmp10 + tmp11)<<8; /* phase 3 */ - wsptr[4] = (tmp10 - tmp11)<<8; - - z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ - wsptr[2] = (tmp13<<8) + z1; /* phase 5 */ - wsptr[6] = (tmp13<<8) - z1; - + + wsptr[0] = (tmp10 + tmp11) << 8; /* phase 3 */ + wsptr[4] = (tmp10 - tmp11) << 8; + + z1 = D_MULTIPLY (tmp12 + tmp13, FIX_0_707106781); /* c4 */ + wsptr[2] = (tmp13 << 8) + z1; /* phase 5 */ + wsptr[6] = (tmp13 << 8) - z1; + tmp10 = tmp4 + tmp5; /* phase 2 */ tmp11 = tmp5 + tmp6; tmp12 = tmp6 + tmp7; - z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */ - z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */ - z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ - z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ + z5 = D_MULTIPLY (tmp10 - tmp12, FIX_0_382683433); /* c6 */ + z2 = D_MULTIPLY (tmp10, FIX_0_541196100) + z5; /* c2-c6 */ + z4 = D_MULTIPLY (tmp12, FIX_1_306562965) + z5; /* c2+c6 */ + z3 = D_MULTIPLY (tmp11, FIX_0_707106781); /* c4 */ - z11 = (tmp7<<8) + z3; /* phase 5 */ - z13 = (tmp7<<8) - z3; + z11 = (tmp7 << 8) + z3; /* phase 5 */ + z13 = (tmp7 << 8) - z3; wsptr[5] = z13 + z2; /* phase 6 */ wsptr[3] = z13 - z2; wsptr[1] = z11 + z4; wsptr[7] = z11 - z4; - idataptr += rskip<<3; /* advance pointer to next row */ + idataptr += rskip << 3; /* advance pointer to next row */ wsptr += 8; } wsptr = RTjpeg_ws; - odataptr=odata; + odataptr = odata; for (ctr = 7; ctr >= 0; ctr--) { tmp0 = wsptr[0] + wsptr[56]; tmp7 = wsptr[0] - wsptr[56]; @@ -376,848 +382,848 @@ void RTjpeg_dctY(__u8 *idata, __s16 *odata, int rskip) tmp5 = wsptr[16] - wsptr[40]; tmp3 = wsptr[24] + wsptr[32]; tmp4 = wsptr[24] - wsptr[32]; - + tmp10 = tmp0 + tmp3; /* phase 2 */ tmp13 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; - - odataptr[0] = DESCALE10(tmp10 + tmp11); /* phase 3 */ - odataptr[32] = DESCALE10(tmp10 - tmp11); - - z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */ - odataptr[16] = DESCALE20((tmp13<<8) + z1); /* phase 5 */ - odataptr[48] = DESCALE20((tmp13<<8) - z1); + + odataptr[0] = DESCALE10 (tmp10 + tmp11); /* phase 3 */ + odataptr[32] = DESCALE10 (tmp10 - tmp11); + + z1 = D_MULTIPLY (tmp12 + tmp13, FIX_0_707106781); /* c4 */ + odataptr[16] = DESCALE20 ((tmp13 << 8) + z1); /* phase 5 */ + odataptr[48] = DESCALE20 ((tmp13 << 8) - z1); tmp10 = tmp4 + tmp5; /* phase 2 */ tmp11 = tmp5 + tmp6; tmp12 = tmp6 + tmp7; - z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */ - z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */ - z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */ - z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */ + z5 = D_MULTIPLY (tmp10 - tmp12, FIX_0_382683433); /* c6 */ + z2 = D_MULTIPLY (tmp10, FIX_0_541196100) + z5; /* c2-c6 */ + z4 = D_MULTIPLY (tmp12, FIX_1_306562965) + z5; /* c2+c6 */ + z3 = D_MULTIPLY (tmp11, FIX_0_707106781); /* c4 */ - z11 = (tmp7<<8) + z3; /* phase 5 */ - z13 = (tmp7<<8) - z3; + z11 = (tmp7 << 8) + z3; /* phase 5 */ + z13 = (tmp7 << 8) - z3; - odataptr[40] = DESCALE20(z13 + z2); /* phase 6 */ - odataptr[24] = DESCALE20(z13 - z2); - odataptr[8] = DESCALE20(z11 + z4); - odataptr[56] = DESCALE20(z11 - z4); + odataptr[40] = DESCALE20 (z13 + z2); /* phase 6 */ + odataptr[24] = DESCALE20 (z13 - z2); + odataptr[8] = DESCALE20 (z11 + z4); + odataptr[56] = DESCALE20 (z11 - z4); odataptr++; /* advance pointer to next column */ wsptr++; } #else mmx_t tmp6, tmp7; - register mmx_t *dataptr = (mmx_t *)odata; - mmx_t *idata2 = (mmx_t *)idata; + register mmx_t *dataptr = (mmx_t *) odata; + mmx_t *idata2 = (mmx_t *) idata; + + /* first copy the input 8 bit to the destination 16 bits */ + + movq_m2r (RTjpeg_zero, mm2); + - /* first copy the input 8 bit to the destination 16 bits */ + movq_m2r (*idata2, mm0); + movq_r2r (mm0, mm1); - movq_m2r(RTjpeg_zero, mm2); + punpcklbw_r2r (mm2, mm0); + movq_r2m (mm0, *(dataptr)); + punpckhbw_r2r (mm2, mm1); + movq_r2m (mm1, *(dataptr + 1)); - movq_m2r(*idata2, mm0); - movq_r2r(mm0, mm1); + idata2 += rskip; - punpcklbw_r2r(mm2, mm0); - movq_r2m(mm0, *(dataptr)); + movq_m2r (*idata2, mm0); + movq_r2r (mm0, mm1); - punpckhbw_r2r(mm2, mm1); - movq_r2m(mm1, *(dataptr+1)); - - idata2 += rskip; + punpcklbw_r2r (mm2, mm0); + movq_r2m (mm0, *(dataptr + 2)); - movq_m2r(*idata2, mm0); - movq_r2r(mm0, mm1); + punpckhbw_r2r (mm2, mm1); + movq_r2m (mm1, *(dataptr + 3)); - punpcklbw_r2r(mm2, mm0); - movq_r2m(mm0, *(dataptr+2)); + idata2 += rskip; - punpckhbw_r2r(mm2, mm1); - movq_r2m(mm1, *(dataptr+3)); - - idata2 += rskip; + movq_m2r (*idata2, mm0); + movq_r2r (mm0, mm1); - movq_m2r(*idata2, mm0); - movq_r2r(mm0, mm1); + punpcklbw_r2r (mm2, mm0); + movq_r2m (mm0, *(dataptr + 4)); - punpcklbw_r2r(mm2, mm0); - movq_r2m(mm0, *(dataptr+4)); + punpckhbw_r2r (mm2, mm1); + movq_r2m (mm1, *(dataptr + 5)); - punpckhbw_r2r(mm2, mm1); - movq_r2m(mm1, *(dataptr+5)); - - idata2 += rskip; + idata2 += rskip; - movq_m2r(*idata2, mm0); - movq_r2r(mm0, mm1); + movq_m2r (*idata2, mm0); + movq_r2r (mm0, mm1); - punpcklbw_r2r(mm2, mm0); - movq_r2m(mm0, *(dataptr+6)); + punpcklbw_r2r (mm2, mm0); + movq_r2m (mm0, *(dataptr + 6)); - punpckhbw_r2r(mm2, mm1); - movq_r2m(mm1, *(dataptr+7)); - - idata2 += rskip; + punpckhbw_r2r (mm2, mm1); + movq_r2m (mm1, *(dataptr + 7)); - movq_m2r(*idata2, mm0); - movq_r2r(mm0, mm1); + idata2 += rskip; - punpcklbw_r2r(mm2, mm0); - movq_r2m(mm0, *(dataptr+8)); + movq_m2r (*idata2, mm0); + movq_r2r (mm0, mm1); - punpckhbw_r2r(mm2, mm1); - movq_r2m(mm1, *(dataptr+9)); - - idata2 += rskip; + punpcklbw_r2r (mm2, mm0); + movq_r2m (mm0, *(dataptr + 8)); - movq_m2r(*idata2, mm0); - movq_r2r(mm0, mm1); + punpckhbw_r2r (mm2, mm1); + movq_r2m (mm1, *(dataptr + 9)); - punpcklbw_r2r(mm2, mm0); - movq_r2m(mm0, *(dataptr+10)); + idata2 += rskip; - punpckhbw_r2r(mm2, mm1); - movq_r2m(mm1, *(dataptr+11)); - - idata2 += rskip; + movq_m2r (*idata2, mm0); + movq_r2r (mm0, mm1); - movq_m2r(*idata2, mm0); - movq_r2r(mm0, mm1); + punpcklbw_r2r (mm2, mm0); + movq_r2m (mm0, *(dataptr + 10)); - punpcklbw_r2r(mm2, mm0); - movq_r2m(mm0, *(dataptr+12)); + punpckhbw_r2r (mm2, mm1); + movq_r2m (mm1, *(dataptr + 11)); - punpckhbw_r2r(mm2, mm1); - movq_r2m(mm1, *(dataptr+13)); - - idata2 += rskip; + idata2 += rskip; - movq_m2r(*idata2, mm0); - movq_r2r(mm0, mm1); + movq_m2r (*idata2, mm0); + movq_r2r (mm0, mm1); - punpcklbw_r2r(mm2, mm0); - movq_r2m(mm0, *(dataptr+14)); + punpcklbw_r2r (mm2, mm0); + movq_r2m (mm0, *(dataptr + 12)); - punpckhbw_r2r(mm2, mm1); - movq_r2m(mm1, *(dataptr+15)); + punpckhbw_r2r (mm2, mm1); + movq_r2m (mm1, *(dataptr + 13)); + + idata2 += rskip; + + movq_m2r (*idata2, mm0); + movq_r2r (mm0, mm1); + + punpcklbw_r2r (mm2, mm0); + movq_r2m (mm0, *(dataptr + 14)); + + punpckhbw_r2r (mm2, mm1); + movq_r2m (mm1, *(dataptr + 15)); /* Start Transpose to do calculations on rows */ - movq_m2r(*(dataptr+9), mm7); /* m03:m02|m01:m00 - first line (line 4)and copy into m5 */ + movq_m2r (*(dataptr + 9), mm7); /* m03:m02|m01:m00 - first line (line 4)and copy into m5 */ - movq_m2r(*(dataptr+13), mm6); /* m23:m22|m21:m20 - third line (line 6)and copy into m2 */ - movq_r2r(mm7, mm5); + movq_m2r (*(dataptr + 13), mm6); /* m23:m22|m21:m20 - third line (line 6)and copy into m2 */ + movq_r2r (mm7, mm5); - punpcklwd_m2r(*(dataptr+11), mm7); /* m11:m01|m10:m00 - interleave first and second lines */ - movq_r2r(mm6, mm2); + punpcklwd_m2r (*(dataptr + 11), mm7); /* m11:m01|m10:m00 - interleave first and second lines */ + movq_r2r (mm6, mm2); - punpcklwd_m2r(*(dataptr+15), mm6); /* m31:m21|m30:m20 - interleave third and fourth lines */ - movq_r2r(mm7, mm1); + punpcklwd_m2r (*(dataptr + 15), mm6); /* m31:m21|m30:m20 - interleave third and fourth lines */ + movq_r2r (mm7, mm1); - movq_m2r(*(dataptr+11), mm3); /* m13:m13|m11:m10 - second line */ - punpckldq_r2r(mm6, mm7); /* m30:m20|m10:m00 - interleave to produce result 1 */ + movq_m2r (*(dataptr + 11), mm3); /* m13:m13|m11:m10 - second line */ + punpckldq_r2r (mm6, mm7); /* m30:m20|m10:m00 - interleave to produce result 1 */ - movq_m2r(*(dataptr+15), mm0); /* m13:m13|m11:m10 - fourth line */ - punpckhdq_r2r(mm6, mm1); /* m31:m21|m11:m01 - interleave to produce result 2 */ + movq_m2r (*(dataptr + 15), mm0); /* m13:m13|m11:m10 - fourth line */ + punpckhdq_r2r (mm6, mm1); /* m31:m21|m11:m01 - interleave to produce result 2 */ - movq_r2m(mm7,*(dataptr+9)); /* write result 1 */ - punpckhwd_r2r(mm3, mm5); /* m13:m03|m12:m02 - interleave first and second lines */ - - movq_r2m(mm1,*(dataptr+11)); /* write result 2 */ - punpckhwd_r2r(mm0, mm2); /* m33:m23|m32:m22 - interleave third and fourth lines */ + movq_r2m (mm7, *(dataptr + 9)); /* write result 1 */ + punpckhwd_r2r (mm3, mm5); /* m13:m03|m12:m02 - interleave first and second lines */ - movq_r2r(mm5, mm1); - punpckldq_r2r(mm2, mm5); /* m32:m22|m12:m02 - interleave to produce result 3 */ + movq_r2m (mm1, *(dataptr + 11)); /* write result 2 */ + punpckhwd_r2r (mm0, mm2); /* m33:m23|m32:m22 - interleave third and fourth lines */ - movq_m2r(*(dataptr+1), mm0); /* m03:m02|m01:m00 - first line, 4x4 */ - punpckhdq_r2r(mm2, mm1); /* m33:m23|m13:m03 - interleave to produce result 4 */ + movq_r2r (mm5, mm1); + punpckldq_r2r (mm2, mm5); /* m32:m22|m12:m02 - interleave to produce result 3 */ - movq_r2m(mm5,*(dataptr+13)); /* write result 3 */ + movq_m2r (*(dataptr + 1), mm0); /* m03:m02|m01:m00 - first line, 4x4 */ + punpckhdq_r2r (mm2, mm1); /* m33:m23|m13:m03 - interleave to produce result 4 */ - /* last 4x4 done */ + movq_r2m (mm5, *(dataptr + 13)); /* write result 3 */ - movq_r2m(mm1, *(dataptr+15)); /* write result 4, last 4x4 */ + /* last 4x4 done */ - movq_m2r(*(dataptr+5), mm2); /* m23:m22|m21:m20 - third line */ - movq_r2r(mm0, mm6); + movq_r2m (mm1, *(dataptr + 15)); /* write result 4, last 4x4 */ - punpcklwd_m2r(*(dataptr+3), mm0); /* m11:m01|m10:m00 - interleave first and second lines */ - movq_r2r(mm2, mm7); + movq_m2r (*(dataptr + 5), mm2); /* m23:m22|m21:m20 - third line */ + movq_r2r (mm0, mm6); - punpcklwd_m2r(*(dataptr+7), mm2); /* m31:m21|m30:m20 - interleave third and fourth lines */ - movq_r2r(mm0, mm4); + punpcklwd_m2r (*(dataptr + 3), mm0); /* m11:m01|m10:m00 - interleave first and second lines */ + movq_r2r (mm2, mm7); - - movq_m2r(*(dataptr+8), mm1); /* n03:n02|n01:n00 - first line */ - punpckldq_r2r(mm2, mm0); /* m30:m20|m10:m00 - interleave to produce first result */ + punpcklwd_m2r (*(dataptr + 7), mm2); /* m31:m21|m30:m20 - interleave third and fourth lines */ + movq_r2r (mm0, mm4); - movq_m2r(*(dataptr+12), mm3); /* n23:n22|n21:n20 - third line */ - punpckhdq_r2r(mm2, mm4); /* m31:m21|m11:m01 - interleave to produce second result */ - punpckhwd_m2r(*(dataptr+3), mm6); /* m13:m03|m12:m02 - interleave first and second lines */ - movq_r2r(mm1, mm2); /* copy first line */ + movq_m2r (*(dataptr + 8), mm1); /* n03:n02|n01:n00 - first line */ + punpckldq_r2r (mm2, mm0); /* m30:m20|m10:m00 - interleave to produce first result */ - punpckhwd_m2r(*(dataptr+7), mm7); /* m33:m23|m32:m22 - interleave third and fourth lines */ - movq_r2r(mm6, mm5); /* copy first intermediate result */ + movq_m2r (*(dataptr + 12), mm3); /* n23:n22|n21:n20 - third line */ + punpckhdq_r2r (mm2, mm4); /* m31:m21|m11:m01 - interleave to produce second result */ - movq_r2m(mm0, *(dataptr+8)); /* write result 1 */ - punpckhdq_r2r(mm7, mm5); /* m33:m23|m13:m03 - produce third result */ + punpckhwd_m2r (*(dataptr + 3), mm6); /* m13:m03|m12:m02 - interleave first and second lines */ + movq_r2r (mm1, mm2); /* copy first line */ - punpcklwd_m2r(*(dataptr+10), mm1); /* n11:n01|n10:n00 - interleave first and second lines */ - movq_r2r(mm3, mm0); /* copy third line */ + punpckhwd_m2r (*(dataptr + 7), mm7); /* m33:m23|m32:m22 - interleave third and fourth lines */ + movq_r2r (mm6, mm5); /* copy first intermediate result */ - punpckhwd_m2r(*(dataptr+10), mm2); /* n13:n03|n12:n02 - interleave first and second lines */ + movq_r2m (mm0, *(dataptr + 8)); /* write result 1 */ + punpckhdq_r2r (mm7, mm5); /* m33:m23|m13:m03 - produce third result */ - movq_r2m(mm4, *(dataptr+10)); /* write result 2 out */ - punpckldq_r2r(mm7, mm6); /* m32:m22|m12:m02 - produce fourth result */ + punpcklwd_m2r (*(dataptr + 10), mm1); /* n11:n01|n10:n00 - interleave first and second lines */ + movq_r2r (mm3, mm0); /* copy third line */ - punpcklwd_m2r(*(dataptr+14), mm3); /* n31:n21|n30:n20 - interleave third and fourth lines */ - movq_r2r(mm1, mm4); + punpckhwd_m2r (*(dataptr + 10), mm2); /* n13:n03|n12:n02 - interleave first and second lines */ - movq_r2m(mm6, *(dataptr+12)); /* write result 3 out */ - punpckldq_r2r(mm3, mm1); /* n30:n20|n10:n00 - produce first result */ + movq_r2m (mm4, *(dataptr + 10)); /* write result 2 out */ + punpckldq_r2r (mm7, mm6); /* m32:m22|m12:m02 - produce fourth result */ - punpckhwd_m2r(*(dataptr+14), mm0); /* n33:n23|n32:n22 - interleave third and fourth lines */ - movq_r2r(mm2, mm6); + punpcklwd_m2r (*(dataptr + 14), mm3); /* n31:n21|n30:n20 - interleave third and fourth lines */ + movq_r2r (mm1, mm4); - movq_r2m(mm5, *(dataptr+14)); /* write result 4 out */ - punpckhdq_r2r(mm3, mm4); /* n31:n21|n11:n01- produce second result */ + movq_r2m (mm6, *(dataptr + 12)); /* write result 3 out */ + punpckldq_r2r (mm3, mm1); /* n30:n20|n10:n00 - produce first result */ - movq_r2m(mm1, *(dataptr+1)); /* write result 5 out - (first result for other 4 x 4 block) */ - punpckldq_r2r(mm0, mm2); /* n32:n22|n12:n02- produce third result */ + punpckhwd_m2r (*(dataptr + 14), mm0); /* n33:n23|n32:n22 - interleave third and fourth lines */ + movq_r2r (mm2, mm6); - movq_r2m(mm4, *(dataptr+3)); /* write result 6 out */ - punpckhdq_r2r(mm0, mm6); /* n33:n23|n13:n03 - produce fourth result */ + movq_r2m (mm5, *(dataptr + 14)); /* write result 4 out */ + punpckhdq_r2r (mm3, mm4); /* n31:n21|n11:n01- produce second result */ - movq_r2m(mm2, *(dataptr+5)); /* write result 7 out*/ + movq_r2m (mm1, *(dataptr + 1)); /* write result 5 out - (first result for other 4 x 4 block) */ + punpckldq_r2r (mm0, mm2); /* n32:n22|n12:n02- produce third result */ - movq_m2r(*dataptr, mm0); /* m03:m02|m01:m00 - first line, first 4x4 */ + movq_r2m (mm4, *(dataptr + 3)); /* write result 6 out */ + punpckhdq_r2r (mm0, mm6); /* n33:n23|n13:n03 - produce fourth result */ - movq_r2m(mm6, *(dataptr+7)); /* write result 8 out */ + movq_r2m (mm2, *(dataptr + 5)); /* write result 7 out */ + + movq_m2r (*dataptr, mm0); /* m03:m02|m01:m00 - first line, first 4x4 */ + + movq_r2m (mm6, *(dataptr + 7)); /* write result 8 out */ /* Do first 4x4 quadrant, which is used in the beginning of the DCT: */ - movq_m2r(*(dataptr+4), mm7); /* m23:m22|m21:m20 - third line */ - movq_r2r(mm0, mm2); + movq_m2r (*(dataptr + 4), mm7); /* m23:m22|m21:m20 - third line */ + movq_r2r (mm0, mm2); - punpcklwd_m2r(*(dataptr+2), mm0); /* m11:m01|m10:m00 - interleave first and second lines */ - movq_r2r(mm7, mm4); + punpcklwd_m2r (*(dataptr + 2), mm0); /* m11:m01|m10:m00 - interleave first and second lines */ + movq_r2r (mm7, mm4); - punpcklwd_m2r(*(dataptr+6), mm7); /* m31:m21|m30:m20 - interleave third and fourth lines */ - movq_r2r(mm0, mm1); + punpcklwd_m2r (*(dataptr + 6), mm7); /* m31:m21|m30:m20 - interleave third and fourth lines */ + movq_r2r (mm0, mm1); - movq_m2r(*(dataptr+2), mm6); /* m13:m12|m11:m10 - second line */ - punpckldq_r2r(mm7, mm0); /* m30:m20|m10:m00 - interleave to produce result 1 */ + movq_m2r (*(dataptr + 2), mm6); /* m13:m12|m11:m10 - second line */ + punpckldq_r2r (mm7, mm0); /* m30:m20|m10:m00 - interleave to produce result 1 */ - movq_m2r(*(dataptr+6), mm5); /* m33:m32|m31:m30 - fourth line */ - punpckhdq_r2r(mm7, mm1); /* m31:m21|m11:m01 - interleave to produce result 2 */ + movq_m2r (*(dataptr + 6), mm5); /* m33:m32|m31:m30 - fourth line */ + punpckhdq_r2r (mm7, mm1); /* m31:m21|m11:m01 - interleave to produce result 2 */ - movq_r2r(mm0, mm7); /* write result 1 */ - punpckhwd_r2r(mm6, mm2); /* m13:m03|m12:m02 - interleave first and second lines */ + movq_r2r (mm0, mm7); /* write result 1 */ + punpckhwd_r2r (mm6, mm2); /* m13:m03|m12:m02 - interleave first and second lines */ - psubw_m2r(*(dataptr+14), mm7); /* tmp07=x0-x7: Stage 1 */ - movq_r2r(mm1, mm6); /* write result 2 */ + psubw_m2r (*(dataptr + 14), mm7); /* tmp07=x0-x7: Stage 1 */ + movq_r2r (mm1, mm6); /* write result 2 */ - paddw_m2r(*(dataptr+14), mm0); /* tmp00=x0+x7: Stage 1 */ - punpckhwd_r2r(mm5, mm4); /* m33:m23|m32:m22 - interleave third and fourth lines */ + paddw_m2r (*(dataptr + 14), mm0); /* tmp00=x0+x7: Stage 1 */ + punpckhwd_r2r (mm5, mm4); /* m33:m23|m32:m22 - interleave third and fourth lines */ - paddw_m2r(*(dataptr+12), mm1); /* tmp01=x1+x6: Stage 1 */ - movq_r2r(mm2, mm3); /* copy first intermediate result */ + paddw_m2r (*(dataptr + 12), mm1); /* tmp01=x1+x6: Stage 1 */ + movq_r2r (mm2, mm3); /* copy first intermediate result */ - psubw_m2r(*(dataptr+12), mm6); /* tmp06=x1-x6: Stage 1 */ - punpckldq_r2r(mm4, mm2); /* m32:m22|m12:m02 - interleave to produce result 3 */ + psubw_m2r (*(dataptr + 12), mm6); /* tmp06=x1-x6: Stage 1 */ + punpckldq_r2r (mm4, mm2); /* m32:m22|m12:m02 - interleave to produce result 3 */ - movq_r2m(mm7, tmp7); - movq_r2r(mm2, mm5); /* write result 3 */ + movq_r2m (mm7, tmp7); + movq_r2r (mm2, mm5); /* write result 3 */ - movq_r2m(mm6, tmp6); - punpckhdq_r2r(mm4, mm3); /* m33:m23|m13:m03 - interleave to produce result 4 */ + movq_r2m (mm6, tmp6); + punpckhdq_r2r (mm4, mm3); /* m33:m23|m13:m03 - interleave to produce result 4 */ - paddw_m2r(*(dataptr+10), mm2); /* tmp02=x2+5: Stage 1 */ - movq_r2r(mm3, mm4); /* write result 4 */ + paddw_m2r (*(dataptr + 10), mm2); /* tmp02=x2+5: Stage 1 */ + movq_r2r (mm3, mm4); /* write result 4 */ /************************************************************************************************ End of Transpose ************************************************************************************************/ - paddw_m2r(*(dataptr+8), mm3); /* tmp03=x3+x4: stage 1 */ - movq_r2r(mm0, mm7); + paddw_m2r (*(dataptr + 8), mm3); /* tmp03=x3+x4: stage 1 */ + movq_r2r (mm0, mm7); - psubw_m2r(*(dataptr+8), mm4); /* tmp04=x3-x4: stage 1 */ - movq_r2r(mm1, mm6); + psubw_m2r (*(dataptr + 8), mm4); /* tmp04=x3-x4: stage 1 */ + movq_r2r (mm1, mm6); - paddw_r2r(mm3, mm0); /* tmp10 = tmp00 + tmp03: even 2 */ - psubw_r2r(mm3, mm7); /* tmp13 = tmp00 - tmp03: even 2 */ + paddw_r2r (mm3, mm0); /* tmp10 = tmp00 + tmp03: even 2 */ + psubw_r2r (mm3, mm7); /* tmp13 = tmp00 - tmp03: even 2 */ - psubw_r2r(mm2, mm6); /* tmp12 = tmp01 - tmp02: even 2 */ - paddw_r2r(mm2, mm1); /* tmp11 = tmp01 + tmp02: even 2 */ + psubw_r2r (mm2, mm6); /* tmp12 = tmp01 - tmp02: even 2 */ + paddw_r2r (mm2, mm1); /* tmp11 = tmp01 + tmp02: even 2 */ - psubw_m2r(*(dataptr+10), mm5); /* tmp05=x2-x5: stage 1 */ - paddw_r2r(mm7, mm6); /* tmp12 + tmp13 */ + psubw_m2r (*(dataptr + 10), mm5); /* tmp05=x2-x5: stage 1 */ + paddw_r2r (mm7, mm6); /* tmp12 + tmp13 */ - /* stage 3 */ + /* stage 3 */ - movq_m2r(tmp6, mm2); - movq_r2r(mm0, mm3); + movq_m2r (tmp6, mm2); + movq_r2r (mm0, mm3); - psllw_i2r(2, mm6); /* m8 * 2^2 */ - paddw_r2r(mm1, mm0); + psllw_i2r (2, mm6); /* m8 * 2^2 */ + paddw_r2r (mm1, mm0); - pmulhw_m2r(RTjpeg_C4, mm6); /* z1 */ - psubw_r2r(mm1, mm3); + pmulhw_m2r (RTjpeg_C4, mm6); /* z1 */ + psubw_r2r (mm1, mm3); - movq_r2m(mm0, *dataptr); - movq_r2r(mm7, mm0); - - /* Odd part */ - movq_r2m(mm3, *(dataptr+8)); - paddw_r2r(mm5, mm4); /* tmp10 */ + movq_r2m (mm0, *dataptr); + movq_r2r (mm7, mm0); - movq_m2r(tmp7, mm3); - paddw_r2r(mm6, mm0); /* tmp32 */ + /* Odd part */ + movq_r2m (mm3, *(dataptr + 8)); + paddw_r2r (mm5, mm4); /* tmp10 */ - paddw_r2r(mm2, mm5); /* tmp11 */ - psubw_r2r(mm6, mm7); /* tmp33 */ + movq_m2r (tmp7, mm3); + paddw_r2r (mm6, mm0); /* tmp32 */ - movq_r2m(mm0, *(dataptr+4)); - paddw_r2r(mm3, mm2); /* tmp12 */ + paddw_r2r (mm2, mm5); /* tmp11 */ + psubw_r2r (mm6, mm7); /* tmp33 */ - /* stage 4 */ + movq_r2m (mm0, *(dataptr + 4)); + paddw_r2r (mm3, mm2); /* tmp12 */ - movq_r2m(mm7, *(dataptr+12)); - movq_r2r(mm4, mm1); /* copy of tmp10 */ + /* stage 4 */ - psubw_r2r(mm2, mm1); /* tmp10 - tmp12 */ - psllw_i2r(2, mm4); /* m8 * 2^2 */ + movq_r2m (mm7, *(dataptr + 12)); + movq_r2r (mm4, mm1); /* copy of tmp10 */ - movq_m2r(RTjpeg_C2mC6, mm0); - psllw_i2r(2, mm1); + psubw_r2r (mm2, mm1); /* tmp10 - tmp12 */ + psllw_i2r (2, mm4); /* m8 * 2^2 */ - pmulhw_m2r(RTjpeg_C6, mm1); /* z5 */ - psllw_i2r(2, mm2); + movq_m2r (RTjpeg_C2mC6, mm0); + psllw_i2r (2, mm1); - pmulhw_r2r(mm0, mm4); /* z5 */ + pmulhw_m2r (RTjpeg_C6, mm1); /* z5 */ + psllw_i2r (2, mm2); - /* stage 5 */ + pmulhw_r2r (mm0, mm4); /* z5 */ - pmulhw_m2r(RTjpeg_C2pC6, mm2); - psllw_i2r(2, mm5); + /* stage 5 */ - pmulhw_m2r(RTjpeg_C4, mm5); /* z3 */ - movq_r2r(mm3, mm0); /* copy tmp7 */ + pmulhw_m2r (RTjpeg_C2pC6, mm2); + psllw_i2r (2, mm5); - movq_m2r(*(dataptr+1), mm7); - paddw_r2r(mm1, mm4); /* z2 */ + pmulhw_m2r (RTjpeg_C4, mm5); /* z3 */ + movq_r2r (mm3, mm0); /* copy tmp7 */ - paddw_r2r(mm1, mm2); /* z4 */ + movq_m2r (*(dataptr + 1), mm7); + paddw_r2r (mm1, mm4); /* z2 */ - paddw_r2r(mm5, mm0); /* z11 */ - psubw_r2r(mm5, mm3); /* z13 */ + paddw_r2r (mm1, mm2); /* z4 */ - /* stage 6 */ + paddw_r2r (mm5, mm0); /* z11 */ + psubw_r2r (mm5, mm3); /* z13 */ - movq_r2r(mm3, mm5); /* copy z13 */ - psubw_r2r(mm4, mm3); /* y3=z13 - z2 */ + /* stage 6 */ - paddw_r2r(mm4, mm5); /* y5=z13 + z2 */ - movq_r2r(mm0, mm6); /* copy z11 */ + movq_r2r (mm3, mm5); /* copy z13 */ + psubw_r2r (mm4, mm3); /* y3=z13 - z2 */ - movq_r2m(mm3, *(dataptr+6)); /*save y3 */ - psubw_r2r(mm2, mm0); /* y7=z11 - z4 */ + paddw_r2r (mm4, mm5); /* y5=z13 + z2 */ + movq_r2r (mm0, mm6); /* copy z11 */ - movq_r2m(mm5, *(dataptr+10)); /*save y5 */ - paddw_r2r(mm2, mm6); /* y1=z11 + z4 */ + movq_r2m (mm3, *(dataptr + 6)); /*save y3 */ + psubw_r2r (mm2, mm0); /* y7=z11 - z4 */ - movq_r2m(mm0, *(dataptr+14)); /*save y7 */ + movq_r2m (mm5, *(dataptr + 10)); /*save y5 */ + paddw_r2r (mm2, mm6); /* y1=z11 + z4 */ + + movq_r2m (mm0, *(dataptr + 14)); /*save y7 */ /************************************************ * End of 1st 4 rows ************************************************/ - movq_m2r(*(dataptr+3), mm1); /* load x1: stage 1 */ - movq_r2r(mm7, mm0); /* copy x0 */ + movq_m2r (*(dataptr + 3), mm1); /* load x1: stage 1 */ + movq_r2r (mm7, mm0); /* copy x0 */ + + movq_r2m (mm6, *(dataptr + 2)); /*save y1 */ - movq_r2m(mm6, *(dataptr+2)); /*save y1 */ + movq_m2r (*(dataptr + 5), mm2); /* load x2: stage 1 */ + movq_r2r (mm1, mm6); /* copy x1 */ - movq_m2r(*(dataptr+5), mm2); /* load x2: stage 1 */ - movq_r2r(mm1, mm6); /* copy x1 */ + paddw_m2r (*(dataptr + 15), mm0); /* tmp00 = x0 + x7 */ - paddw_m2r(*(dataptr+15), mm0); /* tmp00 = x0 + x7 */ + movq_m2r (*(dataptr + 7), mm3); /* load x3 : stage 1 */ + movq_r2r (mm2, mm5); /* copy x2 */ - movq_m2r(*(dataptr+7), mm3); /* load x3 : stage 1 */ - movq_r2r(mm2, mm5); /* copy x2 */ + psubw_m2r (*(dataptr + 15), mm7); /* tmp07 = x0 - x7 */ + movq_r2r (mm3, mm4); /* copy x3 */ - psubw_m2r(*(dataptr+15), mm7); /* tmp07 = x0 - x7 */ - movq_r2r(mm3, mm4); /* copy x3 */ + paddw_m2r (*(dataptr + 13), mm1); /* tmp01 = x1 + x6 */ - paddw_m2r(*(dataptr+13), mm1); /* tmp01 = x1 + x6 */ + movq_r2m (mm7, tmp7); /* save tmp07 */ + movq_r2r (mm0, mm7); /* copy tmp00 */ - movq_r2m(mm7, tmp7); /* save tmp07 */ - movq_r2r(mm0, mm7); /* copy tmp00 */ + psubw_m2r (*(dataptr + 13), mm6); /* tmp06 = x1 - x6 */ - psubw_m2r(*(dataptr+13), mm6); /* tmp06 = x1 - x6 */ + /* stage 2, Even Part */ - /* stage 2, Even Part */ + paddw_m2r (*(dataptr + 9), mm3); /* tmp03 = x3 + x4 */ - paddw_m2r(*(dataptr+9), mm3); /* tmp03 = x3 + x4 */ + movq_r2m (mm6, tmp6); /* save tmp07 */ + movq_r2r (mm1, mm6); /* copy tmp01 */ - movq_r2m(mm6, tmp6); /* save tmp07 */ - movq_r2r(mm1, mm6); /* copy tmp01 */ + paddw_m2r (*(dataptr + 11), mm2); /* tmp02 = x2 + x5 */ + paddw_r2r (mm3, mm0); /* tmp10 = tmp00 + tmp03 */ - paddw_m2r(*(dataptr+11), mm2); /* tmp02 = x2 + x5 */ - paddw_r2r(mm3, mm0); /* tmp10 = tmp00 + tmp03 */ + psubw_r2r (mm3, mm7); /* tmp13 = tmp00 - tmp03 */ - psubw_r2r(mm3, mm7); /* tmp13 = tmp00 - tmp03 */ + psubw_m2r (*(dataptr + 9), mm4); /* tmp04 = x3 - x4 */ + psubw_r2r (mm2, mm6); /* tmp12 = tmp01 - tmp02 */ - psubw_m2r(*(dataptr+9), mm4); /* tmp04 = x3 - x4 */ - psubw_r2r(mm2, mm6); /* tmp12 = tmp01 - tmp02 */ + paddw_r2r (mm2, mm1); /* tmp11 = tmp01 + tmp02 */ - paddw_r2r(mm2, mm1); /* tmp11 = tmp01 + tmp02 */ + psubw_m2r (*(dataptr + 11), mm5); /* tmp05 = x2 - x5 */ + paddw_r2r (mm7, mm6); /* tmp12 + tmp13 */ - psubw_m2r(*(dataptr+11), mm5); /* tmp05 = x2 - x5 */ - paddw_r2r(mm7, mm6); /* tmp12 + tmp13 */ + /* stage 3, Even and stage 4 & 5 even */ - /* stage 3, Even and stage 4 & 5 even */ + movq_m2r (tmp6, mm2); /* load tmp6 */ + movq_r2r (mm0, mm3); /* copy tmp10 */ - movq_m2r(tmp6, mm2); /* load tmp6 */ - movq_r2r(mm0, mm3); /* copy tmp10 */ + psllw_i2r (2, mm6); /* shift z1 */ + paddw_r2r (mm1, mm0); /* y0=tmp10 + tmp11 */ - psllw_i2r(2, mm6); /* shift z1 */ - paddw_r2r(mm1, mm0); /* y0=tmp10 + tmp11 */ + pmulhw_m2r (RTjpeg_C4, mm6); /* z1 */ + psubw_r2r (mm1, mm3); /* y4=tmp10 - tmp11 */ - pmulhw_m2r(RTjpeg_C4, mm6); /* z1 */ - psubw_r2r(mm1, mm3); /* y4=tmp10 - tmp11 */ + movq_r2m (mm0, *(dataptr + 1)); /*save y0 */ + movq_r2r (mm7, mm0); /* copy tmp13 */ - movq_r2m(mm0, *(dataptr+1)); /*save y0 */ - movq_r2r(mm7, mm0); /* copy tmp13 */ - - /* odd part */ + /* odd part */ - movq_r2m(mm3, *(dataptr+9)); /*save y4 */ - paddw_r2r(mm5, mm4); /* tmp10 = tmp4 + tmp5 */ + movq_r2m (mm3, *(dataptr + 9)); /*save y4 */ + paddw_r2r (mm5, mm4); /* tmp10 = tmp4 + tmp5 */ - movq_m2r(tmp7, mm3); /* load tmp7 */ - paddw_r2r(mm6, mm0); /* tmp32 = tmp13 + z1 */ + movq_m2r (tmp7, mm3); /* load tmp7 */ + paddw_r2r (mm6, mm0); /* tmp32 = tmp13 + z1 */ - paddw_r2r(mm2, mm5); /* tmp11 = tmp5 + tmp6 */ - psubw_r2r(mm6, mm7); /* tmp33 = tmp13 - z1 */ + paddw_r2r (mm2, mm5); /* tmp11 = tmp5 + tmp6 */ + psubw_r2r (mm6, mm7); /* tmp33 = tmp13 - z1 */ - movq_r2m(mm0, *(dataptr+5)); /*save y2 */ - paddw_r2r(mm3, mm2); /* tmp12 = tmp6 + tmp7 */ + movq_r2m (mm0, *(dataptr + 5)); /*save y2 */ + paddw_r2r (mm3, mm2); /* tmp12 = tmp6 + tmp7 */ - /* stage 4 */ + /* stage 4 */ - movq_r2m(mm7, *(dataptr+13)); /*save y6 */ - movq_r2r(mm4, mm1); /* copy tmp10 */ + movq_r2m (mm7, *(dataptr + 13)); /*save y6 */ + movq_r2r (mm4, mm1); /* copy tmp10 */ - psubw_r2r(mm2, mm1); /* tmp10 - tmp12 */ - psllw_i2r(2, mm4); /* shift tmp10 */ + psubw_r2r (mm2, mm1); /* tmp10 - tmp12 */ + psllw_i2r (2, mm4); /* shift tmp10 */ - movq_m2r(RTjpeg_C2mC6, mm0); /* load C2mC6 */ - psllw_i2r(2, mm1); /* shift (tmp10-tmp12) */ + movq_m2r (RTjpeg_C2mC6, mm0); /* load C2mC6 */ + psllw_i2r (2, mm1); /* shift (tmp10-tmp12) */ - pmulhw_m2r(RTjpeg_C6, mm1); /* z5 */ - psllw_i2r(2, mm5); /* prepare for multiply */ + pmulhw_m2r (RTjpeg_C6, mm1); /* z5 */ + psllw_i2r (2, mm5); /* prepare for multiply */ - pmulhw_r2r(mm0, mm4); /* multiply by converted real */ + pmulhw_r2r (mm0, mm4); /* multiply by converted real */ - /* stage 5 */ + /* stage 5 */ - pmulhw_m2r(RTjpeg_C4, mm5); /* z3 */ - psllw_i2r(2, mm2); /* prepare for multiply */ + pmulhw_m2r (RTjpeg_C4, mm5); /* z3 */ + psllw_i2r (2, mm2); /* prepare for multiply */ - pmulhw_m2r(RTjpeg_C2pC6, mm2); /* multiply */ - movq_r2r(mm3, mm0); /* copy tmp7 */ + pmulhw_m2r (RTjpeg_C2pC6, mm2); /* multiply */ + movq_r2r (mm3, mm0); /* copy tmp7 */ - movq_m2r(*(dataptr+9), mm7); /* m03:m02|m01:m00 - first line (line 4)and copy into mm7 */ - paddw_r2r(mm1, mm4); /* z2 */ + movq_m2r (*(dataptr + 9), mm7); /* m03:m02|m01:m00 - first line (line 4)and copy into mm7 */ + paddw_r2r (mm1, mm4); /* z2 */ - paddw_r2r(mm5, mm0); /* z11 */ - psubw_r2r(mm5, mm3); /* z13 */ + paddw_r2r (mm5, mm0); /* z11 */ + psubw_r2r (mm5, mm3); /* z13 */ - /* stage 6 */ + /* stage 6 */ - movq_r2r(mm3, mm5); /* copy z13 */ - paddw_r2r(mm1, mm2); /* z4 */ + movq_r2r (mm3, mm5); /* copy z13 */ + paddw_r2r (mm1, mm2); /* z4 */ - movq_r2r(mm0, mm6); /* copy z11 */ - psubw_r2r(mm4, mm5); /* y3 */ + movq_r2r (mm0, mm6); /* copy z11 */ + psubw_r2r (mm4, mm5); /* y3 */ - paddw_r2r(mm2, mm6); /* y1 */ - paddw_r2r(mm4, mm3); /* y5 */ + paddw_r2r (mm2, mm6); /* y1 */ + paddw_r2r (mm4, mm3); /* y5 */ - movq_r2m(mm5, *(dataptr+7)); /*save y3 */ + movq_r2m (mm5, *(dataptr + 7)); /*save y3 */ + + movq_r2m (mm6, *(dataptr + 3)); /*save y1 */ + psubw_r2r (mm2, mm0); /* y7 */ - movq_r2m(mm6, *(dataptr+3)); /*save y1 */ - psubw_r2r(mm2, mm0); /* y7 */ - /************************************************************************************************ Start of Transpose ************************************************************************************************/ - movq_m2r(*(dataptr+13), mm6); /* m23:m22|m21:m20 - third line (line 6)and copy into m2 */ - movq_r2r(mm7, mm5); /* copy first line */ + movq_m2r (*(dataptr + 13), mm6); /* m23:m22|m21:m20 - third line (line 6)and copy into m2 */ + movq_r2r (mm7, mm5); /* copy first line */ - punpcklwd_r2r(mm3, mm7); /* m11:m01|m10:m00 - interleave first and second lines */ - movq_r2r(mm6, mm2); /* copy third line */ + punpcklwd_r2r (mm3, mm7); /* m11:m01|m10:m00 - interleave first and second lines */ + movq_r2r (mm6, mm2); /* copy third line */ - punpcklwd_r2r(mm0, mm6); /* m31:m21|m30:m20 - interleave third and fourth lines */ - movq_r2r(mm7, mm1); /* copy first intermediate result */ + punpcklwd_r2r (mm0, mm6); /* m31:m21|m30:m20 - interleave third and fourth lines */ + movq_r2r (mm7, mm1); /* copy first intermediate result */ - punpckldq_r2r(mm6, mm7); /* m30:m20|m10:m00 - interleave to produce result 1 */ + punpckldq_r2r (mm6, mm7); /* m30:m20|m10:m00 - interleave to produce result 1 */ - punpckhdq_r2r(mm6, mm1); /* m31:m21|m11:m01 - interleave to produce result 2 */ + punpckhdq_r2r (mm6, mm1); /* m31:m21|m11:m01 - interleave to produce result 2 */ - movq_r2m(mm7, *(dataptr+9)); /* write result 1 */ - punpckhwd_r2r(mm3, mm5); /* m13:m03|m12:m02 - interleave first and second lines */ + movq_r2m (mm7, *(dataptr + 9)); /* write result 1 */ + punpckhwd_r2r (mm3, mm5); /* m13:m03|m12:m02 - interleave first and second lines */ - movq_r2m(mm1, *(dataptr+11)); /* write result 2 */ - punpckhwd_r2r(mm0, mm2); /* m33:m23|m32:m22 - interleave third and fourth lines */ + movq_r2m (mm1, *(dataptr + 11)); /* write result 2 */ + punpckhwd_r2r (mm0, mm2); /* m33:m23|m32:m22 - interleave third and fourth lines */ - movq_r2r(mm5, mm1); /* copy first intermediate result */ - punpckldq_r2r(mm2, mm5); /* m32:m22|m12:m02 - interleave to produce result 3 */ + movq_r2r (mm5, mm1); /* copy first intermediate result */ + punpckldq_r2r (mm2, mm5); /* m32:m22|m12:m02 - interleave to produce result 3 */ - movq_m2r(*(dataptr+1), mm0); /* m03:m02|m01:m00 - first line, 4x4 */ - punpckhdq_r2r(mm2, mm1); /* m33:m23|m13:m03 - interleave to produce result 4 */ + movq_m2r (*(dataptr + 1), mm0); /* m03:m02|m01:m00 - first line, 4x4 */ + punpckhdq_r2r (mm2, mm1); /* m33:m23|m13:m03 - interleave to produce result 4 */ - movq_r2m(mm5, *(dataptr+13)); /* write result 3 */ + movq_r2m (mm5, *(dataptr + 13)); /* write result 3 */ /****** last 4x4 done */ - movq_r2m(mm1, *(dataptr+15)); /* write result 4, last 4x4 */ + movq_r2m (mm1, *(dataptr + 15)); /* write result 4, last 4x4 */ + + movq_m2r (*(dataptr + 5), mm2); /* m23:m22|m21:m20 - third line */ + movq_r2r (mm0, mm6); /* copy first line */ - movq_m2r(*(dataptr+5), mm2); /* m23:m22|m21:m20 - third line */ - movq_r2r(mm0, mm6); /* copy first line */ + punpcklwd_m2r (*(dataptr + 3), mm0); /* m11:m01|m10:m00 - interleave first and second lines */ + movq_r2r (mm2, mm7); /* copy third line */ - punpcklwd_m2r(*(dataptr+3), mm0); /* m11:m01|m10:m00 - interleave first and second lines */ - movq_r2r(mm2, mm7); /* copy third line */ + punpcklwd_m2r (*(dataptr + 7), mm2); /* m31:m21|m30:m20 - interleave third and fourth lines */ + movq_r2r (mm0, mm4); /* copy first intermediate result */ - punpcklwd_m2r(*(dataptr+7), mm2); /* m31:m21|m30:m20 - interleave third and fourth lines */ - movq_r2r(mm0, mm4); /* copy first intermediate result */ - - movq_m2r(*(dataptr+8), mm1); /* n03:n02|n01:n00 - first line */ - punpckldq_r2r(mm2, mm0); /* m30:m20|m10:m00 - interleave to produce first result */ + movq_m2r (*(dataptr + 8), mm1); /* n03:n02|n01:n00 - first line */ + punpckldq_r2r (mm2, mm0); /* m30:m20|m10:m00 - interleave to produce first result */ - movq_m2r(*(dataptr+12), mm3); /* n23:n22|n21:n20 - third line */ - punpckhdq_r2r(mm2, mm4); /* m31:m21|m11:m01 - interleave to produce second result */ + movq_m2r (*(dataptr + 12), mm3); /* n23:n22|n21:n20 - third line */ + punpckhdq_r2r (mm2, mm4); /* m31:m21|m11:m01 - interleave to produce second result */ - punpckhwd_m2r(*(dataptr+3), mm6); /* m13:m03|m12:m02 - interleave first and second lines */ - movq_r2r(mm1, mm2); /* copy first line */ + punpckhwd_m2r (*(dataptr + 3), mm6); /* m13:m03|m12:m02 - interleave first and second lines */ + movq_r2r (mm1, mm2); /* copy first line */ - punpckhwd_m2r(*(dataptr+7), mm7); /* m33:m23|m32:m22 - interleave third and fourth lines */ - movq_r2r(mm6, mm5); /* copy first intermediate result */ + punpckhwd_m2r (*(dataptr + 7), mm7); /* m33:m23|m32:m22 - interleave third and fourth lines */ + movq_r2r (mm6, mm5); /* copy first intermediate result */ - movq_r2m(mm0, *(dataptr+8)); /* write result 1 */ - punpckhdq_r2r(mm7, mm5); /* m33:m23|m13:m03 - produce third result */ + movq_r2m (mm0, *(dataptr + 8)); /* write result 1 */ + punpckhdq_r2r (mm7, mm5); /* m33:m23|m13:m03 - produce third result */ - punpcklwd_m2r(*(dataptr+10), mm1); /* n11:n01|n10:n00 - interleave first and second lines */ - movq_r2r(mm3, mm0); /* copy third line */ + punpcklwd_m2r (*(dataptr + 10), mm1); /* n11:n01|n10:n00 - interleave first and second lines */ + movq_r2r (mm3, mm0); /* copy third line */ - punpckhwd_m2r(*(dataptr+10), mm2); /* n13:n03|n12:n02 - interleave first and second lines */ + punpckhwd_m2r (*(dataptr + 10), mm2); /* n13:n03|n12:n02 - interleave first and second lines */ - movq_r2m(mm4, *(dataptr+10)); /* write result 2 out */ - punpckldq_r2r(mm7, mm6); /* m32:m22|m12:m02 - produce fourth result */ + movq_r2m (mm4, *(dataptr + 10)); /* write result 2 out */ + punpckldq_r2r (mm7, mm6); /* m32:m22|m12:m02 - produce fourth result */ - punpcklwd_m2r(*(dataptr+14), mm3); /* n33:n23|n32:n22 - interleave third and fourth lines */ - movq_r2r(mm1, mm4); /* copy second intermediate result */ + punpcklwd_m2r (*(dataptr + 14), mm3); /* n33:n23|n32:n22 - interleave third and fourth lines */ + movq_r2r (mm1, mm4); /* copy second intermediate result */ - movq_r2m(mm6, *(dataptr+12)); /* write result 3 out */ - punpckldq_r2r(mm3, mm1); /* */ + movq_r2m (mm6, *(dataptr + 12)); /* write result 3 out */ + punpckldq_r2r (mm3, mm1); /* */ - punpckhwd_m2r(*(dataptr+14), mm0); /* n33:n23|n32:n22 - interleave third and fourth lines */ - movq_r2r(mm2, mm6); /* copy second intermediate result */ + punpckhwd_m2r (*(dataptr + 14), mm0); /* n33:n23|n32:n22 - interleave third and fourth lines */ + movq_r2r (mm2, mm6); /* copy second intermediate result */ - movq_r2m(mm5, *(dataptr+14)); /* write result 4 out */ - punpckhdq_r2r(mm3, mm4); /* n31:n21|n11:n01- produce second result */ + movq_r2m (mm5, *(dataptr + 14)); /* write result 4 out */ + punpckhdq_r2r (mm3, mm4); /* n31:n21|n11:n01- produce second result */ - movq_r2m(mm1, *(dataptr+1)); /* write result 5 out - (first result for other 4 x 4 block) */ - punpckldq_r2r(mm0, mm2); /* n32:n22|n12:n02- produce third result */ + movq_r2m (mm1, *(dataptr + 1)); /* write result 5 out - (first result for other 4 x 4 block) */ + punpckldq_r2r (mm0, mm2); /* n32:n22|n12:n02- produce third result */ - movq_r2m(mm4, *(dataptr+3)); /* write result 6 out */ - punpckhdq_r2r(mm0, mm6); /* n33:n23|n13:n03 - produce fourth result */ + movq_r2m (mm4, *(dataptr + 3)); /* write result 6 out */ + punpckhdq_r2r (mm0, mm6); /* n33:n23|n13:n03 - produce fourth result */ - movq_r2m(mm2, *(dataptr+5)); /* write result 7 out */ + movq_r2m (mm2, *(dataptr + 5)); /* write result 7 out */ - movq_m2r(*dataptr, mm0); /* m03:m02|m01:m00 - first line, first 4x4 */ + movq_m2r (*dataptr, mm0); /* m03:m02|m01:m00 - first line, first 4x4 */ - movq_r2m(mm6, *(dataptr+7)); /* write result 8 out */ + movq_r2m (mm6, *(dataptr + 7)); /* write result 8 out */ /* Do first 4x4 quadrant, which is used in the beginning of the DCT: */ - movq_m2r(*(dataptr+4), mm7); /* m23:m22|m21:m20 - third line */ - movq_r2r(mm0, mm2); /* copy first line */ + movq_m2r (*(dataptr + 4), mm7); /* m23:m22|m21:m20 - third line */ + movq_r2r (mm0, mm2); /* copy first line */ + + punpcklwd_m2r (*(dataptr + 2), mm0); /* m11:m01|m10:m00 - interleave first and second lines */ + movq_r2r (mm7, mm4); /* copy third line */ - punpcklwd_m2r(*(dataptr+2), mm0); /* m11:m01|m10:m00 - interleave first and second lines */ - movq_r2r(mm7, mm4); /* copy third line */ - - punpcklwd_m2r(*(dataptr+6), mm7); /* m31:m21|m30:m20 - interleave third and fourth lines */ - movq_r2r(mm0, mm1); /* copy first intermediate result */ + punpcklwd_m2r (*(dataptr + 6), mm7); /* m31:m21|m30:m20 - interleave third and fourth lines */ + movq_r2r (mm0, mm1); /* copy first intermediate result */ - movq_m2r(*(dataptr+2), mm6); /* m13:m12|m11:m10 - second line */ - punpckldq_r2r(mm7, mm0); /* m30:m20|m10:m00 - interleave to produce result 1 */ + movq_m2r (*(dataptr + 2), mm6); /* m13:m12|m11:m10 - second line */ + punpckldq_r2r (mm7, mm0); /* m30:m20|m10:m00 - interleave to produce result 1 */ - movq_m2r(*(dataptr+6), mm5); /* m33:m32|m31:m30 - fourth line */ - punpckhdq_r2r(mm7, mm1); /* m31:m21|m11:m01 - interleave to produce result 2 */ + movq_m2r (*(dataptr + 6), mm5); /* m33:m32|m31:m30 - fourth line */ + punpckhdq_r2r (mm7, mm1); /* m31:m21|m11:m01 - interleave to produce result 2 */ - movq_r2r(mm0, mm7); /* write result 1 */ - punpckhwd_r2r(mm6, mm2); /* m13:m03|m12:m02 - interleave first and second lines */ + movq_r2r (mm0, mm7); /* write result 1 */ + punpckhwd_r2r (mm6, mm2); /* m13:m03|m12:m02 - interleave first and second lines */ - psubw_m2r(*(dataptr+14), mm7); /* tmp07=x0-x7: Stage 1 */ - movq_r2r(mm1, mm6); /* write result 2 */ + psubw_m2r (*(dataptr + 14), mm7); /* tmp07=x0-x7: Stage 1 */ + movq_r2r (mm1, mm6); /* write result 2 */ - paddw_m2r(*(dataptr+14), mm0); /* tmp00=x0+x7: Stage 1 */ - punpckhwd_r2r(mm5, mm4); /* m33:m23|m32:m22 - interleave third and fourth lines */ + paddw_m2r (*(dataptr + 14), mm0); /* tmp00=x0+x7: Stage 1 */ + punpckhwd_r2r (mm5, mm4); /* m33:m23|m32:m22 - interleave third and fourth lines */ - paddw_m2r(*(dataptr+12), mm1); /* tmp01=x1+x6: Stage 1 */ - movq_r2r(mm2, mm3); /* copy first intermediate result */ + paddw_m2r (*(dataptr + 12), mm1); /* tmp01=x1+x6: Stage 1 */ + movq_r2r (mm2, mm3); /* copy first intermediate result */ - psubw_m2r(*(dataptr+12), mm6); /* tmp06=x1-x6: Stage 1 */ - punpckldq_r2r(mm4, mm2); /* m32:m22|m12:m02 - interleave to produce result 3 */ + psubw_m2r (*(dataptr + 12), mm6); /* tmp06=x1-x6: Stage 1 */ + punpckldq_r2r (mm4, mm2); /* m32:m22|m12:m02 - interleave to produce result 3 */ - movq_r2m(mm7, tmp7); /* save tmp07 */ - movq_r2r(mm2, mm5); /* write result 3 */ + movq_r2m (mm7, tmp7); /* save tmp07 */ + movq_r2r (mm2, mm5); /* write result 3 */ - movq_r2m(mm6, tmp6); /* save tmp06 */ + movq_r2m (mm6, tmp6); /* save tmp06 */ - punpckhdq_r2r(mm4, mm3); /* m33:m23|m13:m03 - interleave to produce result 4 */ + punpckhdq_r2r (mm4, mm3); /* m33:m23|m13:m03 - interleave to produce result 4 */ - paddw_m2r(*(dataptr+10), mm2); /* tmp02=x2+x5: stage 1 */ - movq_r2r(mm3, mm4); /* write result 4 */ + paddw_m2r (*(dataptr + 10), mm2); /* tmp02=x2+x5: stage 1 */ + movq_r2r (mm3, mm4); /* write result 4 */ /************************************************************************************************ End of Transpose 2 ************************************************************************************************/ - paddw_m2r(*(dataptr+8), mm3); /* tmp03=x3+x4: stage 1 */ - movq_r2r(mm0, mm7); + paddw_m2r (*(dataptr + 8), mm3); /* tmp03=x3+x4: stage 1 */ + movq_r2r (mm0, mm7); - psubw_m2r(*(dataptr+8), mm4); /* tmp04=x3-x4: stage 1 */ - movq_r2r(mm1, mm6); + psubw_m2r (*(dataptr + 8), mm4); /* tmp04=x3-x4: stage 1 */ + movq_r2r (mm1, mm6); - paddw_r2r(mm3, mm0); /* tmp10 = tmp00 + tmp03: even 2 */ - psubw_r2r(mm3, mm7); /* tmp13 = tmp00 - tmp03: even 2 */ + paddw_r2r (mm3, mm0); /* tmp10 = tmp00 + tmp03: even 2 */ + psubw_r2r (mm3, mm7); /* tmp13 = tmp00 - tmp03: even 2 */ - psubw_r2r(mm2, mm6); /* tmp12 = tmp01 - tmp02: even 2 */ - paddw_r2r(mm2, mm1); /* tmp11 = tmp01 + tmp02: even 2 */ + psubw_r2r (mm2, mm6); /* tmp12 = tmp01 - tmp02: even 2 */ + paddw_r2r (mm2, mm1); /* tmp11 = tmp01 + tmp02: even 2 */ - psubw_m2r(*(dataptr+10), mm5); /* tmp05=x2-x5: stage 1 */ - paddw_r2r(mm7, mm6); /* tmp12 + tmp13 */ + psubw_m2r (*(dataptr + 10), mm5); /* tmp05=x2-x5: stage 1 */ + paddw_r2r (mm7, mm6); /* tmp12 + tmp13 */ - /* stage 3 */ + /* stage 3 */ - movq_m2r(tmp6, mm2); - movq_r2r(mm0, mm3); + movq_m2r (tmp6, mm2); + movq_r2r (mm0, mm3); - psllw_i2r(2, mm6); /* m8 * 2^2 */ - paddw_r2r(mm1, mm0); + psllw_i2r (2, mm6); /* m8 * 2^2 */ + paddw_r2r (mm1, mm0); - pmulhw_m2r(RTjpeg_C4, mm6); /* z1 */ - psubw_r2r(mm1, mm3); + pmulhw_m2r (RTjpeg_C4, mm6); /* z1 */ + psubw_r2r (mm1, mm3); - movq_r2m(mm0, *dataptr); - movq_r2r(mm7, mm0); - - /* Odd part */ - movq_r2m(mm3, *(dataptr+8)); - paddw_r2r(mm5, mm4); /* tmp10 */ + movq_r2m (mm0, *dataptr); + movq_r2r (mm7, mm0); - movq_m2r(tmp7, mm3); - paddw_r2r(mm6, mm0); /* tmp32 */ + /* Odd part */ + movq_r2m (mm3, *(dataptr + 8)); + paddw_r2r (mm5, mm4); /* tmp10 */ - paddw_r2r(mm2, mm5); /* tmp11 */ - psubw_r2r(mm6, mm7); /* tmp33 */ + movq_m2r (tmp7, mm3); + paddw_r2r (mm6, mm0); /* tmp32 */ - movq_r2m(mm0, *(dataptr+4)); - paddw_r2r(mm3, mm2); /* tmp12 */ + paddw_r2r (mm2, mm5); /* tmp11 */ + psubw_r2r (mm6, mm7); /* tmp33 */ - /* stage 4 */ - movq_r2m(mm7, *(dataptr+12)); - movq_r2r(mm4, mm1); /* copy of tmp10 */ + movq_r2m (mm0, *(dataptr + 4)); + paddw_r2r (mm3, mm2); /* tmp12 */ - psubw_r2r(mm2, mm1); /* tmp10 - tmp12 */ - psllw_i2r(2, mm4); /* m8 * 2^2 */ + /* stage 4 */ + movq_r2m (mm7, *(dataptr + 12)); + movq_r2r (mm4, mm1); /* copy of tmp10 */ - movq_m2r(RTjpeg_C2mC6, mm0); - psllw_i2r(2, mm1); + psubw_r2r (mm2, mm1); /* tmp10 - tmp12 */ + psllw_i2r (2, mm4); /* m8 * 2^2 */ - pmulhw_m2r(RTjpeg_C6, mm1); /* z5 */ - psllw_i2r(2, mm2); + movq_m2r (RTjpeg_C2mC6, mm0); + psllw_i2r (2, mm1); - pmulhw_r2r(mm0, mm4); /* z5 */ + pmulhw_m2r (RTjpeg_C6, mm1); /* z5 */ + psllw_i2r (2, mm2); - /* stage 5 */ + pmulhw_r2r (mm0, mm4); /* z5 */ - pmulhw_m2r(RTjpeg_C2pC6, mm2); - psllw_i2r(2, mm5); + /* stage 5 */ - pmulhw_m2r(RTjpeg_C4, mm5); /* z3 */ - movq_r2r(mm3, mm0); /* copy tmp7 */ + pmulhw_m2r (RTjpeg_C2pC6, mm2); + psllw_i2r (2, mm5); - movq_m2r(*(dataptr+1), mm7); - paddw_r2r(mm1, mm4); /* z2 */ + pmulhw_m2r (RTjpeg_C4, mm5); /* z3 */ + movq_r2r (mm3, mm0); /* copy tmp7 */ - paddw_r2r(mm1, mm2); /* z4 */ + movq_m2r (*(dataptr + 1), mm7); + paddw_r2r (mm1, mm4); /* z2 */ - paddw_r2r(mm5, mm0); /* z11 */ - psubw_r2r(mm5, mm3); /* z13 */ + paddw_r2r (mm1, mm2); /* z4 */ - /* stage 6 */ + paddw_r2r (mm5, mm0); /* z11 */ + psubw_r2r (mm5, mm3); /* z13 */ - movq_r2r(mm3, mm5); /* copy z13 */ - psubw_r2r(mm4, mm3); /* y3=z13 - z2 */ + /* stage 6 */ - paddw_r2r(mm4, mm5); /* y5=z13 + z2 */ - movq_r2r(mm0, mm6); /* copy z11 */ + movq_r2r (mm3, mm5); /* copy z13 */ + psubw_r2r (mm4, mm3); /* y3=z13 - z2 */ - movq_r2m(mm3, *(dataptr+6)); /*save y3 */ - psubw_r2r(mm2, mm0); /* y7=z11 - z4 */ + paddw_r2r (mm4, mm5); /* y5=z13 + z2 */ + movq_r2r (mm0, mm6); /* copy z11 */ - movq_r2m(mm5, *(dataptr+10)); /*save y5 */ - paddw_r2r(mm2, mm6); /* y1=z11 + z4 */ + movq_r2m (mm3, *(dataptr + 6)); /*save y3 */ + psubw_r2r (mm2, mm0); /* y7=z11 - z4 */ - movq_r2m(mm0, *(dataptr+14)); /*save y7 */ + movq_r2m (mm5, *(dataptr + 10)); /*save y5 */ + paddw_r2r (mm2, mm6); /* y1=z11 + z4 */ + + movq_r2m (mm0, *(dataptr + 14)); /*save y7 */ /************************************************ * End of 1st 4 rows ************************************************/ - movq_m2r(*(dataptr+3), mm1); /* load x1 : stage 1 */ - movq_r2r(mm7, mm0); /* copy x0 */ + movq_m2r (*(dataptr + 3), mm1); /* load x1 : stage 1 */ + movq_r2r (mm7, mm0); /* copy x0 */ + + movq_r2m (mm6, *(dataptr + 2)); /*save y1 */ - movq_r2m(mm6, *(dataptr+2)); /*save y1 */ + movq_m2r (*(dataptr + 5), mm2); /* load x2 : stage 1 */ + movq_r2r (mm1, mm6); /* copy x1 */ - movq_m2r(*(dataptr+5), mm2); /* load x2 : stage 1 */ - movq_r2r(mm1, mm6); /* copy x1 */ + paddw_m2r (*(dataptr + 15), mm0); /* tmp00 = x0 + x7 */ - paddw_m2r(*(dataptr+15), mm0); /* tmp00 = x0 + x7 */ + movq_m2r (*(dataptr + 7), mm3); /* load x3 : stage 1 */ + movq_r2r (mm2, mm5); /* copy x2 */ - movq_m2r(*(dataptr+7), mm3); /* load x3 : stage 1 */ - movq_r2r(mm2, mm5); /* copy x2 */ + psubw_m2r (*(dataptr + 15), mm7); /* tmp07 = x0 - x7 */ + movq_r2r (mm3, mm4); /* copy x3 */ - psubw_m2r(*(dataptr+15), mm7); /* tmp07 = x0 - x7 */ - movq_r2r(mm3, mm4); /* copy x3 */ + paddw_m2r (*(dataptr + 13), mm1); /* tmp01 = x1 + x6 */ - paddw_m2r(*(dataptr+13), mm1); /* tmp01 = x1 + x6 */ + movq_r2m (mm7, tmp7); /* save tmp07 */ + movq_r2r (mm0, mm7); /* copy tmp00 */ - movq_r2m(mm7, tmp7); /* save tmp07 */ - movq_r2r(mm0, mm7); /* copy tmp00 */ + psubw_m2r (*(dataptr + 13), mm6); /* tmp06 = x1 - x6 */ - psubw_m2r(*(dataptr+13), mm6); /* tmp06 = x1 - x6 */ + /* stage 2, Even Part */ - /* stage 2, Even Part */ + paddw_m2r (*(dataptr + 9), mm3); /* tmp03 = x3 + x4 */ - paddw_m2r(*(dataptr+9), mm3); /* tmp03 = x3 + x4 */ + movq_r2m (mm6, tmp6); /* save tmp07 */ + movq_r2r (mm1, mm6); /* copy tmp01 */ - movq_r2m(mm6, tmp6); /* save tmp07 */ - movq_r2r(mm1, mm6); /* copy tmp01 */ + paddw_m2r (*(dataptr + 11), mm2); /* tmp02 = x2 + x5 */ + paddw_r2r (mm3, mm0); /* tmp10 = tmp00 + tmp03 */ - paddw_m2r(*(dataptr+11), mm2); /* tmp02 = x2 + x5 */ - paddw_r2r(mm3, mm0); /* tmp10 = tmp00 + tmp03 */ + psubw_r2r (mm3, mm7); /* tmp13 = tmp00 - tmp03 */ - psubw_r2r(mm3, mm7); /* tmp13 = tmp00 - tmp03 */ + psubw_m2r (*(dataptr + 9), mm4); /* tmp04 = x3 - x4 */ + psubw_r2r (mm2, mm6); /* tmp12 = tmp01 - tmp02 */ - psubw_m2r(*(dataptr+9), mm4); /* tmp04 = x3 - x4 */ - psubw_r2r(mm2, mm6); /* tmp12 = tmp01 - tmp02 */ + paddw_r2r (mm2, mm1); /* tmp11 = tmp01 + tmp02 */ - paddw_r2r(mm2, mm1); /* tmp11 = tmp01 + tmp02 */ + psubw_m2r (*(dataptr + 11), mm5); /* tmp05 = x2 - x5 */ + paddw_r2r (mm7, mm6); /* tmp12 + tmp13 */ - psubw_m2r(*(dataptr+11), mm5); /* tmp05 = x2 - x5 */ - paddw_r2r(mm7, mm6); /* tmp12 + tmp13 */ + /* stage 3, Even and stage 4 & 5 even */ - /* stage 3, Even and stage 4 & 5 even */ + movq_m2r (tmp6, mm2); /* load tmp6 */ + movq_r2r (mm0, mm3); /* copy tmp10 */ - movq_m2r(tmp6, mm2); /* load tmp6 */ - movq_r2r(mm0, mm3); /* copy tmp10 */ + psllw_i2r (2, mm6); /* shift z1 */ + paddw_r2r (mm1, mm0); /* y0=tmp10 + tmp11 */ - psllw_i2r(2, mm6); /* shift z1 */ - paddw_r2r(mm1, mm0); /* y0=tmp10 + tmp11 */ + pmulhw_m2r (RTjpeg_C4, mm6); /* z1 */ + psubw_r2r (mm1, mm3); /* y4=tmp10 - tmp11 */ - pmulhw_m2r(RTjpeg_C4, mm6); /* z1 */ - psubw_r2r(mm1, mm3); /* y4=tmp10 - tmp11 */ + movq_r2m (mm0, *(dataptr + 1)); /*save y0 */ + movq_r2r (mm7, mm0); /* copy tmp13 */ - movq_r2m(mm0, *(dataptr+1)); /*save y0 */ - movq_r2r(mm7, mm0); /* copy tmp13 */ - - /* odd part */ + /* odd part */ - movq_r2m(mm3, *(dataptr+9)); /*save y4 */ - paddw_r2r(mm5, mm4); /* tmp10 = tmp4 + tmp5 */ + movq_r2m (mm3, *(dataptr + 9)); /*save y4 */ + paddw_r2r (mm5, mm4); /* tmp10 = tmp4 + tmp5 */ - movq_m2r(tmp7, mm3); /* load tmp7 */ - paddw_r2r(mm6, mm0); /* tmp32 = tmp13 + z1 */ + movq_m2r (tmp7, mm3); /* load tmp7 */ + paddw_r2r (mm6, mm0); /* tmp32 = tmp13 + z1 */ - paddw_r2r(mm2, mm5); /* tmp11 = tmp5 + tmp6 */ - psubw_r2r(mm6, mm7); /* tmp33 = tmp13 - z1 */ + paddw_r2r (mm2, mm5); /* tmp11 = tmp5 + tmp6 */ + psubw_r2r (mm6, mm7); /* tmp33 = tmp13 - z1 */ - movq_r2m(mm0, *(dataptr+5)); /*save y2 */ - paddw_r2r(mm3, mm2); /* tmp12 = tmp6 + tmp7 */ + movq_r2m (mm0, *(dataptr + 5)); /*save y2 */ + paddw_r2r (mm3, mm2); /* tmp12 = tmp6 + tmp7 */ - /* stage 4 */ + /* stage 4 */ - movq_r2m(mm7, *(dataptr+13)); /*save y6 */ - movq_r2r(mm4, mm1); /* copy tmp10 */ + movq_r2m (mm7, *(dataptr + 13)); /*save y6 */ + movq_r2r (mm4, mm1); /* copy tmp10 */ - psubw_r2r(mm2, mm1); /* tmp10 - tmp12 */ - psllw_i2r(2, mm4); /* shift tmp10 */ + psubw_r2r (mm2, mm1); /* tmp10 - tmp12 */ + psllw_i2r (2, mm4); /* shift tmp10 */ - movq_m2r(RTjpeg_C2mC6, mm0); /* load C2mC6 */ - psllw_i2r(2, mm1); /* shift (tmp10-tmp12) */ + movq_m2r (RTjpeg_C2mC6, mm0); /* load C2mC6 */ + psllw_i2r (2, mm1); /* shift (tmp10-tmp12) */ - pmulhw_m2r(RTjpeg_C6, mm1); /* z5 */ - psllw_i2r(2, mm5); /* prepare for multiply */ + pmulhw_m2r (RTjpeg_C6, mm1); /* z5 */ + psllw_i2r (2, mm5); /* prepare for multiply */ - pmulhw_r2r(mm0, mm4); /* multiply by converted real */ + pmulhw_r2r (mm0, mm4); /* multiply by converted real */ - /* stage 5 */ + /* stage 5 */ - pmulhw_m2r(RTjpeg_C4, mm5); /* z3 */ - psllw_i2r(2, mm2); /* prepare for multiply */ + pmulhw_m2r (RTjpeg_C4, mm5); /* z3 */ + psllw_i2r (2, mm2); /* prepare for multiply */ - pmulhw_m2r(RTjpeg_C2pC6, mm2); /* multiply */ - movq_r2r(mm3, mm0); /* copy tmp7 */ + pmulhw_m2r (RTjpeg_C2pC6, mm2); /* multiply */ + movq_r2r (mm3, mm0); /* copy tmp7 */ - movq_m2r(*(dataptr+9), mm7); /* m03:m02|m01:m00 - first line (line 4)and copy into mm7 */ - paddw_r2r(mm1, mm4); /* z2 */ + movq_m2r (*(dataptr + 9), mm7); /* m03:m02|m01:m00 - first line (line 4)and copy into mm7 */ + paddw_r2r (mm1, mm4); /* z2 */ - paddw_r2r(mm5, mm0); /* z11 */ - psubw_r2r(mm5, mm3); /* z13 */ + paddw_r2r (mm5, mm0); /* z11 */ + psubw_r2r (mm5, mm3); /* z13 */ - /* stage 6 */ + /* stage 6 */ - movq_r2r(mm3, mm5); /* copy z13 */ - paddw_r2r(mm1, mm2); /* z4 */ + movq_r2r (mm3, mm5); /* copy z13 */ + paddw_r2r (mm1, mm2); /* z4 */ - movq_r2r(mm0, mm6); /* copy z11 */ - psubw_r2r(mm4, mm5); /* y3 */ + movq_r2r (mm0, mm6); /* copy z11 */ + psubw_r2r (mm4, mm5); /* y3 */ - paddw_r2r(mm2, mm6); /* y1 */ - paddw_r2r(mm4, mm3); /* y5 */ + paddw_r2r (mm2, mm6); /* y1 */ + paddw_r2r (mm4, mm3); /* y5 */ - movq_r2m(mm5, *(dataptr+7)); /*save y3 */ - psubw_r2r(mm2, mm0); /* yè=z11 - z4 */ + movq_r2m (mm5, *(dataptr + 7)); /*save y3 */ + psubw_r2r (mm2, mm0); /* yè=z11 - z4 */ - movq_r2m(mm3, *(dataptr+11)); /*save y5 */ + movq_r2m (mm3, *(dataptr + 11)); /*save y5 */ - movq_r2m(mm6, *(dataptr+3)); /*save y1 */ + movq_r2m (mm6, *(dataptr + 3)); /*save y1 */ + + movq_r2m (mm0, *(dataptr + 15)); /*save y7 */ - movq_r2m(mm0, *(dataptr+15)); /*save y7 */ - #endif } -#define FIX_1_082392200 ((__s32) 277) /* FIX(1.082392200) */ -#define FIX_1_414213562 ((__s32) 362) /* FIX(1.414213562) */ -#define FIX_1_847759065 ((__s32) 473) /* FIX(1.847759065) */ -#define FIX_2_613125930 ((__s32) 669) /* FIX(2.613125930) */ +#define FIX_1_082392200 ((__s32) 277) /* FIX(1.082392200) */ +#define FIX_1_414213562 ((__s32) 362) /* FIX(1.414213562) */ +#define FIX_1_847759065 ((__s32) 473) /* FIX(1.847759065) */ +#define FIX_2_613125930 ((__s32) 669) /* FIX(2.613125930) */ #define DESCALE(x) (__s16)( ((x)+4) >> 3) @@ -1226,284 +1232,283 @@ void RTjpeg_dctY(__u8 *idata, __s16 *odata, int rskip) #define RL(x) ((x)>235) ? 235 : (((x)<16) ? 16 : (x)) #define MULTIPLY(var,const) (((__s32) ((var) * (const)) + 128)>>8) -void RTjpeg_idct_init(void) +void +RTjpeg_idct_init (void) { - int i; - - for(i=0; i<64; i++) - { - RTjpeg_liqt[i]=((__u64)RTjpeg_liqt[i]*RTjpeg_aan_tab[i])>>32; - RTjpeg_ciqt[i]=((__u64)RTjpeg_ciqt[i]*RTjpeg_aan_tab[i])>>32; - } + int i; + + for (i = 0; i < 64; i++) { + RTjpeg_liqt[i] = ((__u64) RTjpeg_liqt[i] * RTjpeg_aan_tab[i]) >> 32; + RTjpeg_ciqt[i] = ((__u64) RTjpeg_ciqt[i] * RTjpeg_aan_tab[i]) >> 32; + } } -void RTjpeg_idct(__u8 *odata, __s16 *data, int rskip) +void +RTjpeg_idct (__u8 * odata, __s16 * data, int rskip) { #ifdef HAVE_LIBMMX -static mmx_t fix_141 = (mmx_t)(long long)0x5a825a825a825a82LL; -static mmx_t fix_184n261 = (mmx_t)(long long)0xcf04cf04cf04cf04LL; -static mmx_t fix_184 = (mmx_t)(long long)0x7641764176417641LL; -static mmx_t fix_n184 = (mmx_t)(long long)0x896f896f896f896fLL; -static mmx_t fix_108n184 = (mmx_t)(long long)0xcf04cf04cf04cf04LL; + static mmx_t fix_141 = (mmx_t) (long long) 0x5a825a825a825a82LL; + static mmx_t fix_184n261 = (mmx_t) (long long) 0xcf04cf04cf04cf04LL; + static mmx_t fix_184 = (mmx_t) (long long) 0x7641764176417641LL; + static mmx_t fix_n184 = (mmx_t) (long long) 0x896f896f896f896fLL; + static mmx_t fix_108n184 = (mmx_t) (long long) 0xcf04cf04cf04cf04LL; mmx_t workspace[64]; mmx_t *wsptr = workspace; - register mmx_t *dataptr = (mmx_t *)odata; - mmx_t *idata = (mmx_t *)data; + register mmx_t *dataptr = (mmx_t *) odata; + mmx_t *idata = (mmx_t *) data; - rskip = rskip>>3; + rskip = rskip >> 3; /* * Perform inverse DCT on one block of coefficients. */ - /* Odd part */ + /* Odd part */ + + movq_m2r (*(idata + 10), mm1); /* load idata[DCTSIZE*5] */ - movq_m2r(*(idata+10), mm1); /* load idata[DCTSIZE*5] */ + movq_m2r (*(idata + 6), mm0); /* load idata[DCTSIZE*3] */ - movq_m2r(*(idata+6), mm0); /* load idata[DCTSIZE*3] */ + movq_m2r (*(idata + 2), mm3); /* load idata[DCTSIZE*1] */ - movq_m2r(*(idata+2), mm3); /* load idata[DCTSIZE*1] */ + movq_r2r (mm1, mm2); /* copy tmp6 : phase 6 */ + */movq_m2r (*(idata + 14), mm4); /* load idata[DCTSIZE*7] */ - movq_r2r(mm1, mm2); /* copy tmp6 : phase 6 */ */ + paddw_r2r (mm0, mm1); /* z13 = tmp6 + tmp5; */ - movq_m2r(*(idata+14), mm4); /* load idata[DCTSIZE*7] */ + psubw_r2r (mm0, mm2); /* z10 = tmp6 - tmp5 */ - paddw_r2r(mm0, mm1); /* z13 = tmp6 + tmp5; */ + psllw_i2r (2, mm2); /* shift z10 */ + movq_r2r (mm2, mm0); /* copy z10 */ - psubw_r2r(mm0, mm2); /* z10 = tmp6 - tmp5 */ + pmulhw_m2r (fix_184n261, mm2); /* MULTIPLY( z12, FIX_1_847759065); : 2*c2 */ + movq_r2r (mm3, mm5); /* copy tmp4 */ - psllw_i2r(2, mm2); /* shift z10 */ - movq_r2r(mm2, mm0); /* copy z10 */ + pmulhw_m2r (fix_n184, mm0); /* MULTIPLY(z10, -FIX_1_847759065); : 2*c2 */ + paddw_r2r (mm4, mm3); /* z11 = tmp4 + tmp7; */ - pmulhw_m2r(fix_184n261, mm2); /* MULTIPLY( z12, FIX_1_847759065); : 2*c2 */ - movq_r2r(mm3, mm5); /* copy tmp4 */ + movq_r2r (mm3, mm6); /* copy z11 : phase 5 */ + psubw_r2r (mm4, mm5); /* z12 = tmp4 - tmp7; */ - pmulhw_m2r(fix_n184, mm0); /* MULTIPLY(z10, -FIX_1_847759065); : 2*c2 */ - paddw_r2r(mm4, mm3); /* z11 = tmp4 + tmp7; */ + psubw_r2r (mm1, mm6); /* z11-z13 */ + psllw_i2r (2, mm5); /* shift z12 */ - movq_r2r(mm3, mm6); /* copy z11 : phase 5 */ - psubw_r2r(mm4, mm5); /* z12 = tmp4 - tmp7; */ + movq_m2r (*(idata + 12), mm4); /* load idata[DCTSIZE*6], even part */ + movq_r2r (mm5, mm7); /* copy z12 */ - psubw_r2r(mm1, mm6); /* z11-z13 */ - psllw_i2r(2, mm5); /* shift z12 */ + pmulhw_m2r (fix_108n184, mm5); /* MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; 2*(c2-c6): even part */ + paddw_r2r (mm1, mm3); /* tmp7 = z11 + z13; */ - movq_m2r(*(idata+12), mm4); /* load idata[DCTSIZE*6], even part */ - movq_r2r(mm5, mm7); /* copy z12 */ + /*ok */ - pmulhw_m2r(fix_108n184, mm5); /* MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; 2*(c2-c6): even part */ - paddw_r2r(mm1, mm3); /* tmp7 = z11 + z13; */ + /* Even part */ + pmulhw_m2r (fix_184, mm7); /* MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; -2*(c2+c6) */ + psllw_i2r (2, mm6); - /*ok */ + movq_m2r (*(idata + 4), mm1); /* load idata[DCTSIZE*2] */ - /* Even part */ - pmulhw_m2r(fix_184, mm7); /* MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; -2*(c2+c6) */ - psllw_i2r(2, mm6); + paddw_r2r (mm5, mm0); /* tmp10 */ - movq_m2r(*(idata+4), mm1); /* load idata[DCTSIZE*2] */ + paddw_r2r (mm7, mm2); /* tmp12 */ - paddw_r2r(mm5, mm0); /* tmp10 */ + pmulhw_m2r (fix_141, mm6); /* tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); 2*c4 */ + psubw_r2r (mm3, mm2); /* tmp6 = tmp12 - tmp7 */ - paddw_r2r(mm7, mm2); /* tmp12 */ + movq_r2r (mm1, mm5); /* copy tmp1 */ + paddw_r2r (mm4, mm1); /* tmp13= tmp1 + tmp3; phases 5-3 */ - pmulhw_m2r(fix_141, mm6); /* tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); 2*c4 */ - psubw_r2r(mm3, mm2); /* tmp6 = tmp12 - tmp7 */ + psubw_r2r (mm4, mm5); /* tmp1-tmp3 */ + psubw_r2r (mm2, mm6); /* tmp5 = tmp11 - tmp6; */ - movq_r2r(mm1, mm5); /* copy tmp1 */ - paddw_r2r(mm4, mm1); /* tmp13= tmp1 + tmp3; phases 5-3 */ + movq_r2m (mm1, *(wsptr)); /* save tmp13 in workspace */ + psllw_i2r (2, mm5); /* shift tmp1-tmp3 */ - psubw_r2r(mm4, mm5); /* tmp1-tmp3 */ - psubw_r2r(mm2, mm6); /* tmp5 = tmp11 - tmp6; */ + movq_m2r (*(idata), mm7); /* load idata[DCTSIZE*0] */ - movq_r2m(mm1, *(wsptr)); /* save tmp13 in workspace */ - psllw_i2r(2, mm5); /* shift tmp1-tmp3 */ - - movq_m2r(*(idata), mm7); /* load idata[DCTSIZE*0] */ + pmulhw_m2r (fix_141, mm5); /* MULTIPLY(tmp1 - tmp3, FIX_1_414213562) */ + paddw_r2r (mm6, mm0); /* tmp4 = tmp10 + tmp5; */ - pmulhw_m2r(fix_141, mm5); /* MULTIPLY(tmp1 - tmp3, FIX_1_414213562) */ - paddw_r2r(mm6, mm0); /* tmp4 = tmp10 + tmp5; */ + movq_m2r (*(idata + 8), mm4); /* load idata[DCTSIZE*4] */ - movq_m2r(*(idata+8), mm4); /* load idata[DCTSIZE*4] */ - - psubw_r2r(mm1, mm5); /* tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; 2*c4 */ + psubw_r2r (mm1, mm5); /* tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; 2*c4 */ - movq_r2m(mm0, *(wsptr+4)); /* save tmp4 in workspace */ - movq_r2r(mm7, mm1); /* copy tmp0 : phase 3 */ + movq_r2m (mm0, *(wsptr + 4)); /* save tmp4 in workspace */ + movq_r2r (mm7, mm1); /* copy tmp0 : phase 3 */ - movq_r2m(mm5, *(wsptr+2)); /* save tmp12 in workspace */ - psubw_r2r(mm4, mm1); /* tmp11 = tmp0 - tmp2; */ + movq_r2m (mm5, *(wsptr + 2)); /* save tmp12 in workspace */ + psubw_r2r (mm4, mm1); /* tmp11 = tmp0 - tmp2; */ - paddw_r2r(mm4, mm7); /* tmp10 = tmp0 + tmp2; */ - movq_r2r(mm1, mm5); /* copy tmp11 */ - - paddw_m2r(*(wsptr+2), mm1); /* tmp1 = tmp11 + tmp12; */ - movq_r2r(mm7, mm4); /* copy tmp10 : phase 2 */ + paddw_r2r (mm4, mm7); /* tmp10 = tmp0 + tmp2; */ + movq_r2r (mm1, mm5); /* copy tmp11 */ - paddw_m2r(*(wsptr), mm7); /* tmp0 = tmp10 + tmp13; */ + paddw_m2r (*(wsptr + 2), mm1); /* tmp1 = tmp11 + tmp12; */ + movq_r2r (mm7, mm4); /* copy tmp10 : phase 2 */ - psubw_m2r(*(wsptr), mm4); /* tmp3 = tmp10 - tmp13; */ - movq_r2r(mm7, mm0); /* copy tmp0 */ + paddw_m2r (*(wsptr), mm7); /* tmp0 = tmp10 + tmp13; */ - psubw_m2r(*(wsptr+2), mm5); /* tmp2 = tmp11 - tmp12; */ - paddw_r2r(mm3, mm7); /* wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); */ - - psubw_r2r(mm3, mm0); /* wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); */ + psubw_m2r (*(wsptr), mm4); /* tmp3 = tmp10 - tmp13; */ + movq_r2r (mm7, mm0); /* copy tmp0 */ - movq_r2m(mm7, *(wsptr)); /* wsptr[DCTSIZE*0] */ - movq_r2r(mm1, mm3); /* copy tmp1 */ + psubw_m2r (*(wsptr + 2), mm5); /* tmp2 = tmp11 - tmp12; */ + paddw_r2r (mm3, mm7); /* wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); */ - movq_r2m(mm0, *(wsptr+14)); /* wsptr[DCTSIZE*7] */ - paddw_r2r(mm2, mm1); /* wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); */ + psubw_r2r (mm3, mm0); /* wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); */ - psubw_r2r(mm2, mm3); /* wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); */ + movq_r2m (mm7, *(wsptr)); /* wsptr[DCTSIZE*0] */ + movq_r2r (mm1, mm3); /* copy tmp1 */ - movq_r2m(mm1, *(wsptr+2)); /* wsptr[DCTSIZE*1] */ - movq_r2r(mm4, mm1); /* copy tmp3 */ + movq_r2m (mm0, *(wsptr + 14)); /* wsptr[DCTSIZE*7] */ + paddw_r2r (mm2, mm1); /* wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); */ - movq_r2m(mm3, *(wsptr+12)); /* wsptr[DCTSIZE*6] */ + psubw_r2r (mm2, mm3); /* wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); */ - paddw_m2r(*(wsptr+4), mm4); /* wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); */ + movq_r2m (mm1, *(wsptr + 2)); /* wsptr[DCTSIZE*1] */ + movq_r2r (mm4, mm1); /* copy tmp3 */ - psubw_m2r(*(wsptr+4), mm1); /* wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); */ + movq_r2m (mm3, *(wsptr + 12)); /* wsptr[DCTSIZE*6] */ - movq_r2m(mm4, *(wsptr+8)); - movq_r2r(mm5, mm7); /* copy tmp2 */ + paddw_m2r (*(wsptr + 4), mm4); /* wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); */ - paddw_r2r(mm6, mm5); /* wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5) */ + psubw_m2r (*(wsptr + 4), mm1); /* wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); */ - movq_r2m(mm1, *(wsptr+6)); - psubw_r2r(mm6, mm7); /* wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); */ + movq_r2m (mm4, *(wsptr + 8)); + movq_r2r (mm5, mm7); /* copy tmp2 */ - movq_r2m(mm5, *(wsptr+4)); + paddw_r2r (mm6, mm5); /* wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5) */ - movq_r2m(mm7, *(wsptr+10)); + movq_r2m (mm1, *(wsptr + 6)); + psubw_r2r (mm6, mm7); /* wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); */ - /*ok */ + movq_r2m (mm5, *(wsptr + 4)); + + movq_r2m (mm7, *(wsptr + 10)); + + /*ok */ /*****************************************************************/ - idata++; - wsptr++; + idata++; + wsptr++; /*****************************************************************/ - movq_m2r(*(idata+10), mm1); /* load idata[DCTSIZE*5] */ + movq_m2r (*(idata + 10), mm1); /* load idata[DCTSIZE*5] */ + + movq_m2r (*(idata + 6), mm0); /* load idata[DCTSIZE*3] */ + + movq_m2r (*(idata + 2), mm3); /* load idata[DCTSIZE*1] */ + movq_r2r (mm1, mm2); /* copy tmp6 : phase 6 */ + */movq_m2r (*(idata + 14), mm4); /* load idata[DCTSIZE*7] */ + paddw_r2r (mm0, mm1); /* z13 = tmp6 + tmp5; */ - movq_m2r(*(idata+6), mm0); /* load idata[DCTSIZE*3] */ + psubw_r2r (mm0, mm2); /* z10 = tmp6 - tmp5 */ - movq_m2r(*(idata+2), mm3); /* load idata[DCTSIZE*1] */ - movq_r2r(mm1, mm2); /* copy tmp6 : phase 6 */ */ + psllw_i2r (2, mm2); /* shift z10 */ + movq_r2r (mm2, mm0); /* copy z10 */ - movq_m2r(*(idata+14), mm4); /* load idata[DCTSIZE*7] */ - paddw_r2r(mm0, mm1); /* z13 = tmp6 + tmp5; */ + pmulhw_m2r (fix_184n261, mm2); /* MULTIPLY( z12, FIX_1_847759065); : 2*c2 */ + movq_r2r (mm3, mm5); /* copy tmp4 */ - psubw_r2r(mm0, mm2); /* z10 = tmp6 - tmp5 */ + pmulhw_m2r (fix_n184, mm0); /* MULTIPLY(z10, -FIX_1_847759065); : 2*c2 */ + paddw_r2r (mm4, mm3); /* z11 = tmp4 + tmp7; */ - psllw_i2r(2, mm2); /* shift z10 */ - movq_r2r(mm2, mm0); /* copy z10 */ + movq_r2r (mm3, mm6); /* copy z11 : phase 5 */ + psubw_r2r (mm4, mm5); /* z12 = tmp4 - tmp7; */ - pmulhw_m2r(fix_184n261, mm2); /* MULTIPLY( z12, FIX_1_847759065); : 2*c2 */ - movq_r2r(mm3, mm5); /* copy tmp4 */ + psubw_r2r (mm1, mm6); /* z11-z13 */ + psllw_i2r (2, mm5); /* shift z12 */ - pmulhw_m2r(fix_n184, mm0); /* MULTIPLY(z10, -FIX_1_847759065); : 2*c2 */ - paddw_r2r(mm4, mm3); /* z11 = tmp4 + tmp7; */ + movq_m2r (*(idata + 12), mm4); /* load idata[DCTSIZE*6], even part */ + movq_r2r (mm5, mm7); /* copy z12 */ - movq_r2r(mm3, mm6); /* copy z11 : phase 5 */ - psubw_r2r(mm4, mm5); /* z12 = tmp4 - tmp7; */ + pmulhw_m2r (fix_108n184, mm5); /* MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; 2*(c2-c6) even part */ + paddw_r2r (mm1, mm3); /* tmp7 = z11 + z13; */ - psubw_r2r(mm1, mm6); /* z11-z13 */ - psllw_i2r(2, mm5); /* shift z12 */ + /*ok */ - movq_m2r(*(idata+12), mm4); /* load idata[DCTSIZE*6], even part */ - movq_r2r(mm5, mm7); /* copy z12 */ + /* Even part */ + pmulhw_m2r (fix_184, mm7); /* MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; -2*(c2+c6) */ + psllw_i2r (2, mm6); - pmulhw_m2r(fix_108n184, mm5); /* MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; 2*(c2-c6) even part */ - paddw_r2r(mm1, mm3); /* tmp7 = z11 + z13; */ + movq_m2r (*(idata + 4), mm1); /* load idata[DCTSIZE*2] */ - /*ok */ + paddw_r2r (mm5, mm0); /* tmp10 */ - /* Even part */ - pmulhw_m2r(fix_184, mm7); /* MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; -2*(c2+c6) */ - psllw_i2r(2, mm6); + paddw_r2r (mm7, mm2); /* tmp12 */ - movq_m2r(*(idata+4), mm1); /* load idata[DCTSIZE*2] */ + pmulhw_m2r (fix_141, mm6); /* tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); 2*c4 */ + psubw_r2r (mm3, mm2); /* tmp6 = tmp12 - tmp7 */ - paddw_r2r(mm5, mm0); /* tmp10 */ + movq_r2r (mm1, mm5); /* copy tmp1 */ + paddw_r2r (mm4, mm1); /* tmp13= tmp1 + tmp3; phases 5-3 */ - paddw_r2r(mm7, mm2); /* tmp12 */ + psubw_r2r (mm4, mm5); /* tmp1-tmp3 */ + psubw_r2r (mm2, mm6); /* tmp5 = tmp11 - tmp6; */ - pmulhw_m2r(fix_141, mm6); /* tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); 2*c4 */ - psubw_r2r(mm3, mm2); /* tmp6 = tmp12 - tmp7 */ + movq_r2m (mm1, *(wsptr)); /* save tmp13 in workspace */ + psllw_i2r (2, mm5); /* shift tmp1-tmp3 */ - movq_r2r(mm1, mm5); /* copy tmp1 */ - paddw_r2r(mm4, mm1); /* tmp13= tmp1 + tmp3; phases 5-3 */ + movq_m2r (*(idata), mm7); /* load idata[DCTSIZE*0] */ + paddw_r2r (mm6, mm0); /* tmp4 = tmp10 + tmp5; */ - psubw_r2r(mm4, mm5); /* tmp1-tmp3 */ - psubw_r2r(mm2, mm6); /* tmp5 = tmp11 - tmp6; */ + pmulhw_m2r (fix_141, mm5); /* MULTIPLY(tmp1 - tmp3, FIX_1_414213562) */ - movq_r2m(mm1, *(wsptr)); /* save tmp13 in workspace */ - psllw_i2r(2, mm5); /* shift tmp1-tmp3 */ - - movq_m2r(*(idata), mm7); /* load idata[DCTSIZE*0] */ - paddw_r2r(mm6, mm0); /* tmp4 = tmp10 + tmp5; */ + movq_m2r (*(idata + 8), mm4); /* load idata[DCTSIZE*4] */ - pmulhw_m2r(fix_141, mm5); /* MULTIPLY(tmp1 - tmp3, FIX_1_414213562) */ + psubw_r2r (mm1, mm5); /* tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; 2*c4 */ - movq_m2r(*(idata+8), mm4); /* load idata[DCTSIZE*4] */ - - psubw_r2r(mm1, mm5); /* tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; 2*c4 */ + movq_r2m (mm0, *(wsptr + 4)); /* save tmp4 in workspace */ + movq_r2r (mm7, mm1); /* copy tmp0: phase 3 */ - movq_r2m(mm0, *(wsptr+4)); /* save tmp4 in workspace */ - movq_r2r(mm7, mm1); /* copy tmp0: phase 3 */ + movq_r2m (mm5, *(wsptr + 2)); /* save tmp12 in workspace */ + psubw_r2r (mm4, mm1); /* tmp11 = tmp0 - tmp2; */ - movq_r2m(mm5, *(wsptr+2)); /* save tmp12 in workspace */ - psubw_r2r(mm4, mm1); /* tmp11 = tmp0 - tmp2; */ + paddw_r2r (mm4, mm7); /* tmp10 = tmp0 + tmp2; */ + movq_r2r (mm1, mm5); /* copy tmp11 */ - paddw_r2r(mm4, mm7); /* tmp10 = tmp0 + tmp2; */ - movq_r2r(mm1, mm5); /* copy tmp11 */ - - paddw_m2r(*(wsptr+2), mm1); /* tmp1 = tmp11 + tmp12; */ - movq_r2r(mm7, mm4); /* copy tmp10: phase 2 */ + paddw_m2r (*(wsptr + 2), mm1); /* tmp1 = tmp11 + tmp12; */ + movq_r2r (mm7, mm4); /* copy tmp10: phase 2 */ - paddw_m2r(*(wsptr), mm7); /* tmp0 = tmp10 + tmp13; */ + paddw_m2r (*(wsptr), mm7); /* tmp0 = tmp10 + tmp13; */ - psubw_m2r(*(wsptr), mm4); /* tmp3 = tmp10 - tmp13; */ - movq_r2r(mm7, mm0); /* copy tmp0 */ + psubw_m2r (*(wsptr), mm4); /* tmp3 = tmp10 - tmp13; */ + movq_r2r (mm7, mm0); /* copy tmp0 */ - psubw_m2r(*(wsptr+2), mm5); /* tmp2 = tmp11 - tmp12; */ - paddw_r2r(mm3, mm7); /* wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); */ - - psubw_r2r(mm3, mm0); /* wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); */ + psubw_m2r (*(wsptr + 2), mm5); /* tmp2 = tmp11 - tmp12; */ + paddw_r2r (mm3, mm7); /* wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); */ - movq_r2m(mm7, *(wsptr)); /* wsptr[DCTSIZE*0] */ - movq_r2r(mm1, mm3); /* copy tmp1 */ + psubw_r2r (mm3, mm0); /* wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); */ - movq_r2m(mm0, *(wsptr+14)); /* wsptr[DCTSIZE*7] */ - paddw_r2r(mm2, mm1); /* wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); */ + movq_r2m (mm7, *(wsptr)); /* wsptr[DCTSIZE*0] */ + movq_r2r (mm1, mm3); /* copy tmp1 */ - psubw_r2r(mm2, mm3); /* wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); */ + movq_r2m (mm0, *(wsptr + 14)); /* wsptr[DCTSIZE*7] */ + paddw_r2r (mm2, mm1); /* wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); */ - movq_r2m(mm1, *(wsptr+2)); /* wsptr[DCTSIZE*1] */ - movq_r2r(mm4, mm1); /* copy tmp3 */ + psubw_r2r (mm2, mm3); /* wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); */ - movq_r2m(mm3, *(wsptr+12)); /* wsptr[DCTSIZE*6] */ + movq_r2m (mm1, *(wsptr + 2)); /* wsptr[DCTSIZE*1] */ + movq_r2r (mm4, mm1); /* copy tmp3 */ - paddw_m2r(*(wsptr+4), mm4); /* wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); */ + movq_r2m (mm3, *(wsptr + 12)); /* wsptr[DCTSIZE*6] */ - psubw_m2r(*(wsptr+4), mm1); /* wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); */ + paddw_m2r (*(wsptr + 4), mm4); /* wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); */ - movq_r2m(mm4, *(wsptr+8)); - movq_r2r(mm5, mm7); /* copy tmp2 */ + psubw_m2r (*(wsptr + 4), mm1); /* wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); */ - paddw_r2r(mm6, mm5); /* wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5) */ + movq_r2m (mm4, *(wsptr + 8)); + movq_r2r (mm5, mm7); /* copy tmp2 */ - movq_r2m(mm1, *(wsptr+6)); - psubw_r2r(mm6, mm7); /* wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); */ + paddw_r2r (mm6, mm5); /* wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5) */ - movq_r2m(mm5, *(wsptr+4)); + movq_r2m (mm1, *(wsptr + 6)); + psubw_r2r (mm6, mm7); /* wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); */ - movq_r2m(mm7, *(wsptr+10)); + movq_r2m (mm5, *(wsptr + 4)); + + movq_r2m (mm7, *(wsptr + 10)); /*****************************************************************/ @@ -1512,258 +1517,258 @@ static mmx_t fix_108n184 = (mmx_t)(long long)0xcf04cf04cf04cf04LL; /* and also undo the PASS1_BITS scaling. */ /*****************************************************************/ - /* Even part */ + /* Even part */ - wsptr--; + wsptr--; /* tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); */ /* tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); */ /* tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); */ /* tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]); */ - movq_m2r(*(wsptr), mm0); /* wsptr[0,0],[0,1],[0,2],[0,3] */ + movq_m2r (*(wsptr), mm0); /* wsptr[0,0],[0,1],[0,2],[0,3] */ + + movq_m2r (*(wsptr + 1), mm1); /* wsptr[0,4],[0,5],[0,6],[0,7] */ + movq_r2r (mm0, mm2); + + movq_m2r (*(wsptr + 2), mm3); /* wsptr[1,0],[1,1],[1,2],[1,3] */ + paddw_r2r (mm1, mm0); /* wsptr[0,tmp10],[xxx],[0,tmp13],[xxx] */ + + movq_m2r (*(wsptr + 3), mm4); /* wsptr[1,4],[1,5],[1,6],[1,7] */ + psubw_r2r (mm1, mm2); /* wsptr[0,tmp11],[xxx],[0,tmp14],[xxx] */ - movq_m2r(*(wsptr+1), mm1); /* wsptr[0,4],[0,5],[0,6],[0,7] */ - movq_r2r(mm0, mm2); - - movq_m2r(*(wsptr+2), mm3); /* wsptr[1,0],[1,1],[1,2],[1,3] */ - paddw_r2r(mm1, mm0); /* wsptr[0,tmp10],[xxx],[0,tmp13],[xxx] */ + movq_r2r (mm0, mm6); + movq_r2r (mm3, mm5); - movq_m2r(*(wsptr+3), mm4); /* wsptr[1,4],[1,5],[1,6],[1,7] */ - psubw_r2r(mm1, mm2); /* wsptr[0,tmp11],[xxx],[0,tmp14],[xxx] */ + paddw_r2r (mm4, mm3); /* wsptr[1,tmp10],[xxx],[1,tmp13],[xxx] */ + movq_r2r (mm2, mm1); - movq_r2r(mm0, mm6); - movq_r2r(mm3, mm5); - - paddw_r2r(mm4, mm3); /* wsptr[1,tmp10],[xxx],[1,tmp13],[xxx] */ - movq_r2r(mm2, mm1); + psubw_r2r (mm4, mm5); /* wsptr[1,tmp11],[xxx],[1,tmp14],[xxx] */ + punpcklwd_r2r (mm3, mm0); /* wsptr[0,tmp10],[1,tmp10],[xxx],[xxx] */ - psubw_r2r(mm4, mm5); /* wsptr[1,tmp11],[xxx],[1,tmp14],[xxx] */ - punpcklwd_r2r(mm3, mm0); /* wsptr[0,tmp10],[1,tmp10],[xxx],[xxx] */ + movq_m2r (*(wsptr + 7), mm7); /* wsptr[3,4],[3,5],[3,6],[3,7] */ + punpckhwd_r2r (mm3, mm6); /* wsptr[0,tmp13],[1,tmp13],[xxx],[xxx] */ - movq_m2r(*(wsptr+7), mm7); /* wsptr[3,4],[3,5],[3,6],[3,7] */ - punpckhwd_r2r(mm3, mm6); /* wsptr[0,tmp13],[1,tmp13],[xxx],[xxx] */ + movq_m2r (*(wsptr + 4), mm3); /* wsptr[2,0],[2,1],[2,2],[2,3] */ + punpckldq_r2r (mm6, mm0); /* wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */ - movq_m2r(*(wsptr+4), mm3); /* wsptr[2,0],[2,1],[2,2],[2,3] */ - punpckldq_r2r(mm6, mm0); /* wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */ + punpcklwd_r2r (mm5, mm1); /* wsptr[0,tmp11],[1,tmp11],[xxx],[xxx] */ + movq_r2r (mm3, mm4); - punpcklwd_r2r(mm5, mm1); /* wsptr[0,tmp11],[1,tmp11],[xxx],[xxx] */ - movq_r2r(mm3, mm4); + movq_m2r (*(wsptr + 6), mm6); /* wsptr[3,0],[3,1],[3,2],[3,3] */ + punpckhwd_r2r (mm5, mm2); /* wsptr[0,tmp14],[1,tmp14],[xxx],[xxx] */ - movq_m2r(*(wsptr+6), mm6); /* wsptr[3,0],[3,1],[3,2],[3,3] */ - punpckhwd_r2r(mm5, mm2); /* wsptr[0,tmp14],[1,tmp14],[xxx],[xxx] */ + movq_m2r (*(wsptr + 5), mm5); /* wsptr[2,4],[2,5],[2,6],[2,7] */ + punpckldq_r2r (mm2, mm1); /* wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */ - movq_m2r(*(wsptr+5), mm5); /* wsptr[2,4],[2,5],[2,6],[2,7] */ - punpckldq_r2r(mm2, mm1); /* wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */ - - paddw_r2r(mm5, mm3); /* wsptr[2,tmp10],[xxx],[2,tmp13],[xxx] */ - movq_r2r(mm6, mm2); + paddw_r2r (mm5, mm3); /* wsptr[2,tmp10],[xxx],[2,tmp13],[xxx] */ + movq_r2r (mm6, mm2); - psubw_r2r(mm5, mm4); /* wsptr[2,tmp11],[xxx],[2,tmp14],[xxx] */ - paddw_r2r(mm7, mm6); /* wsptr[3,tmp10],[xxx],[3,tmp13],[xxx] */ + psubw_r2r (mm5, mm4); /* wsptr[2,tmp11],[xxx],[2,tmp14],[xxx] */ + paddw_r2r (mm7, mm6); /* wsptr[3,tmp10],[xxx],[3,tmp13],[xxx] */ - movq_r2r(mm3, mm5); - punpcklwd_r2r(mm6, mm3); /* wsptr[2,tmp10],[3,tmp10],[xxx],[xxx] */ - - psubw_r2r(mm7, mm2); /* wsptr[3,tmp11],[xxx],[3,tmp14],[xxx] */ - punpckhwd_r2r(mm6, mm5); /* wsptr[2,tmp13],[3,tmp13],[xxx],[xxx] */ + movq_r2r (mm3, mm5); + punpcklwd_r2r (mm6, mm3); /* wsptr[2,tmp10],[3,tmp10],[xxx],[xxx] */ - movq_r2r(mm4, mm7); - punpckldq_r2r(mm5, mm3); /* wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13] */ - - punpcklwd_r2r(mm2, mm4); /* wsptr[2,tmp11],[3,tmp11],[xxx],[xxx] */ + psubw_r2r (mm7, mm2); /* wsptr[3,tmp11],[xxx],[3,tmp14],[xxx] */ + punpckhwd_r2r (mm6, mm5); /* wsptr[2,tmp13],[3,tmp13],[xxx],[xxx] */ - punpckhwd_r2r(mm2, mm7); /* wsptr[2,tmp14],[3,tmp14],[xxx],[xxx] */ + movq_r2r (mm4, mm7); + punpckldq_r2r (mm5, mm3); /* wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13] */ - punpckldq_r2r(mm7, mm4); /* wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14] */ - movq_r2r(mm1, mm6); + punpcklwd_r2r (mm2, mm4); /* wsptr[2,tmp11],[3,tmp11],[xxx],[xxx] */ - /*ok */ + punpckhwd_r2r (mm2, mm7); /* wsptr[2,tmp14],[3,tmp14],[xxx],[xxx] */ + + punpckldq_r2r (mm7, mm4); /* wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14] */ + movq_r2r (mm1, mm6); + + /*ok */ /* mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */ /* mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */ - movq_r2r(mm0, mm2); - punpckhdq_r2r(mm4, mm6); /* wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14] */ + movq_r2r (mm0, mm2); + punpckhdq_r2r (mm4, mm6); /* wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14] */ - punpckldq_r2r(mm4, mm1); /* wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11] */ - psllw_i2r(2, mm6); + punpckldq_r2r (mm4, mm1); /* wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11] */ + psllw_i2r (2, mm6); - pmulhw_m2r(fix_141, mm6); - punpckldq_r2r(mm3, mm0); /* wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10] */ + pmulhw_m2r (fix_141, mm6); + punpckldq_r2r (mm3, mm0); /* wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10] */ - punpckhdq_r2r(mm3, mm2); /* wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13] */ - movq_r2r(mm0, mm7); + punpckhdq_r2r (mm3, mm2); /* wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13] */ + movq_r2r (mm0, mm7); /* tmp0 = tmp10 + tmp13; */ /* tmp3 = tmp10 - tmp13; */ - paddw_r2r(mm2, mm0); /* [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0] */ - psubw_r2r(mm2, mm7); /* [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3] */ + paddw_r2r (mm2, mm0); /* [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0] */ + psubw_r2r (mm2, mm7); /* [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3] */ /* tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13; */ - psubw_r2r(mm2, mm6); /* wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12] */ + psubw_r2r (mm2, mm6); /* wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12] */ /* tmp1 = tmp11 + tmp12; */ /* tmp2 = tmp11 - tmp12; */ - movq_r2r(mm1, mm5); + movq_r2r (mm1, mm5); - /*OK */ + /*OK */ - /* Odd part */ + /* Odd part */ /* z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; */ /* z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; */ /* z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; */ /* z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; */ - movq_m2r(*(wsptr), mm3); /* wsptr[0,0],[0,1],[0,2],[0,3] */ - paddw_r2r(mm6, mm1); /* [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1] */ + movq_m2r (*(wsptr), mm3); /* wsptr[0,0],[0,1],[0,2],[0,3] */ + paddw_r2r (mm6, mm1); /* [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1] */ - movq_m2r(*(wsptr+1), mm4); /* wsptr[0,4],[0,5],[0,6],[0,7] */ - psubw_r2r(mm6, mm5); /* [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2] */ + movq_m2r (*(wsptr + 1), mm4); /* wsptr[0,4],[0,5],[0,6],[0,7] */ + psubw_r2r (mm6, mm5); /* [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2] */ - movq_r2r(mm3, mm6); - punpckldq_r2r(mm4, mm3); /* wsptr[0,0],[0,1],[0,4],[0,5] */ + movq_r2r (mm3, mm6); + punpckldq_r2r (mm4, mm3); /* wsptr[0,0],[0,1],[0,4],[0,5] */ - punpckhdq_r2r(mm6, mm4); /* wsptr[0,6],[0,7],[0,2],[0,3] */ - movq_r2r(mm3, mm2); + punpckhdq_r2r (mm6, mm4); /* wsptr[0,6],[0,7],[0,2],[0,3] */ + movq_r2r (mm3, mm2); /*Save tmp0 and tmp1 in wsptr */ - movq_r2m(mm0, *(wsptr)); /* save tmp0 */ - paddw_r2r(mm4, mm2); /* wsptr[xxx],[0,z11],[xxx],[0,z13] */ + movq_r2m (mm0, *(wsptr)); /* save tmp0 */ + paddw_r2r (mm4, mm2); /* wsptr[xxx],[0,z11],[xxx],[0,z13] */ + - /*Continue with z10 --- z13 */ - movq_m2r(*(wsptr+2), mm6); /* wsptr[1,0],[1,1],[1,2],[1,3] */ - psubw_r2r(mm4, mm3); /* wsptr[xxx],[0,z12],[xxx],[0,z10] */ + movq_m2r (*(wsptr + 2), mm6); /* wsptr[1,0],[1,1],[1,2],[1,3] */ + psubw_r2r (mm4, mm3); /* wsptr[xxx],[0,z12],[xxx],[0,z10] */ - movq_m2r(*(wsptr+3), mm0); /* wsptr[1,4],[1,5],[1,6],[1,7] */ - movq_r2r(mm6, mm4); + movq_m2r (*(wsptr + 3), mm0); /* wsptr[1,4],[1,5],[1,6],[1,7] */ + movq_r2r (mm6, mm4); - movq_r2m(mm1, *(wsptr+1)); /* save tmp1 */ - punpckldq_r2r(mm0, mm6); /* wsptr[1,0],[1,1],[1,4],[1,5] */ + movq_r2m (mm1, *(wsptr + 1)); /* save tmp1 */ + punpckldq_r2r (mm0, mm6); /* wsptr[1,0],[1,1],[1,4],[1,5] */ + + punpckhdq_r2r (mm4, mm0); /* wsptr[1,6],[1,7],[1,2],[1,3] */ + movq_r2r (mm6, mm1); - punpckhdq_r2r(mm4, mm0); /* wsptr[1,6],[1,7],[1,2],[1,3] */ - movq_r2r(mm6, mm1); - /*Save tmp2 and tmp3 in wsptr */ - paddw_r2r(mm0, mm6); /* wsptr[xxx],[1,z11],[xxx],[1,z13] */ - movq_r2r(mm2, mm4); - + paddw_r2r (mm0, mm6); /* wsptr[xxx],[1,z11],[xxx],[1,z13] */ + movq_r2r (mm2, mm4); + /*Continue with z10 --- z13 */ - movq_r2m(mm5, *(wsptr+2)); /* save tmp2 */ - punpcklwd_r2r(mm6, mm2); /* wsptr[xxx],[xxx],[0,z11],[1,z11] */ + movq_r2m (mm5, *(wsptr + 2)); /* save tmp2 */ + punpcklwd_r2r (mm6, mm2); /* wsptr[xxx],[xxx],[0,z11],[1,z11] */ - psubw_r2r(mm0, mm1); /* wsptr[xxx],[1,z12],[xxx],[1,z10] */ - punpckhwd_r2r(mm6, mm4); /* wsptr[xxx],[xxx],[0,z13],[1,z13] */ + psubw_r2r (mm0, mm1); /* wsptr[xxx],[1,z12],[xxx],[1,z10] */ + punpckhwd_r2r (mm6, mm4); /* wsptr[xxx],[xxx],[0,z13],[1,z13] */ - movq_r2r(mm3, mm0); - punpcklwd_r2r(mm1, mm3); /* wsptr[xxx],[xxx],[0,z12],[1,z12] */ + movq_r2r (mm3, mm0); + punpcklwd_r2r (mm1, mm3); /* wsptr[xxx],[xxx],[0,z12],[1,z12] */ - movq_r2m(mm7, *(wsptr+3)); /* save tmp3 */ - punpckhwd_r2r(mm1, mm0); /* wsptr[xxx],[xxx],[0,z10],[1,z10] */ + movq_r2m (mm7, *(wsptr + 3)); /* save tmp3 */ + punpckhwd_r2r (mm1, mm0); /* wsptr[xxx],[xxx],[0,z10],[1,z10] */ - movq_m2r(*(wsptr+4), mm6); /* wsptr[2,0],[2,1],[2,2],[2,3] */ - punpckhdq_r2r(mm2, mm0); /* wsptr[0,z10],[1,z10],[0,z11],[1,z11] */ + movq_m2r (*(wsptr + 4), mm6); /* wsptr[2,0],[2,1],[2,2],[2,3] */ + punpckhdq_r2r (mm2, mm0); /* wsptr[0,z10],[1,z10],[0,z11],[1,z11] */ - movq_m2r(*(wsptr+5), mm7); /* wsptr[2,4],[2,5],[2,6],[2,7] */ - punpckhdq_r2r(mm4, mm3); /* wsptr[0,z12],[1,z12],[0,z13],[1,z13] */ + movq_m2r (*(wsptr + 5), mm7); /* wsptr[2,4],[2,5],[2,6],[2,7] */ + punpckhdq_r2r (mm4, mm3); /* wsptr[0,z12],[1,z12],[0,z13],[1,z13] */ - movq_m2r(*(wsptr+6), mm1); /* wsptr[3,0],[3,1],[3,2],[3,3] */ - movq_r2r(mm6, mm4); + movq_m2r (*(wsptr + 6), mm1); /* wsptr[3,0],[3,1],[3,2],[3,3] */ + movq_r2r (mm6, mm4); - punpckldq_r2r(mm7, mm6); /* wsptr[2,0],[2,1],[2,4],[2,5] */ - movq_r2r(mm1, mm5); + punpckldq_r2r (mm7, mm6); /* wsptr[2,0],[2,1],[2,4],[2,5] */ + movq_r2r (mm1, mm5); - punpckhdq_r2r(mm4, mm7); /* wsptr[2,6],[2,7],[2,2],[2,3] */ - movq_r2r(mm6, mm2); - - movq_m2r(*(wsptr+7), mm4); /* wsptr[3,4],[3,5],[3,6],[3,7] */ - paddw_r2r(mm7, mm6); /* wsptr[xxx],[2,z11],[xxx],[2,z13] */ + punpckhdq_r2r (mm4, mm7); /* wsptr[2,6],[2,7],[2,2],[2,3] */ + movq_r2r (mm6, mm2); - psubw_r2r(mm7, mm2); /* wsptr[xxx],[2,z12],[xxx],[2,z10] */ - punpckldq_r2r(mm4, mm1); /* wsptr[3,0],[3,1],[3,4],[3,5] */ + movq_m2r (*(wsptr + 7), mm4); /* wsptr[3,4],[3,5],[3,6],[3,7] */ + paddw_r2r (mm7, mm6); /* wsptr[xxx],[2,z11],[xxx],[2,z13] */ - punpckhdq_r2r(mm5, mm4); /* wsptr[3,6],[3,7],[3,2],[3,3] */ - movq_r2r(mm1, mm7); + psubw_r2r (mm7, mm2); /* wsptr[xxx],[2,z12],[xxx],[2,z10] */ + punpckldq_r2r (mm4, mm1); /* wsptr[3,0],[3,1],[3,4],[3,5] */ - paddw_r2r(mm4, mm1); /* wsptr[xxx],[3,z11],[xxx],[3,z13] */ - psubw_r2r(mm4, mm7); /* wsptr[xxx],[3,z12],[xxx],[3,z10] */ + punpckhdq_r2r (mm5, mm4); /* wsptr[3,6],[3,7],[3,2],[3,3] */ + movq_r2r (mm1, mm7); - movq_r2r(mm6, mm5); - punpcklwd_r2r(mm1, mm6); /* wsptr[xxx],[xxx],[2,z11],[3,z11] */ + paddw_r2r (mm4, mm1); /* wsptr[xxx],[3,z11],[xxx],[3,z13] */ + psubw_r2r (mm4, mm7); /* wsptr[xxx],[3,z12],[xxx],[3,z10] */ - punpckhwd_r2r(mm1, mm5); /* wsptr[xxx],[xxx],[2,z13],[3,z13] */ - movq_r2r(mm2, mm4); + movq_r2r (mm6, mm5); + punpcklwd_r2r (mm1, mm6); /* wsptr[xxx],[xxx],[2,z11],[3,z11] */ - punpcklwd_r2r(mm7, mm2); /* wsptr[xxx],[xxx],[2,z12],[3,z12] */ + punpckhwd_r2r (mm1, mm5); /* wsptr[xxx],[xxx],[2,z13],[3,z13] */ + movq_r2r (mm2, mm4); - punpckhwd_r2r(mm7, mm4); /* wsptr[xxx],[xxx],[2,z10],[3,z10] */ + punpcklwd_r2r (mm7, mm2); /* wsptr[xxx],[xxx],[2,z12],[3,z12] */ - punpckhdq_r2r(mm6, mm4); /*/ wsptr[2,z10],[3,z10],[2,z11],[3,z11] */ + punpckhwd_r2r (mm7, mm4); /* wsptr[xxx],[xxx],[2,z10],[3,z10] */ - punpckhdq_r2r(mm5, mm2); /* wsptr[2,z12],[3,z12],[2,z13],[3,z13] */ - movq_r2r(mm0, mm5); + punpckhdq_r2r (mm6, mm4); /*/ wsptr[2,z10],[3,z10],[2,z11],[3,z11] */ - punpckldq_r2r(mm4, mm0); /* wsptr[0,z10],[1,z10],[2,z10],[3,z10] */ + punpckhdq_r2r (mm5, mm2); /* wsptr[2,z12],[3,z12],[2,z13],[3,z13] */ + movq_r2r (mm0, mm5); - punpckhdq_r2r(mm4, mm5); /* wsptr[0,z11],[1,z11],[2,z11],[3,z11] */ - movq_r2r(mm3, mm4); + punpckldq_r2r (mm4, mm0); /* wsptr[0,z10],[1,z10],[2,z10],[3,z10] */ - punpckhdq_r2r(mm2, mm4); /* wsptr[0,z13],[1,z13],[2,z13],[3,z13] */ - movq_r2r(mm5, mm1); + punpckhdq_r2r (mm4, mm5); /* wsptr[0,z11],[1,z11],[2,z11],[3,z11] */ + movq_r2r (mm3, mm4); - punpckldq_r2r(mm2, mm3); /* wsptr[0,z12],[1,z12],[2,z12],[3,z12] */ + punpckhdq_r2r (mm2, mm4); /* wsptr[0,z13],[1,z13],[2,z13],[3,z13] */ + movq_r2r (mm5, mm1); + + punpckldq_r2r (mm2, mm3); /* wsptr[0,z12],[1,z12],[2,z12],[3,z12] */ /* tmp7 = z11 + z13; : phase 5 */ /* tmp8 = z11 - z13; : phase 5 */ - psubw_r2r(mm4, mm1); /* tmp8 */ + psubw_r2r (mm4, mm1); /* tmp8 */ - paddw_r2r(mm4, mm5); /* tmp7 */ + paddw_r2r (mm4, mm5); /* tmp7 */ /* tmp21 = MULTIPLY(tmp8, FIX_1_414213562); 2*c4 */ - psllw_i2r(2, mm1); + psllw_i2r (2, mm1); - psllw_i2r(2, mm0); + psllw_i2r (2, mm0); - pmulhw_m2r(fix_141, mm1); /* tmp21 */ + pmulhw_m2r (fix_141, mm1); /* tmp21 */ /* tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) 2*(c2-c6) */ /* + MULTIPLY(z10, - FIX_1_847759065); : 2*c2 */ - psllw_i2r(2, mm3); - movq_r2r(mm0, mm7); + psllw_i2r (2, mm3); + movq_r2r (mm0, mm7); - pmulhw_m2r(fix_n184, mm7); - movq_r2r(mm3, mm6); + pmulhw_m2r (fix_n184, mm7); + movq_r2r (mm3, mm6); - movq_m2r(*(wsptr), mm2); /* tmp0,final1 */ + movq_m2r (*(wsptr), mm2); /* tmp0,final1 */ - pmulhw_m2r(fix_108n184, mm6); + pmulhw_m2r (fix_108n184, mm6); /* tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) : -2*(c2+c6) */ /* + MULTIPLY(z12, FIX_1_847759065); 2*c2 */ - movq_r2r(mm2, mm4); /* final1 */ - - pmulhw_m2r(fix_184n261, mm0); - paddw_r2r(mm5, mm2); /* tmp0+tmp7,final1 */ + movq_r2r (mm2, mm4); /* final1 */ + + pmulhw_m2r (fix_184n261, mm0); + paddw_r2r (mm5, mm2); /* tmp0+tmp7,final1 */ - pmulhw_m2r(fix_184, mm3); - psubw_r2r(mm5, mm4); /* tmp0-tmp7,final1 */ + pmulhw_m2r (fix_184, mm3); + psubw_r2r (mm5, mm4); /* tmp0-tmp7,final1 */ /* tmp6 = tmp22 - tmp7; phase 2 */ - psraw_i2r(3, mm2); /* outptr[0,0],[1,0],[2,0],[3,0],final1 */ + psraw_i2r (3, mm2); /* outptr[0,0],[1,0],[2,0],[3,0],final1 */ - paddw_r2r(mm6, mm7); /* tmp20 */ - psraw_i2r(3, mm4); /* outptr[0,7],[1,7],[2,7],[3,7],final1 */ + paddw_r2r (mm6, mm7); /* tmp20 */ + psraw_i2r (3, mm4); /* outptr[0,7],[1,7],[2,7],[3,7],final1 */ - paddw_r2r(mm0, mm3); /* tmp22 */ + paddw_r2r (mm0, mm3); /* tmp22 */ /* tmp5 = tmp21 - tmp6; */ - psubw_r2r(mm5, mm3); /* tmp6 */ + psubw_r2r (mm5, mm3); /* tmp6 */ /* tmp4 = tmp20 + tmp5; */ - movq_m2r(*(wsptr+1), mm0); /* tmp1,final2 */ - psubw_r2r(mm3, mm1); /* tmp5 */ + movq_m2r (*(wsptr + 1), mm0); /* tmp1,final2 */ + psubw_r2r (mm3, mm1); /* tmp5 */ - movq_r2r(mm0, mm6); /* final2 */ - paddw_r2r(mm3, mm0); /* tmp1+tmp6,final2 */ + movq_r2r (mm0, mm6); /* final2 */ + paddw_r2r (mm3, mm0); /* tmp1+tmp6,final2 */ - /* Final output stage: scale down by a factor of 8 and range-limit */ + /* Final output stage: scale down by a factor of 8 and range-limit */ /* outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) */ @@ -1776,30 +1781,30 @@ static mmx_t fix_108n184 = (mmx_t)(long long)0xcf04cf04cf04cf04LL; /* & RANGE_MASK]; */ /* outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) */ /* & RANGE_MASK]; final2 */ - psubw_r2r(mm3, mm6); /* tmp1-tmp6,final2 */ - psraw_i2r(3, mm0); /* outptr[0,1],[1,1],[2,1],[3,1] */ + psubw_r2r (mm3, mm6); /* tmp1-tmp6,final2 */ + psraw_i2r (3, mm0); /* outptr[0,1],[1,1],[2,1],[3,1] */ + + psraw_i2r (3, mm6); /* outptr[0,6],[1,6],[2,6],[3,6] */ + + packuswb_r2r (mm4, mm0); /* out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7] */ - psraw_i2r(3, mm6); /* outptr[0,6],[1,6],[2,6],[3,6] */ - - packuswb_r2r(mm4, mm0); /* out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7] */ - - movq_m2r(*(wsptr+2), mm5); /* tmp2,final3 */ - packuswb_r2r(mm6, mm2); /* out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6] */ + movq_m2r (*(wsptr + 2), mm5); /* tmp2,final3 */ + packuswb_r2r (mm6, mm2); /* out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6] */ /* outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) */ /* & RANGE_MASK]; */ /* outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) */ /* & RANGE_MASK]; final3 */ - paddw_r2r(mm1, mm7); /* tmp4 */ - movq_r2r(mm5, mm3); + paddw_r2r (mm1, mm7); /* tmp4 */ + movq_r2r (mm5, mm3); - paddw_r2r(mm1, mm5); /* tmp2+tmp5 */ - psubw_r2r(mm1, mm3); /* tmp2-tmp5 */ + paddw_r2r (mm1, mm5); /* tmp2+tmp5 */ + psubw_r2r (mm1, mm3); /* tmp2-tmp5 */ - psraw_i2r(3, mm5); /* outptr[0,2],[1,2],[2,2],[3,2] */ + psraw_i2r (3, mm5); /* outptr[0,2],[1,2],[2,2],[3,2] */ - movq_m2r(*(wsptr+3), mm4); /* tmp3,final4 */ - psraw_i2r(3, mm3); /* outptr[0,5],[1,5],[2,5],[3,5] */ + movq_m2r (*(wsptr + 3), mm4); /* tmp3,final4 */ + psraw_i2r (3, mm3); /* outptr[0,5],[1,5],[2,5],[3,5] */ @@ -1807,74 +1812,74 @@ static mmx_t fix_108n184 = (mmx_t)(long long)0xcf04cf04cf04cf04LL; /* & RANGE_MASK]; */ /* outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) */ /* & RANGE_MASK]; final4 */ - movq_r2r(mm4, mm6); - paddw_r2r(mm7, mm4); /* tmp3+tmp4 */ + movq_r2r (mm4, mm6); + paddw_r2r (mm7, mm4); /* tmp3+tmp4 */ - psubw_r2r(mm7, mm6); /* tmp3-tmp4 */ - psraw_i2r(3, mm4); /* outptr[0,4],[1,4],[2,4],[3,4] */ + psubw_r2r (mm7, mm6); /* tmp3-tmp4 */ + psraw_i2r (3, mm4); /* outptr[0,4],[1,4],[2,4],[3,4] */ - /* mov ecx, [dataptr] */ + /* mov ecx, [dataptr] */ - psraw_i2r(3, mm6); /* outptr[0,3],[1,3],[2,3],[3,3] */ + psraw_i2r (3, mm6); /* outptr[0,3],[1,3],[2,3],[3,3] */ - packuswb_r2r(mm4, mm5); /* out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4] */ + packuswb_r2r (mm4, mm5); /* out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4] */ - packuswb_r2r(mm3, mm6); /* out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5] */ - movq_r2r(mm2, mm4); + packuswb_r2r (mm3, mm6); /* out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5] */ + movq_r2r (mm2, mm4); - movq_r2r(mm5, mm7); - punpcklbw_r2r(mm0, mm2); /* out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1] */ + movq_r2r (mm5, mm7); + punpcklbw_r2r (mm0, mm2); /* out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1] */ - punpckhbw_r2r(mm0, mm4); /* out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7] */ - movq_r2r(mm2, mm1); + punpckhbw_r2r (mm0, mm4); /* out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7] */ + movq_r2r (mm2, mm1); - punpcklbw_r2r(mm6, mm5); /* out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3] */ + punpcklbw_r2r (mm6, mm5); /* out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3] */ - /* add dataptr, 4 */ + /* add dataptr, 4 */ - punpckhbw_r2r(mm6, mm7); /* out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5] */ + punpckhbw_r2r (mm6, mm7); /* out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5] */ - punpcklwd_r2r(mm5, mm2); /* out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3] */ - - /* add ecx, output_col */ + punpcklwd_r2r (mm5, mm2); /* out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3] */ - movq_r2r(mm7, mm6); - punpckhwd_r2r(mm5, mm1); /* out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3] */ + /* add ecx, output_col */ - movq_r2r(mm2, mm0); - punpcklwd_r2r(mm4, mm6); /* out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7] */ + movq_r2r (mm7, mm6); + punpckhwd_r2r (mm5, mm1); /* out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3] */ - /* mov idata, [dataptr] */ - - punpckldq_r2r(mm6, mm2); /* out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7] */ + movq_r2r (mm2, mm0); + punpcklwd_r2r (mm4, mm6); /* out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7] */ - /* add dataptr, 4 */ - - movq_r2r(mm1, mm3); + /* mov idata, [dataptr] */ - /* add idata, output_col */ - - punpckhwd_r2r(mm4, mm7); /* out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7] */ - - movq_r2m(mm2, *(dataptr)); - - punpckhdq_r2r(mm6, mm0); /* out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7] */ + punpckldq_r2r (mm6, mm2); /* out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7] */ - dataptr += rskip; - movq_r2m(mm0, *(dataptr)); + /* add dataptr, 4 */ - punpckldq_r2r(mm7, mm1); /* out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7] */ - punpckhdq_r2r(mm7, mm3); /* out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7] */ - - dataptr += rskip; - movq_r2m(mm1, *(dataptr)); + movq_r2r (mm1, mm3); - dataptr += rskip; - movq_r2m(mm3, *(dataptr)); + /* add idata, output_col */ + + punpckhwd_r2r (mm4, mm7); /* out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7] */ + + movq_r2m (mm2, *(dataptr)); + + punpckhdq_r2r (mm6, mm0); /* out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7] */ + + dataptr += rskip; + movq_r2m (mm0, *(dataptr)); + + punpckldq_r2r (mm7, mm1); /* out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7] */ + punpckhdq_r2r (mm7, mm3); /* out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7] */ + + dataptr += rskip; + movq_r2m (mm1, *(dataptr)); + + dataptr += rskip; + movq_r2m (mm3, *(dataptr)); /*******************************************************************/ - wsptr += 8; + wsptr += 8; /*******************************************************************/ @@ -1882,249 +1887,249 @@ static mmx_t fix_108n184 = (mmx_t)(long long)0xcf04cf04cf04cf04LL; /* tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); */ /* tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); */ /* tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]); */ - movq_m2r(*(wsptr), mm0); /* wsptr[0,0],[0,1],[0,2],[0,3] */ + movq_m2r (*(wsptr), mm0); /* wsptr[0,0],[0,1],[0,2],[0,3] */ + + movq_m2r (*(wsptr + 1), mm1); /* wsptr[0,4],[0,5],[0,6],[0,7] */ + movq_r2r (mm0, mm2); - movq_m2r(*(wsptr+1), mm1); /* wsptr[0,4],[0,5],[0,6],[0,7] */ - movq_r2r(mm0, mm2); - - movq_m2r(*(wsptr+2), mm3); /* wsptr[1,0],[1,1],[1,2],[1,3] */ - paddw_r2r(mm1, mm0); /* wsptr[0,tmp10],[xxx],[0,tmp13],[xxx] */ + movq_m2r (*(wsptr + 2), mm3); /* wsptr[1,0],[1,1],[1,2],[1,3] */ + paddw_r2r (mm1, mm0); /* wsptr[0,tmp10],[xxx],[0,tmp13],[xxx] */ - movq_m2r(*(wsptr+3), mm4); /* wsptr[1,4],[1,5],[1,6],[1,7] */ - psubw_r2r(mm1, mm2); /* wsptr[0,tmp11],[xxx],[0,tmp14],[xxx] */ + movq_m2r (*(wsptr + 3), mm4); /* wsptr[1,4],[1,5],[1,6],[1,7] */ + psubw_r2r (mm1, mm2); /* wsptr[0,tmp11],[xxx],[0,tmp14],[xxx] */ - movq_r2r(mm0, mm6); - movq_r2r(mm3, mm5); - - paddw_r2r(mm4, mm3); /* wsptr[1,tmp10],[xxx],[1,tmp13],[xxx] */ - movq_r2r(mm2, mm1); + movq_r2r (mm0, mm6); + movq_r2r (mm3, mm5); - psubw_r2r(mm4, mm5); /* wsptr[1,tmp11],[xxx],[1,tmp14],[xxx] */ - punpcklwd_r2r(mm3, mm0); /* wsptr[0,tmp10],[1,tmp10],[xxx],[xxx] */ + paddw_r2r (mm4, mm3); /* wsptr[1,tmp10],[xxx],[1,tmp13],[xxx] */ + movq_r2r (mm2, mm1); - movq_m2r(*(wsptr+7), mm7); /* wsptr[3,4],[3,5],[3,6],[3,7] */ - punpckhwd_r2r(mm3, mm6); /* wsptr[0,tmp13],[1,tmp13],[xxx],[xxx] */ + psubw_r2r (mm4, mm5); /* wsptr[1,tmp11],[xxx],[1,tmp14],[xxx] */ + punpcklwd_r2r (mm3, mm0); /* wsptr[0,tmp10],[1,tmp10],[xxx],[xxx] */ - movq_m2r(*(wsptr+4), mm3); /* wsptr[2,0],[2,1],[2,2],[2,3] */ - punpckldq_r2r(mm6, mm0); /* wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */ + movq_m2r (*(wsptr + 7), mm7); /* wsptr[3,4],[3,5],[3,6],[3,7] */ + punpckhwd_r2r (mm3, mm6); /* wsptr[0,tmp13],[1,tmp13],[xxx],[xxx] */ - punpcklwd_r2r(mm5, mm1); /* wsptr[0,tmp11],[1,tmp11],[xxx],[xxx] */ - movq_r2r(mm3, mm4); + movq_m2r (*(wsptr + 4), mm3); /* wsptr[2,0],[2,1],[2,2],[2,3] */ + punpckldq_r2r (mm6, mm0); /* wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */ - movq_m2r(*(wsptr+6), mm6); /* wsptr[3,0],[3,1],[3,2],[3,3] */ - punpckhwd_r2r(mm5, mm2); /* wsptr[0,tmp14],[1,tmp14],[xxx],[xxx] */ + punpcklwd_r2r (mm5, mm1); /* wsptr[0,tmp11],[1,tmp11],[xxx],[xxx] */ + movq_r2r (mm3, mm4); - movq_m2r(*(wsptr+5), mm5); /* wsptr[2,4],[2,5],[2,6],[2,7] */ - punpckldq_r2r(mm2, mm1); /* wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */ + movq_m2r (*(wsptr + 6), mm6); /* wsptr[3,0],[3,1],[3,2],[3,3] */ + punpckhwd_r2r (mm5, mm2); /* wsptr[0,tmp14],[1,tmp14],[xxx],[xxx] */ - paddw_r2r(mm5, mm3); /* wsptr[2,tmp10],[xxx],[2,tmp13],[xxx] */ - movq_r2r(mm6, mm2); + movq_m2r (*(wsptr + 5), mm5); /* wsptr[2,4],[2,5],[2,6],[2,7] */ + punpckldq_r2r (mm2, mm1); /* wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */ - psubw_r2r(mm5, mm4); /* wsptr[2,tmp11],[xxx],[2,tmp14],[xxx] */ - paddw_r2r(mm7, mm6); /* wsptr[3,tmp10],[xxx],[3,tmp13],[xxx] */ + paddw_r2r (mm5, mm3); /* wsptr[2,tmp10],[xxx],[2,tmp13],[xxx] */ + movq_r2r (mm6, mm2); - movq_r2r(mm3, mm5); - punpcklwd_r2r(mm6, mm3); /* wsptr[2,tmp10],[3,tmp10],[xxx],[xxx] */ - - psubw_r2r(mm7, mm2); /* wsptr[3,tmp11],[xxx],[3,tmp14],[xxx] */ - punpckhwd_r2r(mm6, mm5); /* wsptr[2,tmp13],[3,tmp13],[xxx],[xxx] */ + psubw_r2r (mm5, mm4); /* wsptr[2,tmp11],[xxx],[2,tmp14],[xxx] */ + paddw_r2r (mm7, mm6); /* wsptr[3,tmp10],[xxx],[3,tmp13],[xxx] */ - movq_r2r(mm4, mm7); - punpckldq_r2r(mm5, mm3); /* wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13] */ + movq_r2r (mm3, mm5); + punpcklwd_r2r (mm6, mm3); /* wsptr[2,tmp10],[3,tmp10],[xxx],[xxx] */ - punpcklwd_r2r(mm2, mm4); /* wsptr[2,tmp11],[3,tmp11],[xxx],[xxx] */ + psubw_r2r (mm7, mm2); /* wsptr[3,tmp11],[xxx],[3,tmp14],[xxx] */ + punpckhwd_r2r (mm6, mm5); /* wsptr[2,tmp13],[3,tmp13],[xxx],[xxx] */ - punpckhwd_r2r(mm2, mm7); /* wsptr[2,tmp14],[3,tmp14],[xxx],[xxx] */ + movq_r2r (mm4, mm7); + punpckldq_r2r (mm5, mm3); /* wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13] */ - punpckldq_r2r(mm7, mm4); /* wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14] */ - movq_r2r(mm1, mm6); + punpcklwd_r2r (mm2, mm4); /* wsptr[2,tmp11],[3,tmp11],[xxx],[xxx] */ - /*OK */ + punpckhwd_r2r (mm2, mm7); /* wsptr[2,tmp14],[3,tmp14],[xxx],[xxx] */ + + punpckldq_r2r (mm7, mm4); /* wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14] */ + movq_r2r (mm1, mm6); + + /*OK */ /* mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */ /* mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */ - movq_r2r(mm0, mm2); - punpckhdq_r2r(mm4, mm6); /* wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14] */ + movq_r2r (mm0, mm2); + punpckhdq_r2r (mm4, mm6); /* wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14] */ - punpckldq_r2r(mm4, mm1); /* wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11] */ - psllw_i2r(2, mm6); + punpckldq_r2r (mm4, mm1); /* wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11] */ + psllw_i2r (2, mm6); - pmulhw_m2r(fix_141, mm6); - punpckldq_r2r(mm3, mm0); /* wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10] */ + pmulhw_m2r (fix_141, mm6); + punpckldq_r2r (mm3, mm0); /* wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10] */ - punpckhdq_r2r(mm3, mm2); /* wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13] */ - movq_r2r(mm0, mm7); + punpckhdq_r2r (mm3, mm2); /* wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13] */ + movq_r2r (mm0, mm7); /* tmp0 = tmp10 + tmp13; */ /* tmp3 = tmp10 - tmp13; */ - paddw_r2r(mm2, mm0); /* [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0] */ - psubw_r2r(mm2, mm7); /* [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3] */ + paddw_r2r (mm2, mm0); /* [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0] */ + psubw_r2r (mm2, mm7); /* [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3] */ /* tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13; */ - psubw_r2r(mm2, mm6); /* wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12] */ + psubw_r2r (mm2, mm6); /* wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12] */ /* tmp1 = tmp11 + tmp12; */ /* tmp2 = tmp11 - tmp12; */ - movq_r2r(mm1, mm5); + movq_r2r (mm1, mm5); - /*OK */ + /*OK */ - /* Odd part */ + /* Odd part */ /* z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; */ /* z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; */ /* z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; */ /* z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; */ - movq_m2r(*(wsptr), mm3); /* wsptr[0,0],[0,1],[0,2],[0,3] */ - paddw_r2r(mm6, mm1); /* [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1] */ + movq_m2r (*(wsptr), mm3); /* wsptr[0,0],[0,1],[0,2],[0,3] */ + paddw_r2r (mm6, mm1); /* [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1] */ - movq_m2r(*(wsptr+1), mm4); /* wsptr[0,4],[0,5],[0,6],[0,7] */ - psubw_r2r(mm6, mm5); /* [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2] */ + movq_m2r (*(wsptr + 1), mm4); /* wsptr[0,4],[0,5],[0,6],[0,7] */ + psubw_r2r (mm6, mm5); /* [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2] */ - movq_r2r(mm3, mm6); - punpckldq_r2r(mm4, mm3); /* wsptr[0,0],[0,1],[0,4],[0,5] */ + movq_r2r (mm3, mm6); + punpckldq_r2r (mm4, mm3); /* wsptr[0,0],[0,1],[0,4],[0,5] */ - punpckhdq_r2r(mm6, mm4); /* wsptr[0,6],[0,7],[0,2],[0,3] */ - movq_r2r(mm3, mm2); + punpckhdq_r2r (mm6, mm4); /* wsptr[0,6],[0,7],[0,2],[0,3] */ + movq_r2r (mm3, mm2); /*Save tmp0 and tmp1 in wsptr */ - movq_r2m(mm0, *(wsptr)); /* save tmp0 */ - paddw_r2r(mm4, mm2); /* wsptr[xxx],[0,z11],[xxx],[0,z13] */ + movq_r2m (mm0, *(wsptr)); /* save tmp0 */ + paddw_r2r (mm4, mm2); /* wsptr[xxx],[0,z11],[xxx],[0,z13] */ + - /*Continue with z10 --- z13 */ - movq_m2r(*(wsptr+2), mm6); /* wsptr[1,0],[1,1],[1,2],[1,3] */ - psubw_r2r(mm4, mm3); /* wsptr[xxx],[0,z12],[xxx],[0,z10] */ + movq_m2r (*(wsptr + 2), mm6); /* wsptr[1,0],[1,1],[1,2],[1,3] */ + psubw_r2r (mm4, mm3); /* wsptr[xxx],[0,z12],[xxx],[0,z10] */ - movq_m2r(*(wsptr+3), mm0); /* wsptr[1,4],[1,5],[1,6],[1,7] */ - movq_r2r(mm6, mm4); + movq_m2r (*(wsptr + 3), mm0); /* wsptr[1,4],[1,5],[1,6],[1,7] */ + movq_r2r (mm6, mm4); - movq_r2m(mm1, *(wsptr+1)); /* save tmp1 */ - punpckldq_r2r(mm0, mm6); /* wsptr[1,0],[1,1],[1,4],[1,5] */ + movq_r2m (mm1, *(wsptr + 1)); /* save tmp1 */ + punpckldq_r2r (mm0, mm6); /* wsptr[1,0],[1,1],[1,4],[1,5] */ + + punpckhdq_r2r (mm4, mm0); /* wsptr[1,6],[1,7],[1,2],[1,3] */ + movq_r2r (mm6, mm1); - punpckhdq_r2r(mm4, mm0); /* wsptr[1,6],[1,7],[1,2],[1,3] */ - movq_r2r(mm6, mm1); - /*Save tmp2 and tmp3 in wsptr */ - paddw_r2r(mm0, mm6); /* wsptr[xxx],[1,z11],[xxx],[1,z13] */ - movq_r2r(mm2, mm4); - + paddw_r2r (mm0, mm6); /* wsptr[xxx],[1,z11],[xxx],[1,z13] */ + movq_r2r (mm2, mm4); + /*Continue with z10 --- z13 */ - movq_r2m(mm5, *(wsptr+2)); /* save tmp2 */ - punpcklwd_r2r(mm6, mm2); /* wsptr[xxx],[xxx],[0,z11],[1,z11] */ + movq_r2m (mm5, *(wsptr + 2)); /* save tmp2 */ + punpcklwd_r2r (mm6, mm2); /* wsptr[xxx],[xxx],[0,z11],[1,z11] */ - psubw_r2r(mm0, mm1); /* wsptr[xxx],[1,z12],[xxx],[1,z10] */ - punpckhwd_r2r(mm6, mm4); /* wsptr[xxx],[xxx],[0,z13],[1,z13] */ + psubw_r2r (mm0, mm1); /* wsptr[xxx],[1,z12],[xxx],[1,z10] */ + punpckhwd_r2r (mm6, mm4); /* wsptr[xxx],[xxx],[0,z13],[1,z13] */ - movq_r2r(mm3, mm0); - punpcklwd_r2r(mm1, mm3); /* wsptr[xxx],[xxx],[0,z12],[1,z12] */ + movq_r2r (mm3, mm0); + punpcklwd_r2r (mm1, mm3); /* wsptr[xxx],[xxx],[0,z12],[1,z12] */ - movq_r2m(mm7, *(wsptr+3)); /* save tmp3 */ - punpckhwd_r2r(mm1, mm0); /* wsptr[xxx],[xxx],[0,z10],[1,z10] */ + movq_r2m (mm7, *(wsptr + 3)); /* save tmp3 */ + punpckhwd_r2r (mm1, mm0); /* wsptr[xxx],[xxx],[0,z10],[1,z10] */ - movq_m2r(*(wsptr+4), mm6); /* wsptr[2,0],[2,1],[2,2],[2,3] */ - punpckhdq_r2r(mm2, mm0); /* wsptr[0,z10],[1,z10],[0,z11],[1,z11] */ + movq_m2r (*(wsptr + 4), mm6); /* wsptr[2,0],[2,1],[2,2],[2,3] */ + punpckhdq_r2r (mm2, mm0); /* wsptr[0,z10],[1,z10],[0,z11],[1,z11] */ - movq_m2r(*(wsptr+5), mm7); /* wsptr[2,4],[2,5],[2,6],[2,7] */ - punpckhdq_r2r(mm4, mm3); /* wsptr[0,z12],[1,z12],[0,z13],[1,z13] */ + movq_m2r (*(wsptr + 5), mm7); /* wsptr[2,4],[2,5],[2,6],[2,7] */ + punpckhdq_r2r (mm4, mm3); /* wsptr[0,z12],[1,z12],[0,z13],[1,z13] */ - movq_m2r(*(wsptr+6), mm1); /* wsptr[3,0],[3,1],[3,2],[3,3] */ - movq_r2r(mm6, mm4); + movq_m2r (*(wsptr + 6), mm1); /* wsptr[3,0],[3,1],[3,2],[3,3] */ + movq_r2r (mm6, mm4); - punpckldq_r2r(mm7, mm6); /* wsptr[2,0],[2,1],[2,4],[2,5] */ - movq_r2r(mm1, mm5); + punpckldq_r2r (mm7, mm6); /* wsptr[2,0],[2,1],[2,4],[2,5] */ + movq_r2r (mm1, mm5); - punpckhdq_r2r(mm4, mm7); /* wsptr[2,6],[2,7],[2,2],[2,3] */ - movq_r2r(mm6, mm2); - - movq_m2r(*(wsptr+7), mm4); /* wsptr[3,4],[3,5],[3,6],[3,7] */ - paddw_r2r(mm7, mm6); /* wsptr[xxx],[2,z11],[xxx],[2,z13] */ + punpckhdq_r2r (mm4, mm7); /* wsptr[2,6],[2,7],[2,2],[2,3] */ + movq_r2r (mm6, mm2); - psubw_r2r(mm7, mm2); /* wsptr[xxx],[2,z12],[xxx],[2,z10] */ - punpckldq_r2r(mm4, mm1); /* wsptr[3,0],[3,1],[3,4],[3,5] */ + movq_m2r (*(wsptr + 7), mm4); /* wsptr[3,4],[3,5],[3,6],[3,7] */ + paddw_r2r (mm7, mm6); /* wsptr[xxx],[2,z11],[xxx],[2,z13] */ - punpckhdq_r2r(mm5, mm4); /* wsptr[3,6],[3,7],[3,2],[3,3] */ - movq_r2r(mm1, mm7); + psubw_r2r (mm7, mm2); /* wsptr[xxx],[2,z12],[xxx],[2,z10] */ + punpckldq_r2r (mm4, mm1); /* wsptr[3,0],[3,1],[3,4],[3,5] */ - paddw_r2r(mm4, mm1); /* wsptr[xxx],[3,z11],[xxx],[3,z13] */ - psubw_r2r(mm4, mm7); /* wsptr[xxx],[3,z12],[xxx],[3,z10] */ + punpckhdq_r2r (mm5, mm4); /* wsptr[3,6],[3,7],[3,2],[3,3] */ + movq_r2r (mm1, mm7); - movq_r2r(mm6, mm5); - punpcklwd_r2r(mm1, mm6); /* wsptr[xxx],[xxx],[2,z11],[3,z11] */ + paddw_r2r (mm4, mm1); /* wsptr[xxx],[3,z11],[xxx],[3,z13] */ + psubw_r2r (mm4, mm7); /* wsptr[xxx],[3,z12],[xxx],[3,z10] */ - punpckhwd_r2r(mm1, mm5); /* wsptr[xxx],[xxx],[2,z13],[3,z13] */ - movq_r2r(mm2, mm4); + movq_r2r (mm6, mm5); + punpcklwd_r2r (mm1, mm6); /* wsptr[xxx],[xxx],[2,z11],[3,z11] */ - punpcklwd_r2r(mm7, mm2); /* wsptr[xxx],[xxx],[2,z12],[3,z12] */ + punpckhwd_r2r (mm1, mm5); /* wsptr[xxx],[xxx],[2,z13],[3,z13] */ + movq_r2r (mm2, mm4); - punpckhwd_r2r(mm7, mm4); /* wsptr[xxx],[xxx],[2,z10],[3,z10] */ + punpcklwd_r2r (mm7, mm2); /* wsptr[xxx],[xxx],[2,z12],[3,z12] */ - punpckhdq_r2r(mm6, mm4); /* wsptr[2,z10],[3,z10],[2,z11],[3,z11] */ + punpckhwd_r2r (mm7, mm4); /* wsptr[xxx],[xxx],[2,z10],[3,z10] */ - punpckhdq_r2r(mm5, mm2); /* wsptr[2,z12],[3,z12],[2,z13],[3,z13] */ - movq_r2r(mm0, mm5); + punpckhdq_r2r (mm6, mm4); /* wsptr[2,z10],[3,z10],[2,z11],[3,z11] */ - punpckldq_r2r(mm4, mm0); /* wsptr[0,z10],[1,z10],[2,z10],[3,z10] */ + punpckhdq_r2r (mm5, mm2); /* wsptr[2,z12],[3,z12],[2,z13],[3,z13] */ + movq_r2r (mm0, mm5); - punpckhdq_r2r(mm4, mm5); /* wsptr[0,z11],[1,z11],[2,z11],[3,z11] */ - movq_r2r(mm3, mm4); + punpckldq_r2r (mm4, mm0); /* wsptr[0,z10],[1,z10],[2,z10],[3,z10] */ - punpckhdq_r2r(mm2, mm4); /* wsptr[0,z13],[1,z13],[2,z13],[3,z13] */ - movq_r2r(mm5, mm1); + punpckhdq_r2r (mm4, mm5); /* wsptr[0,z11],[1,z11],[2,z11],[3,z11] */ + movq_r2r (mm3, mm4); - punpckldq_r2r(mm2, mm3); /* wsptr[0,z12],[1,z12],[2,z12],[3,z12] */ + punpckhdq_r2r (mm2, mm4); /* wsptr[0,z13],[1,z13],[2,z13],[3,z13] */ + movq_r2r (mm5, mm1); + + punpckldq_r2r (mm2, mm3); /* wsptr[0,z12],[1,z12],[2,z12],[3,z12] */ /* tmp7 = z11 + z13; : phase 5 */ /* tmp8 = z11 - z13; : phase 5 */ - psubw_r2r(mm4, mm1); /* tmp8 */ + psubw_r2r (mm4, mm1); /* tmp8 */ - paddw_r2r(mm4, mm5); /* tmp7 */ + paddw_r2r (mm4, mm5); /* tmp7 */ /* tmp21 = MULTIPLY(tmp8, FIX_1_414213562); 2*c4 */ - psllw_i2r(2, mm1); + psllw_i2r (2, mm1); - psllw_i2r(2, mm0); + psllw_i2r (2, mm0); - pmulhw_m2r(fix_141, mm1); /* tmp21 */ + pmulhw_m2r (fix_141, mm1); /* tmp21 */ /* tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) : 2*(c2-c6) */ /* + MULTIPLY(z10, - FIX_1_847759065); : 2*c2 */ - psllw_i2r(2, mm3); - movq_r2r(mm0, mm7); + psllw_i2r (2, mm3); + movq_r2r (mm0, mm7); - pmulhw_m2r(fix_n184, mm7); - movq_r2r(mm3, mm6); + pmulhw_m2r (fix_n184, mm7); + movq_r2r (mm3, mm6); - movq_m2r(*(wsptr), mm2); /* tmp0,final1 */ + movq_m2r (*(wsptr), mm2); /* tmp0,final1 */ - pmulhw_m2r(fix_108n184, mm6); + pmulhw_m2r (fix_108n184, mm6); /* tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) : -2*(c2+c6) */ /* + MULTIPLY(z12, FIX_1_847759065); : 2*c2 */ - movq_r2r(mm2, mm4); /* final1 */ - - pmulhw_m2r(fix_184n261, mm0); - paddw_r2r(mm5, mm2); /* tmp0+tmp7,final1 */ + movq_r2r (mm2, mm4); /* final1 */ + + pmulhw_m2r (fix_184n261, mm0); + paddw_r2r (mm5, mm2); /* tmp0+tmp7,final1 */ - pmulhw_m2r(fix_184, mm3); - psubw_r2r(mm5, mm4); /* tmp0-tmp7,final1 */ + pmulhw_m2r (fix_184, mm3); + psubw_r2r (mm5, mm4); /* tmp0-tmp7,final1 */ /* tmp6 = tmp22 - tmp7; phase 2 */ - psraw_i2r(3, mm2); /* outptr[0,0],[1,0],[2,0],[3,0],final1 */ + psraw_i2r (3, mm2); /* outptr[0,0],[1,0],[2,0],[3,0],final1 */ - paddw_r2r(mm6, mm7); /* tmp20 */ - psraw_i2r(3, mm4); /* outptr[0,7],[1,7],[2,7],[3,7],final1 */ + paddw_r2r (mm6, mm7); /* tmp20 */ + psraw_i2r (3, mm4); /* outptr[0,7],[1,7],[2,7],[3,7],final1 */ - paddw_r2r(mm0, mm3); /* tmp22 */ + paddw_r2r (mm0, mm3); /* tmp22 */ /* tmp5 = tmp21 - tmp6; */ - psubw_r2r(mm5, mm3); /* tmp6 */ + psubw_r2r (mm5, mm3); /* tmp6 */ /* tmp4 = tmp20 + tmp5; */ - movq_m2r(*(wsptr+1), mm0); /* tmp1,final2 */ - psubw_r2r(mm3, mm1); /* tmp5 */ + movq_m2r (*(wsptr + 1), mm0); /* tmp1,final2 */ + psubw_r2r (mm3, mm1); /* tmp5 */ - movq_r2r(mm0, mm6); /* final2 */ - paddw_r2r(mm3, mm0); /* tmp1+tmp6,final2 */ + movq_r2r (mm0, mm6); /* final2 */ + paddw_r2r (mm3, mm0); /* tmp1+tmp6,final2 */ - /* Final output stage: scale down by a factor of 8 and range-limit */ + /* Final output stage: scale down by a factor of 8 and range-limit */ /* outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) */ /* & RANGE_MASK]; */ @@ -2136,30 +2141,30 @@ static mmx_t fix_108n184 = (mmx_t)(long long)0xcf04cf04cf04cf04LL; /* & RANGE_MASK]; */ /* outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) */ /* & RANGE_MASK]; final2 */ - psubw_r2r(mm3, mm6); /* tmp1-tmp6,final2 */ - psraw_i2r(3, mm0); /* outptr[0,1],[1,1],[2,1],[3,1] */ + psubw_r2r (mm3, mm6); /* tmp1-tmp6,final2 */ + psraw_i2r (3, mm0); /* outptr[0,1],[1,1],[2,1],[3,1] */ + + psraw_i2r (3, mm6); /* outptr[0,6],[1,6],[2,6],[3,6] */ + + packuswb_r2r (mm4, mm0); /* out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7] */ - psraw_i2r(3, mm6); /* outptr[0,6],[1,6],[2,6],[3,6] */ - - packuswb_r2r(mm4, mm0); /* out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7] */ - - movq_m2r(*(wsptr+2), mm5); /* tmp2,final3 */ - packuswb_r2r(mm6, mm2); /* out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6] */ + movq_m2r (*(wsptr + 2), mm5); /* tmp2,final3 */ + packuswb_r2r (mm6, mm2); /* out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6] */ /* outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) */ /* & RANGE_MASK]; */ /* outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) */ /* & RANGE_MASK]; final3 */ - paddw_r2r(mm1, mm7); /* tmp4 */ - movq_r2r(mm5, mm3); + paddw_r2r (mm1, mm7); /* tmp4 */ + movq_r2r (mm5, mm3); - paddw_r2r(mm1, mm5); /* tmp2+tmp5 */ - psubw_r2r(mm1, mm3); /* tmp2-tmp5 */ + paddw_r2r (mm1, mm5); /* tmp2+tmp5 */ + psubw_r2r (mm1, mm3); /* tmp2-tmp5 */ - psraw_i2r(3, mm5); /* outptr[0,2],[1,2],[2,2],[3,2] */ + psraw_i2r (3, mm5); /* outptr[0,2],[1,2],[2,2],[3,2] */ - movq_m2r(*(wsptr+3), mm4); /* tmp3,final4 */ - psraw_i2r(3, mm3); /* outptr[0,5],[1,5],[2,5],[3,5] */ + movq_m2r (*(wsptr + 3), mm4); /* tmp3,final4 */ + psraw_i2r (3, mm3); /* outptr[0,5],[1,5],[2,5],[3,5] */ @@ -2167,68 +2172,68 @@ static mmx_t fix_108n184 = (mmx_t)(long long)0xcf04cf04cf04cf04LL; /* & RANGE_MASK]; */ /* outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) */ /* & RANGE_MASK]; final4 */ - movq_r2r(mm4, mm6); - paddw_r2r(mm7, mm4); /* tmp3+tmp4 */ + movq_r2r (mm4, mm6); + paddw_r2r (mm7, mm4); /* tmp3+tmp4 */ - psubw_r2r(mm7, mm6); /* tmp3-tmp4 */ - psraw_i2r(3, mm4); /* outptr[0,4],[1,4],[2,4],[3,4] */ + psubw_r2r (mm7, mm6); /* tmp3-tmp4 */ + psraw_i2r (3, mm4); /* outptr[0,4],[1,4],[2,4],[3,4] */ - psraw_i2r(3, mm6); /* outptr[0,3],[1,3],[2,3],[3,3] */ + psraw_i2r (3, mm6); /* outptr[0,3],[1,3],[2,3],[3,3] */ - /* - movq_r2m(mm4, *dummy); - fprintf(stderr, "3-4 %016llx\n", dummy); - movq_r2m(mm4, *dummy); - fprintf(stderr, "3+4 %016llx\n", dummy); - */ - + /* + movq_r2m(mm4, *dummy); + fprintf(stderr, "3-4 %016llx\n", dummy); + movq_r2m(mm4, *dummy); + fprintf(stderr, "3+4 %016llx\n", dummy); + */ - packuswb_r2r(mm4, mm5); /* out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4] */ - packuswb_r2r(mm3, mm6); /* out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5] */ - movq_r2r(mm2, mm4); + packuswb_r2r (mm4, mm5); /* out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4] */ - movq_r2r(mm5, mm7); - punpcklbw_r2r(mm0, mm2); /* out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1] */ + packuswb_r2r (mm3, mm6); /* out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5] */ + movq_r2r (mm2, mm4); - punpckhbw_r2r(mm0, mm4); /* out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7] */ - movq_r2r(mm2, mm1); + movq_r2r (mm5, mm7); + punpcklbw_r2r (mm0, mm2); /* out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1] */ - punpcklbw_r2r(mm6, mm5); /* out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3] */ - - punpckhbw_r2r(mm6, mm7); /* out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5] */ + punpckhbw_r2r (mm0, mm4); /* out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7] */ + movq_r2r (mm2, mm1); - punpcklwd_r2r(mm5, mm2); /* out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3] */ - - movq_r2r(mm7, mm6); - punpckhwd_r2r(mm5, mm1); /* out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3] */ + punpcklbw_r2r (mm6, mm5); /* out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3] */ - movq_r2r(mm2, mm0); - punpcklwd_r2r(mm4, mm6); /* out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7] */ + punpckhbw_r2r (mm6, mm7); /* out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5] */ - punpckldq_r2r(mm6, mm2); /* out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7] */ + punpcklwd_r2r (mm5, mm2); /* out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3] */ - movq_r2r(mm1, mm3); + movq_r2r (mm7, mm6); + punpckhwd_r2r (mm5, mm1); /* out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3] */ - punpckhwd_r2r(mm4, mm7); /* out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7] */ - - dataptr += rskip; - movq_r2m(mm2, *(dataptr)); + movq_r2r (mm2, mm0); + punpcklwd_r2r (mm4, mm6); /* out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7] */ - punpckhdq_r2r(mm6, mm0); /* out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7] */ + punpckldq_r2r (mm6, mm2); /* out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7] */ - dataptr += rskip; - movq_r2m(mm0, *(dataptr)); + movq_r2r (mm1, mm3); - punpckldq_r2r(mm7, mm1); /* out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7] */ - - punpckhdq_r2r(mm7, mm3); /* out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7] */ + punpckhwd_r2r (mm4, mm7); /* out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7] */ - dataptr += rskip; - movq_r2m(mm1, *(dataptr)); + dataptr += rskip; + movq_r2m (mm2, *(dataptr)); - dataptr += rskip; - movq_r2m(mm3, *(dataptr)); + punpckhdq_r2r (mm6, mm0); /* out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7] */ + + dataptr += rskip; + movq_r2m (mm0, *(dataptr)); + + punpckldq_r2r (mm7, mm1); /* out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7] */ + + punpckhdq_r2r (mm7, mm3); /* out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7] */ + + dataptr += rskip; + movq_r2m (mm1, *(dataptr)); + + dataptr += rskip; + movq_r2m (mm3, *(dataptr)); #else __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; @@ -2244,9 +2249,9 @@ static mmx_t fix_108n184 = (mmx_t)(long long)0xcf04cf04cf04cf04LL; inptr = data; wsptr = workspace; for (ctr = 8; ctr > 0; ctr--) { - + if ((inptr[8] | inptr[16] | inptr[24] | - inptr[32] | inptr[40] | inptr[48] | inptr[56]) == 0) { + inptr[32] | inptr[40] | inptr[48] | inptr[56]) == 0) { dcval = inptr[0]; wsptr[0] = dcval; wsptr[8] = dcval; @@ -2256,12 +2261,12 @@ static mmx_t fix_108n184 = (mmx_t)(long long)0xcf04cf04cf04cf04LL; wsptr[40] = dcval; wsptr[48] = dcval; wsptr[56] = dcval; - - inptr++; + + inptr++; wsptr++; continue; - } - + } + tmp0 = inptr[0]; tmp1 = inptr[16]; tmp2 = inptr[32]; @@ -2271,13 +2276,13 @@ static mmx_t fix_108n184 = (mmx_t)(long long)0xcf04cf04cf04cf04LL; tmp11 = tmp0 - tmp2; tmp13 = tmp1 + tmp3; - tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; + tmp12 = MULTIPLY (tmp1 - tmp3, FIX_1_414213562) - tmp13; tmp0 = tmp10 + tmp13; tmp3 = tmp10 - tmp13; tmp1 = tmp11 + tmp12; tmp2 = tmp11 - tmp12; - + tmp4 = inptr[8]; tmp5 = inptr[24]; tmp6 = inptr[40]; @@ -2289,11 +2294,11 @@ static mmx_t fix_108n184 = (mmx_t)(long long)0xcf04cf04cf04cf04LL; z12 = tmp4 - tmp7; tmp7 = z11 + z13; - tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); + tmp11 = MULTIPLY (z11 - z13, FIX_1_414213562); - z5 = MULTIPLY(z10 + z12, FIX_1_847759065); - tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; - tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; + z5 = MULTIPLY (z10 + z12, FIX_1_847759065); + tmp10 = MULTIPLY (z12, FIX_1_082392200) - z5; + tmp12 = MULTIPLY (z10, -FIX_2_613125930) + z5; tmp6 = tmp12 - tmp7; tmp5 = tmp11 - tmp6; @@ -2314,13 +2319,13 @@ static mmx_t fix_108n184 = (mmx_t)(long long)0xcf04cf04cf04cf04LL; wsptr = workspace; for (ctr = 0; ctr < 8; ctr++) { - outptr = &(odata[ctr*rskip]); + outptr = &(odata[ctr * rskip]); tmp10 = wsptr[0] + wsptr[4]; tmp11 = wsptr[0] - wsptr[4]; tmp13 = wsptr[2] + wsptr[6]; - tmp12 = MULTIPLY(wsptr[2] - wsptr[6], FIX_1_414213562) - tmp13; + tmp12 = MULTIPLY (wsptr[2] - wsptr[6], FIX_1_414213562) - tmp13; tmp0 = tmp10 + tmp13; tmp3 = tmp10 - tmp13; @@ -2333,29 +2338,30 @@ static mmx_t fix_108n184 = (mmx_t)(long long)0xcf04cf04cf04cf04LL; z12 = wsptr[1] - wsptr[7]; tmp7 = z11 + z13; - tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); + tmp11 = MULTIPLY (z11 - z13, FIX_1_414213562); - z5 = MULTIPLY(z10 + z12, FIX_1_847759065); - tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; - tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; + z5 = MULTIPLY (z10 + z12, FIX_1_847759065); + tmp10 = MULTIPLY (z12, FIX_1_082392200) - z5; + tmp12 = MULTIPLY (z10, -FIX_2_613125930) + z5; tmp6 = tmp12 - tmp7; tmp5 = tmp11 - tmp6; tmp4 = tmp10 + tmp5; - outptr[0] = RL(DESCALE(tmp0 + tmp7)); - outptr[7] = RL(DESCALE(tmp0 - tmp7)); - outptr[1] = RL(DESCALE(tmp1 + tmp6)); - outptr[6] = RL(DESCALE(tmp1 - tmp6)); - outptr[2] = RL(DESCALE(tmp2 + tmp5)); - outptr[5] = RL(DESCALE(tmp2 - tmp5)); - outptr[4] = RL(DESCALE(tmp3 + tmp4)); - outptr[3] = RL(DESCALE(tmp3 - tmp4)); + outptr[0] = RL (DESCALE (tmp0 + tmp7)); + outptr[7] = RL (DESCALE (tmp0 - tmp7)); + outptr[1] = RL (DESCALE (tmp1 + tmp6)); + outptr[6] = RL (DESCALE (tmp1 - tmp6)); + outptr[2] = RL (DESCALE (tmp2 + tmp5)); + outptr[5] = RL (DESCALE (tmp2 - tmp5)); + outptr[4] = RL (DESCALE (tmp3 + tmp4)); + outptr[3] = RL (DESCALE (tmp3 - tmp4)); wsptr += 8; } #endif } + /* Main Routines @@ -2374,24 +2380,25 @@ Initialise all the cache-aliged data blocks */ -void RTjpeg_init_data(void) +void +RTjpeg_init_data (void) { - unsigned long dptr; - - dptr=(unsigned long)&(RTjpeg_alldata[0]); - dptr+=32; - dptr=dptr>>5; - dptr=dptr<<5; /* cache align data */ - - RTjpeg_block=(__s16 *)dptr; - dptr+=sizeof(__s16)*64; - RTjpeg_lqt=(__s32 *)dptr; - dptr+=sizeof(__s32)*64; - RTjpeg_cqt=(__s32 *)dptr; - dptr+=sizeof(__s32)*64; - RTjpeg_liqt=(__u32 *)dptr; - dptr+=sizeof(__u32)*64; - RTjpeg_ciqt=(__u32 *)dptr; + unsigned long dptr; + + dptr = (unsigned long) &(RTjpeg_alldata[0]); + dptr += 32; + dptr = dptr >> 5; + dptr = dptr << 5; /* cache align data */ + + RTjpeg_block = (__s16 *) dptr; + dptr += sizeof (__s16) * 64; + RTjpeg_lqt = (__s32 *) dptr; + dptr += sizeof (__s32) * 64; + RTjpeg_cqt = (__s32 *) dptr; + dptr += sizeof (__s32) * 64; + RTjpeg_liqt = (__u32 *) dptr; + dptr += sizeof (__u32) * 64; + RTjpeg_ciqt = (__u32 *) dptr; } /* @@ -2405,35 +2412,39 @@ Input: buf -> pointer to 128 ints for quant values store to pass back to Q -> quality factor (192=best, 32=worst) */ -void RTjpeg_init_Q(__u8 Q) +void +RTjpeg_init_Q (__u8 Q) { - int i; - __u64 qual; - - qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */ - - for(i=0; i<64; i++) - { - RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3); - if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1; - RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3); - if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1; - RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3); - RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3); - RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3; - RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3; - } - - RTjpeg_lb8=0; - while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8); - RTjpeg_lb8--; - RTjpeg_cb8=0; - while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8); - RTjpeg_cb8--; - - RTjpeg_dct_init(); - RTjpeg_idct_init(); - RTjpeg_quant_init(); + int i; + __u64 qual; + + qual = (__u64) Q << (32 - 7); /* 32 bit FP, 255=2, 0=0 */ + + for (i = 0; i < 64; i++) { + RTjpeg_lqt[i] = + (__s32) ((qual / ((__u64) RTjpeg_lum_quant_tbl[i] << 16)) >> 3); + if (RTjpeg_lqt[i] == 0) + RTjpeg_lqt[i] = 1; + RTjpeg_cqt[i] = + (__s32) ((qual / ((__u64) RTjpeg_chrom_quant_tbl[i] << 16)) >> 3); + if (RTjpeg_cqt[i] == 0) + RTjpeg_cqt[i] = 1; + RTjpeg_liqt[i] = (1 << 16) / (RTjpeg_lqt[i] << 3); + RTjpeg_ciqt[i] = (1 << 16) / (RTjpeg_cqt[i] << 3); + RTjpeg_lqt[i] = ((1 << 16) / RTjpeg_liqt[i]) >> 3; + RTjpeg_cqt[i] = ((1 << 16) / RTjpeg_ciqt[i]) >> 3; + } + + RTjpeg_lb8 = 0; + while (RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]] <= 8); + RTjpeg_lb8--; + RTjpeg_cb8 = 0; + while (RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]] <= 8); + RTjpeg_cb8--; + + RTjpeg_dct_init (); + RTjpeg_idct_init (); + RTjpeg_quant_init (); } /* @@ -2450,337 +2461,338 @@ Input: buf -> pointer to 128 ints for quant values store to pass back to */ -void RTjpeg_init_compress(__u32 *buf, int width, int height, __u8 Q) +void +RTjpeg_init_compress (__u32 * buf, int width, int height, __u8 Q) { - int i; - __u64 qual; - - RTjpeg_init_data(); - - RTjpeg_width=width; - RTjpeg_height=height; - RTjpeg_Ywidth = RTjpeg_width>>3; - RTjpeg_Ysize=width * height; - RTjpeg_Cwidth = RTjpeg_width>>4; - RTjpeg_Csize= (width>>1) * height; - - qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */ - - for(i=0; i<64; i++) - { - RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3); - if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1; - RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3); - if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1; - RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3); - RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3); - RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3; - RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3; - } - - RTjpeg_lb8=0; - while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8); - RTjpeg_lb8--; - RTjpeg_cb8=0; - while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8); - RTjpeg_cb8--; - - RTjpeg_dct_init(); - RTjpeg_quant_init(); - - for(i=0; i<64; i++) - buf[i]=RTjpeg_liqt[i]; - for(i=0; i<64; i++) - buf[64+i]=RTjpeg_ciqt[i]; + int i; + __u64 qual; + + RTjpeg_init_data (); + + RTjpeg_width = width; + RTjpeg_height = height; + RTjpeg_Ywidth = RTjpeg_width >> 3; + RTjpeg_Ysize = width * height; + RTjpeg_Cwidth = RTjpeg_width >> 4; + RTjpeg_Csize = (width >> 1) * height; + + qual = (__u64) Q << (32 - 7); /* 32 bit FP, 255=2, 0=0 */ + + for (i = 0; i < 64; i++) { + RTjpeg_lqt[i] = + (__s32) ((qual / ((__u64) RTjpeg_lum_quant_tbl[i] << 16)) >> 3); + if (RTjpeg_lqt[i] == 0) + RTjpeg_lqt[i] = 1; + RTjpeg_cqt[i] = + (__s32) ((qual / ((__u64) RTjpeg_chrom_quant_tbl[i] << 16)) >> 3); + if (RTjpeg_cqt[i] == 0) + RTjpeg_cqt[i] = 1; + RTjpeg_liqt[i] = (1 << 16) / (RTjpeg_lqt[i] << 3); + RTjpeg_ciqt[i] = (1 << 16) / (RTjpeg_cqt[i] << 3); + RTjpeg_lqt[i] = ((1 << 16) / RTjpeg_liqt[i]) >> 3; + RTjpeg_cqt[i] = ((1 << 16) / RTjpeg_ciqt[i]) >> 3; + } + + RTjpeg_lb8 = 0; + while (RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]] <= 8); + RTjpeg_lb8--; + RTjpeg_cb8 = 0; + while (RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]] <= 8); + RTjpeg_cb8--; + + RTjpeg_dct_init (); + RTjpeg_quant_init (); + + for (i = 0; i < 64; i++) + buf[i] = RTjpeg_liqt[i]; + for (i = 0; i < 64; i++) + buf[64 + i] = RTjpeg_ciqt[i]; } -void RTjpeg_init_decompress(__u32 *buf, int width, int height) +void +RTjpeg_init_decompress (__u32 * buf, int width, int height) { - int i; - - RTjpeg_init_data(); - - RTjpeg_width=width; - RTjpeg_height=height; - RTjpeg_Ywidth = RTjpeg_width>>3; - RTjpeg_Ysize=width * height; - RTjpeg_Cwidth = RTjpeg_width>>4; - RTjpeg_Csize= (width>>1) * height; - - for(i=0; i<64; i++) - { - RTjpeg_liqt[i]=buf[i]; - RTjpeg_ciqt[i]=buf[i+64]; - } - - RTjpeg_lb8=0; - while(RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]]<=8); - RTjpeg_lb8--; - RTjpeg_cb8=0; - while(RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]]<=8); - RTjpeg_cb8--; - - RTjpeg_idct_init(); + int i; + + RTjpeg_init_data (); + + RTjpeg_width = width; + RTjpeg_height = height; + RTjpeg_Ywidth = RTjpeg_width >> 3; + RTjpeg_Ysize = width * height; + RTjpeg_Cwidth = RTjpeg_width >> 4; + RTjpeg_Csize = (width >> 1) * height; + + for (i = 0; i < 64; i++) { + RTjpeg_liqt[i] = buf[i]; + RTjpeg_ciqt[i] = buf[i + 64]; + } + + RTjpeg_lb8 = 0; + while (RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]] <= 8); + RTjpeg_lb8--; + RTjpeg_cb8 = 0; + while (RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]] <= 8); + RTjpeg_cb8--; + + RTjpeg_idct_init (); /* RTjpeg_color_init(); */ } -int RTjpeg_compressYUV420(__s8 *sp, unsigned char *bp) +int +RTjpeg_compressYUV420 (__s8 * sp, unsigned char *bp) { - __s8 * sb; - register __s8 * bp1 = bp + (RTjpeg_width<<3); - register __s8 * bp2 = bp + RTjpeg_Ysize; - register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1); - register int i, j, k; + __s8 *sb; + register __s8 *bp1 = bp + (RTjpeg_width << 3); + register __s8 *bp2 = bp + RTjpeg_Ysize; + register __s8 *bp3 = bp2 + (RTjpeg_Csize >> 1); + register int i, j, k; #ifdef HAVE_LIBMMX - emms(); + emms (); #endif - sb=sp; + sb = sp; /* Y */ - for(i=RTjpeg_height>>1; i; i-=8) - { - for(j=0, k=0; j> 1; i; i -= 8) { + for (j = 0, k = 0; j < RTjpeg_width; j += 16, k += 8) { + RTjpeg_dctY (bp + j, RTjpeg_block, RTjpeg_Ywidth); + RTjpeg_quant (RTjpeg_block, RTjpeg_lqt); + sp += RTjpeg_b2s (RTjpeg_block, sp, RTjpeg_lb8); + + RTjpeg_dctY (bp + j + 8, RTjpeg_block, RTjpeg_Ywidth); + RTjpeg_quant (RTjpeg_block, RTjpeg_lqt); + sp += RTjpeg_b2s (RTjpeg_block, sp, RTjpeg_lb8); + + RTjpeg_dctY (bp1 + j, RTjpeg_block, RTjpeg_Ywidth); + RTjpeg_quant (RTjpeg_block, RTjpeg_lqt); + sp += RTjpeg_b2s (RTjpeg_block, sp, RTjpeg_lb8); + + RTjpeg_dctY (bp1 + j + 8, RTjpeg_block, RTjpeg_Ywidth); + RTjpeg_quant (RTjpeg_block, RTjpeg_lqt); + sp += RTjpeg_b2s (RTjpeg_block, sp, RTjpeg_lb8); + + RTjpeg_dctY (bp2 + k, RTjpeg_block, RTjpeg_Cwidth); + RTjpeg_quant (RTjpeg_block, RTjpeg_cqt); + sp += RTjpeg_b2s (RTjpeg_block, sp, RTjpeg_cb8); + + RTjpeg_dctY (bp3 + k, RTjpeg_block, RTjpeg_Cwidth); + RTjpeg_quant (RTjpeg_block, RTjpeg_cqt); + sp += RTjpeg_b2s (RTjpeg_block, sp, RTjpeg_cb8); + + } + bp += RTjpeg_width << 4; + bp1 += RTjpeg_width << 4; + bp2 += RTjpeg_width << 2; + bp3 += RTjpeg_width << 2; } - bp+=RTjpeg_width<<4; - bp1+=RTjpeg_width<<4; - bp2+=RTjpeg_width<<2; - bp3+=RTjpeg_width<<2; - - } #ifdef HAVE_LIBMMX - emms(); + emms (); #endif - return (sp-sb); + return (sp - sb); } -int RTjpeg_compressYUV422(__s8 *sp, unsigned char *bp) +int +RTjpeg_compressYUV422 (__s8 * sp, unsigned char *bp) { - __s8 * sb; - register __s8 * bp2 = bp + RTjpeg_Ysize; - register __s8 * bp3 = bp2 + RTjpeg_Csize; - register int i, j, k; + __s8 *sb; + register __s8 *bp2 = bp + RTjpeg_Ysize; + register __s8 *bp3 = bp2 + RTjpeg_Csize; + register int i, j, k; #ifdef HAVE_LIBMMX - emms(); + emms (); #endif - sb=sp; + sb = sp; /* Y */ - for(i=RTjpeg_height; i; i-=8) - { - for(j=0, k=0; j>1); - } - if(*sp==-1)sp++; - else - { - sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt); - RTjpeg_idct(bp3+k, RTjpeg_block, RTjpeg_width>>1); - } + for (i = RTjpeg_height; i; i -= 8) { + for (k = 0, j = 0; j < RTjpeg_width; j += 16, k += 8) { + if (*sp == -1) + sp++; + else { + sp += RTjpeg_s2b (RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt); + RTjpeg_idct (bp + j, RTjpeg_block, RTjpeg_width); + } + if (*sp == -1) + sp++; + else { + sp += RTjpeg_s2b (RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt); + RTjpeg_idct (bp + j + 8, RTjpeg_block, RTjpeg_width); + } + if (*sp == -1) + sp++; + else { + sp += RTjpeg_s2b (RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt); + RTjpeg_idct (bp2 + k, RTjpeg_block, RTjpeg_width >> 1); + } + if (*sp == -1) + sp++; + else { + sp += RTjpeg_s2b (RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt); + RTjpeg_idct (bp3 + k, RTjpeg_block, RTjpeg_width >> 1); + } + } + bp += RTjpeg_width << 3; + bp2 += RTjpeg_width << 2; + bp3 += RTjpeg_width << 2; } - bp+=RTjpeg_width<<3; - bp2+=RTjpeg_width<<2; - bp3+=RTjpeg_width<<2; - } #ifdef HAVE_LIBMMX - emms(); + emms (); #endif } -void RTjpeg_decompressYUV420(__s8 *sp, __u8 *bp) +void +RTjpeg_decompressYUV420 (__s8 * sp, __u8 * bp) { - register __s8 * bp1 = bp + (RTjpeg_width<<3); - register __s8 * bp2 = bp + RTjpeg_Ysize; - register __s8 * bp3 = bp2 + (RTjpeg_Csize>>1); - int i, j,k; + register __s8 *bp1 = bp + (RTjpeg_width << 3); + register __s8 *bp2 = bp + RTjpeg_Ysize; + register __s8 *bp3 = bp2 + (RTjpeg_Csize >> 1); + int i, j, k; #ifdef HAVE_LIBMMX - emms(); + emms (); #endif /* Y */ - for(i=RTjpeg_height>>1; i; i-=8) - { - for(k=0, j=0; j>1); - } - if(*sp==-1)sp++; - else - { - sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt); - RTjpeg_idct(bp3+k, RTjpeg_block, RTjpeg_width>>1); - } + for (i = RTjpeg_height >> 1; i; i -= 8) { + for (k = 0, j = 0; j < RTjpeg_width; j += 16, k += 8) { + if (*sp == -1) + sp++; + else { + sp += RTjpeg_s2b (RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt); + RTjpeg_idct (bp + j, RTjpeg_block, RTjpeg_width); + } + if (*sp == -1) + sp++; + else { + sp += RTjpeg_s2b (RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt); + RTjpeg_idct (bp + j + 8, RTjpeg_block, RTjpeg_width); + } + if (*sp == -1) + sp++; + else { + sp += RTjpeg_s2b (RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt); + RTjpeg_idct (bp1 + j, RTjpeg_block, RTjpeg_width); + } + if (*sp == -1) + sp++; + else { + sp += RTjpeg_s2b (RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt); + RTjpeg_idct (bp1 + j + 8, RTjpeg_block, RTjpeg_width); + } + if (*sp == -1) + sp++; + else { + sp += RTjpeg_s2b (RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt); + RTjpeg_idct (bp2 + k, RTjpeg_block, RTjpeg_width >> 1); + } + if (*sp == -1) + sp++; + else { + sp += RTjpeg_s2b (RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt); + RTjpeg_idct (bp3 + k, RTjpeg_block, RTjpeg_width >> 1); + } + } + bp += RTjpeg_width << 4; + bp1 += RTjpeg_width << 4; + bp2 += RTjpeg_width << 2; + bp3 += RTjpeg_width << 2; } - bp+=RTjpeg_width<<4; - bp1+=RTjpeg_width<<4; - bp2+=RTjpeg_width<<2; - bp3+=RTjpeg_width<<2; - } #ifdef HAVE_LIBMMX - emms(); + emms (); #endif } -void RTjpeg_decompress8(__s8 *sp, __u8 *bp) +void +RTjpeg_decompress8 (__s8 * sp, __u8 * bp) { - int i, j; + int i, j; #ifdef HAVE_LIBMMX - emms(); + emms (); #endif /* Y */ - for(i=0; i>5; - RTjpeg_old=(__s16 *)(tmp<<5); - } - if (!RTjpeg_old) - { - fprintf(stderr, "RTjpeg: Could not allocate memory\n"); - exit(-1); - } - memset (RTjpeg_old, 0, ((4*RTjpeg_width*RTjpeg_height))); + unsigned long tmp; + + if (!RTjpeg_old) { + RTjpeg_old = malloc ((4 * RTjpeg_width * RTjpeg_height) + 32); + tmp = (unsigned long) RTjpeg_old; + tmp += 32; + tmp = tmp >> 5; + RTjpeg_old = (__s16 *) (tmp << 5); + } + if (!RTjpeg_old) { + fprintf (stderr, "RTjpeg: Could not allocate memory\n"); + exit (-1); + } + memset (RTjpeg_old, 0, ((4 * RTjpeg_width * RTjpeg_height))); } #ifdef HAVE_LIBMMX -int RTjpeg_bcomp(__s16 *old, mmx_t *mask) +int +RTjpeg_bcomp (__s16 * old, mmx_t * mask) { - int i; - mmx_t *mold=(mmx_t *)old; - mmx_t *mblock=(mmx_t *)RTjpeg_block; - mmx_t result; - static mmx_t neg=(mmx_t)(unsigned long long)0xffffffffffffffffULL; - - movq_m2r(*mask, mm7); - movq_m2r(neg, mm6); - pxor_r2r(mm5, mm5); - - for(i=0; i<8; i++) - { - movq_m2r(*(mblock++), mm0); - movq_m2r(*(mblock++), mm2); - movq_m2r(*(mold++), mm1); - movq_m2r(*(mold++), mm3); - psubsw_r2r(mm1, mm0); - psubsw_r2r(mm3, mm2); - movq_r2r(mm0, mm1); - movq_r2r(mm2, mm3); - pcmpgtw_r2r(mm7, mm0); - pcmpgtw_r2r(mm7, mm2); - pxor_r2r(mm6, mm1); - pxor_r2r(mm6, mm3); - pcmpgtw_r2r(mm7, mm1); - pcmpgtw_r2r(mm7, mm3); - por_r2r(mm0, mm5); - por_r2r(mm2, mm5); - por_r2r(mm1, mm5); - por_r2r(mm3, mm5); - } - movq_r2m(mm5, result); - - if(result.q) - { - if(!RTjpeg_mtest) - for(i=0; i<16; i++)((__u64 *)old)[i]=((__u64 *)RTjpeg_block)[i]; - return 0; - } + int i; + mmx_t *mold = (mmx_t *) old; + mmx_t *mblock = (mmx_t *) RTjpeg_block; + mmx_t result; + static mmx_t neg = (mmx_t) (unsigned long long) 0xffffffffffffffffULL; + + movq_m2r (*mask, mm7); + movq_m2r (neg, mm6); + pxor_r2r (mm5, mm5); + + for (i = 0; i < 8; i++) { + movq_m2r (*(mblock++), mm0); + movq_m2r (*(mblock++), mm2); + movq_m2r (*(mold++), mm1); + movq_m2r (*(mold++), mm3); + psubsw_r2r (mm1, mm0); + psubsw_r2r (mm3, mm2); + movq_r2r (mm0, mm1); + movq_r2r (mm2, mm3); + pcmpgtw_r2r (mm7, mm0); + pcmpgtw_r2r (mm7, mm2); + pxor_r2r (mm6, mm1); + pxor_r2r (mm6, mm3); + pcmpgtw_r2r (mm7, mm1); + pcmpgtw_r2r (mm7, mm3); + por_r2r (mm0, mm5); + por_r2r (mm2, mm5); + por_r2r (mm1, mm5); + por_r2r (mm3, mm5); + } + movq_r2m (mm5, result); + + if (result.q) { + if (!RTjpeg_mtest) + for (i = 0; i < 16; i++) + ((__u64 *) old)[i] = ((__u64 *) RTjpeg_block)[i]; + return 0; + } /* printf("."); */ - return 1; + return 1; } #else -int RTjpeg_bcomp(__s16 *old, __u16 *mask) +int +RTjpeg_bcomp (__s16 * old, __u16 * mask) { - int i; - - for(i=0; i<64; i++) - if(abs(old[i]-RTjpeg_block[i])>*mask) - { - if(!RTjpeg_mtest) - for(i=0; i<16; i++)((__u64 *)old)[i]=((__u64 *)RTjpeg_block)[i]; - return 0; - } - return 1; + int i; + + for (i = 0; i < 64; i++) + if (abs (old[i] - RTjpeg_block[i]) > *mask) { + if (!RTjpeg_mtest) + for (i = 0; i < 16; i++) + ((__u64 *) old)[i] = ((__u64 *) RTjpeg_block)[i]; + return 0; + } + return 1; } #endif -void RTjpeg_set_test(int i) +void +RTjpeg_set_test (int i) { - RTjpeg_mtest=i; + RTjpeg_mtest = i; } -int RTjpeg_mcompress(__s8 *sp, unsigned char *bp, __u16 lmask, __u16 cmask) +int +RTjpeg_mcompress (__s8 * sp, unsigned char *bp, __u16 lmask, __u16 cmask) { - __s8 * sb; - __s16 *block; - register __s8 * bp2; - register __s8 * bp3; - register int i, j, k; + __s8 *sb; + __s16 *block; + register __s8 *bp2; + register __s8 *bp3; + register int i, j, k; #ifdef HAVE_LIBMMX - emms(); - RTjpeg_lmask=(mmx_t)(((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask); - RTjpeg_cmask=(mmx_t)(((__u64)cmask<<48)|((__u64)cmask<<32)|((__u64)cmask<<16)|cmask); + emms (); + RTjpeg_lmask = + (mmx_t) (((__u64) lmask << 48) | ((__u64) lmask << 32) | ((__u64) lmask << + 16) | lmask); + RTjpeg_cmask = + (mmx_t) (((__u64) cmask << 48) | ((__u64) cmask << 32) | ((__u64) cmask << + 16) | cmask); #else - RTjpeg_lmask=lmask; - RTjpeg_cmask=cmask; + RTjpeg_lmask = lmask; + RTjpeg_cmask = cmask; #endif - - bp = bp - RTjpeg_width*0; - bp2 = bp + RTjpeg_Ysize-RTjpeg_width*0; - bp3 = bp2 + RTjpeg_Csize; - sb=sp; - block=RTjpeg_old; -/* Y */ - for(i=RTjpeg_height; i; i-=8) - { - for(j=0, k=0; j>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y-crG-cbG)>>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y+cbB)>>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - - y=(bufy[j+1]-16)*Ky; - - tmp=(y+crR)>>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y-crG-cbG)>>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y+cbB)>>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - + int tmp; + int i, j; + __s32 y, crR, crG, cbG, cbB; + __u8 *bufcr, *bufcb, *bufy, *bufoute; + int yskip; + + yskip = RTjpeg_width; + + bufcb = &buf[RTjpeg_width * RTjpeg_height]; + bufcr = + &buf[RTjpeg_width * RTjpeg_height + (RTjpeg_width * RTjpeg_height) / 2]; + bufy = &buf[0]; + bufoute = rgb; + + for (i = 0; i < (RTjpeg_height); i++) { + for (j = 0; j < RTjpeg_width; j += 2) { + crR = (*bufcr - 128) * KcrR; + crG = (*(bufcr++) - 128) * KcrG; + cbG = (*bufcb - 128) * KcbG; + cbB = (*(bufcb++) - 128) * KcbB; + + y = (bufy[j] - 16) * Ky; + + tmp = (y + crR) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y - crG - cbG) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y + cbB) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + + y = (bufy[j + 1] - 16) * Ky; + + tmp = (y + crR) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y - crG - cbG) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y + cbB) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + + } + bufy += yskip; } - bufy+=yskip; - } } -void RTjpeg_yuv420rgb(__u8 *buf, __u8 *rgb) +void +RTjpeg_yuv420rgb (__u8 * buf, __u8 * rgb) { - int tmp; - int i, j; - __s32 y, crR, crG, cbG, cbB; - __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto; - int oskip, yskip; - - oskip=RTjpeg_width*3; - yskip=RTjpeg_width; - - bufcb=&buf[RTjpeg_width*RTjpeg_height]; - bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4]; - bufy=&buf[0]; - bufoute=rgb; - bufouto=rgb+oskip; - - for(i=0; i<(RTjpeg_height>>1); i++) - { - for(j=0; j>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y-crG-cbG)>>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y+cbB)>>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - - y=(bufy[j+1]-16)*Ky; - - tmp=(y+crR)>>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y-crG-cbG)>>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y+cbB)>>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - - y=(bufy[j+yskip]-16)*Ky; - - tmp=(y+crR)>>16; - *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y-crG-cbG)>>16; - *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y+cbB)>>16; - *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); - - y=(bufy[j+1+yskip]-16)*Ky; - - tmp=(y+crR)>>16; - *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y-crG-cbG)>>16; - *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y+cbB)>>16; - *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); - + int tmp; + int i, j; + __s32 y, crR, crG, cbG, cbB; + __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto; + int oskip, yskip; + + oskip = RTjpeg_width * 3; + yskip = RTjpeg_width; + + bufcb = &buf[RTjpeg_width * RTjpeg_height]; + bufcr = + &buf[RTjpeg_width * RTjpeg_height + (RTjpeg_width * RTjpeg_height) / 4]; + bufy = &buf[0]; + bufoute = rgb; + bufouto = rgb + oskip; + + for (i = 0; i < (RTjpeg_height >> 1); i++) { + for (j = 0; j < RTjpeg_width; j += 2) { + crR = (*bufcr - 128) * KcrR; + crG = (*(bufcr++) - 128) * KcrG; + cbG = (*bufcb - 128) * KcbG; + cbB = (*(bufcb++) - 128) * KcbB; + + y = (bufy[j] - 16) * Ky; + + tmp = (y + crR) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y - crG - cbG) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y + cbB) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + + y = (bufy[j + 1] - 16) * Ky; + + tmp = (y + crR) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y - crG - cbG) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y + cbB) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + + y = (bufy[j + yskip] - 16) * Ky; + + tmp = (y + crR) >> 16; + *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y - crG - cbG) >> 16; + *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y + cbB) >> 16; + *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + + y = (bufy[j + 1 + yskip] - 16) * Ky; + + tmp = (y + crR) >> 16; + *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y - crG - cbG) >> 16; + *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y + cbB) >> 16; + *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + + } + bufoute += oskip; + bufouto += oskip; + bufy += yskip << 1; } - bufoute+=oskip; - bufouto+=oskip; - bufy+=yskip<<1; - } } -void RTjpeg_yuvrgb32(__u8 *buf, __u8 *rgb) +void +RTjpeg_yuvrgb32 (__u8 * buf, __u8 * rgb) { - int tmp; - int i, j; - __s32 y, crR, crG, cbG, cbB; - __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto; - int oskip, yskip; - - oskip=RTjpeg_width*4; - yskip=RTjpeg_width; - - bufcb=&buf[RTjpeg_width*RTjpeg_height]; - bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/2]; - bufy=&buf[0]; - bufoute=rgb; - bufouto=rgb+oskip; - - for(i=0; i<(RTjpeg_height>>1); i++) - { - for(j=0; j>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y-crG-cbG)>>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y+crR)>>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - bufoute++; - - y=(bufy[j+1]-16)*Ky; - - tmp=(y+cbB)>>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y-crG-cbG)>>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y+crR)>>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - bufoute++; - - y=(bufy[j+yskip]-16)*Ky; - - tmp=(y+cbB)>>16; - *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y-crG-cbG)>>16; - *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y+crR)>>16; - *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); - bufouto++; - - y=(bufy[j+1+yskip]-16)*Ky; - - tmp=(y+cbB)>>16; - *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y-crG-cbG)>>16; - *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y+crR)>>16; - *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); - bufouto++; - + int tmp; + int i, j; + __s32 y, crR, crG, cbG, cbB; + __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto; + int oskip, yskip; + + oskip = RTjpeg_width * 4; + yskip = RTjpeg_width; + + bufcb = &buf[RTjpeg_width * RTjpeg_height]; + bufcr = + &buf[RTjpeg_width * RTjpeg_height + (RTjpeg_width * RTjpeg_height) / 2]; + bufy = &buf[0]; + bufoute = rgb; + bufouto = rgb + oskip; + + for (i = 0; i < (RTjpeg_height >> 1); i++) { + for (j = 0; j < RTjpeg_width; j += 2) { + crR = (*bufcr - 128) * KcrR; + crG = (*(bufcr++) - 128) * KcrG; + cbG = (*bufcb - 128) * KcbG; + cbB = (*(bufcb++) - 128) * KcbB; + + y = (bufy[j] - 16) * Ky; + + tmp = (y + cbB) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y - crG - cbG) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y + crR) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + bufoute++; + + y = (bufy[j + 1] - 16) * Ky; + + tmp = (y + cbB) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y - crG - cbG) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y + crR) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + bufoute++; + + y = (bufy[j + yskip] - 16) * Ky; + + tmp = (y + cbB) >> 16; + *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y - crG - cbG) >> 16; + *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y + crR) >> 16; + *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + bufouto++; + + y = (bufy[j + 1 + yskip] - 16) * Ky; + + tmp = (y + cbB) >> 16; + *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y - crG - cbG) >> 16; + *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y + crR) >> 16; + *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + bufouto++; + + } + bufoute += oskip; + bufouto += oskip; + bufy += yskip << 1; } - bufoute+=oskip; - bufouto+=oskip; - bufy+=yskip<<1; - } } -void RTjpeg_yuvrgb24(__u8 *buf, __u8 *rgb) +void +RTjpeg_yuvrgb24 (__u8 * buf, __u8 * rgb) { - int tmp; - int i, j; - __s32 y, crR, crG, cbG, cbB; - __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto; - int oskip, yskip; - - oskip=RTjpeg_width*3; - yskip=RTjpeg_width; - - bufcb=&buf[RTjpeg_width*RTjpeg_height]; - bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4]; - bufy=&buf[0]; - bufoute=rgb; - bufouto=rgb+oskip; - - for(i=0; i<(RTjpeg_height>>1); i++) - { - for(j=0; j>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y-crG-cbG)>>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y+crR)>>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - - y=(bufy[j+1]-16)*Ky; - - tmp=(y+cbB)>>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y-crG-cbG)>>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y+crR)>>16; - *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp); - - y=(bufy[j+yskip]-16)*Ky; - - tmp=(y+cbB)>>16; - *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y-crG-cbG)>>16; - *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y+crR)>>16; - *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); - - y=(bufy[j+1+yskip]-16)*Ky; - - tmp=(y+cbB)>>16; - *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y-crG-cbG)>>16; - *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y+crR)>>16; - *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp); - + int tmp; + int i, j; + __s32 y, crR, crG, cbG, cbB; + __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto; + int oskip, yskip; + + oskip = RTjpeg_width * 3; + yskip = RTjpeg_width; + + bufcb = &buf[RTjpeg_width * RTjpeg_height]; + bufcr = + &buf[RTjpeg_width * RTjpeg_height + (RTjpeg_width * RTjpeg_height) / 4]; + bufy = &buf[0]; + bufoute = rgb; + bufouto = rgb + oskip; + + for (i = 0; i < (RTjpeg_height >> 1); i++) { + for (j = 0; j < RTjpeg_width; j += 2) { + crR = (*bufcr - 128) * KcrR; + crG = (*(bufcr++) - 128) * KcrG; + cbG = (*bufcb - 128) * KcbG; + cbB = (*(bufcb++) - 128) * KcbB; + + y = (bufy[j] - 16) * Ky; + + tmp = (y + cbB) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y - crG - cbG) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y + crR) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + + y = (bufy[j + 1] - 16) * Ky; + + tmp = (y + cbB) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y - crG - cbG) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y + crR) >> 16; + *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + + y = (bufy[j + yskip] - 16) * Ky; + + tmp = (y + cbB) >> 16; + *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y - crG - cbG) >> 16; + *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y + crR) >> 16; + *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + + y = (bufy[j + 1 + yskip] - 16) * Ky; + + tmp = (y + cbB) >> 16; + *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y - crG - cbG) >> 16; + *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y + crR) >> 16; + *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + + } + bufoute += oskip; + bufouto += oskip; + bufy += yskip << 1; } - bufoute+=oskip; - bufouto+=oskip; - bufy+=yskip<<1; - } } -void RTjpeg_yuvrgb16(__u8 *buf, __u8 *rgb) +void +RTjpeg_yuvrgb16 (__u8 * buf, __u8 * rgb) { - int tmp; - int i, j; - __s32 y, crR, crG, cbG, cbB; - __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto; - int oskip, yskip; - unsigned char r, g, b; - - oskip=RTjpeg_width*2; - yskip=RTjpeg_width; - - bufcb=&buf[RTjpeg_width*RTjpeg_height]; - bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4]; - bufy=&buf[0]; - bufoute=rgb; - bufouto=rgb+oskip; - - for(i=0; i<(RTjpeg_height>>1); i++) - { - for(j=0; j>16; - b=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y-crG-cbG)>>16; - g=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y+crR)>>16; - r=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(int)((int)b >> 3); - tmp|=(int)(((int)g >> 2) << 5); - tmp|=(int)(((int)r >> 3) << 11); - *(bufoute++)=tmp&0xff; - *(bufoute++)=tmp>>8; - - - y=(bufy[j+1]-16)*Ky; - - tmp=(y+cbB)>>16; - b=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y-crG-cbG)>>16; - g=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y+crR)>>16; - r=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(int)((int)b >> 3); - tmp|=(int)(((int)g >> 2) << 5); - tmp|=(int)(((int)r >> 3) << 11); - *(bufoute++)=tmp&0xff; - *(bufoute++)=tmp>>8; - - y=(bufy[j+yskip]-16)*Ky; - - tmp=(y+cbB)>>16; - b=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y-crG-cbG)>>16; - g=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y+crR)>>16; - r=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(int)((int)b >> 3); - tmp|=(int)(((int)g >> 2) << 5); - tmp|=(int)(((int)r >> 3) << 11); - *(bufouto++)=tmp&0xff; - *(bufouto++)=tmp>>8; - - y=(bufy[j+1+yskip]-16)*Ky; - - tmp=(y+cbB)>>16; - b=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y-crG-cbG)>>16; - g=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(y+crR)>>16; - r=(tmp>255)?255:((tmp<0)?0:tmp); - tmp=(int)((int)b >> 3); - tmp|=(int)(((int)g >> 2) << 5); - tmp|=(int)(((int)r >> 3) << 11); - *(bufouto++)=tmp&0xff; - *(bufouto++)=tmp>>8; - + int tmp; + int i, j; + __s32 y, crR, crG, cbG, cbB; + __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto; + int oskip, yskip; + unsigned char r, g, b; + + oskip = RTjpeg_width * 2; + yskip = RTjpeg_width; + + bufcb = &buf[RTjpeg_width * RTjpeg_height]; + bufcr = + &buf[RTjpeg_width * RTjpeg_height + (RTjpeg_width * RTjpeg_height) / 4]; + bufy = &buf[0]; + bufoute = rgb; + bufouto = rgb + oskip; + + for (i = 0; i < (RTjpeg_height >> 1); i++) { + for (j = 0; j < RTjpeg_width; j += 2) { + crR = (*bufcr - 128) * KcrR; + crG = (*(bufcr++) - 128) * KcrG; + cbG = (*bufcb - 128) * KcbG; + cbB = (*(bufcb++) - 128) * KcbB; + + y = (bufy[j] - 16) * Ky; + + tmp = (y + cbB) >> 16; + b = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y - crG - cbG) >> 16; + g = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y + crR) >> 16; + r = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (int) ((int) b >> 3); + tmp |= (int) (((int) g >> 2) << 5); + tmp |= (int) (((int) r >> 3) << 11); + *(bufoute++) = tmp & 0xff; + *(bufoute++) = tmp >> 8; + + + y = (bufy[j + 1] - 16) * Ky; + + tmp = (y + cbB) >> 16; + b = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y - crG - cbG) >> 16; + g = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y + crR) >> 16; + r = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (int) ((int) b >> 3); + tmp |= (int) (((int) g >> 2) << 5); + tmp |= (int) (((int) r >> 3) << 11); + *(bufoute++) = tmp & 0xff; + *(bufoute++) = tmp >> 8; + + y = (bufy[j + yskip] - 16) * Ky; + + tmp = (y + cbB) >> 16; + b = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y - crG - cbG) >> 16; + g = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y + crR) >> 16; + r = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (int) ((int) b >> 3); + tmp |= (int) (((int) g >> 2) << 5); + tmp |= (int) (((int) r >> 3) << 11); + *(bufouto++) = tmp & 0xff; + *(bufouto++) = tmp >> 8; + + y = (bufy[j + 1 + yskip] - 16) * Ky; + + tmp = (y + cbB) >> 16; + b = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y - crG - cbG) >> 16; + g = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (y + crR) >> 16; + r = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp); + tmp = (int) ((int) b >> 3); + tmp |= (int) (((int) g >> 2) << 5); + tmp |= (int) (((int) r >> 3) << 11); + *(bufouto++) = tmp & 0xff; + *(bufouto++) = tmp >> 8; + + } + bufoute += oskip; + bufouto += oskip; + bufy += yskip << 1; } - bufoute+=oskip; - bufouto+=oskip; - bufy+=yskip<<1; - } } -void RTjpeg_yuvrgb8(__u8 *buf, __u8 *rgb) +void +RTjpeg_yuvrgb8 (__u8 * buf, __u8 * rgb) { - bcopy(buf, rgb, RTjpeg_width*RTjpeg_height); + bcopy (buf, rgb, RTjpeg_width * RTjpeg_height); } -void RTjpeg_double32(__u32 *buf) +void +RTjpeg_double32 (__u32 * buf) { - int i, j; - - __u32 *iptr, *optr1, *optr2; - - iptr=buf+(RTjpeg_width*RTjpeg_height)-1; - optr1=buf+(RTjpeg_width*RTjpeg_height*4)-1; - optr2=optr1-(2*RTjpeg_width); - - for(i=0; i