/* 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

   This file is a modified version of RTjpeg 0.1.2, (C) Justin Schoeman 1998

   (991101) Wim Taymans : added MMX dct and idct from intels site.
*/


/*

Main Routines

This file contains most of the initialisation and control functions

(C) Justin Schoeman 1998

*/

#include <config.h>
#include <sys/types.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Local fixed-width integer aliases; the digit in each name is the width
 * the code expects the type to have.
 * NOTE(review): __u32/__s32 are spelled with 'long', which is 32 bits wide
 * only where long is 32 bits (e.g. ILP32).  On LP64 targets they are 64
 * bits, which presumably breaks the byte arithmetic used to size
 * RTjpeg_alldata -- confirm which targets this file is built for.
 */
typedef unsigned char __u8;
typedef signed char __s8;
typedef unsigned short __u16;
typedef signed short __s16;
typedef unsigned long __u32;
typedef signed long __s32;
typedef unsigned long long __u64;

/*#define MMX_TRACE */


#ifdef HAVE_LIBMMX
#include "mmx.h"
#endif

/*
 * Zig-zag scan order: RTjpeg_ZZ[k] is the index inside the 8x8 block
 * (row-major, 0..63) of the k-th coefficient in stream order.  The scan
 * walks the anti-diagonals of the block, starting downwards (0, 8, 1, ...).
 * Every block index appears exactly once.
 */
static const unsigned char RTjpeg_ZZ[64]={
  0,  8,  1,  2,  9, 16, 24, 17,	/* scan positions  0.. 7 */
 10,  3,  4, 11, 18, 25, 32, 40,	/* scan positions  8..15 */
 33, 26, 19, 12,  5,  6, 13, 20,	/* scan positions 16..23 */
 27, 34, 41, 48, 56, 49, 42, 35,	/* scan positions 24..31 */
 28, 21, 14,  7, 15, 22, 29, 36,	/* scan positions 32..39 */
 43, 50, 57, 58, 51, 44, 37, 30,	/* scan positions 40..47 */
 23, 31, 38, 45, 52, 59, 60, 53,	/* scan positions 48..55 */
 46, 39, 47, 54, 61, 62, 55, 63		/* scan positions 56..63 */
};

/*
 * Per-coefficient scale factors in 32.32 fixed point: entry 0 is exactly
 * 2^32 (i.e. 1.0).  The table is indexed like the quantiser tables (one
 * entry per block position, 8 rows of 8) and is symmetric across the
 * diagonal.  RTjpeg_dct_init() divides every quantiser entry by the
 * matching factor.
 * Presumably these are the output scale factors of the AAN
 * (Arai-Agui-Nakajima) DCT, as the name suggests -- TODO confirm.
 */
static const __u64 RTjpeg_aan_tab[64]={
4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL, 
5957222912ULL, 8263040512ULL, 7783580160ULL, 7005009920ULL, 5957222912ULL, 4680582144ULL, 3224107520ULL, 1643641088ULL, 
5611718144ULL, 7783580160ULL, 7331904512ULL, 6598688768ULL, 5611718144ULL, 4408998912ULL, 3036936960ULL, 1548224000ULL, 
5050464768ULL, 7005009920ULL, 6598688768ULL, 5938608128ULL, 5050464768ULL, 3968072960ULL, 2733115392ULL, 1393296000ULL, 
4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL, 
3374581504ULL, 4680582144ULL, 4408998912ULL, 3968072960ULL, 3374581504ULL, 2651326208ULL, 1826357504ULL, 931136000ULL, 
2324432128ULL, 3224107520ULL, 3036936960ULL, 2733115392ULL, 2324432128ULL, 1826357504ULL, 1258030336ULL, 641204288ULL, 
1184891264ULL, 1643641088ULL, 1548224000ULL, 1393296000ULL, 1184891264ULL, 931136000ULL, 641204288ULL, 326894240ULL, 
};

/*
 * Codec-wide state.  Everything here except RTjpeg_ws has external linkage.
 */

#ifndef HAVE_LIBMMX
/* Intermediate workspace of the C forward DCT (row pass -> column pass).
 * NOTE(review): only the first 64 entries are used by the code visible
 * here; the purpose of the extra 31 is not evident -- confirm. */
static __s32 RTjpeg_ws[64+31];
#endif

/* Raw backing store: 2*64 bytes plus four times 4*64 bytes plus 32 spare
 * bytes.  Presumably carved up into the five tables below (with alignment
 * slack) by initialisation code elsewhere -- TODO confirm. */
__u8 RTjpeg_alldata[(2*64) + 4*(4*64) + 32];

/* Table pointers; not assigned in this part of the file. */
__s16 *RTjpeg_block;
__s32 *RTjpeg_lqt;	/* quantiser table, rescaled by RTjpeg_dct_init() */
__s32 *RTjpeg_cqt;	/* quantiser table, rescaled by RTjpeg_dct_init() */
__u32 *RTjpeg_liqt;
__u32 *RTjpeg_ciqt;

/* NOTE(review): presumably the 'bt8' thresholds handed to RTjpeg_b2s() /
 * RTjpeg_s2b() for the two planes -- verify against the callers. */
unsigned char RTjpeg_lb8;
unsigned char RTjpeg_cb8;

/* Geometry; set and used outside this part of the file. */
int RTjpeg_width;
int RTjpeg_height;
int RTjpeg_Ywidth;
int RTjpeg_Cwidth;
int RTjpeg_Ysize;
int RTjpeg_Csize;

/* Starts out NULL; assigned elsewhere. */
__s16 *RTjpeg_old=NULL;

/* Masks whose representation depends on the build flavour. */
#ifdef HAVE_LIBMMX
mmx_t RTjpeg_lmask;
mmx_t RTjpeg_cmask;
#else
__u16 RTjpeg_lmask;
__u16 RTjpeg_cmask;
#endif

/* Starts out 0; used elsewhere. */
int RTjpeg_mtest=0;

/*
 * Base luminance quantiser steps, one per coefficient of the 8x8 block,
 * written row by row.  The values are those of the example luminance table
 * in Annex K of the JPEG standard.
 */
static const unsigned char RTjpeg_lum_quant_tbl[64] = {
 /* row 0 */ 16, 11, 10, 16,  24,  40,  51,  61,
 /* row 1 */ 12, 12, 14, 19,  26,  58,  60,  55,
 /* row 2 */ 14, 13, 16, 24,  40,  57,  69,  56,
 /* row 3 */ 14, 17, 22, 29,  51,  87,  80,  62,
 /* row 4 */ 18, 22, 37, 56,  68, 109, 103,  77,
 /* row 5 */ 24, 35, 55, 64,  81, 104, 113,  92,
 /* row 6 */ 49, 64, 78, 87, 103, 121, 120, 101,
 /* row 7 */ 72, 92, 95, 98, 112, 100, 103,  99
};

/*
 * Base chrominance quantiser steps, one per coefficient of the 8x8 block,
 * written row by row.  The values are those of the example chrominance
 * table in Annex K of the JPEG standard; everything outside the top-left
 * corner is 99.
 */
static const unsigned char RTjpeg_chrom_quant_tbl[64] = {
 /* row 0 */ 17, 18, 24, 47, 99, 99, 99, 99,
 /* row 1 */ 18, 21, 26, 66, 99, 99, 99, 99,
 /* row 2 */ 24, 26, 56, 99, 99, 99, 99, 99,
 /* row 3 */ 47, 66, 99, 99, 99, 99, 99, 99,
 /* row 4 */ 99, 99, 99, 99, 99, 99, 99, 99,
 /* row 5 */ 99, 99, 99, 99, 99, 99, 99, 99,
 /* row 6 */ 99, 99, 99, 99, 99, 99, 99, 99,
 /* row 7 */ 99, 99, 99, 99, 99, 99, 99, 99
};
 
/*
 * Pack one quantised 8x8 block into its byte-stream form ("block to
 * stream").
 *
 *  data  64 coefficients, read in zig-zag order through RTjpeg_ZZ.
 *  strm  output buffer; at most 64 bytes are written.
 *  bt8   number of coefficients after the first that are stored with the
 *        full signed 8-bit range.  Values above 63 are treated as 63 so
 *        that the scan never leaves the block.
 *
 * Stream layout:
 *  byte 0              coefficient 0, clamped to 0..254, stored unsigned
 *  next bt8 bytes      coefficients 1..bt8, clamped to -128..127
 *  remaining bytes     coefficients clamped to -64..63; a run of N
 *                      consecutive zeros (N >= 1) is replaced by the single
 *                      byte 63+N, so bytes 64..126 are run codes
 *
 * Returns the number of bytes written to strm.
 */
int RTjpeg_b2s(__s16 *data, __s8 *strm, __u8 bt8)
{
 int ci, co=1, run_start;
 int last8=(bt8>63)?63:bt8;	/* keep the scan inside the 64-entry block */
 __s16 ZZvalue;

 /* First coefficient: unsigned, 255 is never produced. */
 ZZvalue = data[RTjpeg_ZZ[0]];
 if(ZZvalue>254)
  ZZvalue=254;
 else if(ZZvalue<0)
  ZZvalue=0;
 strm[0]=(__s8)(__u8)ZZvalue;

 /* Full-range part: plain saturation to signed 8 bit. */
 for(ci=1; ci<=last8; ci++)
 {
  ZZvalue = data[RTjpeg_ZZ[ci]];

  if(ZZvalue>127)
   ZZvalue=127;
  else if(ZZvalue<-128)
   ZZvalue=-128;

  strm[co++]=(__s8)ZZvalue;
 }

 /* Reduced-range part: 7-bit values, codes above 63 mean "zero run". */
 for(; ci<64; ci++)
 {
  ZZvalue = data[RTjpeg_ZZ[ci]];

  if(ZZvalue>0)
  {
   strm[co++]=(__s8)((ZZvalue>63)?63:ZZvalue);
  }
  else if(ZZvalue<0)
  {
   strm[co++]=(__s8)((ZZvalue<-64)?-64:ZZvalue);
  }
  else /* compress zeros */
  {
   run_start=ci;
   do
   {
    ci++;
   }
   while((ci<64)&&(data[RTjpeg_ZZ[ci]]==0));

   strm[co++]=(__s8)(63+(ci-run_start));
   ci--;	/* the for-increment steps onto the first non-zero again */
  }
 }
 return co;
}

/*
 * Unpack one block from its byte-stream form and dequantise it ("stream to
 * block"); the inverse of RTjpeg_b2s().
 *
 *  data  receives the 64 coefficients, written through RTjpeg_ZZ.
 *  strm  packed input as produced by RTjpeg_b2s().
 *  bt8   number of coefficients after the first that were stored with the
 *        full signed 8-bit range.  Values above 63 are treated as 63.
 *  qtbl  64 dequantisation factors, indexed by block position; every
 *        non-run value is multiplied by its factor.
 *
 * In the part after the first 1+bt8 bytes, a byte above 63 is a zero-run
 * code for (byte-63) zero coefficients.  A run that would extend past the
 * end of the block (only possible in a damaged stream) is cut off at the
 * block end instead of indexing outside RTjpeg_ZZ and data.
 *
 * Returns the number of bytes consumed from strm.
 */
int RTjpeg_s2b(__s16 *data, __s8 *strm, __u8 bt8, __u32 *qtbl)
{
 int ci=1, co, run_end;
 int last8=(bt8>63)?63:bt8;	/* keep the scan inside the 64-entry block */
 int i;

 /* First coefficient was stored unsigned. */
 i=RTjpeg_ZZ[0];
 data[i]=(__s16)(((__u8)strm[0])*qtbl[i]);

 /* Full-range part: one signed byte per coefficient. */
 for(co=1; co<=last8; co++)
 {
  i=RTjpeg_ZZ[co];
  data[i]=(__s16)(strm[ci++]*qtbl[i]);
 }

 /* Reduced-range part with zero-run codes. */
 for(; co<64; co++)
 {
  if(strm[ci]>63)
  {
   run_end=co+strm[ci]-63;
   if(run_end>64)
    run_end=64;	/* never trust the stream to stay inside the block */
   for(; co<run_end; co++)
    data[RTjpeg_ZZ[co]]=0;
   co--;	/* compensate for the outer for-increment */
  } else
  {
   i=RTjpeg_ZZ[co];
   data[i]=(__s16)(strm[ci]*qtbl[i]);
  }
  ci++;
 }
 return ci;
}

#if defined(HAVE_LIBMMX)
/*
 * MMX build: repack both quantiser tables in place from 64 __s32 entries
 * to 64 __s16 entries stored at the start of the same memory, which is the
 * layout the MMX RTjpeg_quant() loads (four 16-bit values per 64-bit read).
 *
 * The narrowing is done through a __s16 alias of the table itself.  It
 * relies on walking upwards: the 16-bit slot written for entry i never lies
 * beyond the wide entry i being read, so no unread entry is overwritten.
 * Do not reorder or reverse the loop.
 * NOTE(review): values are truncated to 16 bits by the cast; presumably the
 * table entries are known to fit -- confirm.
 */
void RTjpeg_quant_init(void)
{
 int i;
 __s16 *qtbl;
 
 /* luminance table */
 qtbl=(__s16 *)RTjpeg_lqt;
 for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_lqt[i];

 /* chrominance table */
 qtbl=(__s16 *)RTjpeg_cqt;
 for(i=0; i<64; i++)qtbl[i]=(__s16)RTjpeg_cqt[i];
}

/* Four 16-bit lanes of 1: interleaved with the quantiser words so that the
 * multiply-add below adds the rounding term once per product. */
static mmx_t RTjpeg_ones=(mmx_t)(long long)0x0001000100010001LL;
/* Four 16-bit lanes of 32767 (0x7fff): the rounding term itself. */
static mmx_t RTjpeg_half=(mmx_t)(long long)0x7fff7fff7fff7fffLL;

/*
 * MMX quantiser: replaces every coefficient of 'block' by
 * (coefficient * factor + 32767) >> 16, four coefficients per iteration.
 *
 *  block  64 coefficients, updated in place.
 *  qtbl   quantiser factors.  Although typed __s32 *, the loop reads it as
 *         16 groups of four 16-bit values, i.e. in the packed layout that
 *         RTjpeg_quant_init() produces in the MMX build.
 *
 * Each product is formed with pmaddwd on (factor,1) x (coefficient,32767)
 * word pairs, which yields coefficient*factor + 32767 as a 32-bit value;
 * the results are shifted down by 16 and packed back to 16 bits with signed
 * saturation.
 */
void RTjpeg_quant(__s16 *block, __s32 *qtbl)
{
 int i;
 mmx_t *bl, *ql;
 
 ql=(mmx_t *)qtbl;
 bl=(mmx_t *)block;
 
 /* loop-invariant constants stay in mm6/mm7 */
 movq_m2r(RTjpeg_ones, mm6);
 movq_m2r(RTjpeg_half, mm7);

 /* 16 iterations x 4 coefficients = 64 */
 for(i=16; i; i--) 
 {
  movq_m2r(*(ql++), mm0); /* quant vals (4) */
  movq_m2r(*bl, mm2); /* block vals (4) */
  movq_r2r(mm0, mm1);
  movq_r2r(mm2, mm3);
  
  punpcklwd_r2r(mm6, mm0); /*           1 qb 1 qa */
  punpckhwd_r2r(mm6, mm1); /* 1 qd 1 qc */
  
  punpcklwd_r2r(mm7, mm2); /*                   32767 bb 32767 ba */
  punpckhwd_r2r(mm7, mm3); /* 32767 bd 32767 bc */
  
  pmaddwd_r2r(mm2, mm0); /*                         32767+bb*qb 32767+ba*qa */
  pmaddwd_r2r(mm3, mm1); /* 32767+bd*qd 32767+bc*qc */
  
  /* arithmetic shift: drop the 16 fraction bits */
  psrad_i2r(16, mm0);
  psrad_i2r(16, mm1);
  
  /* back to four signed 16-bit results */
  packssdw_r2r(mm1, mm0);
  
  movq_r2m(mm0, *(bl++));
  
 }
}
#else
/*
 * Non-MMX build: RTjpeg_quant() reads the quantiser tables as __s32
 * directly, so there is nothing to repack.  The empty function keeps
 * RTjpeg_quant_init() callable in both build flavours.
 */
void RTjpeg_quant_init(void)
{
}

/*
 * Portable quantiser: every one of the 64 coefficients in 'block' is
 * replaced by (coefficient * factor + 32767) >> 16, where 'factor' is the
 * entry of 'qtbl' at the same position.  The block is updated in place;
 * qtbl is only read.
 */
void RTjpeg_quant(__s16 *block, __s32 *qtbl)
{
 __s16 *coef=block;
 __s16 *const stop=block+64;
 __s32 *factor=qtbl;

 while(coef!=stop)
 {
  *coef=(__s16)(((*coef)*(*factor)+32767)>>16);
  coef++;
  factor++;
 }
}
#endif

/*
 * Perform the forward DCT on one block of samples.
 *
 * Constants and helper macros for the forward DCT.  The MMX build uses
 * packed 16-bit constants, the portable build uses integer fixed point.
 */
#ifdef HAVE_LIBMMX
/* Each constant repeats one 16-bit value in all four lanes.  The values are
 * the rotation factors scaled by 2^14 (e.g. 0x2D41 = 11585 ~ 0.7071*16384);
 * the DCT code shifts its operand left by 2 before pmulhw, which keeps the
 * upper 16 bits of the product. */
static mmx_t RTjpeg_C4   =(mmx_t)(long long)0x2D412D412D412D41LL;
static mmx_t RTjpeg_C6   =(mmx_t)(long long)0x187E187E187E187ELL;
static mmx_t RTjpeg_C2mC6=(mmx_t)(long long)0x22A322A322A322A3LL;
static mmx_t RTjpeg_C2pC6=(mmx_t)(long long)0x539F539F539F539FLL;
/* All-zero quadword, used to widen bytes to words when unpacking. */
static mmx_t RTjpeg_zero =(mmx_t)(long long)0x0000000000000000LL;

#else

/* Rotation factors with 8 fraction bits, i.e. round(value * 256). */
#define FIX_0_382683433  ((__s32)   98)		/* FIX(0.382683433) */
#define FIX_0_541196100  ((__s32)  139)		/* FIX(0.541196100) */
#define FIX_0_707106781  ((__s32)  181)		/* FIX(0.707106781) */
#define FIX_1_306562965  ((__s32)  334)		/* FIX(1.306562965) */

/* Round and drop 8 fraction bits, result narrowed to __s16. */
#define DESCALE10(x) (__s16)( ((x)+128) >> 8)
/* Round and drop 16 fraction bits, result narrowed to __s16. */
#define DESCALE20(x)  (__s16)(((x)+32768) >> 16)
/* Fixed-point multiply; the product keeps the fraction bits of 'const'. */
#define D_MULTIPLY(var,const)  ((__s32) ((var) * (const)))
#endif

void RTjpeg_dct_init(void)
{
 int i;
 
 for(i=0; i<64; i++)
 {
  RTjpeg_lqt[i]=(((__u64)RTjpeg_lqt[i]<<32)/RTjpeg_aan_tab[i]);
  RTjpeg_cqt[i]=(((__u64)RTjpeg_cqt[i]<<32)/RTjpeg_aan_tab[i]);
 }
}

void RTjpeg_dctY(__u8 *idata, __s16 *odata, int rskip)
{
#ifndef HAVE_LIBMMX
  __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  __s32 tmp10, tmp11, tmp12, tmp13;
  __s32 z1, z2, z3, z4, z5, z11, z13;
  __u8 *idataptr;
  __s16 *odataptr;
  __s32 *wsptr;
  int ctr;

  idataptr = idata;
  wsptr = RTjpeg_ws;
  for (ctr = 7; ctr >= 0; ctr--) {
    tmp0 = idataptr[0] + idataptr[7];
    tmp7 = idataptr[0] - idataptr[7];
    tmp1 = idataptr[1] + idataptr[6];
    tmp6 = idataptr[1] - idataptr[6];
    tmp2 = idataptr[2] + idataptr[5];
    tmp5 = idataptr[2] - idataptr[5];
    tmp3 = idataptr[3] + idataptr[4];
    tmp4 = idataptr[3] - idataptr[4];
    
    tmp10 = (tmp0 + tmp3);	/* phase 2 */
    tmp13 = tmp0 - tmp3;
    tmp11 = (tmp1 + tmp2);
    tmp12 = tmp1 - tmp2;
    
    wsptr[0] = (tmp10 + tmp11)<<8; /* phase 3 */
    wsptr[4] = (tmp10 - tmp11)<<8;
    
    z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
    wsptr[2] = (tmp13<<8) + z1;	/* phase 5 */
    wsptr[6] = (tmp13<<8) - z1;
    
    tmp10 = tmp4 + tmp5;	/* phase 2 */
    tmp11 = tmp5 + tmp6;
    tmp12 = tmp6 + tmp7;

    z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
    z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
    z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
    z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */

    z11 = (tmp7<<8) + z3;		/* phase 5 */
    z13 = (tmp7<<8) - z3;

    wsptr[5] = z13 + z2;	/* phase 6 */
    wsptr[3] = z13 - z2;
    wsptr[1] = z11 + z4;
    wsptr[7] = z11 - z4;

    idataptr += rskip<<3;		/* advance pointer to next row */
    wsptr += 8;
  }

  wsptr = RTjpeg_ws;
  odataptr=odata;
  for (ctr = 7; ctr >= 0; ctr--) {
    tmp0 = wsptr[0] + wsptr[56];
    tmp7 = wsptr[0] - wsptr[56];
    tmp1 = wsptr[8] + wsptr[48];
    tmp6 = wsptr[8] - wsptr[48];
    tmp2 = wsptr[16] + wsptr[40];
    tmp5 = wsptr[16] - wsptr[40];
    tmp3 = wsptr[24] + wsptr[32];
    tmp4 = wsptr[24] - wsptr[32];
    
    tmp10 = tmp0 + tmp3;	/* phase 2 */
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;
    
    odataptr[0] = DESCALE10(tmp10 + tmp11); /* phase 3 */
    odataptr[32] = DESCALE10(tmp10 - tmp11);
    
    z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
    odataptr[16] = DESCALE20((tmp13<<8) + z1); /* phase 5 */
    odataptr[48] = DESCALE20((tmp13<<8) - z1);

    tmp10 = tmp4 + tmp5;	/* phase 2 */
    tmp11 = tmp5 + tmp6;
    tmp12 = tmp6 + tmp7;

    z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
    z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
    z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
    z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */

    z11 = (tmp7<<8) + z3;		/* phase 5 */
    z13 = (tmp7<<8) - z3;

    odataptr[40] = DESCALE20(z13 + z2); /* phase 6 */
    odataptr[24] = DESCALE20(z13 - z2);
    odataptr[8] = DESCALE20(z11 + z4);
    odataptr[56] = DESCALE20(z11 - z4);

    odataptr++;			/* advance pointer to next column */
    wsptr++;
  }
#else
  mmx_t tmp6, tmp7;
  register mmx_t *dataptr = (mmx_t *)odata;
  mmx_t *idata2 = (mmx_t *)idata;

   /* first copy the input 8 bit to the destination 16 bits */

   movq_m2r(RTjpeg_zero, mm2);


	movq_m2r(*idata2, mm0);		 
	movq_r2r(mm0, mm1);		 			

	punpcklbw_r2r(mm2, mm0);
	movq_r2m(mm0, *(dataptr));

	punpckhbw_r2r(mm2, mm1);
	movq_r2m(mm1, *(dataptr+1));
	
	idata2 += rskip;

	movq_m2r(*idata2, mm0);		 
	movq_r2r(mm0, mm1);		 			

	punpcklbw_r2r(mm2, mm0);
	movq_r2m(mm0, *(dataptr+2));

	punpckhbw_r2r(mm2, mm1);
	movq_r2m(mm1, *(dataptr+3));
	
	idata2 += rskip;

	movq_m2r(*idata2, mm0);		 
	movq_r2r(mm0, mm1);		 			

	punpcklbw_r2r(mm2, mm0);
	movq_r2m(mm0, *(dataptr+4));

	punpckhbw_r2r(mm2, mm1);
	movq_r2m(mm1, *(dataptr+5));
	
	idata2 += rskip;

	movq_m2r(*idata2, mm0);		 
	movq_r2r(mm0, mm1);		 			

	punpcklbw_r2r(mm2, mm0);
	movq_r2m(mm0, *(dataptr+6));

	punpckhbw_r2r(mm2, mm1);
	movq_r2m(mm1, *(dataptr+7));
	
	idata2 += rskip;

	movq_m2r(*idata2, mm0);		 
	movq_r2r(mm0, mm1);		 			

	punpcklbw_r2r(mm2, mm0);
	movq_r2m(mm0, *(dataptr+8));

	punpckhbw_r2r(mm2, mm1);
	movq_r2m(mm1, *(dataptr+9));
	
	idata2 += rskip;

	movq_m2r(*idata2, mm0);		 
	movq_r2r(mm0, mm1);		 			

	punpcklbw_r2r(mm2, mm0);
	movq_r2m(mm0, *(dataptr+10));

	punpckhbw_r2r(mm2, mm1);
	movq_r2m(mm1, *(dataptr+11));
	
	idata2 += rskip;

	movq_m2r(*idata2, mm0);		 
	movq_r2r(mm0, mm1);		 			

	punpcklbw_r2r(mm2, mm0);
	movq_r2m(mm0, *(dataptr+12));

	punpckhbw_r2r(mm2, mm1);
	movq_r2m(mm1, *(dataptr+13));
	
	idata2 += rskip;

	movq_m2r(*idata2, mm0);		 
	movq_r2r(mm0, mm1);		 			

	punpcklbw_r2r(mm2, mm0);
	movq_r2m(mm0, *(dataptr+14));

	punpckhbw_r2r(mm2, mm1);
	movq_r2m(mm1, *(dataptr+15));

/*  Start Transpose to do calculations on rows */

	movq_m2r(*(dataptr+9), mm7);		 	/* m03:m02|m01:m00 - first line (line 4)and copy into m5 */

	movq_m2r(*(dataptr+13), mm6);	    	/* m23:m22|m21:m20 - third line (line 6)and copy into m2 */
	movq_r2r(mm7, mm5);		 			

	punpcklwd_m2r(*(dataptr+11), mm7); 	/* m11:m01|m10:m00 - interleave first and second lines */
	movq_r2r(mm6, mm2);						 

	punpcklwd_m2r(*(dataptr+15), mm6);  /* m31:m21|m30:m20 - interleave third and fourth lines */
	movq_r2r(mm7, mm1);

	movq_m2r(*(dataptr+11), mm3);	      /* m13:m13|m11:m10 - second line	 */
	punpckldq_r2r(mm6, mm7);				/* m30:m20|m10:m00 - interleave to produce result 1 */

	movq_m2r(*(dataptr+15), mm0);	      /* m13:m13|m11:m10 - fourth line */
	punpckhdq_r2r(mm6, mm1);				/* m31:m21|m11:m01 - interleave to produce result 2 */

	movq_r2m(mm7,*(dataptr+9));			/* write result 1 */
	punpckhwd_r2r(mm3, mm5);				/* m13:m03|m12:m02 - interleave first and second lines */
	
	movq_r2m(mm1,*(dataptr+11));			/* write result 2 */
	punpckhwd_r2r(mm0, mm2);				/* m33:m23|m32:m22 - interleave third and fourth lines */

	movq_r2r(mm5, mm1);
	punpckldq_r2r(mm2, mm5);				/* m32:m22|m12:m02 - interleave to produce result 3 */

	movq_m2r(*(dataptr+1), mm0);			/* m03:m02|m01:m00 - first line, 4x4 */
	punpckhdq_r2r(mm2, mm1);				/* m33:m23|m13:m03 - interleave to produce result 4 */

	movq_r2m(mm5,*(dataptr+13));			/* write result 3 */

	/* last 4x4 done */

	movq_r2m(mm1, *(dataptr+15));			/* write result 4, last 4x4 */

	movq_m2r(*(dataptr+5), mm2);			/* m23:m22|m21:m20 - third line */
	movq_r2r(mm0, mm6);

	punpcklwd_m2r(*(dataptr+3), mm0);  	/* m11:m01|m10:m00 - interleave first and second lines */
	movq_r2r(mm2, mm7);

	punpcklwd_m2r(*(dataptr+7), mm2);  	/* m31:m21|m30:m20 - interleave third and fourth lines */
	movq_r2r(mm0, mm4);

	
	movq_m2r(*(dataptr+8), mm1);			/* n03:n02|n01:n00 - first line  */
	punpckldq_r2r(mm2, mm0);				/* m30:m20|m10:m00 - interleave to produce first result */

	movq_m2r(*(dataptr+12), mm3);			/* n23:n22|n21:n20 - third line */
	punpckhdq_r2r(mm2, mm4);				/* m31:m21|m11:m01 - interleave to produce second result */

	punpckhwd_m2r(*(dataptr+3), mm6);  	/* m13:m03|m12:m02 - interleave first and second lines */
	movq_r2r(mm1, mm2);               	/* copy first line */

	punpckhwd_m2r(*(dataptr+7), mm7);  	/* m33:m23|m32:m22 - interleave third and fourth lines */
	movq_r2r(mm6, mm5);						/* copy first intermediate result */

	movq_r2m(mm0, *(dataptr+8));			/* write result 1 */
	punpckhdq_r2r(mm7, mm5);				/* m33:m23|m13:m03 - produce third result */

	punpcklwd_m2r(*(dataptr+10), mm1);  /* n11:n01|n10:n00 - interleave first and second lines */
	movq_r2r(mm3, mm0);						/* copy third line */

	punpckhwd_m2r(*(dataptr+10), mm2);  /* n13:n03|n12:n02 - interleave first and second lines */

	movq_r2m(mm4, *(dataptr+10));			/* write result 2 out */
	punpckldq_r2r(mm7, mm6);				/* m32:m22|m12:m02 - produce fourth result */

	punpcklwd_m2r(*(dataptr+14), mm3);  /* n31:n21|n30:n20 - interleave third and fourth lines */
	movq_r2r(mm1, mm4);

	movq_r2m(mm6, *(dataptr+12));			/* write result 3 out */
	punpckldq_r2r(mm3, mm1);				/* n30:n20|n10:n00 - produce first result */

	punpckhwd_m2r(*(dataptr+14), mm0);  /* n33:n23|n32:n22 - interleave third and fourth lines */
	movq_r2r(mm2, mm6);

	movq_r2m(mm5, *(dataptr+14));			/* write result 4 out */
	punpckhdq_r2r(mm3, mm4);				/* n31:n21|n11:n01- produce second result */

	movq_r2m(mm1, *(dataptr+1));			/* write result 5 out - (first result for other 4 x 4 block) */
	punpckldq_r2r(mm0, mm2);				/* n32:n22|n12:n02- produce third result */

	movq_r2m(mm4, *(dataptr+3));			/* write result 6 out */
	punpckhdq_r2r(mm0, mm6);				/* n33:n23|n13:n03 - produce fourth result */

	movq_r2m(mm2, *(dataptr+5));			/* write result 7 out*/

	movq_m2r(*dataptr, mm0);				/* m03:m02|m01:m00 - first line, first 4x4 */

	movq_r2m(mm6, *(dataptr+7));			/* write result 8 out */


/* Do first 4x4 quadrant, which is used in the beginning of the DCT: */

	movq_m2r(*(dataptr+4), mm7);			/* m23:m22|m21:m20 - third line */
	movq_r2r(mm0, mm2);

	punpcklwd_m2r(*(dataptr+2), mm0);  	/* m11:m01|m10:m00 - interleave first and second lines */
	movq_r2r(mm7, mm4);

	punpcklwd_m2r(*(dataptr+6), mm7);  	/* m31:m21|m30:m20 - interleave third and fourth lines */
	movq_r2r(mm0, mm1);

	movq_m2r(*(dataptr+2), mm6);			/* m13:m12|m11:m10 - second line */
	punpckldq_r2r(mm7, mm0);				/* m30:m20|m10:m00 - interleave to produce result 1 */

	movq_m2r(*(dataptr+6), mm5);			/* m33:m32|m31:m30 - fourth line */
	punpckhdq_r2r(mm7, mm1);				/* m31:m21|m11:m01 - interleave to produce result 2 */

	movq_r2r(mm0, mm7);						/* write result 1 */
	punpckhwd_r2r(mm6, mm2);				/* m13:m03|m12:m02 - interleave first and second lines */

	psubw_m2r(*(dataptr+14), mm7);		/* tmp07=x0-x7: Stage 1 */
	movq_r2r(mm1, mm6);						/* write result 2 */

	paddw_m2r(*(dataptr+14), mm0);		/* tmp00=x0+x7: Stage 1 */
	punpckhwd_r2r(mm5, mm4);   			/* m33:m23|m32:m22 - interleave third and fourth lines */

	paddw_m2r(*(dataptr+12), mm1);		/* tmp01=x1+x6: Stage 1 */
	movq_r2r(mm2, mm3);						/* copy first intermediate result */

	psubw_m2r(*(dataptr+12), mm6);		/* tmp06=x1-x6: Stage 1 */
	punpckldq_r2r(mm4, mm2);				/* m32:m22|m12:m02 - interleave to produce result 3 */

   movq_r2m(mm7, tmp7);
	movq_r2r(mm2, mm5);						/* write result 3 */

   movq_r2m(mm6, tmp6);
	punpckhdq_r2r(mm4, mm3);				/* m33:m23|m13:m03 - interleave to produce result 4 */

	paddw_m2r(*(dataptr+10), mm2);     	/* tmp02=x2+5: Stage 1 */
	movq_r2r(mm3, mm4);						/* write result 4 */

/************************************************************************************************
					End of Transpose
************************************************************************************************/


   paddw_m2r(*(dataptr+8), mm3);    	/* tmp03=x3+x4: stage 1 */
   movq_r2r(mm0, mm7);

   psubw_m2r(*(dataptr+8), mm4);    	/* tmp04=x3-x4: stage 1 */
   movq_r2r(mm1, mm6);

	paddw_r2r(mm3, mm0);  					/* tmp10 = tmp00 + tmp03: even 2 */
	psubw_r2r(mm3, mm7);  					/* tmp13 = tmp00 - tmp03: even 2 */

	psubw_r2r(mm2, mm6);  					/* tmp12 = tmp01 - tmp02: even 2 */
	paddw_r2r(mm2, mm1);  					/* tmp11 = tmp01 + tmp02: even 2 */

   psubw_m2r(*(dataptr+10), mm5);    	/* tmp05=x2-x5: stage 1 */
	paddw_r2r(mm7, mm6);						/* tmp12 + tmp13 */

	/* stage 3 */

   movq_m2r(tmp6, mm2);
   movq_r2r(mm0, mm3);

	psllw_i2r(2, mm6);			/* m8 * 2^2 */
	paddw_r2r(mm1, mm0);		

	pmulhw_m2r(RTjpeg_C4, mm6);			/* z1 */
	psubw_r2r(mm1, mm3);		

   movq_r2m(mm0, *dataptr);
   movq_r2r(mm7, mm0);
   
    /* Odd part */
   movq_r2m(mm3, *(dataptr+8));
	paddw_r2r(mm5, mm4);						/* tmp10 */

   movq_m2r(tmp7, mm3);
	paddw_r2r(mm6, mm0);						/* tmp32 */

	paddw_r2r(mm2, mm5);						/* tmp11 */
	psubw_r2r(mm6, mm7);						/* tmp33 */

   movq_r2m(mm0, *(dataptr+4));
	paddw_r2r(mm3, mm2);						/* tmp12 */

	/* stage 4 */

   movq_r2m(mm7, *(dataptr+12));
	movq_r2r(mm4, mm1);						/* copy of tmp10 */

	psubw_r2r(mm2, mm1);						/* tmp10 - tmp12 */
	psllw_i2r(2, mm4);			/* m8 * 2^2 */

	movq_m2r(RTjpeg_C2mC6, mm0);		
	psllw_i2r(2, mm1);

	pmulhw_m2r(RTjpeg_C6, mm1);			/* z5 */
	psllw_i2r(2, mm2);

	pmulhw_r2r(mm0, mm4);					/* z5 */

	/* stage 5 */

	pmulhw_m2r(RTjpeg_C2pC6, mm2);
	psllw_i2r(2, mm5);

	pmulhw_m2r(RTjpeg_C4, mm5);			/* z3 */
	movq_r2r(mm3, mm0);						/* copy tmp7 */

   movq_m2r(*(dataptr+1), mm7);
	paddw_r2r(mm1, mm4);						/* z2 */

	paddw_r2r(mm1, mm2);						/* z4 */

	paddw_r2r(mm5, mm0);						/* z11 */
	psubw_r2r(mm5, mm3);						/* z13 */

	/* stage 6 */

	movq_r2r(mm3, mm5);						/* copy z13 */
	psubw_r2r(mm4, mm3);						/* y3=z13 - z2 */

	paddw_r2r(mm4, mm5);						/* y5=z13 + z2 */
	movq_r2r(mm0, mm6);						/* copy z11 */

   movq_r2m(mm3, *(dataptr+6)); 			/*save y3 */
	psubw_r2r(mm2, mm0);						/* y7=z11 - z4 */

   movq_r2m(mm5, *(dataptr+10)); 		/*save y5 */
	paddw_r2r(mm2, mm6);						/* y1=z11 + z4 */

   movq_r2m(mm0, *(dataptr+14)); 		/*save y7 */

	/************************************************
	 *  End of 1st 4 rows
	 ************************************************/

   movq_m2r(*(dataptr+3), mm1); 			/* load x1: stage 1 */
	movq_r2r(mm7, mm0);						/* copy x0 */

   movq_r2m(mm6, *(dataptr+2)); 			/*save y1 */

   movq_m2r(*(dataptr+5), mm2); 			/* load x2: stage 1 */
	movq_r2r(mm1, mm6);						/* copy x1 */

   paddw_m2r(*(dataptr+15), mm0);  		/* tmp00 = x0 + x7 */

   movq_m2r(*(dataptr+7), mm3); 			/* load x3 : stage 1 */
	movq_r2r(mm2, mm5);						/* copy x2 */

   psubw_m2r(*(dataptr+15), mm7);  		/* tmp07 = x0 - x7 */
	movq_r2r(mm3, mm4);						/* copy x3 */

   paddw_m2r(*(dataptr+13), mm1);  		/* tmp01 = x1 + x6 */

	movq_r2m(mm7, tmp7);						/* save tmp07 */
	movq_r2r(mm0, mm7);						/* copy tmp00 */

   psubw_m2r(*(dataptr+13), mm6);  		/* tmp06 = x1 - x6 */

   /* stage 2, Even Part */

   paddw_m2r(*(dataptr+9), mm3);  		/* tmp03 = x3 + x4 */

	movq_r2m(mm6, tmp6);						/* save tmp07 */
	movq_r2r(mm1, mm6);						/* copy tmp01 */

   paddw_m2r(*(dataptr+11), mm2);  		/* tmp02 = x2 + x5 */
	paddw_r2r(mm3, mm0);              	/* tmp10 = tmp00 + tmp03 */

	psubw_r2r(mm3, mm7);              	/* tmp13 = tmp00 - tmp03 */

   psubw_m2r(*(dataptr+9), mm4);  		/* tmp04 = x3 - x4 */
	psubw_r2r(mm2, mm6);              	/* tmp12 = tmp01 - tmp02 */

	paddw_r2r(mm2, mm1);              	/* tmp11 = tmp01 + tmp02 */

   psubw_m2r(*(dataptr+11), mm5);  		/* tmp05 = x2 - x5 */
	paddw_r2r(mm7, mm6);              	/*  tmp12 + tmp13 */

   /* stage 3, Even and stage 4 & 5 even */

	movq_m2r(tmp6, mm2);            		/* load tmp6 */
	movq_r2r(mm0, mm3);						/* copy tmp10 */

	psllw_i2r(2, mm6);			/* shift z1 */
	paddw_r2r(mm1, mm0);    				/* y0=tmp10 + tmp11 */

	pmulhw_m2r(RTjpeg_C4, mm6);    		/* z1 */
	psubw_r2r(mm1, mm3);    				/* y4=tmp10 - tmp11 */

   movq_r2m(mm0, *(dataptr+1)); 			/*save y0 */
	movq_r2r(mm7, mm0);						/* copy tmp13 */
  
	/* odd part */

   movq_r2m(mm3, *(dataptr+9)); 			/*save y4 */
	paddw_r2r(mm5, mm4);              	/* tmp10 = tmp4 + tmp5 */

	movq_m2r(tmp7, mm3);            		/* load tmp7 */
	paddw_r2r(mm6, mm0);              	/* tmp32 = tmp13 + z1 */

	paddw_r2r(mm2, mm5);              	/* tmp11 = tmp5 + tmp6 */
	psubw_r2r(mm6, mm7);              	/* tmp33 = tmp13 - z1 */

   movq_r2m(mm0, *(dataptr+5)); 			/*save y2 */
	paddw_r2r(mm3, mm2);              	/* tmp12 = tmp6 + tmp7 */

	/* stage 4 */

   movq_r2m(mm7, *(dataptr+13)); 		/*save y6 */
	movq_r2r(mm4, mm1);						/* copy tmp10 */

	psubw_r2r(mm2, mm1);    				/* tmp10 - tmp12 */
	psllw_i2r(2, mm4);			/* shift tmp10 */

	movq_m2r(RTjpeg_C2mC6, mm0);			/* load C2mC6 */
	psllw_i2r(2, mm1);			/* shift (tmp10-tmp12) */

	pmulhw_m2r(RTjpeg_C6, mm1);    		/* z5 */
	psllw_i2r(2, mm5);			/* prepare for multiply  */

	pmulhw_r2r(mm0, mm4);					/* multiply by converted real */

	/* stage 5 */

	pmulhw_m2r(RTjpeg_C4, mm5);			/* z3 */
	psllw_i2r(2, mm2);			/* prepare for multiply  */

	pmulhw_m2r(RTjpeg_C2pC6, mm2);		/* multiply */
	movq_r2r(mm3, mm0);						/* copy tmp7 */

	movq_m2r(*(dataptr+9), mm7);		 	/* m03:m02|m01:m00 - first line (line 4)and copy into mm7 */
	paddw_r2r(mm1, mm4);						/* z2 */

	paddw_r2r(mm5, mm0);						/* z11 */
	psubw_r2r(mm5, mm3);						/* z13 */

	/* stage 6 */

	movq_r2r(mm3, mm5);						/* copy z13 */
	paddw_r2r(mm1, mm2);						/* z4 */

	movq_r2r(mm0, mm6);						/* copy z11 */
	psubw_r2r(mm4, mm5);						/* y3 */

	paddw_r2r(mm2, mm6);						/* y1 */
	paddw_r2r(mm4, mm3);						/* y5 */

   movq_r2m(mm5, *(dataptr+7)); 			/*save y3 */

   movq_r2m(mm6, *(dataptr+3)); 			/*save y1 */
	psubw_r2r(mm2, mm0);						/* y7 */
	
/************************************************************************************************
					Start of Transpose
************************************************************************************************/

 	movq_m2r(*(dataptr+13), mm6);		 	/* m23:m22|m21:m20 - third line (line 6)and copy into m2 */
	movq_r2r(mm7, mm5);						/* copy first line */

	punpcklwd_r2r(mm3, mm7); 				/* m11:m01|m10:m00 - interleave first and second lines */
	movq_r2r(mm6, mm2);						/* copy third line */

	punpcklwd_r2r(mm0, mm6);  				/* m31:m21|m30:m20 - interleave third and fourth lines */
	movq_r2r(mm7, mm1);						/* copy first intermediate result */

	punpckldq_r2r(mm6, mm7);				/* m30:m20|m10:m00 - interleave to produce result 1 */

	punpckhdq_r2r(mm6, mm1);				/* m31:m21|m11:m01 - interleave to produce result 2 */

	movq_r2m(mm7, *(dataptr+9));			/* write result 1 */
	punpckhwd_r2r(mm3, mm5);				/* m13:m03|m12:m02 - interleave first and second lines */

	movq_r2m(mm1, *(dataptr+11));			/* write result 2 */
	punpckhwd_r2r(mm0, mm2);				/* m33:m23|m32:m22 - interleave third and fourth lines */

	movq_r2r(mm5, mm1);						/* copy first intermediate result */
	punpckldq_r2r(mm2, mm5);				/* m32:m22|m12:m02 - interleave to produce result 3 */

	movq_m2r(*(dataptr+1), mm0);			/* m03:m02|m01:m00 - first line, 4x4 */
	punpckhdq_r2r(mm2, mm1);				/* m33:m23|m13:m03 - interleave to produce result 4 */

	movq_r2m(mm5, *(dataptr+13));			/* write result 3 */

	/****** last 4x4 done */

	movq_r2m(mm1, *(dataptr+15));			/* write result 4, last 4x4 */

	movq_m2r(*(dataptr+5), mm2);			/* m23:m22|m21:m20 - third line */
	movq_r2r(mm0, mm6);						/* copy first line */

	punpcklwd_m2r(*(dataptr+3), mm0);  	/* m11:m01|m10:m00 - interleave first and second lines */
	movq_r2r(mm2, mm7);						/* copy third line */

	punpcklwd_m2r(*(dataptr+7), mm2);  	/* m31:m21|m30:m20 - interleave third and fourth lines */
	movq_r2r(mm0, mm4);						/* copy first intermediate result */

	

	movq_m2r(*(dataptr+8), mm1);			/* n03:n02|n01:n00 - first line  */
	punpckldq_r2r(mm2, mm0);				/* m30:m20|m10:m00 - interleave to produce first result */

	movq_m2r(*(dataptr+12), mm3);			/* n23:n22|n21:n20 - third line */
	punpckhdq_r2r(mm2, mm4);				/* m31:m21|m11:m01 - interleave to produce second result */

	punpckhwd_m2r(*(dataptr+3), mm6);  	/* m13:m03|m12:m02 - interleave first and second lines */
	movq_r2r(mm1, mm2);						/* copy first line */

	punpckhwd_m2r(*(dataptr+7), mm7);  	/* m33:m23|m32:m22 - interleave third and fourth lines */
	movq_r2r(mm6, mm5);						/* copy first intermediate result */

	movq_r2m(mm0, *(dataptr+8));			/* write result 1 */
	punpckhdq_r2r(mm7, mm5);				/* m33:m23|m13:m03 - produce third result */

	punpcklwd_m2r(*(dataptr+10), mm1);  /* n11:n01|n10:n00 - interleave first and second lines */
	movq_r2r(mm3, mm0);						/* copy third line */

	punpckhwd_m2r(*(dataptr+10), mm2);  /* n13:n03|n12:n02 - interleave first and second lines */

	movq_r2m(mm4, *(dataptr+10));			/* write result 2 out */
	punpckldq_r2r(mm7, mm6);				/* m32:m22|m12:m02 - produce fourth result */

	punpcklwd_m2r(*(dataptr+14), mm3);  /* n33:n23|n32:n22 - interleave third and fourth lines */
	movq_r2r(mm1, mm4);						/* copy second intermediate result */

	movq_r2m(mm6, *(dataptr+12));			/* write result 3 out */
	punpckldq_r2r(mm3, mm1);				/*  */

	punpckhwd_m2r(*(dataptr+14), mm0);  /* n33:n23|n32:n22 - interleave third and fourth lines */
	movq_r2r(mm2, mm6);						/* copy second intermediate result */

	movq_r2m(mm5, *(dataptr+14));			/* write result 4 out */
	punpckhdq_r2r(mm3, mm4);				/* n31:n21|n11:n01- produce second result */

	movq_r2m(mm1, *(dataptr+1));			/* write result 5 out - (first result for other 4 x 4 block) */
	punpckldq_r2r(mm0, mm2);				/* n32:n22|n12:n02- produce third result */

	movq_r2m(mm4, *(dataptr+3));			/* write result 6 out */
	punpckhdq_r2r(mm0, mm6);				/* n33:n23|n13:n03 - produce fourth result */

	movq_r2m(mm2, *(dataptr+5));			/* write result 7 out */

	movq_m2r(*dataptr, mm0);				/* m03:m02|m01:m00 - first line, first 4x4 */

	movq_r2m(mm6, *(dataptr+7));			/* write result 8 out */

/* Do first 4x4 quadrant, which is used in the beginning of the DCT: */

	movq_m2r(*(dataptr+4), mm7);			/* m23:m22|m21:m20 - third line */
	movq_r2r(mm0, mm2);						/* copy first line */

	punpcklwd_m2r(*(dataptr+2), mm0);  	/* m11:m01|m10:m00 - interleave first and second lines */
	movq_r2r(mm7, mm4);						/* copy third line */
	
	punpcklwd_m2r(*(dataptr+6), mm7);  	/* m31:m21|m30:m20 - interleave third and fourth lines */
	movq_r2r(mm0, mm1);						/* copy first intermediate result */

	movq_m2r(*(dataptr+2), mm6);			/* m13:m12|m11:m10 - second line */
	punpckldq_r2r(mm7, mm0);				/* m30:m20|m10:m00 - interleave to produce result 1 */

	movq_m2r(*(dataptr+6), mm5);			/* m33:m32|m31:m30 - fourth line */
	punpckhdq_r2r(mm7, mm1);				/* m31:m21|m11:m01 - interleave to produce result 2 */

	movq_r2r(mm0, mm7);						/* write result 1 */
	punpckhwd_r2r(mm6, mm2);				/* m13:m03|m12:m02 - interleave first and second lines */

	psubw_m2r(*(dataptr+14), mm7);		/* tmp07=x0-x7: Stage 1 */
	movq_r2r(mm1, mm6);						/* write result 2 */

	paddw_m2r(*(dataptr+14), mm0);		/* tmp00=x0+x7: Stage 1 */
	punpckhwd_r2r(mm5, mm4);   			/* m33:m23|m32:m22 - interleave third and fourth lines */

	paddw_m2r(*(dataptr+12), mm1);		/* tmp01=x1+x6: Stage 1 */
	movq_r2r(mm2, mm3);						/* copy first intermediate result */

	psubw_m2r(*(dataptr+12), mm6);		/* tmp06=x1-x6: Stage 1 */
	punpckldq_r2r(mm4, mm2);				/* m32:m22|m12:m02 - interleave to produce result 3 */

	movq_r2m(mm7, tmp7);						/* save tmp07 */
	movq_r2r(mm2, mm5);						/* write result 3 */

	movq_r2m(mm6, tmp6);						/* save tmp06 */

	punpckhdq_r2r(mm4, mm3);				/* m33:m23|m13:m03 - interleave to produce result 4 */

	paddw_m2r(*(dataptr+10), mm2);   	/* tmp02=x2+x5: stage 1 */
	movq_r2r(mm3, mm4);						/* write result 4 */

/************************************************************************************************
					End of Transpose 2
************************************************************************************************/

   paddw_m2r(*(dataptr+8), mm3);    	/* tmp03=x3+x4: stage 1 */
   movq_r2r(mm0, mm7);

   psubw_m2r(*(dataptr+8), mm4);    	/* tmp04=x3-x4: stage 1 */
   movq_r2r(mm1, mm6);

	paddw_r2r(mm3, mm0);  					/* tmp10 = tmp00 + tmp03: even 2 */
	psubw_r2r(mm3, mm7);  					/* tmp13 = tmp00 - tmp03: even 2 */

	psubw_r2r(mm2, mm6);  					/* tmp12 = tmp01 - tmp02: even 2 */
	paddw_r2r(mm2, mm1);  					/* tmp11 = tmp01 + tmp02: even 2 */

   psubw_m2r(*(dataptr+10), mm5);    	/* tmp05=x2-x5: stage 1 */
	paddw_r2r(mm7, mm6);						/* tmp12 + tmp13 */

	/* stage 3 */

   movq_m2r(tmp6, mm2);
   movq_r2r(mm0, mm3);

	psllw_i2r(2, mm6);			/* m8 * 2^2 */
	paddw_r2r(mm1, mm0);		

	pmulhw_m2r(RTjpeg_C4, mm6);			/* z1 */
	psubw_r2r(mm1, mm3);		

   movq_r2m(mm0, *dataptr);
   movq_r2r(mm7, mm0);
   
    /* Odd part */
   movq_r2m(mm3, *(dataptr+8));
	paddw_r2r(mm5, mm4);						/* tmp10 */

   movq_m2r(tmp7, mm3);
	paddw_r2r(mm6, mm0);						/* tmp32 */

	paddw_r2r(mm2, mm5);						/* tmp11 */
	psubw_r2r(mm6, mm7);						/* tmp33 */

   movq_r2m(mm0, *(dataptr+4));
	paddw_r2r(mm3, mm2);						/* tmp12 */

	/* stage 4 */
   movq_r2m(mm7, *(dataptr+12));
	movq_r2r(mm4, mm1);						/* copy of tmp10 */

	psubw_r2r(mm2, mm1);						/* tmp10 - tmp12 */
	psllw_i2r(2, mm4);			/* m8 * 2^2 */

	movq_m2r(RTjpeg_C2mC6, mm0);
	psllw_i2r(2, mm1);

	pmulhw_m2r(RTjpeg_C6, mm1);			/* z5 */
	psllw_i2r(2, mm2);

	pmulhw_r2r(mm0, mm4);					/* z5 */

	/* stage 5 */

	pmulhw_m2r(RTjpeg_C2pC6, mm2);
	psllw_i2r(2, mm5);

	pmulhw_m2r(RTjpeg_C4, mm5);			/* z3 */
	movq_r2r(mm3, mm0);						/* copy tmp7 */

   movq_m2r(*(dataptr+1), mm7);
	paddw_r2r(mm1, mm4);						/* z2 */

	paddw_r2r(mm1, mm2);						/* z4 */

	paddw_r2r(mm5, mm0);						/* z11 */
	psubw_r2r(mm5, mm3);						/* z13 */

	/* stage 6 */

	movq_r2r(mm3, mm5);						/* copy z13 */
	psubw_r2r(mm4, mm3);						/* y3=z13 - z2 */

	paddw_r2r(mm4, mm5);						/* y5=z13 + z2 */
	movq_r2r(mm0, mm6);						/* copy z11 */

   movq_r2m(mm3, *(dataptr+6)); 			/*save y3 */
	psubw_r2r(mm2, mm0);						/* y7=z11 - z4 */

   movq_r2m(mm5, *(dataptr+10)); 		/*save y5 */
	paddw_r2r(mm2, mm6);						/* y1=z11 + z4 */

   movq_r2m(mm0, *(dataptr+14)); 		/*save y7 */

	/************************************************
	 *  End of 1st 4 rows
	 ************************************************/

   movq_m2r(*(dataptr+3), mm1); 			/* load x1  : stage 1 */
	movq_r2r(mm7, mm0);						/* copy x0 */

   movq_r2m(mm6, *(dataptr+2)); 			/*save y1 */

   movq_m2r(*(dataptr+5), mm2); 			/* load x2  : stage 1 */
	movq_r2r(mm1, mm6);						/* copy x1 */

   paddw_m2r(*(dataptr+15), mm0);  		/* tmp00 = x0 + x7 */

   movq_m2r(*(dataptr+7), mm3); 			/* load x3  : stage 1 */
	movq_r2r(mm2, mm5);						/* copy x2 */

   psubw_m2r(*(dataptr+15), mm7);  		/* tmp07 = x0 - x7 */
	movq_r2r(mm3, mm4);						/* copy x3 */

   paddw_m2r(*(dataptr+13), mm1);  		/* tmp01 = x1 + x6 */

	movq_r2m(mm7, tmp7);						/* save tmp07 */
	movq_r2r(mm0, mm7);						/* copy tmp00 */

   psubw_m2r(*(dataptr+13), mm6);  		/* tmp06 = x1 - x6 */

   /* stage 2, Even Part */

   paddw_m2r(*(dataptr+9), mm3);  		/* tmp03 = x3 + x4 */

	movq_r2m(mm6, tmp6);						/* save tmp06 */
	movq_r2r(mm1, mm6);						/* copy tmp01 */

   paddw_m2r(*(dataptr+11), mm2);  		/* tmp02 = x2 + x5 */
	paddw_r2r(mm3, mm0);              	/* tmp10 = tmp00 + tmp03 */

	psubw_r2r(mm3, mm7);              	/* tmp13 = tmp00 - tmp03 */

   psubw_m2r(*(dataptr+9), mm4);  		/* tmp04 = x3 - x4 */
	psubw_r2r(mm2, mm6);              	/* tmp12 = tmp01 - tmp02 */

	paddw_r2r(mm2, mm1);              	/* tmp11 = tmp01 + tmp02 */

   psubw_m2r(*(dataptr+11), mm5);  		/* tmp05 = x2 - x5 */
	paddw_r2r(mm7, mm6);              	/*  tmp12 + tmp13 */

   /* stage 3, Even and stage 4 & 5 even */

	movq_m2r(tmp6, mm2);            		/* load tmp6 */
	movq_r2r(mm0, mm3);						/* copy tmp10 */

	psllw_i2r(2, mm6);			/* shift z1 */
	paddw_r2r(mm1, mm0);    				/* y0=tmp10 + tmp11 */

	pmulhw_m2r(RTjpeg_C4, mm6);    		/* z1 */
	psubw_r2r(mm1, mm3);    				/* y4=tmp10 - tmp11 */

   movq_r2m(mm0, *(dataptr+1)); 			/*save y0 */
	movq_r2r(mm7, mm0);						/* copy tmp13 */
  
	/* odd part */

   movq_r2m(mm3, *(dataptr+9)); 			/*save y4 */
	paddw_r2r(mm5, mm4);              	/* tmp10 = tmp4 + tmp5 */

	movq_m2r(tmp7, mm3);            		/* load tmp7 */
	paddw_r2r(mm6, mm0);              	/* tmp32 = tmp13 + z1 */

	paddw_r2r(mm2, mm5);              	/* tmp11 = tmp5 + tmp6 */
	psubw_r2r(mm6, mm7);              	/* tmp33 = tmp13 - z1 */

   movq_r2m(mm0, *(dataptr+5)); 			/*save y2 */
	paddw_r2r(mm3, mm2);              	/* tmp12 = tmp6 + tmp7 */

	/* stage 4 */

   movq_r2m(mm7, *(dataptr+13)); 		/*save y6 */
	movq_r2r(mm4, mm1);						/* copy tmp10 */

	psubw_r2r(mm2, mm1);    				/* tmp10 - tmp12 */
	psllw_i2r(2, mm4);			/* shift tmp10 */

	movq_m2r(RTjpeg_C2mC6, mm0);			/* load C2mC6 */
	psllw_i2r(2, mm1);			/* shift (tmp10-tmp12) */

	pmulhw_m2r(RTjpeg_C6, mm1);    		/* z5 */
	psllw_i2r(2, mm5);			/* prepare for multiply  */

	pmulhw_r2r(mm0, mm4);					/* multiply by converted real */

	/* stage 5 */

	pmulhw_m2r(RTjpeg_C4, mm5);			/* z3 */
	psllw_i2r(2, mm2);			/* prepare for multiply  */

	pmulhw_m2r(RTjpeg_C2pC6, mm2);		/* multiply */
	movq_r2r(mm3, mm0);						/* copy tmp7 */

	movq_m2r(*(dataptr+9), mm7);		 	/* m03:m02|m01:m00 - first line (line 4)and copy into mm7 */
	paddw_r2r(mm1, mm4);						/* z2 */

	paddw_r2r(mm5, mm0);						/* z11 */
	psubw_r2r(mm5, mm3);						/* z13 */

	/* stage 6 */

	movq_r2r(mm3, mm5);						/* copy z13 */
	paddw_r2r(mm1, mm2);						/* z4 */

	movq_r2r(mm0, mm6);						/* copy z11 */
	psubw_r2r(mm4, mm5);						/* y3 */

	paddw_r2r(mm2, mm6);						/* y1 */
	paddw_r2r(mm4, mm3);						/* y5 */

   movq_r2m(mm5, *(dataptr+7)); 			/*save y3 */
	psubw_r2r(mm2, mm0);						/* y7=z11 - z4 */

   movq_r2m(mm3, *(dataptr+11)); 		/*save y5 */

   movq_r2m(mm6, *(dataptr+3)); 			/*save y1 */

   movq_r2m(mm0, *(dataptr+15)); 		/*save y7 */
	

#endif
}

/*
 * Fixed-point constants.  Each value is the named real number multiplied
 * by 256 (2^8) and rounded to the nearest integer, which matches the
 * ">>8" performed by MULTIPLY() below.
 */
#define FIX_1_082392200  ((__s32)  277)		/* FIX(1.082392200) */
#define FIX_1_414213562  ((__s32)  362)		/* FIX(1.414213562) */
#define FIX_1_847759065  ((__s32)  473)		/* FIX(1.847759065) */
#define FIX_2_613125930  ((__s32)  669)		/* FIX(2.613125930) */

/*
 * Divide by 8 with rounding (add 4, then shift right by 3) and narrow the
 * result to 16 bits.  The whole expansion is parenthesised so the macro
 * behaves as a single operand inside larger expressions.
 */
#define DESCALE(x) ((__s16)(((x)+4) >> 3))

/*
 * Clip yuv to 16..235 (should be 16..240 for cr/cb, but a single range is
 * used here).
 *
 * The expansion is wrapped in parentheses: without them an expression such
 * as "RL(v) + 1" or "2 * RL(v)" would bind to only part of the conditional
 * and yield the wrong value.  Note that the argument is evaluated more than
 * once, so it must not have side effects.
 */
#define RL(x) (((x)>235) ? 235 : (((x)<16) ? 16 : (x)))

/*
 * Multiply var by an 8-bit-fraction fixed-point constant k: the product is
 * rounded (add 128) and the 8 fractional bits are shifted out.
 */
#define MULTIPLY(var,k)  (((__s32) ((var) * (k)) + 128)>>8)

void RTjpeg_idct_init(void)
{
 int i;
 
 for(i=0; i<64; i++)
 {
  RTjpeg_liqt[i]=((__u64)RTjpeg_liqt[i]*RTjpeg_aan_tab[i])>>32;
  RTjpeg_ciqt[i]=((__u64)RTjpeg_ciqt[i]*RTjpeg_aan_tab[i])>>32;
 }
}

void RTjpeg_idct(__u8 *odata, __s16 *data, int rskip)
{
#ifdef HAVE_LIBMMX

static mmx_t fix_141			= (mmx_t)(long long)0x5a825a825a825a82LL;
static mmx_t fix_184n261	= (mmx_t)(long long)0xcf04cf04cf04cf04LL;
static mmx_t fix_184			= (mmx_t)(long long)0x7641764176417641LL;
static mmx_t fix_n184		= (mmx_t)(long long)0x896f896f896f896fLL;
static mmx_t fix_108n184	= (mmx_t)(long long)0xcf04cf04cf04cf04LL;

  mmx_t workspace[64];
  mmx_t *wsptr = workspace;
  register mmx_t *dataptr = (mmx_t *)odata;
  mmx_t *idata = (mmx_t *)data;

  rskip = rskip>>3;
/*
 * Perform inverse DCT on one block of coefficients.
 */

    /* Odd part */

	movq_m2r(*(idata+10), mm1);	/* load idata[DCTSIZE*5] */

	movq_m2r(*(idata+6), mm0);		/* load idata[DCTSIZE*3] */

	movq_m2r(*(idata+2), mm3);		/* load idata[DCTSIZE*1] */

	movq_r2r(mm1, mm2);				/* copy tmp6	: phase 6 */

	movq_m2r(*(idata+14), mm4);	/* load idata[DCTSIZE*7] */

	paddw_r2r(mm0, mm1);				/* z13 = tmp6 + tmp5; */

	psubw_r2r(mm0, mm2);				/* z10 = tmp6 - tmp5    */

	psllw_i2r(2, mm2);				/* shift z10 */
	movq_r2r(mm2, mm0); 				/* copy z10 */

	pmulhw_m2r(fix_184n261, mm2);	/* MULTIPLY( z12, FIX_1_847759065); : 2*c2 */
	movq_r2r(mm3, mm5);				/* copy tmp4 */

	pmulhw_m2r(fix_n184, mm0);		/* MULTIPLY(z10, -FIX_1_847759065); : 2*c2 */
	paddw_r2r(mm4, mm3);				/* z11 = tmp4 + tmp7; */

	movq_r2r(mm3, mm6);				/* copy z11			: phase 5 */
	psubw_r2r(mm4, mm5);				/* z12 = tmp4 - tmp7; */

	psubw_r2r(mm1, mm6);				/* z11-z13 */
	psllw_i2r(2, mm5);				/*	shift z12 */

	movq_m2r(*(idata+12), mm4);	/* load idata[DCTSIZE*6], even part */
 	movq_r2r(mm5, mm7);				/*	copy z12 */

	pmulhw_m2r(fix_108n184, mm5); /*	MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; 2*(c2-c6): even part */
	paddw_r2r(mm1, mm3);				/* tmp7 = z11 + z13;	 */

	/*ok */

    /* Even part */
	pmulhw_m2r(fix_184, mm7);		/* MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; -2*(c2+c6) */
	psllw_i2r(2, mm6);

	movq_m2r(*(idata+4), mm1);		/* load idata[DCTSIZE*2] */

	paddw_r2r(mm5, mm0);				/*	tmp10 */

	paddw_r2r(mm7, mm2);				/* tmp12 */

	pmulhw_m2r(fix_141, mm6);		/* tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); 2*c4 */
	psubw_r2r(mm3, mm2);				/* tmp6 = tmp12 - tmp7 */

	movq_r2r(mm1, mm5);				/* copy tmp1 */
	paddw_r2r(mm4, mm1);				/* tmp13= tmp1 + tmp3; phases 5-3 */

	psubw_r2r(mm4, mm5);				/* tmp1-tmp3 */
	psubw_r2r(mm2, mm6);				/* tmp5 = tmp11 - tmp6; */

	movq_r2m(mm1, *(wsptr));		/* save tmp13 in workspace */
	psllw_i2r(2, mm5);	/* shift tmp1-tmp3 */
    
	movq_m2r(*(idata), mm7); 		/* load idata[DCTSIZE*0] */

	pmulhw_m2r(fix_141, mm5);		/* MULTIPLY(tmp1 - tmp3, FIX_1_414213562) */
	paddw_r2r(mm6, mm0);				/* tmp4 = tmp10 + tmp5; */

	movq_m2r(*(idata+8), mm4); 	/* load idata[DCTSIZE*4] */
	
	psubw_r2r(mm1, mm5);				/* tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; 2*c4 */

	movq_r2m(mm0, *(wsptr+4));		/* save tmp4 in workspace */
	movq_r2r(mm7, mm1);			 	/* copy tmp0	: phase 3 */

	movq_r2m(mm5, *(wsptr+2));		/* save tmp12 in workspace */
	psubw_r2r(mm4, mm1);				/* tmp11 = tmp0 - tmp2;  */

	paddw_r2r(mm4, mm7);				/* tmp10 = tmp0 + tmp2; */
   movq_r2r(mm1, mm5);				/* copy tmp11 */
	
	paddw_m2r(*(wsptr+2), mm1);	/* tmp1 = tmp11 + tmp12; */
	movq_r2r(mm7, mm4);				/* copy tmp10		: phase 2 */

	paddw_m2r(*(wsptr), mm7);		/* tmp0 = tmp10 + tmp13;	 */

	psubw_m2r(*(wsptr), mm4);		/* tmp3 = tmp10 - tmp13; */
	movq_r2r(mm7, mm0);				/*	copy tmp0 */

	psubw_m2r(*(wsptr+2), mm5);	/* tmp2 = tmp11 - tmp12; */
	paddw_r2r(mm3, mm7);				/*	wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); */
	
	psubw_r2r(mm3, mm0);				/* wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); */

	movq_r2m(mm7, *(wsptr));		/*	wsptr[DCTSIZE*0] */
	movq_r2r(mm1, mm3);				/*	copy tmp1 */

	movq_r2m(mm0, *(wsptr+14));		/* wsptr[DCTSIZE*7] */
	paddw_r2r(mm2, mm1);				/* wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); */

	psubw_r2r(mm2, mm3);				/* wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); */

	movq_r2m(mm1, *(wsptr+2));		/* wsptr[DCTSIZE*1] */
	movq_r2r(mm4, mm1);				/*	copy tmp3 */

	movq_r2m(mm3, *(wsptr+12));		/* wsptr[DCTSIZE*6] */

	paddw_m2r(*(wsptr+4), mm4);	/* wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); */

	psubw_m2r(*(wsptr+4), mm1); 	/* wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); */

	movq_r2m(mm4, *(wsptr+8));		
	movq_r2r(mm5, mm7);				/* copy tmp2 */

	paddw_r2r(mm6, mm5);				/* wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5) */

	movq_r2m(mm1, *(wsptr+6));	
	psubw_r2r(mm6, mm7);				/*	wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); */

	movq_r2m(mm5, *(wsptr+4));	

	movq_r2m(mm7, *(wsptr+10));		

	/*ok */


/*****************************************************************/

	idata++;
	wsptr++;

/*****************************************************************/

	movq_m2r(*(idata+10), mm1);	/* load idata[DCTSIZE*5] */

	movq_m2r(*(idata+6), mm0);		/* load idata[DCTSIZE*3] */

	movq_m2r(*(idata+2),	mm3);		/* load idata[DCTSIZE*1] */
	movq_r2r(mm1, mm2);				/*	copy tmp6	: phase 6 */

	movq_m2r(*(idata+14),	mm4);		/* load idata[DCTSIZE*7] */
	paddw_r2r(mm0, mm1);				/*	z13 = tmp6 + tmp5; */

	psubw_r2r(mm0, mm2);				/*	z10 = tmp6 - tmp5    */

	psllw_i2r(2, mm2);				/*	shift z10 */
	movq_r2r(mm2, mm0);				/*	copy z10 */

	pmulhw_m2r(fix_184n261, mm2);	/* MULTIPLY( z12, FIX_1_847759065); : 2*c2 */
	movq_r2r(mm3, mm5);				/*	copy tmp4 */

	pmulhw_m2r(fix_n184, mm0);		/* MULTIPLY(z10, -FIX_1_847759065); : 2*c2 */
	paddw_r2r(mm4, mm3);				/* z11 = tmp4 + tmp7; */

	movq_r2r(mm3, mm6);				/* copy z11			: phase 5 */
	psubw_r2r(mm4, mm5);				/*	z12 = tmp4 - tmp7; */

	psubw_r2r(mm1, mm6);				/* z11-z13 */
	psllw_i2r(2, mm5);				/*	shift z12 */

	movq_m2r(*(idata+12), mm4);	/* load idata[DCTSIZE*6], even part */
 	movq_r2r(mm5, mm7);				/* copy z12 */

	pmulhw_m2r(fix_108n184, mm5);	/* MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; 2*(c2-c6) even part */
	paddw_r2r(mm1, mm3);				/* tmp7 = z11 + z13;	 */

	/*ok */

    /* Even part */
	pmulhw_m2r(fix_184, mm7);		/* MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; -2*(c2+c6) */
	psllw_i2r(2, mm6);

	movq_m2r(*(idata+4), mm1);		/* load idata[DCTSIZE*2] */

	paddw_r2r(mm5, mm0);				/*	tmp10 */

	paddw_r2r(mm7, mm2);				/* tmp12 */

	pmulhw_m2r(fix_141, mm6);		/* tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); 2*c4 */
	psubw_r2r(mm3, mm2);				/* tmp6 = tmp12 - tmp7 */

	movq_r2r(mm1, mm5);				/* copy tmp1 */
	paddw_r2r(mm4, mm1);				/* tmp13= tmp1 + tmp3;	phases 5-3 */

	psubw_r2r(mm4, mm5);				/* tmp1-tmp3 */
	psubw_r2r(mm2, mm6);				/* tmp5 = tmp11 - tmp6; */

	movq_r2m(mm1, *(wsptr));		/* save tmp13 in workspace */
	psllw_i2r(2, mm5); 				/* shift tmp1-tmp3 */
    
	movq_m2r(*(idata), mm7);		/* load idata[DCTSIZE*0] */
	paddw_r2r(mm6, mm0);				/* tmp4 = tmp10 + tmp5; */

	pmulhw_m2r(fix_141, mm5);		/* MULTIPLY(tmp1 - tmp3, FIX_1_414213562) */

	movq_m2r(*(idata+8), mm4);    /* load idata[DCTSIZE*4] */
	
	psubw_r2r(mm1, mm5);				/* tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; 2*c4 */

	movq_r2m(mm0, *(wsptr+4));		/* save tmp4 in workspace */
	movq_r2r(mm7, mm1);				/* copy tmp0: phase 3 */

	movq_r2m(mm5, *(wsptr+2));		/* save tmp12 in workspace */
	psubw_r2r(mm4, mm1);				/* tmp11 = tmp0 - tmp2;  */

	paddw_r2r(mm4, mm7);				/* tmp10 = tmp0 + tmp2; */
   movq_r2r(mm1, mm5);				/* copy tmp11 */
	
	paddw_m2r(*(wsptr+2), mm1);	/* tmp1 = tmp11 + tmp12; */
	movq_r2r(mm7, mm4);				/* copy tmp10: phase 2 */

	paddw_m2r(*(wsptr), mm7);		/* tmp0 = tmp10 + tmp13;	 */

	psubw_m2r(*(wsptr), mm4);		/* tmp3 = tmp10 - tmp13; */
	movq_r2r(mm7, mm0);				/* copy tmp0 */

	psubw_m2r(*(wsptr+2), mm5);	/* tmp2 = tmp11 - tmp12; */
	paddw_r2r(mm3, mm7);				/* wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); */
	
	psubw_r2r(mm3, mm0);				/* wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); */

	movq_r2m(mm7, *(wsptr));		/* wsptr[DCTSIZE*0] */
	movq_r2r(mm1, mm3);				/* copy tmp1 */

	movq_r2m(mm0, *(wsptr+14));		/* wsptr[DCTSIZE*7] */
	paddw_r2r(mm2, mm1);				/* wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); */

	psubw_r2r(mm2, mm3);				/* wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); */

	movq_r2m(mm1, *(wsptr+2));		/* wsptr[DCTSIZE*1] */
	movq_r2r(mm4, mm1);				/* copy tmp3 */

	movq_r2m(mm3, *(wsptr+12));		/* wsptr[DCTSIZE*6] */

	paddw_m2r(*(wsptr+4), mm4);	/* wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); */

	psubw_m2r(*(wsptr+4), mm1);	/* wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); */

	movq_r2m(mm4, *(wsptr+8));		
	movq_r2r(mm5, mm7);				/* copy tmp2 */

	paddw_r2r(mm6, mm5);				/* wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5) */

	movq_r2m(mm1, *(wsptr+6));		
	psubw_r2r(mm6, mm7);				/* wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); */

	movq_r2m(mm5, *(wsptr+4));	

	movq_r2m(mm7, *(wsptr+10));

/*****************************************************************/

  /* Pass 2: process rows from work array, store into output array. */
  /* Note that we must descale the results by a factor of 8 == 2**3, */
  /* and also undo the PASS1_BITS scaling. */

/*****************************************************************/
    /* Even part */

	wsptr--;

/*    tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); */
/*    tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); */
/*    tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); */
/*    tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]); */
	movq_m2r(*(wsptr), mm0);		/* wsptr[0,0],[0,1],[0,2],[0,3] */

	movq_m2r(*(wsptr+1),	mm1);		/* wsptr[0,4],[0,5],[0,6],[0,7] */
	movq_r2r(mm0, mm2);
	
	movq_m2r(*(wsptr+2), mm3);		/* wsptr[1,0],[1,1],[1,2],[1,3] */
	paddw_r2r(mm1, mm0);				/* wsptr[0,tmp10],[xxx],[0,tmp13],[xxx] */

	movq_m2r(*(wsptr+3), mm4);		/* wsptr[1,4],[1,5],[1,6],[1,7] */
	psubw_r2r(mm1, mm2);				/* wsptr[0,tmp11],[xxx],[0,tmp14],[xxx] */

	movq_r2r(mm0, mm6);
	movq_r2r(mm3, mm5);
	
	paddw_r2r(mm4, mm3);				/* wsptr[1,tmp10],[xxx],[1,tmp13],[xxx] */
	movq_r2r(mm2, mm1);

	psubw_r2r(mm4, mm5);				/* wsptr[1,tmp11],[xxx],[1,tmp14],[xxx] */
	punpcklwd_r2r(mm3, mm0);		/* wsptr[0,tmp10],[1,tmp10],[xxx],[xxx] */

	movq_m2r(*(wsptr+7), mm7);		/* wsptr[3,4],[3,5],[3,6],[3,7] */
	punpckhwd_r2r(mm3, mm6);		/* wsptr[0,tmp13],[1,tmp13],[xxx],[xxx] */

	movq_m2r(*(wsptr+4), mm3);		/* wsptr[2,0],[2,1],[2,2],[2,3] */
	punpckldq_r2r(mm6, mm0);		/* wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */

	punpcklwd_r2r(mm5, mm1);		/* wsptr[0,tmp11],[1,tmp11],[xxx],[xxx] */
	movq_r2r(mm3, mm4);

	movq_m2r(*(wsptr+6), mm6);		/* wsptr[3,0],[3,1],[3,2],[3,3] */
	punpckhwd_r2r(mm5, mm2);		/* wsptr[0,tmp14],[1,tmp14],[xxx],[xxx] */

	movq_m2r(*(wsptr+5), mm5);		/* wsptr[2,4],[2,5],[2,6],[2,7] */
	punpckldq_r2r(mm2, mm1);		/* wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */

	
	paddw_r2r(mm5, mm3);				/* wsptr[2,tmp10],[xxx],[2,tmp13],[xxx] */
	movq_r2r(mm6, mm2);

	psubw_r2r(mm5, mm4);				/* wsptr[2,tmp11],[xxx],[2,tmp14],[xxx] */
	paddw_r2r(mm7, mm6);				/* wsptr[3,tmp10],[xxx],[3,tmp13],[xxx] */

	movq_r2r(mm3, mm5);
	punpcklwd_r2r(mm6, mm3);		/* wsptr[2,tmp10],[3,tmp10],[xxx],[xxx] */
	
	psubw_r2r(mm7, mm2);				/* wsptr[3,tmp11],[xxx],[3,tmp14],[xxx] */
	punpckhwd_r2r(mm6, mm5);		/* wsptr[2,tmp13],[3,tmp13],[xxx],[xxx] */

	movq_r2r(mm4, mm7);
	punpckldq_r2r(mm5, mm3);		/* wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13] */
						 
	punpcklwd_r2r(mm2, mm4);		/* wsptr[2,tmp11],[3,tmp11],[xxx],[xxx] */

	punpckhwd_r2r(mm2, mm7);		/* wsptr[2,tmp14],[3,tmp14],[xxx],[xxx] */

	punpckldq_r2r(mm7, mm4);		/* wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14] */
	movq_r2r(mm1, mm6);

	/*ok */

/*	mm0 = 	;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */
/*	mm1 =	;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */


	movq_r2r(mm0, mm2);
	punpckhdq_r2r(mm4, mm6);		/* wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14] */

	punpckldq_r2r(mm4, mm1);		/* wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11] */
	psllw_i2r(2, mm6);

	pmulhw_m2r(fix_141, mm6);
	punpckldq_r2r(mm3, mm0);		/* wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10] */

	punpckhdq_r2r(mm3, mm2);		/* wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13] */
	movq_r2r(mm0, mm7);

/*    tmp0 = tmp10 + tmp13; */
/*    tmp3 = tmp10 - tmp13; */
	paddw_r2r(mm2, mm0);				/* [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0] */
	psubw_r2r(mm2, mm7);				/* [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3] */

/*    tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13; */
	psubw_r2r(mm2, mm6);				/* wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12] */
/*    tmp1 = tmp11 + tmp12; */
/*    tmp2 = tmp11 - tmp12; */
	movq_r2r(mm1, mm5);

	/*OK */

    /* Odd part */

/*    z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; */
/*    z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; */
/*    z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; */
/*    z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; */
	movq_m2r(*(wsptr), mm3);		/* wsptr[0,0],[0,1],[0,2],[0,3] */
	paddw_r2r(mm6, mm1);				/* [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1] */

	movq_m2r(*(wsptr+1), mm4);		/* wsptr[0,4],[0,5],[0,6],[0,7] */
	psubw_r2r(mm6, mm5);				/* [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2] */

	movq_r2r(mm3, mm6);
	punpckldq_r2r(mm4, mm3);		/* wsptr[0,0],[0,1],[0,4],[0,5] */

	punpckhdq_r2r(mm6, mm4);		/* wsptr[0,6],[0,7],[0,2],[0,3] */
	movq_r2r(mm3, mm2);

/*Save tmp0 and tmp1 in wsptr */
	movq_r2m(mm0, *(wsptr));		/* save tmp0 */
	paddw_r2r(mm4, mm2);				/* wsptr[xxx],[0,z11],[xxx],[0,z13] */

	
/*Continue with z10 --- z13 */
	movq_m2r(*(wsptr+2), mm6);		/* wsptr[1,0],[1,1],[1,2],[1,3] */
	psubw_r2r(mm4, mm3);				/* wsptr[xxx],[0,z12],[xxx],[0,z10] */

	movq_m2r(*(wsptr+3), mm0);		/* wsptr[1,4],[1,5],[1,6],[1,7] */
	movq_r2r(mm6, mm4);

	movq_r2m(mm1, *(wsptr+1));		/* save tmp1 */
	punpckldq_r2r(mm0, mm6);		/* wsptr[1,0],[1,1],[1,4],[1,5] */

	punpckhdq_r2r(mm4, mm0);		/* wsptr[1,6],[1,7],[1,2],[1,3] */
	movq_r2r(mm6, mm1);
	
/*Save tmp2 and tmp3 in wsptr */
	paddw_r2r(mm0, mm6);				/* wsptr[xxx],[1,z11],[xxx],[1,z13] */
	movq_r2r(mm2, mm4);
	
/*Continue with z10 --- z13 */
	movq_r2m(mm5, *(wsptr+2));		/* save tmp2 */
	punpcklwd_r2r(mm6, mm2);		/* wsptr[xxx],[xxx],[0,z11],[1,z11] */

	psubw_r2r(mm0, mm1);				/* wsptr[xxx],[1,z12],[xxx],[1,z10] */
	punpckhwd_r2r(mm6, mm4);		/* wsptr[xxx],[xxx],[0,z13],[1,z13] */

	movq_r2r(mm3, mm0);
	punpcklwd_r2r(mm1, mm3);		/* wsptr[xxx],[xxx],[0,z12],[1,z12] */

	movq_r2m(mm7, *(wsptr+3));		/* save tmp3 */
	punpckhwd_r2r(mm1, mm0);		/* wsptr[xxx],[xxx],[0,z10],[1,z10] */

	movq_m2r(*(wsptr+4), mm6);		/* wsptr[2,0],[2,1],[2,2],[2,3] */
	punpckhdq_r2r(mm2, mm0);		/* wsptr[0,z10],[1,z10],[0,z11],[1,z11] */

	movq_m2r(*(wsptr+5), mm7);	/* wsptr[2,4],[2,5],[2,6],[2,7] */
	punpckhdq_r2r(mm4, mm3);		/* wsptr[0,z12],[1,z12],[0,z13],[1,z13] */

	movq_m2r(*(wsptr+6), mm1);	/* wsptr[3,0],[3,1],[3,2],[3,3] */
	movq_r2r(mm6, mm4);

	punpckldq_r2r(mm7, mm6);		/* wsptr[2,0],[2,1],[2,4],[2,5] */
	movq_r2r(mm1, mm5);

	punpckhdq_r2r(mm4, mm7);		/* wsptr[2,6],[2,7],[2,2],[2,3] */
	movq_r2r(mm6, mm2);
	
	movq_m2r(*(wsptr+7), mm4);	/* wsptr[3,4],[3,5],[3,6],[3,7] */
	paddw_r2r(mm7, mm6);				/* wsptr[xxx],[2,z11],[xxx],[2,z13] */

	psubw_r2r(mm7, mm2);				/* wsptr[xxx],[2,z12],[xxx],[2,z10] */
	punpckldq_r2r(mm4, mm1);		/* wsptr[3,0],[3,1],[3,4],[3,5] */

	punpckhdq_r2r(mm5, mm4);		/* wsptr[3,6],[3,7],[3,2],[3,3] */
	movq_r2r(mm1, mm7);

	paddw_r2r(mm4, mm1);				/* wsptr[xxx],[3,z11],[xxx],[3,z13] */
	psubw_r2r(mm4, mm7);				/* wsptr[xxx],[3,z12],[xxx],[3,z10] */

	movq_r2r(mm6, mm5);
	punpcklwd_r2r(mm1, mm6);		/* wsptr[xxx],[xxx],[2,z11],[3,z11] */

	punpckhwd_r2r(mm1, mm5);		/* wsptr[xxx],[xxx],[2,z13],[3,z13] */
	movq_r2r(mm2, mm4);

	punpcklwd_r2r(mm7, mm2);		/* wsptr[xxx],[xxx],[2,z12],[3,z12] */

	punpckhwd_r2r(mm7, mm4);		/* wsptr[xxx],[xxx],[2,z10],[3,z10] */

	punpckhdq_r2r(mm6, mm4);		/*/ wsptr[2,z10],[3,z10],[2,z11],[3,z11] */

	punpckhdq_r2r(mm5, mm2);		/* wsptr[2,z12],[3,z12],[2,z13],[3,z13] */
	movq_r2r(mm0, mm5);

	punpckldq_r2r(mm4, mm0);		/* wsptr[0,z10],[1,z10],[2,z10],[3,z10] */

	punpckhdq_r2r(mm4, mm5);		/* wsptr[0,z11],[1,z11],[2,z11],[3,z11] */
	movq_r2r(mm3, mm4);

	punpckhdq_r2r(mm2, mm4);		/* wsptr[0,z13],[1,z13],[2,z13],[3,z13] */
	movq_r2r(mm5, mm1);

	punpckldq_r2r(mm2, mm3);		/* wsptr[0,z12],[1,z12],[2,z12],[3,z12] */
/*    tmp7 = z11 + z13;		: phase 5 */
/*    tmp8 = z11 - z13;		: phase 5 */
	psubw_r2r(mm4, mm1);				/* tmp8 */

	paddw_r2r(mm4, mm5);				/* tmp7 */
/*    tmp21 = MULTIPLY(tmp8, FIX_1_414213562); 2*c4  */
	psllw_i2r(2, mm1);

	psllw_i2r(2, mm0);

	pmulhw_m2r(fix_141, mm1);		/* tmp21 */
/*    tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065))  2*(c2-c6) */
/*			+ MULTIPLY(z10, - FIX_1_847759065); : 2*c2 */
	psllw_i2r(2, mm3);
	movq_r2r(mm0, mm7);

	pmulhw_m2r(fix_n184, mm7);
	movq_r2r(mm3, mm6);

	movq_m2r(*(wsptr), mm2);		/* tmp0,final1 */

	pmulhw_m2r(fix_108n184, mm6);
/*	 tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) : -2*(c2+c6) */
/*			+ MULTIPLY(z12, FIX_1_847759065); 2*c2 */
	movq_r2r(mm2, mm4);				/* final1 */
  
	pmulhw_m2r(fix_184n261, mm0);
	paddw_r2r(mm5, mm2);				/* tmp0+tmp7,final1 */

	pmulhw_m2r(fix_184, mm3);
	psubw_r2r(mm5, mm4);				/* tmp0-tmp7,final1 */

/*    tmp6 = tmp22 - tmp7;	phase 2 */
	psraw_i2r(3, mm2);				/* outptr[0,0],[1,0],[2,0],[3,0],final1 */

	paddw_r2r(mm6, mm7);				/* tmp20 */
	psraw_i2r(3, mm4);				/* outptr[0,7],[1,7],[2,7],[3,7],final1 */

	paddw_r2r(mm0, mm3);				/* tmp22 */

/*    tmp5 = tmp21 - tmp6; */
	psubw_r2r(mm5, mm3);				/* tmp6 */

/*    tmp4 = tmp20 + tmp5; */
	movq_m2r(*(wsptr+1), mm0);		/* tmp1,final2 */
	psubw_r2r(mm3, mm1);				/* tmp5 */

	movq_r2r(mm0, mm6);				/* final2 */
	paddw_r2r(mm3, mm0);				/* tmp1+tmp6,final2 */

    /* Final output stage: scale down by a factor of 8 and range-limit */


/*    outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) */
/*			    & RANGE_MASK]; */
/*    outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) */
/*			    & RANGE_MASK];	final1 */


/*    outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) */
/*			    & RANGE_MASK]; */
/*    outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) */
/*			    & RANGE_MASK];	final2 */
	psubw_r2r(mm3, mm6);				/* tmp1-tmp6,final2 */
	psraw_i2r(3, mm0);				/* outptr[0,1],[1,1],[2,1],[3,1] */

	psraw_i2r(3, mm6);				/* outptr[0,6],[1,6],[2,6],[3,6] */
	
	packuswb_r2r(mm4, mm0);			/* out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7] */
	
	movq_m2r(*(wsptr+2), mm5);		/* tmp2,final3 */
	packuswb_r2r(mm6, mm2);			/* out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6] */

/*    outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) */
/*			    & RANGE_MASK]; */
/*    outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) */
/*			    & RANGE_MASK];	final3 */
	paddw_r2r(mm1, mm7);				/* tmp4 */
	movq_r2r(mm5, mm3);

	paddw_r2r(mm1, mm5);				/* tmp2+tmp5 */
	psubw_r2r(mm1, mm3);				/* tmp2-tmp5 */

	psraw_i2r(3, mm5);				/* outptr[0,2],[1,2],[2,2],[3,2] */

	movq_m2r(*(wsptr+3), mm4);		/* tmp3,final4 */
	psraw_i2r(3, mm3);				/* outptr[0,5],[1,5],[2,5],[3,5] */



/*    outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) */
/*			    & RANGE_MASK]; */
/*    outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) */
/*			    & RANGE_MASK];	final4 */
	movq_r2r(mm4, mm6);
	paddw_r2r(mm7, mm4);				/* tmp3+tmp4 */

	psubw_r2r(mm7, mm6);				/* tmp3-tmp4 */
	psraw_i2r(3, mm4);				/* outptr[0,4],[1,4],[2,4],[3,4] */

	/* mov			ecx, [dataptr] */

	psraw_i2r(3, mm6);				/* outptr[0,3],[1,3],[2,3],[3,3] */

	packuswb_r2r(mm4, mm5);			/* out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4] */

	packuswb_r2r(mm3, mm6);			/* out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5] */
	movq_r2r(mm2, mm4);

	movq_r2r(mm5, mm7);
	punpcklbw_r2r(mm0, mm2);		/* out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1] */

	punpckhbw_r2r(mm0, mm4);		/* out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7] */
	movq_r2r(mm2, mm1);

	punpcklbw_r2r(mm6, mm5);		/* out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3] */

	/* add		 	dataptr, 4 */

	punpckhbw_r2r(mm6, mm7);		/* out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5] */

	punpcklwd_r2r(mm5, mm2);		/* out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3] */
	
	/* add			ecx, output_col */

	movq_r2r(mm7, mm6);
	punpckhwd_r2r(mm5, mm1);		/* out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3] */

	movq_r2r(mm2, mm0);
	punpcklwd_r2r(mm4, mm6);		/* out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7] */

	/* mov			idata, [dataptr] */
	
	punpckldq_r2r(mm6, mm2);		/* out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7] */

	/* add		 	dataptr, 4 */
	 
	movq_r2r(mm1, mm3);

	/* add			idata, output_col  */
	
	punpckhwd_r2r(mm4, mm7);		/* out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7] */
	
	movq_r2m(mm2, *(dataptr));
	
	punpckhdq_r2r(mm6, mm0);		/* out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7] */

	dataptr += rskip;
	movq_r2m(mm0, *(dataptr));

	punpckldq_r2r(mm7, mm1);		/* out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7] */
	punpckhdq_r2r(mm7, mm3);		/* out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7] */
	
	dataptr += rskip;
	movq_r2m(mm1, *(dataptr));

	dataptr += rskip;
	movq_r2m(mm3, *(dataptr));

/*******************************************************************/

	wsptr += 8;

/*******************************************************************/

/*    tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); */
/*    tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); */
/*    tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); */
/*    tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]); */
	movq_m2r(*(wsptr), mm0);		/* wsptr[0,0],[0,1],[0,2],[0,3] */

	movq_m2r(*(wsptr+1), mm1);		/* wsptr[0,4],[0,5],[0,6],[0,7] */
	movq_r2r(mm0, mm2);
	
	movq_m2r(*(wsptr+2), mm3);		/* wsptr[1,0],[1,1],[1,2],[1,3] */
	paddw_r2r(mm1, mm0);				/* wsptr[0,tmp10],[xxx],[0,tmp13],[xxx] */

	movq_m2r(*(wsptr+3), mm4);		/* wsptr[1,4],[1,5],[1,6],[1,7] */
	psubw_r2r(mm1, mm2);				/* wsptr[0,tmp11],[xxx],[0,tmp14],[xxx] */

	movq_r2r(mm0, mm6);
	movq_r2r(mm3, mm5);
	
	paddw_r2r(mm4, mm3);				/* wsptr[1,tmp10],[xxx],[1,tmp13],[xxx] */
	movq_r2r(mm2, mm1);

	psubw_r2r(mm4, mm5);				/* wsptr[1,tmp11],[xxx],[1,tmp14],[xxx] */
	punpcklwd_r2r(mm3, mm0);		/* wsptr[0,tmp10],[1,tmp10],[xxx],[xxx] */

	movq_m2r(*(wsptr+7), mm7);	/* wsptr[3,4],[3,5],[3,6],[3,7] */
	punpckhwd_r2r(mm3, mm6);		/* wsptr[0,tmp13],[1,tmp13],[xxx],[xxx] */

	movq_m2r(*(wsptr+4),	mm3);		/* wsptr[2,0],[2,1],[2,2],[2,3] */
	punpckldq_r2r(mm6, mm0);		/* wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */

	punpcklwd_r2r(mm5, mm1);		/* wsptr[0,tmp11],[1,tmp11],[xxx],[xxx] */
	movq_r2r(mm3, mm4);

	movq_m2r(*(wsptr+6), mm6);	/* wsptr[3,0],[3,1],[3,2],[3,3] */
	punpckhwd_r2r(mm5, mm2);		/* wsptr[0,tmp14],[1,tmp14],[xxx],[xxx] */

	movq_m2r(*(wsptr+5), mm5);	/* wsptr[2,4],[2,5],[2,6],[2,7] */
	punpckldq_r2r(mm2, mm1);		/* wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */

	paddw_r2r(mm5, mm3);				/* wsptr[2,tmp10],[xxx],[2,tmp13],[xxx] */
	movq_r2r(mm6, mm2);

	psubw_r2r(mm5, mm4);				/* wsptr[2,tmp11],[xxx],[2,tmp14],[xxx] */
	paddw_r2r(mm7, mm6);				/* wsptr[3,tmp10],[xxx],[3,tmp13],[xxx] */

	movq_r2r(mm3, mm5);
	punpcklwd_r2r(mm6, mm3);		/* wsptr[2,tmp10],[3,tmp10],[xxx],[xxx] */
	
	psubw_r2r(mm7, mm2);				/* wsptr[3,tmp11],[xxx],[3,tmp14],[xxx] */
	punpckhwd_r2r(mm6, mm5);		/* wsptr[2,tmp13],[3,tmp13],[xxx],[xxx] */

	movq_r2r(mm4, mm7);
	punpckldq_r2r(mm5, mm3);		/* wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13] */

	punpcklwd_r2r(mm2, mm4);		/* wsptr[2,tmp11],[3,tmp11],[xxx],[xxx] */

	punpckhwd_r2r(mm2, mm7);		/* wsptr[2,tmp14],[3,tmp14],[xxx],[xxx] */

	punpckldq_r2r(mm7, mm4);		/* wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14] */
	movq_r2r(mm1, mm6);

	/*OK */

/*	mm0 = 	;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */
/*	mm1 =	;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */

	movq_r2r(mm0, mm2);
	punpckhdq_r2r(mm4, mm6);		/* wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14] */

	punpckldq_r2r(mm4, mm1);		/* wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11] */
	psllw_i2r(2, mm6);

	pmulhw_m2r(fix_141, mm6);
	punpckldq_r2r(mm3, mm0);		/* wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10] */

	punpckhdq_r2r(mm3, mm2);		/* wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13] */
	movq_r2r(mm0, mm7);

/*    tmp0 = tmp10 + tmp13; */
/*    tmp3 = tmp10 - tmp13; */
	paddw_r2r(mm2, mm0);				/* [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0] */
	psubw_r2r(mm2, mm7);				/* [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3] */

/*    tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13; */
	psubw_r2r(mm2, mm6);				/* wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12] */
/*    tmp1 = tmp11 + tmp12; */
/*    tmp2 = tmp11 - tmp12; */
	movq_r2r(mm1, mm5);

	 /*OK */


    /* Odd part */

/*    z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; */
/*    z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; */
/*    z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; */
/*    z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; */
	movq_m2r(*(wsptr), mm3);		/* wsptr[0,0],[0,1],[0,2],[0,3] */
	paddw_r2r(mm6, mm1);				/* [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1] */

	movq_m2r(*(wsptr+1),	mm4);		/* wsptr[0,4],[0,5],[0,6],[0,7] */
	psubw_r2r(mm6, mm5);				/* [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2] */

	movq_r2r(mm3, mm6);
	punpckldq_r2r(mm4, mm3);		/* wsptr[0,0],[0,1],[0,4],[0,5] */

	punpckhdq_r2r(mm6, mm4);		/* wsptr[0,6],[0,7],[0,2],[0,3] */
	movq_r2r(mm3, mm2);

/*Save tmp0 and tmp1 in wsptr */
	movq_r2m(mm0, *(wsptr));		/* save tmp0 */
	paddw_r2r(mm4, mm2);				/* wsptr[xxx],[0,z11],[xxx],[0,z13] */

	
/*Continue with z10 --- z13 */
	movq_m2r(*(wsptr+2), mm6);		/* wsptr[1,0],[1,1],[1,2],[1,3] */
	psubw_r2r(mm4, mm3);				/* wsptr[xxx],[0,z12],[xxx],[0,z10] */

	movq_m2r(*(wsptr+3), mm0);		/* wsptr[1,4],[1,5],[1,6],[1,7] */
	movq_r2r(mm6, mm4);

	movq_r2m(mm1, *(wsptr+1));		/* save tmp1 */
	punpckldq_r2r(mm0, mm6);		/* wsptr[1,0],[1,1],[1,4],[1,5] */

	punpckhdq_r2r(mm4, mm0);		/* wsptr[1,6],[1,7],[1,2],[1,3] */
	movq_r2r(mm6, mm1);
	
/*Save tmp2 and tmp3 in wsptr */
	paddw_r2r(mm0, mm6);				/* wsptr[xxx],[1,z11],[xxx],[1,z13] */
	movq_r2r(mm2, mm4);
	
/*Continue with z10 --- z13 */
	movq_r2m(mm5, *(wsptr+2));		/* save tmp2 */
	punpcklwd_r2r(mm6, mm2);		/* wsptr[xxx],[xxx],[0,z11],[1,z11] */

	psubw_r2r(mm0, mm1);				/* wsptr[xxx],[1,z12],[xxx],[1,z10] */
	punpckhwd_r2r(mm6, mm4);		/* wsptr[xxx],[xxx],[0,z13],[1,z13] */

	movq_r2r(mm3, mm0);
	punpcklwd_r2r(mm1, mm3);		/* wsptr[xxx],[xxx],[0,z12],[1,z12] */

	movq_r2m(mm7, *(wsptr+3));		/* save tmp3 */
	punpckhwd_r2r(mm1, mm0);		/* wsptr[xxx],[xxx],[0,z10],[1,z10] */

	movq_m2r(*(wsptr+4), mm6);		/* wsptr[2,0],[2,1],[2,2],[2,3] */
	punpckhdq_r2r(mm2, mm0);		/* wsptr[0,z10],[1,z10],[0,z11],[1,z11] */

	movq_m2r(*(wsptr+5), mm7);	/* wsptr[2,4],[2,5],[2,6],[2,7] */
	punpckhdq_r2r(mm4, mm3);		/* wsptr[0,z12],[1,z12],[0,z13],[1,z13] */

	movq_m2r(*(wsptr+6), mm1);	/* wsptr[3,0],[3,1],[3,2],[3,3] */
	movq_r2r(mm6, mm4);

	punpckldq_r2r(mm7, mm6);		/* wsptr[2,0],[2,1],[2,4],[2,5] */
	movq_r2r(mm1, mm5);

	punpckhdq_r2r(mm4, mm7);		/* wsptr[2,6],[2,7],[2,2],[2,3] */
	movq_r2r(mm6, mm2);
	
	movq_m2r(*(wsptr+7), mm4);	/* wsptr[3,4],[3,5],[3,6],[3,7] */
	paddw_r2r(mm7, mm6);				/* wsptr[xxx],[2,z11],[xxx],[2,z13] */

	psubw_r2r(mm7, mm2);				/* wsptr[xxx],[2,z12],[xxx],[2,z10] */
	punpckldq_r2r(mm4, mm1);		/* wsptr[3,0],[3,1],[3,4],[3,5] */

	punpckhdq_r2r(mm5, mm4);		/* wsptr[3,6],[3,7],[3,2],[3,3] */
	movq_r2r(mm1, mm7);

	paddw_r2r(mm4, mm1);				/* wsptr[xxx],[3,z11],[xxx],[3,z13] */
	psubw_r2r(mm4, mm7);				/* wsptr[xxx],[3,z12],[xxx],[3,z10] */

	movq_r2r(mm6, mm5);
	punpcklwd_r2r(mm1, mm6);		/* wsptr[xxx],[xxx],[2,z11],[3,z11] */

	punpckhwd_r2r(mm1, mm5);		/* wsptr[xxx],[xxx],[2,z13],[3,z13] */
	movq_r2r(mm2, mm4);

	punpcklwd_r2r(mm7, mm2);		/* wsptr[xxx],[xxx],[2,z12],[3,z12] */

	punpckhwd_r2r(mm7, mm4);		/* wsptr[xxx],[xxx],[2,z10],[3,z10] */

	punpckhdq_r2r(mm6, mm4);		/* wsptr[2,z10],[3,z10],[2,z11],[3,z11] */

	punpckhdq_r2r(mm5, mm2);		/* wsptr[2,z12],[3,z12],[2,z13],[3,z13] */
	movq_r2r(mm0, mm5);

	punpckldq_r2r(mm4, mm0);		/* wsptr[0,z10],[1,z10],[2,z10],[3,z10] */

	punpckhdq_r2r(mm4, mm5);		/* wsptr[0,z11],[1,z11],[2,z11],[3,z11] */
	movq_r2r(mm3, mm4);

	punpckhdq_r2r(mm2, mm4);		/* wsptr[0,z13],[1,z13],[2,z13],[3,z13] */
	movq_r2r(mm5, mm1);

	punpckldq_r2r(mm2, mm3);		/* wsptr[0,z12],[1,z12],[2,z12],[3,z12] */
/*    tmp7 = z11 + z13;		: phase 5 */
/*    tmp8 = z11 - z13;		: phase 5 */
	psubw_r2r(mm4, mm1);				/* tmp8 */

	paddw_r2r(mm4, mm5);				/* tmp7 */
/*    tmp21 = MULTIPLY(tmp8, FIX_1_414213562);  2*c4 */
	psllw_i2r(2, mm1);

	psllw_i2r(2, mm0);

	pmulhw_m2r(fix_141, mm1);		/* tmp21 */
/*    tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) :  2*(c2-c6) */
/*			+ MULTIPLY(z10, - FIX_1_847759065); : 2*c2 */
	psllw_i2r(2, mm3);
	movq_r2r(mm0, mm7);

	pmulhw_m2r(fix_n184, mm7);
	movq_r2r(mm3, mm6);

	movq_m2r(*(wsptr), mm2);		/* tmp0,final1 */

	pmulhw_m2r(fix_108n184, mm6);
/*	 tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) : -2*(c2+c6) */
/*			+ MULTIPLY(z12, FIX_1_847759065); : 2*c2 */
	movq_r2r(mm2, mm4);				/* final1 */
  
	pmulhw_m2r(fix_184n261, mm0);
	paddw_r2r(mm5, mm2);				/* tmp0+tmp7,final1 */

	pmulhw_m2r(fix_184, mm3);
	psubw_r2r(mm5, mm4);				/* tmp0-tmp7,final1 */

/*    tmp6 = tmp22 - tmp7;	phase 2 */
	psraw_i2r(3, mm2);				/* outptr[0,0],[1,0],[2,0],[3,0],final1 */

	paddw_r2r(mm6, mm7);				/* tmp20 */
	psraw_i2r(3, mm4);				/* outptr[0,7],[1,7],[2,7],[3,7],final1 */

	paddw_r2r(mm0, mm3);				/* tmp22 */

/*    tmp5 = tmp21 - tmp6; */
	psubw_r2r(mm5, mm3);				/* tmp6 */

/*    tmp4 = tmp20 + tmp5; */
	movq_m2r(*(wsptr+1), mm0);		/* tmp1,final2 */
	psubw_r2r(mm3, mm1);				/* tmp5 */

	movq_r2r(mm0, mm6);				/* final2 */
	paddw_r2r(mm3, mm0);				/* tmp1+tmp6,final2 */

    /* Final output stage: scale down by a factor of 8 and range-limit */

/*    outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) */
/*			    & RANGE_MASK]; */
/*    outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) */
/*			    & RANGE_MASK];	final1 */


/*    outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) */
/*			    & RANGE_MASK]; */
/*    outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) */
/*			    & RANGE_MASK];	final2 */
	psubw_r2r(mm3, mm6);				/* tmp1-tmp6,final2 */
	psraw_i2r(3, mm0);				/* outptr[0,1],[1,1],[2,1],[3,1] */

	psraw_i2r(3, mm6);				/* outptr[0,6],[1,6],[2,6],[3,6] */
	
	packuswb_r2r(mm4, mm0);			/* out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7] */
	
	movq_m2r(*(wsptr+2), mm5);		/* tmp2,final3 */
	packuswb_r2r(mm6, mm2);			/* out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6] */

/*    outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) */
/*			    & RANGE_MASK]; */
/*    outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) */
/*			    & RANGE_MASK];	final3 */
	paddw_r2r(mm1, mm7);				/* tmp4 */
	movq_r2r(mm5, mm3);

	paddw_r2r(mm1, mm5);				/* tmp2+tmp5 */
	psubw_r2r(mm1, mm3);				/* tmp2-tmp5 */

	psraw_i2r(3, mm5);				/* outptr[0,2],[1,2],[2,2],[3,2] */

	movq_m2r(*(wsptr+3), mm4);		/* tmp3,final4 */
	psraw_i2r(3, mm3);				/* outptr[0,5],[1,5],[2,5],[3,5] */



/*    outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) */
/*			    & RANGE_MASK]; */
/*    outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) */
/*			    & RANGE_MASK];	final4 */
	movq_r2r(mm4, mm6);
	paddw_r2r(mm7, mm4);				/* tmp3+tmp4 */

	psubw_r2r(mm7, mm6);				/* tmp3-tmp4 */
	psraw_i2r(3, mm4);				/* outptr[0,4],[1,4],[2,4],[3,4] */

	psraw_i2r(3, mm6);				/* outptr[0,3],[1,3],[2,3],[3,3] */

	/*
   movq_r2m(mm4, *dummy);
	fprintf(stderr, "3-4 %016llx\n", dummy);
   movq_r2m(mm4, *dummy);
	fprintf(stderr, "3+4 %016llx\n", dummy);
	*/
	

	packuswb_r2r(mm4, mm5);			/* out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4] */

	packuswb_r2r(mm3, mm6);			/* out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5] */
	movq_r2r(mm2, mm4);

	movq_r2r(mm5, mm7);
	punpcklbw_r2r(mm0, mm2);		/* out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1] */

	punpckhbw_r2r(mm0, mm4);		/* out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7] */
	movq_r2r(mm2, mm1);

	punpcklbw_r2r(mm6, mm5);		/* out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3] */
	
	punpckhbw_r2r(mm6, mm7);		/* out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5] */

	punpcklwd_r2r(mm5, mm2);		/* out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3] */
	
	movq_r2r(mm7, mm6);
	punpckhwd_r2r(mm5, mm1);		/* out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3] */

	movq_r2r(mm2, mm0);
	punpcklwd_r2r(mm4, mm6);		/* out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7] */

	punpckldq_r2r(mm6, mm2);		/* out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7] */

	movq_r2r(mm1, mm3);

	punpckhwd_r2r(mm4, mm7);		/* out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7] */
	
	dataptr += rskip;
	movq_r2m(mm2, *(dataptr));

	punpckhdq_r2r(mm6, mm0);		/* out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7] */

	dataptr += rskip;
	movq_r2m(mm0, *(dataptr));

	punpckldq_r2r(mm7, mm1);		/* out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7] */
	
	punpckhdq_r2r(mm7, mm3);		/* out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7] */

	dataptr += rskip;
	movq_r2m(mm1, *(dataptr));

	dataptr += rskip;
	movq_r2m(mm3, *(dataptr));

#else
  __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  __s32 tmp10, tmp11, tmp12, tmp13;
  __s32 z5, z10, z11, z12, z13;
  __s16 *inptr;
  __s32 *wsptr;
  __u8 *outptr;
  int ctr;
  __s32 dcval;
  __s32 workspace[64];

  inptr = data;
  wsptr = workspace;
  for (ctr = 8; ctr > 0; ctr--) {
    
    if ((inptr[8] | inptr[16] | inptr[24] |
	 inptr[32] | inptr[40] | inptr[48] | inptr[56]) == 0) {
      dcval = inptr[0];
      wsptr[0] = dcval;
      wsptr[8] = dcval;
      wsptr[16] = dcval;
      wsptr[24] = dcval;
      wsptr[32] = dcval;
      wsptr[40] = dcval;
      wsptr[48] = dcval;
      wsptr[56] = dcval;
      
      inptr++;	
      wsptr++;
      continue;
    } 
    
    tmp0 = inptr[0];
    tmp1 = inptr[16];
    tmp2 = inptr[32];
    tmp3 = inptr[48];

    tmp10 = tmp0 + tmp2;
    tmp11 = tmp0 - tmp2;

    tmp13 = tmp1 + tmp3;
    tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13;

    tmp0 = tmp10 + tmp13;
    tmp3 = tmp10 - tmp13;
    tmp1 = tmp11 + tmp12;
    tmp2 = tmp11 - tmp12;
    
    tmp4 = inptr[8];
    tmp5 = inptr[24];
    tmp6 = inptr[40];
    tmp7 = inptr[56];

    z13 = tmp6 + tmp5;
    z10 = tmp6 - tmp5;
    z11 = tmp4 + tmp7;
    z12 = tmp4 - tmp7;

    tmp7 = z11 + z13;
    tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);

    z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
    tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
    tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;

    tmp6 = tmp12 - tmp7;
    tmp5 = tmp11 - tmp6;
    tmp4 = tmp10 + tmp5;

    wsptr[0] = (__s32) (tmp0 + tmp7);
    wsptr[56] = (__s32) (tmp0 - tmp7);
    wsptr[8] = (__s32) (tmp1 + tmp6);
    wsptr[48] = (__s32) (tmp1 - tmp6);
    wsptr[16] = (__s32) (tmp2 + tmp5);
    wsptr[40] = (__s32) (tmp2 - tmp5);
    wsptr[32] = (__s32) (tmp3 + tmp4);
    wsptr[24] = (__s32) (tmp3 - tmp4);

    inptr++;
    wsptr++;
  }

  wsptr = workspace;
  for (ctr = 0; ctr < 8; ctr++) {
    outptr = &(odata[ctr*rskip]);

    tmp10 = wsptr[0] + wsptr[4];
    tmp11 = wsptr[0] - wsptr[4];

    tmp13 = wsptr[2] + wsptr[6];
    tmp12 = MULTIPLY(wsptr[2] - wsptr[6], FIX_1_414213562) - tmp13;

    tmp0 = tmp10 + tmp13;
    tmp3 = tmp10 - tmp13;
    tmp1 = tmp11 + tmp12;
    tmp2 = tmp11 - tmp12;

    z13 = wsptr[5] + wsptr[3];
    z10 = wsptr[5] - wsptr[3];
    z11 = wsptr[1] + wsptr[7];
    z12 = wsptr[1] - wsptr[7];

    tmp7 = z11 + z13;
    tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);

    z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
    tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
    tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;

    tmp6 = tmp12 - tmp7;
    tmp5 = tmp11 - tmp6;
    tmp4 = tmp10 + tmp5;

    outptr[0] = RL(DESCALE(tmp0 + tmp7));
    outptr[7] = RL(DESCALE(tmp0 - tmp7));
    outptr[1] = RL(DESCALE(tmp1 + tmp6));
    outptr[6] = RL(DESCALE(tmp1 - tmp6));
    outptr[2] = RL(DESCALE(tmp2 + tmp5));
    outptr[5] = RL(DESCALE(tmp2 - tmp5));
    outptr[4] = RL(DESCALE(tmp3 + tmp4));
    outptr[3] = RL(DESCALE(tmp3 - tmp4));

    wsptr += 8;
  }
#endif
}
/*

Main Routines

This file contains most of the initialisation and control functions

(C) Justin Schoeman 1998

*/

/*

Private function

Initialise all the cache-aligned data blocks

*/

void RTjpeg_init_data(void)
{
 unsigned long cursor;

 /* Move 32 bytes past the start of RTjpeg_alldata and clear the low five
    address bits, which leaves the cursor on a 32-byte boundary inside
    the pool. */
 cursor = ((unsigned long)&(RTjpeg_alldata[0]) + 32) & ~31UL;

 /* The five working arrays are laid out back to back, 64 entries each. */
 RTjpeg_block = (__s16 *)cursor;
 cursor += 64 * sizeof(__s16);

 RTjpeg_lqt = (__s32 *)cursor;
 cursor += 64 * sizeof(__s32);

 RTjpeg_cqt = (__s32 *)cursor;
 cursor += 64 * sizeof(__s32);

 RTjpeg_liqt = (__u32 *)cursor;
 cursor += 64 * sizeof(__u32);

 RTjpeg_ciqt = (__u32 *)cursor;
}

/*

External Function

Re-set quality factor

Input: Q -> quality factor (192=best, 32=worst)

Note: this function takes no buffer argument; the quantiser tables are
      rebuilt in the library's internal state only.
*/

void RTjpeg_init_Q(__u8 Q)
{
 int i;
 __u64 qual;
 
 /* Q widened to 64 bits and shifted left by 25: the fixed-point numerator
    for the table divisions below. */
 qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */

 for(i=0; i<64; i++)
 {
  /* Forward quantisers derived from the base tables; 0 is bumped to 1 so
     the divisions that follow never divide by a zero quantiser. */
  RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3);
  if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1;
  RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3);
  if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1;
  /* Inverse quantisers, then the forward ones are recomputed from the
     truncated inverses. */
  RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3);
  RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3);
  RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3;
  RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3;
 }
 
 /* RTjpeg_lb8 / RTjpeg_cb8: how many zigzag positions directly after the
    DC term have an inverse quantiser <= 8.  The scan is capped at the last
    table entry (63) so it cannot index past RTjpeg_ZZ when every entry
    qualifies; for all other tables the result is unchanged. */
 RTjpeg_lb8=0;
 while(RTjpeg_lb8<63 && RTjpeg_liqt[RTjpeg_ZZ[RTjpeg_lb8+1]]<=8)
  RTjpeg_lb8++;
 RTjpeg_cb8=0;
 while(RTjpeg_cb8<63 && RTjpeg_ciqt[RTjpeg_ZZ[RTjpeg_cb8+1]]<=8)
  RTjpeg_cb8++;

 RTjpeg_dct_init();
 RTjpeg_idct_init();
 RTjpeg_quant_init();
}

/*

External Function

Initialise compression.

Input: buf -> pointer to 128 ints for quant values store to pass back to 
                init_decompress.
       width -> width of image
       height -> height of image
       Q -> quality factor (192=best, 32=worst)
       
*/

void RTjpeg_init_compress(__u32 *buf, int width, int height, __u8 Q)
{
 int i;
 __u64 qual;
 
 RTjpeg_init_data();
 
 /* Frame geometry.  Ywidth/Cwidth are the luma and chroma row lengths
    divided by 8 (width>>3 and width>>4). */
 RTjpeg_width=width;
 RTjpeg_height=height;
 RTjpeg_Ywidth = RTjpeg_width>>3;
 RTjpeg_Ysize=width * height;
 RTjpeg_Cwidth = RTjpeg_width>>4;
 RTjpeg_Csize= (width>>1) * height;

 /* Q widened to 64 bits and shifted left by 25: the fixed-point numerator
    for the table divisions below. */
 qual=(__u64)Q<<(32-7); /* 32 bit FP, 255=2, 0=0 */

 for(i=0; i<64; i++)
 {
  /* Forward quantisers derived from the base tables; 0 is bumped to 1 so
     the divisions that follow never divide by a zero quantiser. */
  RTjpeg_lqt[i]=(__s32)((qual/((__u64)RTjpeg_lum_quant_tbl[i]<<16))>>3);
  if(RTjpeg_lqt[i]==0)RTjpeg_lqt[i]=1;
  RTjpeg_cqt[i]=(__s32)((qual/((__u64)RTjpeg_chrom_quant_tbl[i]<<16))>>3);
  if(RTjpeg_cqt[i]==0)RTjpeg_cqt[i]=1;
  /* Inverse quantisers, then the forward ones are recomputed from the
     truncated inverses. */
  RTjpeg_liqt[i]=(1<<16)/(RTjpeg_lqt[i]<<3);
  RTjpeg_ciqt[i]=(1<<16)/(RTjpeg_cqt[i]<<3);
  RTjpeg_lqt[i]=((1<<16)/RTjpeg_liqt[i])>>3;
  RTjpeg_cqt[i]=((1<<16)/RTjpeg_ciqt[i])>>3;
 }
 
 /* RTjpeg_lb8 / RTjpeg_cb8: how many zigzag positions directly after the
    DC term have an inverse quantiser <= 8.  The scan is capped at the last
    table entry (63) so it cannot index past RTjpeg_ZZ when every entry
    qualifies; for all other tables the result is unchanged. */
 RTjpeg_lb8=0;
 while(RTjpeg_lb8<63 && RTjpeg_liqt[RTjpeg_ZZ[RTjpeg_lb8+1]]<=8)
  RTjpeg_lb8++;
 RTjpeg_cb8=0;
 while(RTjpeg_cb8<63 && RTjpeg_ciqt[RTjpeg_ZZ[RTjpeg_cb8+1]]<=8)
  RTjpeg_cb8++;
 
 RTjpeg_dct_init();
 RTjpeg_quant_init();

 /* Hand the inverse tables back to the caller: luma in buf[0..63],
    chroma in buf[64..127]. */
 for(i=0; i<64; i++)
  buf[i]=RTjpeg_liqt[i];
 for(i=0; i<64; i++)
  buf[64+i]=RTjpeg_ciqt[i];
}

/*

External Function

Initialise decompression.

Input: buf -> 128 inverse quantiser values: luma in buf[0..63], chroma in
              buf[64..127] (the layout RTjpeg_init_compress writes)
       width -> width of image
       height -> height of image

*/

void RTjpeg_init_decompress(__u32 *buf, int width, int height)
{
 int i;

 RTjpeg_init_data();
 
 RTjpeg_width=width;
 RTjpeg_height=height;
 RTjpeg_Ywidth = RTjpeg_width>>3;
 RTjpeg_Ysize=width * height;
 RTjpeg_Cwidth = RTjpeg_width>>4;
 RTjpeg_Csize= (width>>1) * height;

 for(i=0; i<64; i++)
 {
  RTjpeg_liqt[i]=buf[i];
  RTjpeg_ciqt[i]=buf[i+64];
 }

 /* RTjpeg_lb8 / RTjpeg_cb8: how many zigzag positions directly after the
    DC term have an inverse quantiser <= 8.  The tables come from the
    caller, so the scan is capped at the last entry (63): a table whose
    entries are all <= 8 can no longer drive the index past RTjpeg_ZZ. */
 RTjpeg_lb8=0;
 while(RTjpeg_lb8<63 && RTjpeg_liqt[RTjpeg_ZZ[RTjpeg_lb8+1]]<=8)
  RTjpeg_lb8++;
 RTjpeg_cb8=0;
 while(RTjpeg_cb8<63 && RTjpeg_ciqt[RTjpeg_ZZ[RTjpeg_cb8+1]]<=8)
  RTjpeg_cb8++;

 RTjpeg_idct_init();

/* RTjpeg_color_init(); */
}

/*

External Function

Compress one planar frame whose chroma planes are walked at half the luma
resolution in both directions.

Input: sp -> output buffer for the compressed stream
       bp -> input frame: luma plane, then a chroma plane at offset
             RTjpeg_Ysize, then a second chroma plane RTjpeg_Csize/2 later

Returns the number of bytes sp was advanced by.

*/

int RTjpeg_compressYUV420(__s8 *sp, unsigned char *bp)
{
 __s8 * sb;
 /* Same pointer type as bp: the planes hold unsigned samples and the
    pointers are handed to the same routine as bp itself. */
 register unsigned char * bp1 = bp + (RTjpeg_width<<3);	/* 8 rows below bp */
 register unsigned char * bp2 = bp + RTjpeg_Ysize;
 register unsigned char * bp3 = bp2 + (RTjpeg_Csize>>1);
 register int i, j, k;

#ifdef HAVE_LIBMMX
 emms();
#endif
 sb=sp;
/* Y */
 /* i starts at height/2 and drops by 8 per pass; each pass covers 16 luma
    rows.  Testing i>=8 (rather than i!=0) stops the loop when the height
    is not a multiple of 16 instead of stepping over zero. */
 for(i=RTjpeg_height>>1; i>=8; i-=8)
 {
  for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
  {
   /* four luma blocks: upper-left, upper-right, lower-left, lower-right */
   RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);

   RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);

   RTjpeg_dctY(bp1+j, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);

   RTjpeg_dctY(bp1+j+8, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);

   /* one block from each chroma plane, chroma tables */
   RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
   sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);

   RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
   sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);

  }
  /* 16 luma rows and 8 half-width chroma rows further down */
  bp+=RTjpeg_width<<4;
  bp1+=RTjpeg_width<<4;
  bp2+=RTjpeg_width<<2;
  bp3+=RTjpeg_width<<2;
			 
 }
#ifdef HAVE_LIBMMX
 emms();
#endif
 return (sp-sb);
}

/*

External Function

Compress one planar frame whose chroma planes are half the luma width and
the full luma height.

Input: sp -> output buffer for the compressed stream
       bp -> input frame: luma plane, then a chroma plane at offset
             RTjpeg_Ysize, then a second chroma plane RTjpeg_Csize later

Returns the number of bytes sp was advanced by.

*/

int RTjpeg_compressYUV422(__s8 *sp, unsigned char *bp)
{
 __s8 * sb;
 /* Same pointer type as bp: the planes hold unsigned samples and the
    pointers are handed to the same routine as bp itself. */
 register unsigned char * bp2 = bp + RTjpeg_Ysize;
 register unsigned char * bp3 = bp2 + RTjpeg_Csize;
 register int i, j, k;

#ifdef HAVE_LIBMMX
 emms();
#endif
 sb=sp;
/* Y */
 /* One pass per 8 rows.  Testing i>=8 (rather than i!=0) stops the loop
    when the height is not a multiple of 8 instead of stepping over zero. */
 for(i=RTjpeg_height; i>=8; i-=8)
 {
  for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
  {
   /* two horizontally adjacent luma blocks */
   RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);

   RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);

   /* one block from each chroma plane, chroma tables */
   RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
   sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);

   RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
   sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);

  }
  /* 8 luma rows and 8 half-width chroma rows further down */
  bp+=RTjpeg_width<<3;
  bp2+=RTjpeg_width<<2;
  bp3+=RTjpeg_width<<2;
			 
 }
#ifdef HAVE_LIBMMX
 emms();
#endif
 return (sp-sb);
}

/*

External Function

Compress a single 8-bit plane of RTjpeg_width x RTjpeg_height samples.

Input: sp -> output buffer for the compressed stream
       bp -> input plane

Returns the number of bytes sp was advanced by.

*/

int RTjpeg_compress8(__s8 *sp, unsigned char *bp)
{
 __s8 * sb;
 int i, j;

#ifdef HAVE_LIBMMX
 emms();
#endif
 
 sb=sp;
/* Y */
 for(i=0; i<RTjpeg_height; i+=8)
 {
  for(j=0; j<RTjpeg_width; j+=8)
  {
   /* NOTE(review): the YUV variants pass RTjpeg_Ywidth as the third
      argument of RTjpeg_dctY; confirm RTjpeg_width is intended here. */
   RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_width);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
  }
  /* Each pass of the outer loop consumes a row of 8x8 blocks, so step
     down 8 pixel rows, as RTjpeg_mcompress8 and RTjpeg_decompress8 do.
     (Previously this advanced by a single pixel row.) */
  bp+=RTjpeg_width<<3;
 }

#ifdef HAVE_LIBMMX
 emms();
#endif
 return (sp-sb);
}

/*

External Function

Decompress a stream into a planar frame whose chroma planes are half the
luma width and the full luma height.

Input: sp -> compressed stream
       bp -> output frame: luma plane, then a chroma plane at offset
             RTjpeg_Ysize, then a second chroma plane RTjpeg_Csize later

A stream byte of -1 in place of a block means "no data for this block":
the byte is skipped and the corresponding output pixels are left as they
are.

*/

void RTjpeg_decompressYUV422(__s8 *sp, __u8 *bp)
{
 /* Same pointer type as bp: the planes hold unsigned samples and the
    pointers are handed to the same routine as bp itself. */
 register __u8 * bp2 = bp + RTjpeg_Ysize;
 register __u8 * bp3 = bp2 + (RTjpeg_Csize);
 int i, j,k;

#ifdef HAVE_LIBMMX
 emms();
#endif

/* Y */
 /* One pass per 8 rows.  Testing i>=8 (rather than i!=0) stops the loop
    when the height is not a multiple of 8 instead of stepping over zero. */
 for(i=RTjpeg_height; i>=8; i-=8)
 {
  for(k=0, j=0; j<RTjpeg_width; j+=16, k+=8) {
   /* two horizontally adjacent luma blocks */
   if(*sp==-1)sp++;
   else
   { 
    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
    RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width);
   }
   if(*sp==-1)sp++;
   else
   { 
    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
    RTjpeg_idct(bp+j+8, RTjpeg_block, RTjpeg_width);
   }
   /* one block for each chroma plane, half-width rows */
   if(*sp==-1)sp++;
   else
   { 
    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
    RTjpeg_idct(bp2+k, RTjpeg_block, RTjpeg_width>>1);
   } 
   if(*sp==-1)sp++;
   else
   { 
    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
    RTjpeg_idct(bp3+k, RTjpeg_block, RTjpeg_width>>1);
   } 
  }
  bp+=RTjpeg_width<<3;
  bp2+=RTjpeg_width<<2;
  bp3+=RTjpeg_width<<2;
 }
#ifdef HAVE_LIBMMX
 emms();
#endif
}

/*

External Function

Decompress a stream into a planar frame whose chroma planes are walked at
half the luma resolution in both directions.

Input: sp -> compressed stream
       bp -> output frame: luma plane, then a chroma plane at offset
             RTjpeg_Ysize, then a second chroma plane RTjpeg_Csize/2 later

A stream byte of -1 in place of a block means "no data for this block":
the byte is skipped and the corresponding output pixels are left as they
are.

*/

void RTjpeg_decompressYUV420(__s8 *sp, __u8 *bp)
{
 /* Same pointer type as bp: the planes hold unsigned samples and the
    pointers are handed to the same routine as bp itself. */
 register __u8 * bp1 = bp + (RTjpeg_width<<3);	/* 8 rows below bp */
 register __u8 * bp2 = bp + RTjpeg_Ysize;
 register __u8 * bp3 = bp2 + (RTjpeg_Csize>>1);
 int i, j,k;

#ifdef HAVE_LIBMMX
 emms();
#endif

/* Y */
 /* i starts at height/2 and drops by 8 per pass; each pass covers 16 luma
    rows.  Testing i>=8 (rather than i!=0) stops the loop when the height
    is not a multiple of 16 instead of stepping over zero. */
 for(i=RTjpeg_height>>1; i>=8; i-=8)
 {
  for(k=0, j=0; j<RTjpeg_width; j+=16, k+=8) {
   /* four luma blocks: upper-left, upper-right, lower-left, lower-right */
   if(*sp==-1)sp++;
   else
   { 
    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
    RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width);
   }
   if(*sp==-1)sp++;
   else
   { 
    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
    RTjpeg_idct(bp+j+8, RTjpeg_block, RTjpeg_width);
   }
   if(*sp==-1)sp++;
   else
   { 
    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
    RTjpeg_idct(bp1+j, RTjpeg_block, RTjpeg_width);
   }
   if(*sp==-1)sp++;
   else
   { 
    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
    RTjpeg_idct(bp1+j+8, RTjpeg_block, RTjpeg_width);
   }
   /* one block for each chroma plane, half-width rows */
   if(*sp==-1)sp++;
   else
   { 
    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
    RTjpeg_idct(bp2+k, RTjpeg_block, RTjpeg_width>>1);
   } 
   if(*sp==-1)sp++;
   else
   { 
    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
    RTjpeg_idct(bp3+k, RTjpeg_block, RTjpeg_width>>1);
   } 
  }
  bp+=RTjpeg_width<<4;
  bp1+=RTjpeg_width<<4;
  bp2+=RTjpeg_width<<2;
  bp3+=RTjpeg_width<<2;
 }
#ifdef HAVE_LIBMMX
 emms();
#endif
}

/*

External Function

Decompress a stream into a single 8-bit plane of
RTjpeg_width x RTjpeg_height samples.

Input: sp -> compressed stream
       bp -> output plane

A stream byte of -1 in place of a block means "no data for this block":
the byte is skipped and the corresponding output pixels are left as they
are.

*/

void RTjpeg_decompress8(__s8 *sp, __u8 *bp)
{
 int i, j;

#ifdef HAVE_LIBMMX
 emms();
#endif

/* Y */
 for(i=0; i<RTjpeg_height; i+=8)
 {
  for(j=0; j<RTjpeg_width; j+=8)
  {
   if(*sp==-1)sp++;
   else
   { 
    sp+=RTjpeg_s2b(RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
    RTjpeg_idct(bp+j, RTjpeg_block, RTjpeg_width);
   }
  }
  bp+=RTjpeg_width<<3;
 }
 /* Clear the MMX state on the way out, like every other compress and
    decompress entry point in this file, so floating-point code run by the
    caller afterwards sees a usable x87 register stack. */
#ifdef HAVE_LIBMMX
 emms();
#endif
}

/*
External Function

Initialise additional data structures for motion compensation

*/

void RTjpeg_init_mcompress(void)
{
 unsigned long tmp;
 void *raw;

 if(!RTjpeg_old)
 {
  /* 32 spare bytes leave room to round the start up to a 32-byte
     boundary. */
  raw=malloc((4*RTjpeg_width*RTjpeg_height)+32);
  /* Test the pointer malloc returned, before it is adjusted: rounding a
     NULL result up used to produce the non-NULL address 32, which slipped
     past the check and was then written to. */
  if (!raw)
  {
   fprintf(stderr, "RTjpeg: Could not allocate memory\n");
   exit(-1);
  }
  tmp=(unsigned long)raw;
  tmp+=32;
  tmp=tmp>>5;
  /* Only the aligned address is kept; the value returned by malloc is not
     stored anywhere. */
  RTjpeg_old=(__s16 *)(tmp<<5);
 }
 /* Start from an all-zero reference frame.  An already allocated buffer is
    reused as is.
    NOTE(review): the buffer is not resized if the frame dimensions have
    changed since it was allocated - confirm callers never do that. */
 memset(RTjpeg_old, 0, ((4*RTjpeg_width*RTjpeg_height)));
}

#ifdef HAVE_LIBMMX

/*
 * MMX variant: compare the 64 coefficients in RTjpeg_block with the 64
 * stored at `old`, using the four 16-bit thresholds in *mask.
 *
 * Returns 1 when no comparison fires (block treated as unchanged) and 0
 * otherwise.  On a 0 return the current block is copied over `old`, unless
 * RTjpeg_mtest is non-zero.
 *
 * Operand roles below assume the mmx.h convention op(src, dst) - confirm
 * against mmx.h.
 */
int RTjpeg_bcomp(__s16 *old, mmx_t *mask)
{
 int i;
 mmx_t *mold=(mmx_t *)old;
 mmx_t *mblock=(mmx_t *)RTjpeg_block;
 mmx_t result;
 static mmx_t neg=(mmx_t)(unsigned long long)0xffffffffffffffffULL;
 
 movq_m2r(*mask, mm7);	/* mm7 = thresholds */
 movq_m2r(neg, mm6);	/* mm6 = all ones, used to complement the difference */
 pxor_r2r(mm5, mm5);	/* mm5 = 0, accumulates every comparison result */
 
 /* 8 passes x 2 quadwords x 4 words = all 64 coefficients.  The indented
    instructions are the second quadword of each pass. */
 for(i=0; i<8; i++)
 {
  movq_m2r(*(mblock++), mm0);
  			movq_m2r(*(mblock++), mm2);
  movq_m2r(*(mold++), mm1);
  			movq_m2r(*(mold++), mm3);
  /* saturating difference: block - old */
  psubsw_r2r(mm1, mm0);
  			psubsw_r2r(mm3, mm2);
  movq_r2r(mm0, mm1);
  			movq_r2r(mm2, mm3);
  /* difference > threshold */
  pcmpgtw_r2r(mm7, mm0);
  			pcmpgtw_r2r(mm7, mm2);
  /* bitwise complement of the difference > threshold */
  pxor_r2r(mm6, mm1);
  			pxor_r2r(mm6, mm3);
  pcmpgtw_r2r(mm7, mm1);
  			pcmpgtw_r2r(mm7, mm3);
  /* fold all four comparison masks into mm5 */
  por_r2r(mm0, mm5);
  			por_r2r(mm2, mm5);
  por_r2r(mm1, mm5);
  			por_r2r(mm3, mm5);
 }
 movq_r2m(mm5, result);
 
 /* Any set bit means at least one comparison fired. */
 if(result.q)
 {
  /* 16 x 8 bytes = the whole 64-coefficient block */
  if(!RTjpeg_mtest)
   for(i=0; i<16; i++)((__u64 *)old)[i]=((__u64 *)RTjpeg_block)[i];
  return 0;
 }
/* printf("."); */
 return 1;
}

#else
/*
 * Portable variant: compare the 64 coefficients in RTjpeg_block with the
 * 64 stored at `old`.
 *
 * Returns 1 when every absolute difference is <= *mask (block treated as
 * unchanged).  Returns 0 as soon as one difference exceeds *mask; in that
 * case the current block is copied over `old`, unless RTjpeg_mtest is
 * non-zero.
 */
int RTjpeg_bcomp(__s16 *old, __u16 *mask)
{
 int i;

 for(i=0; i<64; i++)
 {
  if(abs(old[i]-RTjpeg_block[i])>*mask)
  {
   /* Copy the block as bytes.  The previous loop accessed the __s16
      storage through __u64 pointers and reused the outer loop index. */
   if(!RTjpeg_mtest)
    memcpy(old, RTjpeg_block, 64*sizeof(__s16));
   return 0;
  }
 }
 return 1;
}
#endif

/*
 * Store i in RTjpeg_mtest.  While that flag is non-zero, RTjpeg_bcomp
 * leaves the stored reference block untouched even when it reports a
 * difference.
 */
void RTjpeg_set_test(int i)
{
 RTjpeg_mtest=i;
}

/*

External Function

Compress one planar frame (same plane layout as RTjpeg_compressYUV422),
replacing every block that RTjpeg_bcomp reports as unchanged against the
reference blocks in RTjpeg_old by a single byte of value 255.

Input: sp -> output buffer for the compressed stream
       bp -> input frame: luma plane, then a chroma plane at offset
             RTjpeg_Ysize, then a second chroma plane RTjpeg_Csize later
       lmask -> comparison threshold for luma blocks
       cmask -> comparison threshold for chroma blocks

Returns the number of bytes sp was advanced by.

*/

int RTjpeg_mcompress(__s8 *sp, unsigned char *bp, __u16 lmask, __u16 cmask)
{
 __s8 * sb;
 __s16 *block;
 /* Same pointer type as bp: the planes hold unsigned samples and the
    pointers are handed to the same routine as bp itself. */
 register unsigned char * bp2;
 register unsigned char * bp3;
 register int i, j, k;

#ifdef HAVE_LIBMMX
 emms();
 /* replicate each threshold into all four 16-bit lanes */
 RTjpeg_lmask=(mmx_t)(((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask);
 RTjpeg_cmask=(mmx_t)(((__u64)cmask<<48)|((__u64)cmask<<32)|((__u64)cmask<<16)|cmask);
#else
 RTjpeg_lmask=lmask;
 RTjpeg_cmask=cmask;
#endif
 
 bp2 = bp + RTjpeg_Ysize;
 bp3 = bp2 + RTjpeg_Csize;

 sb=sp;
 /* reference blocks are stored in the order they are visited below,
    64 coefficients each */
 block=RTjpeg_old;
/* Y */
 /* One pass per 8 rows.  Testing i>=8 (rather than i!=0) stops the loop
    when the height is not a multiple of 8 instead of stepping over zero. */
 for(i=RTjpeg_height; i>=8; i-=8)
 {
  for(j=0, k=0; j<RTjpeg_width; j+=16, k+=8)
  {
   RTjpeg_dctY(bp+j, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   if(RTjpeg_bcomp(block, &RTjpeg_lmask))
   {
    *((__u8 *)sp++)=255;
   }
   else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
   block+=64;

   RTjpeg_dctY(bp+j+8, RTjpeg_block, RTjpeg_Ywidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);
   if(RTjpeg_bcomp(block, &RTjpeg_lmask))
   {
    *((__u8 *)sp++)=255;
   }
   else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
   block+=64;

   RTjpeg_dctY(bp2+k, RTjpeg_block, RTjpeg_Cwidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
   if(RTjpeg_bcomp(block, &RTjpeg_cmask))
   {
    *((__u8 *)sp++)=255;
   }
   else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
   block+=64;

   RTjpeg_dctY(bp3+k, RTjpeg_block, RTjpeg_Cwidth);
   RTjpeg_quant(RTjpeg_block, RTjpeg_cqt);
   if(RTjpeg_bcomp(block, &RTjpeg_cmask))
   {
    *((__u8 *)sp++)=255;
   }
   else sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_cb8);
   block+=64;

  }
  bp+=RTjpeg_width<<3;
  bp2+=RTjpeg_width<<2;
  bp3+=RTjpeg_width<<2;
 }
 /*printf ("%d\n", block - RTjpeg_old); */
#ifdef HAVE_LIBMMX
 emms();
#endif
 return (sp-sb);
}

/*

External Function

Single-plane counterpart of RTjpeg_mcompress: every 8x8 block that
RTjpeg_bcomp reports as unchanged against RTjpeg_old is written as one
byte of value 255, every other block is written by RTjpeg_b2s.

Input: sp -> output buffer for the compressed stream
       bp -> input plane
       lmask -> comparison threshold

Returns the number of bytes sp was advanced by.

*/

int RTjpeg_mcompress8(__s8 *sp, unsigned char *bp, __u16 lmask)
{
 __s8 *start = sp;
 __s16 *ref = RTjpeg_old;
 int row, col;

#ifdef HAVE_LIBMMX
 emms();
 RTjpeg_lmask=(mmx_t)(((__u64)lmask<<48)|((__u64)lmask<<32)|((__u64)lmask<<16)|lmask);
#else
 RTjpeg_lmask=lmask;
#endif

 /* Walk the plane in 8x8 blocks; ref steps through the stored reference
    blocks in the same order. */
 for(row=0; row<RTjpeg_height; row+=8, bp+=RTjpeg_width<<3)
 {
  for(col=0; col<RTjpeg_width; col+=8, ref+=64)
  {
   RTjpeg_dctY(bp+col, RTjpeg_block, RTjpeg_width);
   RTjpeg_quant(RTjpeg_block, RTjpeg_lqt);

   if(!RTjpeg_bcomp(ref, &RTjpeg_lmask))
   {
    /* block differs from the reference: emit it */
    sp+=RTjpeg_b2s(RTjpeg_block, sp, RTjpeg_lb8);
    continue;
   }

   /* unchanged block: single marker byte */
   *((__u8 *)sp++)=255;
  }
 }

#ifdef HAVE_LIBMMX
 emms();
#endif
 return (sp-start);
}

/* Empty placeholder: performs no work. */
void RTjpeg_color_init(void)
{
}  

/* Colour conversion coefficients for the yuv->rgb routines below.  Every
   sum built from them is shifted right by 16 before use, so each value is
   the real coefficient multiplied by 65536.
   NOTE(review): KcrR is identical to Ky (~1.164); the commonly quoted Cr
   coefficient for red is ~1.596 - confirm this value is intended. */
#define KcrR 76284	/* ~1.164, applied to (Cr-128) for R */
#define KcrG 53281	/* ~0.813, applied to (Cr-128), subtracted for G */
#define KcbG 25625	/* ~0.391, applied to (Cb-128), subtracted for G */
#define KcbB 132252	/* ~2.018, applied to (Cb-128) for B */
#define Ky 76284	/* ~1.164, applied to (Y-16) */

void RTjpeg_yuv422rgb(__u8 *buf, __u8 *rgb)
{
 int tmp;
 int i, j;
 __s32 y, crR, crG, cbG, cbB;
 __u8 *bufcr, *bufcb, *bufy, *bufoute;
 int yskip;
 
 yskip=RTjpeg_width;
 
 bufcb=&buf[RTjpeg_width*RTjpeg_height];
 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/2];
 bufy=&buf[0];
 bufoute=rgb;
 
 for(i=0; i<(RTjpeg_height); i++)
 {
  for(j=0; j<RTjpeg_width; j+=2)
  {
   crR=(*bufcr-128)*KcrR;
   crG=(*(bufcr++)-128)*KcrG;
   cbG=(*bufcb-128)*KcbG;
   cbB=(*(bufcb++)-128)*KcbB;
  
   y=(bufy[j]-16)*Ky;
   
   tmp=(y+crR)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+cbB)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);

   y=(bufy[j+1]-16)*Ky;

   tmp=(y+crR)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+cbB)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);

  }
  bufy+=yskip;
 }
}


void RTjpeg_yuv420rgb(__u8 *buf, __u8 *rgb)
{
 int tmp;
 int i, j;
 __s32 y, crR, crG, cbG, cbB;
 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
 int oskip, yskip;
 
 oskip=RTjpeg_width*3;
 yskip=RTjpeg_width;
 
 bufcb=&buf[RTjpeg_width*RTjpeg_height];
 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4];
 bufy=&buf[0];
 bufoute=rgb;
 bufouto=rgb+oskip;
 
 for(i=0; i<(RTjpeg_height>>1); i++)
 {
  for(j=0; j<RTjpeg_width; j+=2)
  {
   crR=(*bufcr-128)*KcrR;
   crG=(*(bufcr++)-128)*KcrG;
   cbG=(*bufcb-128)*KcbG;
   cbB=(*(bufcb++)-128)*KcbB;
  
   y=(bufy[j]-16)*Ky;
   
   tmp=(y+crR)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+cbB)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);

   y=(bufy[j+1]-16)*Ky;

   tmp=(y+crR)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+cbB)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);

   y=(bufy[j+yskip]-16)*Ky;

   tmp=(y+crR)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+cbB)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);

   y=(bufy[j+1+yskip]-16)*Ky;

   tmp=(y+crR)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+cbB)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   
  }
  bufoute+=oskip;
  bufouto+=oskip;
  bufy+=yskip<<1;
 }
}


void RTjpeg_yuvrgb32(__u8 *buf, __u8 *rgb)
{
 int tmp;
 int i, j;
 __s32 y, crR, crG, cbG, cbB;
 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
 int oskip, yskip;
 
 oskip=RTjpeg_width*4;
 yskip=RTjpeg_width;
 
 bufcb=&buf[RTjpeg_width*RTjpeg_height];
 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/2];
 bufy=&buf[0];
 bufoute=rgb;
 bufouto=rgb+oskip;
 
 for(i=0; i<(RTjpeg_height>>1); i++)
 {
  for(j=0; j<RTjpeg_width; j+=2)
  {
   crR=(*bufcr-128)*KcrR;
   crG=(*(bufcr++)-128)*KcrG;
   cbG=(*bufcb-128)*KcbG;
   cbB=(*(bufcb++)-128)*KcbB;
  
   y=(bufy[j]-16)*Ky;
   
   tmp=(y+cbB)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   bufoute++;

   y=(bufy[j+1]-16)*Ky;

   tmp=(y+cbB)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   bufoute++;

   y=(bufy[j+yskip]-16)*Ky;

   tmp=(y+cbB)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   bufouto++;

   y=(bufy[j+1+yskip]-16)*Ky;

   tmp=(y+cbB)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   bufouto++;
   
  }
  bufoute+=oskip;
  bufouto+=oskip;
  bufy+=yskip<<1;
 }
}

void RTjpeg_yuvrgb24(__u8 *buf, __u8 *rgb)
{
 int tmp;
 int i, j;
 __s32 y, crR, crG, cbG, cbB;
 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
 int oskip, yskip;
 
 oskip=RTjpeg_width*3;
 yskip=RTjpeg_width;
 
 bufcb=&buf[RTjpeg_width*RTjpeg_height];
 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4];
 bufy=&buf[0];
 bufoute=rgb;
 bufouto=rgb+oskip;
 
 for(i=0; i<(RTjpeg_height>>1); i++)
 {
  for(j=0; j<RTjpeg_width; j+=2)
  {
   crR=(*bufcr-128)*KcrR;
   crG=(*(bufcr++)-128)*KcrG;
   cbG=(*bufcb-128)*KcbG;
   cbB=(*(bufcb++)-128)*KcbB;
  
   y=(bufy[j]-16)*Ky;
   
   tmp=(y+cbB)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);

   y=(bufy[j+1]-16)*Ky;

   tmp=(y+cbB)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   *(bufoute++)=(tmp>255)?255:((tmp<0)?0:tmp);

   y=(bufy[j+yskip]-16)*Ky;

   tmp=(y+cbB)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);

   y=(bufy[j+1+yskip]-16)*Ky;

   tmp=(y+cbB)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   *(bufouto++)=(tmp>255)?255:((tmp<0)?0:tmp);
   
  }
  bufoute+=oskip;
  bufouto+=oskip;
  bufy+=yskip<<1;
 }
}

void RTjpeg_yuvrgb16(__u8 *buf, __u8 *rgb)
{
 int tmp;
 int i, j;
 __s32 y, crR, crG, cbG, cbB;
 __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
 int oskip, yskip;
 unsigned char r, g, b;
 
 oskip=RTjpeg_width*2;
 yskip=RTjpeg_width;
 
 bufcb=&buf[RTjpeg_width*RTjpeg_height];
 bufcr=&buf[RTjpeg_width*RTjpeg_height+(RTjpeg_width*RTjpeg_height)/4];
 bufy=&buf[0];
 bufoute=rgb;
 bufouto=rgb+oskip;
 
 for(i=0; i<(RTjpeg_height>>1); i++)
 {
  for(j=0; j<RTjpeg_width; j+=2)
  {
   crR=(*bufcr-128)*KcrR;
   crG=(*(bufcr++)-128)*KcrG;
   cbG=(*bufcb-128)*KcbG;
   cbB=(*(bufcb++)-128)*KcbB;
  
   y=(bufy[j]-16)*Ky;
   
   tmp=(y+cbB)>>16;
   b=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   g=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   r=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(int)((int)b >> 3);
   tmp|=(int)(((int)g >> 2) << 5);
   tmp|=(int)(((int)r >> 3) << 11);
   *(bufoute++)=tmp&0xff;
   *(bufoute++)=tmp>>8;
   

   y=(bufy[j+1]-16)*Ky;

   tmp=(y+cbB)>>16;
   b=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   g=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   r=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(int)((int)b >> 3);
   tmp|=(int)(((int)g >> 2) << 5);
   tmp|=(int)(((int)r >> 3) << 11);
   *(bufoute++)=tmp&0xff;
   *(bufoute++)=tmp>>8;

   y=(bufy[j+yskip]-16)*Ky;

   tmp=(y+cbB)>>16;
   b=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   g=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   r=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(int)((int)b >> 3);
   tmp|=(int)(((int)g >> 2) << 5);
   tmp|=(int)(((int)r >> 3) << 11);
   *(bufouto++)=tmp&0xff;
   *(bufouto++)=tmp>>8;

   y=(bufy[j+1+yskip]-16)*Ky;

   tmp=(y+cbB)>>16;
   b=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y-crG-cbG)>>16;
   g=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(y+crR)>>16;
   r=(tmp>255)?255:((tmp<0)?0:tmp);
   tmp=(int)((int)b >> 3);
   tmp|=(int)(((int)g >> 2) << 5);
   tmp|=(int)(((int)r >> 3) << 11);
   *(bufouto++)=tmp&0xff;
   *(bufouto++)=tmp>>8;

  }
  bufoute+=oskip;
  bufouto+=oskip;
  bufy+=yskip<<1;
 }
}

/*
 * 8-bit output: copy the first RTjpeg_width*RTjpeg_height bytes of buf
 * (the luma plane) to rgb unchanged.
 *
 * memmove from <string.h>, which this file already includes, replaces the
 * legacy bcopy, whose declaring header <strings.h> is not included here.
 * Like bcopy, memmove copes with overlapping source and destination.
 */
void RTjpeg_yuvrgb8(__u8 *buf, __u8 *rgb)
{
 memmove(rgb, buf, RTjpeg_width*RTjpeg_height);
}

/*
 * Enlarge a RTjpeg_width x RTjpeg_height image of 32-bit pixels to twice
 * the width and twice the height, in place.  buf must have room for
 * 4*width*height pixels.  The image is walked from its last pixel to its
 * first, so every source pixel is read before the output reaches it.
 */
void RTjpeg_double32(__u32 *buf)
{
 __u32 *src, *lower, *upper;
 __u32 px;
 int row, col;

 src=buf+(RTjpeg_width*RTjpeg_height)-1;		/* last source pixel */
 lower=buf+(RTjpeg_width*RTjpeg_height*4)-1;	/* last pixel of the last output row */
 upper=lower-(2*RTjpeg_width);			/* same column, one output row up */

 for(row=0; row<RTjpeg_height; row++)
 {
  for(col=0; col<RTjpeg_width; col++)
  {
   /* one source pixel becomes a 2x2 square */
   px=*src--;
   lower[0]=px;
   lower[-1]=px;
   lower-=2;
   upper[0]=px;
   upper[-1]=px;
   upper-=2;
  }
  /* each pointer has finished its own row; step over the row the other
     one filled */
  upper-=2*RTjpeg_width;
  lower-=2*RTjpeg_width;
 }
}

/* Empty placeholder: the 24-bit variant of the in-place doubling routines
   performs no work and leaves buf untouched. */
void RTjpeg_double24(__u8 *buf)
{
}

/*
 * Enlarge a RTjpeg_width x RTjpeg_height image of 16-bit pixels to twice
 * the width and twice the height, in place.  buf must have room for
 * 4*width*height pixels.  The image is walked from its last pixel to its
 * first, so every source pixel is read before the output reaches it.
 */
void RTjpeg_double16(__u16 *buf)
{
 __u16 *src, *lower, *upper;
 __u16 px;
 int row, col;

 src=buf+(RTjpeg_width*RTjpeg_height)-1;		/* last source pixel */
 lower=buf+(RTjpeg_width*RTjpeg_height*4)-1;	/* last pixel of the last output row */
 upper=lower-(2*RTjpeg_width);			/* same column, one output row up */

 for(row=0; row<RTjpeg_height; row++)
 {
  for(col=0; col<RTjpeg_width; col++)
  {
   /* one source pixel becomes a 2x2 square */
   px=*src--;
   lower[0]=px;
   lower[-1]=px;
   lower-=2;
   upper[0]=px;
   upper[-1]=px;
   upper-=2;
  }
  /* each pointer has finished its own row; step over the row the other
     one filled */
  upper-=2*RTjpeg_width;
  lower-=2*RTjpeg_width;
 }
}

/*
 * Enlarge a RTjpeg_width x RTjpeg_height image of 8-bit pixels to twice
 * the width and twice the height, in place.  buf must have room for
 * 4*width*height pixels.  The image is walked from its last pixel to its
 * first, so every source pixel is read before the output reaches it.
 */
void RTjpeg_double8(__u8 *buf)
{
 __u8 *src, *lower, *upper;
 __u8 px;
 int row, col;

 src=buf+(RTjpeg_width*RTjpeg_height)-1;		/* last source pixel */
 lower=buf+(RTjpeg_width*RTjpeg_height*4)-1;	/* last pixel of the last output row */
 upper=lower-(2*RTjpeg_width);			/* same column, one output row up */

 for(row=0; row<RTjpeg_height; row++)
 {
  for(col=0; col<RTjpeg_width; col++)
  {
   /* one source pixel becomes a 2x2 square */
   px=*src--;
   lower[0]=px;
   lower[-1]=px;
   lower-=2;
   upper[0]=px;
   upper[-1]=px;
   upper-=2;
  }
  /* each pointer has finished its own row; step over the row the other
     one filled */
  upper-=2*RTjpeg_width;
  lower-=2*RTjpeg_width;
 }
}