8 files changed, 908 insertions, 969 deletions
diff --git a/gst-libs/gst/idct/dct.h b/gst-libs/gst/idct/dct.h
index efb3ddb3..c2e37449 100644
--- a/gst-libs/gst/idct/dct.h
+++ b/gst-libs/gst/idct/dct.h
@@ -16,7 +16,7 @@ typedef DCTELEM DCTBLOCK[DCTSIZE2];
 
 typedef long INT32;		/* must be at least 32 bits */
 
-extern void gst_idct_int_idct();
+extern void gst_idct_int_idct ();
 
 extern void gst_idct_init_fast_int_idct (void);
 extern void gst_idct_fast_int_idct (short *block);
@@ -27,6 +27,5 @@ extern void gst_idct_mmx32_idct (short *block);
 extern void gst_idct_sse_idct (short *block);
 #endif /* HAVE_LIBMMX */
 
-extern void gst_idct_init_float_idct(void);
+extern void gst_idct_init_float_idct (void);
 extern void gst_idct_float_idct (short *block);
-
diff --git a/gst-libs/gst/idct/fastintidct.c b/gst-libs/gst/idct/fastintidct.c
index 27426672..9bb1436d 100644
--- a/gst-libs/gst/idct/fastintidct.c
+++ b/gst-libs/gst/idct/fastintidct.c
@@ -45,17 +45,17 @@
 /* this code assumes >> to be a two's-complement arithmetic */
 /* right shift: (-2)>>1 == -1 , (-3)>>1 == -2               */
 
-#define W1 2841 /* 2048*sqrt(2)*cos(1*pi/16) */
-#define W2 2676 /* 2048*sqrt(2)*cos(2*pi/16) */
-#define W3 2408 /* 2048*sqrt(2)*cos(3*pi/16) */
-#define W5 1609 /* 2048*sqrt(2)*cos(5*pi/16) */
-#define W6 1108 /* 2048*sqrt(2)*cos(6*pi/16) */
-#define W7 565  /* 2048*sqrt(2)*cos(7*pi/16) */
+#define W1 2841			/* 2048*sqrt(2)*cos(1*pi/16) */
+#define W2 2676			/* 2048*sqrt(2)*cos(2*pi/16) */
+#define W3 2408			/* 2048*sqrt(2)*cos(3*pi/16) */
+#define W5 1609			/* 2048*sqrt(2)*cos(5*pi/16) */
+#define W6 1108			/* 2048*sqrt(2)*cos(6*pi/16) */
+#define W7 565			/* 2048*sqrt(2)*cos(7*pi/16) */
 
 #include "dct.h"
 
 /* private data */
-static short iclip[1024]; /* clipping table */
+static short iclip[1024];	/* clipping table */
 static short *iclp;
 
 /* private prototypes */
@@ -72,57 +72,58 @@ static void idctcol (short *blk);
  *        c[1..7] = 128*sqrt(2)
  */
 
-static void idctrow(blk)
-short *blk;
+static void
+idctrow (blk)
+     short *blk;
 {
   int x0, x1, x2, x3, x4, x5, x6, x7, x8;
 
   /* shortcut */
-  if (!((x1 = blk[4]<<11) | (x2 = blk[6]) | (x3 = blk[2]) |
-        (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3])))
-  {
-    blk[0]=blk[1]=blk[2]=blk[3]=blk[4]=blk[5]=blk[6]=blk[7]=blk[0]<<3;
+  if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) |
+	  (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) {
+    blk[0] = blk[1] = blk[2] = blk[3] = blk[4] = blk[5] = blk[6] = blk[7] =
+	blk[0] << 3;
     return;
   }
 
-  x0 = (blk[0]<<11) + 128; /* for proper rounding in the fourth stage */
+  x0 = (blk[0] << 11) + 128;	/* for proper rounding in the fourth stage */
 
   /* first stage */
-  x8 = W7*(x4+x5);
-  x4 = x8 + (W1-W7)*x4;
-  x5 = x8 - (W1+W7)*x5;
-  x8 = W3*(x6+x7);
-  x6 = x8 - (W3-W5)*x6;
-  x7 = x8 - (W3+W5)*x7;
-  
+  x8 = W7 * (x4 + x5);
+  x4 = x8 + (W1 - W7) * x4;
+  x5 = x8 - (W1 + W7) * x5;
+  x8 = W3 * (x6 + x7);
+  x6 = x8 - (W3 - W5) * x6;
+  x7 = x8 - (W3 + W5) * x7;
+
   /* second stage */
   x8 = x0 + x1;
   x0 -= x1;
-  x1 = W6*(x3+x2);
-  x2 = x1 - (W2+W6)*x2;
-  x3 = x1 + (W2-W6)*x3;
+  x1 = W6 * (x3 + x2);
+  x2 = x1 - (W2 + W6) * x2;
+  x3 = x1 + (W2 - W6) * x3;
   x1 = x4 + x6;
   x4 -= x6;
   x6 = x5 + x7;
   x5 -= x7;
-  
+
   /* third stage */
   x7 = x8 + x3;
   x8 -= x3;
   x3 = x0 + x2;
   x0 -= x2;
-  x2 = (181*(x4+x5)+128)>>8;
-  x4 = (181*(x4-x5)+128)>>8;
-  
+  x2 = (181 * (x4 + x5) + 128) >> 8;
+  x4 = (181 * (x4 - x5) + 128) >> 8;
+
   /* fourth stage */
-  blk[0] = (x7+x1)>>8;
-  blk[1] = (x3+x2)>>8;
-  blk[2] = (x0+x4)>>8;
-  blk[3] = (x8+x6)>>8;
-  blk[4] = (x8-x6)>>8;
-  blk[5] = (x0-x4)>>8;
-  blk[6] = (x3-x2)>>8;
-  blk[7] = (x7-x1)>>8;
+  blk[0] = (x7 + x1) >> 8;
+  blk[1] = (x3 + x2) >> 8;
+  blk[2] = (x0 + x4) >> 8;
+  blk[3] = (x8 + x6) >> 8;
+  blk[4] = (x8 - x6) >> 8;
+  blk[5] = (x0 - x4) >> 8;
+  blk[6] = (x3 - x2) >> 8;
+  blk[7] = (x7 - x1) >> 8;
 }
 
 /* column (vertical) IDCT
@@ -134,78 +135,81 @@ short *blk;
  * where: c[0]    = 1/1024
  *        c[1..7] = (1/1024)*sqrt(2)
  */
-static void idctcol(blk)
-short *blk;
+static void
+idctcol (blk)
+     short *blk;
 {
   int x0, x1, x2, x3, x4, x5, x6, x7, x8;
 
   /* shortcut */
-  if (!((x1 = (blk[8*4]<<8)) | (x2 = blk[8*6]) | (x3 = blk[8*2]) |
-        (x4 = blk[8*1]) | (x5 = blk[8*7]) | (x6 = blk[8*5]) | (x7 = blk[8*3])))
-  {
-    blk[8*0]=blk[8*1]=blk[8*2]=blk[8*3]=blk[8*4]=blk[8*5]=blk[8*6]=blk[8*7]=
-      iclp[(blk[8*0]+32)>>6];
+  if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) |
+	  (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) | (x7 =
+	      blk[8 * 3]))) {
+    blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3] = blk[8 * 4] =
+	blk[8 * 5] = blk[8 * 6] = blk[8 * 7] = iclp[(blk[8 * 0] + 32) >> 6];
     return;
   }
 
-  x0 = (blk[8*0]<<8) + 8192;
+  x0 = (blk[8 * 0] << 8) + 8192;
 
   /* first stage */
-  x8 = W7*(x4+x5) + 4;
-  x4 = (x8+(W1-W7)*x4)>>3;
-  x5 = (x8-(W1+W7)*x5)>>3;
-  x8 = W3*(x6+x7) + 4;
-  x6 = (x8-(W3-W5)*x6)>>3;
-  x7 = (x8-(W3+W5)*x7)>>3;
-  
+  x8 = W7 * (x4 + x5) + 4;
+  x4 = (x8 + (W1 - W7) * x4) >> 3;
+  x5 = (x8 - (W1 + W7) * x5) >> 3;
+  x8 = W3 * (x6 + x7) + 4;
+  x6 = (x8 - (W3 - W5) * x6) >> 3;
+  x7 = (x8 - (W3 + W5) * x7) >> 3;
+
   /* second stage */
   x8 = x0 + x1;
   x0 -= x1;
-  x1 = W6*(x3+x2) + 4;
-  x2 = (x1-(W2+W6)*x2)>>3;
-  x3 = (x1+(W2-W6)*x3)>>3;
+  x1 = W6 * (x3 + x2) + 4;
+  x2 = (x1 - (W2 + W6) * x2) >> 3;
+  x3 = (x1 + (W2 - W6) * x3) >> 3;
   x1 = x4 + x6;
   x4 -= x6;
   x6 = x5 + x7;
   x5 -= x7;
-  
+
   /* third stage */
   x7 = x8 + x3;
   x8 -= x3;
   x3 = x0 + x2;
   x0 -= x2;
-  x2 = (181*(x4+x5)+128)>>8;
-  x4 = (181*(x4-x5)+128)>>8;
-  
+  x2 = (181 * (x4 + x5) + 128) >> 8;
+  x4 = (181 * (x4 - x5) + 128) >> 8;
+
   /* fourth stage */
-  blk[8*0] = iclp[(x7+x1)>>14];
-  blk[8*1] = iclp[(x3+x2)>>14];
-  blk[8*2] = iclp[(x0+x4)>>14];
-  blk[8*3] = iclp[(x8+x6)>>14];
-  blk[8*4] = iclp[(x8-x6)>>14];
-  blk[8*5] = iclp[(x0-x4)>>14];
-  blk[8*6] = iclp[(x3-x2)>>14];
-  blk[8*7] = iclp[(x7-x1)>>14];
+  blk[8 * 0] = iclp[(x7 + x1) >> 14];
+  blk[8 * 1] = iclp[(x3 + x2) >> 14];
+  blk[8 * 2] = iclp[(x0 + x4) >> 14];
+  blk[8 * 3] = iclp[(x8 + x6) >> 14];
+  blk[8 * 4] = iclp[(x8 - x6) >> 14];
+  blk[8 * 5] = iclp[(x0 - x4) >> 14];
+  blk[8 * 6] = iclp[(x3 - x2) >> 14];
+  blk[8 * 7] = iclp[(x7 - x1) >> 14];
 }
 
 /* two dimensional inverse discrete cosine transform */
-void gst_idct_fast_int_idct(block)
-short *block;
+void
+gst_idct_fast_int_idct (block)
+     short *block;
 {
   int i;
 
-  for (i=0; i<8; i++)
-    idctrow(block+8*i);
+  for (i = 0; i < 8; i++)
+    idctrow (block + 8 * i);
 
-  for (i=0; i<8; i++)
-    idctcol(block+i);
+  for (i = 0; i < 8; i++)
+    idctcol (block + i);
 }
 
-void gst_idct_init_fast_int_idct()
+void
+gst_idct_init_fast_int_idct ()
 {
   int i;
 
-  iclp = iclip+512;
-  for (i= -512; i<512; i++)
-    iclp[i] = (i<-256) ? -256 : ((i>255) ? 255 : i);
+  iclp = iclip + 512;
+  for (i = -512; i < 512; i++)
+    iclp[i] = (i < -256) ? -256 : ((i > 255) ? 255 : i);
 }
diff --git a/gst-libs/gst/idct/floatidct.c b/gst-libs/gst/idct/floatidct.c
index b215bd78..0fa1e830 100644
--- a/gst-libs/gst/idct/floatidct.c
+++ b/gst-libs/gst/idct/floatidct.c
@@ -56,51 +56,51 @@ static double gst_idct_float_c[8][8];
 
 /* initialize DCT coefficient matrix */
 
-void gst_idct_init_float_idct()
+void
+gst_idct_init_float_idct ()
 {
   int freq, time;
   double scale;
 
-  for (freq=0; freq < 8; freq++)
-  {
-    scale = (freq == 0) ? sqrt(0.125) : 0.5;
-    for (time=0; time<8; time++)
-      gst_idct_float_c[freq][time] = scale*cos((PI/8.0)*freq*(time + 0.5));
+  for (freq = 0; freq < 8; freq++) {
+    scale = (freq == 0) ? sqrt (0.125) : 0.5;
+    for (time = 0; time < 8; time++)
+      gst_idct_float_c[freq][time] =
+	  scale * cos ((PI / 8.0) * freq * (time + 0.5));
   }
 }
 
 /* perform IDCT matrix multiply for 8x8 coefficient block */
 
-void gst_idct_float_idct(block)
-short *block;
+void
+gst_idct_float_idct (block)
+     short *block;
 {
   int i, j, k, v;
   double partial_product;
   double tmp[64];
 
-  for (i=0; i<8; i++)
-    for (j=0; j<8; j++)
-    {
+  for (i = 0; i < 8; i++)
+    for (j = 0; j < 8; j++) {
       partial_product = 0.0;
 
-      for (k=0; k<8; k++)
-        partial_product+= gst_idct_float_c[k][j]*block[8*i+k];
+      for (k = 0; k < 8; k++)
+	partial_product += gst_idct_float_c[k][j] * block[8 * i + k];
 
-      tmp[8*i+j] = partial_product;
+      tmp[8 * i + j] = partial_product;
     }
 
   /* Transpose operation is integrated into address mapping by switching 
      loop order of i and j */
 
-  for (j=0; j<8; j++)
-    for (i=0; i<8; i++)
-    {
+  for (j = 0; j < 8; j++)
+    for (i = 0; i < 8; i++) {
       partial_product = 0.0;
 
-      for (k=0; k<8; k++)
-        partial_product+= gst_idct_float_c[k][i]*tmp[8*k+j];
+      for (k = 0; k < 8; k++)
+	partial_product += gst_idct_float_c[k][i] * tmp[8 * k + j];
 
-      v = (int) floor(partial_product+0.5);
-      block[8*i+j] = (v<-256) ? -256 : ((v>255) ? 255 : v);
+      v = (int) floor (partial_product + 0.5);
+      block[8 * i + j] = (v < -256) ? -256 : ((v > 255) ? 255 : v);
     }
 }
diff --git a/gst-libs/gst/idct/idct.c b/gst-libs/gst/idct/idct.c
index 59c6a844..4be150f1 100644
--- a/gst-libs/gst/idct/idct.c
+++ b/gst-libs/gst/idct/idct.c
@@ -25,24 +25,25 @@
 #include <gst/idct/idct.h>
 #include "dct.h"
 
-static void gst_idct_int_sparse_idct(short *data);
+static void gst_idct_int_sparse_idct (short *data);
 
-GstIDCT *gst_idct_new(GstIDCTMethod method) 
+GstIDCT *
+gst_idct_new (GstIDCTMethod method)
 {
-  GstIDCT *new = g_malloc(sizeof(GstIDCT));
+  GstIDCT *new = g_malloc (sizeof (GstIDCT));
 
   new->need_transpose = FALSE;
 
   if (method == GST_IDCT_DEFAULT) {
 #ifdef HAVE_LIBMMX
-    if (gst_cpu_get_flags() & GST_CPU_FLAG_MMX) {
+    if (gst_cpu_get_flags () & GST_CPU_FLAG_MMX) {
       method = GST_IDCT_MMX;
     }
     /* disabled for now 
-    if (gst_cpu_get_flags() & GST_CPU_FLAG_SSE) {
-      method = GST_IDCT_SSE;
-    }
-    */
+       if (gst_cpu_get_flags() & GST_CPU_FLAG_SSE) {
+       method = GST_IDCT_SSE;
+       }
+     */
     else
 #endif /* HAVE_LIBMMX */
     {
@@ -53,49 +54,50 @@ GstIDCT *gst_idct_new(GstIDCTMethod method)
   new->convert_sparse = gst_idct_int_sparse_idct;
 
   switch (method) {
-	 case GST_IDCT_FAST_INT:
-		GST_INFO ( "using fast_int_idct");
-	   gst_idct_init_fast_int_idct();
-		new->convert = gst_idct_fast_int_idct;
-		break;
-	 case GST_IDCT_INT:
-		GST_INFO ( "using int_idct");
-		new->convert = gst_idct_int_idct;
-		break;
-	 case GST_IDCT_FLOAT:
-		GST_INFO ( "using float_idct");
-		gst_idct_init_float_idct();
-		new->convert = gst_idct_float_idct;
-		break;
+    case GST_IDCT_FAST_INT:
+      GST_INFO ("using fast_int_idct");
+      gst_idct_init_fast_int_idct ();
+      new->convert = gst_idct_fast_int_idct;
+      break;
+    case GST_IDCT_INT:
+      GST_INFO ("using int_idct");
+      new->convert = gst_idct_int_idct;
+      break;
+    case GST_IDCT_FLOAT:
+      GST_INFO ("using float_idct");
+      gst_idct_init_float_idct ();
+      new->convert = gst_idct_float_idct;
+      break;
 #ifdef HAVE_LIBMMX
-	 case GST_IDCT_MMX:
-		GST_INFO ( "using MMX_idct");
-		new->convert = gst_idct_mmx_idct;
-		new->need_transpose = TRUE;
-		break;
-	 case GST_IDCT_MMX32:
-		GST_INFO ( "using MMX32_idct");
-		new->convert = gst_idct_mmx32_idct;
-		new->need_transpose = TRUE;
-		break;
-	 case GST_IDCT_SSE:
-		GST_INFO ( "using SSE_idct");
-		new->convert = gst_idct_sse_idct;
-		new->need_transpose = TRUE;
-		break;
+    case GST_IDCT_MMX:
+      GST_INFO ("using MMX_idct");
+      new->convert = gst_idct_mmx_idct;
+      new->need_transpose = TRUE;
+      break;
+    case GST_IDCT_MMX32:
+      GST_INFO ("using MMX32_idct");
+      new->convert = gst_idct_mmx32_idct;
+      new->need_transpose = TRUE;
+      break;
+    case GST_IDCT_SSE:
+      GST_INFO ("using SSE_idct");
+      new->convert = gst_idct_sse_idct;
+      new->need_transpose = TRUE;
+      break;
 #endif /* HAVE_LIBMMX */
-	 default:
-		GST_INFO ( "method not supported");
-		g_free(new);
-		return NULL;
+    default:
+      GST_INFO ("method not supported");
+      g_free (new);
+      return NULL;
   }
   return new;
 }
 
-static void gst_idct_int_sparse_idct(short *data) 
+static void
+gst_idct_int_sparse_idct (short *data)
 {
   short val;
-  gint32 v, *dp = (guint32 *)data;
+  gint32 v, *dp = (guint32 *) data;
 
   v = *data;
 
@@ -104,43 +106,61 @@ static void gst_idct_int_sparse_idct(short *data)
     val += (8 >> 1);
     val /= 8;
     val = -val;
-  }
-  else {
+  } else {
     val = (v + (8 >> 1)) / 8;
   }
-  v = (( val & 0xffff) | (val << 16));
-
-  dp[0] = v;  dp[1] = v;  dp[2] = v;  dp[3] = v;
-  dp[4] = v;  dp[5] = v;  dp[6] = v;  dp[7] = v;
-  dp[8] = v;  dp[9] = v;  dp[10] = v; dp[11] = v;
-  dp[12] = v; dp[13] = v; dp[14] = v; dp[15] = v;
-  dp[16] = v; dp[17] = v; dp[18] = v; dp[19] = v;
-  dp[20] = v; dp[21] = v; dp[22] = v; dp[23] = v;
-  dp[24] = v; dp[25] = v; dp[26] = v; dp[27] = v;
-  dp[28] = v; dp[29] = v; dp[30] = v; dp[31] = v;
+  v = ((val & 0xffff) | (val << 16));
+
+  dp[0] = v;
+  dp[1] = v;
+  dp[2] = v;
+  dp[3] = v;
+  dp[4] = v;
+  dp[5] = v;
+  dp[6] = v;
+  dp[7] = v;
+  dp[8] = v;
+  dp[9] = v;
+  dp[10] = v;
+  dp[11] = v;
+  dp[12] = v;
+  dp[13] = v;
+  dp[14] = v;
+  dp[15] = v;
+  dp[16] = v;
+  dp[17] = v;
+  dp[18] = v;
+  dp[19] = v;
+  dp[20] = v;
+  dp[21] = v;
+  dp[22] = v;
+  dp[23] = v;
+  dp[24] = v;
+  dp[25] = v;
+  dp[26] = v;
+  dp[27] = v;
+  dp[28] = v;
+  dp[29] = v;
+  dp[30] = v;
+  dp[31] = v;
 }
 
-void gst_idct_destroy(GstIDCT *idct) 
+void
+gst_idct_destroy (GstIDCT * idct)
 {
-  g_return_if_fail(idct != NULL);
+  g_return_if_fail (idct != NULL);
 
-  g_free(idct);
+  g_free (idct);
 }
 
 static gboolean
-plugin_init (GstPlugin *plugin)
+plugin_init (GstPlugin * plugin)
 {
   return TRUE;
 }
 
-GST_PLUGIN_DEFINE (
-  GST_VERSION_MAJOR,
-  GST_VERSION_MINOR,
-  "gstidct",
-  "Accelerated IDCT routines",
-  plugin_init,
-  VERSION,
-  GST_LICENSE,
-  GST_PACKAGE,
-  GST_ORIGIN
-)
+GST_PLUGIN_DEFINE (GST_VERSION_MAJOR,
+    GST_VERSION_MINOR,
+    "gstidct",
+    "Accelerated IDCT routines",
+    plugin_init, VERSION, GST_LICENSE, GST_PACKAGE, GST_ORIGIN)
diff --git a/gst-libs/gst/idct/idct.h b/gst-libs/gst/idct/idct.h
index fa6f62cd..37a2a0b9 100644
--- a/gst-libs/gst/idct/idct.h
+++ b/gst-libs/gst/idct/idct.h
@@ -23,22 +23,24 @@
 
 #include <glib.h>
 
-typedef enum {
-  GST_IDCT_DEFAULT,    
-  GST_IDCT_INT,	
-  GST_IDCT_FAST_INT, 
-  GST_IDCT_FLOAT,   
-  GST_IDCT_MMX,	
+typedef enum
+{
+  GST_IDCT_DEFAULT,
+  GST_IDCT_INT,
+  GST_IDCT_FAST_INT,
+  GST_IDCT_FLOAT,
+  GST_IDCT_MMX,
   GST_IDCT_MMX32,
   GST_IDCT_SSE,
 } GstIDCTMethod;
 
 typedef struct _GstIDCT GstIDCT;
-typedef void (*GstIDCTFunction) (gshort *block);
+typedef void (*GstIDCTFunction) (gshort * block);
 
 #define GST_IDCT_TRANSPOSE(idct) ((idct)->need_transpose)
 
-struct _GstIDCT {
+struct _GstIDCT
+{
   /* private */
   GstIDCTFunction convert;
   GstIDCTFunction convert_sparse;
@@ -46,9 +48,10 @@ struct _GstIDCT {
 };
 
 
-GstIDCT *gst_idct_new(GstIDCTMethod method);
+GstIDCT *gst_idct_new (GstIDCTMethod method);
+
 #define gst_idct_convert(idct, blocks) (idct)->convert((blocks))
 #define gst_idct_convert_sparse(idct, blocks) (idct)->convert_sparse((blocks))
-void gst_idct_destroy(GstIDCT *idct);
+void gst_idct_destroy (GstIDCT * idct);
 
 #endif /* __GST_IDCT_H__ */
diff --git a/gst-libs/gst/idct/ieeetest.c b/gst-libs/gst/idct/ieeetest.c
index f5b270eb..d26181c1 100644
--- a/gst-libs/gst/idct/ieeetest.c
+++ b/gst-libs/gst/idct/ieeetest.c
@@ -27,9 +27,9 @@
 
 void usage (char *msg);
 long ieeerand (long L, long H);
-void dct_init(void);
-void ref_fdct(DCTELEM block[8][8]);
-void ref_idct(DCTELEM block[8][8]);
+void dct_init (void);
+void ref_fdct (DCTELEM block[8][8]);
+void ref_idct (DCTELEM block[8][8]);
 
 /* error stat accumulators -- assume initialized to 0 */
 
@@ -38,47 +38,49 @@ long sumsqerrs[DCTSIZE2];
 int maxerr[DCTSIZE2];
 
 
-char * meets (double val, double limit)
+char *
+meets (double val, double limit)
 {
-  return ((fabs(val) <= limit) ? "meets" : "FAILS");
+  return ((fabs (val) <= limit) ? "meets" : "FAILS");
 }
 
 int
-main(int argc, char **argv)
+main (int argc, char **argv)
 {
   long minpix, maxpix, sign;
   long curiter, niters;
   int i, j;
   double max, total;
   int method;
-  DCTELEM   block[DCTSIZE2];	/* random source data */
-  DCTELEM   refcoefs[DCTSIZE2]; /* coefs from reference FDCT */
-  DCTELEM   refout[DCTSIZE2];	/* output from reference IDCT */
-  DCTELEM   testout[DCTSIZE2]; /* output from test IDCT */
-  GstIDCT   *idct;
-  guint64  tscstart, tscmin = ~0, tscmax = 0;
-  guint64  tscstop;
+  DCTELEM block[DCTSIZE2];	/* random source data */
+  DCTELEM refcoefs[DCTSIZE2];	/* coefs from reference FDCT */
+  DCTELEM refout[DCTSIZE2];	/* output from reference IDCT */
+  DCTELEM testout[DCTSIZE2];	/* output from test IDCT */
+  GstIDCT *idct;
+  guint64 tscstart, tscmin = ~0, tscmax = 0;
+  guint64 tscstop;
 
   /* Argument parsing --- not very bulletproof at all */
 
-  if (argc != 6) usage(NULL);
+  if (argc != 6)
+    usage (NULL);
 
-  method = atoi(argv[1]);
-  minpix = atoi(argv[2]);
-  maxpix = atoi(argv[3]);
-  sign   = atoi(argv[4]);
-  niters = atol(argv[5]);
+  method = atoi (argv[1]);
+  minpix = atoi (argv[2]);
+  maxpix = atoi (argv[3]);
+  sign = atoi (argv[4]);
+  niters = atol (argv[5]);
 
-  gst_library_load("gstidct");
+  gst_library_load ("gstidct");
 
-  idct = gst_idct_new(method);
+  idct = gst_idct_new (method);
   if (idct == 0) {
-    printf("method not available\n\n\n");
+    printf ("method not available\n\n\n");
 
     return 0;
   }
 
-  dct_init();
+  dct_init ();
 
   /* Loop once per generated random-data block */
 
@@ -86,164 +88,186 @@ main(int argc, char **argv)
 
     /* generate a pseudo-random block of data */
     for (i = 0; i < DCTSIZE2; i++)
-      block[i] = (DCTELEM) (ieeerand(-minpix,maxpix) * sign);
+      block[i] = (DCTELEM) (ieeerand (-minpix, maxpix) * sign);
 
     /* perform reference FDCT */
-    memcpy(refcoefs, block, sizeof(DCTELEM)*DCTSIZE2);
-    ref_fdct((DCTELEM **) &refcoefs);
+    memcpy (refcoefs, block, sizeof (DCTELEM) * DCTSIZE2);
+    ref_fdct ((DCTELEM **) & refcoefs);
     /* clip */
     for (i = 0; i < DCTSIZE2; i++) {
-      if (refcoefs[i] < -2048) refcoefs[i] = -2048;
-      else if (refcoefs[i] > 2047) refcoefs[i] = 2047;
+      if (refcoefs[i] < -2048)
+	refcoefs[i] = -2048;
+      else if (refcoefs[i] > 2047)
+	refcoefs[i] = 2047;
     }
 
     /* perform reference IDCT */
-    memcpy(refout, refcoefs, sizeof(DCTELEM)*DCTSIZE2);
-    ref_idct(refout);
+    memcpy (refout, refcoefs, sizeof (DCTELEM) * DCTSIZE2);
+    ref_idct (refout);
     /* clip */
     for (i = 0; i < DCTSIZE2; i++) {
-      if (refout[i] < -256) refout[i] = -256;
-      else if (refout[i] > 255) refout[i] = 255;
+      if (refout[i] < -256)
+	refout[i] = -256;
+      else if (refout[i] > 255)
+	refout[i] = 255;
     }
 
     /* perform test IDCT */
-	 if (GST_IDCT_TRANSPOSE(idct)) {
+    if (GST_IDCT_TRANSPOSE (idct)) {
       for (j = 0; j < DCTSIZE; j++) {
-        for (i = 0; i < DCTSIZE; i++) {
-		    testout[i*DCTSIZE+j] = refcoefs[j*DCTSIZE+i];
-		  }
-	   }
-	 }
-	 else {
-      memcpy(testout, refcoefs, sizeof(DCTELEM)*DCTSIZE2);
-	 }
-
-	 gst_trace_read_tsc(&tscstart);
-    gst_idct_convert(idct, testout);
-	 gst_trace_read_tsc(&tscstop);
-	 /*printf("time %llu, %llu %lld\n", tscstart, tscstop, tscstop-tscstart); */
-	 if (tscstop - tscstart < tscmin) tscmin = tscstop-tscstart;
-	 if (tscstop - tscstart > tscmax) tscmax = tscstop-tscstart;
+	for (i = 0; i < DCTSIZE; i++) {
+	  testout[i * DCTSIZE + j] = refcoefs[j * DCTSIZE + i];
+	}
+      }
+    } else {
+      memcpy (testout, refcoefs, sizeof (DCTELEM) * DCTSIZE2);
+    }
+
+    gst_trace_read_tsc (&tscstart);
+    gst_idct_convert (idct, testout);
+    gst_trace_read_tsc (&tscstop);
+    /*printf("time %llu, %llu %lld\n", tscstart, tscstop, tscstop-tscstart); */
+    if (tscstop - tscstart < tscmin)
+      tscmin = tscstop - tscstart;
+    if (tscstop - tscstart > tscmax)
+      tscmax = tscstop - tscstart;
 
     /* clip */
     for (i = 0; i < DCTSIZE2; i++) {
-      if (testout[i] < -256) testout[i] = -256;
-      else if (testout[i] > 255) testout[i] = 255;
+      if (testout[i] < -256)
+	testout[i] = -256;
+      else if (testout[i] > 255)
+	testout[i] = 255;
     }
 
     /* accumulate error stats */
     for (i = 0; i < DCTSIZE2; i++) {
       register int err = testout[i] - refout[i];
+
       sumerrs[i] += err;
       sumsqerrs[i] += err * err;
-      if (err < 0) err = -err;
-      if (maxerr[i] < err) maxerr[i] = err;
+      if (err < 0)
+	err = -err;
+      if (maxerr[i] < err)
+	maxerr[i] = err;
     }
 
     if (curiter % 100 == 99) {
-      fprintf(stderr, ".");
-      fflush(stderr);
+      fprintf (stderr, ".");
+      fflush (stderr);
     }
   }
-  fprintf(stderr, "\n");
+  fprintf (stderr, "\n");
 
   /* print results */
 
-  printf("IEEE test conditions: -L = %ld, +H = %ld, sign = %ld, #iters = %ld\n",
-	 minpix, maxpix, sign, niters);
+  printf
+      ("IEEE test conditions: -L = %ld, +H = %ld, sign = %ld, #iters = %ld\n",
+      minpix, maxpix, sign, niters);
 
-  printf("Speed, min time %lld, max %lld\n", tscmin, tscmax);
+  printf ("Speed, min time %lld, max %lld\n", tscmin, tscmax);
 
-  printf("Peak absolute values of errors:\n");
+  printf ("Peak absolute values of errors:\n");
   for (i = 0, j = 0; i < DCTSIZE2; i++) {
-    if (j < maxerr[i]) j = maxerr[i];
-    printf("%4d", maxerr[i]);
-    if ((i%DCTSIZE) == DCTSIZE-1) printf("\n");
+    if (j < maxerr[i])
+      j = maxerr[i];
+    printf ("%4d", maxerr[i]);
+    if ((i % DCTSIZE) == DCTSIZE - 1)
+      printf ("\n");
   }
-  printf("Worst peak error = %d  (%s spec limit 1)\n\n", j,
-	 meets((double) j, 1.0));
+  printf ("Worst peak error = %d  (%s spec limit 1)\n\n", j,
+      meets ((double) j, 1.0));
 
-  printf("Mean square errors:\n");
+  printf ("Mean square errors:\n");
   max = total = 0.0;
   for (i = 0; i < DCTSIZE2; i++) {
-    double err = (double) sumsqerrs[i]  / ((double) niters);
+    double err = (double) sumsqerrs[i] / ((double) niters);
+
     total += (double) sumsqerrs[i];
-    if (max < err) max = err;
-    printf(" %8.4f", err);
-    if ((i%DCTSIZE) == DCTSIZE-1) printf("\n");
+    if (max < err)
+      max = err;
+    printf (" %8.4f", err);
+    if ((i % DCTSIZE) == DCTSIZE - 1)
+      printf ("\n");
   }
-  printf("Worst pmse = %.6f  (%s spec limit 0.06)\n", max, meets(max, 0.06));
-  total /= (double) (64*niters);
-  printf("Overall mse = %.6f  (%s spec limit 0.02)\n\n", total,
-	 meets(total, 0.02));
+  printf ("Worst pmse = %.6f  (%s spec limit 0.06)\n", max, meets (max, 0.06));
+  total /= (double) (64 * niters);
+  printf ("Overall mse = %.6f  (%s spec limit 0.02)\n\n", total,
+      meets (total, 0.02));
 
-  printf("Mean errors:\n");
+  printf ("Mean errors:\n");
   max = total = 0.0;
   for (i = 0; i < DCTSIZE2; i++) {
-    double err = (double) sumerrs[i]  / ((double) niters);
+    double err = (double) sumerrs[i] / ((double) niters);
+
     total += (double) sumerrs[i];
-    printf(" %8.4f", err);
-    if (err < 0.0) err = -err;
-    if (max < err) max = err;
-    if ((i%DCTSIZE) == DCTSIZE-1) printf("\n");
+    printf (" %8.4f", err);
+    if (err < 0.0)
+      err = -err;
+    if (max < err)
+      max = err;
+    if ((i % DCTSIZE) == DCTSIZE - 1)
+      printf ("\n");
   }
-  printf("Worst mean error = %.6f  (%s spec limit 0.015)\n", max,
-	 meets(max, 0.015));
-  total /= (double) (64*niters);
-  printf("Overall mean error = %.6f  (%s spec limit 0.0015)\n\n", total,
-	 meets(total, 0.0015));
+  printf ("Worst mean error = %.6f  (%s spec limit 0.015)\n", max,
+      meets (max, 0.015));
+  total /= (double) (64 * niters);
+  printf ("Overall mean error = %.6f  (%s spec limit 0.0015)\n\n", total,
+      meets (total, 0.0015));
 
   /* test for 0 input giving 0 output */
-  memset(testout, 0, sizeof(DCTELEM)*DCTSIZE2);
-  gst_idct_convert(idct, testout);
-  for (i = 0, j=0; i < DCTSIZE2; i++) {
+  memset (testout, 0, sizeof (DCTELEM) * DCTSIZE2);
+  gst_idct_convert (idct, testout);
+  for (i = 0, j = 0; i < DCTSIZE2; i++) {
     if (testout[i]) {
-      printf("Position %d of IDCT(0) = %d (FAILS)\n", i, testout[i]);
+      printf ("Position %d of IDCT(0) = %d (FAILS)\n", i, testout[i]);
       j++;
     }
   }
-  printf("%d elements of IDCT(0) were not zero\n\n\n", j);
+  printf ("%d elements of IDCT(0) were not zero\n\n\n", j);
 
-  exit(0);
+  exit (0);
   return 0;
 }
 
 
-void usage (char *msg)
+void
+usage (char *msg)
 {
   if (msg != NULL)
-    fprintf(stderr, "\nerror: %s\n", msg);
-
-  fprintf(stderr, "\n");
-  fprintf(stderr, "usage: ieeetest minpix maxpix sign niters\n");
-  fprintf(stderr, "\n");
-  fprintf(stderr, "  test = 1 - 5\n");
-  fprintf(stderr, "  minpix = -L value per IEEE spec\n");
-  fprintf(stderr, "  maxpix =  H value per IEEE spec\n");
-  fprintf(stderr, "  sign = +1 for normal, -1 to run negated test\n");
-  fprintf(stderr, "  niters = # iterations (10000 for full test)\n");
-  fprintf(stderr, "\n");
-
-  exit(1);
+    fprintf (stderr, "\nerror: %s\n", msg);
+
+  fprintf (stderr, "\n");
+  fprintf (stderr, "usage: ieeetest minpix maxpix sign niters\n");
+  fprintf (stderr, "\n");
+  fprintf (stderr, "  test = 1 - 5\n");
+  fprintf (stderr, "  minpix = -L value per IEEE spec\n");
+  fprintf (stderr, "  maxpix =  H value per IEEE spec\n");
+  fprintf (stderr, "  sign = +1 for normal, -1 to run negated test\n");
+  fprintf (stderr, "  niters = # iterations (10000 for full test)\n");
+  fprintf (stderr, "\n");
+
+  exit (1);
 }
 
 
 /* Pseudo-random generator specified by IEEE 1180 */
 
-long ieeerand (long L, long H)
+long
+ieeerand (long L, long H)
 {
   static long randx = 1;
   static double z = (double) 0x7fffffff;
 
-  long i,j;
+  long i, j;
   double x;
 
   randx = (randx * 1103515245) + 12345;
   i = randx & 0x7ffffffe;
   x = ((double) i) / z;
-  x *= (L+H+1);
+  x *= (L + H + 1);
   j = x;
-  return j-L;
+  return j - L;
 }
 
 
@@ -256,33 +280,35 @@ double coslu[8][8];
 
 
 /* Routine to initialise the cosine lookup table */
-void dct_init(void)
+void
+dct_init (void)
 {
-  int a,b;
+  int a, b;
   double tmp;
 
-  for(a=0;a<8;a++)
-    for(b=0;b<8;b++) {
-      tmp = cos((double)((a+a+1)*b) * (3.14159265358979323846 / 16.0));
-      if(b==0)
-	tmp /= sqrt(2.0);
+  for (a = 0; a < 8; a++)
+    for (b = 0; b < 8; b++) {
+      tmp = cos ((double) ((a + a + 1) * b) * (3.14159265358979323846 / 16.0));
+      if (b == 0)
+	tmp /= sqrt (2.0);
       coslu[a][b] = tmp * 0.5;
     }
 }
 
 
-void ref_fdct (DCTELEM block[8][8])
+void
+ref_fdct (DCTELEM block[8][8])
 {
-  int x,y,u,v;
+  int x, y, u, v;
   double tmp, tmp2;
   double res[8][8];
 
-  for (v=0; v<8; v++) {
-    for (u=0; u<8; u++) {
+  for (v = 0; v < 8; v++) {
+    for (u = 0; u < 8; u++) {
       tmp = 0.0;
-      for (y=0; y<8; y++) {
+      for (y = 0; y < 8; y++) {
 	tmp2 = 0.0;
-	for (x=0; x<8; x++) {
+	for (x = 0; x < 8; x++) {
 	  tmp2 += (double) block[y][x] * coslu[x][u];
 	}
 	tmp += coslu[y][v] * tmp2;
@@ -291,11 +317,11 @@ void ref_fdct (DCTELEM block[8][8])
     }
   }
 
-  for (v=0; v<8; v++) {
-    for (u=0; u<8; u++) {
+  for (v = 0; v < 8; v++) {
+    for (u = 0; u < 8; u++) {
       tmp = res[v][u];
       if (tmp < 0.0) {
-	x = - ((int) (0.5 - tmp));
+	x = -((int) (0.5 - tmp));
       } else {
 	x = (int) (tmp + 0.5);
       }
@@ -305,18 +331,19 @@ void ref_fdct (DCTELEM block[8][8])
 }
 
 
-void ref_idct (DCTELEM block[8][8])
+void
+ref_idct (DCTELEM block[8][8])
 {
-  int x,y,u,v;
+  int x, y, u, v;
   double tmp, tmp2;
   double res[8][8];
 
-  for (y=0; y<8; y++) {
-    for (x=0; x<8; x++) {
+  for (y = 0; y < 8; y++) {
+    for (x = 0; x < 8; x++) {
       tmp = 0.0;
-      for (v=0; v<8; v++) {
+      for (v = 0; v < 8; v++) {
 	tmp2 = 0.0;
-	for (u=0; u<8; u++) {
+	for (u = 0; u < 8; u++) {
 	  tmp2 += (double) block[v][u] * coslu[x][u];
 	}
 	tmp += coslu[y][v] * tmp2;
@@ -325,11 +352,11 @@ void ref_idct (DCTELEM block[8][8])
     }
   }
 
-  for (v=0; v<8; v++) {
-    for (u=0; u<8; u++) {
+  for (v = 0; v < 8; v++) {
+    for (u = 0; u < 8; u++) {
       tmp = res[v][u];
       if (tmp < 0.0) {
-	x = - ((int) (0.5 - tmp));
+	x = -((int) (0.5 - tmp));
       } else {
 	x = (int) (tmp + 0.5);
       }
diff --git a/gst-libs/gst/idct/intidct.c b/gst-libs/gst/idct/intidct.c
index e08e6adb..42f0ac84 100644
--- a/gst-libs/gst/idct/intidct.c
+++ b/gst-libs/gst/idct/intidct.c
@@ -51,10 +51,8 @@
  */
 
 #if DCTSIZE != 8
-  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
+Sorry, this code only copes with 8x8 DCTs.	/* deliberate syntax err */
 #endif
-
-
 /*
  * A 2-D IDCT can be done by 1-D IDCT on each row followed by 1-D IDCT
  * on each column.  Direct algorithms are also available, but they are
@@ -90,7 +88,6 @@
  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
  * shows that the values given below are the most effective.
  */
-
 #ifdef EIGHT_BIT_SAMPLES
 #define CONST_BITS  13
 #define PASS1_BITS  2
@@ -98,22 +95,16 @@
 #define CONST_BITS  13
 #define PASS1_BITS  1		/* lose a little precision to avoid overflow */
 #endif
-
 #define ONE	((INT32) 1)
-
 #define CONST_SCALE (ONE << CONST_BITS)
-
 /* Convert a positive real constant to an integer scaled by CONST_SCALE. */
-
 #define FIX(x)	((INT32) ((x) * CONST_SCALE + 0.5))
-
 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
  * causing a lot of useless floating-point operations at run time.
  * To get around this we use the following pre-calculated constants.
  * If you change CONST_BITS you may want to add appropriate values.
  * (With a reasonable C compiler, you can just rely on the FIX() macro...)
  */
-
 #if CONST_BITS == 13
 #define FIX_0_298631336  ((INT32)  2446)	/* FIX(0.298631336) */
 #define FIX_0_390180644  ((INT32)  3196)	/* FIX(0.390180644) */
@@ -141,15 +132,11 @@
 #define FIX_2_562915447  FIX(2.562915447)
 #define FIX_3_072711026  FIX(3.072711026)
 #endif
-
-
 /* Descale and correctly round an INT32 value that's scaled by N bits.
  * We assume RIGHT_SHIFT rounds towards minus infinity, so adding
  * the fudge factor is correct for either sign of X.
  */
-
 #define DESCALE(x,n)  RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)
-
 /* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
  * For 8-bit samples with the recommended scaling, all the variable
  * and constant values involved are no more than 16 bits wide, so a
@@ -160,7 +147,6 @@
  * combination of casts.
  * NB: for 12-bit samples, a full 32-bit multiplication will be needed.
  */
-
 #ifdef EIGHT_BIT_SAMPLES
 #ifdef SHORTxSHORT_32		/* may work if 'int' is 32 bits */
 #define MULTIPLY(var,const)  (((INT16) (var)) * ((INT16) (const)))
@@ -169,17 +155,13 @@
 #define MULTIPLY(var,const)  (((INT16) (var)) * ((INT32) (const)))
 #endif
 #endif
-
 #ifndef MULTIPLY		/* default definition */
 #define MULTIPLY(var,const)  ((var) * (const))
 #endif
-
-
 /*
  * Perform the inverse DCT on one block of coefficients.
  */
-
-void
+    void
 gst_idct_int_idct (DCTBLOCK data)
 {
   INT32 tmp0, tmp1, tmp2, tmp3;
@@ -187,14 +169,13 @@ gst_idct_int_idct (DCTBLOCK data)
   INT32 z1, z2, z3, z4, z5;
   register DCTELEM *dataptr;
   int rowctr;
-  SHIFT_TEMPS
-
-  /* Pass 1: process rows. */
-  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
-  /* furthermore, we scale the results by 2**PASS1_BITS. */
 
-  dataptr = data;
-  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
+  SHIFT_TEMPS
+      /* Pass 1: process rows. */
+      /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
+      /* furthermore, we scale the results by 2**PASS1_BITS. */
+      dataptr = data;
+  for (rowctr = DCTSIZE - 1; rowctr >= 0; rowctr--) {
     /* Due to quantization, we will usually find that many of the input
      * coefficients are zero, especially the AC terms.  We can exploit this
      * by short-circuiting the IDCT calculation for any row in which all
@@ -205,10 +186,10 @@ gst_idct_int_idct (DCTBLOCK data)
      */
 
     if ((dataptr[1] | dataptr[2] | dataptr[3] | dataptr[4] |
-	 dataptr[5] | dataptr[6] | dataptr[7]) == 0) {
+	    dataptr[5] | dataptr[6] | dataptr[7]) == 0) {
       /* AC terms all zero */
       DCTELEM dcval = (DCTELEM) (dataptr[0] << PASS1_BITS);
-      
+
       dataptr[0] = dcval;
       dataptr[1] = dcval;
       dataptr[2] = dcval;
@@ -217,7 +198,7 @@ gst_idct_int_idct (DCTBLOCK data)
       dataptr[5] = dcval;
       dataptr[6] = dcval;
       dataptr[7] = dcval;
-      
+
       dataptr += DCTSIZE;	/* advance pointer to next row */
       continue;
     }
@@ -228,9 +209,9 @@ gst_idct_int_idct (DCTBLOCK data)
     z2 = (INT32) dataptr[2];
     z3 = (INT32) dataptr[6];
 
-    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
-    tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
-    tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
+    z1 = MULTIPLY (z2 + z3, FIX_0_541196100);
+    tmp2 = z1 + MULTIPLY (z3, -FIX_1_847759065);
+    tmp3 = z1 + MULTIPLY (z2, FIX_0_765366865);
 
     tmp0 = ((INT32) dataptr[0] + (INT32) dataptr[4]) << CONST_BITS;
     tmp1 = ((INT32) dataptr[0] - (INT32) dataptr[4]) << CONST_BITS;
@@ -239,7 +220,7 @@ gst_idct_int_idct (DCTBLOCK data)
     tmp13 = tmp0 - tmp3;
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
-    
+
     /* Odd part per figure 8; the matrix is unitary and hence its
      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
      */
@@ -253,20 +234,20 @@ gst_idct_int_idct (DCTBLOCK data)
     z2 = tmp1 + tmp2;
     z3 = tmp0 + tmp2;
     z4 = tmp1 + tmp3;
-    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
-    
-    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
-    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
-    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
-    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
-    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
-    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
-    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
-    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
-    
+    z5 = MULTIPLY (z3 + z4, FIX_1_175875602);	/* sqrt(2) * c3 */
+
+    tmp0 = MULTIPLY (tmp0, FIX_0_298631336);	/* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp1 = MULTIPLY (tmp1, FIX_2_053119869);	/* sqrt(2) * ( c1+c3-c5+c7) */
+    tmp2 = MULTIPLY (tmp2, FIX_3_072711026);	/* sqrt(2) * ( c1+c3+c5-c7) */
+    tmp3 = MULTIPLY (tmp3, FIX_1_501321110);	/* sqrt(2) * ( c1+c3-c5-c7) */
+    z1 = MULTIPLY (z1, -FIX_0_899976223);	/* sqrt(2) * (c7-c3) */
+    z2 = MULTIPLY (z2, -FIX_2_562915447);	/* sqrt(2) * (-c1-c3) */
+    z3 = MULTIPLY (z3, -FIX_1_961570560);	/* sqrt(2) * (-c3-c5) */
+    z4 = MULTIPLY (z4, -FIX_0_390180644);	/* sqrt(2) * (c5-c3) */
+
     z3 += z5;
     z4 += z5;
-    
+
     tmp0 += z1 + z3;
     tmp1 += z2 + z4;
     tmp2 += z2 + z3;
@@ -274,14 +255,14 @@ gst_idct_int_idct (DCTBLOCK data)
 
     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
 
-    dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
-    dataptr[7] = (DCTELEM) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
-    dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
-    dataptr[6] = (DCTELEM) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
-    dataptr[2] = (DCTELEM) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
-    dataptr[5] = (DCTELEM) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
-    dataptr[3] = (DCTELEM) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
-    dataptr[4] = (DCTELEM) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
+    dataptr[0] = (DCTELEM) DESCALE (tmp10 + tmp3, CONST_BITS - PASS1_BITS);
+    dataptr[7] = (DCTELEM) DESCALE (tmp10 - tmp3, CONST_BITS - PASS1_BITS);
+    dataptr[1] = (DCTELEM) DESCALE (tmp11 + tmp2, CONST_BITS - PASS1_BITS);
+    dataptr[6] = (DCTELEM) DESCALE (tmp11 - tmp2, CONST_BITS - PASS1_BITS);
+    dataptr[2] = (DCTELEM) DESCALE (tmp12 + tmp1, CONST_BITS - PASS1_BITS);
+    dataptr[5] = (DCTELEM) DESCALE (tmp12 - tmp1, CONST_BITS - PASS1_BITS);
+    dataptr[3] = (DCTELEM) DESCALE (tmp13 + tmp0, CONST_BITS - PASS1_BITS);
+    dataptr[4] = (DCTELEM) DESCALE (tmp13 - tmp0, CONST_BITS - PASS1_BITS);
 
     dataptr += DCTSIZE;		/* advance pointer to next row */
   }
@@ -291,7 +272,7 @@ gst_idct_int_idct (DCTBLOCK data)
   /* and also undo the PASS1_BITS scaling. */
 
   dataptr = data;
-  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
+  for (rowctr = DCTSIZE - 1; rowctr >= 0; rowctr--) {
     /* Columns of zeroes can be exploited in the same way as we did with rows.
      * However, the row calculation has created many nonzero AC terms, so the
      * simplification applies less often (typically 5% to 10% of the time).
@@ -301,21 +282,21 @@ gst_idct_int_idct (DCTBLOCK data)
      */
 
 #ifndef NO_ZERO_COLUMN_TEST
-    if ((dataptr[DCTSIZE*1] | dataptr[DCTSIZE*2] | dataptr[DCTSIZE*3] |
-	 dataptr[DCTSIZE*4] | dataptr[DCTSIZE*5] | dataptr[DCTSIZE*6] |
-	 dataptr[DCTSIZE*7]) == 0) {
+    if ((dataptr[DCTSIZE * 1] | dataptr[DCTSIZE * 2] | dataptr[DCTSIZE * 3] |
+	    dataptr[DCTSIZE * 4] | dataptr[DCTSIZE * 5] | dataptr[DCTSIZE * 6] |
+	    dataptr[DCTSIZE * 7]) == 0) {
       /* AC terms all zero */
-      DCTELEM dcval = (DCTELEM) DESCALE((INT32) dataptr[0], PASS1_BITS+3);
-      
-      dataptr[DCTSIZE*0] = dcval;
-      dataptr[DCTSIZE*1] = dcval;
-      dataptr[DCTSIZE*2] = dcval;
-      dataptr[DCTSIZE*3] = dcval;
-      dataptr[DCTSIZE*4] = dcval;
-      dataptr[DCTSIZE*5] = dcval;
-      dataptr[DCTSIZE*6] = dcval;
-      dataptr[DCTSIZE*7] = dcval;
-      
+      DCTELEM dcval = (DCTELEM) DESCALE ((INT32) dataptr[0], PASS1_BITS + 3);
+
+      dataptr[DCTSIZE * 0] = dcval;
+      dataptr[DCTSIZE * 1] = dcval;
+      dataptr[DCTSIZE * 2] = dcval;
+      dataptr[DCTSIZE * 3] = dcval;
+      dataptr[DCTSIZE * 4] = dcval;
+      dataptr[DCTSIZE * 5] = dcval;
+      dataptr[DCTSIZE * 6] = dcval;
+      dataptr[DCTSIZE * 7] = dcval;
+
       dataptr++;		/* advance pointer to next column */
       continue;
     }
@@ -324,48 +305,52 @@ gst_idct_int_idct (DCTBLOCK data)
     /* Even part: reverse the even part of the forward DCT. */
     /* The rotator is sqrt(2)*c(-6). */
 
-    z2 = (INT32) dataptr[DCTSIZE*2];
-    z3 = (INT32) dataptr[DCTSIZE*6];
+    z2 = (INT32) dataptr[DCTSIZE * 2];
+    z3 = (INT32) dataptr[DCTSIZE * 6];
 
-    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
-    tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
-    tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
+    z1 = MULTIPLY (z2 + z3, FIX_0_541196100);
+    tmp2 = z1 + MULTIPLY (z3, -FIX_1_847759065);
+    tmp3 = z1 + MULTIPLY (z2, FIX_0_765366865);
 
-    tmp0 = ((INT32) dataptr[DCTSIZE*0] + (INT32) dataptr[DCTSIZE*4]) << CONST_BITS;
-    tmp1 = ((INT32) dataptr[DCTSIZE*0] - (INT32) dataptr[DCTSIZE*4]) << CONST_BITS;
+    tmp0 =
+	((INT32) dataptr[DCTSIZE * 0] +
+	(INT32) dataptr[DCTSIZE * 4]) << CONST_BITS;
+    tmp1 =
+	((INT32) dataptr[DCTSIZE * 0] -
+	(INT32) dataptr[DCTSIZE * 4]) << CONST_BITS;
 
     tmp10 = tmp0 + tmp3;
     tmp13 = tmp0 - tmp3;
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
-    
+
     /* Odd part per figure 8; the matrix is unitary and hence its
      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
      */
 
-    tmp0 = (INT32) dataptr[DCTSIZE*7];
-    tmp1 = (INT32) dataptr[DCTSIZE*5];
-    tmp2 = (INT32) dataptr[DCTSIZE*3];
-    tmp3 = (INT32) dataptr[DCTSIZE*1];
+    tmp0 = (INT32) dataptr[DCTSIZE * 7];
+    tmp1 = (INT32) dataptr[DCTSIZE * 5];
+    tmp2 = (INT32) dataptr[DCTSIZE * 3];
+    tmp3 = (INT32) dataptr[DCTSIZE * 1];
 
     z1 = tmp0 + tmp3;
     z2 = tmp1 + tmp2;
     z3 = tmp0 + tmp2;
     z4 = tmp1 + tmp3;
-    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
-    
-    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
-    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
-    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
-    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
-    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
-    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
-    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
-    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
-    
+    z5 = MULTIPLY (z3 + z4, FIX_1_175875602);	/* sqrt(2) * c3 */
+
+    tmp0 = MULTIPLY (tmp0, FIX_0_298631336);	/* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp1 = MULTIPLY (tmp1, FIX_2_053119869);	/* sqrt(2) * ( c1+c3-c5+c7) */
+    tmp2 = MULTIPLY (tmp2, FIX_3_072711026);	/* sqrt(2) * ( c1+c3+c5-c7) */
+    tmp3 = MULTIPLY (tmp3, FIX_1_501321110);	/* sqrt(2) * ( c1+c3-c5-c7) */
+    z1 = MULTIPLY (z1, -FIX_0_899976223);	/* sqrt(2) * (c7-c3) */
+    z2 = MULTIPLY (z2, -FIX_2_562915447);	/* sqrt(2) * (-c1-c3) */
+    z3 = MULTIPLY (z3, -FIX_1_961570560);	/* sqrt(2) * (-c3-c5) */
+    z4 = MULTIPLY (z4, -FIX_0_390180644);	/* sqrt(2) * (c5-c3) */
+
     z3 += z5;
     z4 += z5;
-    
+
     tmp0 += z1 + z3;
     tmp1 += z2 + z4;
     tmp2 += z2 + z3;
@@ -373,23 +358,23 @@ gst_idct_int_idct (DCTBLOCK data)
 
     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
 
-    dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp3,
-					   CONST_BITS+PASS1_BITS+3);
-    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp10 - tmp3,
-					   CONST_BITS+PASS1_BITS+3);
-    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp11 + tmp2,
-					   CONST_BITS+PASS1_BITS+3);
-    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(tmp11 - tmp2,
-					   CONST_BITS+PASS1_BITS+3);
-    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp12 + tmp1,
-					   CONST_BITS+PASS1_BITS+3);
-    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12 - tmp1,
-					   CONST_BITS+PASS1_BITS+3);
-    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp13 + tmp0,
-					   CONST_BITS+PASS1_BITS+3);
-    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp13 - tmp0,
-					   CONST_BITS+PASS1_BITS+3);
-    
+    dataptr[DCTSIZE * 0] = (DCTELEM) DESCALE (tmp10 + tmp3,
+	CONST_BITS + PASS1_BITS + 3);
+    dataptr[DCTSIZE * 7] = (DCTELEM) DESCALE (tmp10 - tmp3,
+	CONST_BITS + PASS1_BITS + 3);
+    dataptr[DCTSIZE * 1] = (DCTELEM) DESCALE (tmp11 + tmp2,
+	CONST_BITS + PASS1_BITS + 3);
+    dataptr[DCTSIZE * 6] = (DCTELEM) DESCALE (tmp11 - tmp2,
+	CONST_BITS + PASS1_BITS + 3);
+    dataptr[DCTSIZE * 2] = (DCTELEM) DESCALE (tmp12 + tmp1,
+	CONST_BITS + PASS1_BITS + 3);
+    dataptr[DCTSIZE * 5] = (DCTELEM) DESCALE (tmp12 - tmp1,
+	CONST_BITS + PASS1_BITS + 3);
+    dataptr[DCTSIZE * 3] = (DCTELEM) DESCALE (tmp13 + tmp0,
+	CONST_BITS + PASS1_BITS + 3);
+    dataptr[DCTSIZE * 4] = (DCTELEM) DESCALE (tmp13 - tmp0,
+	CONST_BITS + PASS1_BITS + 3);
+
     dataptr++;			/* advance pointer to next column */
   }
 }
diff --git a/gst-libs/gst/idct/mmx32idct.c b/gst-libs/gst/idct/mmx32idct.c
index 3b640976..cd191f0c 100644
--- a/gst-libs/gst/idct/mmx32idct.c
+++ b/gst-libs/gst/idct/mmx32idct.c
@@ -19,9 +19,8 @@
  *  along with GNU Make; see the file COPYING.  If not, write to
  *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
  *
- */
-
-
+ */  
+    
 /* MMX32 iDCT algorithm  (IEEE-1180 compliant) :: idct_mmx32()
 */
 /*
 */
 /* MPEG2AVI
 */
@@ -102,8 +101,7 @@
 /*
 */
 /*   liaor@umcc.ais.org  http://members.tripod.com/~liaor
 */
 /*  
 */
-
-
+    
 /*;=============================================================================
 */
 /*;
 */
 /*;  AP-922   http://developer.intel.com/vtune/cbts/strmsimd
 */
@@ -113,68 +111,67 @@
 /*;=============================================================================
 */
 /*
 mword typedef qword
-qword ptr equ mword ptr */
-
+qword ptr equ mword ptr */ 
 #ifdef HAVE_CONFIG_H
 #include "config.h"
 #endif
-
 #include <mmx.h>
-
+    
 #define BITS_INV_ACC	4	/*; 4 or 5 for IEEE
 */
-	/* 5 yields higher accuracy, but lessens dynamic range on the input matrix
- */
+    /* 5 yields higher accuracy, but lessens dynamic range on the input matrix
+ */
 #define SHIFT_INV_ROW	(16 - BITS_INV_ACC)
-#define SHIFT_INV_COL	(1 + BITS_INV_ACC +14 )  /* changed from Intel's val)
- */
+#define SHIFT_INV_COL	(1 + BITS_INV_ACC +14 )	/* changed from Intel's val)
+ */
 /*#define SHIFT_INV_COL	(1 + BITS_INV_ACC )
 */
-
+    
 #define RND_INV_ROW		(1 << (SHIFT_INV_ROW-1))
 #define RND_INV_COL		(1 << (SHIFT_INV_COL-1)) 
-#define RND_INV_CORR	(RND_INV_COL - 1)		/*; correction -1.0 and round
- */
+#define RND_INV_CORR	(RND_INV_COL - 1)	/*; correction -1.0 and round
+ */
 /*#define RND_INV_ROW		(1024 * (6 - BITS_INV_ACC)) //; 1 << (SHIFT_INV_ROW-1)
 */
 /*#define RND_INV_COL		(16 * (BITS_INV_ACC - 3)) //; 1 << (SHIFT_INV_COL-1)
 */
-
-
+    
 /*.data
 */
 /*Align 16
 */
-const static long r_inv_row[2] = { RND_INV_ROW, RND_INV_ROW};
-const static long r_inv_col[2] = {RND_INV_COL, RND_INV_COL};
-const static long r_inv_corr[2] = {RND_INV_CORR, RND_INV_CORR };
-
+const static long r_inv_row[2] = { RND_INV_ROW, RND_INV_ROW };
+
+const static long r_inv_col[2] = { RND_INV_COL, RND_INV_COL };
+
+const static long r_inv_corr[2] = { RND_INV_CORR, RND_INV_CORR };
+
+
 /*const static short r_inv_col[4] = 
 */
 /*	{RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL};
 */
 /*const static short r_inv_corr[4] =
 */
 /*	{RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR};
 */
-
+    
 /* constants for the forward DCT
 
 /*#define BITS_FRW_ACC	3 //; 2 or 3 for accuracy
 */
 /*#define SHIFT_FRW_COL	BITS_FRW_ACC
 */
 /*#define SHIFT_FRW_ROW	(BITS_FRW_ACC + 17)
 */
 /*#define RND_FRW_ROW		(262144 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_ROW-1)
 */
-
-const static __int64 one_corr = 0x0001000100010001;
-const static long r_frw_row[2] = {RND_FRW_ROW, RND_FRW_ROW };
-
+
+const static __int64 one_corr = 0x0001000100010001;
+
+const static long r_frw_row[2] = { RND_FRW_ROW, RND_FRW_ROW };
+
+
 /*const static short tg_1_16[4] = {13036, 13036, 13036, 13036 }; //tg * (2<<16) + 0.5
 */
 /*const static short tg_2_16[4] = {27146, 27146, 27146, 27146 }; //tg * (2<<16) + 0.5
 */
 /*const static short tg_3_16[4] = {-21746, -21746, -21746, -21746 }; //tg * (2<<16) + 0.5
 */
 /*const static short cos_4_16[4] = {-19195, -19195, -19195, -19195 }; //cos * (2<<16) + 0.5
 */
 /*const static short ocos_4_16[4] = {23170, 23170, 23170, 23170 }; //cos * (2<<15) + 0.5
 */
-
+    
 /*concatenated table, for forward DCT transformation
 */
-const static short tg_all_16[] = {
-	13036, 13036, 13036, 13036,		/* tg * (2<<16) + 0.5
- */
-	27146, 27146, 27146, 27146,		/*tg * (2<<16) + 0.5
- */
-	-21746, -21746, -21746, -21746,	/* tg * (2<<16) + 0.5
- */
-	-19195, -19195, -19195, -19195,	/*cos * (2<<16) + 0.5
- */
-	23170, 23170, 23170, 23170 };	/*cos * (2<<15) + 0.5
- */
+const static short tg_all_16[] = { 
+13036, 13036, 13036, 13036,	/* tg * (2<<16) + 0.5
+ */
+  27146, 27146, 27146, 27146,	/*tg * (2<<16) + 0.5
+ */
+  -21746, -21746, -21746, -21746,	/* tg * (2<<16) + 0.5
+ */
+  -19195, -19195, -19195, -19195,	/*cos * (2<<16) + 0.5
+ */
+  23170, 23170, 23170, 23170
+};				/*cos * (2<<15) + 0.5
+ */
+
 
 #define tg_1_16 (tg_all_16 + 0)
 #define tg_2_16 (tg_all_16 + 8)
 #define tg_3_16 (tg_all_16 + 16)
 #define cos_4_16 (tg_all_16 + 24)
 #define ocos_4_16 (tg_all_16 + 32)
-*/
+    */
 /*
 ;=============================================================================
 ;
@@ -236,552 +233,456 @@ IF _MMX ; MMX code
 ;=============================================================================
 
 /*; Table for rows 0,4 - constants are multiplied by cos_4_16
 */
-const short tab_i_04[] = {
-	16384, 16384, 16384, -16384,	/* ; movq-> w06 w04 w02 w00
- */
-	21407, 8867, 8867, -21407,		/* w07 w05 w03 w01
- */
-	16384, -16384, 16384, 16384,	/*; w14 w12 w10 w08
- */
-	-8867, 21407, -21407, -8867,	/*; w15 w13 w11 w09
- */
-	22725, 12873, 19266, -22725,	/*; w22 w20 w18 w16
- */
-	19266, 4520, -4520, -12873,		/*; w23 w21 w19 w17
- */
-	12873, 4520, 4520, 19266,		/*; w30 w28 w26 w24
- */
-	-22725, 19266, -12873, -22725 };/*w31 w29 w27 w25
- */
+const short tab_i_04[] = { 
+16384, 16384, 16384, -16384,	/* ; movq-> w06 w04 w02 w00
+ */
+  21407, 8867, 8867, -21407,	/* w07 w05 w03 w01
+ */
+  16384, -16384, 16384, 16384,	/*; w14 w12 w10 w08
+ */
+  -8867, 21407, -21407, -8867,	/*; w15 w13 w11 w09
+ */
+  22725, 12873, 19266, -22725,	/*; w22 w20 w18 w16
+ */
+  19266, 4520, -4520, -12873,	/*; w23 w21 w19 w17
+ */
+  12873, 4520, 4520, 19266,	/*; w30 w28 w26 w24
+ */
+  -22725, 19266, -12873, -22725
+};				/*w31 w29 w27 w25
+ */
+
 
 /*; Table for rows 1,7 - constants are multiplied by cos_1_16
 */
-const short tab_i_17[] = {
-	22725, 22725, 22725, -22725,	/* ; movq-> w06 w04 w02 w00
- */
-	29692, 12299, 12299, -29692,	/*	; w07 w05 w03 w01
- */
-	22725, -22725, 22725, 22725,	/*; w14 w12 w10 w08
- */
-	-12299, 29692, -29692, -12299,	/*; w15 w13 w11 w09
- */
-	31521, 17855, 26722, -31521,	/*; w22 w20 w18 w16
- */
-	26722, 6270, -6270, -17855,		/*; w23 w21 w19 w17
- */
-	17855, 6270, 6270, 26722,		/*; w30 w28 w26 w24
- */
-	-31521, 26722, -17855, -31521};	/* w31 w29 w27 w25
- */
+const short tab_i_17[] = { 
+22725, 22725, 22725, -22725,	/* ; movq-> w06 w04 w02 w00
+ */
+  29692, 12299, 12299, -29692,	/*      ; w07 w05 w03 w01
+ */
+  22725, -22725, 22725, 22725,	/*; w14 w12 w10 w08
+ */
+  -12299, 29692, -29692, -12299,	/*; w15 w13 w11 w09
+ */
+  31521, 17855, 26722, -31521,	/*; w22 w20 w18 w16
+ */
+  26722, 6270, -6270, -17855,	/*; w23 w21 w19 w17
+ */
+  17855, 6270, 6270, 26722,	/*; w30 w28 w26 w24
+ */
+  -31521, 26722, -17855, -31521
+};				/* w31 w29 w27 w25
+ */
+
 
 /*; Table for rows 2,6 - constants are multiplied by cos_2_16
 */
-const short tab_i_26[] = {
-	21407, 21407, 21407, -21407,	/* ; movq-> w06 w04 w02 w00
- */
-	27969, 11585, 11585, -27969,	/* ; w07 w05 w03 w01
- */
-	21407, -21407, 21407, 21407,	/* ; w14 w12 w10 w08
- */
-	-11585, 27969, -27969, -11585,	/*  ;w15 w13 w11 w09
- */
-	29692, 16819, 25172, -29692, 	/* ;w22 w20 w18 w16
- */
-	25172, 5906, -5906, -16819, 	/* ;w23 w21 w19 w17
- */
-	16819, 5906, 5906, 25172, 		/* ;w30 w28 w26 w24
- */
-	-29692, 25172, -16819, -29692};	/*  ;w31 w29 w27 w25
- */
-
-
+const short tab_i_26[] = { 
+21407, 21407, 21407, -21407,	/* ; movq-> w06 w04 w02 w00
+ */
+  27969, 11585, 11585, -27969,	/* ; w07 w05 w03 w01
+ */
+  21407, -21407, 21407, 21407,	/* ; w14 w12 w10 w08
+ */
+  -11585, 27969, -27969, -11585,	/*  ;w15 w13 w11 w09
+ */
+  29692, 16819, 25172, -29692,	/* ;w22 w20 w18 w16
+ */
+  25172, 5906, -5906, -16819,	/* ;w23 w21 w19 w17
+ */
+  16819, 5906, 5906, 25172,	/* ;w30 w28 w26 w24
+ */
+  -29692, 25172, -16819, -29692
+};				/*  ;w31 w29 w27 w25
+ */
+
+
 /*; Table for rows 3,5 - constants are multiplied by cos_3_16
 */
-const short tab_i_35[] = {
-	19266, 19266, 19266, -19266,	/*; movq-> w06 w04 w02 w00
- */
-	25172, 10426, 10426, -25172,	/*; w07 w05 w03 w01
- */
-	19266, -19266, 19266, 19266,	/*; w14 w12 w10 w08
- */
-	-10426, 25172, -25172, -10426,	/*; w15 w13 w11 w09
- */
-	26722, 15137, 22654, -26722,	/*; w22 w20 w18 w16
- */
-	22654, 5315, -5315, -15137,		/*; w23 w21 w19 w17
- */
-	15137, 5315, 5315, 22654,		/*; w30 w28 w26 w24
- */
-	-26722, 22654, -15137, -26722};	/*; w31 w29 w27 w25
- */
-*/
-
+const short tab_i_35[] = { 
+19266, 19266, 19266, -19266,	/*; movq-> w06 w04 w02 w00
+ */
+  25172, 10426, 10426, -25172,	/*; w07 w05 w03 w01
+ */
+  19266, -19266, 19266, 19266,	/*; w14 w12 w10 w08
+ */
+  -10426, 25172, -25172, -10426,	/*; w15 w13 w11 w09
+ */
+  26722, 15137, 22654, -26722,	/*; w22 w20 w18 w16
+ */
+  22654, 5315, -5315, -15137,	/*; w23 w21 w19 w17
+ */
+  15137, 5315, 5315, 22654,	/*; w30 w28 w26 w24
+ */
+  -26722, 22654, -15137, -26722
+};				/*; w31 w29 w27 w25
+ */
+
+*/
 /* CONCATENATED TABLE, rows 0,1,2,3,4,5,6,7 (in order )
 */
 /*
 */
 /* In our implementation, however, we only use row0 !
 */
 /*
 */
-static const short tab_i_01234567[] = {
-	/*row0, this row is required
- */
-	16384, 16384, 16384, -16384,	/* ; movq-> w06 w04 w02 w00
- */
-	21407, 8867, 8867, -21407,		/* w07 w05 w03 w01
- */
-	16384, -16384, 16384, 16384,	/*; w14 w12 w10 w08
- */
-	-8867, 21407, -21407, -8867,	/*; w15 w13 w11 w09
- */
-	22725, 12873, 19266, -22725,	/*; w22 w20 w18 w16
- */
-	19266, 4520, -4520, -12873,		/*; w23 w21 w19 w17
- */
-	12873, 4520, 4520, 19266,		/*; w30 w28 w26 w24
- */
-	-22725, 19266, -12873, -22725,  /*w31 w29 w27 w25
- */
-
-	/* the rest of these rows (1-7), aren't used !
- */
-
-	/*row1
- */
-	22725, 22725, 22725, -22725,	/* ; movq-> w06 w04 w02 w00
- */
-	29692, 12299, 12299, -29692,	/*	; w07 w05 w03 w01
- */
-	22725, -22725, 22725, 22725,	/*; w14 w12 w10 w08
- */
-	-12299, 29692, -29692, -12299,	/*; w15 w13 w11 w09
- */
-	31521, 17855, 26722, -31521,	/*; w22 w20 w18 w16
- */
-	26722, 6270, -6270, -17855,		/*; w23 w21 w19 w17
- */
-	17855, 6270, 6270, 26722,		/*; w30 w28 w26 w24
- */
-	-31521, 26722, -17855, -31521,	/* w31 w29 w27 w25
- */
-
-	/*row2
- */
-	21407, 21407, 21407, -21407,	/* ; movq-> w06 w04 w02 w00
- */
-	27969, 11585, 11585, -27969,	/* ; w07 w05 w03 w01
- */
-	21407, -21407, 21407, 21407,	/* ; w14 w12 w10 w08
- */
-	-11585, 27969, -27969, -11585,	/*  ;w15 w13 w11 w09
- */
-	29692, 16819, 25172, -29692, 	/* ;w22 w20 w18 w16
- */
-	25172, 5906, -5906, -16819, 	/* ;w23 w21 w19 w17
- */
-	16819, 5906, 5906, 25172, 		/* ;w30 w28 w26 w24
- */
-	-29692, 25172, -16819, -29692,	/*  ;w31 w29 w27 w25
- */
-
-	/*row3
- */
-	19266, 19266, 19266, -19266,	/*; movq-> w06 w04 w02 w00
- */
-	25172, 10426, 10426, -25172,	/*; w07 w05 w03 w01
- */
-	19266, -19266, 19266, 19266,	/*; w14 w12 w10 w08
- */
-	-10426, 25172, -25172, -10426,	/*; w15 w13 w11 w09
- */
-	26722, 15137, 22654, -26722,	/*; w22 w20 w18 w16
- */
-	22654, 5315, -5315, -15137,		/*; w23 w21 w19 w17
- */
-	15137, 5315, 5315, 22654,		/*; w30 w28 w26 w24
- */
-	-26722, 22654, -15137, -26722,	/*; w31 w29 w27 w25
- */
-
-	/*row4
- */
-	16384, 16384, 16384, -16384,	/* ; movq-> w06 w04 w02 w00
- */
-	21407, 8867, 8867, -21407,		/* w07 w05 w03 w01
- */
-	16384, -16384, 16384, 16384,	/*; w14 w12 w10 w08
- */
-	-8867, 21407, -21407, -8867,	/*; w15 w13 w11 w09
- */
-	22725, 12873, 19266, -22725,	/*; w22 w20 w18 w16
- */
-	19266, 4520, -4520, -12873,		/*; w23 w21 w19 w17
- */
-	12873, 4520, 4520, 19266,		/*; w30 w28 w26 w24
- */
-	-22725, 19266, -12873, -22725,  /*w31 w29 w27 w25
- */
-
-	/*row5
- */
-	19266, 19266, 19266, -19266,	/*; movq-> w06 w04 w02 w00
- */
-	25172, 10426, 10426, -25172,	/*; w07 w05 w03 w01
- */
-	19266, -19266, 19266, 19266,	/*; w14 w12 w10 w08
- */
-	-10426, 25172, -25172, -10426,	/*; w15 w13 w11 w09
- */
-	26722, 15137, 22654, -26722,	/*; w22 w20 w18 w16
- */
-	22654, 5315, -5315, -15137,		/*; w23 w21 w19 w17
- */
-	15137, 5315, 5315, 22654,		/*; w30 w28 w26 w24
- */
-	-26722, 22654, -15137, -26722,	/*; w31 w29 w27 w25
- */
-
-	/*row6
- */
-	21407, 21407, 21407, -21407,	/* ; movq-> w06 w04 w02 w00
- */
-	27969, 11585, 11585, -27969,	/* ; w07 w05 w03 w01
- */
-	21407, -21407, 21407, 21407,	/* ; w14 w12 w10 w08
- */
-	-11585, 27969, -27969, -11585,	/*  ;w15 w13 w11 w09
- */
-	29692, 16819, 25172, -29692, 	/* ;w22 w20 w18 w16
- */
-	25172, 5906, -5906, -16819, 	/* ;w23 w21 w19 w17
- */
-	16819, 5906, 5906, 25172, 		/* ;w30 w28 w26 w24
- */
-	-29692, 25172, -16819, -29692,	/*  ;w31 w29 w27 w25
- */
-
-	/*row7
- */
-	22725, 22725, 22725, -22725,	/* ; movq-> w06 w04 w02 w00
- */
-	29692, 12299, 12299, -29692,	/*	; w07 w05 w03 w01
- */
-	22725, -22725, 22725, 22725,	/*; w14 w12 w10 w08
- */
-	-12299, 29692, -29692, -12299,	/*; w15 w13 w11 w09
- */
-	31521, 17855, 26722, -31521,	/*; w22 w20 w18 w16
- */
-	26722, 6270, -6270, -17855,		/*; w23 w21 w19 w17
- */
-	17855, 6270, 6270, 26722,		/*; w30 w28 w26 w24
- */
-	-31521, 26722, -17855, -31521};	/* w31 w29 w27 w25
- */
-
-
-#define INP eax		/* pointer to (short *blk)
- */
-#define OUT ecx		/* pointer to output (temporary store space qwTemp[])
- */
-#define TABLE ebx	/* pointer to tab_i_01234567[]
- */
+static const short tab_i_01234567[] = { 
+      /*row0, this row is required
+ */
+      16384, 16384, 16384, -16384,	/* ; movq-> w06 w04 w02 w00
+ */
+  21407, 8867, 8867, -21407,	/* w07 w05 w03 w01
+ */
+  16384, -16384, 16384, 16384,	/*; w14 w12 w10 w08
+ */
+  -8867, 21407, -21407, -8867,	/*; w15 w13 w11 w09
+ */
+  22725, 12873, 19266, -22725,	/*; w22 w20 w18 w16
+ */
+  19266, 4520, -4520, -12873,	/*; w23 w21 w19 w17
+ */
+  12873, 4520, 4520, 19266,	/*; w30 w28 w26 w24
+ */
+  -22725, 19266, -12873, -22725,	/*w31 w29 w27 w25
+ */
+  
+      /* the rest of these rows (1-7), aren't used !
+ */
+      
+      /*row1
+ */
+      22725, 22725, 22725, -22725,	/* ; movq-> w06 w04 w02 w00
+ */
+  29692, 12299, 12299, -29692,	/*      ; w07 w05 w03 w01
+ */
+  22725, -22725, 22725, 22725,	/*; w14 w12 w10 w08
+ */
+  -12299, 29692, -29692, -12299,	/*; w15 w13 w11 w09
+ */
+  31521, 17855, 26722, -31521,	/*; w22 w20 w18 w16
+ */
+  26722, 6270, -6270, -17855,	/*; w23 w21 w19 w17
+ */
+  17855, 6270, 6270, 26722,	/*; w30 w28 w26 w24
+ */
+  -31521, 26722, -17855, -31521,	/* w31 w29 w27 w25
+ */
+  
+      /*row2
+ */
+      21407, 21407, 21407, -21407,	/* ; movq-> w06 w04 w02 w00
+ */
+  27969, 11585, 11585, -27969,	/* ; w07 w05 w03 w01
+ */
+  21407, -21407, 21407, 21407,	/* ; w14 w12 w10 w08
+ */
+  -11585, 27969, -27969, -11585,	/*  ;w15 w13 w11 w09
+ */
+  29692, 16819, 25172, -29692,	/* ;w22 w20 w18 w16
+ */
+  25172, 5906, -5906, -16819,	/* ;w23 w21 w19 w17
+ */
+  16819, 5906, 5906, 25172,	/* ;w30 w28 w26 w24
+ */
+  -29692, 25172, -16819, -29692,	/*  ;w31 w29 w27 w25
+ */
+  
+      /*row3
+ */
+      19266, 19266, 19266, -19266,	/*; movq-> w06 w04 w02 w00
+ */
+  25172, 10426, 10426, -25172,	/*; w07 w05 w03 w01
+ */
+  19266, -19266, 19266, 19266,	/*; w14 w12 w10 w08
+ */
+  -10426, 25172, -25172, -10426,	/*; w15 w13 w11 w09
+ */
+  26722, 15137, 22654, -26722,	/*; w22 w20 w18 w16
+ */
+  22654, 5315, -5315, -15137,	/*; w23 w21 w19 w17
+ */
+  15137, 5315, 5315, 22654,	/*; w30 w28 w26 w24
+ */
+  -26722, 22654, -15137, -26722,	/*; w31 w29 w27 w25
+ */
+  
+      /*row4
+ */
+      16384, 16384, 16384, -16384,	/* ; movq-> w06 w04 w02 w00
+ */
+  21407, 8867, 8867, -21407,	/* w07 w05 w03 w01
+ */
+  16384, -16384, 16384, 16384,	/*; w14 w12 w10 w08
+ */
+  -8867, 21407, -21407, -8867,	/*; w15 w13 w11 w09
+ */
+  22725, 12873, 19266, -22725,	/*; w22 w20 w18 w16
+ */
+  19266, 4520, -4520, -12873,	/*; w23 w21 w19 w17
+ */
+  12873, 4520, 4520, 19266,	/*; w30 w28 w26 w24
+ */
+  -22725, 19266, -12873, -22725,	/*w31 w29 w27 w25
+ */
+  
+      /*row5
+ */
+      19266, 19266, 19266, -19266,	/*; movq-> w06 w04 w02 w00
+ */
+  25172, 10426, 10426, -25172,	/*; w07 w05 w03 w01
+ */
+  19266, -19266, 19266, 19266,	/*; w14 w12 w10 w08
+ */
+  -10426, 25172, -25172, -10426,	/*; w15 w13 w11 w09
+ */
+  26722, 15137, 22654, -26722,	/*; w22 w20 w18 w16
+ */
+  22654, 5315, -5315, -15137,	/*; w23 w21 w19 w17
+ */
+  15137, 5315, 5315, 22654,	/*; w30 w28 w26 w24
+ */
+  -26722, 22654, -15137, -26722,	/*; w31 w29 w27 w25
+ */
+  
+      /*row6
+ */
+      21407, 21407, 21407, -21407,	/* ; movq-> w06 w04 w02 w00
+ */
+  27969, 11585, 11585, -27969,	/* ; w07 w05 w03 w01
+ */
+  21407, -21407, 21407, 21407,	/* ; w14 w12 w10 w08
+ */
+  -11585, 27969, -27969, -11585,	/*  ;w15 w13 w11 w09
+ */
+  29692, 16819, 25172, -29692,	/* ;w22 w20 w18 w16
+ */
+  25172, 5906, -5906, -16819,	/* ;w23 w21 w19 w17
+ */
+  16819, 5906, 5906, 25172,	/* ;w30 w28 w26 w24
+ */
+  -29692, 25172, -16819, -29692,	/*  ;w31 w29 w27 w25
+ */
+  
+      /*row7
+ */
+      22725, 22725, 22725, -22725,	/* ; movq-> w06 w04 w02 w00
+ */
+  29692, 12299, 12299, -29692,	/*      ; w07 w05 w03 w01
+ */
+  22725, -22725, 22725, 22725,	/*; w14 w12 w10 w08
+ */
+  -12299, 29692, -29692, -12299,	/*; w15 w13 w11 w09
+ */
+  31521, 17855, 26722, -31521,	/*; w22 w20 w18 w16
+ */
+  26722, 6270, -6270, -17855,	/*; w23 w21 w19 w17
+ */
+  17855, 6270, 6270, 26722,	/*; w30 w28 w26 w24
+ */
+  -31521, 26722, -17855, -31521
+};				/* w31 w29 w27 w25
+ */
+
+
+#define INP eax			/* pointer to (short *blk)
+ */
+#define OUT ecx			/* pointer to output (temporary store space qwTemp[])
+ */
+#define TABLE ebx		/* pointer to tab_i_01234567[]
+ */
 #define round_inv_row edx
 #define round_inv_col edx
-
-#define ROW_STRIDE 8 /* for 8x8 matrix transposer
- */
-
+    
+#define ROW_STRIDE 8		/* for 8x8 matrix transposer
+ */
+    
 /* private variables and functions
 */
-
+    
 /*temporary storage space, 8x8 of shorts
 */
-
-__inline static void idct_mmx32_rows( short *blk ); /* transform rows
- */
-__inline static void idct_mmx32_cols( short *blk ); /* transform "columns"
- */
+
+__inline static void idct_mmx32_rows (short *blk);	/* transform rows
+ */
+__inline static void idct_mmx32_cols (short *blk);	/* transform "columns"
+ */
+
 	/* the "column" transform actually transforms rows, it is
 */
 	/* identical to the row-transform except for the ROUNDING
 */
 	/* and SHIFTING coefficients.
 */
-
- 
-static void 
-idct_mmx32_rows( short *blk )	/* transform all 8 rows of 8x8 iDCT block
- */
-{
-  int x;
-  short qwTemp[64];
-  short *out = &qwTemp[0];
-  short *inptr = blk;
-  /* this subroutine performs two operations
- */
-  /* 1) iDCT row transform
- */
-  /*		for( i = 0; i < 8; ++ i)
- */
-  /*			DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] );
- */
-  /*
- */
-  /* 2) transpose the matrix (which was stored in qwTemp[])
- */
-  /*        qwTemp[] -> [8x8 matrix transpose] -> blk[]
- */
-
-  for (x=0; x<8; x++) {  /* transform one row per iteration
- */
-	 movq_m2r(*(inptr), mm0);		/* 0 ; x3 x2 x1 x0
- */
-
-	 movq_m2r(*(inptr+4), mm1);	/* 1 ; x7 x6 x5 x4
- */
-	 movq_r2r(mm0, mm2);				/* 2 ; x3 x2 x1 x0
- */
-
-	 movq_m2r(*(tab_i_01234567), mm3);	/* 3 ; w06 w04 w02 w00
- */
-	 punpcklwd_r2r(mm1, mm0);			/* x5 x1 x4 x0
- */
-
-    /* ----------
- */
-	 movq_r2r(mm0, mm5);					/* 5 ; x5 x1 x4 x0
- */
-	 punpckldq_r2r(mm0, mm0);			/* x4 x0 x4 x0
- */
-
-	 movq_m2r(*(tab_i_01234567+4), mm4);	/* 4 ; w07 w05 w03 w01
- */
-	 punpckhwd_r2r(mm1, mm2);			/* 1 ; x7 x3 x6 x2
- */
-
-	 pmaddwd_r2r(mm0, mm3);				/* x4*w06+x0*w04 x4*w02+x0*w00
- */
-	 movq_r2r(mm2, mm6);				/* 6 ; x7 x3 x6 x2
- */
-
-	 movq_m2r(*(tab_i_01234567+16), mm1);/* 1 ; w22 w20 w18 w16
- */
-	 punpckldq_r2r(mm2, mm2);			/* x6 x2 x6 x2
- */
-
-	 pmaddwd_r2r(mm2, mm4);				/* x6*w07+x2*w05 x6*w03+x2*w01
- */
-	 punpckhdq_r2r(mm5, mm5);			/* x5 x1 x5 x1
- */
-
-	 pmaddwd_m2r(*(tab_i_01234567+8), mm0);/* x4*w14+x0*w12 x4*w10+x0*w08
- */
-	 punpckhdq_r2r(mm6, mm6);			/* x7 x3 x7 x3
- */
-
-	 movq_m2r(*(tab_i_01234567+20), mm7);/* 7 ; w23 w21 w19 w17
- */
-	 pmaddwd_r2r(mm5, mm1);				/* x5*w22+x1*w20 x5*w18+x1*w16
- */
-
-	 paddd_m2r(*(r_inv_row), mm3);/* +rounder
- */
-	 pmaddwd_r2r(mm6, mm7);				/* x7*w23+x3*w21 x7*w19+x3*w17
- */
-
-	 pmaddwd_m2r(*(tab_i_01234567+12), mm2);/* x6*w15+x2*w13 x6*w11+x2*w09
- */
-	 paddd_r2r(mm4, mm3);				/* 4 ; a1=sum(even1) a0=sum(even0)
- */
-
-	 pmaddwd_m2r(*(tab_i_01234567+24), mm5);/* x5*w30+x1*w28 x5*w26+x1*w24
- */
-	 movq_r2r(mm3, mm4);				/* 4 ; a1 a0
- */
-
-	 pmaddwd_m2r(*(tab_i_01234567+28), mm6);/* x7*w31+x3*w29 x7*w27+x3*w25
- */
-	 paddd_r2r(mm7, mm1);				/* 7 ; b1=sum(odd1) b0=sum(odd0)
- */
-
-	 paddd_m2r(*(r_inv_row), mm0);/* +rounder
- */
-	 psubd_r2r(mm1, mm3);				/* a1-b1 a0-b0
- */
-
-	 psrad_i2r(SHIFT_INV_ROW, mm3);		/* y6=a1-b1 y7=a0-b0
- */
-	 paddd_r2r(mm4, mm1);				/* 4 ; a1+b1 a0+b0
- */
-
-	 paddd_r2r(mm2, mm0);				/* 2 ; a3=sum(even3) a2=sum(even2)
- */
-	 psrad_i2r(SHIFT_INV_ROW, mm1);		/* y1=a1+b1 y0=a0+b0
- */
-
-	 paddd_r2r(mm6, mm5);				/* 6 ; b3=sum(odd3) b2=sum(odd2)
- */
-	 movq_r2r(mm0, mm4);				/* 4 ; a3 a2
- */
-
-	 paddd_r2r(mm5, mm0);				/* a3+b3 a2+b2
- */
-	 psubd_r2r(mm5, mm4);				/* 5 ; a3-b3 a2-b2
- */
-
-	 psrad_i2r(SHIFT_INV_ROW, mm4);		/* y4=a3-b3 y5=a2-b2
- */
-	 psrad_i2r(SHIFT_INV_ROW, mm0);		/* y3=a3+b3 y2=a2+b2
- */
-
-	 packssdw_r2r(mm3, mm4);				/* 3 ; y6 y7 y4 y5
- */
-
-	 packssdw_r2r(mm0, mm1);				/* 0 ; y3 y2 y1 y0
- */
-	 movq_r2r(mm4, mm7);				/* 7 ; y6 y7 y4 y5
- */
-
-	 psrld_i2r(16, mm4);					/* 0 y6 0 y4
- */
-
-	 movq_r2m(mm1, *(out));	/* 1 ; save y3 y2 y1 y0
- */
-	 pslld_i2r(16, mm7);					/* y7 0 y5 0
- */
-
-	 por_r2r(mm4, mm7);					/* 4 ; y7 y6 y5 y4
- */
-
-   /* begin processing row 1
- */
-	 movq_r2m(mm7, *(out+4));	/* 7 ; save y7 y6 y5 y4
- */
-
-	 inptr += 8;
-	 out += 8;
-  }
-	 
-
-	/* done with the iDCT row-transformation
- */
-
-	/* now we have to transpose the output 8x8 matrix
- */
-	/* 8x8 (OUT) -> 8x8't' (IN)
- */
-	/* the transposition is implemented as 4 sub-operations.
- */
-	/* 1) transpose upper-left quad
- */
-	/* 2) transpose lower-right quad
- */
-	/* 3) transpose lower-left quad
- */
-	/* 4) transpose upper-right quad
- */
-
- 
-	/* mm0 = 1st row [ A B C D ] row1
- */
-	/* mm1 = 2nd row [ E F G H ] 2
- */
-	/* mm2 = 3rd row [ I J K L ] 3
- */
-	/* mm3 = 4th row [ M N O P ] 4
- */
-
-	/* 1) transpose upper-left quad
- */
-  out = &qwTemp[0];
-
-  movq_m2r(*(out + ROW_STRIDE * 0), mm0);
-
-  movq_m2r(*(out + ROW_STRIDE * 1), mm1);
-  movq_r2r(mm0, mm4);	/* mm4 = copy of row1[A B C D]
- */
-	
-  movq_m2r(*(out + ROW_STRIDE * 2), mm2);
-  punpcklwd_r2r(mm1, mm0); /* mm0 = [ 0 4 1 5]
- */
-	
-  movq_m2r(*(out + ROW_STRIDE * 3), mm3);
-  punpckhwd_r2r(mm1, mm4); /* mm4 = [ 2 6 3 7]
- */
-
-  movq_r2r(mm2, mm6);
-  punpcklwd_r2r(mm3, mm2);	/* mm2 = [ 8 12 9 13]
- */
-
-  punpckhwd_r2r(mm3, mm6);	/* mm6 = 10 14 11 15]
- */
-  movq_r2r(mm0, mm1);	/* mm1 = [ 0 4 1 5]
- */
-
-  inptr = blk;
-
-  punpckldq_r2r(mm2, mm0);	/* final result mm0 = row1 [0 4 8 12]
- */
-
-  movq_r2r(mm4, mm3);	/* mm3 = [ 2 6 3 7]
- */
-  punpckhdq_r2r(mm2, mm1); /* mm1 = final result mm1 = row2 [1 5 9 13]
- */
-
-  movq_r2m(mm0, *(inptr + ROW_STRIDE * 0)); /* store row 1
- */
-  punpckldq_r2r(mm6, mm4); /* final result mm4 = row3 [2 6 10 14]
- */
-
+
+static void 
+idct_mmx32_rows (short *blk)
+{				/* transform all 8 rows of 8x8 iDCT block
+ */
+  
+int x;
+  
+short qwTemp[64];
+  
+short *out = &qwTemp[0];
+  
+short *inptr = blk;
+
+  
+      /* this subroutine performs two operations
+ */
+      /* 1) iDCT row transform
+ */
+      /*            for( i = 0; i < 8; ++ i)
+ */
+      /*                    DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] );
+ */
+      /*
+ */
+      /* 2) transpose the matrix (which was stored in qwTemp[])
+ */
+      /*        qwTemp[] -> [8x8 matrix transpose] -> blk[]
+ */
+      
+for (x = 0; x < 8; x++) {	/* transform one row per iteration
+ */
+    movq_m2r (*(inptr), mm0);	/* 0 ; x3 x2 x1 x0
+ */
+    
+movq_m2r (*(inptr + 4), mm1);	/* 1 ; x7 x6 x5 x4
+ */
+    movq_r2r (mm0, mm2);	/* 2 ; x3 x2 x1 x0
+ */
+    
+movq_m2r (*(tab_i_01234567), mm3);	/* 3 ; w06 w04 w02 w00
+ */
+    punpcklwd_r2r (mm1, mm0);	/* x5 x1 x4 x0
+ */
+    
+	/* ----------
+ */
+	movq_r2r (mm0, mm5);	/* 5 ; x5 x1 x4 x0
+ */
+    punpckldq_r2r (mm0, mm0);	/* x4 x0 x4 x0
+ */
+    
+movq_m2r (*(tab_i_01234567 + 4), mm4);	/* 4 ; w07 w05 w03 w01
+ */
+    punpckhwd_r2r (mm1, mm2);	/* 1 ; x7 x3 x6 x2
+ */
+    
+pmaddwd_r2r (mm0, mm3);	/* x4*w06+x0*w04 x4*w02+x0*w00
+ */
+    movq_r2r (mm2, mm6);	/* 6 ; x7 x3 x6 x2
+ */
+    
+movq_m2r (*(tab_i_01234567 + 16), mm1);	/* 1 ; w22 w20 w18 w16
+ */
+    punpckldq_r2r (mm2, mm2);	/* x6 x2 x6 x2
+ */
+    
+pmaddwd_r2r (mm2, mm4);	/* x6*w07+x2*w05 x6*w03+x2*w01
+ */
+    punpckhdq_r2r (mm5, mm5);	/* x5 x1 x5 x1
+ */
+    
+pmaddwd_m2r (*(tab_i_01234567 + 8), mm0);	/* x4*w14+x0*w12 x4*w10+x0*w08
+ */
+    punpckhdq_r2r (mm6, mm6);	/* x7 x3 x7 x3
+ */
+    
+movq_m2r (*(tab_i_01234567 + 20), mm7);	/* 7 ; w23 w21 w19 w17
+ */
+    pmaddwd_r2r (mm5, mm1);	/* x5*w22+x1*w20 x5*w18+x1*w16
+ */
+    
+paddd_m2r (*(r_inv_row), mm3);	/* +rounder
+ */
+    pmaddwd_r2r (mm6, mm7);	/* x7*w23+x3*w21 x7*w19+x3*w17
+ */
+    
+pmaddwd_m2r (*(tab_i_01234567 + 12), mm2);	/* x6*w15+x2*w13 x6*w11+x2*w09
+ */
+    paddd_r2r (mm4, mm3);	/* 4 ; a1=sum(even1) a0=sum(even0)
+ */
+    
+pmaddwd_m2r (*(tab_i_01234567 + 24), mm5);	/* x5*w30+x1*w28 x5*w26+x1*w24
+ */
+    movq_r2r (mm3, mm4);	/* 4 ; a1 a0
+ */
+    
+pmaddwd_m2r (*(tab_i_01234567 + 28), mm6);	/* x7*w31+x3*w29 x7*w27+x3*w25
+ */
+    paddd_r2r (mm7, mm1);	/* 7 ; b1=sum(odd1) b0=sum(odd0)
+ */
+    
+paddd_m2r (*(r_inv_row), mm0);	/* +rounder
+ */
+    psubd_r2r (mm1, mm3);	/* a1-b1 a0-b0
+ */
+    
+psrad_i2r (SHIFT_INV_ROW, mm3);	/* y6=a1-b1 y7=a0-b0
+ */
+    paddd_r2r (mm4, mm1);	/* 4 ; a1+b1 a0+b0
+ */
+    
+paddd_r2r (mm2, mm0);	/* 2 ; a3=sum(even3) a2=sum(even2)
+ */
+    psrad_i2r (SHIFT_INV_ROW, mm1);	/* y1=a1+b1 y0=a0+b0
+ */
+    
+paddd_r2r (mm6, mm5);	/* 6 ; b3=sum(odd3) b2=sum(odd2)
+ */
+    movq_r2r (mm0, mm4);	/* 4 ; a3 a2
+ */
+    
+paddd_r2r (mm5, mm0);	/* a3+b3 a2+b2
+ */
+    psubd_r2r (mm5, mm4);	/* 5 ; a3-b3 a2-b2
+ */
+    
+psrad_i2r (SHIFT_INV_ROW, mm4);	/* y4=a3-b3 y5=a2-b2
+ */
+    psrad_i2r (SHIFT_INV_ROW, mm0);	/* y3=a3+b3 y2=a2+b2
+ */
+    
+packssdw_r2r (mm3, mm4);	/* 3 ; y6 y7 y4 y5
+ */
+    
+packssdw_r2r (mm0, mm1);	/* 0 ; y3 y2 y1 y0
+ */
+    movq_r2r (mm4, mm7);	/* 7 ; y6 y7 y4 y5
+ */
+    
+psrld_i2r (16, mm4);	/* 0 y6 0 y4
+ */
+    
+movq_r2m (mm1, *(out));	/* 1 ; save y3 y2 y1 y0
+ */
+    pslld_i2r (16, mm7);	/* y7 0 y5 0
+ */
+    
+por_r2r (mm4, mm7);	/* 4 ; y7 y6 y5 y4
+ */
+    
+	/* store the upper half (y7..y4) of this output row
+ */
+	movq_r2m (mm7, *(out + 4));	/* 7 ; save y7 y6 y5 y4
+ */
+    
+inptr += 8;
+    
+out += 8;
+  
+}
+  
+      /* done with the iDCT row-transformation
+ */
+      
+      /* now we have to transpose the output 8x8 matrix
+ */
+      /* 8x8 (OUT) -> 8x8't' (IN)
+ */
+      /* the transposition is implemented as 4 sub-operations.
+ */
+      /* 1) transpose upper-left quad
+ */
+      /* 2) transpose lower-right quad
+ */
+      /* 3) transpose lower-left quad
+ */
+      /* 4) transpose upper-right quad
+ */
+      
+      /* mm0 = 1st row [ A B C D ] row1
+ */
+      /* mm1 = 2nd row [ E F G H ] 2
+ */
+      /* mm2 = 3rd row [ I J K L ] 3
+ */
+      /* mm3 = 4th row [ M N O P ] 4
+ */
+      
+      /* 1) transpose upper-left quad
+ */
+      out = &qwTemp[0];
+  
+movq_m2r (*(out + ROW_STRIDE * 0), mm0);
+  
+movq_m2r (*(out + ROW_STRIDE * 1), mm1);
+  
+movq_r2r (mm0, mm4);		/* mm4 = copy of row1[A B C D]
+ */
+  
+movq_m2r (*(out + ROW_STRIDE * 2), mm2);
+  
+punpcklwd_r2r (mm1, mm0);	/* mm0 = [ 0 4 1 5]
+ */
+  
+movq_m2r (*(out + ROW_STRIDE * 3), mm3);
+  
+punpckhwd_r2r (mm1, mm4);	/* mm4 = [ 2 6 3 7]
+ */
+  
+movq_r2r (mm2, mm6);
+  
+punpcklwd_r2r (mm3, mm2);	/* mm2 = [ 8 12 9 13]
+ */
+  
+punpckhwd_r2r (mm3, mm6);	/* mm6 = [ 10 14 11 15]
+ */
+  movq_r2r (mm0, mm1);		/* mm1 = [ 0 4 1 5]
+ */
+  
+inptr = blk;
+  
+punpckldq_r2r (mm2, mm0);	/* final result mm0 = row1 [0 4 8 12]
+ */
+  
+movq_r2r (mm4, mm3);		/* mm3 = [ 2 6 3 7]
+ */
+  punpckhdq_r2r (mm2, mm1);	/* mm1 = final result mm1 = row2 [1 5 9 13]
+ */
+  
+movq_r2m (mm0, *(inptr + ROW_STRIDE * 0));	/* store row 1
+ */
+  punpckldq_r2r (mm6, mm4);	/* final result mm4 = row3 [2 6 10 14]
+ */
+  
 /* begin reading next quadrant (lower-right)
 */
-  movq_m2r(*(out + ROW_STRIDE*4 + 4), mm0); 
-  punpckhdq_r2r(mm6, mm3); /* final result mm3 = row4 [3 7 11 15]
- */
-
-  movq_r2m(mm4, *(inptr + ROW_STRIDE * 2)); /* store row 3
- */
-  movq_r2r(mm0, mm4);	/* mm4 = copy of row1[A B C D]
- */
-
-  movq_r2m(mm1, *(inptr + ROW_STRIDE * 1)); /* store row 2
- */
-
-  movq_m2r(*(out + ROW_STRIDE*5 + 4), mm1);
-
-  movq_r2m(mm3, *(inptr + ROW_STRIDE * 3)); /* store row 4
- */
-  punpcklwd_r2r(mm1, mm0); /* mm0 = [ 0 4 1 5]
- */
-
-	/* 2) transpose lower-right quadrant
- */
-
+      movq_m2r (*(out + ROW_STRIDE * 4 + 4), mm0);
+  
+punpckhdq_r2r (mm6, mm3);	/* final result mm3 = row4 [3 7 11 15]
+ */
+  
+movq_r2m (mm4, *(inptr + ROW_STRIDE * 2));	/* store row 3
+ */
+  movq_r2r (mm0, mm4);		/* mm4 = copy of row1[A B C D]
+ */
+  
+movq_r2m (mm1, *(inptr + ROW_STRIDE * 1));	/* store row 2
+ */
+  
+movq_m2r (*(out + ROW_STRIDE * 5 + 4), mm1);
+  
+movq_r2m (mm3, *(inptr + ROW_STRIDE * 3));	/* store row 4
+ */
+  punpcklwd_r2r (mm1, mm0);	/* mm0 = [ 0 4 1 5]
+ */
+  
+      /* 2) transpose lower-right quadrant
+ */
+      
 /*	movq mm0, qword ptr [OUT + ROW_STRIDE*4 + 8]
 */
-
+      
 /*	movq mm1, qword ptr [OUT + ROW_STRIDE*5 + 8]
 */
 /*	 movq mm4, mm0;	// mm4 = copy of row1[A B C D]
 */
-	
-  movq_m2r(*(out + ROW_STRIDE*6 + 4), mm2);
-/*	 punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
- */
-  punpckhwd_r2r(mm1, mm4); /* mm4 = [ 2 6 3 7]
- */
-	
-  movq_m2r(*(out + ROW_STRIDE*7 + 4), mm3);
-  movq_r2r(mm2, mm6);
-
-  punpcklwd_r2r(mm3, mm2);	/* mm2 = [ 8 12 9 13]
- */
-  movq_r2r(mm0, mm1);	/* mm1 = [ 0 4 1 5]
- */
-
-  punpckhwd_r2r(mm3, mm6);	/* mm6 = 10 14 11 15]
- */
-  movq_r2r(mm4, mm3);	/* mm3 = [ 2 6 3 7]
- */
-
-  punpckldq_r2r(mm2, mm0);	/* final result mm0 = row1 [0 4 8 12]
- */
-
-  punpckhdq_r2r(mm2, mm1); /* mm1 = final result mm1 = row2 [1 5 9 13]
- */
-  ; /* slot
- */
-
-  movq_r2m(mm0, *(inptr + ROW_STRIDE*4 + 4)); /* store row 1
- */
-  punpckldq_r2r(mm6, mm4); /* final result mm4 = row3 [2 6 10 14]
- */
-
-  movq_m2r(*(out + ROW_STRIDE * 4 ), mm0);
-  punpckhdq_r2r(mm6, mm3); /* final result mm3 = row4 [3 7 11 15]
- */
+      
+movq_m2r (*(out + ROW_STRIDE * 6 + 4), mm2);
   
-  movq_r2m(mm4, *(inptr + ROW_STRIDE*6 + 4)); /* store row 3
- */
-  movq_r2r(mm0, mm4);	/* mm4 = copy of row1[A B C D]
- */
-
-  movq_r2m(mm1, *(inptr + ROW_STRIDE*5 + 4)); /* store row 2
- */
-  ; /* slot
- */
+/*	 punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
+ */
+      punpckhwd_r2r (mm1, mm4);	/* mm4 = [ 2 6 3 7]
+ */
+  
+movq_m2r (*(out + ROW_STRIDE * 7 + 4), mm3);
+  
+movq_r2r (mm2, mm6);
+  
+punpcklwd_r2r (mm3, mm2);	/* mm2 = [ 8 12 9 13]
+ */
+  movq_r2r (mm0, mm1);		/* mm1 = [ 0 4 1 5]
+ */
+  
+punpckhwd_r2r (mm3, mm6);	/* mm6 = [ 10 14 11 15]
+ */
+  movq_r2r (mm4, mm3);		/* mm3 = [ 2 6 3 7]
+ */
+  
+punpckldq_r2r (mm2, mm0);	/* final result mm0 = row1 [0 4 8 12]
+ */
+  
+punpckhdq_r2r (mm2, mm1);	/* mm1 = final result mm1 = row2 [1 5 9 13]
+ */
+  ;				/* slot
+ */
+  
+movq_r2m (mm0, *(inptr + ROW_STRIDE * 4 + 4));	/* store row 1
+ */
+  punpckldq_r2r (mm6, mm4);	/* final result mm4 = row3 [2 6 10 14]
+ */
+  
+movq_m2r (*(out + ROW_STRIDE * 4), mm0);
+  
+punpckhdq_r2r (mm6, mm3);	/* final result mm3 = row4 [3 7 11 15]
+ */
+  
+movq_r2m (mm4, *(inptr + ROW_STRIDE * 6 + 4));	/* store row 3
+ */
+  movq_r2r (mm0, mm4);		/* mm4 = copy of row1[A B C D]
+ */
+  
+movq_r2m (mm1, *(inptr + ROW_STRIDE * 5 + 4));	/* store row 2
+ */
+  ;				/* slot
+ */
+  
+movq_m2r (*(out + ROW_STRIDE * 5), mm1);
+  
+;				/* slot
+ */
+  
+movq_r2m (mm3, *(inptr + ROW_STRIDE * 7 + 4));	/* store row 4
+ */
+  punpcklwd_r2r (mm1, mm0);	/* mm0 = [ 0 4 1 5]
+ */
   
-  movq_m2r(*(out + ROW_STRIDE * 5 ), mm1);
-  ; /* slot
- */
-
-  movq_r2m(mm3, *(inptr + ROW_STRIDE*7 + 4)); /* store row 4
- */
-  punpcklwd_r2r(mm1, mm0); /* mm0 = [ 0 4 1 5]
- */
-
-  /* 3) transpose lower-left
- */
+      /* 3) transpose lower-left
+ */
 /*	movq mm0, qword ptr [OUT + ROW_STRIDE * 4 ]
 */
-
+      
 /*	movq mm1, qword ptr [OUT + ROW_STRIDE * 5 ]
 */
 /*	 movq mm4, mm0;	// mm4 = copy of row1[A B C D]
 */
-	
-  movq_m2r(*(out + ROW_STRIDE * 6 ), mm2);
+      
+movq_m2r (*(out + ROW_STRIDE * 6), mm2);
+  
 /*	 punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
 */
-  punpckhwd_r2r(mm1, mm4); /* mm4 = [ 2 6 3 7]
- */
-	
-  movq_m2r(*(out + ROW_STRIDE * 7 ), mm3);
-  movq_r2r(mm2, mm6);
-
-  punpcklwd_r2r(mm3, mm2);	/* mm2 = [ 8 12 9 13]
- */
-  movq_r2r(mm0, mm1);	/* mm1 = [ 0 4 1 5]
- */
-
-  punpckhwd_r2r(mm3, mm6);	/* mm6 = 10 14 11 15]
- */
-  movq_r2r(mm4, mm3);	/* mm3 = [ 2 6 3 7]
- */
-
-  punpckldq_r2r(mm2, mm0);	/* final result mm0 = row1 [0 4 8 12]
- */
-
-  punpckhdq_r2r(mm2, mm1); /* mm1 = final result mm1 = row2 [1 5 9 13]
- */
-  ;/*slot
- */
-
-  movq_r2m(mm0, *(inptr + ROW_STRIDE * 0 + 4 )); /* store row 1
- */
-  punpckldq_r2r(mm6, mm4); /* final result mm4 = row3 [2 6 10 14]
- */
-
+      punpckhwd_r2r (mm1, mm4);	/* mm4 = [ 2 6 3 7]
+ */
+  
+movq_m2r (*(out + ROW_STRIDE * 7), mm3);
+  
+movq_r2r (mm2, mm6);
+  
+punpcklwd_r2r (mm3, mm2);	/* mm2 = [ 8 12 9 13]
+ */
+  movq_r2r (mm0, mm1);		/* mm1 = [ 0 4 1 5]
+ */
+  
+punpckhwd_r2r (mm3, mm6);	/* mm6 = [ 10 14 11 15]
+ */
+  movq_r2r (mm4, mm3);		/* mm3 = [ 2 6 3 7]
+ */
+  
+punpckldq_r2r (mm2, mm0);	/* final result mm0 = row1 [0 4 8 12]
+ */
+  
+punpckhdq_r2r (mm2, mm1);	/* mm1 = final result mm1 = row2 [1 5 9 13]
+ */
+  ;				/*slot
+ */
+  
+movq_r2m (mm0, *(inptr + ROW_STRIDE * 0 + 4));	/* store row 1
+ */
+  punpckldq_r2r (mm6, mm4);	/* final result mm4 = row3 [2 6 10 14]
+ */
+  
 /* begin reading next quadrant (upper-right)
 */
-  movq_m2r(*(out + ROW_STRIDE*0 + 4), mm0);
-  punpckhdq_r2r(mm6, mm3); /* final result mm3 = row4 [3 7 11 15]
- */
-
-  movq_r2m(mm4, *(inptr + ROW_STRIDE * 2 + 4)); /* store row 3
- */
-  movq_r2r(mm0, mm4);	/* mm4 = copy of row1[A B C D]
- */
-
-  movq_r2m(mm1, *(inptr + ROW_STRIDE * 1 + 4)); /* store row 2
- */
-  movq_m2r(*(out + ROW_STRIDE*1 + 4), mm1);
-
-  movq_r2m(mm3, *(inptr + ROW_STRIDE * 3 + 4)); /* store row 4
- */
-  punpcklwd_r2r(mm1, mm0); /* mm0 = [ 0 4 1 5]
- */
-
-
-	/* 2) transpose lower-right quadrant
- */
-
+      movq_m2r (*(out + ROW_STRIDE * 0 + 4), mm0);
+  
+punpckhdq_r2r (mm6, mm3);	/* final result mm3 = row4 [3 7 11 15]
+ */
+  
+movq_r2m (mm4, *(inptr + ROW_STRIDE * 2 + 4));	/* store row 3
+ */
+  movq_r2r (mm0, mm4);		/* mm4 = copy of row1[A B C D]
+ */
+  
+movq_r2m (mm1, *(inptr + ROW_STRIDE * 1 + 4));	/* store row 2
+ */
+  movq_m2r (*(out + ROW_STRIDE * 1 + 4), mm1);
+  
+movq_r2m (mm3, *(inptr + ROW_STRIDE * 3 + 4));	/* store row 4
+ */
+  punpcklwd_r2r (mm1, mm0);	/* mm0 = [ 0 4 1 5]
+ */
+  
+      /* 4) transpose upper-right quadrant
+ */
+      
 /*	movq mm0, qword ptr [OUT + ROW_STRIDE*4 + 8]
 */
-
+      
 /*	movq mm1, qword ptr [OUT + ROW_STRIDE*5 + 8]
 */
 /*	 movq mm4, mm0;	// mm4 = copy of row1[A B C D]
 */
-	
-  movq_m2r(*(out + ROW_STRIDE*2 + 4), mm2);
+      
+movq_m2r (*(out + ROW_STRIDE * 2 + 4), mm2);
+  
 /*	 punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
 */
-  punpckhwd_r2r(mm1, mm4); /* mm4 = [ 2 6 3 7]
- */
-	
-  movq_m2r(*(out + ROW_STRIDE*3 + 4), mm3);
-  movq_r2r(mm2, mm6);
-
-  punpcklwd_r2r(mm3, mm2);	/* mm2 = [ 8 12 9 13]
- */
-  movq_r2r(mm0, mm1);	/* mm1 = [ 0 4 1 5]
- */
-
-  punpckhwd_r2r(mm3, mm6);	/* mm6 = 10 14 11 15]
- */
-  movq_r2r(mm4, mm3);	/* mm3 = [ 2 6 3 7]
- */
-
-  punpckldq_r2r(mm2, mm0);	/* final result mm0 = row1 [0 4 8 12]
- */
-
-  punpckhdq_r2r(mm2, mm1); /* mm1 = final result mm1 = row2 [1 5 9 13]
- */
-  ; /* slot
- */
-
-  movq_r2m(mm0, *(inptr + ROW_STRIDE*4)); /* store row 1
- */
-  punpckldq_r2r(mm6, mm4); /* final result mm4 = row3 [2 6 10 14]
- */
-
-  movq_r2m(mm1, *(inptr + ROW_STRIDE*5)); /* store row 2
- */
-  punpckhdq_r2r(mm6, mm3); /* final result mm3 = row4 [3 7 11 15]
- */
-
-  movq_r2m(mm4, *(inptr + ROW_STRIDE*6)); /* store row 3
- */
-  ; /* slot
- */
-
-  movq_r2m(mm3, *(inptr + ROW_STRIDE*7)); /* store row 4
- */
-  ; /* slot
- */
- 
-}
-
-
-static void 
-idct_mmx32_cols( short *blk )	/* transform all 8 cols of 8x8 iDCT block
- */
-{
-  int x;
-  short *inptr = blk;
-
-	/* Despite the function's name, the matrix is transformed
- */
-	/* row by row.  This function is identical to idct_mmx32_rows(),
- */
-	/* except for the SHIFT amount and ROUND_INV amount.
- */
-
-	/* this subroutine performs two operations
- */
-	/* 1) iDCT row transform
- */
-	/*		for( i = 0; i < 8; ++ i)
- */
-	/*			DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] );
- */
-	/*
- */
-	/* 2) transpose the matrix (which was stored in qwTemp[])
- */
-	/*        qwTemp[] -> [8x8 matrix transpose] -> blk[]
- */
-
-
-  for (x=0; x<8; x++) {  /* transform one row per iteration
- */
-
-    movq_m2r(*(inptr), mm0);		/* 0 ; x3 x2 x1 x0
- */
-
-    movq_m2r(*(inptr+4), mm1);	/* 1 ; x7 x6 x5 x4
- */
-	 movq_r2r(mm0, mm2);				/* 2 ; x3 x2 x1 x0
- */
-
-	 movq_m2r(*(tab_i_01234567), mm3);	/* 3 ; w06 w04 w02 w00
- */
-	 punpcklwd_r2r(mm1, mm0);			/* x5 x1 x4 x0
- */
-
+      punpckhwd_r2r (mm1, mm4);	/* mm4 = [ 2 6 3 7]
+ */
+  
+movq_m2r (*(out + ROW_STRIDE * 3 + 4), mm3);
+  
+movq_r2r (mm2, mm6);
+  
+punpcklwd_r2r (mm3, mm2);	/* mm2 = [ 8 12 9 13]
+ */
+  movq_r2r (mm0, mm1);		/* mm1 = [ 0 4 1 5]
+ */
+  
+punpckhwd_r2r (mm3, mm6);	/* mm6 = [ 10 14 11 15]
+ */
+  movq_r2r (mm4, mm3);		/* mm3 = [ 2 6 3 7]
+ */
+  
+punpckldq_r2r (mm2, mm0);	/* final result mm0 = row1 [0 4 8 12]
+ */
+  
+punpckhdq_r2r (mm2, mm1);	/* mm1 = final result mm1 = row2 [1 5 9 13]
+ */
+  ;				/* slot
+ */
+  
+movq_r2m (mm0, *(inptr + ROW_STRIDE * 4));	/* store row 1
+ */
+  punpckldq_r2r (mm6, mm4);	/* final result mm4 = row3 [2 6 10 14]
+ */
+  
+movq_r2m (mm1, *(inptr + ROW_STRIDE * 5));	/* store row 2
+ */
+  punpckhdq_r2r (mm6, mm3);	/* final result mm3 = row4 [3 7 11 15]
+ */
+  
+movq_r2m (mm4, *(inptr + ROW_STRIDE * 6));	/* store row 3
+ */
+  ;				/* slot
+ */
+  
+movq_r2m (mm3, *(inptr + ROW_STRIDE * 7));	/* store row 4
+ */
+  ;				/* slot
+ */
+
+}
+
+static void 
+idct_mmx32_cols (short *blk)
+{				/* transform all 8 cols of 8x8 iDCT block
+ */
+  
+int x;
+  
+short *inptr = blk;
+
+  
+      /* Despite the function's name, the matrix is transformed
+ */
+      /* row by row.  The per-row arithmetic matches idct_mmx32_rows(),
+ */
+      /* but uses r_inv_col and SHIFT_INV_COL for rounding and shift.
+ */
+      
+      /* unlike idct_mmx32_rows(), this subroutine performs one operation
+ */
+      /* 1) iDCT row transform, in place
+ */
+      /*              for( i = 0; i < 8; ++ i)
+ */
+      /*                      DCT_8_INV_ROW_1( blk[i*8], blk[i*8] );
+ */
+      /*
+ */
+      /* There is no temporary buffer and no transpose step here:
+ */
+      /* each row is read from blk[] and written back to the same row.
+ */
+      
+for (x = 0; x < 8; x++) {	/* transform one row per iteration
+ */
+    
+movq_m2r (*(inptr), mm0);	/* 0 ; x3 x2 x1 x0
+ */
+    
+movq_m2r (*(inptr + 4), mm1);	/* 1 ; x7 x6 x5 x4
+ */
+    movq_r2r (mm0, mm2);	/* 2 ; x3 x2 x1 x0
+ */
+    
+movq_m2r (*(tab_i_01234567), mm3);	/* 3 ; w06 w04 w02 w00
+ */
+    punpcklwd_r2r (mm1, mm0);	/* x5 x1 x4 x0
+ */
+    
 /* ----------
 */
-	 movq_r2r(mm0, mm5);					/* 5 ; x5 x1 x4 x0
- */
-	 punpckldq_r2r(mm0, mm0);			/* x4 x0 x4 x0
- */
-
-	 movq_m2r(*(tab_i_01234567+4), mm4);	/* 4 ; w07 w05 w03 w01
- */
-	 punpckhwd_r2r(mm1, mm2);			/* 1 ; x7 x3 x6 x2
- */
-
-	 pmaddwd_r2r(mm0, mm3);				/* x4*w06+x0*w04 x4*w02+x0*w00
- */
-	 movq_r2r(mm2, mm6);				/* 6 ; x7 x3 x6 x2
- */
-
-	 movq_m2r(*(tab_i_01234567+16), mm1);/* 1 ; w22 w20 w18 w16
- */
-	 punpckldq_r2r(mm2, mm2);			/* x6 x2 x6 x2
- */
-
-	 pmaddwd_r2r(mm2, mm4);				/* x6*w07+x2*w05 x6*w03+x2*w01
- */
-	 punpckhdq_r2r(mm5, mm5);			/* x5 x1 x5 x1
- */
-
-	 pmaddwd_m2r(*(tab_i_01234567+8), mm0);/* x4*w14+x0*w12 x4*w10+x0*w08
- */
-	 punpckhdq_r2r(mm6, mm6);			/* x7 x3 x7 x3
- */
-
-	 movq_m2r(*(tab_i_01234567+20), mm7);/* 7 ; w23 w21 w19 w17
- */
-	 pmaddwd_r2r(mm5, mm1);				/* x5*w22+x1*w20 x5*w18+x1*w16
- */
-
-	 paddd_m2r(*(r_inv_col), mm3);/* +rounder
- */
-	 pmaddwd_r2r(mm6, mm7);				/* x7*w23+x3*w21 x7*w19+x3*w17
- */
-
-	 pmaddwd_m2r(*(tab_i_01234567+12), mm2);/* x6*w15+x2*w13 x6*w11+x2*w09
- */
-	 paddd_r2r(mm4, mm3);				/* 4 ; a1=sum(even1) a0=sum(even0)
- */
-
-	 pmaddwd_m2r(*(tab_i_01234567+24), mm5);/* x5*w30+x1*w28 x5*w26+x1*w24
- */
-	 movq_r2r(mm3, mm4);				/* 4 ; a1 a0
- */
-
-	 pmaddwd_m2r(*(tab_i_01234567+28), mm6);/* x7*w31+x3*w29 x7*w27+x3*w25
- */
-	 paddd_r2r(mm7, mm1);				/* 7 ; b1=sum(odd1) b0=sum(odd0)
- */
-
-	 paddd_m2r(*(r_inv_col), mm0);/* +rounder
- */
-	 psubd_r2r(mm1, mm3);				/* a1-b1 a0-b0
- */
-
-	 psrad_i2r(SHIFT_INV_COL, mm3);		/* y6=a1-b1 y7=a0-b0
- */
-	 paddd_r2r(mm4, mm1);				/* 4 ; a1+b1 a0+b0
- */
-
-	 paddd_r2r(mm2, mm0);				/* 2 ; a3=sum(even3) a2=sum(even2)
- */
-	 psrad_i2r(SHIFT_INV_COL, mm1);		/* y1=a1+b1 y0=a0+b0
- */
-
-	 paddd_r2r(mm6, mm5);				/* 6 ; b3=sum(odd3) b2=sum(odd2)
- */
-	 movq_r2r(mm0, mm4);				/* 4 ; a3 a2
- */
-
-	 paddd_r2r(mm5, mm0);				/* a3+b3 a2+b2
- */
-	 psubd_r2r(mm5, mm4);				/* 5 ; a3-b3 a2-b2
- */
-
-
-	 psrad_i2r(SHIFT_INV_COL, mm4);		/* y4=a3-b3 y5=a2-b2
- */
-	 psrad_i2r(SHIFT_INV_COL, mm0);		/* y3=a3+b3 y2=a2+b2
- */
-
-	 packssdw_r2r(mm3, mm4);				/* 3 ; y6 y7 y4 y5
- */
-
-	 packssdw_r2r(mm0, mm1);				/* 0 ; y3 y2 y1 y0
- */
-	 movq_r2r(mm4, mm7);				/* 7 ; y6 y7 y4 y5
- */
-
-	 psrld_i2r(16, mm4);					/* 0 y6 0 y4
- */
-
-	 movq_r2m(mm1, *(inptr));	/* 1 ; save y3 y2 y1 y0
- */
-	 pslld_i2r(16, mm7);					/* y7 0 y5 0
- */
-
-	 por_r2r(mm4, mm7);					/* 4 ; y7 y6 y5 y4
- */
-
-   /* begin processing row 1
- */
-	 movq_r2m(mm7, *(inptr+4));	/* 7 ; save y7 y6 y5 y4
- */
-
-	 inptr += 8;
-  }
-  /* done with the iDCT column-transformation
- */
-}
-
+	movq_r2r (mm0, mm5);	/* 5 ; x5 x1 x4 x0
+ */
+    punpckldq_r2r (mm0, mm0);	/* x4 x0 x4 x0
+ */
+    
+movq_m2r (*(tab_i_01234567 + 4), mm4);	/* 4 ; w07 w05 w03 w01
+ */
+    punpckhwd_r2r (mm1, mm2);	/* 1 ; x7 x3 x6 x2
+ */
+    
+pmaddwd_r2r (mm0, mm3);	/* x4*w06+x0*w04 x4*w02+x0*w00
+ */
+    movq_r2r (mm2, mm6);	/* 6 ; x7 x3 x6 x2
+ */
+    
+movq_m2r (*(tab_i_01234567 + 16), mm1);	/* 1 ; w22 w20 w18 w16
+ */
+    punpckldq_r2r (mm2, mm2);	/* x6 x2 x6 x2
+ */
+    
+pmaddwd_r2r (mm2, mm4);	/* x6*w07+x2*w05 x6*w03+x2*w01
+ */
+    punpckhdq_r2r (mm5, mm5);	/* x5 x1 x5 x1
+ */
+    
+pmaddwd_m2r (*(tab_i_01234567 + 8), mm0);	/* x4*w14+x0*w12 x4*w10+x0*w08
+ */
+    punpckhdq_r2r (mm6, mm6);	/* x7 x3 x7 x3
+ */
+    
+movq_m2r (*(tab_i_01234567 + 20), mm7);	/* 7 ; w23 w21 w19 w17
+ */
+    pmaddwd_r2r (mm5, mm1);	/* x5*w22+x1*w20 x5*w18+x1*w16
+ */
+    
+paddd_m2r (*(r_inv_col), mm3);	/* +rounder
+ */
+    pmaddwd_r2r (mm6, mm7);	/* x7*w23+x3*w21 x7*w19+x3*w17
+ */
+    
+pmaddwd_m2r (*(tab_i_01234567 + 12), mm2);	/* x6*w15+x2*w13 x6*w11+x2*w09
+ */
+    paddd_r2r (mm4, mm3);	/* 4 ; a1=sum(even1) a0=sum(even0)
+ */
+    
+pmaddwd_m2r (*(tab_i_01234567 + 24), mm5);	/* x5*w30+x1*w28 x5*w26+x1*w24
+ */
+    movq_r2r (mm3, mm4);	/* 4 ; a1 a0
+ */
+    
+pmaddwd_m2r (*(tab_i_01234567 + 28), mm6);	/* x7*w31+x3*w29 x7*w27+x3*w25
+ */
+    paddd_r2r (mm7, mm1);	/* 7 ; b1=sum(odd1) b0=sum(odd0)
+ */
+    
+paddd_m2r (*(r_inv_col), mm0);	/* +rounder
+ */
+    psubd_r2r (mm1, mm3);	/* a1-b1 a0-b0
+ */
+    
+psrad_i2r (SHIFT_INV_COL, mm3);	/* y6=a1-b1 y7=a0-b0
+ */
+    paddd_r2r (mm4, mm1);	/* 4 ; a1+b1 a0+b0
+ */
+    
+paddd_r2r (mm2, mm0);	/* 2 ; a3=sum(even3) a2=sum(even2)
+ */
+    psrad_i2r (SHIFT_INV_COL, mm1);	/* y1=a1+b1 y0=a0+b0
+ */
+    
+paddd_r2r (mm6, mm5);	/* 6 ; b3=sum(odd3) b2=sum(odd2)
+ */
+    movq_r2r (mm0, mm4);	/* 4 ; a3 a2
+ */
+    
+paddd_r2r (mm5, mm0);	/* a3+b3 a2+b2
+ */
+    psubd_r2r (mm5, mm4);	/* 5 ; a3-b3 a2-b2
+ */
+    
+psrad_i2r (SHIFT_INV_COL, mm4);	/* y4=a3-b3 y5=a2-b2
+ */
+    psrad_i2r (SHIFT_INV_COL, mm0);	/* y3=a3+b3 y2=a2+b2
+ */
+    
+packssdw_r2r (mm3, mm4);	/* 3 ; y6 y7 y4 y5
+ */
+    
+packssdw_r2r (mm0, mm1);	/* 0 ; y3 y2 y1 y0
+ */
+    movq_r2r (mm4, mm7);	/* 7 ; y6 y7 y4 y5
+ */
+    
+psrld_i2r (16, mm4);	/* 0 y6 0 y4
+ */
+    
+movq_r2m (mm1, *(inptr));	/* 1 ; save y3 y2 y1 y0
+ */
+    pslld_i2r (16, mm7);	/* y7 0 y5 0
+ */
+    
+por_r2r (mm4, mm7);	/* 4 ; y7 y6 y5 y4
+ */
+    
+	/* write the upper half (y7..y4) back to this row
+ */
+	movq_r2m (mm7, *(inptr + 4));	/* 7 ; save y7 y6 y5 y4
+ */
+    
+inptr += 8;
+  
+}
+  
+      /* done with the iDCT column-transformation
+ */
+}
+
+
 /*	
 */
 /* public interface to MMX32 IDCT 8x8 operation
 */
 /*
 */
-void
-gst_idct_mmx32_idct( short *blk )
-{
-	/* 1) iDCT row transformation
- */
-	idct_mmx32_rows( blk ); /* 1) transform iDCT row, and transpose
- */
-
-	/* 2) iDCT column transformation
- */
-	idct_mmx32_cols( blk ); /* 2) transform iDCT row, and transpose
- */
-
-	emms();  /* restore processor state
- */
-	/* all done
- */
-}
+void 
+gst_idct_mmx32_idct (short *blk) 
+{
+  
+      /* 1) iDCT row transformation
+ */
+      idct_mmx32_rows (blk);	/* 1) row iDCT into a temp buffer, then transpose back into blk
+ */
+  
+      /* 2) iDCT column transformation
+ */
+      idct_mmx32_cols (blk);	/* 2) row iDCT over the transposed blk, in place (no transpose)
+ */
+  
+emms ();			/* restore processor state
+ */
+  /* all done
+ */
+}