summary | refs | log | tree | commit | diff | stats
path: root/gst-libs/gst/idct
diff options
context:
space:
mode:
Diffstat (limited to 'gst-libs/gst/idct')
-rw-r--r-- gst-libs/gst/idct/dct.h 5
-rw-r--r-- gst-libs/gst/idct/fastintidct.c 156
-rw-r--r-- gst-libs/gst/idct/floatidct.c 42
-rw-r--r-- gst-libs/gst/idct/idct.c 160
-rw-r--r-- gst-libs/gst/idct/idct.h 23
-rw-r--r-- gst-libs/gst/idct/ieeetest.c 293
-rw-r--r-- gst-libs/gst/idct/intidct.c 205
-rw-r--r-- gst-libs/gst/idct/mmx32idct.c 993
8 files changed, 908 insertions, 969 deletions
diff --git a/gst-libs/gst/idct/dct.h b/gst-libs/gst/idct/dct.h
index efb3ddb3..c2e37449 100644
--- a/gst-libs/gst/idct/dct.h
+++ b/gst-libs/gst/idct/dct.h
@@ -16,7 +16,7 @@ typedef DCTELEM DCTBLOCK[DCTSIZE2];
typedef long INT32; /* must be at least 32 bits */
-extern void gst_idct_int_idct();
+extern void gst_idct_int_idct ();
extern void gst_idct_init_fast_int_idct (void);
extern void gst_idct_fast_int_idct (short *block);
@@ -27,6 +27,5 @@ extern void gst_idct_mmx32_idct (short *block);
extern void gst_idct_sse_idct (short *block);
#endif /* HAVE_LIBMMX */
-extern void gst_idct_init_float_idct(void);
+extern void gst_idct_init_float_idct (void);
extern void gst_idct_float_idct (short *block);
-
diff --git a/gst-libs/gst/idct/fastintidct.c b/gst-libs/gst/idct/fastintidct.c
index 27426672..9bb1436d 100644
--- a/gst-libs/gst/idct/fastintidct.c
+++ b/gst-libs/gst/idct/fastintidct.c
@@ -45,17 +45,17 @@
/* this code assumes >> to be a two's-complement arithmetic */
/* right shift: (-2)>>1 == -1 , (-3)>>1 == -2 */
-#define W1 2841 /* 2048*sqrt(2)*cos(1*pi/16) */
-#define W2 2676 /* 2048*sqrt(2)*cos(2*pi/16) */
-#define W3 2408 /* 2048*sqrt(2)*cos(3*pi/16) */
-#define W5 1609 /* 2048*sqrt(2)*cos(5*pi/16) */
-#define W6 1108 /* 2048*sqrt(2)*cos(6*pi/16) */
-#define W7 565 /* 2048*sqrt(2)*cos(7*pi/16) */
+#define W1 2841 /* 2048*sqrt(2)*cos(1*pi/16) */
+#define W2 2676 /* 2048*sqrt(2)*cos(2*pi/16) */
+#define W3 2408 /* 2048*sqrt(2)*cos(3*pi/16) */
+#define W5 1609 /* 2048*sqrt(2)*cos(5*pi/16) */
+#define W6 1108 /* 2048*sqrt(2)*cos(6*pi/16) */
+#define W7 565 /* 2048*sqrt(2)*cos(7*pi/16) */
#include "dct.h"
/* private data */
-static short iclip[1024]; /* clipping table */
+static short iclip[1024]; /* clipping table */
static short *iclp;
/* private prototypes */
@@ -72,57 +72,58 @@ static void idctcol (short *blk);
* c[1..7] = 128*sqrt(2)
*/
-static void idctrow(blk)
-short *blk;
+static void
+idctrow (blk)
+ short *blk;
{
int x0, x1, x2, x3, x4, x5, x6, x7, x8;
/* shortcut */
- if (!((x1 = blk[4]<<11) | (x2 = blk[6]) | (x3 = blk[2]) |
- (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3])))
- {
- blk[0]=blk[1]=blk[2]=blk[3]=blk[4]=blk[5]=blk[6]=blk[7]=blk[0]<<3;
+ if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) |
+ (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) {
+ blk[0] = blk[1] = blk[2] = blk[3] = blk[4] = blk[5] = blk[6] = blk[7] =
+ blk[0] << 3;
return;
}
- x0 = (blk[0]<<11) + 128; /* for proper rounding in the fourth stage */
+ x0 = (blk[0] << 11) + 128; /* for proper rounding in the fourth stage */
/* first stage */
- x8 = W7*(x4+x5);
- x4 = x8 + (W1-W7)*x4;
- x5 = x8 - (W1+W7)*x5;
- x8 = W3*(x6+x7);
- x6 = x8 - (W3-W5)*x6;
- x7 = x8 - (W3+W5)*x7;
-
+ x8 = W7 * (x4 + x5);
+ x4 = x8 + (W1 - W7) * x4;
+ x5 = x8 - (W1 + W7) * x5;
+ x8 = W3 * (x6 + x7);
+ x6 = x8 - (W3 - W5) * x6;
+ x7 = x8 - (W3 + W5) * x7;
+
/* second stage */
x8 = x0 + x1;
x0 -= x1;
- x1 = W6*(x3+x2);
- x2 = x1 - (W2+W6)*x2;
- x3 = x1 + (W2-W6)*x3;
+ x1 = W6 * (x3 + x2);
+ x2 = x1 - (W2 + W6) * x2;
+ x3 = x1 + (W2 - W6) * x3;
x1 = x4 + x6;
x4 -= x6;
x6 = x5 + x7;
x5 -= x7;
-
+
/* third stage */
x7 = x8 + x3;
x8 -= x3;
x3 = x0 + x2;
x0 -= x2;
- x2 = (181*(x4+x5)+128)>>8;
- x4 = (181*(x4-x5)+128)>>8;
-
+ x2 = (181 * (x4 + x5) + 128) >> 8;
+ x4 = (181 * (x4 - x5) + 128) >> 8;
+
/* fourth stage */
- blk[0] = (x7+x1)>>8;
- blk[1] = (x3+x2)>>8;
- blk[2] = (x0+x4)>>8;
- blk[3] = (x8+x6)>>8;
- blk[4] = (x8-x6)>>8;
- blk[5] = (x0-x4)>>8;
- blk[6] = (x3-x2)>>8;
- blk[7] = (x7-x1)>>8;
+ blk[0] = (x7 + x1) >> 8;
+ blk[1] = (x3 + x2) >> 8;
+ blk[2] = (x0 + x4) >> 8;
+ blk[3] = (x8 + x6) >> 8;
+ blk[4] = (x8 - x6) >> 8;
+ blk[5] = (x0 - x4) >> 8;
+ blk[6] = (x3 - x2) >> 8;
+ blk[7] = (x7 - x1) >> 8;
}
/* column (vertical) IDCT
@@ -134,78 +135,81 @@ short *blk;
* where: c[0] = 1/1024
* c[1..7] = (1/1024)*sqrt(2)
*/
-static void idctcol(blk)
-short *blk;
+static void
+idctcol (blk)
+ short *blk;
{
int x0, x1, x2, x3, x4, x5, x6, x7, x8;
/* shortcut */
- if (!((x1 = (blk[8*4]<<8)) | (x2 = blk[8*6]) | (x3 = blk[8*2]) |
- (x4 = blk[8*1]) | (x5 = blk[8*7]) | (x6 = blk[8*5]) | (x7 = blk[8*3])))
- {
- blk[8*0]=blk[8*1]=blk[8*2]=blk[8*3]=blk[8*4]=blk[8*5]=blk[8*6]=blk[8*7]=
- iclp[(blk[8*0]+32)>>6];
+ if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) |
+ (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) | (x7 =
+ blk[8 * 3]))) {
+ blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3] = blk[8 * 4] =
+ blk[8 * 5] = blk[8 * 6] = blk[8 * 7] = iclp[(blk[8 * 0] + 32) >> 6];
return;
}
- x0 = (blk[8*0]<<8) + 8192;
+ x0 = (blk[8 * 0] << 8) + 8192;
/* first stage */
- x8 = W7*(x4+x5) + 4;
- x4 = (x8+(W1-W7)*x4)>>3;
- x5 = (x8-(W1+W7)*x5)>>3;
- x8 = W3*(x6+x7) + 4;
- x6 = (x8-(W3-W5)*x6)>>3;
- x7 = (x8-(W3+W5)*x7)>>3;
-
+ x8 = W7 * (x4 + x5) + 4;
+ x4 = (x8 + (W1 - W7) * x4) >> 3;
+ x5 = (x8 - (W1 + W7) * x5) >> 3;
+ x8 = W3 * (x6 + x7) + 4;
+ x6 = (x8 - (W3 - W5) * x6) >> 3;
+ x7 = (x8 - (W3 + W5) * x7) >> 3;
+
/* second stage */
x8 = x0 + x1;
x0 -= x1;
- x1 = W6*(x3+x2) + 4;
- x2 = (x1-(W2+W6)*x2)>>3;
- x3 = (x1+(W2-W6)*x3)>>3;
+ x1 = W6 * (x3 + x2) + 4;
+ x2 = (x1 - (W2 + W6) * x2) >> 3;
+ x3 = (x1 + (W2 - W6) * x3) >> 3;
x1 = x4 + x6;
x4 -= x6;
x6 = x5 + x7;
x5 -= x7;
-
+
/* third stage */
x7 = x8 + x3;
x8 -= x3;
x3 = x0 + x2;
x0 -= x2;
- x2 = (181*(x4+x5)+128)>>8;
- x4 = (181*(x4-x5)+128)>>8;
-
+ x2 = (181 * (x4 + x5) + 128) >> 8;
+ x4 = (181 * (x4 - x5) + 128) >> 8;
+
/* fourth stage */
- blk[8*0] = iclp[(x7+x1)>>14];
- blk[8*1] = iclp[(x3+x2)>>14];
- blk[8*2] = iclp[(x0+x4)>>14];
- blk[8*3] = iclp[(x8+x6)>>14];
- blk[8*4] = iclp[(x8-x6)>>14];
- blk[8*5] = iclp[(x0-x4)>>14];
- blk[8*6] = iclp[(x3-x2)>>14];
- blk[8*7] = iclp[(x7-x1)>>14];
+ blk[8 * 0] = iclp[(x7 + x1) >> 14];
+ blk[8 * 1] = iclp[(x3 + x2) >> 14];
+ blk[8 * 2] = iclp[(x0 + x4) >> 14];
+ blk[8 * 3] = iclp[(x8 + x6) >> 14];
+ blk[8 * 4] = iclp[(x8 - x6) >> 14];
+ blk[8 * 5] = iclp[(x0 - x4) >> 14];
+ blk[8 * 6] = iclp[(x3 - x2) >> 14];
+ blk[8 * 7] = iclp[(x7 - x1) >> 14];
}
/* two dimensional inverse discrete cosine transform */
-void gst_idct_fast_int_idct(block)
-short *block;
+void
+gst_idct_fast_int_idct (block)
+ short *block;
{
int i;
- for (i=0; i<8; i++)
- idctrow(block+8*i);
+ for (i = 0; i < 8; i++)
+ idctrow (block + 8 * i);
- for (i=0; i<8; i++)
- idctcol(block+i);
+ for (i = 0; i < 8; i++)
+ idctcol (block + i);
}
-void gst_idct_init_fast_int_idct()
+void
+gst_idct_init_fast_int_idct ()
{
int i;
- iclp = iclip+512;
- for (i= -512; i<512; i++)
- iclp[i] = (i<-256) ? -256 : ((i>255) ? 255 : i);
+ iclp = iclip + 512;
+ for (i = -512; i < 512; i++)
+ iclp[i] = (i < -256) ? -256 : ((i > 255) ? 255 : i);
}
diff --git a/gst-libs/gst/idct/floatidct.c b/gst-libs/gst/idct/floatidct.c
index b215bd78..0fa1e830 100644
--- a/gst-libs/gst/idct/floatidct.c
+++ b/gst-libs/gst/idct/floatidct.c
@@ -56,51 +56,51 @@ static double gst_idct_float_c[8][8];
/* initialize DCT coefficient matrix */
-void gst_idct_init_float_idct()
+void
+gst_idct_init_float_idct ()
{
int freq, time;
double scale;
- for (freq=0; freq < 8; freq++)
- {
- scale = (freq == 0) ? sqrt(0.125) : 0.5;
- for (time=0; time<8; time++)
- gst_idct_float_c[freq][time] = scale*cos((PI/8.0)*freq*(time + 0.5));
+ for (freq = 0; freq < 8; freq++) {
+ scale = (freq == 0) ? sqrt (0.125) : 0.5;
+ for (time = 0; time < 8; time++)
+ gst_idct_float_c[freq][time] =
+ scale * cos ((PI / 8.0) * freq * (time + 0.5));
}
}
/* perform IDCT matrix multiply for 8x8 coefficient block */
-void gst_idct_float_idct(block)
-short *block;
+void
+gst_idct_float_idct (block)
+ short *block;
{
int i, j, k, v;
double partial_product;
double tmp[64];
- for (i=0; i<8; i++)
- for (j=0; j<8; j++)
- {
+ for (i = 0; i < 8; i++)
+ for (j = 0; j < 8; j++) {
partial_product = 0.0;
- for (k=0; k<8; k++)
- partial_product+= gst_idct_float_c[k][j]*block[8*i+k];
+ for (k = 0; k < 8; k++)
+ partial_product += gst_idct_float_c[k][j] * block[8 * i + k];
- tmp[8*i+j] = partial_product;
+ tmp[8 * i + j] = partial_product;
}
/* Transpose operation is integrated into address mapping by switching
loop order of i and j */
- for (j=0; j<8; j++)
- for (i=0; i<8; i++)
- {
+ for (j = 0; j < 8; j++)
+ for (i = 0; i < 8; i++) {
partial_product = 0.0;
- for (k=0; k<8; k++)
- partial_product+= gst_idct_float_c[k][i]*tmp[8*k+j];
+ for (k = 0; k < 8; k++)
+ partial_product += gst_idct_float_c[k][i] * tmp[8 * k + j];
- v = (int) floor(partial_product+0.5);
- block[8*i+j] = (v<-256) ? -256 : ((v>255) ? 255 : v);
+ v = (int) floor (partial_product + 0.5);
+ block[8 * i + j] = (v < -256) ? -256 : ((v > 255) ? 255 : v);
}
}
diff --git a/gst-libs/gst/idct/idct.c b/gst-libs/gst/idct/idct.c
index 59c6a844..4be150f1 100644
--- a/gst-libs/gst/idct/idct.c
+++ b/gst-libs/gst/idct/idct.c
@@ -25,24 +25,25 @@
#include <gst/idct/idct.h>
#include "dct.h"
-static void gst_idct_int_sparse_idct(short *data);
+static void gst_idct_int_sparse_idct (short *data);
-GstIDCT *gst_idct_new(GstIDCTMethod method)
+GstIDCT *
+gst_idct_new (GstIDCTMethod method)
{
- GstIDCT *new = g_malloc(sizeof(GstIDCT));
+ GstIDCT *new = g_malloc (sizeof (GstIDCT));
new->need_transpose = FALSE;
if (method == GST_IDCT_DEFAULT) {
#ifdef HAVE_LIBMMX
- if (gst_cpu_get_flags() & GST_CPU_FLAG_MMX) {
+ if (gst_cpu_get_flags () & GST_CPU_FLAG_MMX) {
method = GST_IDCT_MMX;
}
/* disabled for now
- if (gst_cpu_get_flags() & GST_CPU_FLAG_SSE) {
- method = GST_IDCT_SSE;
- }
- */
+ if (gst_cpu_get_flags() & GST_CPU_FLAG_SSE) {
+ method = GST_IDCT_SSE;
+ }
+ */
else
#endif /* HAVE_LIBMMX */
{
@@ -53,49 +54,50 @@ GstIDCT *gst_idct_new(GstIDCTMethod method)
new->convert_sparse = gst_idct_int_sparse_idct;
switch (method) {
- case GST_IDCT_FAST_INT:
- GST_INFO ( "using fast_int_idct");
- gst_idct_init_fast_int_idct();
- new->convert = gst_idct_fast_int_idct;
- break;
- case GST_IDCT_INT:
- GST_INFO ( "using int_idct");
- new->convert = gst_idct_int_idct;
- break;
- case GST_IDCT_FLOAT:
- GST_INFO ( "using float_idct");
- gst_idct_init_float_idct();
- new->convert = gst_idct_float_idct;
- break;
+ case GST_IDCT_FAST_INT:
+ GST_INFO ("using fast_int_idct");
+ gst_idct_init_fast_int_idct ();
+ new->convert = gst_idct_fast_int_idct;
+ break;
+ case GST_IDCT_INT:
+ GST_INFO ("using int_idct");
+ new->convert = gst_idct_int_idct;
+ break;
+ case GST_IDCT_FLOAT:
+ GST_INFO ("using float_idct");
+ gst_idct_init_float_idct ();
+ new->convert = gst_idct_float_idct;
+ break;
#ifdef HAVE_LIBMMX
- case GST_IDCT_MMX:
- GST_INFO ( "using MMX_idct");
- new->convert = gst_idct_mmx_idct;
- new->need_transpose = TRUE;
- break;
- case GST_IDCT_MMX32:
- GST_INFO ( "using MMX32_idct");
- new->convert = gst_idct_mmx32_idct;
- new->need_transpose = TRUE;
- break;
- case GST_IDCT_SSE:
- GST_INFO ( "using SSE_idct");
- new->convert = gst_idct_sse_idct;
- new->need_transpose = TRUE;
- break;
+ case GST_IDCT_MMX:
+ GST_INFO ("using MMX_idct");
+ new->convert = gst_idct_mmx_idct;
+ new->need_transpose = TRUE;
+ break;
+ case GST_IDCT_MMX32:
+ GST_INFO ("using MMX32_idct");
+ new->convert = gst_idct_mmx32_idct;
+ new->need_transpose = TRUE;
+ break;
+ case GST_IDCT_SSE:
+ GST_INFO ("using SSE_idct");
+ new->convert = gst_idct_sse_idct;
+ new->need_transpose = TRUE;
+ break;
#endif /* HAVE_LIBMMX */
- default:
- GST_INFO ( "method not supported");
- g_free(new);
- return NULL;
+ default:
+ GST_INFO ("method not supported");
+ g_free (new);
+ return NULL;
}
return new;
}
-static void gst_idct_int_sparse_idct(short *data)
+static void
+gst_idct_int_sparse_idct (short *data)
{
short val;
- gint32 v, *dp = (guint32 *)data;
+ gint32 v, *dp = (guint32 *) data;
v = *data;
@@ -104,43 +106,61 @@ static void gst_idct_int_sparse_idct(short *data)
val += (8 >> 1);
val /= 8;
val = -val;
- }
- else {
+ } else {
val = (v + (8 >> 1)) / 8;
}
- v = (( val & 0xffff) | (val << 16));
-
- dp[0] = v; dp[1] = v; dp[2] = v; dp[3] = v;
- dp[4] = v; dp[5] = v; dp[6] = v; dp[7] = v;
- dp[8] = v; dp[9] = v; dp[10] = v; dp[11] = v;
- dp[12] = v; dp[13] = v; dp[14] = v; dp[15] = v;
- dp[16] = v; dp[17] = v; dp[18] = v; dp[19] = v;
- dp[20] = v; dp[21] = v; dp[22] = v; dp[23] = v;
- dp[24] = v; dp[25] = v; dp[26] = v; dp[27] = v;
- dp[28] = v; dp[29] = v; dp[30] = v; dp[31] = v;
+ v = ((val & 0xffff) | (val << 16));
+
+ dp[0] = v;
+ dp[1] = v;
+ dp[2] = v;
+ dp[3] = v;
+ dp[4] = v;
+ dp[5] = v;
+ dp[6] = v;
+ dp[7] = v;
+ dp[8] = v;
+ dp[9] = v;
+ dp[10] = v;
+ dp[11] = v;
+ dp[12] = v;
+ dp[13] = v;
+ dp[14] = v;
+ dp[15] = v;
+ dp[16] = v;
+ dp[17] = v;
+ dp[18] = v;
+ dp[19] = v;
+ dp[20] = v;
+ dp[21] = v;
+ dp[22] = v;
+ dp[23] = v;
+ dp[24] = v;
+ dp[25] = v;
+ dp[26] = v;
+ dp[27] = v;
+ dp[28] = v;
+ dp[29] = v;
+ dp[30] = v;
+ dp[31] = v;
}
-void gst_idct_destroy(GstIDCT *idct)
+void
+gst_idct_destroy (GstIDCT * idct)
{
- g_return_if_fail(idct != NULL);
+ g_return_if_fail (idct != NULL);
- g_free(idct);
+ g_free (idct);
}
static gboolean
-plugin_init (GstPlugin *plugin)
+plugin_init (GstPlugin * plugin)
{
return TRUE;
}
-GST_PLUGIN_DEFINE (
- GST_VERSION_MAJOR,
- GST_VERSION_MINOR,
- "gstidct",
- "Accelerated IDCT routines",
- plugin_init,
- VERSION,
- GST_LICENSE,
- GST_PACKAGE,
- GST_ORIGIN
-)
+GST_PLUGIN_DEFINE (GST_VERSION_MAJOR,
+ GST_VERSION_MINOR,
+ "gstidct",
+ "Accelerated IDCT routines",
+ plugin_init, VERSION, GST_LICENSE, GST_PACKAGE, GST_ORIGIN)
diff --git a/gst-libs/gst/idct/idct.h b/gst-libs/gst/idct/idct.h
index fa6f62cd..37a2a0b9 100644
--- a/gst-libs/gst/idct/idct.h
+++ b/gst-libs/gst/idct/idct.h
@@ -23,22 +23,24 @@
#include <glib.h>
-typedef enum {
- GST_IDCT_DEFAULT,
- GST_IDCT_INT,
- GST_IDCT_FAST_INT,
- GST_IDCT_FLOAT,
- GST_IDCT_MMX,
+typedef enum
+{
+ GST_IDCT_DEFAULT,
+ GST_IDCT_INT,
+ GST_IDCT_FAST_INT,
+ GST_IDCT_FLOAT,
+ GST_IDCT_MMX,
GST_IDCT_MMX32,
GST_IDCT_SSE,
} GstIDCTMethod;
typedef struct _GstIDCT GstIDCT;
-typedef void (*GstIDCTFunction) (gshort *block);
+typedef void (*GstIDCTFunction) (gshort * block);
#define GST_IDCT_TRANSPOSE(idct) ((idct)->need_transpose)
-struct _GstIDCT {
+struct _GstIDCT
+{
/* private */
GstIDCTFunction convert;
GstIDCTFunction convert_sparse;
@@ -46,9 +48,10 @@ struct _GstIDCT {
};
-GstIDCT *gst_idct_new(GstIDCTMethod method);
+GstIDCT *gst_idct_new (GstIDCTMethod method);
+
#define gst_idct_convert(idct, blocks) (idct)->convert((blocks))
#define gst_idct_convert_sparse(idct, blocks) (idct)->convert_sparse((blocks))
-void gst_idct_destroy(GstIDCT *idct);
+void gst_idct_destroy (GstIDCT * idct);
#endif /* __GST_IDCT_H__ */
diff --git a/gst-libs/gst/idct/ieeetest.c b/gst-libs/gst/idct/ieeetest.c
index f5b270eb..d26181c1 100644
--- a/gst-libs/gst/idct/ieeetest.c
+++ b/gst-libs/gst/idct/ieeetest.c
@@ -27,9 +27,9 @@
void usage (char *msg);
long ieeerand (long L, long H);
-void dct_init(void);
-void ref_fdct(DCTELEM block[8][8]);
-void ref_idct(DCTELEM block[8][8]);
+void dct_init (void);
+void ref_fdct (DCTELEM block[8][8]);
+void ref_idct (DCTELEM block[8][8]);
/* error stat accumulators -- assume initialized to 0 */
@@ -38,47 +38,49 @@ long sumsqerrs[DCTSIZE2];
int maxerr[DCTSIZE2];
-char * meets (double val, double limit)
+char *
+meets (double val, double limit)
{
- return ((fabs(val) <= limit) ? "meets" : "FAILS");
+ return ((fabs (val) <= limit) ? "meets" : "FAILS");
}
int
-main(int argc, char **argv)
+main (int argc, char **argv)
{
long minpix, maxpix, sign;
long curiter, niters;
int i, j;
double max, total;
int method;
- DCTELEM block[DCTSIZE2]; /* random source data */
- DCTELEM refcoefs[DCTSIZE2]; /* coefs from reference FDCT */
- DCTELEM refout[DCTSIZE2]; /* output from reference IDCT */
- DCTELEM testout[DCTSIZE2]; /* output from test IDCT */
- GstIDCT *idct;
- guint64 tscstart, tscmin = ~0, tscmax = 0;
- guint64 tscstop;
+ DCTELEM block[DCTSIZE2]; /* random source data */
+ DCTELEM refcoefs[DCTSIZE2]; /* coefs from reference FDCT */
+ DCTELEM refout[DCTSIZE2]; /* output from reference IDCT */
+ DCTELEM testout[DCTSIZE2]; /* output from test IDCT */
+ GstIDCT *idct;
+ guint64 tscstart, tscmin = ~0, tscmax = 0;
+ guint64 tscstop;
/* Argument parsing --- not very bulletproof at all */
- if (argc != 6) usage(NULL);
+ if (argc != 6)
+ usage (NULL);
- method = atoi(argv[1]);
- minpix = atoi(argv[2]);
- maxpix = atoi(argv[3]);
- sign = atoi(argv[4]);
- niters = atol(argv[5]);
+ method = atoi (argv[1]);
+ minpix = atoi (argv[2]);
+ maxpix = atoi (argv[3]);
+ sign = atoi (argv[4]);
+ niters = atol (argv[5]);
- gst_library_load("gstidct");
+ gst_library_load ("gstidct");
- idct = gst_idct_new(method);
+ idct = gst_idct_new (method);
if (idct == 0) {
- printf("method not available\n\n\n");
+ printf ("method not available\n\n\n");
return 0;
}
- dct_init();
+ dct_init ();
/* Loop once per generated random-data block */
@@ -86,164 +88,186 @@ main(int argc, char **argv)
/* generate a pseudo-random block of data */
for (i = 0; i < DCTSIZE2; i++)
- block[i] = (DCTELEM) (ieeerand(-minpix,maxpix) * sign);
+ block[i] = (DCTELEM) (ieeerand (-minpix, maxpix) * sign);
/* perform reference FDCT */
- memcpy(refcoefs, block, sizeof(DCTELEM)*DCTSIZE2);
- ref_fdct((DCTELEM **) &refcoefs);
+ memcpy (refcoefs, block, sizeof (DCTELEM) * DCTSIZE2);
+ ref_fdct ((DCTELEM **) & refcoefs);
/* clip */
for (i = 0; i < DCTSIZE2; i++) {
- if (refcoefs[i] < -2048) refcoefs[i] = -2048;
- else if (refcoefs[i] > 2047) refcoefs[i] = 2047;
+ if (refcoefs[i] < -2048)
+ refcoefs[i] = -2048;
+ else if (refcoefs[i] > 2047)
+ refcoefs[i] = 2047;
}
/* perform reference IDCT */
- memcpy(refout, refcoefs, sizeof(DCTELEM)*DCTSIZE2);
- ref_idct(refout);
+ memcpy (refout, refcoefs, sizeof (DCTELEM) * DCTSIZE2);
+ ref_idct (refout);
/* clip */
for (i = 0; i < DCTSIZE2; i++) {
- if (refout[i] < -256) refout[i] = -256;
- else if (refout[i] > 255) refout[i] = 255;
+ if (refout[i] < -256)
+ refout[i] = -256;
+ else if (refout[i] > 255)
+ refout[i] = 255;
}
/* perform test IDCT */
- if (GST_IDCT_TRANSPOSE(idct)) {
+ if (GST_IDCT_TRANSPOSE (idct)) {
for (j = 0; j < DCTSIZE; j++) {
- for (i = 0; i < DCTSIZE; i++) {
- testout[i*DCTSIZE+j] = refcoefs[j*DCTSIZE+i];
- }
- }
- }
- else {
- memcpy(testout, refcoefs, sizeof(DCTELEM)*DCTSIZE2);
- }
-
- gst_trace_read_tsc(&tscstart);
- gst_idct_convert(idct, testout);
- gst_trace_read_tsc(&tscstop);
- /*printf("time %llu, %llu %lld\n", tscstart, tscstop, tscstop-tscstart); */
- if (tscstop - tscstart < tscmin) tscmin = tscstop-tscstart;
- if (tscstop - tscstart > tscmax) tscmax = tscstop-tscstart;
+ for (i = 0; i < DCTSIZE; i++) {
+ testout[i * DCTSIZE + j] = refcoefs[j * DCTSIZE + i];
+ }
+ }
+ } else {
+ memcpy (testout, refcoefs, sizeof (DCTELEM) * DCTSIZE2);
+ }
+
+ gst_trace_read_tsc (&tscstart);
+ gst_idct_convert (idct, testout);
+ gst_trace_read_tsc (&tscstop);
+ /*printf("time %llu, %llu %lld\n", tscstart, tscstop, tscstop-tscstart); */
+ if (tscstop - tscstart < tscmin)
+ tscmin = tscstop - tscstart;
+ if (tscstop - tscstart > tscmax)
+ tscmax = tscstop - tscstart;
/* clip */
for (i = 0; i < DCTSIZE2; i++) {
- if (testout[i] < -256) testout[i] = -256;
- else if (testout[i] > 255) testout[i] = 255;
+ if (testout[i] < -256)
+ testout[i] = -256;
+ else if (testout[i] > 255)
+ testout[i] = 255;
}
/* accumulate error stats */
for (i = 0; i < DCTSIZE2; i++) {
register int err = testout[i] - refout[i];
+
sumerrs[i] += err;
sumsqerrs[i] += err * err;
- if (err < 0) err = -err;
- if (maxerr[i] < err) maxerr[i] = err;
+ if (err < 0)
+ err = -err;
+ if (maxerr[i] < err)
+ maxerr[i] = err;
}
if (curiter % 100 == 99) {
- fprintf(stderr, ".");
- fflush(stderr);
+ fprintf (stderr, ".");
+ fflush (stderr);
}
}
- fprintf(stderr, "\n");
+ fprintf (stderr, "\n");
/* print results */
- printf("IEEE test conditions: -L = %ld, +H = %ld, sign = %ld, #iters = %ld\n",
- minpix, maxpix, sign, niters);
+ printf
+ ("IEEE test conditions: -L = %ld, +H = %ld, sign = %ld, #iters = %ld\n",
+ minpix, maxpix, sign, niters);
- printf("Speed, min time %lld, max %lld\n", tscmin, tscmax);
+ printf ("Speed, min time %lld, max %lld\n", tscmin, tscmax);
- printf("Peak absolute values of errors:\n");
+ printf ("Peak absolute values of errors:\n");
for (i = 0, j = 0; i < DCTSIZE2; i++) {
- if (j < maxerr[i]) j = maxerr[i];
- printf("%4d", maxerr[i]);
- if ((i%DCTSIZE) == DCTSIZE-1) printf("\n");
+ if (j < maxerr[i])
+ j = maxerr[i];
+ printf ("%4d", maxerr[i]);
+ if ((i % DCTSIZE) == DCTSIZE - 1)
+ printf ("\n");
}
- printf("Worst peak error = %d (%s spec limit 1)\n\n", j,
- meets((double) j, 1.0));
+ printf ("Worst peak error = %d (%s spec limit 1)\n\n", j,
+ meets ((double) j, 1.0));
- printf("Mean square errors:\n");
+ printf ("Mean square errors:\n");
max = total = 0.0;
for (i = 0; i < DCTSIZE2; i++) {
- double err = (double) sumsqerrs[i] / ((double) niters);
+ double err = (double) sumsqerrs[i] / ((double) niters);
+
total += (double) sumsqerrs[i];
- if (max < err) max = err;
- printf(" %8.4f", err);
- if ((i%DCTSIZE) == DCTSIZE-1) printf("\n");
+ if (max < err)
+ max = err;
+ printf (" %8.4f", err);
+ if ((i % DCTSIZE) == DCTSIZE - 1)
+ printf ("\n");
}
- printf("Worst pmse = %.6f (%s spec limit 0.06)\n", max, meets(max, 0.06));
- total /= (double) (64*niters);
- printf("Overall mse = %.6f (%s spec limit 0.02)\n\n", total,
- meets(total, 0.02));
+ printf ("Worst pmse = %.6f (%s spec limit 0.06)\n", max, meets (max, 0.06));
+ total /= (double) (64 * niters);
+ printf ("Overall mse = %.6f (%s spec limit 0.02)\n\n", total,
+ meets (total, 0.02));
- printf("Mean errors:\n");
+ printf ("Mean errors:\n");
max = total = 0.0;
for (i = 0; i < DCTSIZE2; i++) {
- double err = (double) sumerrs[i] / ((double) niters);
+ double err = (double) sumerrs[i] / ((double) niters);
+
total += (double) sumerrs[i];
- printf(" %8.4f", err);
- if (err < 0.0) err = -err;
- if (max < err) max = err;
- if ((i%DCTSIZE) == DCTSIZE-1) printf("\n");
+ printf (" %8.4f", err);
+ if (err < 0.0)
+ err = -err;
+ if (max < err)
+ max = err;
+ if ((i % DCTSIZE) == DCTSIZE - 1)
+ printf ("\n");
}
- printf("Worst mean error = %.6f (%s spec limit 0.015)\n", max,
- meets(max, 0.015));
- total /= (double) (64*niters);
- printf("Overall mean error = %.6f (%s spec limit 0.0015)\n\n", total,
- meets(total, 0.0015));
+ printf ("Worst mean error = %.6f (%s spec limit 0.015)\n", max,
+ meets (max, 0.015));
+ total /= (double) (64 * niters);
+ printf ("Overall mean error = %.6f (%s spec limit 0.0015)\n\n", total,
+ meets (total, 0.0015));
/* test for 0 input giving 0 output */
- memset(testout, 0, sizeof(DCTELEM)*DCTSIZE2);
- gst_idct_convert(idct, testout);
- for (i = 0, j=0; i < DCTSIZE2; i++) {
+ memset (testout, 0, sizeof (DCTELEM) * DCTSIZE2);
+ gst_idct_convert (idct, testout);
+ for (i = 0, j = 0; i < DCTSIZE2; i++) {
if (testout[i]) {
- printf("Position %d of IDCT(0) = %d (FAILS)\n", i, testout[i]);
+ printf ("Position %d of IDCT(0) = %d (FAILS)\n", i, testout[i]);
j++;
}
}
- printf("%d elements of IDCT(0) were not zero\n\n\n", j);
+ printf ("%d elements of IDCT(0) were not zero\n\n\n", j);
- exit(0);
+ exit (0);
return 0;
}
-void usage (char *msg)
+void
+usage (char *msg)
{
if (msg != NULL)
- fprintf(stderr, "\nerror: %s\n", msg);
-
- fprintf(stderr, "\n");
- fprintf(stderr, "usage: ieeetest minpix maxpix sign niters\n");
- fprintf(stderr, "\n");
- fprintf(stderr, " test = 1 - 5\n");
- fprintf(stderr, " minpix = -L value per IEEE spec\n");
- fprintf(stderr, " maxpix = H value per IEEE spec\n");
- fprintf(stderr, " sign = +1 for normal, -1 to run negated test\n");
- fprintf(stderr, " niters = # iterations (10000 for full test)\n");
- fprintf(stderr, "\n");
-
- exit(1);
+ fprintf (stderr, "\nerror: %s\n", msg);
+
+ fprintf (stderr, "\n");
+ fprintf (stderr, "usage: ieeetest minpix maxpix sign niters\n");
+ fprintf (stderr, "\n");
+ fprintf (stderr, " test = 1 - 5\n");
+ fprintf (stderr, " minpix = -L value per IEEE spec\n");
+ fprintf (stderr, " maxpix = H value per IEEE spec\n");
+ fprintf (stderr, " sign = +1 for normal, -1 to run negated test\n");
+ fprintf (stderr, " niters = # iterations (10000 for full test)\n");
+ fprintf (stderr, "\n");
+
+ exit (1);
}
/* Pseudo-random generator specified by IEEE 1180 */
-long ieeerand (long L, long H)
+long
+ieeerand (long L, long H)
{
static long randx = 1;
static double z = (double) 0x7fffffff;
- long i,j;
+ long i, j;
double x;
randx = (randx * 1103515245) + 12345;
i = randx & 0x7ffffffe;
x = ((double) i) / z;
- x *= (L+H+1);
+ x *= (L + H + 1);
j = x;
- return j-L;
+ return j - L;
}
@@ -256,33 +280,35 @@ double coslu[8][8];
/* Routine to initialise the cosine lookup table */
-void dct_init(void)
+void
+dct_init (void)
{
- int a,b;
+ int a, b;
double tmp;
- for(a=0;a<8;a++)
- for(b=0;b<8;b++) {
- tmp = cos((double)((a+a+1)*b) * (3.14159265358979323846 / 16.0));
- if(b==0)
- tmp /= sqrt(2.0);
+ for (a = 0; a < 8; a++)
+ for (b = 0; b < 8; b++) {
+ tmp = cos ((double) ((a + a + 1) * b) * (3.14159265358979323846 / 16.0));
+ if (b == 0)
+ tmp /= sqrt (2.0);
coslu[a][b] = tmp * 0.5;
}
}
-void ref_fdct (DCTELEM block[8][8])
+void
+ref_fdct (DCTELEM block[8][8])
{
- int x,y,u,v;
+ int x, y, u, v;
double tmp, tmp2;
double res[8][8];
- for (v=0; v<8; v++) {
- for (u=0; u<8; u++) {
+ for (v = 0; v < 8; v++) {
+ for (u = 0; u < 8; u++) {
tmp = 0.0;
- for (y=0; y<8; y++) {
+ for (y = 0; y < 8; y++) {
tmp2 = 0.0;
- for (x=0; x<8; x++) {
+ for (x = 0; x < 8; x++) {
tmp2 += (double) block[y][x] * coslu[x][u];
}
tmp += coslu[y][v] * tmp2;
@@ -291,11 +317,11 @@ void ref_fdct (DCTELEM block[8][8])
}
}
- for (v=0; v<8; v++) {
- for (u=0; u<8; u++) {
+ for (v = 0; v < 8; v++) {
+ for (u = 0; u < 8; u++) {
tmp = res[v][u];
if (tmp < 0.0) {
- x = - ((int) (0.5 - tmp));
+ x = -((int) (0.5 - tmp));
} else {
x = (int) (tmp + 0.5);
}
@@ -305,18 +331,19 @@ void ref_fdct (DCTELEM block[8][8])
}
-void ref_idct (DCTELEM block[8][8])
+void
+ref_idct (DCTELEM block[8][8])
{
- int x,y,u,v;
+ int x, y, u, v;
double tmp, tmp2;
double res[8][8];
- for (y=0; y<8; y++) {
- for (x=0; x<8; x++) {
+ for (y = 0; y < 8; y++) {
+ for (x = 0; x < 8; x++) {
tmp = 0.0;
- for (v=0; v<8; v++) {
+ for (v = 0; v < 8; v++) {
tmp2 = 0.0;
- for (u=0; u<8; u++) {
+ for (u = 0; u < 8; u++) {
tmp2 += (double) block[v][u] * coslu[x][u];
}
tmp += coslu[y][v] * tmp2;
@@ -325,11 +352,11 @@ void ref_idct (DCTELEM block[8][8])
}
}
- for (v=0; v<8; v++) {
- for (u=0; u<8; u++) {
+ for (v = 0; v < 8; v++) {
+ for (u = 0; u < 8; u++) {
tmp = res[v][u];
if (tmp < 0.0) {
- x = - ((int) (0.5 - tmp));
+ x = -((int) (0.5 - tmp));
} else {
x = (int) (tmp + 0.5);
}
diff --git a/gst-libs/gst/idct/intidct.c b/gst-libs/gst/idct/intidct.c
index e08e6adb..42f0ac84 100644
--- a/gst-libs/gst/idct/intidct.c
+++ b/gst-libs/gst/idct/intidct.c
@@ -51,10 +51,8 @@
*/
#if DCTSIZE != 8
- Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
+Sorry, this code only copes with 8 x8 DCTs. /* deliberate syntax err */
#endif
-
-
/*
* A 2-D IDCT can be done by 1-D IDCT on each row followed by 1-D IDCT
* on each column. Direct algorithms are also available, but they are
@@ -90,7 +88,6 @@
* have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis
* shows that the values given below are the most effective.
*/
-
#ifdef EIGHT_BIT_SAMPLES
#define CONST_BITS 13
#define PASS1_BITS 2
@@ -98,22 +95,16 @@
#define CONST_BITS 13
#define PASS1_BITS 1 /* lose a little precision to avoid overflow */
#endif
-
#define ONE ((INT32) 1)
-
#define CONST_SCALE (ONE << CONST_BITS)
-
/* Convert a positive real constant to an integer scaled by CONST_SCALE. */
-
#define FIX(x) ((INT32) ((x) * CONST_SCALE + 0.5))
-
/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
* causing a lot of useless floating-point operations at run time.
* To get around this we use the following pre-calculated constants.
* If you change CONST_BITS you may want to add appropriate values.
* (With a reasonable C compiler, you can just rely on the FIX() macro...)
*/
-
#if CONST_BITS == 13
#define FIX_0_298631336 ((INT32) 2446) /* FIX(0.298631336) */
#define FIX_0_390180644 ((INT32) 3196) /* FIX(0.390180644) */
@@ -141,15 +132,11 @@
#define FIX_2_562915447 FIX(2.562915447)
#define FIX_3_072711026 FIX(3.072711026)
#endif
-
-
/* Descale and correctly round an INT32 value that's scaled by N bits.
* We assume RIGHT_SHIFT rounds towards minus infinity, so adding
* the fudge factor is correct for either sign of X.
*/
-
#define DESCALE(x,n) RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)
-
/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
* For 8-bit samples with the recommended scaling, all the variable
* and constant values involved are no more than 16 bits wide, so a
@@ -160,7 +147,6 @@
* combination of casts.
* NB: for 12-bit samples, a full 32-bit multiplication will be needed.
*/
-
#ifdef EIGHT_BIT_SAMPLES
#ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */
#define MULTIPLY(var,const) (((INT16) (var)) * ((INT16) (const)))
@@ -169,17 +155,13 @@
#define MULTIPLY(var,const) (((INT16) (var)) * ((INT32) (const)))
#endif
#endif
-
#ifndef MULTIPLY /* default definition */
#define MULTIPLY(var,const) ((var) * (const))
#endif
-
-
/*
* Perform the inverse DCT on one block of coefficients.
*/
-
-void
+ void
gst_idct_int_idct (DCTBLOCK data)
{
INT32 tmp0, tmp1, tmp2, tmp3;
@@ -187,14 +169,13 @@ gst_idct_int_idct (DCTBLOCK data)
INT32 z1, z2, z3, z4, z5;
register DCTELEM *dataptr;
int rowctr;
- SHIFT_TEMPS
-
- /* Pass 1: process rows. */
- /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
- /* furthermore, we scale the results by 2**PASS1_BITS. */
- dataptr = data;
- for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
+ SHIFT_TEMPS
+ /* Pass 1: process rows. */
+ /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
+ /* furthermore, we scale the results by 2**PASS1_BITS. */
+ dataptr = data;
+ for (rowctr = DCTSIZE - 1; rowctr >= 0; rowctr--) {
/* Due to quantization, we will usually find that many of the input
* coefficients are zero, especially the AC terms. We can exploit this
* by short-circuiting the IDCT calculation for any row in which all
@@ -205,10 +186,10 @@ gst_idct_int_idct (DCTBLOCK data)
*/
if ((dataptr[1] | dataptr[2] | dataptr[3] | dataptr[4] |
- dataptr[5] | dataptr[6] | dataptr[7]) == 0) {
+ dataptr[5] | dataptr[6] | dataptr[7]) == 0) {
/* AC terms all zero */
DCTELEM dcval = (DCTELEM) (dataptr[0] << PASS1_BITS);
-
+
dataptr[0] = dcval;
dataptr[1] = dcval;
dataptr[2] = dcval;
@@ -217,7 +198,7 @@ gst_idct_int_idct (DCTBLOCK data)
dataptr[5] = dcval;
dataptr[6] = dcval;
dataptr[7] = dcval;
-
+
dataptr += DCTSIZE; /* advance pointer to next row */
continue;
}
@@ -228,9 +209,9 @@ gst_idct_int_idct (DCTBLOCK data)
z2 = (INT32) dataptr[2];
z3 = (INT32) dataptr[6];
- z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
- tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
- tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
+ z1 = MULTIPLY (z2 + z3, FIX_0_541196100);
+ tmp2 = z1 + MULTIPLY (z3, -FIX_1_847759065);
+ tmp3 = z1 + MULTIPLY (z2, FIX_0_765366865);
tmp0 = ((INT32) dataptr[0] + (INT32) dataptr[4]) << CONST_BITS;
tmp1 = ((INT32) dataptr[0] - (INT32) dataptr[4]) << CONST_BITS;
@@ -239,7 +220,7 @@ gst_idct_int_idct (DCTBLOCK data)
tmp13 = tmp0 - tmp3;
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
-
+
/* Odd part per figure 8; the matrix is unitary and hence its
* transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
*/
@@ -253,20 +234,20 @@ gst_idct_int_idct (DCTBLOCK data)
z2 = tmp1 + tmp2;
z3 = tmp0 + tmp2;
z4 = tmp1 + tmp3;
- z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
-
- tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
- tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
- tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
- tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
- z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
- z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
- z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
- z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
-
+ z5 = MULTIPLY (z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
+
+ tmp0 = MULTIPLY (tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+ tmp1 = MULTIPLY (tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+ tmp2 = MULTIPLY (tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+ tmp3 = MULTIPLY (tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+ z1 = MULTIPLY (z1, -FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+ z2 = MULTIPLY (z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+ z3 = MULTIPLY (z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+ z4 = MULTIPLY (z4, -FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+
z3 += z5;
z4 += z5;
-
+
tmp0 += z1 + z3;
tmp1 += z2 + z4;
tmp2 += z2 + z3;
@@ -274,14 +255,14 @@ gst_idct_int_idct (DCTBLOCK data)
/* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
- dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
- dataptr[7] = (DCTELEM) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
- dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
- dataptr[6] = (DCTELEM) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
- dataptr[2] = (DCTELEM) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
- dataptr[5] = (DCTELEM) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
- dataptr[3] = (DCTELEM) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
- dataptr[4] = (DCTELEM) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
+ dataptr[0] = (DCTELEM) DESCALE (tmp10 + tmp3, CONST_BITS - PASS1_BITS);
+ dataptr[7] = (DCTELEM) DESCALE (tmp10 - tmp3, CONST_BITS - PASS1_BITS);
+ dataptr[1] = (DCTELEM) DESCALE (tmp11 + tmp2, CONST_BITS - PASS1_BITS);
+ dataptr[6] = (DCTELEM) DESCALE (tmp11 - tmp2, CONST_BITS - PASS1_BITS);
+ dataptr[2] = (DCTELEM) DESCALE (tmp12 + tmp1, CONST_BITS - PASS1_BITS);
+ dataptr[5] = (DCTELEM) DESCALE (tmp12 - tmp1, CONST_BITS - PASS1_BITS);
+ dataptr[3] = (DCTELEM) DESCALE (tmp13 + tmp0, CONST_BITS - PASS1_BITS);
+ dataptr[4] = (DCTELEM) DESCALE (tmp13 - tmp0, CONST_BITS - PASS1_BITS);
dataptr += DCTSIZE; /* advance pointer to next row */
}
@@ -291,7 +272,7 @@ gst_idct_int_idct (DCTBLOCK data)
/* and also undo the PASS1_BITS scaling. */
dataptr = data;
- for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
+ for (rowctr = DCTSIZE - 1; rowctr >= 0; rowctr--) {
/* Columns of zeroes can be exploited in the same way as we did with rows.
* However, the row calculation has created many nonzero AC terms, so the
* simplification applies less often (typically 5% to 10% of the time).
@@ -301,21 +282,21 @@ gst_idct_int_idct (DCTBLOCK data)
*/
#ifndef NO_ZERO_COLUMN_TEST
- if ((dataptr[DCTSIZE*1] | dataptr[DCTSIZE*2] | dataptr[DCTSIZE*3] |
- dataptr[DCTSIZE*4] | dataptr[DCTSIZE*5] | dataptr[DCTSIZE*6] |
- dataptr[DCTSIZE*7]) == 0) {
+ if ((dataptr[DCTSIZE * 1] | dataptr[DCTSIZE * 2] | dataptr[DCTSIZE * 3] |
+ dataptr[DCTSIZE * 4] | dataptr[DCTSIZE * 5] | dataptr[DCTSIZE * 6] |
+ dataptr[DCTSIZE * 7]) == 0) {
/* AC terms all zero */
- DCTELEM dcval = (DCTELEM) DESCALE((INT32) dataptr[0], PASS1_BITS+3);
-
- dataptr[DCTSIZE*0] = dcval;
- dataptr[DCTSIZE*1] = dcval;
- dataptr[DCTSIZE*2] = dcval;
- dataptr[DCTSIZE*3] = dcval;
- dataptr[DCTSIZE*4] = dcval;
- dataptr[DCTSIZE*5] = dcval;
- dataptr[DCTSIZE*6] = dcval;
- dataptr[DCTSIZE*7] = dcval;
-
+ DCTELEM dcval = (DCTELEM) DESCALE ((INT32) dataptr[0], PASS1_BITS + 3);
+
+ dataptr[DCTSIZE * 0] = dcval;
+ dataptr[DCTSIZE * 1] = dcval;
+ dataptr[DCTSIZE * 2] = dcval;
+ dataptr[DCTSIZE * 3] = dcval;
+ dataptr[DCTSIZE * 4] = dcval;
+ dataptr[DCTSIZE * 5] = dcval;
+ dataptr[DCTSIZE * 6] = dcval;
+ dataptr[DCTSIZE * 7] = dcval;
+
dataptr++; /* advance pointer to next column */
continue;
}
@@ -324,48 +305,52 @@ gst_idct_int_idct (DCTBLOCK data)
/* Even part: reverse the even part of the forward DCT. */
/* The rotator is sqrt(2)*c(-6). */
- z2 = (INT32) dataptr[DCTSIZE*2];
- z3 = (INT32) dataptr[DCTSIZE*6];
+ z2 = (INT32) dataptr[DCTSIZE * 2];
+ z3 = (INT32) dataptr[DCTSIZE * 6];
- z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
- tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
- tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
+ z1 = MULTIPLY (z2 + z3, FIX_0_541196100);
+ tmp2 = z1 + MULTIPLY (z3, -FIX_1_847759065);
+ tmp3 = z1 + MULTIPLY (z2, FIX_0_765366865);
- tmp0 = ((INT32) dataptr[DCTSIZE*0] + (INT32) dataptr[DCTSIZE*4]) << CONST_BITS;
- tmp1 = ((INT32) dataptr[DCTSIZE*0] - (INT32) dataptr[DCTSIZE*4]) << CONST_BITS;
+ tmp0 =
+ ((INT32) dataptr[DCTSIZE * 0] +
+ (INT32) dataptr[DCTSIZE * 4]) << CONST_BITS;
+ tmp1 =
+ ((INT32) dataptr[DCTSIZE * 0] -
+ (INT32) dataptr[DCTSIZE * 4]) << CONST_BITS;
tmp10 = tmp0 + tmp3;
tmp13 = tmp0 - tmp3;
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
-
+
/* Odd part per figure 8; the matrix is unitary and hence its
* transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
*/
- tmp0 = (INT32) dataptr[DCTSIZE*7];
- tmp1 = (INT32) dataptr[DCTSIZE*5];
- tmp2 = (INT32) dataptr[DCTSIZE*3];
- tmp3 = (INT32) dataptr[DCTSIZE*1];
+ tmp0 = (INT32) dataptr[DCTSIZE * 7];
+ tmp1 = (INT32) dataptr[DCTSIZE * 5];
+ tmp2 = (INT32) dataptr[DCTSIZE * 3];
+ tmp3 = (INT32) dataptr[DCTSIZE * 1];
z1 = tmp0 + tmp3;
z2 = tmp1 + tmp2;
z3 = tmp0 + tmp2;
z4 = tmp1 + tmp3;
- z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
-
- tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
- tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
- tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
- tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
- z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
- z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
- z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
- z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
-
+ z5 = MULTIPLY (z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
+
+ tmp0 = MULTIPLY (tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+ tmp1 = MULTIPLY (tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+ tmp2 = MULTIPLY (tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+ tmp3 = MULTIPLY (tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+ z1 = MULTIPLY (z1, -FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+ z2 = MULTIPLY (z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+ z3 = MULTIPLY (z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+ z4 = MULTIPLY (z4, -FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+
z3 += z5;
z4 += z5;
-
+
tmp0 += z1 + z3;
tmp1 += z2 + z4;
tmp2 += z2 + z3;
@@ -373,23 +358,23 @@ gst_idct_int_idct (DCTBLOCK data)
/* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
- dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp3,
- CONST_BITS+PASS1_BITS+3);
- dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp10 - tmp3,
- CONST_BITS+PASS1_BITS+3);
- dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp11 + tmp2,
- CONST_BITS+PASS1_BITS+3);
- dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(tmp11 - tmp2,
- CONST_BITS+PASS1_BITS+3);
- dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp12 + tmp1,
- CONST_BITS+PASS1_BITS+3);
- dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12 - tmp1,
- CONST_BITS+PASS1_BITS+3);
- dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp13 + tmp0,
- CONST_BITS+PASS1_BITS+3);
- dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp13 - tmp0,
- CONST_BITS+PASS1_BITS+3);
-
+ dataptr[DCTSIZE * 0] = (DCTELEM) DESCALE (tmp10 + tmp3,
+ CONST_BITS + PASS1_BITS + 3);
+ dataptr[DCTSIZE * 7] = (DCTELEM) DESCALE (tmp10 - tmp3,
+ CONST_BITS + PASS1_BITS + 3);
+ dataptr[DCTSIZE * 1] = (DCTELEM) DESCALE (tmp11 + tmp2,
+ CONST_BITS + PASS1_BITS + 3);
+ dataptr[DCTSIZE * 6] = (DCTELEM) DESCALE (tmp11 - tmp2,
+ CONST_BITS + PASS1_BITS + 3);
+ dataptr[DCTSIZE * 2] = (DCTELEM) DESCALE (tmp12 + tmp1,
+ CONST_BITS + PASS1_BITS + 3);
+ dataptr[DCTSIZE * 5] = (DCTELEM) DESCALE (tmp12 - tmp1,
+ CONST_BITS + PASS1_BITS + 3);
+ dataptr[DCTSIZE * 3] = (DCTELEM) DESCALE (tmp13 + tmp0,
+ CONST_BITS + PASS1_BITS + 3);
+ dataptr[DCTSIZE * 4] = (DCTELEM) DESCALE (tmp13 - tmp0,
+ CONST_BITS + PASS1_BITS + 3);
+
dataptr++; /* advance pointer to next column */
}
}
diff --git a/gst-libs/gst/idct/mmx32idct.c b/gst-libs/gst/idct/mmx32idct.c
index 3b640976..cd191f0c 100644
--- a/gst-libs/gst/idct/mmx32idct.c
+++ b/gst-libs/gst/idct/mmx32idct.c
@@ -19,9 +19,8 @@
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*
- */
-
-
+ */
+
/* MMX32 iDCT algorithm (IEEE-1180 compliant) :: idct_mmx32() */
/* */
/* MPEG2AVI */
@@ -102,8 +101,7 @@
/* */
/* liaor@umcc.ais.org http://members.tripod.com/~liaor */
/* */
-
-
+
/*;============================================================================= */
/*; */
/*; AP-922 http://developer.intel.com/vtune/cbts/strmsimd */
@@ -113,68 +111,67 @@
/*;============================================================================= */
/*
mword typedef qword
-qword ptr equ mword ptr */
-
+qword ptr equ mword ptr */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
-
#include <mmx.h>
-
+
#define BITS_INV_ACC 4 /*; 4 or 5 for IEEE */
- /* 5 yields higher accuracy, but lessens dynamic range on the input matrix */
+ /* 5 yields higher accuracy, but lessens dynamic range on the input matrix */
#define SHIFT_INV_ROW (16 - BITS_INV_ACC)
-#define SHIFT_INV_COL (1 + BITS_INV_ACC +14 ) /* changed from Intel's val) */
+#define SHIFT_INV_COL (1 + BITS_INV_ACC +14 ) /* changed from Intel's val) */
/*#define SHIFT_INV_COL (1 + BITS_INV_ACC ) */
-
+
#define RND_INV_ROW (1 << (SHIFT_INV_ROW-1))
#define RND_INV_COL (1 << (SHIFT_INV_COL-1))
-#define RND_INV_CORR (RND_INV_COL - 1) /*; correction -1.0 and round */
+#define RND_INV_CORR (RND_INV_COL - 1) /*; correction -1.0 and round */
/*#define RND_INV_ROW (1024 * (6 - BITS_INV_ACC)) //; 1 << (SHIFT_INV_ROW-1) */
/*#define RND_INV_COL (16 * (BITS_INV_ACC - 3)) //; 1 << (SHIFT_INV_COL-1) */
-
-
+
/*.data */
/*Align 16 */
-const static long r_inv_row[2] = { RND_INV_ROW, RND_INV_ROW};
-const static long r_inv_col[2] = {RND_INV_COL, RND_INV_COL};
-const static long r_inv_corr[2] = {RND_INV_CORR, RND_INV_CORR };
-
+const static long r_inv_row[2] = { RND_INV_ROW, RND_INV_ROW };
+ const static long r_inv_col[2] = { RND_INV_COL, RND_INV_COL };
+ const static long r_inv_corr[2] = { RND_INV_CORR, RND_INV_CORR };
+
+
/*const static short r_inv_col[4] = */
/* {RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL}; */
/*const static short r_inv_corr[4] = */
/* {RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR}; */
-
+
/* constants for the forward DCT
/*#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy */
/*#define SHIFT_FRW_COL BITS_FRW_ACC */
/*#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17) */
/*#define RND_FRW_ROW (262144 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_ROW-1) */
-
-const static __int64 one_corr = 0x0001000100010001;
-const static long r_frw_row[2] = {RND_FRW_ROW, RND_FRW_ROW };
-
+ const static __int64 one_corr = 0x0001000100010001;
+ const static long r_frw_row[2] = { RND_FRW_ROW, RND_FRW_ROW };
+
+
/*const static short tg_1_16[4] = {13036, 13036, 13036, 13036 }; //tg * (2<<16) + 0.5 */
/*const static short tg_2_16[4] = {27146, 27146, 27146, 27146 }; //tg * (2<<16) + 0.5 */
/*const static short tg_3_16[4] = {-21746, -21746, -21746, -21746 }; //tg * (2<<16) + 0.5 */
/*const static short cos_4_16[4] = {-19195, -19195, -19195, -19195 }; //cos * (2<<16) + 0.5 */
/*const static short ocos_4_16[4] = {23170, 23170, 23170, 23170 }; //cos * (2<<15) + 0.5 */
-
+
/*concatenated table, for forward DCT transformation */
-const static short tg_all_16[] = {
- 13036, 13036, 13036, 13036, /* tg * (2<<16) + 0.5 */
- 27146, 27146, 27146, 27146, /*tg * (2<<16) + 0.5 */
- -21746, -21746, -21746, -21746, /* tg * (2<<16) + 0.5 */
- -19195, -19195, -19195, -19195, /*cos * (2<<16) + 0.5 */
- 23170, 23170, 23170, 23170 }; /*cos * (2<<15) + 0.5 */
+const static short tg_all_16[] = { 13036, 13036, 13036, 13036, /* tg * (2<<16) + 0.5 */
+ 27146, 27146, 27146, 27146, /*tg * (2<<16) + 0.5 */
+ -21746, -21746, -21746, -21746, /* tg * (2<<16) + 0.5 */
+ -19195, -19195, -19195, -19195, /*cos * (2<<16) + 0.5 */
+ 23170, 23170, 23170, 23170
+}; /*cos * (2<<15) + 0.5 */
+
#define tg_1_16 (tg_all_16 + 0)
#define tg_2_16 (tg_all_16 + 8)
#define tg_3_16 (tg_all_16 + 16)
#define cos_4_16 (tg_all_16 + 24)
#define ocos_4_16 (tg_all_16 + 32)
-*/
+ */
/*
;=============================================================================
;
@@ -236,552 +233,456 @@ IF _MMX ; MMX code
;=============================================================================
/*; Table for rows 0,4 - constants are multiplied by cos_4_16 */
-const short tab_i_04[] = {
- 16384, 16384, 16384, -16384, /* ; movq-> w06 w04 w02 w00 */
- 21407, 8867, 8867, -21407, /* w07 w05 w03 w01 */
- 16384, -16384, 16384, 16384, /*; w14 w12 w10 w08 */
- -8867, 21407, -21407, -8867, /*; w15 w13 w11 w09 */
- 22725, 12873, 19266, -22725, /*; w22 w20 w18 w16 */
- 19266, 4520, -4520, -12873, /*; w23 w21 w19 w17 */
- 12873, 4520, 4520, 19266, /*; w30 w28 w26 w24 */
- -22725, 19266, -12873, -22725 };/*w31 w29 w27 w25 */
+const short tab_i_04[] = { 16384, 16384, 16384, -16384, /* ; movq-> w06 w04 w02 w00 */
+ 21407, 8867, 8867, -21407, /* w07 w05 w03 w01 */
+ 16384, -16384, 16384, 16384, /*; w14 w12 w10 w08 */
+ -8867, 21407, -21407, -8867, /*; w15 w13 w11 w09 */
+ 22725, 12873, 19266, -22725, /*; w22 w20 w18 w16 */
+ 19266, 4520, -4520, -12873, /*; w23 w21 w19 w17 */
+ 12873, 4520, 4520, 19266, /*; w30 w28 w26 w24 */
+ -22725, 19266, -12873, -22725
+}; /*w31 w29 w27 w25 */
+
/*; Table for rows 1,7 - constants are multiplied by cos_1_16 */
-const short tab_i_17[] = {
- 22725, 22725, 22725, -22725, /* ; movq-> w06 w04 w02 w00 */
- 29692, 12299, 12299, -29692, /* ; w07 w05 w03 w01 */
- 22725, -22725, 22725, 22725, /*; w14 w12 w10 w08 */
- -12299, 29692, -29692, -12299, /*; w15 w13 w11 w09 */
- 31521, 17855, 26722, -31521, /*; w22 w20 w18 w16 */
- 26722, 6270, -6270, -17855, /*; w23 w21 w19 w17 */
- 17855, 6270, 6270, 26722, /*; w30 w28 w26 w24 */
- -31521, 26722, -17855, -31521}; /* w31 w29 w27 w25 */
+const short tab_i_17[] = { 22725, 22725, 22725, -22725, /* ; movq-> w06 w04 w02 w00 */
+ 29692, 12299, 12299, -29692, /* ; w07 w05 w03 w01 */
+ 22725, -22725, 22725, 22725, /*; w14 w12 w10 w08 */
+ -12299, 29692, -29692, -12299, /*; w15 w13 w11 w09 */
+ 31521, 17855, 26722, -31521, /*; w22 w20 w18 w16 */
+ 26722, 6270, -6270, -17855, /*; w23 w21 w19 w17 */
+ 17855, 6270, 6270, 26722, /*; w30 w28 w26 w24 */
+ -31521, 26722, -17855, -31521
+}; /* w31 w29 w27 w25 */
+
/*; Table for rows 2,6 - constants are multiplied by cos_2_16 */
-const short tab_i_26[] = {
- 21407, 21407, 21407, -21407, /* ; movq-> w06 w04 w02 w00 */
- 27969, 11585, 11585, -27969, /* ; w07 w05 w03 w01 */
- 21407, -21407, 21407, 21407, /* ; w14 w12 w10 w08 */
- -11585, 27969, -27969, -11585, /* ;w15 w13 w11 w09 */
- 29692, 16819, 25172, -29692, /* ;w22 w20 w18 w16 */
- 25172, 5906, -5906, -16819, /* ;w23 w21 w19 w17 */
- 16819, 5906, 5906, 25172, /* ;w30 w28 w26 w24 */
- -29692, 25172, -16819, -29692}; /* ;w31 w29 w27 w25 */
-
-
+const short tab_i_26[] = { 21407, 21407, 21407, -21407, /* ; movq-> w06 w04 w02 w00 */
+ 27969, 11585, 11585, -27969, /* ; w07 w05 w03 w01 */
+ 21407, -21407, 21407, 21407, /* ; w14 w12 w10 w08 */
+ -11585, 27969, -27969, -11585, /* ;w15 w13 w11 w09 */
+ 29692, 16819, 25172, -29692, /* ;w22 w20 w18 w16 */
+ 25172, 5906, -5906, -16819, /* ;w23 w21 w19 w17 */
+ 16819, 5906, 5906, 25172, /* ;w30 w28 w26 w24 */
+ -29692, 25172, -16819, -29692
+}; /* ;w31 w29 w27 w25 */
+
+
/*; Table for rows 3,5 - constants are multiplied by cos_3_16 */
-const short tab_i_35[] = {
- 19266, 19266, 19266, -19266, /*; movq-> w06 w04 w02 w00 */
- 25172, 10426, 10426, -25172, /*; w07 w05 w03 w01 */
- 19266, -19266, 19266, 19266, /*; w14 w12 w10 w08 */
- -10426, 25172, -25172, -10426, /*; w15 w13 w11 w09 */
- 26722, 15137, 22654, -26722, /*; w22 w20 w18 w16 */
- 22654, 5315, -5315, -15137, /*; w23 w21 w19 w17 */
- 15137, 5315, 5315, 22654, /*; w30 w28 w26 w24 */
- -26722, 22654, -15137, -26722}; /*; w31 w29 w27 w25 */
-*/
-
+const short tab_i_35[] = { 19266, 19266, 19266, -19266, /*; movq-> w06 w04 w02 w00 */
+ 25172, 10426, 10426, -25172, /*; w07 w05 w03 w01 */
+ 19266, -19266, 19266, 19266, /*; w14 w12 w10 w08 */
+ -10426, 25172, -25172, -10426, /*; w15 w13 w11 w09 */
+ 26722, 15137, 22654, -26722, /*; w22 w20 w18 w16 */
+ 22654, 5315, -5315, -15137, /*; w23 w21 w19 w17 */
+ 15137, 5315, 5315, 22654, /*; w30 w28 w26 w24 */
+ -26722, 22654, -15137, -26722
+}; /*; w31 w29 w27 w25 */
+
+*/
/* CONCATENATED TABLE, rows 0,1,2,3,4,5,6,7 (in order ) */
/* */
/* In our implementation, however, we only use row0 ! */
/* */
-static const short tab_i_01234567[] = {
- /*row0, this row is required */
- 16384, 16384, 16384, -16384, /* ; movq-> w06 w04 w02 w00 */
- 21407, 8867, 8867, -21407, /* w07 w05 w03 w01 */
- 16384, -16384, 16384, 16384, /*; w14 w12 w10 w08 */
- -8867, 21407, -21407, -8867, /*; w15 w13 w11 w09 */
- 22725, 12873, 19266, -22725, /*; w22 w20 w18 w16 */
- 19266, 4520, -4520, -12873, /*; w23 w21 w19 w17 */
- 12873, 4520, 4520, 19266, /*; w30 w28 w26 w24 */
- -22725, 19266, -12873, -22725, /*w31 w29 w27 w25 */
-
- /* the rest of these rows (1-7), aren't used ! */
-
- /*row1 */
- 22725, 22725, 22725, -22725, /* ; movq-> w06 w04 w02 w00 */
- 29692, 12299, 12299, -29692, /* ; w07 w05 w03 w01 */
- 22725, -22725, 22725, 22725, /*; w14 w12 w10 w08 */
- -12299, 29692, -29692, -12299, /*; w15 w13 w11 w09 */
- 31521, 17855, 26722, -31521, /*; w22 w20 w18 w16 */
- 26722, 6270, -6270, -17855, /*; w23 w21 w19 w17 */
- 17855, 6270, 6270, 26722, /*; w30 w28 w26 w24 */
- -31521, 26722, -17855, -31521, /* w31 w29 w27 w25 */
-
- /*row2 */
- 21407, 21407, 21407, -21407, /* ; movq-> w06 w04 w02 w00 */
- 27969, 11585, 11585, -27969, /* ; w07 w05 w03 w01 */
- 21407, -21407, 21407, 21407, /* ; w14 w12 w10 w08 */
- -11585, 27969, -27969, -11585, /* ;w15 w13 w11 w09 */
- 29692, 16819, 25172, -29692, /* ;w22 w20 w18 w16 */
- 25172, 5906, -5906, -16819, /* ;w23 w21 w19 w17 */
- 16819, 5906, 5906, 25172, /* ;w30 w28 w26 w24 */
- -29692, 25172, -16819, -29692, /* ;w31 w29 w27 w25 */
-
- /*row3 */
- 19266, 19266, 19266, -19266, /*; movq-> w06 w04 w02 w00 */
- 25172, 10426, 10426, -25172, /*; w07 w05 w03 w01 */
- 19266, -19266, 19266, 19266, /*; w14 w12 w10 w08 */
- -10426, 25172, -25172, -10426, /*; w15 w13 w11 w09 */
- 26722, 15137, 22654, -26722, /*; w22 w20 w18 w16 */
- 22654, 5315, -5315, -15137, /*; w23 w21 w19 w17 */
- 15137, 5315, 5315, 22654, /*; w30 w28 w26 w24 */
- -26722, 22654, -15137, -26722, /*; w31 w29 w27 w25 */
-
- /*row4 */
- 16384, 16384, 16384, -16384, /* ; movq-> w06 w04 w02 w00 */
- 21407, 8867, 8867, -21407, /* w07 w05 w03 w01 */
- 16384, -16384, 16384, 16384, /*; w14 w12 w10 w08 */
- -8867, 21407, -21407, -8867, /*; w15 w13 w11 w09 */
- 22725, 12873, 19266, -22725, /*; w22 w20 w18 w16 */
- 19266, 4520, -4520, -12873, /*; w23 w21 w19 w17 */
- 12873, 4520, 4520, 19266, /*; w30 w28 w26 w24 */
- -22725, 19266, -12873, -22725, /*w31 w29 w27 w25 */
-
- /*row5 */
- 19266, 19266, 19266, -19266, /*; movq-> w06 w04 w02 w00 */
- 25172, 10426, 10426, -25172, /*; w07 w05 w03 w01 */
- 19266, -19266, 19266, 19266, /*; w14 w12 w10 w08 */
- -10426, 25172, -25172, -10426, /*; w15 w13 w11 w09 */
- 26722, 15137, 22654, -26722, /*; w22 w20 w18 w16 */
- 22654, 5315, -5315, -15137, /*; w23 w21 w19 w17 */
- 15137, 5315, 5315, 22654, /*; w30 w28 w26 w24 */
- -26722, 22654, -15137, -26722, /*; w31 w29 w27 w25 */
-
- /*row6 */
- 21407, 21407, 21407, -21407, /* ; movq-> w06 w04 w02 w00 */
- 27969, 11585, 11585, -27969, /* ; w07 w05 w03 w01 */
- 21407, -21407, 21407, 21407, /* ; w14 w12 w10 w08 */
- -11585, 27969, -27969, -11585, /* ;w15 w13 w11 w09 */
- 29692, 16819, 25172, -29692, /* ;w22 w20 w18 w16 */
- 25172, 5906, -5906, -16819, /* ;w23 w21 w19 w17 */
- 16819, 5906, 5906, 25172, /* ;w30 w28 w26 w24 */
- -29692, 25172, -16819, -29692, /* ;w31 w29 w27 w25 */
-
- /*row7 */
- 22725, 22725, 22725, -22725, /* ; movq-> w06 w04 w02 w00 */
- 29692, 12299, 12299, -29692, /* ; w07 w05 w03 w01 */
- 22725, -22725, 22725, 22725, /*; w14 w12 w10 w08 */
- -12299, 29692, -29692, -12299, /*; w15 w13 w11 w09 */
- 31521, 17855, 26722, -31521, /*; w22 w20 w18 w16 */
- 26722, 6270, -6270, -17855, /*; w23 w21 w19 w17 */
- 17855, 6270, 6270, 26722, /*; w30 w28 w26 w24 */
- -31521, 26722, -17855, -31521}; /* w31 w29 w27 w25 */
-
-
-#define INP eax /* pointer to (short *blk) */
-#define OUT ecx /* pointer to output (temporary store space qwTemp[]) */
-#define TABLE ebx /* pointer to tab_i_01234567[] */
+static const short tab_i_01234567[] = {
+ /*row0, this row is required */
+ 16384, 16384, 16384, -16384, /* ; movq-> w06 w04 w02 w00 */
+ 21407, 8867, 8867, -21407, /* w07 w05 w03 w01 */
+ 16384, -16384, 16384, 16384, /*; w14 w12 w10 w08 */
+ -8867, 21407, -21407, -8867, /*; w15 w13 w11 w09 */
+ 22725, 12873, 19266, -22725, /*; w22 w20 w18 w16 */
+ 19266, 4520, -4520, -12873, /*; w23 w21 w19 w17 */
+ 12873, 4520, 4520, 19266, /*; w30 w28 w26 w24 */
+ -22725, 19266, -12873, -22725, /*w31 w29 w27 w25 */
+
+ /* the rest of these rows (1-7), aren't used ! */
+
+ /*row1 */
+ 22725, 22725, 22725, -22725, /* ; movq-> w06 w04 w02 w00 */
+ 29692, 12299, 12299, -29692, /* ; w07 w05 w03 w01 */
+ 22725, -22725, 22725, 22725, /*; w14 w12 w10 w08 */
+ -12299, 29692, -29692, -12299, /*; w15 w13 w11 w09 */
+ 31521, 17855, 26722, -31521, /*; w22 w20 w18 w16 */
+ 26722, 6270, -6270, -17855, /*; w23 w21 w19 w17 */
+ 17855, 6270, 6270, 26722, /*; w30 w28 w26 w24 */
+ -31521, 26722, -17855, -31521, /* w31 w29 w27 w25 */
+
+ /*row2 */
+ 21407, 21407, 21407, -21407, /* ; movq-> w06 w04 w02 w00 */
+ 27969, 11585, 11585, -27969, /* ; w07 w05 w03 w01 */
+ 21407, -21407, 21407, 21407, /* ; w14 w12 w10 w08 */
+ -11585, 27969, -27969, -11585, /* ;w15 w13 w11 w09 */
+ 29692, 16819, 25172, -29692, /* ;w22 w20 w18 w16 */
+ 25172, 5906, -5906, -16819, /* ;w23 w21 w19 w17 */
+ 16819, 5906, 5906, 25172, /* ;w30 w28 w26 w24 */
+ -29692, 25172, -16819, -29692, /* ;w31 w29 w27 w25 */
+
+ /*row3 */
+ 19266, 19266, 19266, -19266, /*; movq-> w06 w04 w02 w00 */
+ 25172, 10426, 10426, -25172, /*; w07 w05 w03 w01 */
+ 19266, -19266, 19266, 19266, /*; w14 w12 w10 w08 */
+ -10426, 25172, -25172, -10426, /*; w15 w13 w11 w09 */
+ 26722, 15137, 22654, -26722, /*; w22 w20 w18 w16 */
+ 22654, 5315, -5315, -15137, /*; w23 w21 w19 w17 */
+ 15137, 5315, 5315, 22654, /*; w30 w28 w26 w24 */
+ -26722, 22654, -15137, -26722, /*; w31 w29 w27 w25 */
+
+ /*row4 */
+ 16384, 16384, 16384, -16384, /* ; movq-> w06 w04 w02 w00 */
+ 21407, 8867, 8867, -21407, /* w07 w05 w03 w01 */
+ 16384, -16384, 16384, 16384, /*; w14 w12 w10 w08 */
+ -8867, 21407, -21407, -8867, /*; w15 w13 w11 w09 */
+ 22725, 12873, 19266, -22725, /*; w22 w20 w18 w16 */
+ 19266, 4520, -4520, -12873, /*; w23 w21 w19 w17 */
+ 12873, 4520, 4520, 19266, /*; w30 w28 w26 w24 */
+ -22725, 19266, -12873, -22725, /*w31 w29 w27 w25 */
+
+ /*row5 */
+ 19266, 19266, 19266, -19266, /*; movq-> w06 w04 w02 w00 */
+ 25172, 10426, 10426, -25172, /*; w07 w05 w03 w01 */
+ 19266, -19266, 19266, 19266, /*; w14 w12 w10 w08 */
+ -10426, 25172, -25172, -10426, /*; w15 w13 w11 w09 */
+ 26722, 15137, 22654, -26722, /*; w22 w20 w18 w16 */
+ 22654, 5315, -5315, -15137, /*; w23 w21 w19 w17 */
+ 15137, 5315, 5315, 22654, /*; w30 w28 w26 w24 */
+ -26722, 22654, -15137, -26722, /*; w31 w29 w27 w25 */
+
+ /*row6 */
+ 21407, 21407, 21407, -21407, /* ; movq-> w06 w04 w02 w00 */
+ 27969, 11585, 11585, -27969, /* ; w07 w05 w03 w01 */
+ 21407, -21407, 21407, 21407, /* ; w14 w12 w10 w08 */
+ -11585, 27969, -27969, -11585, /* ;w15 w13 w11 w09 */
+ 29692, 16819, 25172, -29692, /* ;w22 w20 w18 w16 */
+ 25172, 5906, -5906, -16819, /* ;w23 w21 w19 w17 */
+ 16819, 5906, 5906, 25172, /* ;w30 w28 w26 w24 */
+ -29692, 25172, -16819, -29692, /* ;w31 w29 w27 w25 */
+
+ /*row7 */
+ 22725, 22725, 22725, -22725, /* ; movq-> w06 w04 w02 w00 */
+ 29692, 12299, 12299, -29692, /* ; w07 w05 w03 w01 */
+ 22725, -22725, 22725, 22725, /*; w14 w12 w10 w08 */
+ -12299, 29692, -29692, -12299, /*; w15 w13 w11 w09 */
+ 31521, 17855, 26722, -31521, /*; w22 w20 w18 w16 */
+ 26722, 6270, -6270, -17855, /*; w23 w21 w19 w17 */
+ 17855, 6270, 6270, 26722, /*; w30 w28 w26 w24 */
+ -31521, 26722, -17855, -31521
+}; /* w31 w29 w27 w25 */
+
+
+#define INP eax /* pointer to (short *blk) */
+#define OUT ecx /* pointer to output (temporary store space qwTemp[]) */
+#define TABLE ebx /* pointer to tab_i_01234567[] */
#define round_inv_row edx
#define round_inv_col edx
-
-#define ROW_STRIDE 8 /* for 8x8 matrix transposer */
-
+
+#define ROW_STRIDE 8 /* for 8x8 matrix transposer */
+
/* private variables and functions */
-
+
/*temporary storage space, 8x8 of shorts */
-
-__inline static void idct_mmx32_rows( short *blk ); /* transform rows */
-__inline static void idct_mmx32_cols( short *blk ); /* transform "columns" */
+ __inline static void idct_mmx32_rows (short *blk); /* transform rows */
+__inline static void idct_mmx32_cols (short *blk); /* transform "columns" */
+
/* the "column" transform actually transforms rows, it is */
/* identical to the row-transform except for the ROUNDING */
/* and SHIFTING coefficients. */
-
-
-static void
-idct_mmx32_rows( short *blk ) /* transform all 8 rows of 8x8 iDCT block */
-{
- int x;
- short qwTemp[64];
- short *out = &qwTemp[0];
- short *inptr = blk;
- /* this subroutine performs two operations */
- /* 1) iDCT row transform */
- /* for( i = 0; i < 8; ++ i) */
- /* DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] ); */
- /* */
- /* 2) transpose the matrix (which was stored in qwTemp[]) */
- /* qwTemp[] -> [8x8 matrix transpose] -> blk[] */
-
- for (x=0; x<8; x++) { /* transform one row per iteration */
- movq_m2r(*(inptr), mm0); /* 0 ; x3 x2 x1 x0 */
-
- movq_m2r(*(inptr+4), mm1); /* 1 ; x7 x6 x5 x4 */
- movq_r2r(mm0, mm2); /* 2 ; x3 x2 x1 x0 */
-
- movq_m2r(*(tab_i_01234567), mm3); /* 3 ; w06 w04 w02 w00 */
- punpcklwd_r2r(mm1, mm0); /* x5 x1 x4 x0 */
-
- /* ---------- */
- movq_r2r(mm0, mm5); /* 5 ; x5 x1 x4 x0 */
- punpckldq_r2r(mm0, mm0); /* x4 x0 x4 x0 */
-
- movq_m2r(*(tab_i_01234567+4), mm4); /* 4 ; w07 w05 w03 w01 */
- punpckhwd_r2r(mm1, mm2); /* 1 ; x7 x3 x6 x2 */
-
- pmaddwd_r2r(mm0, mm3); /* x4*w06+x0*w04 x4*w02+x0*w00 */
- movq_r2r(mm2, mm6); /* 6 ; x7 x3 x6 x2 */
-
- movq_m2r(*(tab_i_01234567+16), mm1);/* 1 ; w22 w20 w18 w16 */
- punpckldq_r2r(mm2, mm2); /* x6 x2 x6 x2 */
-
- pmaddwd_r2r(mm2, mm4); /* x6*w07+x2*w05 x6*w03+x2*w01 */
- punpckhdq_r2r(mm5, mm5); /* x5 x1 x5 x1 */
-
- pmaddwd_m2r(*(tab_i_01234567+8), mm0);/* x4*w14+x0*w12 x4*w10+x0*w08 */
- punpckhdq_r2r(mm6, mm6); /* x7 x3 x7 x3 */
-
- movq_m2r(*(tab_i_01234567+20), mm7);/* 7 ; w23 w21 w19 w17 */
- pmaddwd_r2r(mm5, mm1); /* x5*w22+x1*w20 x5*w18+x1*w16 */
-
- paddd_m2r(*(r_inv_row), mm3);/* +rounder */
- pmaddwd_r2r(mm6, mm7); /* x7*w23+x3*w21 x7*w19+x3*w17 */
-
- pmaddwd_m2r(*(tab_i_01234567+12), mm2);/* x6*w15+x2*w13 x6*w11+x2*w09 */
- paddd_r2r(mm4, mm3); /* 4 ; a1=sum(even1) a0=sum(even0) */
-
- pmaddwd_m2r(*(tab_i_01234567+24), mm5);/* x5*w30+x1*w28 x5*w26+x1*w24 */
- movq_r2r(mm3, mm4); /* 4 ; a1 a0 */
-
- pmaddwd_m2r(*(tab_i_01234567+28), mm6);/* x7*w31+x3*w29 x7*w27+x3*w25 */
- paddd_r2r(mm7, mm1); /* 7 ; b1=sum(odd1) b0=sum(odd0) */
-
- paddd_m2r(*(r_inv_row), mm0);/* +rounder */
- psubd_r2r(mm1, mm3); /* a1-b1 a0-b0 */
-
- psrad_i2r(SHIFT_INV_ROW, mm3); /* y6=a1-b1 y7=a0-b0 */
- paddd_r2r(mm4, mm1); /* 4 ; a1+b1 a0+b0 */
-
- paddd_r2r(mm2, mm0); /* 2 ; a3=sum(even3) a2=sum(even2) */
- psrad_i2r(SHIFT_INV_ROW, mm1); /* y1=a1+b1 y0=a0+b0 */
-
- paddd_r2r(mm6, mm5); /* 6 ; b3=sum(odd3) b2=sum(odd2) */
- movq_r2r(mm0, mm4); /* 4 ; a3 a2 */
-
- paddd_r2r(mm5, mm0); /* a3+b3 a2+b2 */
- psubd_r2r(mm5, mm4); /* 5 ; a3-b3 a2-b2 */
-
- psrad_i2r(SHIFT_INV_ROW, mm4); /* y4=a3-b3 y5=a2-b2 */
- psrad_i2r(SHIFT_INV_ROW, mm0); /* y3=a3+b3 y2=a2+b2 */
-
- packssdw_r2r(mm3, mm4); /* 3 ; y6 y7 y4 y5 */
-
- packssdw_r2r(mm0, mm1); /* 0 ; y3 y2 y1 y0 */
- movq_r2r(mm4, mm7); /* 7 ; y6 y7 y4 y5 */
-
- psrld_i2r(16, mm4); /* 0 y6 0 y4 */
-
- movq_r2m(mm1, *(out)); /* 1 ; save y3 y2 y1 y0 */
- pslld_i2r(16, mm7); /* y7 0 y5 0 */
-
- por_r2r(mm4, mm7); /* 4 ; y7 y6 y5 y4 */
-
- /* begin processing row 1 */
- movq_r2m(mm7, *(out+4)); /* 7 ; save y7 y6 y5 y4 */
-
- inptr += 8;
- out += 8;
- }
-
-
- /* done with the iDCT row-transformation */
-
- /* now we have to transpose the output 8x8 matrix */
- /* 8x8 (OUT) -> 8x8't' (IN) */
- /* the transposition is implemented as 4 sub-operations. */
- /* 1) transpose upper-left quad */
- /* 2) transpose lower-right quad */
- /* 3) transpose lower-left quad */
- /* 4) transpose upper-right quad */
-
-
- /* mm0 = 1st row [ A B C D ] row1 */
- /* mm1 = 2nd row [ E F G H ] 2 */
- /* mm2 = 3rd row [ I J K L ] 3 */
- /* mm3 = 4th row [ M N O P ] 4 */
-
- /* 1) transpose upper-left quad */
- out = &qwTemp[0];
-
- movq_m2r(*(out + ROW_STRIDE * 0), mm0);
-
- movq_m2r(*(out + ROW_STRIDE * 1), mm1);
- movq_r2r(mm0, mm4); /* mm4 = copy of row1[A B C D] */
-
- movq_m2r(*(out + ROW_STRIDE * 2), mm2);
- punpcklwd_r2r(mm1, mm0); /* mm0 = [ 0 4 1 5] */
-
- movq_m2r(*(out + ROW_STRIDE * 3), mm3);
- punpckhwd_r2r(mm1, mm4); /* mm4 = [ 2 6 3 7] */
-
- movq_r2r(mm2, mm6);
- punpcklwd_r2r(mm3, mm2); /* mm2 = [ 8 12 9 13] */
-
- punpckhwd_r2r(mm3, mm6); /* mm6 = 10 14 11 15] */
- movq_r2r(mm0, mm1); /* mm1 = [ 0 4 1 5] */
-
- inptr = blk;
-
- punpckldq_r2r(mm2, mm0); /* final result mm0 = row1 [0 4 8 12] */
-
- movq_r2r(mm4, mm3); /* mm3 = [ 2 6 3 7] */
- punpckhdq_r2r(mm2, mm1); /* mm1 = final result mm1 = row2 [1 5 9 13] */
-
- movq_r2m(mm0, *(inptr + ROW_STRIDE * 0)); /* store row 1 */
- punpckldq_r2r(mm6, mm4); /* final result mm4 = row3 [2 6 10 14] */
-
+ static void
+idct_mmx32_rows (short *blk)
+{ /* transform all 8 rows of 8x8 iDCT block */
+ int x;
+ short qwTemp[64];
+ short *out = &qwTemp[0];
+ short *inptr = blk;
+
+
+ /* this subroutine performs two operations */
+ /* 1) iDCT row transform */
+ /* for( i = 0; i < 8; ++ i) */
+ /* DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] ); */
+ /* */
+ /* 2) transpose the matrix (which was stored in qwTemp[]) */
+ /* qwTemp[] -> [8x8 matrix transpose] -> blk[] */
+ for (x = 0; x < 8; x++) { /* transform one row per iteration */
+ movq_m2r (*(inptr), mm0); /* 0 ; x3 x2 x1 x0 */
+ movq_m2r (*(inptr + 4), mm1); /* 1 ; x7 x6 x5 x4 */
+ movq_r2r (mm0, mm2); /* 2 ; x3 x2 x1 x0 */
+ movq_m2r (*(tab_i_01234567), mm3); /* 3 ; w06 w04 w02 w00 */
+ punpcklwd_r2r (mm1, mm0); /* x5 x1 x4 x0 */
+
+ /* ---------- */
+ movq_r2r (mm0, mm5); /* 5 ; x5 x1 x4 x0 */
+ punpckldq_r2r (mm0, mm0); /* x4 x0 x4 x0 */
+ movq_m2r (*(tab_i_01234567 + 4), mm4); /* 4 ; w07 w05 w03 w01 */
+ punpckhwd_r2r (mm1, mm2); /* 1 ; x7 x3 x6 x2 */
+ pmaddwd_r2r (mm0, mm3); /* x4*w06+x0*w04 x4*w02+x0*w00 */
+ movq_r2r (mm2, mm6); /* 6 ; x7 x3 x6 x2 */
+ movq_m2r (*(tab_i_01234567 + 16), mm1); /* 1 ; w22 w20 w18 w16 */
+ punpckldq_r2r (mm2, mm2); /* x6 x2 x6 x2 */
+ pmaddwd_r2r (mm2, mm4); /* x6*w07+x2*w05 x6*w03+x2*w01 */
+ punpckhdq_r2r (mm5, mm5); /* x5 x1 x5 x1 */
+ pmaddwd_m2r (*(tab_i_01234567 + 8), mm0); /* x4*w14+x0*w12 x4*w10+x0*w08 */
+ punpckhdq_r2r (mm6, mm6); /* x7 x3 x7 x3 */
+ movq_m2r (*(tab_i_01234567 + 20), mm7); /* 7 ; w23 w21 w19 w17 */
+ pmaddwd_r2r (mm5, mm1); /* x5*w22+x1*w20 x5*w18+x1*w16 */
+ paddd_m2r (*(r_inv_row), mm3); /* +rounder */
+ pmaddwd_r2r (mm6, mm7); /* x7*w23+x3*w21 x7*w19+x3*w17 */
+ pmaddwd_m2r (*(tab_i_01234567 + 12), mm2); /* x6*w15+x2*w13 x6*w11+x2*w09 */
+ paddd_r2r (mm4, mm3); /* 4 ; a1=sum(even1) a0=sum(even0) */
+ pmaddwd_m2r (*(tab_i_01234567 + 24), mm5); /* x5*w30+x1*w28 x5*w26+x1*w24 */
+ movq_r2r (mm3, mm4); /* 4 ; a1 a0 */
+ pmaddwd_m2r (*(tab_i_01234567 + 28), mm6); /* x7*w31+x3*w29 x7*w27+x3*w25 */
+ paddd_r2r (mm7, mm1); /* 7 ; b1=sum(odd1) b0=sum(odd0) */
+ paddd_m2r (*(r_inv_row), mm0); /* +rounder */
+ psubd_r2r (mm1, mm3); /* a1-b1 a0-b0 */
+ psrad_i2r (SHIFT_INV_ROW, mm3); /* y6=a1-b1 y7=a0-b0 */
+ paddd_r2r (mm4, mm1); /* 4 ; a1+b1 a0+b0 */
+ paddd_r2r (mm2, mm0); /* 2 ; a3=sum(even3) a2=sum(even2) */
+ psrad_i2r (SHIFT_INV_ROW, mm1); /* y1=a1+b1 y0=a0+b0 */
+ paddd_r2r (mm6, mm5); /* 6 ; b3=sum(odd3) b2=sum(odd2) */
+ movq_r2r (mm0, mm4); /* 4 ; a3 a2 */
+ paddd_r2r (mm5, mm0); /* a3+b3 a2+b2 */
+ psubd_r2r (mm5, mm4); /* 5 ; a3-b3 a2-b2 */
+ psrad_i2r (SHIFT_INV_ROW, mm4); /* y4=a3-b3 y5=a2-b2 */
+ psrad_i2r (SHIFT_INV_ROW, mm0); /* y3=a3+b3 y2=a2+b2 */
+ packssdw_r2r (mm3, mm4); /* 3 ; y6 y7 y4 y5 */
+ packssdw_r2r (mm0, mm1); /* 0 ; y3 y2 y1 y0 */
+ movq_r2r (mm4, mm7); /* 7 ; y6 y7 y4 y5 */
+ psrld_i2r (16, mm4); /* 0 y6 0 y4 */
+ movq_r2m (mm1, *(out)); /* 1 ; save y3 y2 y1 y0 */
+ pslld_i2r (16, mm7); /* y7 0 y5 0 */
+ por_r2r (mm4, mm7); /* 4 ; y7 y6 y5 y4 */
+
+ /* store the upper half of this row, then advance to the next row */
+ movq_r2m (mm7, *(out + 4)); /* 7 ; save y7 y6 y5 y4 */
+ inptr += 8;
+ out += 8;
+ }
+
+ /* done with the iDCT row-transformation */
+
+ /* now we have to transpose the output 8x8 matrix */
+ /* 8x8 (OUT) -> 8x8't' (IN) */
+ /* the transposition is implemented as 4 sub-operations. */
+ /* 1) transpose upper-left quad */
+ /* 2) transpose lower-right quad */
+ /* 3) transpose lower-left quad */
+ /* 4) transpose upper-right quad */
+
+ /* mm0 = 1st row [ A B C D ] row1 */
+ /* mm1 = 2nd row [ E F G H ] 2 */
+ /* mm2 = 3rd row [ I J K L ] 3 */
+ /* mm3 = 4th row [ M N O P ] 4 */
+
+ /* 1) transpose upper-left quad */
+ out = &qwTemp[0];
+ movq_m2r (*(out + ROW_STRIDE * 0), mm0);
+ movq_m2r (*(out + ROW_STRIDE * 1), mm1);
+ movq_r2r (mm0, mm4); /* mm4 = copy of row1[A B C D] */
+ movq_m2r (*(out + ROW_STRIDE * 2), mm2);
+ punpcklwd_r2r (mm1, mm0); /* mm0 = [ 0 4 1 5] */
+ movq_m2r (*(out + ROW_STRIDE * 3), mm3);
+ punpckhwd_r2r (mm1, mm4); /* mm4 = [ 2 6 3 7] */
+ movq_r2r (mm2, mm6);
+ punpcklwd_r2r (mm3, mm2); /* mm2 = [ 8 12 9 13] */
+ punpckhwd_r2r (mm3, mm6); /* mm6 = [10 14 11 15] */
+ movq_r2r (mm0, mm1); /* mm1 = [ 0 4 1 5] */
+ inptr = blk;
+ punpckldq_r2r (mm2, mm0); /* final result mm0 = row1 [0 4 8 12] */
+ movq_r2r (mm4, mm3); /* mm3 = [ 2 6 3 7] */
+ punpckhdq_r2r (mm2, mm1); /* final result mm1 = row2 [1 5 9 13] */
+ movq_r2m (mm0, *(inptr + ROW_STRIDE * 0)); /* store row 1 */
+ punpckldq_r2r (mm6, mm4); /* final result mm4 = row3 [2 6 10 14] */
+
/* begin reading next quadrant (lower-right) */
- movq_m2r(*(out + ROW_STRIDE*4 + 4), mm0);
- punpckhdq_r2r(mm6, mm3); /* final result mm3 = row4 [3 7 11 15] */
-
- movq_r2m(mm4, *(inptr + ROW_STRIDE * 2)); /* store row 3 */
- movq_r2r(mm0, mm4); /* mm4 = copy of row1[A B C D] */
-
- movq_r2m(mm1, *(inptr + ROW_STRIDE * 1)); /* store row 2 */
-
- movq_m2r(*(out + ROW_STRIDE*5 + 4), mm1);
-
- movq_r2m(mm3, *(inptr + ROW_STRIDE * 3)); /* store row 4 */
- punpcklwd_r2r(mm1, mm0); /* mm0 = [ 0 4 1 5] */
-
- /* 2) transpose lower-right quadrant */
-
+ movq_m2r (*(out + ROW_STRIDE * 4 + 4), mm0);
+ punpckhdq_r2r (mm6, mm3); /* final result mm3 = row4 [3 7 11 15] */
+ movq_r2m (mm4, *(inptr + ROW_STRIDE * 2)); /* store row 3 */
+ movq_r2r (mm0, mm4); /* mm4 = copy of row1[A B C D] */
+ movq_r2m (mm1, *(inptr + ROW_STRIDE * 1)); /* store row 2 */
+ movq_m2r (*(out + ROW_STRIDE * 5 + 4), mm1);
+ movq_r2m (mm3, *(inptr + ROW_STRIDE * 3)); /* store row 4 */
+ punpcklwd_r2r (mm1, mm0); /* mm0 = [ 0 4 1 5] */
+
+ /* 2) transpose lower-right quadrant */
+
/* movq mm0, qword ptr [OUT + ROW_STRIDE*4 + 8] */
-
+
/* movq mm1, qword ptr [OUT + ROW_STRIDE*5 + 8] */
/* movq mm4, mm0; // mm4 = copy of row1[A B C D] */
-
- movq_m2r(*(out + ROW_STRIDE*6 + 4), mm2);
-/* punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5] */
- punpckhwd_r2r(mm1, mm4); /* mm4 = [ 2 6 3 7] */
-
- movq_m2r(*(out + ROW_STRIDE*7 + 4), mm3);
- movq_r2r(mm2, mm6);
-
- punpcklwd_r2r(mm3, mm2); /* mm2 = [ 8 12 9 13] */
- movq_r2r(mm0, mm1); /* mm1 = [ 0 4 1 5] */
-
- punpckhwd_r2r(mm3, mm6); /* mm6 = 10 14 11 15] */
- movq_r2r(mm4, mm3); /* mm3 = [ 2 6 3 7] */
-
- punpckldq_r2r(mm2, mm0); /* final result mm0 = row1 [0 4 8 12] */
-
- punpckhdq_r2r(mm2, mm1); /* mm1 = final result mm1 = row2 [1 5 9 13] */
- ; /* slot */
-
- movq_r2m(mm0, *(inptr + ROW_STRIDE*4 + 4)); /* store row 1 */
- punpckldq_r2r(mm6, mm4); /* final result mm4 = row3 [2 6 10 14] */
-
- movq_m2r(*(out + ROW_STRIDE * 4 ), mm0);
- punpckhdq_r2r(mm6, mm3); /* final result mm3 = row4 [3 7 11 15] */
+ movq_m2r (*(out + ROW_STRIDE * 6 + 4), mm2);
- movq_r2m(mm4, *(inptr + ROW_STRIDE*6 + 4)); /* store row 3 */
- movq_r2r(mm0, mm4); /* mm4 = copy of row1[A B C D] */
-
- movq_r2m(mm1, *(inptr + ROW_STRIDE*5 + 4)); /* store row 2 */
- ; /* slot */
+/* punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5] */
+ punpckhwd_r2r (mm1, mm4); /* mm4 = [ 2 6 3 7] */
+ movq_m2r (*(out + ROW_STRIDE * 7 + 4), mm3);
+ movq_r2r (mm2, mm6);
+ punpcklwd_r2r (mm3, mm2); /* mm2 = [ 8 12 9 13] */
+ movq_r2r (mm0, mm1); /* mm1 = [ 0 4 1 5] */
+ punpckhwd_r2r (mm3, mm6); /* mm6 = [10 14 11 15] */
+ movq_r2r (mm4, mm3); /* mm3 = [ 2 6 3 7] */
+ punpckldq_r2r (mm2, mm0); /* final result mm0 = row1 [0 4 8 12] */
+ punpckhdq_r2r (mm2, mm1); /* final result mm1 = row2 [1 5 9 13] */
+ ; /* slot */
+ movq_r2m (mm0, *(inptr + ROW_STRIDE * 4 + 4)); /* store row 1 */
+ punpckldq_r2r (mm6, mm4); /* final result mm4 = row3 [2 6 10 14] */
+ movq_m2r (*(out + ROW_STRIDE * 4), mm0);
+ punpckhdq_r2r (mm6, mm3); /* final result mm3 = row4 [3 7 11 15] */
+ movq_r2m (mm4, *(inptr + ROW_STRIDE * 6 + 4)); /* store row 3 */
+ movq_r2r (mm0, mm4); /* mm4 = copy of row1[A B C D] */
+ movq_r2m (mm1, *(inptr + ROW_STRIDE * 5 + 4)); /* store row 2 */
+ ; /* slot */
+ movq_m2r (*(out + ROW_STRIDE * 5), mm1);
+ ; /* slot */
+ movq_r2m (mm3, *(inptr + ROW_STRIDE * 7 + 4)); /* store row 4 */
+ punpcklwd_r2r (mm1, mm0); /* mm0 = [ 0 4 1 5] */
- movq_m2r(*(out + ROW_STRIDE * 5 ), mm1);
- ; /* slot */
-
- movq_r2m(mm3, *(inptr + ROW_STRIDE*7 + 4)); /* store row 4 */
- punpcklwd_r2r(mm1, mm0); /* mm0 = [ 0 4 1 5] */
-
- /* 3) transpose lower-left */
+ /* 3) transpose lower-left */
/* movq mm0, qword ptr [OUT + ROW_STRIDE * 4 ] */
-
+
/* movq mm1, qword ptr [OUT + ROW_STRIDE * 5 ] */
/* movq mm4, mm0; // mm4 = copy of row1[A B C D] */
-
- movq_m2r(*(out + ROW_STRIDE * 6 ), mm2);
+ movq_m2r (*(out + ROW_STRIDE * 6), mm2);
+
/* punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5] */
- punpckhwd_r2r(mm1, mm4); /* mm4 = [ 2 6 3 7] */
-
- movq_m2r(*(out + ROW_STRIDE * 7 ), mm3);
- movq_r2r(mm2, mm6);
-
- punpcklwd_r2r(mm3, mm2); /* mm2 = [ 8 12 9 13] */
- movq_r2r(mm0, mm1); /* mm1 = [ 0 4 1 5] */
-
- punpckhwd_r2r(mm3, mm6); /* mm6 = 10 14 11 15] */
- movq_r2r(mm4, mm3); /* mm3 = [ 2 6 3 7] */
-
- punpckldq_r2r(mm2, mm0); /* final result mm0 = row1 [0 4 8 12] */
-
- punpckhdq_r2r(mm2, mm1); /* mm1 = final result mm1 = row2 [1 5 9 13] */
- ;/*slot */
-
- movq_r2m(mm0, *(inptr + ROW_STRIDE * 0 + 4 )); /* store row 1 */
- punpckldq_r2r(mm6, mm4); /* final result mm4 = row3 [2 6 10 14] */
-
+ punpckhwd_r2r (mm1, mm4); /* mm4 = [ 2 6 3 7] */
+ movq_m2r (*(out + ROW_STRIDE * 7), mm3);
+ movq_r2r (mm2, mm6);
+ punpcklwd_r2r (mm3, mm2); /* mm2 = [ 8 12 9 13] */
+ movq_r2r (mm0, mm1); /* mm1 = [ 0 4 1 5] */
+ punpckhwd_r2r (mm3, mm6); /* mm6 = [10 14 11 15] */
+ movq_r2r (mm4, mm3); /* mm3 = [ 2 6 3 7] */
+ punpckldq_r2r (mm2, mm0); /* final result mm0 = row1 [0 4 8 12] */
+ punpckhdq_r2r (mm2, mm1); /* final result mm1 = row2 [1 5 9 13] */
+ ; /*slot */
+ movq_r2m (mm0, *(inptr + ROW_STRIDE * 0 + 4)); /* store row 1 */
+ punpckldq_r2r (mm6, mm4); /* final result mm4 = row3 [2 6 10 14] */
+
/* begin reading next quadrant (upper-right) */
- movq_m2r(*(out + ROW_STRIDE*0 + 4), mm0);
- punpckhdq_r2r(mm6, mm3); /* final result mm3 = row4 [3 7 11 15] */
-
- movq_r2m(mm4, *(inptr + ROW_STRIDE * 2 + 4)); /* store row 3 */
- movq_r2r(mm0, mm4); /* mm4 = copy of row1[A B C D] */
-
- movq_r2m(mm1, *(inptr + ROW_STRIDE * 1 + 4)); /* store row 2 */
- movq_m2r(*(out + ROW_STRIDE*1 + 4), mm1);
-
- movq_r2m(mm3, *(inptr + ROW_STRIDE * 3 + 4)); /* store row 4 */
- punpcklwd_r2r(mm1, mm0); /* mm0 = [ 0 4 1 5] */
-
-
- /* 2) transpose lower-right quadrant */
-
+ movq_m2r (*(out + ROW_STRIDE * 0 + 4), mm0);
+ punpckhdq_r2r (mm6, mm3); /* final result mm3 = row4 [3 7 11 15] */
+ movq_r2m (mm4, *(inptr + ROW_STRIDE * 2 + 4)); /* store row 3 */
+ movq_r2r (mm0, mm4); /* mm4 = copy of row1[A B C D] */
+ movq_r2m (mm1, *(inptr + ROW_STRIDE * 1 + 4)); /* store row 2 */
+ movq_m2r (*(out + ROW_STRIDE * 1 + 4), mm1);
+ movq_r2m (mm3, *(inptr + ROW_STRIDE * 3 + 4)); /* store row 4 */
+ punpcklwd_r2r (mm1, mm0); /* mm0 = [ 0 4 1 5] */
+
+ /* 4) transpose upper-right quadrant */
+
/* movq mm0, qword ptr [OUT + ROW_STRIDE*4 + 8] */
-
+
/* movq mm1, qword ptr [OUT + ROW_STRIDE*5 + 8] */
/* movq mm4, mm0; // mm4 = copy of row1[A B C D] */
-
- movq_m2r(*(out + ROW_STRIDE*2 + 4), mm2);
+ movq_m2r (*(out + ROW_STRIDE * 2 + 4), mm2);
+
/* punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5] */
- punpckhwd_r2r(mm1, mm4); /* mm4 = [ 2 6 3 7] */
-
- movq_m2r(*(out + ROW_STRIDE*3 + 4), mm3);
- movq_r2r(mm2, mm6);
-
- punpcklwd_r2r(mm3, mm2); /* mm2 = [ 8 12 9 13] */
- movq_r2r(mm0, mm1); /* mm1 = [ 0 4 1 5] */
-
- punpckhwd_r2r(mm3, mm6); /* mm6 = 10 14 11 15] */
- movq_r2r(mm4, mm3); /* mm3 = [ 2 6 3 7] */
-
- punpckldq_r2r(mm2, mm0); /* final result mm0 = row1 [0 4 8 12] */
-
- punpckhdq_r2r(mm2, mm1); /* mm1 = final result mm1 = row2 [1 5 9 13] */
- ; /* slot */
-
- movq_r2m(mm0, *(inptr + ROW_STRIDE*4)); /* store row 1 */
- punpckldq_r2r(mm6, mm4); /* final result mm4 = row3 [2 6 10 14] */
-
- movq_r2m(mm1, *(inptr + ROW_STRIDE*5)); /* store row 2 */
- punpckhdq_r2r(mm6, mm3); /* final result mm3 = row4 [3 7 11 15] */
-
- movq_r2m(mm4, *(inptr + ROW_STRIDE*6)); /* store row 3 */
- ; /* slot */
-
- movq_r2m(mm3, *(inptr + ROW_STRIDE*7)); /* store row 4 */
- ; /* slot */
-
-}
-
-
-static void
-idct_mmx32_cols( short *blk ) /* transform all 8 cols of 8x8 iDCT block */
-{
- int x;
- short *inptr = blk;
-
- /* Despite the function's name, the matrix is transformed */
- /* row by row. This function is identical to idct_mmx32_rows(), */
- /* except for the SHIFT amount and ROUND_INV amount. */
-
- /* this subroutine performs two operations */
- /* 1) iDCT row transform */
- /* for( i = 0; i < 8; ++ i) */
- /* DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] ); */
- /* */
- /* 2) transpose the matrix (which was stored in qwTemp[]) */
- /* qwTemp[] -> [8x8 matrix transpose] -> blk[] */
-
-
- for (x=0; x<8; x++) { /* transform one row per iteration */
-
- movq_m2r(*(inptr), mm0); /* 0 ; x3 x2 x1 x0 */
-
- movq_m2r(*(inptr+4), mm1); /* 1 ; x7 x6 x5 x4 */
- movq_r2r(mm0, mm2); /* 2 ; x3 x2 x1 x0 */
-
- movq_m2r(*(tab_i_01234567), mm3); /* 3 ; w06 w04 w02 w00 */
- punpcklwd_r2r(mm1, mm0); /* x5 x1 x4 x0 */
-
+ punpckhwd_r2r (mm1, mm4); /* mm4 = [ 2 6 3 7] */
+ movq_m2r (*(out + ROW_STRIDE * 3 + 4), mm3);
+ movq_r2r (mm2, mm6);
+ punpcklwd_r2r (mm3, mm2); /* mm2 = [ 8 12 9 13] */
+ movq_r2r (mm0, mm1); /* mm1 = [ 0 4 1 5] */
+ punpckhwd_r2r (mm3, mm6); /* mm6 = [10 14 11 15] */
+ movq_r2r (mm4, mm3); /* mm3 = [ 2 6 3 7] */
+ punpckldq_r2r (mm2, mm0); /* final result mm0 = row1 [0 4 8 12] */
+ punpckhdq_r2r (mm2, mm1); /* final result mm1 = row2 [1 5 9 13] */
+ ; /* slot */
+ movq_r2m (mm0, *(inptr + ROW_STRIDE * 4)); /* store row 1 */
+ punpckldq_r2r (mm6, mm4); /* final result mm4 = row3 [2 6 10 14] */
+ movq_r2m (mm1, *(inptr + ROW_STRIDE * 5)); /* store row 2 */
+ punpckhdq_r2r (mm6, mm3); /* final result mm3 = row4 [3 7 11 15] */
+ movq_r2m (mm4, *(inptr + ROW_STRIDE * 6)); /* store row 3 */
+ ; /* slot */
+ movq_r2m (mm3, *(inptr + ROW_STRIDE * 7)); /* store row 4 */
+ ; /* slot */
+ }
+ static void
+idct_mmx32_cols (short *blk)
+{ /* transform all 8 cols of 8x8 iDCT block */
+ int x;
+ short *inptr = blk;
+
+
+ /* Despite the function's name, the matrix is transformed */
+ /* row by row. The per-row arithmetic matches idct_mmx32_rows(), */
+ /* except that it uses r_inv_col and SHIFT_INV_COL. */
+
+ /* unlike idct_mmx32_rows(), this subroutine performs one operation */
+ /* 1) iDCT row transform, written back in place */
+ /* for( i = 0; i < 8; ++ i) */
+ /* DCT_8_INV_ROW_1( blk[i*8], blk[i*8] ); */
+ /* */
+ /* there is no qwTemp[] buffer and no transpose step here; */
+ /* each result row overwrites its input row in blk[] */
+ for (x = 0; x < 8; x++) { /* transform one row per iteration */
+ movq_m2r (*(inptr), mm0); /* 0 ; x3 x2 x1 x0 */
+ movq_m2r (*(inptr + 4), mm1); /* 1 ; x7 x6 x5 x4 */
+ movq_r2r (mm0, mm2); /* 2 ; x3 x2 x1 x0 */
+ movq_m2r (*(tab_i_01234567), mm3); /* 3 ; w06 w04 w02 w00 */
+ punpcklwd_r2r (mm1, mm0); /* x5 x1 x4 x0 */
+
/* ---------- */
- movq_r2r(mm0, mm5); /* 5 ; x5 x1 x4 x0 */
- punpckldq_r2r(mm0, mm0); /* x4 x0 x4 x0 */
-
- movq_m2r(*(tab_i_01234567+4), mm4); /* 4 ; w07 w05 w03 w01 */
- punpckhwd_r2r(mm1, mm2); /* 1 ; x7 x3 x6 x2 */
-
- pmaddwd_r2r(mm0, mm3); /* x4*w06+x0*w04 x4*w02+x0*w00 */
- movq_r2r(mm2, mm6); /* 6 ; x7 x3 x6 x2 */
-
- movq_m2r(*(tab_i_01234567+16), mm1);/* 1 ; w22 w20 w18 w16 */
- punpckldq_r2r(mm2, mm2); /* x6 x2 x6 x2 */
-
- pmaddwd_r2r(mm2, mm4); /* x6*w07+x2*w05 x6*w03+x2*w01 */
- punpckhdq_r2r(mm5, mm5); /* x5 x1 x5 x1 */
-
- pmaddwd_m2r(*(tab_i_01234567+8), mm0);/* x4*w14+x0*w12 x4*w10+x0*w08 */
- punpckhdq_r2r(mm6, mm6); /* x7 x3 x7 x3 */
-
- movq_m2r(*(tab_i_01234567+20), mm7);/* 7 ; w23 w21 w19 w17 */
- pmaddwd_r2r(mm5, mm1); /* x5*w22+x1*w20 x5*w18+x1*w16 */
-
- paddd_m2r(*(r_inv_col), mm3);/* +rounder */
- pmaddwd_r2r(mm6, mm7); /* x7*w23+x3*w21 x7*w19+x3*w17 */
-
- pmaddwd_m2r(*(tab_i_01234567+12), mm2);/* x6*w15+x2*w13 x6*w11+x2*w09 */
- paddd_r2r(mm4, mm3); /* 4 ; a1=sum(even1) a0=sum(even0) */
-
- pmaddwd_m2r(*(tab_i_01234567+24), mm5);/* x5*w30+x1*w28 x5*w26+x1*w24 */
- movq_r2r(mm3, mm4); /* 4 ; a1 a0 */
-
- pmaddwd_m2r(*(tab_i_01234567+28), mm6);/* x7*w31+x3*w29 x7*w27+x3*w25 */
- paddd_r2r(mm7, mm1); /* 7 ; b1=sum(odd1) b0=sum(odd0) */
-
- paddd_m2r(*(r_inv_col), mm0);/* +rounder */
- psubd_r2r(mm1, mm3); /* a1-b1 a0-b0 */
-
- psrad_i2r(SHIFT_INV_COL, mm3); /* y6=a1-b1 y7=a0-b0 */
- paddd_r2r(mm4, mm1); /* 4 ; a1+b1 a0+b0 */
-
- paddd_r2r(mm2, mm0); /* 2 ; a3=sum(even3) a2=sum(even2) */
- psrad_i2r(SHIFT_INV_COL, mm1); /* y1=a1+b1 y0=a0+b0 */
-
- paddd_r2r(mm6, mm5); /* 6 ; b3=sum(odd3) b2=sum(odd2) */
- movq_r2r(mm0, mm4); /* 4 ; a3 a2 */
-
- paddd_r2r(mm5, mm0); /* a3+b3 a2+b2 */
- psubd_r2r(mm5, mm4); /* 5 ; a3-b3 a2-b2 */
-
-
- psrad_i2r(SHIFT_INV_COL, mm4); /* y4=a3-b3 y5=a2-b2 */
- psrad_i2r(SHIFT_INV_COL, mm0); /* y3=a3+b3 y2=a2+b2 */
-
- packssdw_r2r(mm3, mm4); /* 3 ; y6 y7 y4 y5 */
-
- packssdw_r2r(mm0, mm1); /* 0 ; y3 y2 y1 y0 */
- movq_r2r(mm4, mm7); /* 7 ; y6 y7 y4 y5 */
-
- psrld_i2r(16, mm4); /* 0 y6 0 y4 */
-
- movq_r2m(mm1, *(inptr)); /* 1 ; save y3 y2 y1 y0 */
- pslld_i2r(16, mm7); /* y7 0 y5 0 */
-
- por_r2r(mm4, mm7); /* 4 ; y7 y6 y5 y4 */
-
- /* begin processing row 1 */
- movq_r2m(mm7, *(inptr+4)); /* 7 ; save y7 y6 y5 y4 */
-
- inptr += 8;
- }
- /* done with the iDCT column-transformation */
-}
-
+ movq_r2r (mm0, mm5); /* 5 ; x5 x1 x4 x0 */
+ punpckldq_r2r (mm0, mm0); /* x4 x0 x4 x0 */
+ movq_m2r (*(tab_i_01234567 + 4), mm4); /* 4 ; w07 w05 w03 w01 */
+ punpckhwd_r2r (mm1, mm2); /* 1 ; x7 x3 x6 x2 */
+ pmaddwd_r2r (mm0, mm3); /* x4*w06+x0*w04 x4*w02+x0*w00 */
+ movq_r2r (mm2, mm6); /* 6 ; x7 x3 x6 x2 */
+ movq_m2r (*(tab_i_01234567 + 16), mm1); /* 1 ; w22 w20 w18 w16 */
+ punpckldq_r2r (mm2, mm2); /* x6 x2 x6 x2 */
+ pmaddwd_r2r (mm2, mm4); /* x6*w07+x2*w05 x6*w03+x2*w01 */
+ punpckhdq_r2r (mm5, mm5); /* x5 x1 x5 x1 */
+ pmaddwd_m2r (*(tab_i_01234567 + 8), mm0); /* x4*w14+x0*w12 x4*w10+x0*w08 */
+ punpckhdq_r2r (mm6, mm6); /* x7 x3 x7 x3 */
+ movq_m2r (*(tab_i_01234567 + 20), mm7); /* 7 ; w23 w21 w19 w17 */
+ pmaddwd_r2r (mm5, mm1); /* x5*w22+x1*w20 x5*w18+x1*w16 */
+ paddd_m2r (*(r_inv_col), mm3); /* +rounder */
+ pmaddwd_r2r (mm6, mm7); /* x7*w23+x3*w21 x7*w19+x3*w17 */
+ pmaddwd_m2r (*(tab_i_01234567 + 12), mm2); /* x6*w15+x2*w13 x6*w11+x2*w09 */
+ paddd_r2r (mm4, mm3); /* 4 ; a1=sum(even1) a0=sum(even0) */
+ pmaddwd_m2r (*(tab_i_01234567 + 24), mm5); /* x5*w30+x1*w28 x5*w26+x1*w24 */
+ movq_r2r (mm3, mm4); /* 4 ; a1 a0 */
+ pmaddwd_m2r (*(tab_i_01234567 + 28), mm6); /* x7*w31+x3*w29 x7*w27+x3*w25 */
+ paddd_r2r (mm7, mm1); /* 7 ; b1=sum(odd1) b0=sum(odd0) */
+ paddd_m2r (*(r_inv_col), mm0); /* +rounder */
+ psubd_r2r (mm1, mm3); /* a1-b1 a0-b0 */
+ psrad_i2r (SHIFT_INV_COL, mm3); /* y6=a1-b1 y7=a0-b0 */
+ paddd_r2r (mm4, mm1); /* 4 ; a1+b1 a0+b0 */
+ paddd_r2r (mm2, mm0); /* 2 ; a3=sum(even3) a2=sum(even2) */
+ psrad_i2r (SHIFT_INV_COL, mm1); /* y1=a1+b1 y0=a0+b0 */
+ paddd_r2r (mm6, mm5); /* 6 ; b3=sum(odd3) b2=sum(odd2) */
+ movq_r2r (mm0, mm4); /* 4 ; a3 a2 */
+ paddd_r2r (mm5, mm0); /* a3+b3 a2+b2 */
+ psubd_r2r (mm5, mm4); /* 5 ; a3-b3 a2-b2 */
+ psrad_i2r (SHIFT_INV_COL, mm4); /* y4=a3-b3 y5=a2-b2 */
+ psrad_i2r (SHIFT_INV_COL, mm0); /* y3=a3+b3 y2=a2+b2 */
+ packssdw_r2r (mm3, mm4); /* 3 ; y6 y7 y4 y5 */
+ packssdw_r2r (mm0, mm1); /* 0 ; y3 y2 y1 y0 */
+ movq_r2r (mm4, mm7); /* 7 ; y6 y7 y4 y5 */
+ psrld_i2r (16, mm4); /* 0 y6 0 y4 */
+ movq_r2m (mm1, *(inptr)); /* 1 ; save y3 y2 y1 y0 */
+ pslld_i2r (16, mm7); /* y7 0 y5 0 */
+ por_r2r (mm4, mm7); /* 4 ; y7 y6 y5 y4 */
+
+ /* store the upper half of this row, then advance to the next row */
+ movq_r2m (mm7, *(inptr + 4)); /* 7 ; save y7 y6 y5 y4 */
+ inptr += 8;
+ }
+
+ /* done with the iDCT column-transformation */
+}
+
+
/* */
/* public interface to MMX32 IDCT 8x8 operation */
/* */
-void
-gst_idct_mmx32_idct( short *blk )
-{
- /* 1) iDCT row transformation */
- idct_mmx32_rows( blk ); /* 1) transform iDCT row, and transpose */
-
- /* 2) iDCT column transformation */
- idct_mmx32_cols( blk ); /* 2) transform iDCT row, and transpose */
-
- emms(); /* restore processor state */
- /* all done */
-}
+void
+gst_idct_mmx32_idct (short *blk)
+{
+
+ /* 1) iDCT row transformation */
+ idct_mmx32_rows (blk); /* 1) transform iDCT row, and transpose */
+
+ /* 2) iDCT column transformation */
+ idct_mmx32_cols (blk); /* 2) transform iDCT row, in place (no transpose) */
+ emms (); /* restore processor state */
+ /* all done */
+}