diff options
author | Thomas Vander Stichele <thomas@apestaart.org> | 2004-03-14 22:34:33 +0000 |
---|---|---|
committer | Thomas Vander Stichele <thomas@apestaart.org> | 2004-03-14 22:34:33 +0000 |
commit | 7a778ee4b7ec09a1f5b2185c9cceee3910dfbdf2 (patch) | |
tree | f863b467dea9559a6ec9c48affbfae11f8104164 /gst-libs/gst/idct | |
parent | a19db4bbdc4a15ea0d8f4d28e9a1302c9c3d1657 (diff) | |
download | gst-plugins-bad-7a778ee4b7ec09a1f5b2185c9cceee3910dfbdf2.tar.gz gst-plugins-bad-7a778ee4b7ec09a1f5b2185c9cceee3910dfbdf2.tar.bz2 gst-plugins-bad-7a778ee4b7ec09a1f5b2185c9cceee3910dfbdf2.zip |
gst-indent
Original commit message from CVS:
gst-indent
Diffstat (limited to 'gst-libs/gst/idct')
-rw-r--r-- | gst-libs/gst/idct/dct.h | 5 | ||||
-rw-r--r-- | gst-libs/gst/idct/fastintidct.c | 156 | ||||
-rw-r--r-- | gst-libs/gst/idct/floatidct.c | 42 | ||||
-rw-r--r-- | gst-libs/gst/idct/idct.c | 160 | ||||
-rw-r--r-- | gst-libs/gst/idct/idct.h | 23 | ||||
-rw-r--r-- | gst-libs/gst/idct/ieeetest.c | 293 | ||||
-rw-r--r-- | gst-libs/gst/idct/intidct.c | 205 | ||||
-rw-r--r-- | gst-libs/gst/idct/mmx32idct.c | 993 |
8 files changed, 908 insertions, 969 deletions
diff --git a/gst-libs/gst/idct/dct.h b/gst-libs/gst/idct/dct.h index efb3ddb3..c2e37449 100644 --- a/gst-libs/gst/idct/dct.h +++ b/gst-libs/gst/idct/dct.h @@ -16,7 +16,7 @@ typedef DCTELEM DCTBLOCK[DCTSIZE2]; typedef long INT32; /* must be at least 32 bits */ -extern void gst_idct_int_idct(); +extern void gst_idct_int_idct (); extern void gst_idct_init_fast_int_idct (void); extern void gst_idct_fast_int_idct (short *block); @@ -27,6 +27,5 @@ extern void gst_idct_mmx32_idct (short *block); extern void gst_idct_sse_idct (short *block); #endif /* HAVE_LIBMMX */ -extern void gst_idct_init_float_idct(void); +extern void gst_idct_init_float_idct (void); extern void gst_idct_float_idct (short *block); - diff --git a/gst-libs/gst/idct/fastintidct.c b/gst-libs/gst/idct/fastintidct.c index 27426672..9bb1436d 100644 --- a/gst-libs/gst/idct/fastintidct.c +++ b/gst-libs/gst/idct/fastintidct.c @@ -45,17 +45,17 @@ /* this code assumes >> to be a two's-complement arithmetic */ /* right shift: (-2)>>1 == -1 , (-3)>>1 == -2 */ -#define W1 2841 /* 2048*sqrt(2)*cos(1*pi/16) */ -#define W2 2676 /* 2048*sqrt(2)*cos(2*pi/16) */ -#define W3 2408 /* 2048*sqrt(2)*cos(3*pi/16) */ -#define W5 1609 /* 2048*sqrt(2)*cos(5*pi/16) */ -#define W6 1108 /* 2048*sqrt(2)*cos(6*pi/16) */ -#define W7 565 /* 2048*sqrt(2)*cos(7*pi/16) */ +#define W1 2841 /* 2048*sqrt(2)*cos(1*pi/16) */ +#define W2 2676 /* 2048*sqrt(2)*cos(2*pi/16) */ +#define W3 2408 /* 2048*sqrt(2)*cos(3*pi/16) */ +#define W5 1609 /* 2048*sqrt(2)*cos(5*pi/16) */ +#define W6 1108 /* 2048*sqrt(2)*cos(6*pi/16) */ +#define W7 565 /* 2048*sqrt(2)*cos(7*pi/16) */ #include "dct.h" /* private data */ -static short iclip[1024]; /* clipping table */ +static short iclip[1024]; /* clipping table */ static short *iclp; /* private prototypes */ @@ -72,57 +72,58 @@ static void idctcol (short *blk); * c[1..7] = 128*sqrt(2) */ -static void idctrow(blk) -short *blk; +static void +idctrow (blk) + short *blk; { int x0, x1, x2, x3, x4, x5, x6, x7, x8; /* 
shortcut */ - if (!((x1 = blk[4]<<11) | (x2 = blk[6]) | (x3 = blk[2]) | - (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) - { - blk[0]=blk[1]=blk[2]=blk[3]=blk[4]=blk[5]=blk[6]=blk[7]=blk[0]<<3; + if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) | + (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) { + blk[0] = blk[1] = blk[2] = blk[3] = blk[4] = blk[5] = blk[6] = blk[7] = + blk[0] << 3; return; } - x0 = (blk[0]<<11) + 128; /* for proper rounding in the fourth stage */ + x0 = (blk[0] << 11) + 128; /* for proper rounding in the fourth stage */ /* first stage */ - x8 = W7*(x4+x5); - x4 = x8 + (W1-W7)*x4; - x5 = x8 - (W1+W7)*x5; - x8 = W3*(x6+x7); - x6 = x8 - (W3-W5)*x6; - x7 = x8 - (W3+W5)*x7; - + x8 = W7 * (x4 + x5); + x4 = x8 + (W1 - W7) * x4; + x5 = x8 - (W1 + W7) * x5; + x8 = W3 * (x6 + x7); + x6 = x8 - (W3 - W5) * x6; + x7 = x8 - (W3 + W5) * x7; + /* second stage */ x8 = x0 + x1; x0 -= x1; - x1 = W6*(x3+x2); - x2 = x1 - (W2+W6)*x2; - x3 = x1 + (W2-W6)*x3; + x1 = W6 * (x3 + x2); + x2 = x1 - (W2 + W6) * x2; + x3 = x1 + (W2 - W6) * x3; x1 = x4 + x6; x4 -= x6; x6 = x5 + x7; x5 -= x7; - + /* third stage */ x7 = x8 + x3; x8 -= x3; x3 = x0 + x2; x0 -= x2; - x2 = (181*(x4+x5)+128)>>8; - x4 = (181*(x4-x5)+128)>>8; - + x2 = (181 * (x4 + x5) + 128) >> 8; + x4 = (181 * (x4 - x5) + 128) >> 8; + /* fourth stage */ - blk[0] = (x7+x1)>>8; - blk[1] = (x3+x2)>>8; - blk[2] = (x0+x4)>>8; - blk[3] = (x8+x6)>>8; - blk[4] = (x8-x6)>>8; - blk[5] = (x0-x4)>>8; - blk[6] = (x3-x2)>>8; - blk[7] = (x7-x1)>>8; + blk[0] = (x7 + x1) >> 8; + blk[1] = (x3 + x2) >> 8; + blk[2] = (x0 + x4) >> 8; + blk[3] = (x8 + x6) >> 8; + blk[4] = (x8 - x6) >> 8; + blk[5] = (x0 - x4) >> 8; + blk[6] = (x3 - x2) >> 8; + blk[7] = (x7 - x1) >> 8; } /* column (vertical) IDCT @@ -134,78 +135,81 @@ short *blk; * where: c[0] = 1/1024 * c[1..7] = (1/1024)*sqrt(2) */ -static void idctcol(blk) -short *blk; +static void +idctcol (blk) + short *blk; { int x0, x1, x2, x3, x4, x5, x6, x7, 
x8; /* shortcut */ - if (!((x1 = (blk[8*4]<<8)) | (x2 = blk[8*6]) | (x3 = blk[8*2]) | - (x4 = blk[8*1]) | (x5 = blk[8*7]) | (x6 = blk[8*5]) | (x7 = blk[8*3]))) - { - blk[8*0]=blk[8*1]=blk[8*2]=blk[8*3]=blk[8*4]=blk[8*5]=blk[8*6]=blk[8*7]= - iclp[(blk[8*0]+32)>>6]; + if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) | + (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) | (x7 = + blk[8 * 3]))) { + blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3] = blk[8 * 4] = + blk[8 * 5] = blk[8 * 6] = blk[8 * 7] = iclp[(blk[8 * 0] + 32) >> 6]; return; } - x0 = (blk[8*0]<<8) + 8192; + x0 = (blk[8 * 0] << 8) + 8192; /* first stage */ - x8 = W7*(x4+x5) + 4; - x4 = (x8+(W1-W7)*x4)>>3; - x5 = (x8-(W1+W7)*x5)>>3; - x8 = W3*(x6+x7) + 4; - x6 = (x8-(W3-W5)*x6)>>3; - x7 = (x8-(W3+W5)*x7)>>3; - + x8 = W7 * (x4 + x5) + 4; + x4 = (x8 + (W1 - W7) * x4) >> 3; + x5 = (x8 - (W1 + W7) * x5) >> 3; + x8 = W3 * (x6 + x7) + 4; + x6 = (x8 - (W3 - W5) * x6) >> 3; + x7 = (x8 - (W3 + W5) * x7) >> 3; + /* second stage */ x8 = x0 + x1; x0 -= x1; - x1 = W6*(x3+x2) + 4; - x2 = (x1-(W2+W6)*x2)>>3; - x3 = (x1+(W2-W6)*x3)>>3; + x1 = W6 * (x3 + x2) + 4; + x2 = (x1 - (W2 + W6) * x2) >> 3; + x3 = (x1 + (W2 - W6) * x3) >> 3; x1 = x4 + x6; x4 -= x6; x6 = x5 + x7; x5 -= x7; - + /* third stage */ x7 = x8 + x3; x8 -= x3; x3 = x0 + x2; x0 -= x2; - x2 = (181*(x4+x5)+128)>>8; - x4 = (181*(x4-x5)+128)>>8; - + x2 = (181 * (x4 + x5) + 128) >> 8; + x4 = (181 * (x4 - x5) + 128) >> 8; + /* fourth stage */ - blk[8*0] = iclp[(x7+x1)>>14]; - blk[8*1] = iclp[(x3+x2)>>14]; - blk[8*2] = iclp[(x0+x4)>>14]; - blk[8*3] = iclp[(x8+x6)>>14]; - blk[8*4] = iclp[(x8-x6)>>14]; - blk[8*5] = iclp[(x0-x4)>>14]; - blk[8*6] = iclp[(x3-x2)>>14]; - blk[8*7] = iclp[(x7-x1)>>14]; + blk[8 * 0] = iclp[(x7 + x1) >> 14]; + blk[8 * 1] = iclp[(x3 + x2) >> 14]; + blk[8 * 2] = iclp[(x0 + x4) >> 14]; + blk[8 * 3] = iclp[(x8 + x6) >> 14]; + blk[8 * 4] = iclp[(x8 - x6) >> 14]; + blk[8 * 5] = iclp[(x0 - x4) >> 14]; + blk[8 * 6] = 
iclp[(x3 - x2) >> 14]; + blk[8 * 7] = iclp[(x7 - x1) >> 14]; } /* two dimensional inverse discrete cosine transform */ -void gst_idct_fast_int_idct(block) -short *block; +void +gst_idct_fast_int_idct (block) + short *block; { int i; - for (i=0; i<8; i++) - idctrow(block+8*i); + for (i = 0; i < 8; i++) + idctrow (block + 8 * i); - for (i=0; i<8; i++) - idctcol(block+i); + for (i = 0; i < 8; i++) + idctcol (block + i); } -void gst_idct_init_fast_int_idct() +void +gst_idct_init_fast_int_idct () { int i; - iclp = iclip+512; - for (i= -512; i<512; i++) - iclp[i] = (i<-256) ? -256 : ((i>255) ? 255 : i); + iclp = iclip + 512; + for (i = -512; i < 512; i++) + iclp[i] = (i < -256) ? -256 : ((i > 255) ? 255 : i); } diff --git a/gst-libs/gst/idct/floatidct.c b/gst-libs/gst/idct/floatidct.c index b215bd78..0fa1e830 100644 --- a/gst-libs/gst/idct/floatidct.c +++ b/gst-libs/gst/idct/floatidct.c @@ -56,51 +56,51 @@ static double gst_idct_float_c[8][8]; /* initialize DCT coefficient matrix */ -void gst_idct_init_float_idct() +void +gst_idct_init_float_idct () { int freq, time; double scale; - for (freq=0; freq < 8; freq++) - { - scale = (freq == 0) ? sqrt(0.125) : 0.5; - for (time=0; time<8; time++) - gst_idct_float_c[freq][time] = scale*cos((PI/8.0)*freq*(time + 0.5)); + for (freq = 0; freq < 8; freq++) { + scale = (freq == 0) ? 
sqrt (0.125) : 0.5; + for (time = 0; time < 8; time++) + gst_idct_float_c[freq][time] = + scale * cos ((PI / 8.0) * freq * (time + 0.5)); } } /* perform IDCT matrix multiply for 8x8 coefficient block */ -void gst_idct_float_idct(block) -short *block; +void +gst_idct_float_idct (block) + short *block; { int i, j, k, v; double partial_product; double tmp[64]; - for (i=0; i<8; i++) - for (j=0; j<8; j++) - { + for (i = 0; i < 8; i++) + for (j = 0; j < 8; j++) { partial_product = 0.0; - for (k=0; k<8; k++) - partial_product+= gst_idct_float_c[k][j]*block[8*i+k]; + for (k = 0; k < 8; k++) + partial_product += gst_idct_float_c[k][j] * block[8 * i + k]; - tmp[8*i+j] = partial_product; + tmp[8 * i + j] = partial_product; } /* Transpose operation is integrated into address mapping by switching loop order of i and j */ - for (j=0; j<8; j++) - for (i=0; i<8; i++) - { + for (j = 0; j < 8; j++) + for (i = 0; i < 8; i++) { partial_product = 0.0; - for (k=0; k<8; k++) - partial_product+= gst_idct_float_c[k][i]*tmp[8*k+j]; + for (k = 0; k < 8; k++) + partial_product += gst_idct_float_c[k][i] * tmp[8 * k + j]; - v = (int) floor(partial_product+0.5); - block[8*i+j] = (v<-256) ? -256 : ((v>255) ? 255 : v); + v = (int) floor (partial_product + 0.5); + block[8 * i + j] = (v < -256) ? -256 : ((v > 255) ? 
255 : v); } } diff --git a/gst-libs/gst/idct/idct.c b/gst-libs/gst/idct/idct.c index 59c6a844..4be150f1 100644 --- a/gst-libs/gst/idct/idct.c +++ b/gst-libs/gst/idct/idct.c @@ -25,24 +25,25 @@ #include <gst/idct/idct.h> #include "dct.h" -static void gst_idct_int_sparse_idct(short *data); +static void gst_idct_int_sparse_idct (short *data); -GstIDCT *gst_idct_new(GstIDCTMethod method) +GstIDCT * +gst_idct_new (GstIDCTMethod method) { - GstIDCT *new = g_malloc(sizeof(GstIDCT)); + GstIDCT *new = g_malloc (sizeof (GstIDCT)); new->need_transpose = FALSE; if (method == GST_IDCT_DEFAULT) { #ifdef HAVE_LIBMMX - if (gst_cpu_get_flags() & GST_CPU_FLAG_MMX) { + if (gst_cpu_get_flags () & GST_CPU_FLAG_MMX) { method = GST_IDCT_MMX; } /* disabled for now - if (gst_cpu_get_flags() & GST_CPU_FLAG_SSE) { - method = GST_IDCT_SSE; - } - */ + if (gst_cpu_get_flags() & GST_CPU_FLAG_SSE) { + method = GST_IDCT_SSE; + } + */ else #endif /* HAVE_LIBMMX */ { @@ -53,49 +54,50 @@ GstIDCT *gst_idct_new(GstIDCTMethod method) new->convert_sparse = gst_idct_int_sparse_idct; switch (method) { - case GST_IDCT_FAST_INT: - GST_INFO ( "using fast_int_idct"); - gst_idct_init_fast_int_idct(); - new->convert = gst_idct_fast_int_idct; - break; - case GST_IDCT_INT: - GST_INFO ( "using int_idct"); - new->convert = gst_idct_int_idct; - break; - case GST_IDCT_FLOAT: - GST_INFO ( "using float_idct"); - gst_idct_init_float_idct(); - new->convert = gst_idct_float_idct; - break; + case GST_IDCT_FAST_INT: + GST_INFO ("using fast_int_idct"); + gst_idct_init_fast_int_idct (); + new->convert = gst_idct_fast_int_idct; + break; + case GST_IDCT_INT: + GST_INFO ("using int_idct"); + new->convert = gst_idct_int_idct; + break; + case GST_IDCT_FLOAT: + GST_INFO ("using float_idct"); + gst_idct_init_float_idct (); + new->convert = gst_idct_float_idct; + break; #ifdef HAVE_LIBMMX - case GST_IDCT_MMX: - GST_INFO ( "using MMX_idct"); - new->convert = gst_idct_mmx_idct; - new->need_transpose = TRUE; - break; - case 
GST_IDCT_MMX32: - GST_INFO ( "using MMX32_idct"); - new->convert = gst_idct_mmx32_idct; - new->need_transpose = TRUE; - break; - case GST_IDCT_SSE: - GST_INFO ( "using SSE_idct"); - new->convert = gst_idct_sse_idct; - new->need_transpose = TRUE; - break; + case GST_IDCT_MMX: + GST_INFO ("using MMX_idct"); + new->convert = gst_idct_mmx_idct; + new->need_transpose = TRUE; + break; + case GST_IDCT_MMX32: + GST_INFO ("using MMX32_idct"); + new->convert = gst_idct_mmx32_idct; + new->need_transpose = TRUE; + break; + case GST_IDCT_SSE: + GST_INFO ("using SSE_idct"); + new->convert = gst_idct_sse_idct; + new->need_transpose = TRUE; + break; #endif /* HAVE_LIBMMX */ - default: - GST_INFO ( "method not supported"); - g_free(new); - return NULL; + default: + GST_INFO ("method not supported"); + g_free (new); + return NULL; } return new; } -static void gst_idct_int_sparse_idct(short *data) +static void +gst_idct_int_sparse_idct (short *data) { short val; - gint32 v, *dp = (guint32 *)data; + gint32 v, *dp = (guint32 *) data; v = *data; @@ -104,43 +106,61 @@ static void gst_idct_int_sparse_idct(short *data) val += (8 >> 1); val /= 8; val = -val; - } - else { + } else { val = (v + (8 >> 1)) / 8; } - v = (( val & 0xffff) | (val << 16)); - - dp[0] = v; dp[1] = v; dp[2] = v; dp[3] = v; - dp[4] = v; dp[5] = v; dp[6] = v; dp[7] = v; - dp[8] = v; dp[9] = v; dp[10] = v; dp[11] = v; - dp[12] = v; dp[13] = v; dp[14] = v; dp[15] = v; - dp[16] = v; dp[17] = v; dp[18] = v; dp[19] = v; - dp[20] = v; dp[21] = v; dp[22] = v; dp[23] = v; - dp[24] = v; dp[25] = v; dp[26] = v; dp[27] = v; - dp[28] = v; dp[29] = v; dp[30] = v; dp[31] = v; + v = ((val & 0xffff) | (val << 16)); + + dp[0] = v; + dp[1] = v; + dp[2] = v; + dp[3] = v; + dp[4] = v; + dp[5] = v; + dp[6] = v; + dp[7] = v; + dp[8] = v; + dp[9] = v; + dp[10] = v; + dp[11] = v; + dp[12] = v; + dp[13] = v; + dp[14] = v; + dp[15] = v; + dp[16] = v; + dp[17] = v; + dp[18] = v; + dp[19] = v; + dp[20] = v; + dp[21] = v; + dp[22] = v; + dp[23] = v; 
+ dp[24] = v; + dp[25] = v; + dp[26] = v; + dp[27] = v; + dp[28] = v; + dp[29] = v; + dp[30] = v; + dp[31] = v; } -void gst_idct_destroy(GstIDCT *idct) +void +gst_idct_destroy (GstIDCT * idct) { - g_return_if_fail(idct != NULL); + g_return_if_fail (idct != NULL); - g_free(idct); + g_free (idct); } static gboolean -plugin_init (GstPlugin *plugin) +plugin_init (GstPlugin * plugin) { return TRUE; } -GST_PLUGIN_DEFINE ( - GST_VERSION_MAJOR, - GST_VERSION_MINOR, - "gstidct", - "Accelerated IDCT routines", - plugin_init, - VERSION, - GST_LICENSE, - GST_PACKAGE, - GST_ORIGIN -) +GST_PLUGIN_DEFINE (GST_VERSION_MAJOR, + GST_VERSION_MINOR, + "gstidct", + "Accelerated IDCT routines", + plugin_init, VERSION, GST_LICENSE, GST_PACKAGE, GST_ORIGIN) diff --git a/gst-libs/gst/idct/idct.h b/gst-libs/gst/idct/idct.h index fa6f62cd..37a2a0b9 100644 --- a/gst-libs/gst/idct/idct.h +++ b/gst-libs/gst/idct/idct.h @@ -23,22 +23,24 @@ #include <glib.h> -typedef enum { - GST_IDCT_DEFAULT, - GST_IDCT_INT, - GST_IDCT_FAST_INT, - GST_IDCT_FLOAT, - GST_IDCT_MMX, +typedef enum +{ + GST_IDCT_DEFAULT, + GST_IDCT_INT, + GST_IDCT_FAST_INT, + GST_IDCT_FLOAT, + GST_IDCT_MMX, GST_IDCT_MMX32, GST_IDCT_SSE, } GstIDCTMethod; typedef struct _GstIDCT GstIDCT; -typedef void (*GstIDCTFunction) (gshort *block); +typedef void (*GstIDCTFunction) (gshort * block); #define GST_IDCT_TRANSPOSE(idct) ((idct)->need_transpose) -struct _GstIDCT { +struct _GstIDCT +{ /* private */ GstIDCTFunction convert; GstIDCTFunction convert_sparse; @@ -46,9 +48,10 @@ struct _GstIDCT { }; -GstIDCT *gst_idct_new(GstIDCTMethod method); +GstIDCT *gst_idct_new (GstIDCTMethod method); + #define gst_idct_convert(idct, blocks) (idct)->convert((blocks)) #define gst_idct_convert_sparse(idct, blocks) (idct)->convert_sparse((blocks)) -void gst_idct_destroy(GstIDCT *idct); +void gst_idct_destroy (GstIDCT * idct); #endif /* __GST_IDCT_H__ */ diff --git a/gst-libs/gst/idct/ieeetest.c b/gst-libs/gst/idct/ieeetest.c index f5b270eb..d26181c1 100644 
--- a/gst-libs/gst/idct/ieeetest.c +++ b/gst-libs/gst/idct/ieeetest.c @@ -27,9 +27,9 @@ void usage (char *msg); long ieeerand (long L, long H); -void dct_init(void); -void ref_fdct(DCTELEM block[8][8]); -void ref_idct(DCTELEM block[8][8]); +void dct_init (void); +void ref_fdct (DCTELEM block[8][8]); +void ref_idct (DCTELEM block[8][8]); /* error stat accumulators -- assume initialized to 0 */ @@ -38,47 +38,49 @@ long sumsqerrs[DCTSIZE2]; int maxerr[DCTSIZE2]; -char * meets (double val, double limit) +char * +meets (double val, double limit) { - return ((fabs(val) <= limit) ? "meets" : "FAILS"); + return ((fabs (val) <= limit) ? "meets" : "FAILS"); } int -main(int argc, char **argv) +main (int argc, char **argv) { long minpix, maxpix, sign; long curiter, niters; int i, j; double max, total; int method; - DCTELEM block[DCTSIZE2]; /* random source data */ - DCTELEM refcoefs[DCTSIZE2]; /* coefs from reference FDCT */ - DCTELEM refout[DCTSIZE2]; /* output from reference IDCT */ - DCTELEM testout[DCTSIZE2]; /* output from test IDCT */ - GstIDCT *idct; - guint64 tscstart, tscmin = ~0, tscmax = 0; - guint64 tscstop; + DCTELEM block[DCTSIZE2]; /* random source data */ + DCTELEM refcoefs[DCTSIZE2]; /* coefs from reference FDCT */ + DCTELEM refout[DCTSIZE2]; /* output from reference IDCT */ + DCTELEM testout[DCTSIZE2]; /* output from test IDCT */ + GstIDCT *idct; + guint64 tscstart, tscmin = ~0, tscmax = 0; + guint64 tscstop; /* Argument parsing --- not very bulletproof at all */ - if (argc != 6) usage(NULL); + if (argc != 6) + usage (NULL); - method = atoi(argv[1]); - minpix = atoi(argv[2]); - maxpix = atoi(argv[3]); - sign = atoi(argv[4]); - niters = atol(argv[5]); + method = atoi (argv[1]); + minpix = atoi (argv[2]); + maxpix = atoi (argv[3]); + sign = atoi (argv[4]); + niters = atol (argv[5]); - gst_library_load("gstidct"); + gst_library_load ("gstidct"); - idct = gst_idct_new(method); + idct = gst_idct_new (method); if (idct == 0) { - printf("method not 
available\n\n\n"); + printf ("method not available\n\n\n"); return 0; } - dct_init(); + dct_init (); /* Loop once per generated random-data block */ @@ -86,164 +88,186 @@ main(int argc, char **argv) /* generate a pseudo-random block of data */ for (i = 0; i < DCTSIZE2; i++) - block[i] = (DCTELEM) (ieeerand(-minpix,maxpix) * sign); + block[i] = (DCTELEM) (ieeerand (-minpix, maxpix) * sign); /* perform reference FDCT */ - memcpy(refcoefs, block, sizeof(DCTELEM)*DCTSIZE2); - ref_fdct((DCTELEM **) &refcoefs); + memcpy (refcoefs, block, sizeof (DCTELEM) * DCTSIZE2); + ref_fdct ((DCTELEM **) & refcoefs); /* clip */ for (i = 0; i < DCTSIZE2; i++) { - if (refcoefs[i] < -2048) refcoefs[i] = -2048; - else if (refcoefs[i] > 2047) refcoefs[i] = 2047; + if (refcoefs[i] < -2048) + refcoefs[i] = -2048; + else if (refcoefs[i] > 2047) + refcoefs[i] = 2047; } /* perform reference IDCT */ - memcpy(refout, refcoefs, sizeof(DCTELEM)*DCTSIZE2); - ref_idct(refout); + memcpy (refout, refcoefs, sizeof (DCTELEM) * DCTSIZE2); + ref_idct (refout); /* clip */ for (i = 0; i < DCTSIZE2; i++) { - if (refout[i] < -256) refout[i] = -256; - else if (refout[i] > 255) refout[i] = 255; + if (refout[i] < -256) + refout[i] = -256; + else if (refout[i] > 255) + refout[i] = 255; } /* perform test IDCT */ - if (GST_IDCT_TRANSPOSE(idct)) { + if (GST_IDCT_TRANSPOSE (idct)) { for (j = 0; j < DCTSIZE; j++) { - for (i = 0; i < DCTSIZE; i++) { - testout[i*DCTSIZE+j] = refcoefs[j*DCTSIZE+i]; - } - } - } - else { - memcpy(testout, refcoefs, sizeof(DCTELEM)*DCTSIZE2); - } - - gst_trace_read_tsc(&tscstart); - gst_idct_convert(idct, testout); - gst_trace_read_tsc(&tscstop); - /*printf("time %llu, %llu %lld\n", tscstart, tscstop, tscstop-tscstart); */ - if (tscstop - tscstart < tscmin) tscmin = tscstop-tscstart; - if (tscstop - tscstart > tscmax) tscmax = tscstop-tscstart; + for (i = 0; i < DCTSIZE; i++) { + testout[i * DCTSIZE + j] = refcoefs[j * DCTSIZE + i]; + } + } + } else { + memcpy (testout, refcoefs, sizeof 
(DCTELEM) * DCTSIZE2); + } + + gst_trace_read_tsc (&tscstart); + gst_idct_convert (idct, testout); + gst_trace_read_tsc (&tscstop); + /*printf("time %llu, %llu %lld\n", tscstart, tscstop, tscstop-tscstart); */ + if (tscstop - tscstart < tscmin) + tscmin = tscstop - tscstart; + if (tscstop - tscstart > tscmax) + tscmax = tscstop - tscstart; /* clip */ for (i = 0; i < DCTSIZE2; i++) { - if (testout[i] < -256) testout[i] = -256; - else if (testout[i] > 255) testout[i] = 255; + if (testout[i] < -256) + testout[i] = -256; + else if (testout[i] > 255) + testout[i] = 255; } /* accumulate error stats */ for (i = 0; i < DCTSIZE2; i++) { register int err = testout[i] - refout[i]; + sumerrs[i] += err; sumsqerrs[i] += err * err; - if (err < 0) err = -err; - if (maxerr[i] < err) maxerr[i] = err; + if (err < 0) + err = -err; + if (maxerr[i] < err) + maxerr[i] = err; } if (curiter % 100 == 99) { - fprintf(stderr, "."); - fflush(stderr); + fprintf (stderr, "."); + fflush (stderr); } } - fprintf(stderr, "\n"); + fprintf (stderr, "\n"); /* print results */ - printf("IEEE test conditions: -L = %ld, +H = %ld, sign = %ld, #iters = %ld\n", - minpix, maxpix, sign, niters); + printf + ("IEEE test conditions: -L = %ld, +H = %ld, sign = %ld, #iters = %ld\n", + minpix, maxpix, sign, niters); - printf("Speed, min time %lld, max %lld\n", tscmin, tscmax); + printf ("Speed, min time %lld, max %lld\n", tscmin, tscmax); - printf("Peak absolute values of errors:\n"); + printf ("Peak absolute values of errors:\n"); for (i = 0, j = 0; i < DCTSIZE2; i++) { - if (j < maxerr[i]) j = maxerr[i]; - printf("%4d", maxerr[i]); - if ((i%DCTSIZE) == DCTSIZE-1) printf("\n"); + if (j < maxerr[i]) + j = maxerr[i]; + printf ("%4d", maxerr[i]); + if ((i % DCTSIZE) == DCTSIZE - 1) + printf ("\n"); } - printf("Worst peak error = %d (%s spec limit 1)\n\n", j, - meets((double) j, 1.0)); + printf ("Worst peak error = %d (%s spec limit 1)\n\n", j, + meets ((double) j, 1.0)); - printf("Mean square errors:\n"); + printf 
("Mean square errors:\n"); max = total = 0.0; for (i = 0; i < DCTSIZE2; i++) { - double err = (double) sumsqerrs[i] / ((double) niters); + double err = (double) sumsqerrs[i] / ((double) niters); + total += (double) sumsqerrs[i]; - if (max < err) max = err; - printf(" %8.4f", err); - if ((i%DCTSIZE) == DCTSIZE-1) printf("\n"); + if (max < err) + max = err; + printf (" %8.4f", err); + if ((i % DCTSIZE) == DCTSIZE - 1) + printf ("\n"); } - printf("Worst pmse = %.6f (%s spec limit 0.06)\n", max, meets(max, 0.06)); - total /= (double) (64*niters); - printf("Overall mse = %.6f (%s spec limit 0.02)\n\n", total, - meets(total, 0.02)); + printf ("Worst pmse = %.6f (%s spec limit 0.06)\n", max, meets (max, 0.06)); + total /= (double) (64 * niters); + printf ("Overall mse = %.6f (%s spec limit 0.02)\n\n", total, + meets (total, 0.02)); - printf("Mean errors:\n"); + printf ("Mean errors:\n"); max = total = 0.0; for (i = 0; i < DCTSIZE2; i++) { - double err = (double) sumerrs[i] / ((double) niters); + double err = (double) sumerrs[i] / ((double) niters); + total += (double) sumerrs[i]; - printf(" %8.4f", err); - if (err < 0.0) err = -err; - if (max < err) max = err; - if ((i%DCTSIZE) == DCTSIZE-1) printf("\n"); + printf (" %8.4f", err); + if (err < 0.0) + err = -err; + if (max < err) + max = err; + if ((i % DCTSIZE) == DCTSIZE - 1) + printf ("\n"); } - printf("Worst mean error = %.6f (%s spec limit 0.015)\n", max, - meets(max, 0.015)); - total /= (double) (64*niters); - printf("Overall mean error = %.6f (%s spec limit 0.0015)\n\n", total, - meets(total, 0.0015)); + printf ("Worst mean error = %.6f (%s spec limit 0.015)\n", max, + meets (max, 0.015)); + total /= (double) (64 * niters); + printf ("Overall mean error = %.6f (%s spec limit 0.0015)\n\n", total, + meets (total, 0.0015)); /* test for 0 input giving 0 output */ - memset(testout, 0, sizeof(DCTELEM)*DCTSIZE2); - gst_idct_convert(idct, testout); - for (i = 0, j=0; i < DCTSIZE2; i++) { + memset (testout, 0, sizeof 
(DCTELEM) * DCTSIZE2); + gst_idct_convert (idct, testout); + for (i = 0, j = 0; i < DCTSIZE2; i++) { if (testout[i]) { - printf("Position %d of IDCT(0) = %d (FAILS)\n", i, testout[i]); + printf ("Position %d of IDCT(0) = %d (FAILS)\n", i, testout[i]); j++; } } - printf("%d elements of IDCT(0) were not zero\n\n\n", j); + printf ("%d elements of IDCT(0) were not zero\n\n\n", j); - exit(0); + exit (0); return 0; } -void usage (char *msg) +void +usage (char *msg) { if (msg != NULL) - fprintf(stderr, "\nerror: %s\n", msg); - - fprintf(stderr, "\n"); - fprintf(stderr, "usage: ieeetest minpix maxpix sign niters\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " test = 1 - 5\n"); - fprintf(stderr, " minpix = -L value per IEEE spec\n"); - fprintf(stderr, " maxpix = H value per IEEE spec\n"); - fprintf(stderr, " sign = +1 for normal, -1 to run negated test\n"); - fprintf(stderr, " niters = # iterations (10000 for full test)\n"); - fprintf(stderr, "\n"); - - exit(1); + fprintf (stderr, "\nerror: %s\n", msg); + + fprintf (stderr, "\n"); + fprintf (stderr, "usage: ieeetest minpix maxpix sign niters\n"); + fprintf (stderr, "\n"); + fprintf (stderr, " test = 1 - 5\n"); + fprintf (stderr, " minpix = -L value per IEEE spec\n"); + fprintf (stderr, " maxpix = H value per IEEE spec\n"); + fprintf (stderr, " sign = +1 for normal, -1 to run negated test\n"); + fprintf (stderr, " niters = # iterations (10000 for full test)\n"); + fprintf (stderr, "\n"); + + exit (1); } /* Pseudo-random generator specified by IEEE 1180 */ -long ieeerand (long L, long H) +long +ieeerand (long L, long H) { static long randx = 1; static double z = (double) 0x7fffffff; - long i,j; + long i, j; double x; randx = (randx * 1103515245) + 12345; i = randx & 0x7ffffffe; x = ((double) i) / z; - x *= (L+H+1); + x *= (L + H + 1); j = x; - return j-L; + return j - L; } @@ -256,33 +280,35 @@ double coslu[8][8]; /* Routine to initialise the cosine lookup table */ -void dct_init(void) +void +dct_init (void) { - int a,b; + 
int a, b; double tmp; - for(a=0;a<8;a++) - for(b=0;b<8;b++) { - tmp = cos((double)((a+a+1)*b) * (3.14159265358979323846 / 16.0)); - if(b==0) - tmp /= sqrt(2.0); + for (a = 0; a < 8; a++) + for (b = 0; b < 8; b++) { + tmp = cos ((double) ((a + a + 1) * b) * (3.14159265358979323846 / 16.0)); + if (b == 0) + tmp /= sqrt (2.0); coslu[a][b] = tmp * 0.5; } } -void ref_fdct (DCTELEM block[8][8]) +void +ref_fdct (DCTELEM block[8][8]) { - int x,y,u,v; + int x, y, u, v; double tmp, tmp2; double res[8][8]; - for (v=0; v<8; v++) { - for (u=0; u<8; u++) { + for (v = 0; v < 8; v++) { + for (u = 0; u < 8; u++) { tmp = 0.0; - for (y=0; y<8; y++) { + for (y = 0; y < 8; y++) { tmp2 = 0.0; - for (x=0; x<8; x++) { + for (x = 0; x < 8; x++) { tmp2 += (double) block[y][x] * coslu[x][u]; } tmp += coslu[y][v] * tmp2; @@ -291,11 +317,11 @@ void ref_fdct (DCTELEM block[8][8]) } } - for (v=0; v<8; v++) { - for (u=0; u<8; u++) { + for (v = 0; v < 8; v++) { + for (u = 0; u < 8; u++) { tmp = res[v][u]; if (tmp < 0.0) { - x = - ((int) (0.5 - tmp)); + x = -((int) (0.5 - tmp)); } else { x = (int) (tmp + 0.5); } @@ -305,18 +331,19 @@ void ref_fdct (DCTELEM block[8][8]) } -void ref_idct (DCTELEM block[8][8]) +void +ref_idct (DCTELEM block[8][8]) { - int x,y,u,v; + int x, y, u, v; double tmp, tmp2; double res[8][8]; - for (y=0; y<8; y++) { - for (x=0; x<8; x++) { + for (y = 0; y < 8; y++) { + for (x = 0; x < 8; x++) { tmp = 0.0; - for (v=0; v<8; v++) { + for (v = 0; v < 8; v++) { tmp2 = 0.0; - for (u=0; u<8; u++) { + for (u = 0; u < 8; u++) { tmp2 += (double) block[v][u] * coslu[x][u]; } tmp += coslu[y][v] * tmp2; @@ -325,11 +352,11 @@ void ref_idct (DCTELEM block[8][8]) } } - for (v=0; v<8; v++) { - for (u=0; u<8; u++) { + for (v = 0; v < 8; v++) { + for (u = 0; u < 8; u++) { tmp = res[v][u]; if (tmp < 0.0) { - x = - ((int) (0.5 - tmp)); + x = -((int) (0.5 - tmp)); } else { x = (int) (tmp + 0.5); } diff --git a/gst-libs/gst/idct/intidct.c b/gst-libs/gst/idct/intidct.c index e08e6adb..42f0ac84 100644 
--- a/gst-libs/gst/idct/intidct.c +++ b/gst-libs/gst/idct/intidct.c @@ -51,10 +51,8 @@ */ #if DCTSIZE != 8 - Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ +Sorry, this code only copes with 8 x8 DCTs. /* deliberate syntax err */ #endif - - /* * A 2-D IDCT can be done by 1-D IDCT on each row followed by 1-D IDCT * on each column. Direct algorithms are also available, but they are @@ -90,7 +88,6 @@ * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis * shows that the values given below are the most effective. */ - #ifdef EIGHT_BIT_SAMPLES #define CONST_BITS 13 #define PASS1_BITS 2 @@ -98,22 +95,16 @@ #define CONST_BITS 13 #define PASS1_BITS 1 /* lose a little precision to avoid overflow */ #endif - #define ONE ((INT32) 1) - #define CONST_SCALE (ONE << CONST_BITS) - /* Convert a positive real constant to an integer scaled by CONST_SCALE. */ - #define FIX(x) ((INT32) ((x) * CONST_SCALE + 0.5)) - /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus * causing a lot of useless floating-point operations at run time. * To get around this we use the following pre-calculated constants. * If you change CONST_BITS you may want to add appropriate values. * (With a reasonable C compiler, you can just rely on the FIX() macro...) */ - #if CONST_BITS == 13 #define FIX_0_298631336 ((INT32) 2446) /* FIX(0.298631336) */ #define FIX_0_390180644 ((INT32) 3196) /* FIX(0.390180644) */ @@ -141,15 +132,11 @@ #define FIX_2_562915447 FIX(2.562915447) #define FIX_3_072711026 FIX(3.072711026) #endif - - /* Descale and correctly round an INT32 value that's scaled by N bits. * We assume RIGHT_SHIFT rounds towards minus infinity, so adding * the fudge factor is correct for either sign of X. */ - #define DESCALE(x,n) RIGHT_SHIFT((x) + (ONE << ((n)-1)), n) - /* Multiply an INT32 variable by an INT32 constant to yield an INT32 result. 
* For 8-bit samples with the recommended scaling, all the variable * and constant values involved are no more than 16 bits wide, so a @@ -160,7 +147,6 @@ * combination of casts. * NB: for 12-bit samples, a full 32-bit multiplication will be needed. */ - #ifdef EIGHT_BIT_SAMPLES #ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */ #define MULTIPLY(var,const) (((INT16) (var)) * ((INT16) (const))) @@ -169,17 +155,13 @@ #define MULTIPLY(var,const) (((INT16) (var)) * ((INT32) (const))) #endif #endif - #ifndef MULTIPLY /* default definition */ #define MULTIPLY(var,const) ((var) * (const)) #endif - - /* * Perform the inverse DCT on one block of coefficients. */ - -void + void gst_idct_int_idct (DCTBLOCK data) { INT32 tmp0, tmp1, tmp2, tmp3; @@ -187,14 +169,13 @@ gst_idct_int_idct (DCTBLOCK data) INT32 z1, z2, z3, z4, z5; register DCTELEM *dataptr; int rowctr; - SHIFT_TEMPS - - /* Pass 1: process rows. */ - /* Note results are scaled up by sqrt(8) compared to a true IDCT; */ - /* furthermore, we scale the results by 2**PASS1_BITS. */ - dataptr = data; - for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) { + SHIFT_TEMPS + /* Pass 1: process rows. */ + /* Note results are scaled up by sqrt(8) compared to a true IDCT; */ + /* furthermore, we scale the results by 2**PASS1_BITS. */ + dataptr = data; + for (rowctr = DCTSIZE - 1; rowctr >= 0; rowctr--) { /* Due to quantization, we will usually find that many of the input * coefficients are zero, especially the AC terms. 
We can exploit this * by short-circuiting the IDCT calculation for any row in which all @@ -205,10 +186,10 @@ gst_idct_int_idct (DCTBLOCK data) */ if ((dataptr[1] | dataptr[2] | dataptr[3] | dataptr[4] | - dataptr[5] | dataptr[6] | dataptr[7]) == 0) { + dataptr[5] | dataptr[6] | dataptr[7]) == 0) { /* AC terms all zero */ DCTELEM dcval = (DCTELEM) (dataptr[0] << PASS1_BITS); - + dataptr[0] = dcval; dataptr[1] = dcval; dataptr[2] = dcval; @@ -217,7 +198,7 @@ gst_idct_int_idct (DCTBLOCK data) dataptr[5] = dcval; dataptr[6] = dcval; dataptr[7] = dcval; - + dataptr += DCTSIZE; /* advance pointer to next row */ continue; } @@ -228,9 +209,9 @@ gst_idct_int_idct (DCTBLOCK data) z2 = (INT32) dataptr[2]; z3 = (INT32) dataptr[6]; - z1 = MULTIPLY(z2 + z3, FIX_0_541196100); - tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); - tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); + z1 = MULTIPLY (z2 + z3, FIX_0_541196100); + tmp2 = z1 + MULTIPLY (z3, -FIX_1_847759065); + tmp3 = z1 + MULTIPLY (z2, FIX_0_765366865); tmp0 = ((INT32) dataptr[0] + (INT32) dataptr[4]) << CONST_BITS; tmp1 = ((INT32) dataptr[0] - (INT32) dataptr[4]) << CONST_BITS; @@ -239,7 +220,7 @@ gst_idct_int_idct (DCTBLOCK data) tmp13 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; - + /* Odd part per figure 8; the matrix is unitary and hence its * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. 
*/ @@ -253,20 +234,20 @@ gst_idct_int_idct (DCTBLOCK data) z2 = tmp1 + tmp2; z3 = tmp0 + tmp2; z4 = tmp1 + tmp3; - z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ - - tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ - tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ - tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ - tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ - z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ - z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ - z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ - z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ - + z5 = MULTIPLY (z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ + + tmp0 = MULTIPLY (tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ + tmp1 = MULTIPLY (tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ + tmp2 = MULTIPLY (tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ + tmp3 = MULTIPLY (tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ + z1 = MULTIPLY (z1, -FIX_0_899976223); /* sqrt(2) * (c7-c3) */ + z2 = MULTIPLY (z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ + z3 = MULTIPLY (z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ + z4 = MULTIPLY (z4, -FIX_0_390180644); /* sqrt(2) * (c5-c3) */ + z3 += z5; z4 += z5; - + tmp0 += z1 + z3; tmp1 += z2 + z4; tmp2 += z2 + z3; @@ -274,14 +255,14 @@ gst_idct_int_idct (DCTBLOCK data) /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ - dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS); - dataptr[7] = (DCTELEM) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS); - dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS); - dataptr[6] = (DCTELEM) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS); - dataptr[2] = (DCTELEM) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS); - dataptr[5] = (DCTELEM) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS); - dataptr[3] = (DCTELEM) 
DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS); - dataptr[4] = (DCTELEM) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS); + dataptr[0] = (DCTELEM) DESCALE (tmp10 + tmp3, CONST_BITS - PASS1_BITS); + dataptr[7] = (DCTELEM) DESCALE (tmp10 - tmp3, CONST_BITS - PASS1_BITS); + dataptr[1] = (DCTELEM) DESCALE (tmp11 + tmp2, CONST_BITS - PASS1_BITS); + dataptr[6] = (DCTELEM) DESCALE (tmp11 - tmp2, CONST_BITS - PASS1_BITS); + dataptr[2] = (DCTELEM) DESCALE (tmp12 + tmp1, CONST_BITS - PASS1_BITS); + dataptr[5] = (DCTELEM) DESCALE (tmp12 - tmp1, CONST_BITS - PASS1_BITS); + dataptr[3] = (DCTELEM) DESCALE (tmp13 + tmp0, CONST_BITS - PASS1_BITS); + dataptr[4] = (DCTELEM) DESCALE (tmp13 - tmp0, CONST_BITS - PASS1_BITS); dataptr += DCTSIZE; /* advance pointer to next row */ } @@ -291,7 +272,7 @@ gst_idct_int_idct (DCTBLOCK data) /* and also undo the PASS1_BITS scaling. */ dataptr = data; - for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) { + for (rowctr = DCTSIZE - 1; rowctr >= 0; rowctr--) { /* Columns of zeroes can be exploited in the same way as we did with rows. * However, the row calculation has created many nonzero AC terms, so the * simplification applies less often (typically 5% to 10% of the time). 
@@ -301,21 +282,21 @@ gst_idct_int_idct (DCTBLOCK data) */ #ifndef NO_ZERO_COLUMN_TEST - if ((dataptr[DCTSIZE*1] | dataptr[DCTSIZE*2] | dataptr[DCTSIZE*3] | - dataptr[DCTSIZE*4] | dataptr[DCTSIZE*5] | dataptr[DCTSIZE*6] | - dataptr[DCTSIZE*7]) == 0) { + if ((dataptr[DCTSIZE * 1] | dataptr[DCTSIZE * 2] | dataptr[DCTSIZE * 3] | + dataptr[DCTSIZE * 4] | dataptr[DCTSIZE * 5] | dataptr[DCTSIZE * 6] | + dataptr[DCTSIZE * 7]) == 0) { /* AC terms all zero */ - DCTELEM dcval = (DCTELEM) DESCALE((INT32) dataptr[0], PASS1_BITS+3); - - dataptr[DCTSIZE*0] = dcval; - dataptr[DCTSIZE*1] = dcval; - dataptr[DCTSIZE*2] = dcval; - dataptr[DCTSIZE*3] = dcval; - dataptr[DCTSIZE*4] = dcval; - dataptr[DCTSIZE*5] = dcval; - dataptr[DCTSIZE*6] = dcval; - dataptr[DCTSIZE*7] = dcval; - + DCTELEM dcval = (DCTELEM) DESCALE ((INT32) dataptr[0], PASS1_BITS + 3); + + dataptr[DCTSIZE * 0] = dcval; + dataptr[DCTSIZE * 1] = dcval; + dataptr[DCTSIZE * 2] = dcval; + dataptr[DCTSIZE * 3] = dcval; + dataptr[DCTSIZE * 4] = dcval; + dataptr[DCTSIZE * 5] = dcval; + dataptr[DCTSIZE * 6] = dcval; + dataptr[DCTSIZE * 7] = dcval; + dataptr++; /* advance pointer to next column */ continue; } @@ -324,48 +305,52 @@ gst_idct_int_idct (DCTBLOCK data) /* Even part: reverse the even part of the forward DCT. */ /* The rotator is sqrt(2)*c(-6). 
*/ - z2 = (INT32) dataptr[DCTSIZE*2]; - z3 = (INT32) dataptr[DCTSIZE*6]; + z2 = (INT32) dataptr[DCTSIZE * 2]; + z3 = (INT32) dataptr[DCTSIZE * 6]; - z1 = MULTIPLY(z2 + z3, FIX_0_541196100); - tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); - tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); + z1 = MULTIPLY (z2 + z3, FIX_0_541196100); + tmp2 = z1 + MULTIPLY (z3, -FIX_1_847759065); + tmp3 = z1 + MULTIPLY (z2, FIX_0_765366865); - tmp0 = ((INT32) dataptr[DCTSIZE*0] + (INT32) dataptr[DCTSIZE*4]) << CONST_BITS; - tmp1 = ((INT32) dataptr[DCTSIZE*0] - (INT32) dataptr[DCTSIZE*4]) << CONST_BITS; + tmp0 = + ((INT32) dataptr[DCTSIZE * 0] + + (INT32) dataptr[DCTSIZE * 4]) << CONST_BITS; + tmp1 = + ((INT32) dataptr[DCTSIZE * 0] - + (INT32) dataptr[DCTSIZE * 4]) << CONST_BITS; tmp10 = tmp0 + tmp3; tmp13 = tmp0 - tmp3; tmp11 = tmp1 + tmp2; tmp12 = tmp1 - tmp2; - + /* Odd part per figure 8; the matrix is unitary and hence its * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. */ - tmp0 = (INT32) dataptr[DCTSIZE*7]; - tmp1 = (INT32) dataptr[DCTSIZE*5]; - tmp2 = (INT32) dataptr[DCTSIZE*3]; - tmp3 = (INT32) dataptr[DCTSIZE*1]; + tmp0 = (INT32) dataptr[DCTSIZE * 7]; + tmp1 = (INT32) dataptr[DCTSIZE * 5]; + tmp2 = (INT32) dataptr[DCTSIZE * 3]; + tmp3 = (INT32) dataptr[DCTSIZE * 1]; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; z3 = tmp0 + tmp2; z4 = tmp1 + tmp3; - z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ - - tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ - tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ - tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ - tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ - z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ - z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ - z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ - z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ - + z5 = MULTIPLY (z3 + z4, 
FIX_1_175875602); /* sqrt(2) * c3 */ + + tmp0 = MULTIPLY (tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ + tmp1 = MULTIPLY (tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ + tmp2 = MULTIPLY (tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ + tmp3 = MULTIPLY (tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ + z1 = MULTIPLY (z1, -FIX_0_899976223); /* sqrt(2) * (c7-c3) */ + z2 = MULTIPLY (z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ + z3 = MULTIPLY (z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ + z4 = MULTIPLY (z4, -FIX_0_390180644); /* sqrt(2) * (c5-c3) */ + z3 += z5; z4 += z5; - + tmp0 += z1 + z3; tmp1 += z2 + z4; tmp2 += z2 + z3; @@ -373,23 +358,23 @@ gst_idct_int_idct (DCTBLOCK data) /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ - dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp3, - CONST_BITS+PASS1_BITS+3); - dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp10 - tmp3, - CONST_BITS+PASS1_BITS+3); - dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp11 + tmp2, - CONST_BITS+PASS1_BITS+3); - dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(tmp11 - tmp2, - CONST_BITS+PASS1_BITS+3); - dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp12 + tmp1, - CONST_BITS+PASS1_BITS+3); - dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12 - tmp1, - CONST_BITS+PASS1_BITS+3); - dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp13 + tmp0, - CONST_BITS+PASS1_BITS+3); - dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp13 - tmp0, - CONST_BITS+PASS1_BITS+3); - + dataptr[DCTSIZE * 0] = (DCTELEM) DESCALE (tmp10 + tmp3, + CONST_BITS + PASS1_BITS + 3); + dataptr[DCTSIZE * 7] = (DCTELEM) DESCALE (tmp10 - tmp3, + CONST_BITS + PASS1_BITS + 3); + dataptr[DCTSIZE * 1] = (DCTELEM) DESCALE (tmp11 + tmp2, + CONST_BITS + PASS1_BITS + 3); + dataptr[DCTSIZE * 6] = (DCTELEM) DESCALE (tmp11 - tmp2, + CONST_BITS + PASS1_BITS + 3); + dataptr[DCTSIZE * 2] = (DCTELEM) DESCALE (tmp12 + tmp1, + CONST_BITS + PASS1_BITS + 3); + dataptr[DCTSIZE * 5] = (DCTELEM) DESCALE (tmp12 - tmp1, + CONST_BITS + PASS1_BITS + 3); + 
dataptr[DCTSIZE * 3] = (DCTELEM) DESCALE (tmp13 + tmp0, + CONST_BITS + PASS1_BITS + 3); + dataptr[DCTSIZE * 4] = (DCTELEM) DESCALE (tmp13 - tmp0, + CONST_BITS + PASS1_BITS + 3); + dataptr++; /* advance pointer to next column */ } } diff --git a/gst-libs/gst/idct/mmx32idct.c b/gst-libs/gst/idct/mmx32idct.c index 3b640976..cd191f0c 100644 --- a/gst-libs/gst/idct/mmx32idct.c +++ b/gst-libs/gst/idct/mmx32idct.c @@ -19,9 +19,8 @@ * along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*
- */
-
-
+ */
+
/* MMX32 iDCT algorithm (IEEE-1180 compliant) :: idct_mmx32()
*/ /*
*/ /* MPEG2AVI
*/ @@ -102,8 +101,7 @@ /*
*/ /* liaor@umcc.ais.org http://members.tripod.com/~liaor
*/ /*
*/ -
-
+
/*;=============================================================================
*/ /*;
*/ /*; AP-922 http://developer.intel.com/vtune/cbts/strmsimd
*/ @@ -113,68 +111,67 @@ /*;=============================================================================
*/ /*
mword typedef qword
-qword ptr equ mword ptr */
- +qword ptr equ mword ptr */
#ifdef HAVE_CONFIG_H #include "config.h" #endif - #include <mmx.h>
-
+
#define BITS_INV_ACC 4 /*; 4 or 5 for IEEE
*/ - /* 5 yields higher accuracy, but lessens dynamic range on the input matrix
*/ + /* 5 yields higher accuracy, but lessens dynamic range on the input matrix
*/ #define SHIFT_INV_ROW (16 - BITS_INV_ACC)
-#define SHIFT_INV_COL (1 + BITS_INV_ACC +14 ) /* changed from Intel's val)
*/ +#define SHIFT_INV_COL (1 + BITS_INV_ACC +14 ) /* changed from Intel's val)
*/ /*#define SHIFT_INV_COL (1 + BITS_INV_ACC )
*/ -
+
#define RND_INV_ROW (1 << (SHIFT_INV_ROW-1))
#define RND_INV_COL (1 << (SHIFT_INV_COL-1))
-#define RND_INV_CORR (RND_INV_COL - 1) /*; correction -1.0 and round
*/ +#define RND_INV_CORR (RND_INV_COL - 1) /*; correction -1.0 and round
*/ /*#define RND_INV_ROW (1024 * (6 - BITS_INV_ACC)) //; 1 << (SHIFT_INV_ROW-1)
*/ /*#define RND_INV_COL (16 * (BITS_INV_ACC - 3)) //; 1 << (SHIFT_INV_COL-1)
*/ -
-
+
/*.data
*/ /*Align 16
*/ -const static long r_inv_row[2] = { RND_INV_ROW, RND_INV_ROW};
-const static long r_inv_col[2] = {RND_INV_COL, RND_INV_COL};
-const static long r_inv_corr[2] = {RND_INV_CORR, RND_INV_CORR };
-
+const static long r_inv_row[2] = { RND_INV_ROW, RND_INV_ROW }; +
const static long r_inv_col[2] = { RND_INV_COL, RND_INV_COL }; +
const static long r_inv_corr[2] = { RND_INV_CORR, RND_INV_CORR }; + +
/*const static short r_inv_col[4] =
*/ /* {RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL};
*/ /*const static short r_inv_corr[4] =
*/ /* {RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR};
*/ -
+
/* constants for the forward DCT
/*#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy
*/ /*#define SHIFT_FRW_COL BITS_FRW_ACC
*/ /*#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17)
*/ /*#define RND_FRW_ROW (262144 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_ROW-1)
*/ -
-const static __int64 one_corr = 0x0001000100010001;
-const static long r_frw_row[2] = {RND_FRW_ROW, RND_FRW_ROW };
-
+
const static __int64 one_corr = 0x0001000100010001; +
const static long r_frw_row[2] = { RND_FRW_ROW, RND_FRW_ROW }; + +
/*const static short tg_1_16[4] = {13036, 13036, 13036, 13036 }; //tg * (2<<16) + 0.5
*/ /*const static short tg_2_16[4] = {27146, 27146, 27146, 27146 }; //tg * (2<<16) + 0.5
*/ /*const static short tg_3_16[4] = {-21746, -21746, -21746, -21746 }; //tg * (2<<16) + 0.5
*/ /*const static short cos_4_16[4] = {-19195, -19195, -19195, -19195 }; //cos * (2<<16) + 0.5
*/ /*const static short ocos_4_16[4] = {23170, 23170, 23170, 23170 }; //cos * (2<<15) + 0.5
*/ -
+
/*concatenated table, for forward DCT transformation
*/ -const static short tg_all_16[] = {
- 13036, 13036, 13036, 13036, /* tg * (2<<16) + 0.5
*/ - 27146, 27146, 27146, 27146, /*tg * (2<<16) + 0.5
*/ - -21746, -21746, -21746, -21746, /* tg * (2<<16) + 0.5
*/ - -19195, -19195, -19195, -19195, /*cos * (2<<16) + 0.5
*/ - 23170, 23170, 23170, 23170 }; /*cos * (2<<15) + 0.5
*/ +const static short tg_all_16[] = {
13036, 13036, 13036, 13036, /* tg * (2<<16) + 0.5
*/ + 27146, 27146, 27146, 27146, /*tg * (2<<16) + 0.5
*/ + -21746, -21746, -21746, -21746, /* tg * (2<<16) + 0.5
*/ + -19195, -19195, -19195, -19195, /*cos * (2<<16) + 0.5
*/ + 23170, 23170, 23170, 23170 +}; /*cos * (2<<15) + 0.5
*/ + #define tg_1_16 (tg_all_16 + 0)
#define tg_2_16 (tg_all_16 + 8)
#define tg_3_16 (tg_all_16 + 16)
#define cos_4_16 (tg_all_16 + 24)
#define ocos_4_16 (tg_all_16 + 32)
-*/
+ */
/*
;=============================================================================
;
@@ -236,552 +233,456 @@ IF _MMX ; MMX code ;=============================================================================
/*; Table for rows 0,4 - constants are multiplied by cos_4_16
*/ -const short tab_i_04[] = {
- 16384, 16384, 16384, -16384, /* ; movq-> w06 w04 w02 w00
*/ - 21407, 8867, 8867, -21407, /* w07 w05 w03 w01
*/ - 16384, -16384, 16384, 16384, /*; w14 w12 w10 w08
*/ - -8867, 21407, -21407, -8867, /*; w15 w13 w11 w09
*/ - 22725, 12873, 19266, -22725, /*; w22 w20 w18 w16
*/ - 19266, 4520, -4520, -12873, /*; w23 w21 w19 w17
*/ - 12873, 4520, 4520, 19266, /*; w30 w28 w26 w24
*/ - -22725, 19266, -12873, -22725 };/*w31 w29 w27 w25
*/ +const short tab_i_04[] = {
16384, 16384, 16384, -16384, /* ; movq-> w06 w04 w02 w00
*/ + 21407, 8867, 8867, -21407, /* w07 w05 w03 w01
*/ + 16384, -16384, 16384, 16384, /*; w14 w12 w10 w08
*/ + -8867, 21407, -21407, -8867, /*; w15 w13 w11 w09
*/ + 22725, 12873, 19266, -22725, /*; w22 w20 w18 w16
*/ + 19266, 4520, -4520, -12873, /*; w23 w21 w19 w17
*/ + 12873, 4520, 4520, 19266, /*; w30 w28 w26 w24
*/ + -22725, 19266, -12873, -22725 +}; /*w31 w29 w27 w25
*/ + /*; Table for rows 1,7 - constants are multiplied by cos_1_16
*/ -const short tab_i_17[] = {
- 22725, 22725, 22725, -22725, /* ; movq-> w06 w04 w02 w00
*/ - 29692, 12299, 12299, -29692, /* ; w07 w05 w03 w01
*/ - 22725, -22725, 22725, 22725, /*; w14 w12 w10 w08
*/ - -12299, 29692, -29692, -12299, /*; w15 w13 w11 w09
*/ - 31521, 17855, 26722, -31521, /*; w22 w20 w18 w16
*/ - 26722, 6270, -6270, -17855, /*; w23 w21 w19 w17
*/ - 17855, 6270, 6270, 26722, /*; w30 w28 w26 w24
*/ - -31521, 26722, -17855, -31521}; /* w31 w29 w27 w25
*/ +const short tab_i_17[] = {
22725, 22725, 22725, -22725, /* ; movq-> w06 w04 w02 w00
*/ + 29692, 12299, 12299, -29692, /* ; w07 w05 w03 w01
*/ + 22725, -22725, 22725, 22725, /*; w14 w12 w10 w08
*/ + -12299, 29692, -29692, -12299, /*; w15 w13 w11 w09
*/ + 31521, 17855, 26722, -31521, /*; w22 w20 w18 w16
*/ + 26722, 6270, -6270, -17855, /*; w23 w21 w19 w17
*/ + 17855, 6270, 6270, 26722, /*; w30 w28 w26 w24
*/ + -31521, 26722, -17855, -31521 +}; /* w31 w29 w27 w25
*/ + /*; Table for rows 2,6 - constants are multiplied by cos_2_16
*/ -const short tab_i_26[] = {
- 21407, 21407, 21407, -21407, /* ; movq-> w06 w04 w02 w00
*/ - 27969, 11585, 11585, -27969, /* ; w07 w05 w03 w01
*/ - 21407, -21407, 21407, 21407, /* ; w14 w12 w10 w08
*/ - -11585, 27969, -27969, -11585, /* ;w15 w13 w11 w09
*/ - 29692, 16819, 25172, -29692, /* ;w22 w20 w18 w16
*/ - 25172, 5906, -5906, -16819, /* ;w23 w21 w19 w17
*/ - 16819, 5906, 5906, 25172, /* ;w30 w28 w26 w24
*/ - -29692, 25172, -16819, -29692}; /* ;w31 w29 w27 w25
*/ -
-
+const short tab_i_26[] = {
21407, 21407, 21407, -21407, /* ; movq-> w06 w04 w02 w00
*/ + 27969, 11585, 11585, -27969, /* ; w07 w05 w03 w01
*/ + 21407, -21407, 21407, 21407, /* ; w14 w12 w10 w08
*/ + -11585, 27969, -27969, -11585, /* ;w15 w13 w11 w09
*/ + 29692, 16819, 25172, -29692, /* ;w22 w20 w18 w16
*/ + 25172, 5906, -5906, -16819, /* ;w23 w21 w19 w17
*/ + 16819, 5906, 5906, 25172, /* ;w30 w28 w26 w24
*/ + -29692, 25172, -16819, -29692 +}; /* ;w31 w29 w27 w25
*/ + +
/*; Table for rows 3,5 - constants are multiplied by cos_3_16
*/ -const short tab_i_35[] = {
- 19266, 19266, 19266, -19266, /*; movq-> w06 w04 w02 w00
*/ - 25172, 10426, 10426, -25172, /*; w07 w05 w03 w01
*/ - 19266, -19266, 19266, 19266, /*; w14 w12 w10 w08
*/ - -10426, 25172, -25172, -10426, /*; w15 w13 w11 w09
*/ - 26722, 15137, 22654, -26722, /*; w22 w20 w18 w16
*/ - 22654, 5315, -5315, -15137, /*; w23 w21 w19 w17
*/ - 15137, 5315, 5315, 22654, /*; w30 w28 w26 w24
*/ - -26722, 22654, -15137, -26722}; /*; w31 w29 w27 w25
*/ -*/
-
+const short tab_i_35[] = {
19266, 19266, 19266, -19266, /*; movq-> w06 w04 w02 w00
*/ + 25172, 10426, 10426, -25172, /*; w07 w05 w03 w01
*/ + 19266, -19266, 19266, 19266, /*; w14 w12 w10 w08
*/ + -10426, 25172, -25172, -10426, /*; w15 w13 w11 w09
*/ + 26722, 15137, 22654, -26722, /*; w22 w20 w18 w16
*/ + 22654, 5315, -5315, -15137, /*; w23 w21 w19 w17
*/ + 15137, 5315, 5315, 22654, /*; w30 w28 w26 w24
*/ + -26722, 22654, -15137, -26722 +}; /*; w31 w29 w27 w25
*/ + +*/
/* CONCATENATED TABLE, rows 0,1,2,3,4,5,6,7 (in order )
*/ /*
*/ /* In our implementation, however, we only use row0 !
*/ /*
*/ -static const short tab_i_01234567[] = {
- /*row0, this row is required
*/ - 16384, 16384, 16384, -16384, /* ; movq-> w06 w04 w02 w00
*/ - 21407, 8867, 8867, -21407, /* w07 w05 w03 w01
*/ - 16384, -16384, 16384, 16384, /*; w14 w12 w10 w08
*/ - -8867, 21407, -21407, -8867, /*; w15 w13 w11 w09
*/ - 22725, 12873, 19266, -22725, /*; w22 w20 w18 w16
*/ - 19266, 4520, -4520, -12873, /*; w23 w21 w19 w17
*/ - 12873, 4520, 4520, 19266, /*; w30 w28 w26 w24
*/ - -22725, 19266, -12873, -22725, /*w31 w29 w27 w25
*/ -
- /* the rest of these rows (1-7), aren't used !
*/ -
- /*row1
*/ - 22725, 22725, 22725, -22725, /* ; movq-> w06 w04 w02 w00
*/ - 29692, 12299, 12299, -29692, /* ; w07 w05 w03 w01
*/ - 22725, -22725, 22725, 22725, /*; w14 w12 w10 w08
*/ - -12299, 29692, -29692, -12299, /*; w15 w13 w11 w09
*/ - 31521, 17855, 26722, -31521, /*; w22 w20 w18 w16
*/ - 26722, 6270, -6270, -17855, /*; w23 w21 w19 w17
*/ - 17855, 6270, 6270, 26722, /*; w30 w28 w26 w24
*/ - -31521, 26722, -17855, -31521, /* w31 w29 w27 w25
*/ -
- /*row2
*/ - 21407, 21407, 21407, -21407, /* ; movq-> w06 w04 w02 w00
*/ - 27969, 11585, 11585, -27969, /* ; w07 w05 w03 w01
*/ - 21407, -21407, 21407, 21407, /* ; w14 w12 w10 w08
*/ - -11585, 27969, -27969, -11585, /* ;w15 w13 w11 w09
*/ - 29692, 16819, 25172, -29692, /* ;w22 w20 w18 w16
*/ - 25172, 5906, -5906, -16819, /* ;w23 w21 w19 w17
*/ - 16819, 5906, 5906, 25172, /* ;w30 w28 w26 w24
*/ - -29692, 25172, -16819, -29692, /* ;w31 w29 w27 w25
*/ -
- /*row3
*/ - 19266, 19266, 19266, -19266, /*; movq-> w06 w04 w02 w00
*/ - 25172, 10426, 10426, -25172, /*; w07 w05 w03 w01
*/ - 19266, -19266, 19266, 19266, /*; w14 w12 w10 w08
*/ - -10426, 25172, -25172, -10426, /*; w15 w13 w11 w09
*/ - 26722, 15137, 22654, -26722, /*; w22 w20 w18 w16
*/ - 22654, 5315, -5315, -15137, /*; w23 w21 w19 w17
*/ - 15137, 5315, 5315, 22654, /*; w30 w28 w26 w24
*/ - -26722, 22654, -15137, -26722, /*; w31 w29 w27 w25
*/ -
- /*row4
*/ - 16384, 16384, 16384, -16384, /* ; movq-> w06 w04 w02 w00
*/ - 21407, 8867, 8867, -21407, /* w07 w05 w03 w01
*/ - 16384, -16384, 16384, 16384, /*; w14 w12 w10 w08
*/ - -8867, 21407, -21407, -8867, /*; w15 w13 w11 w09
*/ - 22725, 12873, 19266, -22725, /*; w22 w20 w18 w16
*/ - 19266, 4520, -4520, -12873, /*; w23 w21 w19 w17
*/ - 12873, 4520, 4520, 19266, /*; w30 w28 w26 w24
*/ - -22725, 19266, -12873, -22725, /*w31 w29 w27 w25
*/ -
- /*row5
*/ - 19266, 19266, 19266, -19266, /*; movq-> w06 w04 w02 w00
*/ - 25172, 10426, 10426, -25172, /*; w07 w05 w03 w01
*/ - 19266, -19266, 19266, 19266, /*; w14 w12 w10 w08
*/ - -10426, 25172, -25172, -10426, /*; w15 w13 w11 w09
*/ - 26722, 15137, 22654, -26722, /*; w22 w20 w18 w16
*/ - 22654, 5315, -5315, -15137, /*; w23 w21 w19 w17
*/ - 15137, 5315, 5315, 22654, /*; w30 w28 w26 w24
*/ - -26722, 22654, -15137, -26722, /*; w31 w29 w27 w25
*/ -
- /*row6
*/ - 21407, 21407, 21407, -21407, /* ; movq-> w06 w04 w02 w00
*/ - 27969, 11585, 11585, -27969, /* ; w07 w05 w03 w01
*/ - 21407, -21407, 21407, 21407, /* ; w14 w12 w10 w08
*/ - -11585, 27969, -27969, -11585, /* ;w15 w13 w11 w09
*/ - 29692, 16819, 25172, -29692, /* ;w22 w20 w18 w16
*/ - 25172, 5906, -5906, -16819, /* ;w23 w21 w19 w17
*/ - 16819, 5906, 5906, 25172, /* ;w30 w28 w26 w24
*/ - -29692, 25172, -16819, -29692, /* ;w31 w29 w27 w25
*/ -
- /*row7
*/ - 22725, 22725, 22725, -22725, /* ; movq-> w06 w04 w02 w00
*/ - 29692, 12299, 12299, -29692, /* ; w07 w05 w03 w01
*/ - 22725, -22725, 22725, 22725, /*; w14 w12 w10 w08
*/ - -12299, 29692, -29692, -12299, /*; w15 w13 w11 w09
*/ - 31521, 17855, 26722, -31521, /*; w22 w20 w18 w16
*/ - 26722, 6270, -6270, -17855, /*; w23 w21 w19 w17
*/ - 17855, 6270, 6270, 26722, /*; w30 w28 w26 w24
*/ - -31521, 26722, -17855, -31521}; /* w31 w29 w27 w25
*/ -
-
-#define INP eax /* pointer to (short *blk)
*/ -#define OUT ecx /* pointer to output (temporary store space qwTemp[])
*/ -#define TABLE ebx /* pointer to tab_i_01234567[]
*/ +static const short tab_i_01234567[] = {
+ /*row0, this row is required
*/ + 16384, 16384, 16384, -16384, /* ; movq-> w06 w04 w02 w00
*/ + 21407, 8867, 8867, -21407, /* w07 w05 w03 w01
*/ + 16384, -16384, 16384, 16384, /*; w14 w12 w10 w08
*/ + -8867, 21407, -21407, -8867, /*; w15 w13 w11 w09
*/ + 22725, 12873, 19266, -22725, /*; w22 w20 w18 w16
*/ + 19266, 4520, -4520, -12873, /*; w23 w21 w19 w17
*/ + 12873, 4520, 4520, 19266, /*; w30 w28 w26 w24
*/ + -22725, 19266, -12873, -22725, /*w31 w29 w27 w25
*/ +
+ /* the rest of these rows (1-7), aren't used !
*/ +
+ /*row1
*/ + 22725, 22725, 22725, -22725, /* ; movq-> w06 w04 w02 w00
*/ + 29692, 12299, 12299, -29692, /* ; w07 w05 w03 w01
*/ + 22725, -22725, 22725, 22725, /*; w14 w12 w10 w08
*/ + -12299, 29692, -29692, -12299, /*; w15 w13 w11 w09
*/ + 31521, 17855, 26722, -31521, /*; w22 w20 w18 w16
*/ + 26722, 6270, -6270, -17855, /*; w23 w21 w19 w17
*/ + 17855, 6270, 6270, 26722, /*; w30 w28 w26 w24
*/ + -31521, 26722, -17855, -31521, /* w31 w29 w27 w25
*/ +
+ /*row2
*/ + 21407, 21407, 21407, -21407, /* ; movq-> w06 w04 w02 w00
*/ + 27969, 11585, 11585, -27969, /* ; w07 w05 w03 w01
*/ + 21407, -21407, 21407, 21407, /* ; w14 w12 w10 w08
*/ + -11585, 27969, -27969, -11585, /* ;w15 w13 w11 w09
*/ + 29692, 16819, 25172, -29692, /* ;w22 w20 w18 w16
*/ + 25172, 5906, -5906, -16819, /* ;w23 w21 w19 w17
*/ + 16819, 5906, 5906, 25172, /* ;w30 w28 w26 w24
*/ + -29692, 25172, -16819, -29692, /* ;w31 w29 w27 w25
*/ +
+ /*row3
*/ + 19266, 19266, 19266, -19266, /*; movq-> w06 w04 w02 w00
*/ + 25172, 10426, 10426, -25172, /*; w07 w05 w03 w01
*/ + 19266, -19266, 19266, 19266, /*; w14 w12 w10 w08
*/ + -10426, 25172, -25172, -10426, /*; w15 w13 w11 w09
*/ + 26722, 15137, 22654, -26722, /*; w22 w20 w18 w16
*/ + 22654, 5315, -5315, -15137, /*; w23 w21 w19 w17
*/ + 15137, 5315, 5315, 22654, /*; w30 w28 w26 w24
*/ + -26722, 22654, -15137, -26722, /*; w31 w29 w27 w25
*/ +
+ /*row4
*/ + 16384, 16384, 16384, -16384, /* ; movq-> w06 w04 w02 w00
*/ + 21407, 8867, 8867, -21407, /* w07 w05 w03 w01
*/ + 16384, -16384, 16384, 16384, /*; w14 w12 w10 w08
*/ + -8867, 21407, -21407, -8867, /*; w15 w13 w11 w09
*/ + 22725, 12873, 19266, -22725, /*; w22 w20 w18 w16
*/ + 19266, 4520, -4520, -12873, /*; w23 w21 w19 w17
*/ + 12873, 4520, 4520, 19266, /*; w30 w28 w26 w24
*/ + -22725, 19266, -12873, -22725, /*w31 w29 w27 w25
*/ +
+ /*row5
*/ + 19266, 19266, 19266, -19266, /*; movq-> w06 w04 w02 w00
*/ + 25172, 10426, 10426, -25172, /*; w07 w05 w03 w01
*/ + 19266, -19266, 19266, 19266, /*; w14 w12 w10 w08
*/ + -10426, 25172, -25172, -10426, /*; w15 w13 w11 w09
*/ + 26722, 15137, 22654, -26722, /*; w22 w20 w18 w16
*/ + 22654, 5315, -5315, -15137, /*; w23 w21 w19 w17
*/ + 15137, 5315, 5315, 22654, /*; w30 w28 w26 w24
*/ + -26722, 22654, -15137, -26722, /*; w31 w29 w27 w25
*/ +
+ /*row6
*/ + 21407, 21407, 21407, -21407, /* ; movq-> w06 w04 w02 w00
*/ + 27969, 11585, 11585, -27969, /* ; w07 w05 w03 w01
*/ + 21407, -21407, 21407, 21407, /* ; w14 w12 w10 w08
*/ + -11585, 27969, -27969, -11585, /* ;w15 w13 w11 w09
*/ + 29692, 16819, 25172, -29692, /* ;w22 w20 w18 w16
*/ + 25172, 5906, -5906, -16819, /* ;w23 w21 w19 w17
*/ + 16819, 5906, 5906, 25172, /* ;w30 w28 w26 w24
*/ + -29692, 25172, -16819, -29692, /* ;w31 w29 w27 w25
*/ +
+ /*row7
*/ + 22725, 22725, 22725, -22725, /* ; movq-> w06 w04 w02 w00
*/ + 29692, 12299, 12299, -29692, /* ; w07 w05 w03 w01
*/ + 22725, -22725, 22725, 22725, /*; w14 w12 w10 w08
*/ + -12299, 29692, -29692, -12299, /*; w15 w13 w11 w09
*/ + 31521, 17855, 26722, -31521, /*; w22 w20 w18 w16
*/ + 26722, 6270, -6270, -17855, /*; w23 w21 w19 w17
*/ + 17855, 6270, 6270, 26722, /*; w30 w28 w26 w24
*/ + -31521, 26722, -17855, -31521 +}; /* w31 w29 w27 w25
*/ + +
+#define INP eax /* pointer to (short *blk)
*/ +#define OUT ecx /* pointer to output (temporary store space qwTemp[])
*/ +#define TABLE ebx /* pointer to tab_i_01234567[]
*/ #define round_inv_row edx
#define round_inv_col edx
-
-#define ROW_STRIDE 8 /* for 8x8 matrix transposer
*/ -
+
+#define ROW_STRIDE 8 /* for 8x8 matrix transposer
*/ +
/* private variables and functions
*/ -
+
/*temporary storage space, 8x8 of shorts
*/ -
-__inline static void idct_mmx32_rows( short *blk ); /* transform rows
*/ -__inline static void idct_mmx32_cols( short *blk ); /* transform "columns"
*/ +
__inline static void idct_mmx32_rows (short *blk); /* transform rows
*/ +__inline static void idct_mmx32_cols (short *blk); /* transform "columns"
*/ + /* the "column" transform actually transforms rows, it is
*/ /* identical to the row-transform except for the ROUNDING
*/ /* and SHIFTING coefficients.
*/ -
-
-static void
-idct_mmx32_rows( short *blk ) /* transform all 8 rows of 8x8 iDCT block
*/ -{
- int x;
- short qwTemp[64];
- short *out = &qwTemp[0];
- short *inptr = blk;
- /* this subroutine performs two operations
*/ - /* 1) iDCT row transform
*/ - /* for( i = 0; i < 8; ++ i)
*/ - /* DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] );
*/ - /*
*/ - /* 2) transpose the matrix (which was stored in qwTemp[])
*/ - /* qwTemp[] -> [8x8 matrix transpose] -> blk[]
*/ -
- for (x=0; x<8; x++) { /* transform one row per iteration
*/ - movq_m2r(*(inptr), mm0); /* 0 ; x3 x2 x1 x0
*/ -
- movq_m2r(*(inptr+4), mm1); /* 1 ; x7 x6 x5 x4
*/ - movq_r2r(mm0, mm2); /* 2 ; x3 x2 x1 x0
*/ -
- movq_m2r(*(tab_i_01234567), mm3); /* 3 ; w06 w04 w02 w00
*/ - punpcklwd_r2r(mm1, mm0); /* x5 x1 x4 x0
*/ -
- /* ----------
*/ - movq_r2r(mm0, mm5); /* 5 ; x5 x1 x4 x0
*/ - punpckldq_r2r(mm0, mm0); /* x4 x0 x4 x0
*/ -
- movq_m2r(*(tab_i_01234567+4), mm4); /* 4 ; w07 w05 w03 w01
*/ - punpckhwd_r2r(mm1, mm2); /* 1 ; x7 x3 x6 x2
*/ -
- pmaddwd_r2r(mm0, mm3); /* x4*w06+x0*w04 x4*w02+x0*w00
*/ - movq_r2r(mm2, mm6); /* 6 ; x7 x3 x6 x2
*/ -
- movq_m2r(*(tab_i_01234567+16), mm1);/* 1 ; w22 w20 w18 w16
*/ - punpckldq_r2r(mm2, mm2); /* x6 x2 x6 x2
*/ -
- pmaddwd_r2r(mm2, mm4); /* x6*w07+x2*w05 x6*w03+x2*w01
*/ - punpckhdq_r2r(mm5, mm5); /* x5 x1 x5 x1
*/ -
- pmaddwd_m2r(*(tab_i_01234567+8), mm0);/* x4*w14+x0*w12 x4*w10+x0*w08
*/ - punpckhdq_r2r(mm6, mm6); /* x7 x3 x7 x3
*/ -
- movq_m2r(*(tab_i_01234567+20), mm7);/* 7 ; w23 w21 w19 w17
*/ - pmaddwd_r2r(mm5, mm1); /* x5*w22+x1*w20 x5*w18+x1*w16
*/ -
- paddd_m2r(*(r_inv_row), mm3);/* +rounder
*/ - pmaddwd_r2r(mm6, mm7); /* x7*w23+x3*w21 x7*w19+x3*w17
*/ -
- pmaddwd_m2r(*(tab_i_01234567+12), mm2);/* x6*w15+x2*w13 x6*w11+x2*w09
*/ - paddd_r2r(mm4, mm3); /* 4 ; a1=sum(even1) a0=sum(even0)
*/ -
- pmaddwd_m2r(*(tab_i_01234567+24), mm5);/* x5*w30+x1*w28 x5*w26+x1*w24
*/ - movq_r2r(mm3, mm4); /* 4 ; a1 a0
*/ -
- pmaddwd_m2r(*(tab_i_01234567+28), mm6);/* x7*w31+x3*w29 x7*w27+x3*w25
*/ - paddd_r2r(mm7, mm1); /* 7 ; b1=sum(odd1) b0=sum(odd0)
*/ -
- paddd_m2r(*(r_inv_row), mm0);/* +rounder
*/ - psubd_r2r(mm1, mm3); /* a1-b1 a0-b0
*/ -
- psrad_i2r(SHIFT_INV_ROW, mm3); /* y6=a1-b1 y7=a0-b0
*/ - paddd_r2r(mm4, mm1); /* 4 ; a1+b1 a0+b0
*/ -
- paddd_r2r(mm2, mm0); /* 2 ; a3=sum(even3) a2=sum(even2)
*/ - psrad_i2r(SHIFT_INV_ROW, mm1); /* y1=a1+b1 y0=a0+b0
*/ -
- paddd_r2r(mm6, mm5); /* 6 ; b3=sum(odd3) b2=sum(odd2)
*/ - movq_r2r(mm0, mm4); /* 4 ; a3 a2
*/ -
- paddd_r2r(mm5, mm0); /* a3+b3 a2+b2
*/ - psubd_r2r(mm5, mm4); /* 5 ; a3-b3 a2-b2
*/ -
- psrad_i2r(SHIFT_INV_ROW, mm4); /* y4=a3-b3 y5=a2-b2
*/ - psrad_i2r(SHIFT_INV_ROW, mm0); /* y3=a3+b3 y2=a2+b2
*/ -
- packssdw_r2r(mm3, mm4); /* 3 ; y6 y7 y4 y5
*/ -
- packssdw_r2r(mm0, mm1); /* 0 ; y3 y2 y1 y0
*/ - movq_r2r(mm4, mm7); /* 7 ; y6 y7 y4 y5
*/ -
- psrld_i2r(16, mm4); /* 0 y6 0 y4
*/ -
- movq_r2m(mm1, *(out)); /* 1 ; save y3 y2 y1 y0
*/ - pslld_i2r(16, mm7); /* y7 0 y5 0
*/ -
- por_r2r(mm4, mm7); /* 4 ; y7 y6 y5 y4
*/ -
- /* begin processing row 1
*/ - movq_r2m(mm7, *(out+4)); /* 7 ; save y7 y6 y5 y4
*/ -
- inptr += 8;
- out += 8;
- }
-
-
- /* done with the iDCT row-transformation
*/ -
- /* now we have to transpose the output 8x8 matrix
*/ - /* 8x8 (OUT) -> 8x8't' (IN)
*/ - /* the transposition is implemented as 4 sub-operations.
*/ - /* 1) transpose upper-left quad
*/ - /* 2) transpose lower-right quad
*/ - /* 3) transpose lower-left quad
*/ - /* 4) transpose upper-right quad
*/ -
-
- /* mm0 = 1st row [ A B C D ] row1
*/ - /* mm1 = 2nd row [ E F G H ] 2
*/ - /* mm2 = 3rd row [ I J K L ] 3
*/ - /* mm3 = 4th row [ M N O P ] 4
*/ -
- /* 1) transpose upper-left quad
*/ - out = &qwTemp[0];
-
- movq_m2r(*(out + ROW_STRIDE * 0), mm0);
-
- movq_m2r(*(out + ROW_STRIDE * 1), mm1);
- movq_r2r(mm0, mm4); /* mm4 = copy of row1[A B C D]
*/ -
- movq_m2r(*(out + ROW_STRIDE * 2), mm2);
- punpcklwd_r2r(mm1, mm0); /* mm0 = [ 0 4 1 5]
*/ -
- movq_m2r(*(out + ROW_STRIDE * 3), mm3);
- punpckhwd_r2r(mm1, mm4); /* mm4 = [ 2 6 3 7]
*/ -
- movq_r2r(mm2, mm6);
- punpcklwd_r2r(mm3, mm2); /* mm2 = [ 8 12 9 13]
*/ -
- punpckhwd_r2r(mm3, mm6); /* mm6 = 10 14 11 15]
*/ - movq_r2r(mm0, mm1); /* mm1 = [ 0 4 1 5]
*/ -
- inptr = blk;
-
- punpckldq_r2r(mm2, mm0); /* final result mm0 = row1 [0 4 8 12]
*/ -
- movq_r2r(mm4, mm3); /* mm3 = [ 2 6 3 7]
*/ - punpckhdq_r2r(mm2, mm1); /* mm1 = final result mm1 = row2 [1 5 9 13]
*/ -
- movq_r2m(mm0, *(inptr + ROW_STRIDE * 0)); /* store row 1
*/ - punpckldq_r2r(mm6, mm4); /* final result mm4 = row3 [2 6 10 14]
*/ -
+
static void
+idct_mmx32_rows (short *blk) +{ /* transform all 8 rows of 8x8 iDCT block
*/ +
int x; +
short qwTemp[64]; +
short *out = &qwTemp[0]; +
short *inptr = blk; + +
+ /* this subroutine performs two operations
*/ + /* 1) iDCT row transform
*/ + /* for( i = 0; i < 8; ++ i)
*/ + /* DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] );
*/ + /*
*/ + /* 2) transpose the matrix (which was stored in qwTemp[])
*/ + /* qwTemp[] -> [8x8 matrix transpose] -> blk[]
*/ +
for (x = 0; x < 8; x++) { /* transform one row per iteration
*/ + movq_m2r (*(inptr), mm0); /* 0 ; x3 x2 x1 x0
*/ +
movq_m2r (*(inptr + 4), mm1); /* 1 ; x7 x6 x5 x4
*/ + movq_r2r (mm0, mm2); /* 2 ; x3 x2 x1 x0
*/ +
movq_m2r (*(tab_i_01234567), mm3); /* 3 ; w06 w04 w02 w00
*/ + punpcklwd_r2r (mm1, mm0); /* x5 x1 x4 x0
*/ +
+ /* ----------
*/ + movq_r2r (mm0, mm5); /* 5 ; x5 x1 x4 x0
*/ + punpckldq_r2r (mm0, mm0); /* x4 x0 x4 x0
*/ +
movq_m2r (*(tab_i_01234567 + 4), mm4); /* 4 ; w07 w05 w03 w01
*/ + punpckhwd_r2r (mm1, mm2); /* 1 ; x7 x3 x6 x2
*/ +
pmaddwd_r2r (mm0, mm3); /* x4*w06+x0*w04 x4*w02+x0*w00
*/ + movq_r2r (mm2, mm6); /* 6 ; x7 x3 x6 x2
*/ +
movq_m2r (*(tab_i_01234567 + 16), mm1); /* 1 ; w22 w20 w18 w16
*/ + punpckldq_r2r (mm2, mm2); /* x6 x2 x6 x2
*/ +
pmaddwd_r2r (mm2, mm4); /* x6*w07+x2*w05 x6*w03+x2*w01
*/ + punpckhdq_r2r (mm5, mm5); /* x5 x1 x5 x1
*/ +
pmaddwd_m2r (*(tab_i_01234567 + 8), mm0); /* x4*w14+x0*w12 x4*w10+x0*w08
*/ + punpckhdq_r2r (mm6, mm6); /* x7 x3 x7 x3
*/ +
movq_m2r (*(tab_i_01234567 + 20), mm7); /* 7 ; w23 w21 w19 w17
*/ + pmaddwd_r2r (mm5, mm1); /* x5*w22+x1*w20 x5*w18+x1*w16
*/ +
paddd_m2r (*(r_inv_row), mm3); /* +rounder
*/ + pmaddwd_r2r (mm6, mm7); /* x7*w23+x3*w21 x7*w19+x3*w17
*/ +
pmaddwd_m2r (*(tab_i_01234567 + 12), mm2); /* x6*w15+x2*w13 x6*w11+x2*w09
*/ + paddd_r2r (mm4, mm3); /* 4 ; a1=sum(even1) a0=sum(even0)
*/ +
pmaddwd_m2r (*(tab_i_01234567 + 24), mm5); /* x5*w30+x1*w28 x5*w26+x1*w24
*/ + movq_r2r (mm3, mm4); /* 4 ; a1 a0
*/ +
pmaddwd_m2r (*(tab_i_01234567 + 28), mm6); /* x7*w31+x3*w29 x7*w27+x3*w25
*/ + paddd_r2r (mm7, mm1); /* 7 ; b1=sum(odd1) b0=sum(odd0)
*/ +
paddd_m2r (*(r_inv_row), mm0); /* +rounder
*/ + psubd_r2r (mm1, mm3); /* a1-b1 a0-b0
*/ +
psrad_i2r (SHIFT_INV_ROW, mm3); /* y6=a1-b1 y7=a0-b0
*/ + paddd_r2r (mm4, mm1); /* 4 ; a1+b1 a0+b0
*/ +
paddd_r2r (mm2, mm0); /* 2 ; a3=sum(even3) a2=sum(even2)
*/ + psrad_i2r (SHIFT_INV_ROW, mm1); /* y1=a1+b1 y0=a0+b0
*/ +
paddd_r2r (mm6, mm5); /* 6 ; b3=sum(odd3) b2=sum(odd2)
*/ + movq_r2r (mm0, mm4); /* 4 ; a3 a2
*/ +
paddd_r2r (mm5, mm0); /* a3+b3 a2+b2
*/ + psubd_r2r (mm5, mm4); /* 5 ; a3-b3 a2-b2
*/ +
psrad_i2r (SHIFT_INV_ROW, mm4); /* y4=a3-b3 y5=a2-b2
*/ + psrad_i2r (SHIFT_INV_ROW, mm0); /* y3=a3+b3 y2=a2+b2
*/ +
packssdw_r2r (mm3, mm4); /* 3 ; y6 y7 y4 y5
*/ +
packssdw_r2r (mm0, mm1); /* 0 ; y3 y2 y1 y0
*/ + movq_r2r (mm4, mm7); /* 7 ; y6 y7 y4 y5
*/ +
psrld_i2r (16, mm4); /* 0 y6 0 y4
*/ +
movq_r2m (mm1, *(out)); /* 1 ; save y3 y2 y1 y0
*/ + pslld_i2r (16, mm7); /* y7 0 y5 0
*/ +
por_r2r (mm4, mm7); /* 4 ; y7 y6 y5 y4
*/ +
+ /* begin processing row 1
*/ + movq_r2m (mm7, *(out + 4)); /* 7 ; save y7 y6 y5 y4
*/ +
inptr += 8; +
out += 8; +
} +
+ /* done with the iDCT row-transformation
*/ +
+ /* now we have to transpose the output 8x8 matrix
*/ + /* 8x8 (OUT) -> 8x8't' (IN)
*/ + /* the transposition is implemented as 4 sub-operations.
*/ + /* 1) transpose upper-left quad
*/ + /* 2) transpose lower-right quad
*/ + /* 3) transpose lower-left quad
*/ + /* 4) transpose upper-right quad
*/ +
+ /* mm0 = 1st row [ A B C D ] row1
*/ + /* mm1 = 2nd row [ E F G H ] 2
*/ + /* mm2 = 3rd row [ I J K L ] 3
*/ + /* mm3 = 4th row [ M N O P ] 4
*/ +
+ /* 1) transpose upper-left quad
*/ + out = &qwTemp[0]; +
movq_m2r (*(out + ROW_STRIDE * 0), mm0); +
movq_m2r (*(out + ROW_STRIDE * 1), mm1); +
movq_r2r (mm0, mm4); /* mm4 = copy of row1[A B C D]
*/ +
movq_m2r (*(out + ROW_STRIDE * 2), mm2); +
punpcklwd_r2r (mm1, mm0); /* mm0 = [ 0 4 1 5]
*/ +
movq_m2r (*(out + ROW_STRIDE * 3), mm3); +
punpckhwd_r2r (mm1, mm4); /* mm4 = [ 2 6 3 7]
*/ +
movq_r2r (mm2, mm6); +
punpcklwd_r2r (mm3, mm2); /* mm2 = [ 8 12 9 13]
*/ +
punpckhwd_r2r (mm3, mm6); /* mm6 = 10 14 11 15]
*/ + movq_r2r (mm0, mm1); /* mm1 = [ 0 4 1 5]
*/ +
inptr = blk; +
punpckldq_r2r (mm2, mm0); /* final result mm0 = row1 [0 4 8 12]
*/ +
movq_r2r (mm4, mm3); /* mm3 = [ 2 6 3 7]
*/ + punpckhdq_r2r (mm2, mm1); /* mm1 = final result mm1 = row2 [1 5 9 13]
*/ +
movq_r2m (mm0, *(inptr + ROW_STRIDE * 0)); /* store row 1
*/ + punpckldq_r2r (mm6, mm4); /* final result mm4 = row3 [2 6 10 14]
*/ +
/* begin reading next quadrant (lower-right)
*/ - movq_m2r(*(out + ROW_STRIDE*4 + 4), mm0);
- punpckhdq_r2r(mm6, mm3); /* final result mm3 = row4 [3 7 11 15]
*/ -
- movq_r2m(mm4, *(inptr + ROW_STRIDE * 2)); /* store row 3
*/ - movq_r2r(mm0, mm4); /* mm4 = copy of row1[A B C D]
*/ -
- movq_r2m(mm1, *(inptr + ROW_STRIDE * 1)); /* store row 2
*/ -
- movq_m2r(*(out + ROW_STRIDE*5 + 4), mm1);
-
- movq_r2m(mm3, *(inptr + ROW_STRIDE * 3)); /* store row 4
*/ - punpcklwd_r2r(mm1, mm0); /* mm0 = [ 0 4 1 5]
*/ -
- /* 2) transpose lower-right quadrant
*/ -
+ movq_m2r (*(out + ROW_STRIDE * 4 + 4), mm0); +
punpckhdq_r2r (mm6, mm3); /* final result mm3 = row4 [3 7 11 15]
*/ +
movq_r2m (mm4, *(inptr + ROW_STRIDE * 2)); /* store row 3
*/ + movq_r2r (mm0, mm4); /* mm4 = copy of row1[A B C D]
*/ +
movq_r2m (mm1, *(inptr + ROW_STRIDE * 1)); /* store row 2
*/ +
movq_m2r (*(out + ROW_STRIDE * 5 + 4), mm1); +
movq_r2m (mm3, *(inptr + ROW_STRIDE * 3)); /* store row 4
*/ + punpcklwd_r2r (mm1, mm0); /* mm0 = [ 0 4 1 5]
*/ +
+ /* 2) transpose lower-right quadrant
*/ +
/* movq mm0, qword ptr [OUT + ROW_STRIDE*4 + 8]
*/ -
+
/* movq mm1, qword ptr [OUT + ROW_STRIDE*5 + 8]
*/ /* movq mm4, mm0; // mm4 = copy of row1[A B C D]
*/ -
- movq_m2r(*(out + ROW_STRIDE*6 + 4), mm2);
-/* punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
*/ - punpckhwd_r2r(mm1, mm4); /* mm4 = [ 2 6 3 7]
*/ -
- movq_m2r(*(out + ROW_STRIDE*7 + 4), mm3);
- movq_r2r(mm2, mm6);
-
- punpcklwd_r2r(mm3, mm2); /* mm2 = [ 8 12 9 13]
*/ - movq_r2r(mm0, mm1); /* mm1 = [ 0 4 1 5]
*/ -
- punpckhwd_r2r(mm3, mm6); /* mm6 = 10 14 11 15]
*/ - movq_r2r(mm4, mm3); /* mm3 = [ 2 6 3 7]
*/ -
- punpckldq_r2r(mm2, mm0); /* final result mm0 = row1 [0 4 8 12]
*/ -
- punpckhdq_r2r(mm2, mm1); /* mm1 = final result mm1 = row2 [1 5 9 13]
*/ - ; /* slot
*/ -
- movq_r2m(mm0, *(inptr + ROW_STRIDE*4 + 4)); /* store row 1
*/ - punpckldq_r2r(mm6, mm4); /* final result mm4 = row3 [2 6 10 14]
*/ -
- movq_m2r(*(out + ROW_STRIDE * 4 ), mm0);
- punpckhdq_r2r(mm6, mm3); /* final result mm3 = row4 [3 7 11 15]
*/ +
movq_m2r (*(out + ROW_STRIDE * 6 + 4), mm2); - movq_r2m(mm4, *(inptr + ROW_STRIDE*6 + 4)); /* store row 3
*/ - movq_r2r(mm0, mm4); /* mm4 = copy of row1[A B C D]
*/ -
- movq_r2m(mm1, *(inptr + ROW_STRIDE*5 + 4)); /* store row 2
*/ - ; /* slot
*/ +/* punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
*/ + punpckhwd_r2r (mm1, mm4); /* mm4 = [ 2 6 3 7]
*/ +
movq_m2r (*(out + ROW_STRIDE * 7 + 4), mm3); +
movq_r2r (mm2, mm6); +
punpcklwd_r2r (mm3, mm2); /* mm2 = [ 8 12 9 13]
*/ + movq_r2r (mm0, mm1); /* mm1 = [ 0 4 1 5]
*/ +
punpckhwd_r2r (mm3, mm6); /* mm6 = [10 14 11 15]
*/ + movq_r2r (mm4, mm3); /* mm3 = [ 2 6 3 7]
*/ +
punpckldq_r2r (mm2, mm0); /* final result mm0 = row1 [0 4 8 12]
*/ +
punpckhdq_r2r (mm2, mm1); /* mm1 = final result mm1 = row2 [1 5 9 13]
*/ + ; /* slot
*/ +
movq_r2m (mm0, *(inptr + ROW_STRIDE * 4 + 4)); /* store row 1
*/ + punpckldq_r2r (mm6, mm4); /* final result mm4 = row3 [2 6 10 14]
*/ +
movq_m2r (*(out + ROW_STRIDE * 4), mm0); +
punpckhdq_r2r (mm6, mm3); /* final result mm3 = row4 [3 7 11 15]
*/ +
movq_r2m (mm4, *(inptr + ROW_STRIDE * 6 + 4)); /* store row 3
*/ + movq_r2r (mm0, mm4); /* mm4 = copy of row1[A B C D]
*/ +
movq_r2m (mm1, *(inptr + ROW_STRIDE * 5 + 4)); /* store row 2
*/ + ; /* slot
*/ +
movq_m2r (*(out + ROW_STRIDE * 5), mm1); +
; /* slot
*/ +
movq_r2m (mm3, *(inptr + ROW_STRIDE * 7 + 4)); /* store row 4
*/ + punpcklwd_r2r (mm1, mm0); /* mm0 = [ 0 4 1 5]
*/ - movq_m2r(*(out + ROW_STRIDE * 5 ), mm1);
- ; /* slot
*/ -
- movq_r2m(mm3, *(inptr + ROW_STRIDE*7 + 4)); /* store row 4
*/ - punpcklwd_r2r(mm1, mm0); /* mm0 = [ 0 4 1 5]
*/ -
- /* 3) transpose lower-left
*/ + /* 3) transpose lower-left
*/ /* movq mm0, qword ptr [OUT + ROW_STRIDE * 4 ]
*/ -
+
/* movq mm1, qword ptr [OUT + ROW_STRIDE * 5 ]
*/ /* movq mm4, mm0; // mm4 = copy of row1[A B C D]
*/ -
- movq_m2r(*(out + ROW_STRIDE * 6 ), mm2);
+
movq_m2r (*(out + ROW_STRIDE * 6), mm2); +
/* punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
*/ - punpckhwd_r2r(mm1, mm4); /* mm4 = [ 2 6 3 7]
*/ -
- movq_m2r(*(out + ROW_STRIDE * 7 ), mm3);
- movq_r2r(mm2, mm6);
-
- punpcklwd_r2r(mm3, mm2); /* mm2 = [ 8 12 9 13]
*/ - movq_r2r(mm0, mm1); /* mm1 = [ 0 4 1 5]
*/ -
- punpckhwd_r2r(mm3, mm6); /* mm6 = 10 14 11 15]
*/ - movq_r2r(mm4, mm3); /* mm3 = [ 2 6 3 7]
*/ -
- punpckldq_r2r(mm2, mm0); /* final result mm0 = row1 [0 4 8 12]
*/ -
- punpckhdq_r2r(mm2, mm1); /* mm1 = final result mm1 = row2 [1 5 9 13]
*/ - ;/*slot
*/ -
- movq_r2m(mm0, *(inptr + ROW_STRIDE * 0 + 4 )); /* store row 1
*/ - punpckldq_r2r(mm6, mm4); /* final result mm4 = row3 [2 6 10 14]
*/ -
+ punpckhwd_r2r (mm1, mm4); /* mm4 = [ 2 6 3 7]
*/ +
movq_m2r (*(out + ROW_STRIDE * 7), mm3); +
movq_r2r (mm2, mm6); +
punpcklwd_r2r (mm3, mm2); /* mm2 = [ 8 12 9 13]
*/ + movq_r2r (mm0, mm1); /* mm1 = [ 0 4 1 5]
*/ +
punpckhwd_r2r (mm3, mm6); /* mm6 = [10 14 11 15]
*/ + movq_r2r (mm4, mm3); /* mm3 = [ 2 6 3 7]
*/ +
punpckldq_r2r (mm2, mm0); /* final result mm0 = row1 [0 4 8 12]
*/ +
punpckhdq_r2r (mm2, mm1); /* mm1 = final result mm1 = row2 [1 5 9 13]
*/ + ; /*slot
*/ +
movq_r2m (mm0, *(inptr + ROW_STRIDE * 0 + 4)); /* store row 1
*/ + punpckldq_r2r (mm6, mm4); /* final result mm4 = row3 [2 6 10 14]
*/ +
/* begin reading next quadrant (upper-right)
*/ - movq_m2r(*(out + ROW_STRIDE*0 + 4), mm0);
- punpckhdq_r2r(mm6, mm3); /* final result mm3 = row4 [3 7 11 15]
*/ -
- movq_r2m(mm4, *(inptr + ROW_STRIDE * 2 + 4)); /* store row 3
*/ - movq_r2r(mm0, mm4); /* mm4 = copy of row1[A B C D]
*/ -
- movq_r2m(mm1, *(inptr + ROW_STRIDE * 1 + 4)); /* store row 2
*/ - movq_m2r(*(out + ROW_STRIDE*1 + 4), mm1);
-
- movq_r2m(mm3, *(inptr + ROW_STRIDE * 3 + 4)); /* store row 4
*/ - punpcklwd_r2r(mm1, mm0); /* mm0 = [ 0 4 1 5]
*/ -
-
- /* 2) transpose lower-right quadrant
*/ -
+ movq_m2r (*(out + ROW_STRIDE * 0 + 4), mm0); +
punpckhdq_r2r (mm6, mm3); /* final result mm3 = row4 [3 7 11 15]
*/ +
movq_r2m (mm4, *(inptr + ROW_STRIDE * 2 + 4)); /* store row 3
*/ + movq_r2r (mm0, mm4); /* mm4 = copy of row1[A B C D]
*/ +
movq_r2m (mm1, *(inptr + ROW_STRIDE * 1 + 4)); /* store row 2
*/ + movq_m2r (*(out + ROW_STRIDE * 1 + 4), mm1); +
movq_r2m (mm3, *(inptr + ROW_STRIDE * 3 + 4)); /* store row 4
*/ + punpcklwd_r2r (mm1, mm0); /* mm0 = [ 0 4 1 5]
*/ +
+ /* 4) transpose upper-right quadrant
*/ +
/* movq mm0, qword ptr [OUT + ROW_STRIDE*4 + 8]
*/ -
+
/* movq mm1, qword ptr [OUT + ROW_STRIDE*5 + 8]
*/ /* movq mm4, mm0; // mm4 = copy of row1[A B C D]
*/ -
- movq_m2r(*(out + ROW_STRIDE*2 + 4), mm2);
+
movq_m2r (*(out + ROW_STRIDE * 2 + 4), mm2); +
/* punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5]
*/ - punpckhwd_r2r(mm1, mm4); /* mm4 = [ 2 6 3 7]
*/ -
- movq_m2r(*(out + ROW_STRIDE*3 + 4), mm3);
- movq_r2r(mm2, mm6);
-
- punpcklwd_r2r(mm3, mm2); /* mm2 = [ 8 12 9 13]
*/ - movq_r2r(mm0, mm1); /* mm1 = [ 0 4 1 5]
*/ -
- punpckhwd_r2r(mm3, mm6); /* mm6 = 10 14 11 15]
*/ - movq_r2r(mm4, mm3); /* mm3 = [ 2 6 3 7]
*/ -
- punpckldq_r2r(mm2, mm0); /* final result mm0 = row1 [0 4 8 12]
*/ -
- punpckhdq_r2r(mm2, mm1); /* mm1 = final result mm1 = row2 [1 5 9 13]
*/ - ; /* slot
*/ -
- movq_r2m(mm0, *(inptr + ROW_STRIDE*4)); /* store row 1
*/ - punpckldq_r2r(mm6, mm4); /* final result mm4 = row3 [2 6 10 14]
*/ -
- movq_r2m(mm1, *(inptr + ROW_STRIDE*5)); /* store row 2
*/ - punpckhdq_r2r(mm6, mm3); /* final result mm3 = row4 [3 7 11 15]
*/ -
- movq_r2m(mm4, *(inptr + ROW_STRIDE*6)); /* store row 3
*/ - ; /* slot
*/ -
- movq_r2m(mm3, *(inptr + ROW_STRIDE*7)); /* store row 4
*/ - ; /* slot
*/ -
-}
-
-
-static void
-idct_mmx32_cols( short *blk ) /* transform all 8 cols of 8x8 iDCT block
*/ -{
- int x;
- short *inptr = blk;
-
- /* Despite the function's name, the matrix is transformed
*/ - /* row by row. This function is identical to idct_mmx32_rows(),
*/ - /* except for the SHIFT amount and ROUND_INV amount.
*/ -
- /* this subroutine performs two operations
*/ - /* 1) iDCT row transform
*/ - /* for( i = 0; i < 8; ++ i)
*/ - /* DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] );
*/ - /*
*/ - /* 2) transpose the matrix (which was stored in qwTemp[])
*/ - /* qwTemp[] -> [8x8 matrix transpose] -> blk[]
*/ -
-
- for (x=0; x<8; x++) { /* transform one row per iteration
*/ -
- movq_m2r(*(inptr), mm0); /* 0 ; x3 x2 x1 x0
*/ -
- movq_m2r(*(inptr+4), mm1); /* 1 ; x7 x6 x5 x4
*/ - movq_r2r(mm0, mm2); /* 2 ; x3 x2 x1 x0
*/ -
- movq_m2r(*(tab_i_01234567), mm3); /* 3 ; w06 w04 w02 w00
*/ - punpcklwd_r2r(mm1, mm0); /* x5 x1 x4 x0
*/ -
+ punpckhwd_r2r (mm1, mm4); /* mm4 = [ 2 6 3 7]
*/ +
movq_m2r (*(out + ROW_STRIDE * 3 + 4), mm3); +
movq_r2r (mm2, mm6); +
punpcklwd_r2r (mm3, mm2); /* mm2 = [ 8 12 9 13]
*/ + movq_r2r (mm0, mm1); /* mm1 = [ 0 4 1 5]
*/ +
punpckhwd_r2r (mm3, mm6); /* mm6 = [10 14 11 15]
*/ + movq_r2r (mm4, mm3); /* mm3 = [ 2 6 3 7]
*/ +
punpckldq_r2r (mm2, mm0); /* final result mm0 = row1 [0 4 8 12]
*/ +
punpckhdq_r2r (mm2, mm1); /* mm1 = final result mm1 = row2 [1 5 9 13]
*/ + ; /* slot
*/ +
movq_r2m (mm0, *(inptr + ROW_STRIDE * 4)); /* store row 1
*/ + punpckldq_r2r (mm6, mm4); /* final result mm4 = row3 [2 6 10 14]
*/ +
movq_r2m (mm1, *(inptr + ROW_STRIDE * 5)); /* store row 2
*/ + punpckhdq_r2r (mm6, mm3); /* final result mm3 = row4 [3 7 11 15]
*/ +
movq_r2m (mm4, *(inptr + ROW_STRIDE * 6)); /* store row 3
*/ + ; /* slot
*/ +
movq_r2m (mm3, *(inptr + ROW_STRIDE * 7)); /* store row 4
*/ + ; /* slot
*/ +
} +
static void
+idct_mmx32_cols (short *blk) +{ /* transform all 8 cols of 8x8 iDCT block
*/ +
int x; +
short *inptr = blk; + +
+ /* Despite the function's name, the matrix is transformed
*/ + /* row by row. This function is identical to idct_mmx32_rows(),
*/ + /* except for the SHIFT amount and ROUND_INV amount.
*/ +
+ /* this subroutine performs two operations
*/ + /* 1) iDCT row transform
*/ + /* for( i = 0; i < 8; ++ i)
*/ + /* DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] );
*/ + /*
*/ + /* 2) transpose the matrix (which was stored in qwTemp[])
*/ + /* qwTemp[] -> [8x8 matrix transpose] -> blk[]
*/ +
for (x = 0; x < 8; x++) { /* transform one row per iteration
*/ +
movq_m2r (*(inptr), mm0); /* 0 ; x3 x2 x1 x0
*/ +
movq_m2r (*(inptr + 4), mm1); /* 1 ; x7 x6 x5 x4
*/ + movq_r2r (mm0, mm2); /* 2 ; x3 x2 x1 x0
*/ +
movq_m2r (*(tab_i_01234567), mm3); /* 3 ; w06 w04 w02 w00
*/ + punpcklwd_r2r (mm1, mm0); /* x5 x1 x4 x0
*/ +
/* ----------
*/ - movq_r2r(mm0, mm5); /* 5 ; x5 x1 x4 x0
*/ - punpckldq_r2r(mm0, mm0); /* x4 x0 x4 x0
*/ -
- movq_m2r(*(tab_i_01234567+4), mm4); /* 4 ; w07 w05 w03 w01
*/ - punpckhwd_r2r(mm1, mm2); /* 1 ; x7 x3 x6 x2
*/ -
- pmaddwd_r2r(mm0, mm3); /* x4*w06+x0*w04 x4*w02+x0*w00
*/ - movq_r2r(mm2, mm6); /* 6 ; x7 x3 x6 x2
*/ -
- movq_m2r(*(tab_i_01234567+16), mm1);/* 1 ; w22 w20 w18 w16
*/ - punpckldq_r2r(mm2, mm2); /* x6 x2 x6 x2
*/ -
- pmaddwd_r2r(mm2, mm4); /* x6*w07+x2*w05 x6*w03+x2*w01
*/ - punpckhdq_r2r(mm5, mm5); /* x5 x1 x5 x1
*/ -
- pmaddwd_m2r(*(tab_i_01234567+8), mm0);/* x4*w14+x0*w12 x4*w10+x0*w08
*/ - punpckhdq_r2r(mm6, mm6); /* x7 x3 x7 x3
*/ -
- movq_m2r(*(tab_i_01234567+20), mm7);/* 7 ; w23 w21 w19 w17
*/ - pmaddwd_r2r(mm5, mm1); /* x5*w22+x1*w20 x5*w18+x1*w16
*/ -
- paddd_m2r(*(r_inv_col), mm3);/* +rounder
*/ - pmaddwd_r2r(mm6, mm7); /* x7*w23+x3*w21 x7*w19+x3*w17
*/ -
- pmaddwd_m2r(*(tab_i_01234567+12), mm2);/* x6*w15+x2*w13 x6*w11+x2*w09
*/ - paddd_r2r(mm4, mm3); /* 4 ; a1=sum(even1) a0=sum(even0)
*/ -
- pmaddwd_m2r(*(tab_i_01234567+24), mm5);/* x5*w30+x1*w28 x5*w26+x1*w24
*/ - movq_r2r(mm3, mm4); /* 4 ; a1 a0
*/ -
- pmaddwd_m2r(*(tab_i_01234567+28), mm6);/* x7*w31+x3*w29 x7*w27+x3*w25
*/ - paddd_r2r(mm7, mm1); /* 7 ; b1=sum(odd1) b0=sum(odd0)
*/ -
- paddd_m2r(*(r_inv_col), mm0);/* +rounder
*/ - psubd_r2r(mm1, mm3); /* a1-b1 a0-b0
*/ -
- psrad_i2r(SHIFT_INV_COL, mm3); /* y6=a1-b1 y7=a0-b0
*/ - paddd_r2r(mm4, mm1); /* 4 ; a1+b1 a0+b0
*/ -
- paddd_r2r(mm2, mm0); /* 2 ; a3=sum(even3) a2=sum(even2)
*/ - psrad_i2r(SHIFT_INV_COL, mm1); /* y1=a1+b1 y0=a0+b0
*/ -
- paddd_r2r(mm6, mm5); /* 6 ; b3=sum(odd3) b2=sum(odd2)
*/ - movq_r2r(mm0, mm4); /* 4 ; a3 a2
*/ -
- paddd_r2r(mm5, mm0); /* a3+b3 a2+b2
*/ - psubd_r2r(mm5, mm4); /* 5 ; a3-b3 a2-b2
*/ -
-
- psrad_i2r(SHIFT_INV_COL, mm4); /* y4=a3-b3 y5=a2-b2
*/ - psrad_i2r(SHIFT_INV_COL, mm0); /* y3=a3+b3 y2=a2+b2
*/ -
- packssdw_r2r(mm3, mm4); /* 3 ; y6 y7 y4 y5
*/ -
- packssdw_r2r(mm0, mm1); /* 0 ; y3 y2 y1 y0
*/ - movq_r2r(mm4, mm7); /* 7 ; y6 y7 y4 y5
*/ -
- psrld_i2r(16, mm4); /* 0 y6 0 y4
*/ -
- movq_r2m(mm1, *(inptr)); /* 1 ; save y3 y2 y1 y0
*/ - pslld_i2r(16, mm7); /* y7 0 y5 0
*/ -
- por_r2r(mm4, mm7); /* 4 ; y7 y6 y5 y4
*/ -
- /* begin processing row 1
*/ - movq_r2m(mm7, *(inptr+4)); /* 7 ; save y7 y6 y5 y4
*/ -
- inptr += 8;
- }
- /* done with the iDCT column-transformation
*/ -}
-
+ movq_r2r (mm0, mm5); /* 5 ; x5 x1 x4 x0
*/ + punpckldq_r2r (mm0, mm0); /* x4 x0 x4 x0
*/ +
movq_m2r (*(tab_i_01234567 + 4), mm4); /* 4 ; w07 w05 w03 w01
*/ + punpckhwd_r2r (mm1, mm2); /* 1 ; x7 x3 x6 x2
*/ +
pmaddwd_r2r (mm0, mm3); /* x4*w06+x0*w04 x4*w02+x0*w00
*/ + movq_r2r (mm2, mm6); /* 6 ; x7 x3 x6 x2
*/ +
movq_m2r (*(tab_i_01234567 + 16), mm1); /* 1 ; w22 w20 w18 w16
*/ + punpckldq_r2r (mm2, mm2); /* x6 x2 x6 x2
*/ +
pmaddwd_r2r (mm2, mm4); /* x6*w07+x2*w05 x6*w03+x2*w01
*/ + punpckhdq_r2r (mm5, mm5); /* x5 x1 x5 x1
*/ +
pmaddwd_m2r (*(tab_i_01234567 + 8), mm0); /* x4*w14+x0*w12 x4*w10+x0*w08
*/ + punpckhdq_r2r (mm6, mm6); /* x7 x3 x7 x3
*/ +
movq_m2r (*(tab_i_01234567 + 20), mm7); /* 7 ; w23 w21 w19 w17
*/ + pmaddwd_r2r (mm5, mm1); /* x5*w22+x1*w20 x5*w18+x1*w16
*/ +
paddd_m2r (*(r_inv_col), mm3); /* +rounder
*/ + pmaddwd_r2r (mm6, mm7); /* x7*w23+x3*w21 x7*w19+x3*w17
*/ +
pmaddwd_m2r (*(tab_i_01234567 + 12), mm2); /* x6*w15+x2*w13 x6*w11+x2*w09
*/ + paddd_r2r (mm4, mm3); /* 4 ; a1=sum(even1) a0=sum(even0)
*/ +
pmaddwd_m2r (*(tab_i_01234567 + 24), mm5); /* x5*w30+x1*w28 x5*w26+x1*w24
*/ + movq_r2r (mm3, mm4); /* 4 ; a1 a0
*/ +
pmaddwd_m2r (*(tab_i_01234567 + 28), mm6); /* x7*w31+x3*w29 x7*w27+x3*w25
*/ + paddd_r2r (mm7, mm1); /* 7 ; b1=sum(odd1) b0=sum(odd0)
*/ +
paddd_m2r (*(r_inv_col), mm0); /* +rounder
*/ + psubd_r2r (mm1, mm3); /* a1-b1 a0-b0
*/ +
psrad_i2r (SHIFT_INV_COL, mm3); /* y6=a1-b1 y7=a0-b0
*/ + paddd_r2r (mm4, mm1); /* 4 ; a1+b1 a0+b0
*/ +
paddd_r2r (mm2, mm0); /* 2 ; a3=sum(even3) a2=sum(even2)
*/ + psrad_i2r (SHIFT_INV_COL, mm1); /* y1=a1+b1 y0=a0+b0
*/ +
paddd_r2r (mm6, mm5); /* 6 ; b3=sum(odd3) b2=sum(odd2)
*/ + movq_r2r (mm0, mm4); /* 4 ; a3 a2
*/ +
paddd_r2r (mm5, mm0); /* a3+b3 a2+b2
*/ + psubd_r2r (mm5, mm4); /* 5 ; a3-b3 a2-b2
*/ +
psrad_i2r (SHIFT_INV_COL, mm4); /* y4=a3-b3 y5=a2-b2
*/ + psrad_i2r (SHIFT_INV_COL, mm0); /* y3=a3+b3 y2=a2+b2
*/ +
packssdw_r2r (mm3, mm4); /* 3 ; y6 y7 y4 y5
*/ +
packssdw_r2r (mm0, mm1); /* 0 ; y3 y2 y1 y0
*/ + movq_r2r (mm4, mm7); /* 7 ; y6 y7 y4 y5
*/ +
psrld_i2r (16, mm4); /* 0 y6 0 y4
*/ +
movq_r2m (mm1, *(inptr)); /* 1 ; save y3 y2 y1 y0
*/ + pslld_i2r (16, mm7); /* y7 0 y5 0
*/ +
por_r2r (mm4, mm7); /* 4 ; y7 y6 y5 y4
*/ +
+ /* begin processing row 1
*/ + movq_r2m (mm7, *(inptr + 4)); /* 7 ; save y7 y6 y5 y4
*/ +
inptr += 8; +
} +
+ /* done with the iDCT column-transformation
*/ +} + +
/*
*/ /* public interface to MMX32 IDCT 8x8 operation
*/ /*
*/ -void
-gst_idct_mmx32_idct( short *blk )
-{
- /* 1) iDCT row transformation
*/ - idct_mmx32_rows( blk ); /* 1) transform iDCT row, and transpose
*/ -
- /* 2) iDCT column transformation
*/ - idct_mmx32_cols( blk ); /* 2) transform iDCT row, and transpose
*/ -
- emms(); /* restore processor state
*/ - /* all done
*/ -}
+void
+gst_idct_mmx32_idct (short *blk)
+{ +
+ /* 1) iDCT row transformation
*/ + idct_mmx32_rows (blk); /* 1) transform iDCT row, and transpose
*/ +
+ /* 2) iDCT column transformation
*/ + idct_mmx32_cols (blk); /* 2) transform iDCT row, and transpose
*/ +
emms (); /* restore processor state
*/ + /* all done
*/ +}
|