From 4359de26166000e317604b6b20283c79dd2a0521 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 19 Mar 2002 04:10:06 +0000 Subject: removal of //-style comments don't link plugins to core libs -- the versioning is done internally to the plugins with... Original commit message from CVS: * removal of //-style comments * don't link plugins to core libs -- the versioning is done internally to the plugins with the plugin_info struct, and symbol resolution is lazy, so we can always know if a plugin can be loaded by the plugin_info data. in theory. --- gst-libs/gst/idct/Makefile.am | 2 +- gst-libs/gst/idct/ieeetest.c | 2 +- gst-libs/gst/idct/mmx32idct.c | 958 +++++++++++++++++++++--------------------- 3 files changed, 481 insertions(+), 481 deletions(-) (limited to 'gst-libs/gst/idct') diff --git a/gst-libs/gst/idct/Makefile.am b/gst-libs/gst/idct/Makefile.am index 8fea3121..8b9d86ba 100644 --- a/gst-libs/gst/idct/Makefile.am +++ b/gst-libs/gst/idct/Makefile.am @@ -27,6 +27,6 @@ ieeetest_LDFLAGS = $(GST_LIBS) noinst_HEADERS = dct.h -libgstidct_la_LIBADD = $(GST_LIBS) +libgstidct_la_LIBADD = libgstidct_la_CFLAGS = $(GST_CFLAGS) -finline-functions -ffast-math libgstidct_la_LDFLAGS = $(GST_PLUGIN_LDFLAGS) diff --git a/gst-libs/gst/idct/ieeetest.c b/gst-libs/gst/idct/ieeetest.c index 23d64eb5..5925a6fd 100644 --- a/gst-libs/gst/idct/ieeetest.c +++ b/gst-libs/gst/idct/ieeetest.c @@ -117,7 +117,7 @@ main(int argc, char **argv) gst_trace_read_tsc(&tscstart); gst_idct_convert(idct, testout); gst_trace_read_tsc(&tscstop); - //printf("time %llu, %llu %lld\n", tscstart, tscstop, tscstop-tscstart); + /*printf("time %llu, %llu %lld\n", tscstart, tscstop, tscstop-tscstart); */ if (tscstop - tscstart < tscmin) tscmin = tscstop-tscstart; if (tscstop - tscstart > tscmax) tscmax = tscstop-tscstart; diff --git a/gst-libs/gst/idct/mmx32idct.c b/gst-libs/gst/idct/mmx32idct.c index 78bf45bf..0090d8a7 100644 --- a/gst-libs/gst/idct/mmx32idct.c +++ b/gst-libs/gst/idct/mmx32idct.c @@ -22,148 
+22,148 @@ */ -// MMX32 iDCT algorithm (IEEE-1180 compliant) :: idct_mmx32() -// -// MPEG2AVI -// -------- -// v0.16B33 initial release -// -// This was one of the harder pieces of work to code. -// Intel's app-note focuses on the numerical issues of the algorithm, but -// assumes the programmer is familiar with IDCT mathematics, leaving the -// form of the complete function up to the programmer's imagination. -// -// ALGORITHM OVERVIEW -// ------------------ -// I played around with the code for quite a few hours. I came up -// with *A* working IDCT algorithm, however I'm not sure whether my routine -// is "the correct one." But rest assured, my code passes all six IEEE -// accuracy tests with plenty of margin. -// -// My IDCT algorithm consists of 4 steps: -// -// 1) IDCT-row transformation (using the IDCT-row function) on all 8 rows -// This yields an intermediate 8x8 matrix. -// -// 2) intermediate matrix transpose (mandatory) -// -// 3) IDCT-row transformation (2nd time) on all 8 rows of the intermediate -// matrix. The output is the final-result, in transposed form. -// -// 4) post-transformation matrix transpose -// (not necessary if the input-data is already transposed, this could -// be done during the MPEG "zig-zag" scan, but since my algorithm -// requires at least one transpose operation, why not re-use the -// transpose-code.) -// -// Although the (1st) and (3rd) steps use the SAME row-transform operation, -// the (3rd) step uses different shift&round constants (explained later.) -// -// Also note that the intermediate transpose (2) would not be neccessary, -// if the subsequent operation were a iDCT-column transformation. Since -// we only have the iDCT-row transform, we transpose the intermediate -// matrix and use the iDCT-row transform a 2nd time. -// -// I had to change some constants/variables for my method to work : -// -// As given by Intel, the #defines for SHIFT_INV_COL and RND_INV_COL are -// wrong. 
Not surprising since I'm not using a true column-transform -// operation, but the row-transform operation (as mentioned earlier.) -// round_inv_col[], which is given as "4 short" values, should have the -// same dimensions as round_inv_row[]. The corrected variables are -// shown. -// -// Intel's code defines a different table for each each row operation. -// The tables given are 0/4, 1/7, 2/6, and 5/3. My code only uses row#0. -// Using the other rows messes up the overall transform. -// -// IMPLEMENTATION DETAILs -// ---------------------- -// -// I divided the algorithm's work into two subroutines, -// 1) idct_mmx32_rows() - transforms 8 rows, then transpose -// 2) idct_mmx32_cols() - transforms 8 rows, then transpose -// yields final result ("drop-in" direct replacement for INT32 IDCT) -// -// The 2nd function is a clone of the 1st, with changes made only to the -// shift&rounding instructions. -// -// In the 1st function (rows), the shift & round instructions use -// SHIFT_INV_ROW & round_inv_row[] (renamed to r_inv_row[]) -// -// In the 2nd function (cols)-> r_inv_col[], and -// SHIFT_INV_COL & round_inv_col[] (renamed to r_inv_col[]) -// -// Each function contains an integrated transpose-operator, which comes -// AFTER the primary transformation operation. In the future, I'll optimize -// the code to do more of the transpose-work "in-place". Right now, I've -// left the code as two subroutines and a main calling function, so other -// people can read the code more easily. 
-// -// liaor@umcc.ais.org http://members.tripod.com/~liaor -// - - -//;============================================================================= -//; -//; AP-922 http://developer.intel.com/vtune/cbts/strmsimd -//; These examples contain code fragments for first stage iDCT 8x8 -//; (for rows) and first stage DCT 8x8 (for columns) -//; -//;============================================================================= +/* MMX32 iDCT algorithm (IEEE-1180 compliant) :: idct_mmx32() */ +/* */ +/* MPEG2AVI */ +/* -------- */ +/* v0.16B33 initial release */ +/* */ +/* This was one of the harder pieces of work to code. */ +/* Intel's app-note focuses on the numerical issues of the algorithm, but */ +/* assumes the programmer is familiar with IDCT mathematics, leaving the */ +/* form of the complete function up to the programmer's imagination. */ +/* */ +/* ALGORITHM OVERVIEW */ +/* ------------------ */ +/* I played around with the code for quite a few hours. I came up */ +/* with *A* working IDCT algorithm, however I'm not sure whether my routine */ +/* is "the correct one." But rest assured, my code passes all six IEEE */ +/* accuracy tests with plenty of margin. */ +/* */ +/* My IDCT algorithm consists of 4 steps: */ +/* */ +/* 1) IDCT-row transformation (using the IDCT-row function) on all 8 rows */ +/* This yields an intermediate 8x8 matrix. */ +/* */ +/* 2) intermediate matrix transpose (mandatory) */ +/* */ +/* 3) IDCT-row transformation (2nd time) on all 8 rows of the intermediate */ +/* matrix. The output is the final-result, in transposed form. */ +/* */ +/* 4) post-transformation matrix transpose */ +/* (not necessary if the input-data is already transposed, this could */ +/* be done during the MPEG "zig-zag" scan, but since my algorithm */ +/* requires at least one transpose operation, why not re-use the */ +/* transpose-code.) 
*/ +/* */ +/* Although the (1st) and (3rd) steps use the SAME row-transform operation, */ +/* the (3rd) step uses different shift&round constants (explained later.) */ +/* */ +/* Also note that the intermediate transpose (2) would not be necessary, */ +/* if the subsequent operation were an iDCT-column transformation. Since */ +/* we only have the iDCT-row transform, we transpose the intermediate */ +/* matrix and use the iDCT-row transform a 2nd time. */ +/* */ +/* I had to change some constants/variables for my method to work : */ +/* */ +/* As given by Intel, the #defines for SHIFT_INV_COL and RND_INV_COL are */ +/* wrong. Not surprising since I'm not using a true column-transform */ +/* operation, but the row-transform operation (as mentioned earlier.) */ +/* round_inv_col[], which is given as "4 short" values, should have the */ +/* same dimensions as round_inv_row[]. The corrected variables are */ +/* shown. */ +/* */ +/* Intel's code defines a different table for each row operation. */ +/* The tables given are 0/4, 1/7, 2/6, and 5/3. My code only uses row#0. */ +/* Using the other rows messes up the overall transform. */ +/* */ +/* IMPLEMENTATION DETAILs */ +/* ---------------------- */ +/* */ +/* I divided the algorithm's work into two subroutines, */ +/* 1) idct_mmx32_rows() - transforms 8 rows, then transpose */ +/* 2) idct_mmx32_cols() - transforms 8 rows, then transpose */ +/* yields final result ("drop-in" direct replacement for INT32 IDCT) */ +/* */ +/* The 2nd function is a clone of the 1st, with changes made only to the */ +/* shift&rounding instructions.
*/ +/* */ +/* In the 1st function (rows), the shift & round instructions use */ +/* SHIFT_INV_ROW & round_inv_row[] (renamed to r_inv_row[]) */ +/* */ +/* In the 2nd function (cols)-> r_inv_col[], and */ +/* SHIFT_INV_COL & round_inv_col[] (renamed to r_inv_col[]) */ +/* */ +/* Each function contains an integrated transpose-operator, which comes */ +/* AFTER the primary transformation operation. In the future, I'll optimize */ +/* the code to do more of the transpose-work "in-place". Right now, I've */ +/* left the code as two subroutines and a main calling function, so other */ +/* people can read the code more easily. */ +/* */ +/* liaor@umcc.ais.org http://members.tripod.com/~liaor */ +/* */ + + +/*;============================================================================= */ +/*; */ +/*; AP-922 http://developer.intel.com/vtune/cbts/strmsimd */ +/*; These examples contain code fragments for first stage iDCT 8x8 */ +/*; (for rows) and first stage DCT 8x8 (for columns) */ +/*; */ +/*;============================================================================= */ /* mword typedef qword qword ptr equ mword ptr */ #include -#define BITS_INV_ACC 4 //; 4 or 5 for IEEE - // 5 yields higher accuracy, but lessens dynamic range on the input matrix +#define BITS_INV_ACC 4 /*; 4 or 5 for IEEE */ + /* 5 yields higher accuracy, but lessens dynamic range on the input matrix */ #define SHIFT_INV_ROW (16 - BITS_INV_ACC) -#define SHIFT_INV_COL (1 + BITS_INV_ACC +14 ) // changed from Intel's val) -//#define SHIFT_INV_COL (1 + BITS_INV_ACC ) +#define SHIFT_INV_COL (1 + BITS_INV_ACC +14 ) /* changed from Intel's val) */ +/*#define SHIFT_INV_COL (1 + BITS_INV_ACC ) */ #define RND_INV_ROW (1 << (SHIFT_INV_ROW-1)) #define RND_INV_COL (1 << (SHIFT_INV_COL-1)) -#define RND_INV_CORR (RND_INV_COL - 1) //; correction -1.0 and round -//#define RND_INV_ROW (1024 * (6 - BITS_INV_ACC)) //; 1 << (SHIFT_INV_ROW-1) -//#define RND_INV_COL (16 * (BITS_INV_ACC - 3)) //; 1 << (SHIFT_INV_COL-1) 
+#define RND_INV_CORR (RND_INV_COL - 1) /*; correction -1.0 and round */ +/*#define RND_INV_ROW (1024 * (6 - BITS_INV_ACC)) //; 1 << (SHIFT_INV_ROW-1) */ +/*#define RND_INV_COL (16 * (BITS_INV_ACC - 3)) //; 1 << (SHIFT_INV_COL-1) */ -//.data -//Align 16 +/*.data */ +/*Align 16 */ const static long r_inv_row[2] = { RND_INV_ROW, RND_INV_ROW}; const static long r_inv_col[2] = {RND_INV_COL, RND_INV_COL}; const static long r_inv_corr[2] = {RND_INV_CORR, RND_INV_CORR }; -//const static short r_inv_col[4] = -// {RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL}; -//const static short r_inv_corr[4] = -// {RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR}; +/*const static short r_inv_col[4] = */ +/* {RND_INV_COL, RND_INV_COL, RND_INV_COL, RND_INV_COL}; */ +/*const static short r_inv_corr[4] = */ +/* {RND_INV_CORR, RND_INV_CORR, RND_INV_CORR, RND_INV_CORR}; */ /* constants for the forward DCT -//#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy -//#define SHIFT_FRW_COL BITS_FRW_ACC -//#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17) -//#define RND_FRW_ROW (262144 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_ROW-1) +/*#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy */ +/*#define SHIFT_FRW_COL BITS_FRW_ACC */ +/*#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17) */ +/*#define RND_FRW_ROW (262144 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_ROW-1) */ const static __int64 one_corr = 0x0001000100010001; const static long r_frw_row[2] = {RND_FRW_ROW, RND_FRW_ROW }; -//const static short tg_1_16[4] = {13036, 13036, 13036, 13036 }; //tg * (2<<16) + 0.5 -//const static short tg_2_16[4] = {27146, 27146, 27146, 27146 }; //tg * (2<<16) + 0.5 -//const static short tg_3_16[4] = {-21746, -21746, -21746, -21746 }; //tg * (2<<16) + 0.5 -//const static short cos_4_16[4] = {-19195, -19195, -19195, -19195 }; //cos * (2<<16) + 0.5 -//const static short ocos_4_16[4] = {23170, 23170, 23170, 23170 }; //cos * (2<<15) + 0.5 +/*const static short tg_1_16[4] = {13036, 13036, 13036, 13036 }; //tg * (2<<16) + 0.5 */ 
+/*const static short tg_2_16[4] = {27146, 27146, 27146, 27146 }; //tg * (2<<16) + 0.5 */ +/*const static short tg_3_16[4] = {-21746, -21746, -21746, -21746 }; //tg * (2<<16) + 0.5 */ +/*const static short cos_4_16[4] = {-19195, -19195, -19195, -19195 }; //cos * (2<<16) + 0.5 */ +/*const static short ocos_4_16[4] = {23170, 23170, 23170, 23170 }; //cos * (2<<15) + 0.5 */ -//concatenated table, for forward DCT transformation +/*concatenated table, for forward DCT transformation */ const static short tg_all_16[] = { - 13036, 13036, 13036, 13036, // tg * (2<<16) + 0.5 - 27146, 27146, 27146, 27146, //tg * (2<<16) + 0.5 - -21746, -21746, -21746, -21746, // tg * (2<<16) + 0.5 - -19195, -19195, -19195, -19195, //cos * (2<<16) + 0.5 - 23170, 23170, 23170, 23170 }; //cos * (2<<15) + 0.5 + 13036, 13036, 13036, 13036, /* tg * (2<<16) + 0.5 */ + 27146, 27146, 27146, 27146, /*tg * (2<<16) + 0.5 */ + -21746, -21746, -21746, -21746, /* tg * (2<<16) + 0.5 */ + -19195, -19195, -19195, -19195, /*cos * (2<<16) + 0.5 */ + 23170, 23170, 23170, 23170 }; /*cos * (2<<15) + 0.5 */ #define tg_1_16 (tg_all_16 + 0) #define tg_2_16 (tg_all_16 + 8) @@ -231,553 +231,553 @@ const static short tg_all_16[] = { IF _MMX ; MMX code ;============================================================================= -//; Table for rows 0,4 - constants are multiplied by cos_4_16 +/*; Table for rows 0,4 - constants are multiplied by cos_4_16 */ const short tab_i_04[] = { - 16384, 16384, 16384, -16384, // ; movq-> w06 w04 w02 w00 - 21407, 8867, 8867, -21407, // w07 w05 w03 w01 - 16384, -16384, 16384, 16384, //; w14 w12 w10 w08 - -8867, 21407, -21407, -8867, //; w15 w13 w11 w09 - 22725, 12873, 19266, -22725, //; w22 w20 w18 w16 - 19266, 4520, -4520, -12873, //; w23 w21 w19 w17 - 12873, 4520, 4520, 19266, //; w30 w28 w26 w24 - -22725, 19266, -12873, -22725 };//w31 w29 w27 w25 - -//; Table for rows 1,7 - constants are multiplied by cos_1_16 + 16384, 16384, 16384, -16384, /* ; movq-> w06 w04 w02 w00 */ + 21407, 
8867, 8867, -21407, /* w07 w05 w03 w01 */ + 16384, -16384, 16384, 16384, /*; w14 w12 w10 w08 */ + -8867, 21407, -21407, -8867, /*; w15 w13 w11 w09 */ + 22725, 12873, 19266, -22725, /*; w22 w20 w18 w16 */ + 19266, 4520, -4520, -12873, /*; w23 w21 w19 w17 */ + 12873, 4520, 4520, 19266, /*; w30 w28 w26 w24 */ + -22725, 19266, -12873, -22725 };/*w31 w29 w27 w25 */ + +/*; Table for rows 1,7 - constants are multiplied by cos_1_16 */ const short tab_i_17[] = { - 22725, 22725, 22725, -22725, // ; movq-> w06 w04 w02 w00 - 29692, 12299, 12299, -29692, // ; w07 w05 w03 w01 - 22725, -22725, 22725, 22725, //; w14 w12 w10 w08 - -12299, 29692, -29692, -12299, //; w15 w13 w11 w09 - 31521, 17855, 26722, -31521, //; w22 w20 w18 w16 - 26722, 6270, -6270, -17855, //; w23 w21 w19 w17 - 17855, 6270, 6270, 26722, //; w30 w28 w26 w24 - -31521, 26722, -17855, -31521}; // w31 w29 w27 w25 - -//; Table for rows 2,6 - constants are multiplied by cos_2_16 + 22725, 22725, 22725, -22725, /* ; movq-> w06 w04 w02 w00 */ + 29692, 12299, 12299, -29692, /* ; w07 w05 w03 w01 */ + 22725, -22725, 22725, 22725, /*; w14 w12 w10 w08 */ + -12299, 29692, -29692, -12299, /*; w15 w13 w11 w09 */ + 31521, 17855, 26722, -31521, /*; w22 w20 w18 w16 */ + 26722, 6270, -6270, -17855, /*; w23 w21 w19 w17 */ + 17855, 6270, 6270, 26722, /*; w30 w28 w26 w24 */ + -31521, 26722, -17855, -31521}; /* w31 w29 w27 w25 */ + +/*; Table for rows 2,6 - constants are multiplied by cos_2_16 */ const short tab_i_26[] = { - 21407, 21407, 21407, -21407, // ; movq-> w06 w04 w02 w00 - 27969, 11585, 11585, -27969, // ; w07 w05 w03 w01 - 21407, -21407, 21407, 21407, // ; w14 w12 w10 w08 - -11585, 27969, -27969, -11585, // ;w15 w13 w11 w09 - 29692, 16819, 25172, -29692, // ;w22 w20 w18 w16 - 25172, 5906, -5906, -16819, // ;w23 w21 w19 w17 - 16819, 5906, 5906, 25172, // ;w30 w28 w26 w24 - -29692, 25172, -16819, -29692}; // ;w31 w29 w27 w25 + 21407, 21407, 21407, -21407, /* ; movq-> w06 w04 w02 w00 */ + 27969, 11585, 11585, -27969, /* ; w07 
w05 w03 w01 */ + 21407, -21407, 21407, 21407, /* ; w14 w12 w10 w08 */ + -11585, 27969, -27969, -11585, /* ;w15 w13 w11 w09 */ + 29692, 16819, 25172, -29692, /* ;w22 w20 w18 w16 */ + 25172, 5906, -5906, -16819, /* ;w23 w21 w19 w17 */ + 16819, 5906, 5906, 25172, /* ;w30 w28 w26 w24 */ + -29692, 25172, -16819, -29692}; /* ;w31 w29 w27 w25 */ -//; Table for rows 3,5 - constants are multiplied by cos_3_16 +/*; Table for rows 3,5 - constants are multiplied by cos_3_16 */ const short tab_i_35[] = { - 19266, 19266, 19266, -19266, //; movq-> w06 w04 w02 w00 - 25172, 10426, 10426, -25172, //; w07 w05 w03 w01 - 19266, -19266, 19266, 19266, //; w14 w12 w10 w08 - -10426, 25172, -25172, -10426, //; w15 w13 w11 w09 - 26722, 15137, 22654, -26722, //; w22 w20 w18 w16 - 22654, 5315, -5315, -15137, //; w23 w21 w19 w17 - 15137, 5315, 5315, 22654, //; w30 w28 w26 w24 - -26722, 22654, -15137, -26722}; //; w31 w29 w27 w25 + 19266, 19266, 19266, -19266, /*; movq-> w06 w04 w02 w00 */ + 25172, 10426, 10426, -25172, /*; w07 w05 w03 w01 */ + 19266, -19266, 19266, 19266, /*; w14 w12 w10 w08 */ + -10426, 25172, -25172, -10426, /*; w15 w13 w11 w09 */ + 26722, 15137, 22654, -26722, /*; w22 w20 w18 w16 */ + 22654, 5315, -5315, -15137, /*; w23 w21 w19 w17 */ + 15137, 5315, 5315, 22654, /*; w30 w28 w26 w24 */ + -26722, 22654, -15137, -26722}; /*; w31 w29 w27 w25 */ */ -// CONCATENATED TABLE, rows 0,1,2,3,4,5,6,7 (in order ) -// -// In our implementation, however, we only use row0 ! -// +/* CONCATENATED TABLE, rows 0,1,2,3,4,5,6,7 (in order ) */ +/* */ +/* In our implementation, however, we only use row0 ! 
*/ +/* */ static const short tab_i_01234567[] = { - //row0, this row is required - 16384, 16384, 16384, -16384, // ; movq-> w06 w04 w02 w00 - 21407, 8867, 8867, -21407, // w07 w05 w03 w01 - 16384, -16384, 16384, 16384, //; w14 w12 w10 w08 - -8867, 21407, -21407, -8867, //; w15 w13 w11 w09 - 22725, 12873, 19266, -22725, //; w22 w20 w18 w16 - 19266, 4520, -4520, -12873, //; w23 w21 w19 w17 - 12873, 4520, 4520, 19266, //; w30 w28 w26 w24 - -22725, 19266, -12873, -22725, //w31 w29 w27 w25 - - // the rest of these rows (1-7), aren't used ! - - //row1 - 22725, 22725, 22725, -22725, // ; movq-> w06 w04 w02 w00 - 29692, 12299, 12299, -29692, // ; w07 w05 w03 w01 - 22725, -22725, 22725, 22725, //; w14 w12 w10 w08 - -12299, 29692, -29692, -12299, //; w15 w13 w11 w09 - 31521, 17855, 26722, -31521, //; w22 w20 w18 w16 - 26722, 6270, -6270, -17855, //; w23 w21 w19 w17 - 17855, 6270, 6270, 26722, //; w30 w28 w26 w24 - -31521, 26722, -17855, -31521, // w31 w29 w27 w25 - - //row2 - 21407, 21407, 21407, -21407, // ; movq-> w06 w04 w02 w00 - 27969, 11585, 11585, -27969, // ; w07 w05 w03 w01 - 21407, -21407, 21407, 21407, // ; w14 w12 w10 w08 - -11585, 27969, -27969, -11585, // ;w15 w13 w11 w09 - 29692, 16819, 25172, -29692, // ;w22 w20 w18 w16 - 25172, 5906, -5906, -16819, // ;w23 w21 w19 w17 - 16819, 5906, 5906, 25172, // ;w30 w28 w26 w24 - -29692, 25172, -16819, -29692, // ;w31 w29 w27 w25 - - //row3 - 19266, 19266, 19266, -19266, //; movq-> w06 w04 w02 w00 - 25172, 10426, 10426, -25172, //; w07 w05 w03 w01 - 19266, -19266, 19266, 19266, //; w14 w12 w10 w08 - -10426, 25172, -25172, -10426, //; w15 w13 w11 w09 - 26722, 15137, 22654, -26722, //; w22 w20 w18 w16 - 22654, 5315, -5315, -15137, //; w23 w21 w19 w17 - 15137, 5315, 5315, 22654, //; w30 w28 w26 w24 - -26722, 22654, -15137, -26722, //; w31 w29 w27 w25 - - //row4 - 16384, 16384, 16384, -16384, // ; movq-> w06 w04 w02 w00 - 21407, 8867, 8867, -21407, // w07 w05 w03 w01 - 16384, -16384, 16384, 16384, //; w14 w12 w10 w08 - 
-8867, 21407, -21407, -8867, //; w15 w13 w11 w09 - 22725, 12873, 19266, -22725, //; w22 w20 w18 w16 - 19266, 4520, -4520, -12873, //; w23 w21 w19 w17 - 12873, 4520, 4520, 19266, //; w30 w28 w26 w24 - -22725, 19266, -12873, -22725, //w31 w29 w27 w25 - - //row5 - 19266, 19266, 19266, -19266, //; movq-> w06 w04 w02 w00 - 25172, 10426, 10426, -25172, //; w07 w05 w03 w01 - 19266, -19266, 19266, 19266, //; w14 w12 w10 w08 - -10426, 25172, -25172, -10426, //; w15 w13 w11 w09 - 26722, 15137, 22654, -26722, //; w22 w20 w18 w16 - 22654, 5315, -5315, -15137, //; w23 w21 w19 w17 - 15137, 5315, 5315, 22654, //; w30 w28 w26 w24 - -26722, 22654, -15137, -26722, //; w31 w29 w27 w25 - - //row6 - 21407, 21407, 21407, -21407, // ; movq-> w06 w04 w02 w00 - 27969, 11585, 11585, -27969, // ; w07 w05 w03 w01 - 21407, -21407, 21407, 21407, // ; w14 w12 w10 w08 - -11585, 27969, -27969, -11585, // ;w15 w13 w11 w09 - 29692, 16819, 25172, -29692, // ;w22 w20 w18 w16 - 25172, 5906, -5906, -16819, // ;w23 w21 w19 w17 - 16819, 5906, 5906, 25172, // ;w30 w28 w26 w24 - -29692, 25172, -16819, -29692, // ;w31 w29 w27 w25 - - //row7 - 22725, 22725, 22725, -22725, // ; movq-> w06 w04 w02 w00 - 29692, 12299, 12299, -29692, // ; w07 w05 w03 w01 - 22725, -22725, 22725, 22725, //; w14 w12 w10 w08 - -12299, 29692, -29692, -12299, //; w15 w13 w11 w09 - 31521, 17855, 26722, -31521, //; w22 w20 w18 w16 - 26722, 6270, -6270, -17855, //; w23 w21 w19 w17 - 17855, 6270, 6270, 26722, //; w30 w28 w26 w24 - -31521, 26722, -17855, -31521}; // w31 w29 w27 w25 - - -#define INP eax // pointer to (short *blk) -#define OUT ecx // pointer to output (temporary store space qwTemp[]) -#define TABLE ebx // pointer to tab_i_01234567[] + /*row0, this row is required */ + 16384, 16384, 16384, -16384, /* ; movq-> w06 w04 w02 w00 */ + 21407, 8867, 8867, -21407, /* w07 w05 w03 w01 */ + 16384, -16384, 16384, 16384, /*; w14 w12 w10 w08 */ + -8867, 21407, -21407, -8867, /*; w15 w13 w11 w09 */ + 22725, 12873, 19266, -22725, /*; w22 w20 
w18 w16 */ + 19266, 4520, -4520, -12873, /*; w23 w21 w19 w17 */ + 12873, 4520, 4520, 19266, /*; w30 w28 w26 w24 */ + -22725, 19266, -12873, -22725, /*w31 w29 w27 w25 */ + + /* the rest of these rows (1-7), aren't used ! */ + + /*row1 */ + 22725, 22725, 22725, -22725, /* ; movq-> w06 w04 w02 w00 */ + 29692, 12299, 12299, -29692, /* ; w07 w05 w03 w01 */ + 22725, -22725, 22725, 22725, /*; w14 w12 w10 w08 */ + -12299, 29692, -29692, -12299, /*; w15 w13 w11 w09 */ + 31521, 17855, 26722, -31521, /*; w22 w20 w18 w16 */ + 26722, 6270, -6270, -17855, /*; w23 w21 w19 w17 */ + 17855, 6270, 6270, 26722, /*; w30 w28 w26 w24 */ + -31521, 26722, -17855, -31521, /* w31 w29 w27 w25 */ + + /*row2 */ + 21407, 21407, 21407, -21407, /* ; movq-> w06 w04 w02 w00 */ + 27969, 11585, 11585, -27969, /* ; w07 w05 w03 w01 */ + 21407, -21407, 21407, 21407, /* ; w14 w12 w10 w08 */ + -11585, 27969, -27969, -11585, /* ;w15 w13 w11 w09 */ + 29692, 16819, 25172, -29692, /* ;w22 w20 w18 w16 */ + 25172, 5906, -5906, -16819, /* ;w23 w21 w19 w17 */ + 16819, 5906, 5906, 25172, /* ;w30 w28 w26 w24 */ + -29692, 25172, -16819, -29692, /* ;w31 w29 w27 w25 */ + + /*row3 */ + 19266, 19266, 19266, -19266, /*; movq-> w06 w04 w02 w00 */ + 25172, 10426, 10426, -25172, /*; w07 w05 w03 w01 */ + 19266, -19266, 19266, 19266, /*; w14 w12 w10 w08 */ + -10426, 25172, -25172, -10426, /*; w15 w13 w11 w09 */ + 26722, 15137, 22654, -26722, /*; w22 w20 w18 w16 */ + 22654, 5315, -5315, -15137, /*; w23 w21 w19 w17 */ + 15137, 5315, 5315, 22654, /*; w30 w28 w26 w24 */ + -26722, 22654, -15137, -26722, /*; w31 w29 w27 w25 */ + + /*row4 */ + 16384, 16384, 16384, -16384, /* ; movq-> w06 w04 w02 w00 */ + 21407, 8867, 8867, -21407, /* w07 w05 w03 w01 */ + 16384, -16384, 16384, 16384, /*; w14 w12 w10 w08 */ + -8867, 21407, -21407, -8867, /*; w15 w13 w11 w09 */ + 22725, 12873, 19266, -22725, /*; w22 w20 w18 w16 */ + 19266, 4520, -4520, -12873, /*; w23 w21 w19 w17 */ + 12873, 4520, 4520, 19266, /*; w30 w28 w26 w24 */ + -22725, 19266, 
-12873, -22725, /*w31 w29 w27 w25 */ + + /*row5 */ + 19266, 19266, 19266, -19266, /*; movq-> w06 w04 w02 w00 */ + 25172, 10426, 10426, -25172, /*; w07 w05 w03 w01 */ + 19266, -19266, 19266, 19266, /*; w14 w12 w10 w08 */ + -10426, 25172, -25172, -10426, /*; w15 w13 w11 w09 */ + 26722, 15137, 22654, -26722, /*; w22 w20 w18 w16 */ + 22654, 5315, -5315, -15137, /*; w23 w21 w19 w17 */ + 15137, 5315, 5315, 22654, /*; w30 w28 w26 w24 */ + -26722, 22654, -15137, -26722, /*; w31 w29 w27 w25 */ + + /*row6 */ + 21407, 21407, 21407, -21407, /* ; movq-> w06 w04 w02 w00 */ + 27969, 11585, 11585, -27969, /* ; w07 w05 w03 w01 */ + 21407, -21407, 21407, 21407, /* ; w14 w12 w10 w08 */ + -11585, 27969, -27969, -11585, /* ;w15 w13 w11 w09 */ + 29692, 16819, 25172, -29692, /* ;w22 w20 w18 w16 */ + 25172, 5906, -5906, -16819, /* ;w23 w21 w19 w17 */ + 16819, 5906, 5906, 25172, /* ;w30 w28 w26 w24 */ + -29692, 25172, -16819, -29692, /* ;w31 w29 w27 w25 */ + + /*row7 */ + 22725, 22725, 22725, -22725, /* ; movq-> w06 w04 w02 w00 */ + 29692, 12299, 12299, -29692, /* ; w07 w05 w03 w01 */ + 22725, -22725, 22725, 22725, /*; w14 w12 w10 w08 */ + -12299, 29692, -29692, -12299, /*; w15 w13 w11 w09 */ + 31521, 17855, 26722, -31521, /*; w22 w20 w18 w16 */ + 26722, 6270, -6270, -17855, /*; w23 w21 w19 w17 */ + 17855, 6270, 6270, 26722, /*; w30 w28 w26 w24 */ + -31521, 26722, -17855, -31521}; /* w31 w29 w27 w25 */ + + +#define INP eax /* pointer to (short *blk) */ +#define OUT ecx /* pointer to output (temporary store space qwTemp[]) */ +#define TABLE ebx /* pointer to tab_i_01234567[] */ #define round_inv_row edx #define round_inv_col edx -#define ROW_STRIDE 8 // for 8x8 matrix transposer +#define ROW_STRIDE 8 /* for 8x8 matrix transposer */ -// private variables and functions +/* private variables and functions */ -//temporary storage space, 8x8 of shorts +/*temporary storage space, 8x8 of shorts */ -__inline static void idct_mmx32_rows( short *blk ); // transform rows -__inline static void 
idct_mmx32_cols( short *blk ); // transform "columns" - // the "column" transform actually transforms rows, it is - // identical to the row-transform except for the ROUNDING - // and SHIFTING coefficients. +__inline static void idct_mmx32_rows( short *blk ); /* transform rows */ +__inline static void idct_mmx32_cols( short *blk ); /* transform "columns" */ + /* the "column" transform actually transforms rows, it is */ + /* identical to the row-transform except for the ROUNDING */ + /* and SHIFTING coefficients. */ static void -idct_mmx32_rows( short *blk ) // transform all 8 rows of 8x8 iDCT block +idct_mmx32_rows( short *blk ) /* transform all 8 rows of 8x8 iDCT block */ { int x; short qwTemp[64]; short *out = &qwTemp[0]; short *inptr = blk; - // this subroutine performs two operations - // 1) iDCT row transform - // for( i = 0; i < 8; ++ i) - // DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] ); - // - // 2) transpose the matrix (which was stored in qwTemp[]) - // qwTemp[] -> [8x8 matrix transpose] -> blk[] + /* this subroutine performs two operations */ + /* 1) iDCT row transform */ + /* for( i = 0; i < 8; ++ i) */ + /* DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] ); */ + /* */ + /* 2) transpose the matrix (which was stored in qwTemp[]) */ + /* qwTemp[] -> [8x8 matrix transpose] -> blk[] */ - for (x=0; x<8; x++) { // transform one row per iteration - movq_m2r(*(inptr), mm0); // 0 ; x3 x2 x1 x0 + for (x=0; x<8; x++) { /* transform one row per iteration */ + movq_m2r(*(inptr), mm0); /* 0 ; x3 x2 x1 x0 */ - movq_m2r(*(inptr+4), mm1); // 1 ; x7 x6 x5 x4 - movq_r2r(mm0, mm2); // 2 ; x3 x2 x1 x0 + movq_m2r(*(inptr+4), mm1); /* 1 ; x7 x6 x5 x4 */ + movq_r2r(mm0, mm2); /* 2 ; x3 x2 x1 x0 */ - movq_m2r(*(tab_i_01234567), mm3); // 3 ; w06 w04 w02 w00 - punpcklwd_r2r(mm1, mm0); // x5 x1 x4 x0 + movq_m2r(*(tab_i_01234567), mm3); /* 3 ; w06 w04 w02 w00 */ + punpcklwd_r2r(mm1, mm0); /* x5 x1 x4 x0 */ - // ---------- - movq_r2r(mm0, mm5); // 5 ; x5 x1 x4 x0 - punpckldq_r2r(mm0, mm0); // x4 x0 x4 
x0 + /* ---------- */ + movq_r2r(mm0, mm5); /* 5 ; x5 x1 x4 x0 */ + punpckldq_r2r(mm0, mm0); /* x4 x0 x4 x0 */ - movq_m2r(*(tab_i_01234567+4), mm4); // 4 ; w07 w05 w03 w01 - punpckhwd_r2r(mm1, mm2); // 1 ; x7 x3 x6 x2 + movq_m2r(*(tab_i_01234567+4), mm4); /* 4 ; w07 w05 w03 w01 */ + punpckhwd_r2r(mm1, mm2); /* 1 ; x7 x3 x6 x2 */ - pmaddwd_r2r(mm0, mm3); // x4*w06+x0*w04 x4*w02+x0*w00 - movq_r2r(mm2, mm6); // 6 ; x7 x3 x6 x2 + pmaddwd_r2r(mm0, mm3); /* x4*w06+x0*w04 x4*w02+x0*w00 */ + movq_r2r(mm2, mm6); /* 6 ; x7 x3 x6 x2 */ - movq_m2r(*(tab_i_01234567+16), mm1);// 1 ; w22 w20 w18 w16 - punpckldq_r2r(mm2, mm2); // x6 x2 x6 x2 + movq_m2r(*(tab_i_01234567+16), mm1);/* 1 ; w22 w20 w18 w16 */ + punpckldq_r2r(mm2, mm2); /* x6 x2 x6 x2 */ - pmaddwd_r2r(mm2, mm4); // x6*w07+x2*w05 x6*w03+x2*w01 - punpckhdq_r2r(mm5, mm5); // x5 x1 x5 x1 + pmaddwd_r2r(mm2, mm4); /* x6*w07+x2*w05 x6*w03+x2*w01 */ + punpckhdq_r2r(mm5, mm5); /* x5 x1 x5 x1 */ - pmaddwd_m2r(*(tab_i_01234567+8), mm0);// x4*w14+x0*w12 x4*w10+x0*w08 - punpckhdq_r2r(mm6, mm6); // x7 x3 x7 x3 + pmaddwd_m2r(*(tab_i_01234567+8), mm0);/* x4*w14+x0*w12 x4*w10+x0*w08 */ + punpckhdq_r2r(mm6, mm6); /* x7 x3 x7 x3 */ - movq_m2r(*(tab_i_01234567+20), mm7);// 7 ; w23 w21 w19 w17 - pmaddwd_r2r(mm5, mm1); // x5*w22+x1*w20 x5*w18+x1*w16 + movq_m2r(*(tab_i_01234567+20), mm7);/* 7 ; w23 w21 w19 w17 */ + pmaddwd_r2r(mm5, mm1); /* x5*w22+x1*w20 x5*w18+x1*w16 */ - paddd_m2r(*(r_inv_row), mm3);// +rounder - pmaddwd_r2r(mm6, mm7); // x7*w23+x3*w21 x7*w19+x3*w17 + paddd_m2r(*(r_inv_row), mm3);/* +rounder */ + pmaddwd_r2r(mm6, mm7); /* x7*w23+x3*w21 x7*w19+x3*w17 */ - pmaddwd_m2r(*(tab_i_01234567+12), mm2);// x6*w15+x2*w13 x6*w11+x2*w09 - paddd_r2r(mm4, mm3); // 4 ; a1=sum(even1) a0=sum(even0) + pmaddwd_m2r(*(tab_i_01234567+12), mm2);/* x6*w15+x2*w13 x6*w11+x2*w09 */ + paddd_r2r(mm4, mm3); /* 4 ; a1=sum(even1) a0=sum(even0) */ - pmaddwd_m2r(*(tab_i_01234567+24), mm5);// x5*w30+x1*w28 x5*w26+x1*w24 - movq_r2r(mm3, mm4); // 4 ; a1 a0 + 
pmaddwd_m2r(*(tab_i_01234567+24), mm5);/* x5*w30+x1*w28 x5*w26+x1*w24 */ + movq_r2r(mm3, mm4); /* 4 ; a1 a0 */ - pmaddwd_m2r(*(tab_i_01234567+28), mm6);// x7*w31+x3*w29 x7*w27+x3*w25 - paddd_r2r(mm7, mm1); // 7 ; b1=sum(odd1) b0=sum(odd0) + pmaddwd_m2r(*(tab_i_01234567+28), mm6);/* x7*w31+x3*w29 x7*w27+x3*w25 */ + paddd_r2r(mm7, mm1); /* 7 ; b1=sum(odd1) b0=sum(odd0) */ - paddd_m2r(*(r_inv_row), mm0);// +rounder - psubd_r2r(mm1, mm3); // a1-b1 a0-b0 + paddd_m2r(*(r_inv_row), mm0);/* +rounder */ + psubd_r2r(mm1, mm3); /* a1-b1 a0-b0 */ - psrad_i2r(SHIFT_INV_ROW, mm3); // y6=a1-b1 y7=a0-b0 - paddd_r2r(mm4, mm1); // 4 ; a1+b1 a0+b0 + psrad_i2r(SHIFT_INV_ROW, mm3); /* y6=a1-b1 y7=a0-b0 */ + paddd_r2r(mm4, mm1); /* 4 ; a1+b1 a0+b0 */ - paddd_r2r(mm2, mm0); // 2 ; a3=sum(even3) a2=sum(even2) - psrad_i2r(SHIFT_INV_ROW, mm1); // y1=a1+b1 y0=a0+b0 + paddd_r2r(mm2, mm0); /* 2 ; a3=sum(even3) a2=sum(even2) */ + psrad_i2r(SHIFT_INV_ROW, mm1); /* y1=a1+b1 y0=a0+b0 */ - paddd_r2r(mm6, mm5); // 6 ; b3=sum(odd3) b2=sum(odd2) - movq_r2r(mm0, mm4); // 4 ; a3 a2 + paddd_r2r(mm6, mm5); /* 6 ; b3=sum(odd3) b2=sum(odd2) */ + movq_r2r(mm0, mm4); /* 4 ; a3 a2 */ - paddd_r2r(mm5, mm0); // a3+b3 a2+b2 - psubd_r2r(mm5, mm4); // 5 ; a3-b3 a2-b2 + paddd_r2r(mm5, mm0); /* a3+b3 a2+b2 */ + psubd_r2r(mm5, mm4); /* 5 ; a3-b3 a2-b2 */ - psrad_i2r(SHIFT_INV_ROW, mm4); // y4=a3-b3 y5=a2-b2 - psrad_i2r(SHIFT_INV_ROW, mm0); // y3=a3+b3 y2=a2+b2 + psrad_i2r(SHIFT_INV_ROW, mm4); /* y4=a3-b3 y5=a2-b2 */ + psrad_i2r(SHIFT_INV_ROW, mm0); /* y3=a3+b3 y2=a2+b2 */ - packssdw_r2r(mm3, mm4); // 3 ; y6 y7 y4 y5 + packssdw_r2r(mm3, mm4); /* 3 ; y6 y7 y4 y5 */ - packssdw_r2r(mm0, mm1); // 0 ; y3 y2 y1 y0 - movq_r2r(mm4, mm7); // 7 ; y6 y7 y4 y5 + packssdw_r2r(mm0, mm1); /* 0 ; y3 y2 y1 y0 */ + movq_r2r(mm4, mm7); /* 7 ; y6 y7 y4 y5 */ - psrld_i2r(16, mm4); // 0 y6 0 y4 + psrld_i2r(16, mm4); /* 0 y6 0 y4 */ - movq_r2m(mm1, *(out)); // 1 ; save y3 y2 y1 y0 - pslld_i2r(16, mm7); // y7 0 y5 0 + movq_r2m(mm1, *(out)); 
/* 1 ; save y3 y2 y1 y0 */ + pslld_i2r(16, mm7); /* y7 0 y5 0 */ - por_r2r(mm4, mm7); // 4 ; y7 y6 y5 y4 + por_r2r(mm4, mm7); /* 4 ; y7 y6 y5 y4 */ - // begin processing row 1 - movq_r2m(mm7, *(out+4)); // 7 ; save y7 y6 y5 y4 + /* begin processing row 1 */ + movq_r2m(mm7, *(out+4)); /* 7 ; save y7 y6 y5 y4 */ inptr += 8; out += 8; } - // done with the iDCT row-transformation + /* done with the iDCT row-transformation */ - // now we have to transpose the output 8x8 matrix - // 8x8 (OUT) -> 8x8't' (IN) - // the transposition is implemented as 4 sub-operations. - // 1) transpose upper-left quad - // 2) transpose lower-right quad - // 3) transpose lower-left quad - // 4) transpose upper-right quad + /* now we have to transpose the output 8x8 matrix */ + /* 8x8 (OUT) -> 8x8't' (IN) */ + /* the transposition is implemented as 4 sub-operations. */ + /* 1) transpose upper-left quad */ + /* 2) transpose lower-right quad */ + /* 3) transpose lower-left quad */ + /* 4) transpose upper-right quad */ - // mm0 = 1st row [ A B C D ] row1 - // mm1 = 2nd row [ E F G H ] 2 - // mm2 = 3rd row [ I J K L ] 3 - // mm3 = 4th row [ M N O P ] 4 + /* mm0 = 1st row [ A B C D ] row1 */ + /* mm1 = 2nd row [ E F G H ] 2 */ + /* mm2 = 3rd row [ I J K L ] 3 */ + /* mm3 = 4th row [ M N O P ] 4 */ - // 1) transpose upper-left quad + /* 1) transpose upper-left quad */ out = &qwTemp[0]; movq_m2r(*(out + ROW_STRIDE * 0), mm0); movq_m2r(*(out + ROW_STRIDE * 1), mm1); - movq_r2r(mm0, mm4); // mm4 = copy of row1[A B C D] + movq_r2r(mm0, mm4); /* mm4 = copy of row1[A B C D] */ movq_m2r(*(out + ROW_STRIDE * 2), mm2); - punpcklwd_r2r(mm1, mm0); // mm0 = [ 0 4 1 5] + punpcklwd_r2r(mm1, mm0); /* mm0 = [ 0 4 1 5] */ movq_m2r(*(out + ROW_STRIDE * 3), mm3); - punpckhwd_r2r(mm1, mm4); // mm4 = [ 2 6 3 7] + punpckhwd_r2r(mm1, mm4); /* mm4 = [ 2 6 3 7] */ movq_r2r(mm2, mm6); - punpcklwd_r2r(mm3, mm2); // mm2 = [ 8 12 9 13] + punpcklwd_r2r(mm3, mm2); /* mm2 = [ 8 12 9 13] */ - punpckhwd_r2r(mm3, mm6); // mm6 = 10 
14 11 15] - movq_r2r(mm0, mm1); // mm1 = [ 0 4 1 5] + punpckhwd_r2r(mm3, mm6); /* mm6 = 10 14 11 15] */ + movq_r2r(mm0, mm1); /* mm1 = [ 0 4 1 5] */ inptr = blk; - punpckldq_r2r(mm2, mm0); // final result mm0 = row1 [0 4 8 12] + punpckldq_r2r(mm2, mm0); /* final result mm0 = row1 [0 4 8 12] */ - movq_r2r(mm4, mm3); // mm3 = [ 2 6 3 7] - punpckhdq_r2r(mm2, mm1); // mm1 = final result mm1 = row2 [1 5 9 13] + movq_r2r(mm4, mm3); /* mm3 = [ 2 6 3 7] */ + punpckhdq_r2r(mm2, mm1); /* mm1 = final result mm1 = row2 [1 5 9 13] */ - movq_r2m(mm0, *(inptr + ROW_STRIDE * 0)); // store row 1 - punpckldq_r2r(mm6, mm4); // final result mm4 = row3 [2 6 10 14] + movq_r2m(mm0, *(inptr + ROW_STRIDE * 0)); /* store row 1 */ + punpckldq_r2r(mm6, mm4); /* final result mm4 = row3 [2 6 10 14] */ -// begin reading next quadrant (lower-right) +/* begin reading next quadrant (lower-right) */ movq_m2r(*(out + ROW_STRIDE*4 + 4), mm0); - punpckhdq_r2r(mm6, mm3); // final result mm3 = row4 [3 7 11 15] + punpckhdq_r2r(mm6, mm3); /* final result mm3 = row4 [3 7 11 15] */ - movq_r2m(mm4, *(inptr + ROW_STRIDE * 2)); // store row 3 - movq_r2r(mm0, mm4); // mm4 = copy of row1[A B C D] + movq_r2m(mm4, *(inptr + ROW_STRIDE * 2)); /* store row 3 */ + movq_r2r(mm0, mm4); /* mm4 = copy of row1[A B C D] */ - movq_r2m(mm1, *(inptr + ROW_STRIDE * 1)); // store row 2 + movq_r2m(mm1, *(inptr + ROW_STRIDE * 1)); /* store row 2 */ movq_m2r(*(out + ROW_STRIDE*5 + 4), mm1); - movq_r2m(mm3, *(inptr + ROW_STRIDE * 3)); // store row 4 - punpcklwd_r2r(mm1, mm0); // mm0 = [ 0 4 1 5] + movq_r2m(mm3, *(inptr + ROW_STRIDE * 3)); /* store row 4 */ + punpcklwd_r2r(mm1, mm0); /* mm0 = [ 0 4 1 5] */ - // 2) transpose lower-right quadrant + /* 2) transpose lower-right quadrant */ -// movq mm0, qword ptr [OUT + ROW_STRIDE*4 + 8] +/* movq mm0, qword ptr [OUT + ROW_STRIDE*4 + 8] */ -// movq mm1, qword ptr [OUT + ROW_STRIDE*5 + 8] -// movq mm4, mm0; // mm4 = copy of row1[A B C D] +/* movq mm1, qword ptr [OUT + ROW_STRIDE*5 + 8] */ 
+/* movq mm4, mm0; // mm4 = copy of row1[A B C D] */ movq_m2r(*(out + ROW_STRIDE*6 + 4), mm2); -// punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5] - punpckhwd_r2r(mm1, mm4); // mm4 = [ 2 6 3 7] +/* punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5] */ + punpckhwd_r2r(mm1, mm4); /* mm4 = [ 2 6 3 7] */ movq_m2r(*(out + ROW_STRIDE*7 + 4), mm3); movq_r2r(mm2, mm6); - punpcklwd_r2r(mm3, mm2); // mm2 = [ 8 12 9 13] - movq_r2r(mm0, mm1); // mm1 = [ 0 4 1 5] + punpcklwd_r2r(mm3, mm2); /* mm2 = [ 8 12 9 13] */ + movq_r2r(mm0, mm1); /* mm1 = [ 0 4 1 5] */ - punpckhwd_r2r(mm3, mm6); // mm6 = 10 14 11 15] - movq_r2r(mm4, mm3); // mm3 = [ 2 6 3 7] + punpckhwd_r2r(mm3, mm6); /* mm6 = 10 14 11 15] */ + movq_r2r(mm4, mm3); /* mm3 = [ 2 6 3 7] */ - punpckldq_r2r(mm2, mm0); // final result mm0 = row1 [0 4 8 12] + punpckldq_r2r(mm2, mm0); /* final result mm0 = row1 [0 4 8 12] */ - punpckhdq_r2r(mm2, mm1); // mm1 = final result mm1 = row2 [1 5 9 13] - ; // slot + punpckhdq_r2r(mm2, mm1); /* mm1 = final result mm1 = row2 [1 5 9 13] */ + ; /* slot */ - movq_r2m(mm0, *(inptr + ROW_STRIDE*4 + 4)); // store row 1 - punpckldq_r2r(mm6, mm4); // final result mm4 = row3 [2 6 10 14] + movq_r2m(mm0, *(inptr + ROW_STRIDE*4 + 4)); /* store row 1 */ + punpckldq_r2r(mm6, mm4); /* final result mm4 = row3 [2 6 10 14] */ movq_m2r(*(out + ROW_STRIDE * 4 ), mm0); - punpckhdq_r2r(mm6, mm3); // final result mm3 = row4 [3 7 11 15] + punpckhdq_r2r(mm6, mm3); /* final result mm3 = row4 [3 7 11 15] */ - movq_r2m(mm4, *(inptr + ROW_STRIDE*6 + 4)); // store row 3 - movq_r2r(mm0, mm4); // mm4 = copy of row1[A B C D] + movq_r2m(mm4, *(inptr + ROW_STRIDE*6 + 4)); /* store row 3 */ + movq_r2r(mm0, mm4); /* mm4 = copy of row1[A B C D] */ - movq_r2m(mm1, *(inptr + ROW_STRIDE*5 + 4)); // store row 2 - ; // slot + movq_r2m(mm1, *(inptr + ROW_STRIDE*5 + 4)); /* store row 2 */ + ; /* slot */ movq_m2r(*(out + ROW_STRIDE * 5 ), mm1); - ; // slot + ; /* slot */ - movq_r2m(mm3, *(inptr + ROW_STRIDE*7 + 4)); // store row 4 - punpcklwd_r2r(mm1, 
mm0); // mm0 = [ 0 4 1 5] + movq_r2m(mm3, *(inptr + ROW_STRIDE*7 + 4)); /* store row 4 */ + punpcklwd_r2r(mm1, mm0); /* mm0 = [ 0 4 1 5] */ - // 3) transpose lower-left -// movq mm0, qword ptr [OUT + ROW_STRIDE * 4 ] + /* 3) transpose lower-left */ +/* movq mm0, qword ptr [OUT + ROW_STRIDE * 4 ] */ -// movq mm1, qword ptr [OUT + ROW_STRIDE * 5 ] -// movq mm4, mm0; // mm4 = copy of row1[A B C D] +/* movq mm1, qword ptr [OUT + ROW_STRIDE * 5 ] */ +/* movq mm4, mm0; // mm4 = copy of row1[A B C D] */ movq_m2r(*(out + ROW_STRIDE * 6 ), mm2); -// punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5] - punpckhwd_r2r(mm1, mm4); // mm4 = [ 2 6 3 7] +/* punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5] */ + punpckhwd_r2r(mm1, mm4); /* mm4 = [ 2 6 3 7] */ movq_m2r(*(out + ROW_STRIDE * 7 ), mm3); movq_r2r(mm2, mm6); - punpcklwd_r2r(mm3, mm2); // mm2 = [ 8 12 9 13] - movq_r2r(mm0, mm1); // mm1 = [ 0 4 1 5] + punpcklwd_r2r(mm3, mm2); /* mm2 = [ 8 12 9 13] */ + movq_r2r(mm0, mm1); /* mm1 = [ 0 4 1 5] */ - punpckhwd_r2r(mm3, mm6); // mm6 = 10 14 11 15] - movq_r2r(mm4, mm3); // mm3 = [ 2 6 3 7] + punpckhwd_r2r(mm3, mm6); /* mm6 = 10 14 11 15] */ + movq_r2r(mm4, mm3); /* mm3 = [ 2 6 3 7] */ - punpckldq_r2r(mm2, mm0); // final result mm0 = row1 [0 4 8 12] + punpckldq_r2r(mm2, mm0); /* final result mm0 = row1 [0 4 8 12] */ - punpckhdq_r2r(mm2, mm1); // mm1 = final result mm1 = row2 [1 5 9 13] - ;//slot + punpckhdq_r2r(mm2, mm1); /* mm1 = final result mm1 = row2 [1 5 9 13] */ + ;/*slot */ - movq_r2m(mm0, *(inptr + ROW_STRIDE * 0 + 4 )); // store row 1 - punpckldq_r2r(mm6, mm4); // final result mm4 = row3 [2 6 10 14] + movq_r2m(mm0, *(inptr + ROW_STRIDE * 0 + 4 )); /* store row 1 */ + punpckldq_r2r(mm6, mm4); /* final result mm4 = row3 [2 6 10 14] */ -// begin reading next quadrant (upper-right) +/* begin reading next quadrant (upper-right) */ movq_m2r(*(out + ROW_STRIDE*0 + 4), mm0); - punpckhdq_r2r(mm6, mm3); // final result mm3 = row4 [3 7 11 15] + punpckhdq_r2r(mm6, mm3); /* final result mm3 = row4 [3 7 11 
15] */ - movq_r2m(mm4, *(inptr + ROW_STRIDE * 2 + 4)); // store row 3 - movq_r2r(mm0, mm4); // mm4 = copy of row1[A B C D] + movq_r2m(mm4, *(inptr + ROW_STRIDE * 2 + 4)); /* store row 3 */ + movq_r2r(mm0, mm4); /* mm4 = copy of row1[A B C D] */ - movq_r2m(mm1, *(inptr + ROW_STRIDE * 1 + 4)); // store row 2 + movq_r2m(mm1, *(inptr + ROW_STRIDE * 1 + 4)); /* store row 2 */ movq_m2r(*(out + ROW_STRIDE*1 + 4), mm1); - movq_r2m(mm3, *(inptr + ROW_STRIDE * 3 + 4)); // store row 4 - punpcklwd_r2r(mm1, mm0); // mm0 = [ 0 4 1 5] + movq_r2m(mm3, *(inptr + ROW_STRIDE * 3 + 4)); /* store row 4 */ + punpcklwd_r2r(mm1, mm0); /* mm0 = [ 0 4 1 5] */ - // 2) transpose lower-right quadrant + /* 2) transpose lower-right quadrant */ -// movq mm0, qword ptr [OUT + ROW_STRIDE*4 + 8] +/* movq mm0, qword ptr [OUT + ROW_STRIDE*4 + 8] */ -// movq mm1, qword ptr [OUT + ROW_STRIDE*5 + 8] -// movq mm4, mm0; // mm4 = copy of row1[A B C D] +/* movq mm1, qword ptr [OUT + ROW_STRIDE*5 + 8] */ +/* movq mm4, mm0; // mm4 = copy of row1[A B C D] */ movq_m2r(*(out + ROW_STRIDE*2 + 4), mm2); -// punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5] - punpckhwd_r2r(mm1, mm4); // mm4 = [ 2 6 3 7] +/* punpcklwd mm0, mm1; // mm0 = [ 0 4 1 5] */ + punpckhwd_r2r(mm1, mm4); /* mm4 = [ 2 6 3 7] */ movq_m2r(*(out + ROW_STRIDE*3 + 4), mm3); movq_r2r(mm2, mm6); - punpcklwd_r2r(mm3, mm2); // mm2 = [ 8 12 9 13] - movq_r2r(mm0, mm1); // mm1 = [ 0 4 1 5] + punpcklwd_r2r(mm3, mm2); /* mm2 = [ 8 12 9 13] */ + movq_r2r(mm0, mm1); /* mm1 = [ 0 4 1 5] */ - punpckhwd_r2r(mm3, mm6); // mm6 = 10 14 11 15] - movq_r2r(mm4, mm3); // mm3 = [ 2 6 3 7] + punpckhwd_r2r(mm3, mm6); /* mm6 = 10 14 11 15] */ + movq_r2r(mm4, mm3); /* mm3 = [ 2 6 3 7] */ - punpckldq_r2r(mm2, mm0); // final result mm0 = row1 [0 4 8 12] + punpckldq_r2r(mm2, mm0); /* final result mm0 = row1 [0 4 8 12] */ - punpckhdq_r2r(mm2, mm1); // mm1 = final result mm1 = row2 [1 5 9 13] - ; // slot + punpckhdq_r2r(mm2, mm1); /* mm1 = final result mm1 = row2 [1 5 9 13] */ + ; /* slot 
*/ - movq_r2m(mm0, *(inptr + ROW_STRIDE*4)); // store row 1 - punpckldq_r2r(mm6, mm4); // final result mm4 = row3 [2 6 10 14] + movq_r2m(mm0, *(inptr + ROW_STRIDE*4)); /* store row 1 */ + punpckldq_r2r(mm6, mm4); /* final result mm4 = row3 [2 6 10 14] */ - movq_r2m(mm1, *(inptr + ROW_STRIDE*5)); // store row 2 - punpckhdq_r2r(mm6, mm3); // final result mm3 = row4 [3 7 11 15] + movq_r2m(mm1, *(inptr + ROW_STRIDE*5)); /* store row 2 */ + punpckhdq_r2r(mm6, mm3); /* final result mm3 = row4 [3 7 11 15] */ - movq_r2m(mm4, *(inptr + ROW_STRIDE*6)); // store row 3 - ; // slot + movq_r2m(mm4, *(inptr + ROW_STRIDE*6)); /* store row 3 */ + ; /* slot */ - movq_r2m(mm3, *(inptr + ROW_STRIDE*7)); // store row 4 - ; // slot + movq_r2m(mm3, *(inptr + ROW_STRIDE*7)); /* store row 4 */ + ; /* slot */ } static void -idct_mmx32_cols( short *blk ) // transform all 8 cols of 8x8 iDCT block +idct_mmx32_cols( short *blk ) /* transform all 8 cols of 8x8 iDCT block */ { int x; short *inptr = blk; - // Despite the function's name, the matrix is transformed - // row by row. This function is identical to idct_mmx32_rows(), - // except for the SHIFT amount and ROUND_INV amount. + /* Despite the function's name, the matrix is transformed */ + /* row by row. This function is identical to idct_mmx32_rows(), */ + /* except for the SHIFT amount and ROUND_INV amount. 
*/ - // this subroutine performs two operations - // 1) iDCT row transform - // for( i = 0; i < 8; ++ i) - // DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] ); - // - // 2) transpose the matrix (which was stored in qwTemp[]) - // qwTemp[] -> [8x8 matrix transpose] -> blk[] + /* this subroutine performs two operations */ + /* 1) iDCT row transform */ + /* for( i = 0; i < 8; ++ i) */ + /* DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] ); */ + /* */ + /* 2) transpose the matrix (which was stored in qwTemp[]) */ + /* qwTemp[] -> [8x8 matrix transpose] -> blk[] */ - for (x=0; x<8; x++) { // transform one row per iteration + for (x=0; x<8; x++) { /* transform one row per iteration */ - movq_m2r(*(inptr), mm0); // 0 ; x3 x2 x1 x0 + movq_m2r(*(inptr), mm0); /* 0 ; x3 x2 x1 x0 */ - movq_m2r(*(inptr+4), mm1); // 1 ; x7 x6 x5 x4 - movq_r2r(mm0, mm2); // 2 ; x3 x2 x1 x0 + movq_m2r(*(inptr+4), mm1); /* 1 ; x7 x6 x5 x4 */ + movq_r2r(mm0, mm2); /* 2 ; x3 x2 x1 x0 */ - movq_m2r(*(tab_i_01234567), mm3); // 3 ; w06 w04 w02 w00 - punpcklwd_r2r(mm1, mm0); // x5 x1 x4 x0 + movq_m2r(*(tab_i_01234567), mm3); /* 3 ; w06 w04 w02 w00 */ + punpcklwd_r2r(mm1, mm0); /* x5 x1 x4 x0 */ -// ---------- - movq_r2r(mm0, mm5); // 5 ; x5 x1 x4 x0 - punpckldq_r2r(mm0, mm0); // x4 x0 x4 x0 +/* ---------- */ + movq_r2r(mm0, mm5); /* 5 ; x5 x1 x4 x0 */ + punpckldq_r2r(mm0, mm0); /* x4 x0 x4 x0 */ - movq_m2r(*(tab_i_01234567+4), mm4); // 4 ; w07 w05 w03 w01 - punpckhwd_r2r(mm1, mm2); // 1 ; x7 x3 x6 x2 + movq_m2r(*(tab_i_01234567+4), mm4); /* 4 ; w07 w05 w03 w01 */ + punpckhwd_r2r(mm1, mm2); /* 1 ; x7 x3 x6 x2 */ - pmaddwd_r2r(mm0, mm3); // x4*w06+x0*w04 x4*w02+x0*w00 - movq_r2r(mm2, mm6); // 6 ; x7 x3 x6 x2 + pmaddwd_r2r(mm0, mm3); /* x4*w06+x0*w04 x4*w02+x0*w00 */ + movq_r2r(mm2, mm6); /* 6 ; x7 x3 x6 x2 */ - movq_m2r(*(tab_i_01234567+16), mm1);// 1 ; w22 w20 w18 w16 - punpckldq_r2r(mm2, mm2); // x6 x2 x6 x2 + movq_m2r(*(tab_i_01234567+16), mm1);/* 1 ; w22 w20 w18 w16 */ + punpckldq_r2r(mm2, mm2); /* x6 x2 x6 x2 */ - 
pmaddwd_r2r(mm2, mm4); // x6*w07+x2*w05 x6*w03+x2*w01 - punpckhdq_r2r(mm5, mm5); // x5 x1 x5 x1 + pmaddwd_r2r(mm2, mm4); /* x6*w07+x2*w05 x6*w03+x2*w01 */ + punpckhdq_r2r(mm5, mm5); /* x5 x1 x5 x1 */ - pmaddwd_m2r(*(tab_i_01234567+8), mm0);// x4*w14+x0*w12 x4*w10+x0*w08 - punpckhdq_r2r(mm6, mm6); // x7 x3 x7 x3 + pmaddwd_m2r(*(tab_i_01234567+8), mm0);/* x4*w14+x0*w12 x4*w10+x0*w08 */ + punpckhdq_r2r(mm6, mm6); /* x7 x3 x7 x3 */ - movq_m2r(*(tab_i_01234567+20), mm7);// 7 ; w23 w21 w19 w17 - pmaddwd_r2r(mm5, mm1); // x5*w22+x1*w20 x5*w18+x1*w16 + movq_m2r(*(tab_i_01234567+20), mm7);/* 7 ; w23 w21 w19 w17 */ + pmaddwd_r2r(mm5, mm1); /* x5*w22+x1*w20 x5*w18+x1*w16 */ - paddd_m2r(*(r_inv_col), mm3);// +rounder - pmaddwd_r2r(mm6, mm7); // x7*w23+x3*w21 x7*w19+x3*w17 + paddd_m2r(*(r_inv_col), mm3);/* +rounder */ + pmaddwd_r2r(mm6, mm7); /* x7*w23+x3*w21 x7*w19+x3*w17 */ - pmaddwd_m2r(*(tab_i_01234567+12), mm2);// x6*w15+x2*w13 x6*w11+x2*w09 - paddd_r2r(mm4, mm3); // 4 ; a1=sum(even1) a0=sum(even0) + pmaddwd_m2r(*(tab_i_01234567+12), mm2);/* x6*w15+x2*w13 x6*w11+x2*w09 */ + paddd_r2r(mm4, mm3); /* 4 ; a1=sum(even1) a0=sum(even0) */ - pmaddwd_m2r(*(tab_i_01234567+24), mm5);// x5*w30+x1*w28 x5*w26+x1*w24 - movq_r2r(mm3, mm4); // 4 ; a1 a0 + pmaddwd_m2r(*(tab_i_01234567+24), mm5);/* x5*w30+x1*w28 x5*w26+x1*w24 */ + movq_r2r(mm3, mm4); /* 4 ; a1 a0 */ - pmaddwd_m2r(*(tab_i_01234567+28), mm6);// x7*w31+x3*w29 x7*w27+x3*w25 - paddd_r2r(mm7, mm1); // 7 ; b1=sum(odd1) b0=sum(odd0) + pmaddwd_m2r(*(tab_i_01234567+28), mm6);/* x7*w31+x3*w29 x7*w27+x3*w25 */ + paddd_r2r(mm7, mm1); /* 7 ; b1=sum(odd1) b0=sum(odd0) */ - paddd_m2r(*(r_inv_col), mm0);// +rounder - psubd_r2r(mm1, mm3); // a1-b1 a0-b0 + paddd_m2r(*(r_inv_col), mm0);/* +rounder */ + psubd_r2r(mm1, mm3); /* a1-b1 a0-b0 */ - psrad_i2r(SHIFT_INV_COL, mm3); // y6=a1-b1 y7=a0-b0 - paddd_r2r(mm4, mm1); // 4 ; a1+b1 a0+b0 + psrad_i2r(SHIFT_INV_COL, mm3); /* y6=a1-b1 y7=a0-b0 */ + paddd_r2r(mm4, mm1); /* 4 ; a1+b1 a0+b0 */ - 
paddd_r2r(mm2, mm0); // 2 ; a3=sum(even3) a2=sum(even2) - psrad_i2r(SHIFT_INV_COL, mm1); // y1=a1+b1 y0=a0+b0 + paddd_r2r(mm2, mm0); /* 2 ; a3=sum(even3) a2=sum(even2) */ + psrad_i2r(SHIFT_INV_COL, mm1); /* y1=a1+b1 y0=a0+b0 */ - paddd_r2r(mm6, mm5); // 6 ; b3=sum(odd3) b2=sum(odd2) - movq_r2r(mm0, mm4); // 4 ; a3 a2 + paddd_r2r(mm6, mm5); /* 6 ; b3=sum(odd3) b2=sum(odd2) */ + movq_r2r(mm0, mm4); /* 4 ; a3 a2 */ - paddd_r2r(mm5, mm0); // a3+b3 a2+b2 - psubd_r2r(mm5, mm4); // 5 ; a3-b3 a2-b2 + paddd_r2r(mm5, mm0); /* a3+b3 a2+b2 */ + psubd_r2r(mm5, mm4); /* 5 ; a3-b3 a2-b2 */ - psrad_i2r(SHIFT_INV_COL, mm4); // y4=a3-b3 y5=a2-b2 - psrad_i2r(SHIFT_INV_COL, mm0); // y3=a3+b3 y2=a2+b2 + psrad_i2r(SHIFT_INV_COL, mm4); /* y4=a3-b3 y5=a2-b2 */ + psrad_i2r(SHIFT_INV_COL, mm0); /* y3=a3+b3 y2=a2+b2 */ - packssdw_r2r(mm3, mm4); // 3 ; y6 y7 y4 y5 + packssdw_r2r(mm3, mm4); /* 3 ; y6 y7 y4 y5 */ - packssdw_r2r(mm0, mm1); // 0 ; y3 y2 y1 y0 - movq_r2r(mm4, mm7); // 7 ; y6 y7 y4 y5 + packssdw_r2r(mm0, mm1); /* 0 ; y3 y2 y1 y0 */ + movq_r2r(mm4, mm7); /* 7 ; y6 y7 y4 y5 */ - psrld_i2r(16, mm4); // 0 y6 0 y4 + psrld_i2r(16, mm4); /* 0 y6 0 y4 */ - movq_r2m(mm1, *(inptr)); // 1 ; save y3 y2 y1 y0 - pslld_i2r(16, mm7); // y7 0 y5 0 + movq_r2m(mm1, *(inptr)); /* 1 ; save y3 y2 y1 y0 */ + pslld_i2r(16, mm7); /* y7 0 y5 0 */ - por_r2r(mm4, mm7); // 4 ; y7 y6 y5 y4 + por_r2r(mm4, mm7); /* 4 ; y7 y6 y5 y4 */ - // begin processing row 1 - movq_r2m(mm7, *(inptr+4)); // 7 ; save y7 y6 y5 y4 + /* begin processing row 1 */ + movq_r2m(mm7, *(inptr+4)); /* 7 ; save y7 y6 y5 y4 */ inptr += 8; } - // done with the iDCT column-transformation + /* done with the iDCT column-transformation */ } -// -// public interface to MMX32 IDCT 8x8 operation -// +/* */ +/* public interface to MMX32 IDCT 8x8 operation */ +/* */ void gst_idct_mmx32_idct( short *blk ) { - // 1) iDCT row transformation - idct_mmx32_rows( blk ); // 1) transform iDCT row, and transpose + /* 1) iDCT row transformation */ + 
idct_mmx32_rows( blk ); /* 1) transform iDCT row, and transpose */ - // 2) iDCT column transformation - idct_mmx32_cols( blk ); // 2) transform iDCT row, and transpose + /* 2) iDCT column transformation */ + idct_mmx32_cols( blk ); /* 2) transform iDCT row, and transpose */ - emms(); // restore processor state - // all done + emms(); /* restore processor state */ + /* all done */ } -- cgit v1.2.1