diff options
-rw-r--r-- | ChangeLog | 10 | ||||
-rw-r--r-- | gst/deinterlace2/tvtime/greedyh.asm | 475 | ||||
-rw-r--r-- | gst/deinterlace2/tvtime/greedyh.c | 266 | ||||
-rw-r--r-- | gst/deinterlace2/tvtime/greedyhmacros.h | 42 |
4 files changed, 452 insertions, 341 deletions
@@ -1,3 +1,13 @@ +2008-06-28 Sebastian Dröge <sebastian.droege@collabora.co.uk> + + * gst/deinterlace2/tvtime/greedyh.asm: + * gst/deinterlace2/tvtime/greedyh.c: (greedyDScaler_C), + (deinterlace_frame_di_greedyh), (dscaler_greedyh_get_method): + * gst/deinterlace2/tvtime/greedyhmacros.h: + Add a C implementation for the greedyh deinterlacing method, clean + up the code a bit and mark the SSE version as MMXEXT as it doesn't + require any SSE instructions. + 2008-06-27 Sebastian Dröge <sebastian.droege@collabora.co.uk> * gst/deinterlace2/gstdeinterlace2.c: diff --git a/gst/deinterlace2/tvtime/greedyh.asm b/gst/deinterlace2/tvtime/greedyh.asm index 8fd0ab66..fcd3a647 100644 --- a/gst/deinterlace2/tvtime/greedyh.asm +++ b/gst/deinterlace2/tvtime/greedyh.asm @@ -28,281 +28,216 @@ #include "x86-64_macros.inc" -void FUNCT_NAME( GstDeinterlace2 *object) +void +FUNCT_NAME (uint8_t * L1, uint8_t * L2, uint8_t * L3, uint8_t * L2P, + uint8_t * Dest, int size) { - int64_t i; - int InfoIsOdd = 0; - // in tight loop some vars are accessed faster in local storage - int64_t YMask = 0x00ff00ff00ff00ffull; // to keep only luma - int64_t UVMask = 0xff00ff00ff00ff00ull; // to keep only chroma - int64_t ShiftMask = 0xfefffefffefffeffull; // to avoid shifting chroma to luma - int64_t QW256 = 0x0100010001000100ull; // 4 256's - - // Set up our two parms that are actually evaluated for each pixel - i=GreedyMaxComb; - int64_t MaxComb = i << 56 | i << 48 | i << 40 | i << 32 | i << 24 | i << 16 | i << 8 | i; - - i = GreedyMotionThreshold; // scale to range of 0-257 - int64_t MotionThreshold = i << 48 | i << 32 | i << 16 | i | UVMask; - - i = GreedyMotionSense; // scale to range of 0-257 - int64_t MotionSense = i << 48 | i << 32 | i << 16 | i; - - int Line; - long LoopCtr; - unsigned int Pitch = object->field_stride; - - unsigned char* L1; // ptr to Line1, of 3 - unsigned char* L2; // ptr to Line2, the weave line - unsigned char* L3; // ptr to Line3 - - unsigned char* L2P; // ptr to prev Line2 - unsigned char* Dest = GST_BUFFER_DATA(object->out_buf); - - int64_t QW256B; - int64_t LastAvg=0; //interp value from left qword - - i = 0xffffffff - 256; - QW256B = i << 48 | i << 32 | i << 16 | i; // save a couple instr on PMINSW instruct. - - - // copy first even line no matter what, and the first odd line if we're - // processing an EVEN field. (note diff from other deint rtns.) - - if (object->field_history[object->history_count-1].flags == PICTURE_INTERLACED_BOTTOM) { - InfoIsOdd = 1; - - L1 = GST_BUFFER_DATA(object->field_history[object->history_count-2].buf); - L2 = GST_BUFFER_DATA(object->field_history[object->history_count-1].buf); - L3 = L1 + Pitch; - L2P = GST_BUFFER_DATA(object->field_history[object->history_count-3].buf); - - // copy first even line - object->pMemcpy(Dest, L1, object->line_length); - Dest += object->output_stride; - } - else { - InfoIsOdd = 0; - L1 = GST_BUFFER_DATA(object->field_history[object->history_count-2].buf); - L2 = GST_BUFFER_DATA(object->field_history[object->history_count-1].buf) + Pitch; - L3 = L1 + Pitch; - L2P = GST_BUFFER_DATA(object->field_history[object->history_count-3].buf) + Pitch; - - // copy first even line - object->pMemcpy(Dest, GST_BUFFER_DATA(object->field_history[0].buf), object->line_length); - Dest += object->output_stride; - // then first odd line - object->pMemcpy(Dest, L1, object->line_length); - Dest += object->output_stride; - } - - - long oldbx; - - for (Line = 0; Line < (object->field_height - 1); ++Line) { - LoopCtr = object->line_length / 8 - 1; // there are LineLength / 8 qwords per line but do 1 less, adj at end of loop - - // For ease of reading, the comments below assume that we're operating on an odd - // field (i.e., that InfoIsOdd is true). Assume the obvious for even lines.. - __asm__ __volatile__ - ( - // save ebx (-fPIC) - MOVX" %%"XBX", %[oldbx]\n\t" - - MOVX" %[L1], %%"XAX"\n\t" - LEAX" 8(%%"XAX"), %%"XBX"\n\t" // next qword needed by DJR - MOVX" %[L3], %%"XCX"\n\t" - SUBX" %%"XAX", %%"XCX"\n\t" // carry L3 addr as an offset - MOVX" %[L2P], %%"XDX"\n\t" - MOVX" %[L2], %%"XSI"\n\t" - MOVX" %[Dest], %%"XDI"\n\t" // DL1 if Odd or DL2 if Even - - ".align 8\n\t" - "1:\n\t" - - "movq (%%"XSI"), %%mm0\n\t" // L2 - the newest weave pixel value - "movq (%%"XAX"), %%mm1\n\t" // L1 - the top pixel - "movq (%%"XDX"), %%mm2\n\t" // L2P - the prev weave pixel - "movq (%%"XAX", %%"XCX"), %%mm3\n\t" // L3, next odd row - "movq %%mm1, %%mm6\n\t" // L1 - get simple single pixel interp - // pavgb mm6, mm3 // use macro below - V_PAVGB ("%%mm6", "%%mm3", "%%mm4", "%[ShiftMask]") - - // DJR - Diagonal Jaggie Reduction - // In the event that we are going to use an average (Bob) pixel we do not want a jagged - // stair step effect. To combat this we avg in the 2 horizontally adjacen pixels into the - // interpolated Bob mix. This will do horizontal smoothing for only the Bob'd pixels. - - "movq %[LastAvg], %%mm4\n\t" // the bob value from prev qword in row - "movq %%mm6, %[LastAvg]\n\t" // save for next pass - "psrlq $48, %%mm4\n\t" // right justify 1 pixel - "movq %%mm6, %%mm7\n\t" // copy of simple bob pixel - "psllq $16, %%mm7\n\t" // left justify 3 pixels - "por %%mm7, %%mm4\n\t" // and combine - - "movq (%%"XBX"), %%mm5\n\t" // next horiz qword from L1 - // pavgb mm5, qword ptr[ebx+ecx] // next horiz qword from L3, use macro below - V_PAVGB ("%%mm5", "(%%"XBX",%%"XCX")", "%%mm7", "%[ShiftMask]") - "psllq $48, %%mm5\n\t" // left just 1 pixel - "movq %%mm6, %%mm7\n\t" // another copy of simple bob pixel - "psrlq $16, %%mm7\n\t" // right just 3 pixels - "por %%mm7, %%mm5\n\t" // combine - // pavgb mm4, mm5 // avg of forward and prev by 1 pixel, use macro - V_PAVGB ("%%mm4", "%%mm5", "%%mm5", "%[ShiftMask]") // mm5 gets modified if MMX - // pavgb mm6, mm4 // avg of center and surround interp vals, use macro - V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]") - - // Don't do any more averaging than needed for mmx. It hurts performance and causes rounding errors. + // in tight loop some vars are accessed faster in local storage + int64_t YMask = 0x00ff00ff00ff00ffull; // to keep only luma + int64_t UVMask = 0xff00ff00ff00ff00ull; // to keep only chroma + int64_t ShiftMask = 0xfefefefefefefefeull; // to avoid shifting chroma to luma + int64_t QW256 = 0x0100010001000100ull; // 4 256's + int64_t MaxComb; + int64_t MotionThreshold; + int64_t MotionSense; + int64_t i; + long LoopCtr; + long oldbx; + + int64_t QW256B; + int64_t LastAvg = 0; //interp value from left qword + + // Set up our two parms that are actually evaluated for each pixel + i = GreedyMaxComb; + MaxComb = + i << 56 | i << 48 | i << 40 | i << 32 | i << 24 | i << 16 | i << 8 | i; + + i = GreedyMotionThreshold; // scale to range of 0-257 + MotionThreshold = i << 48 | i << 32 | i << 16 | i | UVMask; + + i = GreedyMotionSense; // scale to range of 0-257 + MotionSense = i << 48 | i << 32 | i << 16 | i; + + i = 0xffffffff - 256; + QW256B = i << 48 | i << 32 | i << 16 | i; // save a couple instr on PMINSW instruct. + + LoopCtr = size / 8 - 1; // there are LineLength / 8 qwords per line but do 1 less, adj at end of loop + + // For ease of reading, the comments below assume that we're operating on an odd + // field (i.e., that InfoIsOdd is true). Assume the obvious for even lines.. + __asm__ __volatile__ ( + // save ebx (-fPIC) + MOVX " %%" XBX ", %[oldbx]\n\t" + MOVX " %[L1], %%" XAX "\n\t" + LEAX " 8(%%" XAX "), %%" XBX "\n\t" // next qword needed by DJR + MOVX " %[L3], %%" XCX "\n\t" + SUBX " %%" XAX ", %%" XCX "\n\t" // carry L3 addr as an offset + MOVX " %[L2P], %%" XDX "\n\t" + MOVX " %[L2], %%" XSI "\n\t" + MOVX " %[Dest], %%" XDI "\n\t" // DL1 if Odd or DL2 if Even + + ".align 8\n\t" + "1:\n\t" + "movq (%%" XSI "), %%mm0\n\t" // L2 - the newest weave pixel value + "movq (%%" XAX "), %%mm1\n\t" // L1 - the top pixel + "movq (%%" XDX "), %%mm2\n\t" // L2P - the prev weave pixel + "movq (%%" XAX ", %%" XCX "), %%mm3\n\t" // L3, next odd row + "movq %%mm1, %%mm6\n\t" // L1 - get simple single pixel interp + + // pavgb mm6, mm3 // use macro below + V_PAVGB ("%%mm6", "%%mm3", "%%mm4", "%[ShiftMask]") + + // DJR - Diagonal Jaggie Reduction + // In the event that we are going to use an average (Bob) pixel we do not want a jagged + // stair step effect. To combat this we avg in the 2 horizontally adjacen pixels into the + // interpolated Bob mix. This will do horizontal smoothing for only the Bob'd pixels. + + "movq %[LastAvg], %%mm4\n\t" // the bob value from prev qword in row + "movq %%mm6, %[LastAvg]\n\t" // save for next pass + "psrlq $48, %%mm4\n\t" // right justify 1 pixel + "movq %%mm6, %%mm7\n\t" // copy of simple bob pixel + "psllq $16, %%mm7\n\t" // left justify 3 pixels + "por %%mm7, %%mm4\n\t" // and combine + "movq (%%" XBX "), %%mm5\n\t" // next horiz qword from L1 + // pavgb mm5, qword ptr[ebx+ecx] // next horiz qword from L3, use macro below + + V_PAVGB ("%%mm5", "(%%" XBX ",%%" XCX ")", "%%mm7", "%[ShiftMask]") + "psllq $48, %%mm5\n\t" // left just 1 pixel + "movq %%mm6, %%mm7\n\t" // another copy of simple bob pixel + "psrlq $16, %%mm7\n\t" // right just 3 pixels + "por %%mm7, %%mm5\n\t" // combine + // pavgb mm4, mm5 // avg of forward and prev by 1 pixel, use macro + V_PAVGB ("%%mm4", "%%mm5", "%%mm5", "%[ShiftMask]") // mm5 gets modified if MMX + // pavgb mm6, mm4 // avg of center and surround interp vals, use macro + V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]") + + // Don't do any more averaging than needed for mmx. It hurts performance and causes rounding errors. #ifndef IS_MMX - // pavgb mm4, mm6 // 1/4 center, 3/4 adjacent - V_PAVGB ("%%mm4", "%%mm6", "%%mm7", "%[ShiftMask]") - // pavgb mm6, mm4 // 3/8 center, 5/8 adjacent - V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]") + // pavgb mm4, mm6 // 1/4 center, 3/4 adjacent + V_PAVGB ("%%mm4", "%%mm6", "%%mm7", "%[ShiftMask]") + // pavgb mm6, mm4 // 3/8 center, 5/8 adjacent + V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]") #endif - // get abs value of possible L2 comb - "movq %%mm6, %%mm4\n\t" // work copy of interp val - "movq %%mm2, %%mm7\n\t" // L2 - "psubusb %%mm4, %%mm7\n\t" // L2 - avg - "movq %%mm4, %%mm5\n\t" // avg - "psubusb %%mm2, %%mm5\n\t" // avg - L2 - "por %%mm7, %%mm5\n\t" // abs(avg-L2) - - // get abs value of possible L2P comb - "movq %%mm0, %%mm7\n\t" // L2P - "psubusb %%mm4, %%mm7\n\t" // L2P - avg - "psubusb %%mm0, %%mm4\n\t" // avg - L2P - "por %%mm7, %%mm4\n\t" // abs(avg-L2P) - - // use L2 or L2P depending upon which makes smaller comb - "psubusb %%mm5, %%mm4\n\t" // see if it goes to zero - "psubusb %%mm5, %%mm5\n\t" // 0 - "pcmpeqb %%mm5, %%mm4\n\t" // if (mm4=0) then FF else 0 - "pcmpeqb %%mm4, %%mm5\n\t" // opposite of mm4 - - // if Comb(L2P) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55 - "pand %%mm2, %%mm5\n\t" // use L2 if mm5 == ff, else 0 - "pand %%mm0, %%mm4\n\t" // use L2P if mm4 = ff, else 0 - "por %%mm5, %%mm4\n\t" // may the best win - - // Inventory: at this point we have the following values: - // mm0 = L2P (or L2) - // mm1 = L1 - // mm2 = L2 (or L2P) - // mm3 = L3 - // mm4 = the best of L2,L2P weave pixel, base upon comb - // mm6 = the avg interpolated value, if we need to use it - - // Let's measure movement, as how much the weave pixel has changed - "movq %%mm2, %%mm7\n\t" - "psubusb %%mm0, %%mm2\n\t" - "psubusb %%mm7, %%mm0\n\t" - "por %%mm2, %%mm0\n\t" // abs value of change, used later - - // Now lets clip our chosen value to be not outside of the range - // of the high/low range L1-L3 by more than MaxComb. - // This allows some comb but limits the damages and also allows more - // detail than a boring oversmoothed clip. - "movq %%mm1, %%mm2\n\t" // copy L1 - // pmaxub mm2, mm3 // use macro - V_PMAXUB ("%%mm2", "%%mm3") // now = Max(L1,L3) - "movq %%mm1, %%mm5\n\t" // copy L1 - // pminub mm5, mm3 // now = Min(L1,L3), use macro - V_PMINUB ("%%mm5", "%%mm3", "%%mm7") - // allow the value to be above the high or below the low by amt of MaxComb - "psubusb %[MaxComb], %%mm5\n\t" // lower min by diff - "paddusb %[MaxComb], %%mm2\n\t" // increase max by diff - // pmaxub mm4, mm5 // now = Max(best,Min(L1,L3) use macro - V_PMAXUB ("%%mm4", "%%mm5") - // pminub mm4, mm2 // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped - V_PMINUB ("%%mm4", "%%mm2", "%%mm7") - - // Blend weave pixel with bob pixel, depending on motion val in mm0 - "psubusb %[MotionThreshold], %%mm0\n\t"// test Threshold, clear chroma change >>>?? - "pmullw %[MotionSense], %%mm0\n\t" // mul by user factor, keep low 16 bits - "movq %[QW256], %%mm7\n\t" -#ifdef HAVE_SSE - "pminsw %%mm7, %%mm0\n\t" // max = 256 + // get abs value of possible L2 comb + "movq %%mm6, %%mm4\n\t" // work copy of interp val + "movq %%mm2, %%mm7\n\t" // L2 + "psubusb %%mm4, %%mm7\n\t" // L2 - avg + "movq %%mm4, %%mm5\n\t" // avg + "psubusb %%mm2, %%mm5\n\t" // avg - L2 + "por %%mm7, %%mm5\n\t" // abs(avg-L2) + + // get abs value of possible L2P comb + "movq %%mm0, %%mm7\n\t" // L2P + "psubusb %%mm4, %%mm7\n\t" // L2P - avg + "psubusb %%mm0, %%mm4\n\t" // avg - L2P + "por %%mm7, %%mm4\n\t" // abs(avg-L2P) + + // use L2 or L2P depending upon which makes smaller comb + "psubusb %%mm5, %%mm4\n\t" // see if it goes to zero + "psubusb %%mm5, %%mm5\n\t" // 0 + "pcmpeqb %%mm5, %%mm4\n\t" // if (mm4=0) then FF else 0 + "pcmpeqb %%mm4, %%mm5\n\t" // opposite of mm4 + + // if Comb(L2P) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55 + "pand %%mm2, %%mm5\n\t" // use L2 if mm5 == ff, else 0 + "pand %%mm0, %%mm4\n\t" // use L2P if mm4 = ff, else 0 + "por %%mm5, %%mm4\n\t" // may the best win + + // Inventory: at this point we have the following values: + // mm0 = L2P (or L2) + // mm1 = L1 + // mm2 = L2 (or L2P) + // mm3 = L3 + // mm4 = the best of L2,L2P weave pixel, base upon comb + // mm6 = the avg interpolated value, if we need to use it + // Let's measure movement, as how much the weave pixel has changed + + "movq %%mm2, %%mm7\n\t" + "psubusb %%mm0, %%mm2\n\t" + "psubusb %%mm7, %%mm0\n\t" + "por %%mm2, %%mm0\n\t" // abs value of change, used later + + // Now lets clip our chosen value to be not outside of the range + // of the high/low range L1-L3 by more than MaxComb. + // This allows some comb but limits the damages and also allows more + // detail than a boring oversmoothed clip. + + "movq %%mm1, %%mm2\n\t" // copy L1 + // pmaxub mm2, mm3 // use macro + V_PMAXUB ("%%mm2", "%%mm3") // now = Max(L1,L3) + "movq %%mm1, %%mm5\n\t" // copy L1 + // pminub mm5, mm3 // now = Min(L1,L3), use macro + V_PMINUB ("%%mm5", "%%mm3", "%%mm7") + + // allow the value to be above the high or below the low by amt of MaxComb + "psubusb %[MaxComb], %%mm5\n\t" // lower min by diff + "paddusb %[MaxComb], %%mm2\n\t" // increase max by diff + // pmaxub mm4, mm5 // now = Max(best,Min(L1,L3) use macro + V_PMAXUB ("%%mm4", "%%mm5") + // pminub mm4, mm2 // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped + V_PMINUB ("%%mm4", "%%mm2", "%%mm7") + + // Blend weave pixel with bob pixel, depending on motion val in mm0 + "psubusb %[MotionThreshold], %%mm0\n\t" // test Threshold, clear chroma change >>>?? + "pmullw %[MotionSense], %%mm0\n\t" // mul by user factor, keep low 16 bits + "movq %[QW256], %%mm7\n\t" +#if SIMD_TYPE == MMXEXT + "pminsw %%mm7, %%mm0\n\t" // max = 256 #else - "paddusw %[QW256B], %%mm0\n\t" // add, may sat at fff.. - "psubusw %[QW256B], %%mm0\n\t" // now = Min(L1,256) -#endif - "psubusw %%mm0, %%mm7\n\t" // so the 2 sum to 256, weighted avg - "movq %%mm4, %%mm2\n\t" // save weave chroma info before trashing - "pand %[YMask], %%mm4\n\t" // keep only luma from calc'd value - "pmullw %%mm7, %%mm4\n\t" // use more weave for less motion - "pand %[YMask], %%mm6\n\t" // keep only luma from calc'd value - "pmullw %%mm0, %%mm6\n\t" // use more bob for large motion - "paddusw %%mm6, %%mm4\n\t" // combine - "psrlw $8, %%mm4\n\t" // div by 256 to get weighted avg - - // chroma comes from weave pixel - "pand %[UVMask], %%mm2\n\t" // keep chroma - "por %%mm4, %%mm2\n\t" // and combine - - V_MOVNTQ ("(%%"XDI")", "%%mm2") // move in our clipped best, use macro - - // bump ptrs and loop - LEAX" 8(%%"XAX"), %%"XAX"\n\t" - LEAX" 8(%%"XBX"), %%"XBX"\n\t" - LEAX" 8(%%"XDX"), %%"XDX"\n\t" - LEAX" 8(%%"XDI"), %%"XDI"\n\t" - LEAX" 8(%%"XSI"), %%"XSI"\n\t" - DECX" %[LoopCtr]\n\t" - "jg 1b\n\t" // loop if not to last line - // note P-III default assumes backward branches taken - "jl 1f\n\t" // done - MOVX" %%"XAX", %%"XBX"\n\t" // sharpness lookahead 1 byte only, be wrong on 1 - "jmp 1b\n\t" - - "1:\n\t" - MOVX" %[oldbx], %%"XBX"\n\t" - - : /* no outputs */ - - : [LastAvg] "m"(LastAvg), - [L1] "m"(L1), - [L3] "m"(L3), - [L2P] "m"(L2P), - [L2] "m"(L2), - [Dest] "m"(Dest), - [ShiftMask] "m"(ShiftMask), - [MaxComb] "m"(MaxComb), - [MotionThreshold] "m"(MotionThreshold), - [MotionSense] "m"(MotionSense), - [QW256B] "m"(QW256B), - [YMask] "m"(YMask), - [UVMask] "m"(UVMask), - [LoopCtr] "m"(LoopCtr), - [QW256] "m"(QW256), - [oldbx] "m"(oldbx) - - : XAX, XCX, XDX, XSI, XDI, -#ifdef HAVE_CPU_I386 - "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)", -#endif - /* FIXME: breaks unless compiling with -mmmx - "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", */ - "memory", "cc" - ); - - Dest += object->output_stride; - object->pMemcpy(Dest, L3, object->line_length); - Dest += object->output_stride; - - L1 += Pitch; - L2 += Pitch; - L3 += Pitch; - L2P += Pitch; - } - - if (InfoIsOdd) { - object->pMemcpy(Dest, L2, object->line_length); - } - - // clear out the MMX registers ready for doing floating point again -#ifdef HAVE_CPU_I386 - __asm__ __volatile__ ("emms\n\t"); + "paddusw %[QW256B], %%mm0\n\t" // add, may sat at fff.. + "psubusw %[QW256B], %%mm0\n\t" // now = Min(L1,256) #endif + "psubusw %%mm0, %%mm7\n\t" // so the 2 sum to 256, weighted avg + "movq %%mm4, %%mm2\n\t" // save weave chroma info before trashing + "pand %[YMask], %%mm4\n\t" // keep only luma from calc'd value + "pmullw %%mm7, %%mm4\n\t" // use more weave for less motion + "pand %[YMask], %%mm6\n\t" // keep only luma from calc'd value + "pmullw %%mm0, %%mm6\n\t" // use more bob for large motion + "paddusw %%mm6, %%mm4\n\t" // combine + "psrlw $8, %%mm4\n\t" // div by 256 to get weighted avg + // chroma comes from weave pixel + "pand %[UVMask], %%mm2\n\t" // keep chroma + "por %%mm4, %%mm2\n\t" // and combine + V_MOVNTQ ("(%%" XDI ")", "%%mm2") // move in our clipped best, use macro + // bump ptrs and loop + LEAX " 8(%%" XAX "), %%" XAX "\n\t" + LEAX " 8(%%" XBX "), %%" XBX "\n\t" + LEAX " 8(%%" XDX "), %%" XDX "\n\t" + LEAX " 8(%%" XDI "), %%" XDI "\n\t" + LEAX " 8(%%" XSI "), %%" XSI "\n\t" + DECX " %[LoopCtr]\n\t" + + "jg 1b\n\t" // loop if not to last line + // note P-III default assumes backward branches taken + "jl 1f\n\t" // done + MOVX " %%" XAX ", %%" XBX "\n\t" // sharpness lookahead 1 byte only, be wrong on 1 + "jmp 1b\n\t" + + "1:\n\t" + MOVX " %[oldbx], %%" XBX "\n\t" + "emms\n\t": /* no outputs */ + + :[LastAvg] "m" (LastAvg), + [L1] "m" (L1), + [L3] "m" (L3), + [L2P] "m" (L2P), + [L2] "m" (L2), + [Dest] "m" (Dest), + [ShiftMask] "m" (ShiftMask), + [MaxComb] "m" (MaxComb), + [MotionThreshold] "m" (MotionThreshold), + [MotionSense] "m" (MotionSense), + [QW256B] "m" (QW256B), + [YMask] "m" (YMask), + [UVMask] "m" (UVMask), + [LoopCtr] "m" (LoopCtr), + [QW256] "m" (QW256), + [oldbx] "m" (oldbx) + : XAX, XCX, XDX, XSI, XDI, + "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)", + /* FIXME: breaks unless compiling with -mmmx + "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", */ + "memory", "cc"); } diff --git a/gst/deinterlace2/tvtime/greedyh.c b/gst/deinterlace2/tvtime/greedyh.c index 623c2d8b..f9d33e74 100644 --- a/gst/deinterlace2/tvtime/greedyh.c +++ b/gst/deinterlace2/tvtime/greedyh.c @@ -41,51 +41,244 @@ #include "gstdeinterlace2.h" #include "speedy.h" +static const unsigned int GreedyMaxComb = 5; +static const unsigned int GreedyMotionThreshold = 25; +static const unsigned int GreedyMotionSense = 30; -#define MAXCOMB_DEFAULT 5 -#define MOTIONTHRESHOLD_DEFAULT 25 -#define MOTIONSENSE_DEFAULT 30 +void +greedyDScaler_C (uint8_t * L1, uint8_t * L2, uint8_t * L3, uint8_t * L2P, + uint8_t * Dest, int size) +{ + int Pos; + uint8_t l1_l, l1_1_l, l3_l, l3_1_l; + uint8_t l1_c, l1_1_c, l3_c, l3_1_c; + uint8_t avg_l, avg_c, avg_l_1, avg_c_1; + uint8_t avg_l__1 = 0, avg_c__1 = 0; + uint8_t avg_s_l, avg_s_c; + uint8_t avg_sc_l, avg_sc_c; + uint8_t best_l, best_c; + uint16_t mov_l; + uint8_t out_l, out_c; + uint8_t l2_l, l2_c, lp2_l, lp2_c; + uint8_t l2_l_diff, l2_c_diff, lp2_l_diff, lp2_c_diff; + uint8_t min_l, min_c, max_l, max_c; + + for (Pos = 0; Pos < size; Pos += 2) { + l1_l = L1[0]; + l1_c = L1[1]; + l3_l = L3[0]; + l3_c = L3[1]; + + if (Pos == size - 1) { + l1_1_l = l1_l; + l1_1_c = l1_c; + l3_1_l = l3_l; + l3_1_c = l3_c; + } else { + l1_1_l = L1[2]; + l1_1_c = L1[3]; + l3_1_l = L3[2]; + l3_1_c = L3[3]; + } + + /* Average of L1 and L3 */ + avg_l = (l1_l + l3_l) / 2; + avg_c = (l1_c + l3_c) / 2; + + /* Average of next L1 and next L3 */ + avg_l_1 = (l1_1_l + l3_1_l) / 2; + avg_c_1 = (l1_1_c + l3_1_c) / 2; + + /* Calculate average of one pixel forward and previous */ + avg_s_l = (avg_l__1 + avg_l_1) / 2; + avg_s_c = (avg_c__1 + avg_c_1) / 2; + + /* Calculate average of center and surrounding pixels */ + avg_sc_l = (avg_l + avg_s_l) / 2; + avg_sc_c = (avg_c + avg_s_c) / 2; + + /* move forward */ + avg_l__1 = avg_l; + avg_c__1 = avg_c; + + /* Get best L2/L2P, i.e. least diff from above average */ + l2_l = L2[0]; + l2_c = L2[1]; + lp2_l = L2P[0]; + lp2_c = L2P[1]; + + l2_l_diff = ABS (l2_l - avg_sc_l); + l2_c_diff = ABS (l2_c - avg_sc_c); + + lp2_l_diff = ABS (lp2_l - avg_sc_l); + lp2_c_diff = ABS (lp2_c - avg_sc_c); + + if (l2_l_diff > lp2_l_diff) + best_l = lp2_l; + else + best_l = l2_l; + + if (l2_c_diff > lp2_c_diff) + best_c = lp2_c; + else + best_c = l2_c; + + /* Clip this best L2/L2P by L1/L3 and allow to differ by GreedyMaxComb */ + max_l = MAX (l1_l, l3_l); + min_l = MIN (l1_l, l3_l); -unsigned int GreedyMaxComb; + if (max_l < 256 - GreedyMaxComb) + max_l += GreedyMaxComb; + else + max_l = 255; -unsigned int GreedyMotionThreshold; + if (min_l > GreedyMaxComb) + min_l -= GreedyMaxComb; + else + min_l = 0; -unsigned int GreedyMotionSense; + max_c = MAX (l1_c, l3_c); + min_c = MIN (l1_c, l3_c); + if (max_c < 256 - GreedyMaxComb) + max_c += GreedyMaxComb; + else + max_c = 255; -#define IS_SSE -#define SSE_TYPE SSE -#define FUNCT_NAME greedyDScaler_SSE + if (min_c > GreedyMaxComb) + min_c -= GreedyMaxComb; + else + min_c = 0; + + out_l = CLAMP (best_l, min_l, max_l); + out_c = CLAMP (best_c, min_c, max_c); + + /* Do motion compensation for luma, i.e. how much + * the weave pixel differs */ + mov_l = ABS (l2_l - lp2_l); + if (mov_l > GreedyMotionThreshold) + mov_l -= GreedyMotionThreshold; + else + mov_l = 0; + + mov_l = mov_l * GreedyMotionSense; + if (mov_l > 256) + mov_l = 256; + + /* Weighted sum on clipped weave pixel and average */ + out_l = (out_l * (256 - mov_l) + avg_sc_l * mov_l) / 256; + + Dest[0] = out_l; + Dest[1] = out_c; + + Dest += 2; + L1 += 2; + L2 += 2; + L3 += 2; + L2P += 2; + } +} + +#define IS_MMXEXT +#define SIMD_TYPE MMXEXT +#define FUNCT_NAME greedyDScaler_MMXEXT #include "greedyh.asm" -#undef SSE_TYPE -#undef IS_SSE +#undef SIMD_TYPE +#undef IS_MMXEXT #undef FUNCT_NAME -#define IS_3DNOW +#define IS_TDNOW +#define SIMD_TYPE TDNOW #define FUNCT_NAME greedyDScaler_3DNOW -#define SSE_TYPE 3DNOW #include "greedyh.asm" -#undef SSE_TYPE -#undef IS_3DNOW +#undef SIMD_TYPE +#undef IS_TDNOW #undef FUNCT_NAME #define IS_MMX -#define SSE_TYPE MMX +#define SIMD_TYPE MMX #define FUNCT_NAME greedyDScaler_MMX #include "greedyh.asm" -#undef SSE_TYPE +#undef SIMD_TYPE #undef IS_MMX #undef FUNCT_NAME -void +static void deinterlace_frame_di_greedyh (GstDeinterlace2 * object) { - if (object->cpu_feature_flags & OIL_IMPL_FLAG_SSE) { - greedyh_filter_sse (object); + void (*func) (uint8_t * L1, uint8_t * L2, uint8_t * L3, uint8_t * L2P, + uint8_t * Dest, int size); + + int InfoIsOdd = 0; + int Line; + unsigned int Pitch = object->field_stride; + + unsigned char *L1; // ptr to Line1, of 3 + unsigned char *L2; // ptr to Line2, the weave line + unsigned char *L3; // ptr to Line3 + + unsigned char *L2P; // ptr to prev Line2 + unsigned char *Dest = GST_BUFFER_DATA (object->out_buf); + + if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) { + func = greedyDScaler_MMXEXT; } else if (object->cpu_feature_flags & OIL_IMPL_FLAG_3DNOW) { - greedyh_filter_3dnow (object); + func = greedyDScaler_3DNOW; + } else if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMX) { + func = greedyDScaler_MMX; } else { - greedyh_filter_mmx (object); + func = greedyDScaler_C; + } + + // copy first even line no matter what, and the first odd line if we're + // processing an EVEN field. (note diff from other deint rtns.) + + if (object->field_history[object->history_count - 1].flags == + PICTURE_INTERLACED_BOTTOM) { + InfoIsOdd = 1; + + L1 = GST_BUFFER_DATA (object->field_history[object->history_count - 2].buf); + L2 = GST_BUFFER_DATA (object->field_history[object->history_count - 1].buf); + L3 = L1 + Pitch; + L2P = + GST_BUFFER_DATA (object->field_history[object->history_count - 3].buf); + + // copy first even line + object->pMemcpy (Dest, L1, object->line_length); + Dest += object->output_stride; + } else { + InfoIsOdd = 0; + L1 = GST_BUFFER_DATA (object->field_history[object->history_count - 2].buf); + L2 = GST_BUFFER_DATA (object->field_history[object->history_count - + 1].buf) + Pitch; + L3 = L1 + Pitch; + L2P = + GST_BUFFER_DATA (object->field_history[object->history_count - 3].buf) + + Pitch; + + // copy first even line + object->pMemcpy (Dest, GST_BUFFER_DATA (object->field_history[0].buf), + object->line_length); + Dest += object->output_stride; + // then first odd line + object->pMemcpy (Dest, L1, object->line_length); + Dest += object->output_stride; + } + + for (Line = 0; Line < (object->field_height - 1); ++Line) { + func (L1, L2, L3, L2P, Dest, object->line_length); + Dest += object->output_stride; + object->pMemcpy (Dest, L3, object->line_length); + Dest += object->output_stride; + + L1 += Pitch; + L2 += Pitch; + L3 += Pitch; + L2P += Pitch; + } + + if (InfoIsOdd) { + object->pMemcpy (Dest, L2, object->line_length); } } @@ -94,7 +287,7 @@ static deinterlace_method_t greedyh_method = { "Motion Adaptive: Advanced Detection", "AdaptiveAdvanced", 4, - OIL_IMPL_FLAG_MMX, + 0, 0, 0, 0, @@ -117,32 +310,5 @@ static deinterlace_method_t greedyh_method = { deinterlace_method_t * dscaler_greedyh_get_method (void) { - greedyh_init (); return &greedyh_method; } - -void -greedyh_init (void) -{ - GreedyMaxComb = MAXCOMB_DEFAULT; - GreedyMotionThreshold = MOTIONTHRESHOLD_DEFAULT; - GreedyMotionSense = MOTIONSENSE_DEFAULT; -} - -void -greedyh_filter_mmx (GstDeinterlace2 * object) -{ - greedyDScaler_MMX (object); -} - -void -greedyh_filter_3dnow (GstDeinterlace2 * object) -{ - greedyDScaler_3DNOW (object); -} - -void -greedyh_filter_sse (GstDeinterlace2 * object) -{ - greedyDScaler_SSE (object); -} diff --git a/gst/deinterlace2/tvtime/greedyhmacros.h b/gst/deinterlace2/tvtime/greedyhmacros.h index 5f65959c..3f1c72c9 100644 --- a/gst/deinterlace2/tvtime/greedyhmacros.h +++ b/gst/deinterlace2/tvtime/greedyhmacros.h @@ -21,7 +21,7 @@ // BEFORE USING THESE YOU MUST SET: -// #define SSE_TYPE SSE (or MMX or 3DNOW) +// #define SIMD_TYPE MMXEXT (or MMX or TDNOW) // some macros for pavgb instruction // V_PAVGB(mmr1, mmr2, mmr work register, smask) mmr2 may = mmrw if you can trash it @@ -33,21 +33,21 @@ "pand "smask", "mmr1"\n\t" \ "psrlw $1, "mmr1"\n\t" \ "paddusb "mmrw", "mmr1"\n\t" -#define V_PAVGB_SSE(mmr1, mmr2, mmrw, smask) "pavgb "mmr2", "mmr1"\n\t" -#define V_PAVGB_3DNOW(mmr1, mmr2, mmrw, smask) "pavgusb "mmr2", "mmr1"\n\t" -#define V_PAVGB(mmr1, mmr2, mmrw, smask) V_PAVGB2(mmr1, mmr2, mmrw, smask, SSE_TYPE) -#define V_PAVGB2(mmr1, mmr2, mmrw, smask, ssetyp) V_PAVGB3(mmr1, mmr2, mmrw, smask, ssetyp) -#define V_PAVGB3(mmr1, mmr2, mmrw, smask, ssetyp) V_PAVGB_##ssetyp(mmr1, mmr2, mmrw, smask) +#define V_PAVGB_MMXEXT(mmr1, mmr2, mmrw, smask) "pavgb "mmr2", "mmr1"\n\t" +#define V_PAVGB_TDNOW(mmr1, mmr2, mmrw, smask) "pavgusb "mmr2", "mmr1"\n\t" +#define V_PAVGB(mmr1, mmr2, mmrw, smask) V_PAVGB2(mmr1, mmr2, mmrw, smask, SIMD_TYPE) +#define V_PAVGB2(mmr1, mmr2, mmrw, smask, simdtype) V_PAVGB3(mmr1, mmr2, mmrw, smask, simdtype) +#define V_PAVGB3(mmr1, mmr2, mmrw, smask, simdtype) V_PAVGB_##simdtype(mmr1, mmr2, mmrw, smask) // some macros for pmaxub instruction #define V_PMAXUB_MMX(mmr1, mmr2) \ "psubusb "mmr2", "mmr1"\n\t" \ "paddusb "mmr2", "mmr1"\n\t" -#define V_PMAXUB_SSE(mmr1, mmr2) "pmaxub "mmr2", "mmr1"\n\t" -#define V_PMAXUB_3DNOW(mmr1, mmr2) V_PMAXUB_MMX(mmr1, mmr2) // use MMX version -#define V_PMAXUB(mmr1, mmr2) V_PMAXUB2(mmr1, mmr2, SSE_TYPE) -#define V_PMAXUB2(mmr1, mmr2, ssetyp) V_PMAXUB3(mmr1, mmr2, ssetyp) -#define V_PMAXUB3(mmr1, mmr2, ssetyp) V_PMAXUB_##ssetyp(mmr1, mmr2) +#define V_PMAXUB_MMXEXT(mmr1, mmr2) "pmaxub "mmr2", "mmr1"\n\t" +#define V_PMAXUB_TDNOW(mmr1, mmr2) V_PMAXUB_MMX(mmr1, mmr2) // use MMX version +#define V_PMAXUB(mmr1, mmr2) V_PMAXUB2(mmr1, mmr2, SIMD_TYPE) +#define V_PMAXUB2(mmr1, mmr2, simdtype) V_PMAXUB3(mmr1, mmr2, simdtype) +#define V_PMAXUB3(mmr1, mmr2, simdtype) V_PMAXUB_##simdtype(mmr1, mmr2) // some macros for pminub instruction // V_PMINUB(mmr1, mmr2, mmr work register) mmr2 may NOT = mmrw @@ -56,19 +56,19 @@ "psubusb "mmr2", "mmrw"\n\t" \ "paddusb "mmrw", "mmr1"\n\t" \ "psubusb "mmrw", "mmr1"\n\t" -#define V_PMINUB_SSE(mmr1, mmr2, mmrw) "pminub "mmr2", "mmr1"\n\t" -#define V_PMINUB_3DNOW(mmr1, mmr2, mmrw) V_PMINUB_MMX(mmr1, mmr2, mmrw) // use MMX version -#define V_PMINUB(mmr1, mmr2, mmrw) V_PMINUB2(mmr1, mmr2, mmrw, SSE_TYPE) -#define V_PMINUB2(mmr1, mmr2, mmrw, ssetyp) V_PMINUB3(mmr1, mmr2, mmrw, ssetyp) -#define V_PMINUB3(mmr1, mmr2, mmrw, ssetyp) V_PMINUB_##ssetyp(mmr1, mmr2, mmrw) +#define V_PMINUB_MMXEXT(mmr1, mmr2, mmrw) "pminub "mmr2", "mmr1"\n\t" +#define V_PMINUB_TDNOW(mmr1, mmr2, mmrw) V_PMINUB_MMX(mmr1, mmr2, mmrw) // use MMX version +#define V_PMINUB(mmr1, mmr2, mmrw) V_PMINUB2(mmr1, mmr2, mmrw, SIMD_TYPE) +#define V_PMINUB2(mmr1, mmr2, mmrw, simdtype) V_PMINUB3(mmr1, mmr2, mmrw, simdtype) +#define V_PMINUB3(mmr1, mmr2, mmrw, simdtype) V_PMINUB_##simdtype(mmr1, mmr2, mmrw) // some macros for movntq instruction // V_MOVNTQ(mmr1, mmr2) #define V_MOVNTQ_MMX(mmr1, mmr2) "movq "mmr2", "mmr1"\n\t" -#define V_MOVNTQ_3DNOW(mmr1, mmr2) "movq "mmr2", "mmr1"\n\t" -#define V_MOVNTQ_SSE(mmr1, mmr2) "movntq "mmr2", "mmr1"\n\t" -#define V_MOVNTQ(mmr1, mmr2) V_MOVNTQ2(mmr1, mmr2, SSE_TYPE) -#define V_MOVNTQ2(mmr1, mmr2, ssetyp) V_MOVNTQ3(mmr1, mmr2, ssetyp) -#define V_MOVNTQ3(mmr1, mmr2, ssetyp) V_MOVNTQ_##ssetyp(mmr1, mmr2) +#define V_MOVNTQ_TDNOW(mmr1, mmr2) "movq "mmr2", "mmr1"\n\t" +#define V_MOVNTQ_MMXEXT(mmr1, mmr2) "movntq "mmr2", "mmr1"\n\t" +#define V_MOVNTQ(mmr1, mmr2) V_MOVNTQ2(mmr1, mmr2, SIMD_TYPE) +#define V_MOVNTQ2(mmr1, mmr2, simdtype) V_MOVNTQ3(mmr1, mmr2, simdtype) +#define V_MOVNTQ3(mmr1, mmr2, simdtype) V_MOVNTQ_##simdtype(mmr1, mmr2) // end of macros |