summaryrefslogtreecommitdiffstats
path: root/gst
diff options
context:
space:
mode:
authorSebastian Dröge <slomo@circular-chaos.org>2008-06-28 17:25:56 +0000
committerSebastian Dröge <slomo@circular-chaos.org>2008-06-28 17:25:56 +0000
commitaae071d922ffabcade0315e2691d671e4cb85478 (patch)
tree4c51c61ac44168f3c18474c174b535af3dbef747 /gst
parentb1bc42dda060735f8795ebb5a21fc68e26261b2a (diff)
downloadgst-plugins-bad-aae071d922ffabcade0315e2691d671e4cb85478.tar.gz
gst-plugins-bad-aae071d922ffabcade0315e2691d671e4cb85478.tar.bz2
gst-plugins-bad-aae071d922ffabcade0315e2691d671e4cb85478.zip
gst/deinterlace2/tvtime/: Add a C implementation for the greedyh deinterlacing method, clean up the code a bit and ma...
Original commit message from CVS: * gst/deinterlace2/tvtime/greedyh.asm: * gst/deinterlace2/tvtime/greedyh.c: (greedyDScaler_C), (deinterlace_frame_di_greedyh), (dscaler_greedyh_get_method): * gst/deinterlace2/tvtime/greedyhmacros.h: Add a C implementation for the greedyh deinterlacing method, clean up the code a bit and mark the SSE version as MMXEXT as it doesn't require any SSE instructions.
Diffstat (limited to 'gst')
-rw-r--r--gst/deinterlace2/tvtime/greedyh.asm475
-rw-r--r--gst/deinterlace2/tvtime/greedyh.c266
-rw-r--r--gst/deinterlace2/tvtime/greedyhmacros.h42
3 files changed, 442 insertions, 341 deletions
diff --git a/gst/deinterlace2/tvtime/greedyh.asm b/gst/deinterlace2/tvtime/greedyh.asm
index 8fd0ab66..fcd3a647 100644
--- a/gst/deinterlace2/tvtime/greedyh.asm
+++ b/gst/deinterlace2/tvtime/greedyh.asm
@@ -28,281 +28,216 @@
#include "x86-64_macros.inc"
-void FUNCT_NAME( GstDeinterlace2 *object)
+/*
+ * Scanline kernel template for the greedyh deinterlacer.
+ *
+ * This file is included once per SIMD flavour; the including file must
+ * define FUNCT_NAME (name of the generated function), SIMD_TYPE (selects
+ * the V_* macro variants) and the matching IS_* flag before each include.
+ *
+ * L1   - line above the line being produced
+ * L2   - the weave line (newest field)
+ * L3   - line below the line being produced
+ * L2P  - the previous weave line
+ * Dest - output line
+ * size - line length in bytes; size / 8 qwords are processed per call
+ *
+ * The MMX state is cleared with emms before the function returns.
+ */
+void
+FUNCT_NAME (uint8_t * L1, uint8_t * L2, uint8_t * L3, uint8_t * L2P,
+ uint8_t * Dest, int size)
{
- int64_t i;
- int InfoIsOdd = 0;
- // in tight loop some vars are accessed faster in local storage
- int64_t YMask = 0x00ff00ff00ff00ffull; // to keep only luma
- int64_t UVMask = 0xff00ff00ff00ff00ull; // to keep only chroma
- int64_t ShiftMask = 0xfefffefffefffeffull; // to avoid shifting chroma to luma
- int64_t QW256 = 0x0100010001000100ull; // 4 256's
-
- // Set up our two parms that are actually evaluated for each pixel
- i=GreedyMaxComb;
- int64_t MaxComb = i << 56 | i << 48 | i << 40 | i << 32 | i << 24 | i << 16 | i << 8 | i;
-
- i = GreedyMotionThreshold; // scale to range of 0-257
- int64_t MotionThreshold = i << 48 | i << 32 | i << 16 | i | UVMask;
-
- i = GreedyMotionSense; // scale to range of 0-257
- int64_t MotionSense = i << 48 | i << 32 | i << 16 | i;
-
- int Line;
- long LoopCtr;
- unsigned int Pitch = object->field_stride;
-
- unsigned char* L1; // ptr to Line1, of 3
- unsigned char* L2; // ptr to Line2, the weave line
- unsigned char* L3; // ptr to Line3
-
- unsigned char* L2P; // ptr to prev Line2
- unsigned char* Dest = GST_BUFFER_DATA(object->out_buf);
-
- int64_t QW256B;
- int64_t LastAvg=0; //interp value from left qword
-
- i = 0xffffffff - 256;
- QW256B = i << 48 | i << 32 | i << 16 | i; // save a couple instr on PMINSW instruct.
-
-
- // copy first even line no matter what, and the first odd line if we're
- // processing an EVEN field. (note diff from other deint rtns.)
-
- if (object->field_history[object->history_count-1].flags == PICTURE_INTERLACED_BOTTOM) {
- InfoIsOdd = 1;
-
- L1 = GST_BUFFER_DATA(object->field_history[object->history_count-2].buf);
- L2 = GST_BUFFER_DATA(object->field_history[object->history_count-1].buf);
- L3 = L1 + Pitch;
- L2P = GST_BUFFER_DATA(object->field_history[object->history_count-3].buf);
-
- // copy first even line
- object->pMemcpy(Dest, L1, object->line_length);
- Dest += object->output_stride;
- }
- else {
- InfoIsOdd = 0;
- L1 = GST_BUFFER_DATA(object->field_history[object->history_count-2].buf);
- L2 = GST_BUFFER_DATA(object->field_history[object->history_count-1].buf) + Pitch;
- L3 = L1 + Pitch;
- L2P = GST_BUFFER_DATA(object->field_history[object->history_count-3].buf) + Pitch;
-
- // copy first even line
- object->pMemcpy(Dest, GST_BUFFER_DATA(object->field_history[0].buf), object->line_length);
- Dest += object->output_stride;
- // then first odd line
- object->pMemcpy(Dest, L1, object->line_length);
- Dest += object->output_stride;
- }
-
-
- long oldbx;
-
- for (Line = 0; Line < (object->field_height - 1); ++Line) {
- LoopCtr = object->line_length / 8 - 1; // there are LineLength / 8 qwords per line but do 1 less, adj at end of loop
-
- // For ease of reading, the comments below assume that we're operating on an odd
- // field (i.e., that InfoIsOdd is true). Assume the obvious for even lines..
- __asm__ __volatile__
- (
- // save ebx (-fPIC)
- MOVX" %%"XBX", %[oldbx]\n\t"
-
- MOVX" %[L1], %%"XAX"\n\t"
- LEAX" 8(%%"XAX"), %%"XBX"\n\t" // next qword needed by DJR
- MOVX" %[L3], %%"XCX"\n\t"
- SUBX" %%"XAX", %%"XCX"\n\t" // carry L3 addr as an offset
- MOVX" %[L2P], %%"XDX"\n\t"
- MOVX" %[L2], %%"XSI"\n\t"
- MOVX" %[Dest], %%"XDI"\n\t" // DL1 if Odd or DL2 if Even
-
- ".align 8\n\t"
- "1:\n\t"
-
- "movq (%%"XSI"), %%mm0\n\t" // L2 - the newest weave pixel value
- "movq (%%"XAX"), %%mm1\n\t" // L1 - the top pixel
- "movq (%%"XDX"), %%mm2\n\t" // L2P - the prev weave pixel
- "movq (%%"XAX", %%"XCX"), %%mm3\n\t" // L3, next odd row
- "movq %%mm1, %%mm6\n\t" // L1 - get simple single pixel interp
- // pavgb mm6, mm3 // use macro below
- V_PAVGB ("%%mm6", "%%mm3", "%%mm4", "%[ShiftMask]")
-
- // DJR - Diagonal Jaggie Reduction
- // In the event that we are going to use an average (Bob) pixel we do not want a jagged
- // stair step effect. To combat this we avg in the 2 horizontally adjacen pixels into the
- // interpolated Bob mix. This will do horizontal smoothing for only the Bob'd pixels.
-
- "movq %[LastAvg], %%mm4\n\t" // the bob value from prev qword in row
- "movq %%mm6, %[LastAvg]\n\t" // save for next pass
- "psrlq $48, %%mm4\n\t" // right justify 1 pixel
- "movq %%mm6, %%mm7\n\t" // copy of simple bob pixel
- "psllq $16, %%mm7\n\t" // left justify 3 pixels
- "por %%mm7, %%mm4\n\t" // and combine
-
- "movq (%%"XBX"), %%mm5\n\t" // next horiz qword from L1
- // pavgb mm5, qword ptr[ebx+ecx] // next horiz qword from L3, use macro below
- V_PAVGB ("%%mm5", "(%%"XBX",%%"XCX")", "%%mm7", "%[ShiftMask]")
- "psllq $48, %%mm5\n\t" // left just 1 pixel
- "movq %%mm6, %%mm7\n\t" // another copy of simple bob pixel
- "psrlq $16, %%mm7\n\t" // right just 3 pixels
- "por %%mm7, %%mm5\n\t" // combine
- // pavgb mm4, mm5 // avg of forward and prev by 1 pixel, use macro
- V_PAVGB ("%%mm4", "%%mm5", "%%mm5", "%[ShiftMask]") // mm5 gets modified if MMX
- // pavgb mm6, mm4 // avg of center and surround interp vals, use macro
- V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]")
-
- // Don't do any more averaging than needed for mmx. It hurts performance and causes rounding errors.
+ // in tight loop some vars are accessed faster in local storage
+ int64_t YMask = 0x00ff00ff00ff00ffull; // to keep only luma
+ int64_t UVMask = 0xff00ff00ff00ff00ull; // to keep only chroma
+ int64_t ShiftMask = 0xfefefefefefefefeull; // to avoid shifting chroma to luma
+ int64_t QW256 = 0x0100010001000100ull; // 4 256's
+ int64_t MaxComb;
+ int64_t MotionThreshold;
+ int64_t MotionSense;
+ // unsigned: the replicating shifts below reach bit 63, which would
+ // overflow a signed 64 bit value
+ uint64_t i;
+ long LoopCtr;
+ long oldbx;
+
+ int64_t QW256B;
+ int64_t LastAvg = 0; //interp value from left qword
+
+ // Set up our two parms that are actually evaluated for each pixel
+ i = GreedyMaxComb;
+ MaxComb =
+ i << 56 | i << 48 | i << 40 | i << 32 | i << 24 | i << 16 | i << 8 | i;
+
+ i = GreedyMotionThreshold; // scale to range of 0-257
+ MotionThreshold = i << 48 | i << 32 | i << 16 | i | UVMask;
+
+ i = GreedyMotionSense; // scale to range of 0-257
+ MotionSense = i << 48 | i << 32 | i << 16 | i;
+
+ // 0xfeff in each 16 bit word: paddusw followed by psubusw with this value
+ // clamps a word to 256. The value must fit in 16 bits, otherwise the
+ // replicated copies overlap and the upper three words become 0xffff.
+ i = 0xffff - 256;
+ QW256B = i << 48 | i << 32 | i << 16 | i; // save a couple instr on PMINSW instruct.
+
+ LoopCtr = size / 8 - 1; // there are LineLength / 8 qwords per line but do 1 less, adj at end of loop
+
+ // For ease of reading, the comments below assume that we're operating on an odd
+ // field (i.e., that InfoIsOdd is true). Assume the obvious for even lines..
+ __asm__ __volatile__ (
+ // save ebx (-fPIC)
+ MOVX " %%" XBX ", %[oldbx]\n\t"
+ MOVX " %[L1], %%" XAX "\n\t"
+ LEAX " 8(%%" XAX "), %%" XBX "\n\t" // next qword needed by DJR
+ MOVX " %[L3], %%" XCX "\n\t"
+ SUBX " %%" XAX ", %%" XCX "\n\t" // carry L3 addr as an offset
+ MOVX " %[L2P], %%" XDX "\n\t"
+ MOVX " %[L2], %%" XSI "\n\t"
+ MOVX " %[Dest], %%" XDI "\n\t" // DL1 if Odd or DL2 if Even
+
+ ".align 8\n\t"
+ "1:\n\t"
+ "movq (%%" XSI "), %%mm0\n\t" // L2 - the newest weave pixel value
+ "movq (%%" XAX "), %%mm1\n\t" // L1 - the top pixel
+ "movq (%%" XDX "), %%mm2\n\t" // L2P - the prev weave pixel
+ "movq (%%" XAX ", %%" XCX "), %%mm3\n\t" // L3, next odd row
+ "movq %%mm1, %%mm6\n\t" // L1 - get simple single pixel interp
+
+ // pavgb mm6, mm3 // use macro below
+ V_PAVGB ("%%mm6", "%%mm3", "%%mm4", "%[ShiftMask]")
+
+ // DJR - Diagonal Jaggie Reduction
+ // In the event that we are going to use an average (Bob) pixel we do not want a jagged
+ // stair step effect. To combat this we avg in the 2 horizontally adjacent pixels into the
+ // interpolated Bob mix. This will do horizontal smoothing for only the Bob'd pixels.
+
+ "movq %[LastAvg], %%mm4\n\t" // the bob value from prev qword in row
+ "movq %%mm6, %[LastAvg]\n\t" // save for next pass
+ "psrlq $48, %%mm4\n\t" // right justify 1 pixel
+ "movq %%mm6, %%mm7\n\t" // copy of simple bob pixel
+ "psllq $16, %%mm7\n\t" // left justify 3 pixels
+ "por %%mm7, %%mm4\n\t" // and combine
+ "movq (%%" XBX "), %%mm5\n\t" // next horiz qword from L1
+ // pavgb mm5, qword ptr[ebx+ecx] // next horiz qword from L3, use macro below
+
+ V_PAVGB ("%%mm5", "(%%" XBX ",%%" XCX ")", "%%mm7", "%[ShiftMask]")
+ "psllq $48, %%mm5\n\t" // left just 1 pixel
+ "movq %%mm6, %%mm7\n\t" // another copy of simple bob pixel
+ "psrlq $16, %%mm7\n\t" // right just 3 pixels
+ "por %%mm7, %%mm5\n\t" // combine
+ // pavgb mm4, mm5 // avg of forward and prev by 1 pixel, use macro
+ V_PAVGB ("%%mm4", "%%mm5", "%%mm5", "%[ShiftMask]") // mm5 gets modified if MMX
+ // pavgb mm6, mm4 // avg of center and surround interp vals, use macro
+ V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]")
+
+ // Don't do any more averaging than needed for mmx. It hurts performance and causes rounding errors.
#ifndef IS_MMX
- // pavgb mm4, mm6 // 1/4 center, 3/4 adjacent
- V_PAVGB ("%%mm4", "%%mm6", "%%mm7", "%[ShiftMask]")
- // pavgb mm6, mm4 // 3/8 center, 5/8 adjacent
- V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]")
+ // pavgb mm4, mm6 // 1/4 center, 3/4 adjacent
+ V_PAVGB ("%%mm4", "%%mm6", "%%mm7", "%[ShiftMask]")
+ // pavgb mm6, mm4 // 3/8 center, 5/8 adjacent
+ V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]")
#endif
- // get abs value of possible L2 comb
- "movq %%mm6, %%mm4\n\t" // work copy of interp val
- "movq %%mm2, %%mm7\n\t" // L2
- "psubusb %%mm4, %%mm7\n\t" // L2 - avg
- "movq %%mm4, %%mm5\n\t" // avg
- "psubusb %%mm2, %%mm5\n\t" // avg - L2
- "por %%mm7, %%mm5\n\t" // abs(avg-L2)
-
- // get abs value of possible L2P comb
- "movq %%mm0, %%mm7\n\t" // L2P
- "psubusb %%mm4, %%mm7\n\t" // L2P - avg
- "psubusb %%mm0, %%mm4\n\t" // avg - L2P
- "por %%mm7, %%mm4\n\t" // abs(avg-L2P)
-
- // use L2 or L2P depending upon which makes smaller comb
- "psubusb %%mm5, %%mm4\n\t" // see if it goes to zero
- "psubusb %%mm5, %%mm5\n\t" // 0
- "pcmpeqb %%mm5, %%mm4\n\t" // if (mm4=0) then FF else 0
- "pcmpeqb %%mm4, %%mm5\n\t" // opposite of mm4
-
- // if Comb(L2P) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55
- "pand %%mm2, %%mm5\n\t" // use L2 if mm5 == ff, else 0
- "pand %%mm0, %%mm4\n\t" // use L2P if mm4 = ff, else 0
- "por %%mm5, %%mm4\n\t" // may the best win
-
- // Inventory: at this point we have the following values:
- // mm0 = L2P (or L2)
- // mm1 = L1
- // mm2 = L2 (or L2P)
- // mm3 = L3
- // mm4 = the best of L2,L2P weave pixel, base upon comb
- // mm6 = the avg interpolated value, if we need to use it
-
- // Let's measure movement, as how much the weave pixel has changed
- "movq %%mm2, %%mm7\n\t"
- "psubusb %%mm0, %%mm2\n\t"
- "psubusb %%mm7, %%mm0\n\t"
- "por %%mm2, %%mm0\n\t" // abs value of change, used later
-
- // Now lets clip our chosen value to be not outside of the range
- // of the high/low range L1-L3 by more than MaxComb.
- // This allows some comb but limits the damages and also allows more
- // detail than a boring oversmoothed clip.
- "movq %%mm1, %%mm2\n\t" // copy L1
- // pmaxub mm2, mm3 // use macro
- V_PMAXUB ("%%mm2", "%%mm3") // now = Max(L1,L3)
- "movq %%mm1, %%mm5\n\t" // copy L1
- // pminub mm5, mm3 // now = Min(L1,L3), use macro
- V_PMINUB ("%%mm5", "%%mm3", "%%mm7")
- // allow the value to be above the high or below the low by amt of MaxComb
- "psubusb %[MaxComb], %%mm5\n\t" // lower min by diff
- "paddusb %[MaxComb], %%mm2\n\t" // increase max by diff
- // pmaxub mm4, mm5 // now = Max(best,Min(L1,L3) use macro
- V_PMAXUB ("%%mm4", "%%mm5")
- // pminub mm4, mm2 // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped
- V_PMINUB ("%%mm4", "%%mm2", "%%mm7")
-
- // Blend weave pixel with bob pixel, depending on motion val in mm0
- "psubusb %[MotionThreshold], %%mm0\n\t"// test Threshold, clear chroma change >>>??
- "pmullw %[MotionSense], %%mm0\n\t" // mul by user factor, keep low 16 bits
- "movq %[QW256], %%mm7\n\t"
-#ifdef HAVE_SSE
- "pminsw %%mm7, %%mm0\n\t" // max = 256
+ // get abs value of possible L2 comb
+ "movq %%mm6, %%mm4\n\t" // work copy of interp val
+ "movq %%mm2, %%mm7\n\t" // L2
+ "psubusb %%mm4, %%mm7\n\t" // L2 - avg
+ "movq %%mm4, %%mm5\n\t" // avg
+ "psubusb %%mm2, %%mm5\n\t" // avg - L2
+ "por %%mm7, %%mm5\n\t" // abs(avg-L2)
+
+ // get abs value of possible L2P comb
+ "movq %%mm0, %%mm7\n\t" // L2P
+ "psubusb %%mm4, %%mm7\n\t" // L2P - avg
+ "psubusb %%mm0, %%mm4\n\t" // avg - L2P
+ "por %%mm7, %%mm4\n\t" // abs(avg-L2P)
+
+ // use L2 or L2P depending upon which makes smaller comb
+ "psubusb %%mm5, %%mm4\n\t" // see if it goes to zero
+ "psubusb %%mm5, %%mm5\n\t" // 0
+ "pcmpeqb %%mm5, %%mm4\n\t" // if (mm4=0) then FF else 0
+ "pcmpeqb %%mm4, %%mm5\n\t" // opposite of mm4
+
+ // if Comb(L2P) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = ff
+ "pand %%mm2, %%mm5\n\t" // use L2 if mm5 == ff, else 0
+ "pand %%mm0, %%mm4\n\t" // use L2P if mm4 = ff, else 0
+ "por %%mm5, %%mm4\n\t" // may the best win
+
+ // Inventory: at this point we have the following values:
+ // mm0 = L2P (or L2)
+ // mm1 = L1
+ // mm2 = L2 (or L2P)
+ // mm3 = L3
+ // mm4 = the best of L2,L2P weave pixel, base upon comb
+ // mm6 = the avg interpolated value, if we need to use it
+ // Let's measure movement, as how much the weave pixel has changed
+
+ "movq %%mm2, %%mm7\n\t"
+ "psubusb %%mm0, %%mm2\n\t"
+ "psubusb %%mm7, %%mm0\n\t"
+ "por %%mm2, %%mm0\n\t" // abs value of change, used later
+
+ // Now lets clip our chosen value to be not outside of the range
+ // of the high/low range L1-L3 by more than MaxComb.
+ // This allows some comb but limits the damages and also allows more
+ // detail than a boring oversmoothed clip.
+
+ "movq %%mm1, %%mm2\n\t" // copy L1
+ // pmaxub mm2, mm3 // use macro
+ V_PMAXUB ("%%mm2", "%%mm3") // now = Max(L1,L3)
+ "movq %%mm1, %%mm5\n\t" // copy L1
+ // pminub mm5, mm3 // now = Min(L1,L3), use macro
+ V_PMINUB ("%%mm5", "%%mm3", "%%mm7")
+
+ // allow the value to be above the high or below the low by amt of MaxComb
+ "psubusb %[MaxComb], %%mm5\n\t" // lower min by diff
+ "paddusb %[MaxComb], %%mm2\n\t" // increase max by diff
+ // pmaxub mm4, mm5 // now = Max(best,Min(L1,L3) use macro
+ V_PMAXUB ("%%mm4", "%%mm5")
+ // pminub mm4, mm2 // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped
+ V_PMINUB ("%%mm4", "%%mm2", "%%mm7")
+
+ // Blend weave pixel with bob pixel, depending on motion val in mm0
+ "psubusb %[MotionThreshold], %%mm0\n\t" // test Threshold, clear chroma change >>>??
+ "pmullw %[MotionSense], %%mm0\n\t" // mul by user factor, keep low 16 bits
+ "movq %[QW256], %%mm7\n\t"
+ // Test the IS_* flag, not SIMD_TYPE: SIMD_TYPE expands to a bare identifier
+ // (needed for the V_* token pasting), and in #if every such identifier
+ // evaluates to 0, so "SIMD_TYPE == MMXEXT" would hold for all flavours.
+#ifdef IS_MMXEXT
+ "pminsw %%mm7, %%mm0\n\t" // max = 256
#else
- "paddusw %[QW256B], %%mm0\n\t" // add, may sat at fff..
- "psubusw %[QW256B], %%mm0\n\t" // now = Min(L1,256)
-#endif
- "psubusw %%mm0, %%mm7\n\t" // so the 2 sum to 256, weighted avg
- "movq %%mm4, %%mm2\n\t" // save weave chroma info before trashing
- "pand %[YMask], %%mm4\n\t" // keep only luma from calc'd value
- "pmullw %%mm7, %%mm4\n\t" // use more weave for less motion
- "pand %[YMask], %%mm6\n\t" // keep only luma from calc'd value
- "pmullw %%mm0, %%mm6\n\t" // use more bob for large motion
- "paddusw %%mm6, %%mm4\n\t" // combine
- "psrlw $8, %%mm4\n\t" // div by 256 to get weighted avg
-
- // chroma comes from weave pixel
- "pand %[UVMask], %%mm2\n\t" // keep chroma
- "por %%mm4, %%mm2\n\t" // and combine
-
- V_MOVNTQ ("(%%"XDI")", "%%mm2") // move in our clipped best, use macro
-
- // bump ptrs and loop
- LEAX" 8(%%"XAX"), %%"XAX"\n\t"
- LEAX" 8(%%"XBX"), %%"XBX"\n\t"
- LEAX" 8(%%"XDX"), %%"XDX"\n\t"
- LEAX" 8(%%"XDI"), %%"XDI"\n\t"
- LEAX" 8(%%"XSI"), %%"XSI"\n\t"
- DECX" %[LoopCtr]\n\t"
- "jg 1b\n\t" // loop if not to last line
- // note P-III default assumes backward branches taken
- "jl 1f\n\t" // done
- MOVX" %%"XAX", %%"XBX"\n\t" // sharpness lookahead 1 byte only, be wrong on 1
- "jmp 1b\n\t"
-
- "1:\n\t"
- MOVX" %[oldbx], %%"XBX"\n\t"
-
- : /* no outputs */
-
- : [LastAvg] "m"(LastAvg),
- [L1] "m"(L1),
- [L3] "m"(L3),
- [L2P] "m"(L2P),
- [L2] "m"(L2),
- [Dest] "m"(Dest),
- [ShiftMask] "m"(ShiftMask),
- [MaxComb] "m"(MaxComb),
- [MotionThreshold] "m"(MotionThreshold),
- [MotionSense] "m"(MotionSense),
- [QW256B] "m"(QW256B),
- [YMask] "m"(YMask),
- [UVMask] "m"(UVMask),
- [LoopCtr] "m"(LoopCtr),
- [QW256] "m"(QW256),
- [oldbx] "m"(oldbx)
-
- : XAX, XCX, XDX, XSI, XDI,
-#ifdef HAVE_CPU_I386
- "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)",
-#endif
- /* FIXME: breaks unless compiling with -mmmx
- "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", */
- "memory", "cc"
- );
-
- Dest += object->output_stride;
- object->pMemcpy(Dest, L3, object->line_length);
- Dest += object->output_stride;
-
- L1 += Pitch;
- L2 += Pitch;
- L3 += Pitch;
- L2P += Pitch;
- }
-
- if (InfoIsOdd) {
- object->pMemcpy(Dest, L2, object->line_length);
- }
-
- // clear out the MMX registers ready for doing floating point again
-#ifdef HAVE_CPU_I386
- __asm__ __volatile__ ("emms\n\t");
+ "paddusw %[QW256B], %%mm0\n\t" // add, may sat at fff..
+ "psubusw %[QW256B], %%mm0\n\t" // now = Min(mm0,256)
#endif
+ "psubusw %%mm0, %%mm7\n\t" // so the 2 sum to 256, weighted avg
+ "movq %%mm4, %%mm2\n\t" // save weave chroma info before trashing
+ "pand %[YMask], %%mm4\n\t" // keep only luma from calc'd value
+ "pmullw %%mm7, %%mm4\n\t" // use more weave for less motion
+ "pand %[YMask], %%mm6\n\t" // keep only luma from calc'd value
+ "pmullw %%mm0, %%mm6\n\t" // use more bob for large motion
+ "paddusw %%mm6, %%mm4\n\t" // combine
+ "psrlw $8, %%mm4\n\t" // div by 256 to get weighted avg
+ // chroma comes from weave pixel
+ "pand %[UVMask], %%mm2\n\t" // keep chroma
+ "por %%mm4, %%mm2\n\t" // and combine
+ V_MOVNTQ ("(%%" XDI ")", "%%mm2") // move in our clipped best, use macro
+ // bump ptrs and loop
+ LEAX " 8(%%" XAX "), %%" XAX "\n\t"
+ LEAX " 8(%%" XBX "), %%" XBX "\n\t"
+ LEAX " 8(%%" XDX "), %%" XDX "\n\t"
+ LEAX " 8(%%" XDI "), %%" XDI "\n\t"
+ LEAX " 8(%%" XSI "), %%" XSI "\n\t"
+ DECX " %[LoopCtr]\n\t"
+
+ "jg 1b\n\t" // loop if not to last line
+ // note P-III default assumes backward branches taken
+ "jl 1f\n\t" // done
+ MOVX " %%" XAX ", %%" XBX "\n\t" // sharpness lookahead 1 byte only, be wrong on 1
+ "jmp 1b\n\t"
+
+ "1:\n\t"
+ MOVX " %[oldbx], %%" XBX "\n\t"
+ "emms\n\t": /* no outputs */
+ /* NOTE(review): LastAvg, LoopCtr and oldbx are written by the asm above
+ * although they are listed as inputs; the "memory" clobber appears to be
+ * what keeps this working. Consider declaring them as "+m"/"=m" outputs. */
+
+ :[LastAvg] "m" (LastAvg),
+ [L1] "m" (L1),
+ [L3] "m" (L3),
+ [L2P] "m" (L2P),
+ [L2] "m" (L2),
+ [Dest] "m" (Dest),
+ [ShiftMask] "m" (ShiftMask),
+ [MaxComb] "m" (MaxComb),
+ [MotionThreshold] "m" (MotionThreshold),
+ [MotionSense] "m" (MotionSense),
+ [QW256B] "m" (QW256B),
+ [YMask] "m" (YMask),
+ [UVMask] "m" (UVMask),
+ [LoopCtr] "m" (LoopCtr),
+ [QW256] "m" (QW256),
+ [oldbx] "m" (oldbx)
+ : XAX, XCX, XDX, XSI, XDI,
+ "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)",
+ /* FIXME: breaks unless compiling with -mmmx
+ "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", */
+ "memory", "cc");
}
diff --git a/gst/deinterlace2/tvtime/greedyh.c b/gst/deinterlace2/tvtime/greedyh.c
index 623c2d8b..f9d33e74 100644
--- a/gst/deinterlace2/tvtime/greedyh.c
+++ b/gst/deinterlace2/tvtime/greedyh.c
@@ -41,51 +41,244 @@
#include "gstdeinterlace2.h"
#include "speedy.h"
+/* Fixed tuning parameters of the greedyh method (same values as the
+ * *_DEFAULT macros they replace).
+ *
+ * GreedyMaxComb         - how far the chosen weave pixel may lie outside the
+ *                         min/max range of the lines above and below
+ * GreedyMotionThreshold - amount subtracted from the weave pixel change
+ *                         before it counts as motion
+ * GreedyMotionSense     - factor applied to the remaining change to get the
+ *                         blend weight
+ */
+static const unsigned int GreedyMaxComb = 5;
+static const unsigned int GreedyMotionThreshold = 25;
+static const unsigned int GreedyMotionSense = 30;
-#define MAXCOMB_DEFAULT 5
-#define MOTIONTHRESHOLD_DEFAULT 25
-#define MOTIONSENSE_DEFAULT 30
+/*
+ * Plain C scanline kernel of the greedyh deinterlacer.
+ *
+ * Produces one output line from packed luma/chroma byte pairs (byte 0 of
+ * each pair is handled as luma, byte 1 as chroma).
+ *
+ * L1   - line above the line being produced
+ * L2   - the weave line (newest field)
+ * L3   - line below the line being produced
+ * L2P  - the previous weave line
+ * Dest - output line
+ * size - line length in bytes; one byte pair is consumed per iteration
+ *
+ * For every pair the function
+ *  1. averages L1/L3 and smooths the result with its horizontal neighbours,
+ *  2. picks whichever of L2/L2P is closer to that average,
+ *  3. clips the pick to the L1/L3 range widened by GreedyMaxComb,
+ *  4. for luma only, blends the clipped value with the average according
+ *     to how much the weave pixel changed between L2P and L2.
+ */
+void
+greedyDScaler_C (uint8_t * L1, uint8_t * L2, uint8_t * L3, uint8_t * L2P,
+ uint8_t * Dest, int size)
+{
+ int Pos;
+ uint8_t l1_l, l1_1_l, l3_l, l3_1_l;
+ uint8_t l1_c, l1_1_c, l3_c, l3_1_c;
+ uint8_t avg_l, avg_c, avg_l_1, avg_c_1;
+ uint8_t avg_l__1 = 0, avg_c__1 = 0;
+ uint8_t avg_s_l, avg_s_c;
+ uint8_t avg_sc_l, avg_sc_c;
+ uint8_t best_l, best_c;
+ uint16_t mov_l;
+ uint8_t out_l, out_c;
+ uint8_t l2_l, l2_c, lp2_l, lp2_c;
+ uint8_t l2_l_diff, l2_c_diff, lp2_l_diff, lp2_c_diff;
+ uint8_t min_l, min_c, max_l, max_c;
+
+ for (Pos = 0; Pos < size; Pos += 2) {
+ l1_l = L1[0];
+ l1_c = L1[1];
+ l3_l = L3[0];
+ l3_c = L3[1];
+
+ /* The last pair has no right-hand neighbour inside the line: reuse the
+ * current pair instead of reading bytes 2 and 3, which lie past the end.
+ * Pos advances in steps of 2, so the last pair is at Pos == size - 2. */
+ if (Pos >= size - 2) {
+ l1_1_l = l1_l;
+ l1_1_c = l1_c;
+ l3_1_l = l3_l;
+ l3_1_c = l3_c;
+ } else {
+ l1_1_l = L1[2];
+ l1_1_c = L1[3];
+ l3_1_l = L3[2];
+ l3_1_c = L3[3];
+ }
+
+ /* Average of L1 and L3 */
+ avg_l = (l1_l + l3_l) / 2;
+ avg_c = (l1_c + l3_c) / 2;
+
+ /* Average of next L1 and next L3 */
+ avg_l_1 = (l1_1_l + l3_1_l) / 2;
+ avg_c_1 = (l1_1_c + l3_1_c) / 2;
+
+ /* Calculate average of one pixel forward and previous */
+ avg_s_l = (avg_l__1 + avg_l_1) / 2;
+ avg_s_c = (avg_c__1 + avg_c_1) / 2;
+
+ /* Calculate average of center and surrounding pixels */
+ avg_sc_l = (avg_l + avg_s_l) / 2;
+ avg_sc_c = (avg_c + avg_s_c) / 2;
+
+ /* move forward */
+ avg_l__1 = avg_l;
+ avg_c__1 = avg_c;
+
+ /* Get best L2/L2P, i.e. least diff from above average */
+ l2_l = L2[0];
+ l2_c = L2[1];
+ lp2_l = L2P[0];
+ lp2_c = L2P[1];
+
+ l2_l_diff = ABS (l2_l - avg_sc_l);
+ l2_c_diff = ABS (l2_c - avg_sc_c);
+
+ lp2_l_diff = ABS (lp2_l - avg_sc_l);
+ lp2_c_diff = ABS (lp2_c - avg_sc_c);
+
+ if (l2_l_diff > lp2_l_diff)
+ best_l = lp2_l;
+ else
+ best_l = l2_l;
+
+ if (l2_c_diff > lp2_c_diff)
+ best_c = lp2_c;
+ else
+ best_c = l2_c;
+
+ /* Clip this best L2/L2P by L1/L3 and allow to differ by GreedyMaxComb */
+ max_l = MAX (l1_l, l3_l);
+ min_l = MIN (l1_l, l3_l);
-unsigned int GreedyMaxComb;
+ if (max_l < 256 - GreedyMaxComb)
+ max_l += GreedyMaxComb;
+ else
+ max_l = 255;
-unsigned int GreedyMotionThreshold;
+ if (min_l > GreedyMaxComb)
+ min_l -= GreedyMaxComb;
+ else
+ min_l = 0;
-unsigned int GreedyMotionSense;
+ max_c = MAX (l1_c, l3_c);
+ min_c = MIN (l1_c, l3_c);
+ if (max_c < 256 - GreedyMaxComb)
+ max_c += GreedyMaxComb;
+ else
+ max_c = 255;
-#define IS_SSE
-#define SSE_TYPE SSE
-#define FUNCT_NAME greedyDScaler_SSE
+ if (min_c > GreedyMaxComb)
+ min_c -= GreedyMaxComb;
+ else
+ min_c = 0;
+
+ out_l = CLAMP (best_l, min_l, max_l);
+ out_c = CLAMP (best_c, min_c, max_c);
+
+ /* Do motion compensation for luma, i.e. how much
+ * the weave pixel differs */
+ mov_l = ABS (l2_l - lp2_l);
+ if (mov_l > GreedyMotionThreshold)
+ mov_l -= GreedyMotionThreshold;
+ else
+ mov_l = 0;
+
+ mov_l = mov_l * GreedyMotionSense;
+ if (mov_l > 256)
+ mov_l = 256;
+
+ /* Weighted sum on clipped weave pixel and average */
+ out_l = (out_l * (256 - mov_l) + avg_sc_l * mov_l) / 256;
+
+ Dest[0] = out_l;
+ Dest[1] = out_c;
+
+ Dest += 2;
+ L1 += 2;
+ L2 += 2;
+ L3 += 2;
+ L2P += 2;
+ }
+}
+
+/* greedyh.asm is a function template: every inclusion below defines one
+ * function called FUNCT_NAME. SIMD_TYPE is pasted onto the V_* macro names
+ * of greedyhmacros.h to pick the instruction variants, and the IS_* flag is
+ * available for #ifdef/#ifndef tests inside the template. All three are
+ * undefined again after each inclusion so the next flavour starts clean. */
+#define IS_MMXEXT
+#define SIMD_TYPE MMXEXT
+#define FUNCT_NAME greedyDScaler_MMXEXT
#include "greedyh.asm"
-#undef SSE_TYPE
-#undef IS_SSE
+#undef SIMD_TYPE
+#undef IS_MMXEXT
#undef FUNCT_NAME
-#define IS_3DNOW
+#define IS_TDNOW
+#define SIMD_TYPE TDNOW
#define FUNCT_NAME greedyDScaler_3DNOW
-#define SSE_TYPE 3DNOW
#include "greedyh.asm"
-#undef SSE_TYPE
-#undef IS_3DNOW
+#undef SIMD_TYPE
+#undef IS_TDNOW
#undef FUNCT_NAME
#define IS_MMX
-#define SSE_TYPE MMX
+#define SIMD_TYPE MMX
#define FUNCT_NAME greedyDScaler_MMX
#include "greedyh.asm"
-#undef SSE_TYPE
+#undef SIMD_TYPE
#undef IS_MMX
#undef FUNCT_NAME
-void
+/*
+ * Builds one output frame in object->out_buf from the field history.
+ *
+ * A scanline kernel is chosen from object->cpu_feature_flags (MMXEXT, then
+ * 3DNow, then MMX, otherwise the C version). Lines that exist in the field
+ * used for L1/L3 are copied with object->pMemcpy; the lines in between are
+ * computed by the kernel from L1, L2, L3 and L2P.
+ *
+ * Reads the three most recent entries of object->field_history (and entry 0
+ * when the newest field is not a bottom field).
+ */
+static void
deinterlace_frame_di_greedyh (GstDeinterlace2 * object)
{
- if (object->cpu_feature_flags & OIL_IMPL_FLAG_SSE) {
- greedyh_filter_sse (object);
+ void (*func) (uint8_t * L1, uint8_t * L2, uint8_t * L3, uint8_t * L2P,
+ uint8_t * Dest, int size);
+
+ int InfoIsOdd = 0;
+ int Line;
+ unsigned int Pitch = object->field_stride;
+
+ unsigned char *L1; // ptr to Line1, of 3
+ unsigned char *L2; // ptr to Line2, the weave line
+ unsigned char *L3; // ptr to Line3
+
+ unsigned char *L2P; // ptr to prev Line2
+ unsigned char *Dest = GST_BUFFER_DATA (object->out_buf);
+
+ // pick the scanline kernel, most capable instruction set first
+ if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) {
+ func = greedyDScaler_MMXEXT;
} else if (object->cpu_feature_flags & OIL_IMPL_FLAG_3DNOW) {
- greedyh_filter_3dnow (object);
+ func = greedyDScaler_3DNOW;
+ } else if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMX) {
+ func = greedyDScaler_MMX;
} else {
- greedyh_filter_mmx (object);
+ func = greedyDScaler_C;
+ }
+
+ // copy first even line no matter what, and the first odd line if we're
+ // processing an EVEN field. (note diff from other deint rtns.)
+
+ if (object->field_history[object->history_count - 1].flags ==
+ PICTURE_INTERLACED_BOTTOM) {
+ InfoIsOdd = 1;
+
+ // L1/L3 come from the second newest field, L2 from the newest field and
+ // L2P from the third newest field
+ L1 = GST_BUFFER_DATA (object->field_history[object->history_count - 2].buf);
+ L2 = GST_BUFFER_DATA (object->field_history[object->history_count - 1].buf);
+ L3 = L1 + Pitch;
+ L2P =
+ GST_BUFFER_DATA (object->field_history[object->history_count - 3].buf);
+
+ // copy first even line
+ object->pMemcpy (Dest, L1, object->line_length);
+ Dest += object->output_stride;
+ } else {
+ InfoIsOdd = 0;
+ // same field assignment as above, but the weave lines start one line
+ // further down
+ L1 = GST_BUFFER_DATA (object->field_history[object->history_count - 2].buf);
+ L2 = GST_BUFFER_DATA (object->field_history[object->history_count -
+ 1].buf) + Pitch;
+ L3 = L1 + Pitch;
+ L2P =
+ GST_BUFFER_DATA (object->field_history[object->history_count - 3].buf) +
+ Pitch;
+
+ // copy first even line
+ object->pMemcpy (Dest, GST_BUFFER_DATA (object->field_history[0].buf),
+ object->line_length);
+ Dest += object->output_stride;
+ // then first odd line
+ object->pMemcpy (Dest, L1, object->line_length);
+ Dest += object->output_stride;
+ }
+
+ // each pass writes one computed line followed by a verbatim copy of L3,
+ // then moves all four source pointers down by one field line
+ for (Line = 0; Line < (object->field_height - 1); ++Line) {
+ func (L1, L2, L3, L2P, Dest, object->line_length);
+ Dest += object->output_stride;
+ object->pMemcpy (Dest, L3, object->line_length);
+ Dest += object->output_stride;
+
+ L1 += Pitch;
+ L2 += Pitch;
+ L3 += Pitch;
+ L2P += Pitch;
+ }
+
+ // a bottom field needs one more line at the end, taken from the weave line
+ if (InfoIsOdd) {
+ object->pMemcpy (Dest, L2, object->line_length);
}
}
@@ -94,7 +287,7 @@ static deinterlace_method_t greedyh_method = {
"Motion Adaptive: Advanced Detection",
"AdaptiveAdvanced",
4,
- OIL_IMPL_FLAG_MMX,
+ 0,
0,
0,
0,
@@ -117,32 +310,5 @@ static deinterlace_method_t greedyh_method = {
+/* Returns a pointer to the file-local greedyh_method descriptor. */
deinterlace_method_t *
dscaler_greedyh_get_method (void)
{
- greedyh_init ();
return &greedyh_method;
}
-
-void
-greedyh_init (void)
-{
- GreedyMaxComb = MAXCOMB_DEFAULT;
- GreedyMotionThreshold = MOTIONTHRESHOLD_DEFAULT;
- GreedyMotionSense = MOTIONSENSE_DEFAULT;
-}
-
-void
-greedyh_filter_mmx (GstDeinterlace2 * object)
-{
- greedyDScaler_MMX (object);
-}
-
-void
-greedyh_filter_3dnow (GstDeinterlace2 * object)
-{
- greedyDScaler_3DNOW (object);
-}
-
-void
-greedyh_filter_sse (GstDeinterlace2 * object)
-{
- greedyDScaler_SSE (object);
-}
diff --git a/gst/deinterlace2/tvtime/greedyhmacros.h b/gst/deinterlace2/tvtime/greedyhmacros.h
index 5f65959c..3f1c72c9 100644
--- a/gst/deinterlace2/tvtime/greedyhmacros.h
+++ b/gst/deinterlace2/tvtime/greedyhmacros.h
@@ -21,7 +21,7 @@
// BEFORE USING THESE YOU MUST SET:
-// #define SSE_TYPE SSE (or MMX or 3DNOW)
+// #define SIMD_TYPE MMXEXT (or MMX or TDNOW)
// some macros for pavgb instruction
// V_PAVGB(mmr1, mmr2, mmr work register, smask) mmr2 may = mmrw if you can trash it
@@ -33,21 +33,21 @@
"pand "smask", "mmr1"\n\t" \
"psrlw $1, "mmr1"\n\t" \
"paddusb "mmrw", "mmr1"\n\t"
-#define V_PAVGB_SSE(mmr1, mmr2, mmrw, smask) "pavgb "mmr2", "mmr1"\n\t"
-#define V_PAVGB_3DNOW(mmr1, mmr2, mmrw, smask) "pavgusb "mmr2", "mmr1"\n\t"
-#define V_PAVGB(mmr1, mmr2, mmrw, smask) V_PAVGB2(mmr1, mmr2, mmrw, smask, SSE_TYPE)
-#define V_PAVGB2(mmr1, mmr2, mmrw, smask, ssetyp) V_PAVGB3(mmr1, mmr2, mmrw, smask, ssetyp)
-#define V_PAVGB3(mmr1, mmr2, mmrw, smask, ssetyp) V_PAVGB_##ssetyp(mmr1, mmr2, mmrw, smask)
+#define V_PAVGB_MMXEXT(mmr1, mmr2, mmrw, smask) "pavgb "mmr2", "mmr1"\n\t"
+#define V_PAVGB_TDNOW(mmr1, mmr2, mmrw, smask) "pavgusb "mmr2", "mmr1"\n\t"
+#define V_PAVGB(mmr1, mmr2, mmrw, smask) V_PAVGB2(mmr1, mmr2, mmrw, smask, SIMD_TYPE)
+#define V_PAVGB2(mmr1, mmr2, mmrw, smask, simdtype) V_PAVGB3(mmr1, mmr2, mmrw, smask, simdtype)
+#define V_PAVGB3(mmr1, mmr2, mmrw, smask, simdtype) V_PAVGB_##simdtype(mmr1, mmr2, mmrw, smask)
// some macros for pmaxub instruction
+// V_PMAXUB(mmr1, mmr2): mmr1 = unsigned byte maximum of mmr1 and mmr2.
+// The MMX form (also used for TDNOW) builds the maximum from saturating
+// arithmetic: psubusb leaves max(mmr1 - mmr2, 0), paddusb adds mmr2 back.
+// V_PMAXUB2/V_PMAXUB3 exist so that SIMD_TYPE is expanded before it is
+// pasted onto the macro name.
#define V_PMAXUB_MMX(mmr1, mmr2) \
"psubusb "mmr2", "mmr1"\n\t" \
"paddusb "mmr2", "mmr1"\n\t"
-#define V_PMAXUB_SSE(mmr1, mmr2) "pmaxub "mmr2", "mmr1"\n\t"
-#define V_PMAXUB_3DNOW(mmr1, mmr2) V_PMAXUB_MMX(mmr1, mmr2) // use MMX version
-#define V_PMAXUB(mmr1, mmr2) V_PMAXUB2(mmr1, mmr2, SSE_TYPE)
-#define V_PMAXUB2(mmr1, mmr2, ssetyp) V_PMAXUB3(mmr1, mmr2, ssetyp)
-#define V_PMAXUB3(mmr1, mmr2, ssetyp) V_PMAXUB_##ssetyp(mmr1, mmr2)
+#define V_PMAXUB_MMXEXT(mmr1, mmr2) "pmaxub "mmr2", "mmr1"\n\t"
+#define V_PMAXUB_TDNOW(mmr1, mmr2) V_PMAXUB_MMX(mmr1, mmr2) // use MMX version
+#define V_PMAXUB(mmr1, mmr2) V_PMAXUB2(mmr1, mmr2, SIMD_TYPE)
+#define V_PMAXUB2(mmr1, mmr2, simdtype) V_PMAXUB3(mmr1, mmr2, simdtype)
+#define V_PMAXUB3(mmr1, mmr2, simdtype) V_PMAXUB_##simdtype(mmr1, mmr2)
// some macros for pminub instruction
// V_PMINUB(mmr1, mmr2, mmr work register) mmr2 may NOT = mmrw
@@ -56,19 +56,19 @@
"psubusb "mmr2", "mmrw"\n\t" \
"paddusb "mmrw", "mmr1"\n\t" \
"psubusb "mmrw", "mmr1"\n\t"
-#define V_PMINUB_SSE(mmr1, mmr2, mmrw) "pminub "mmr2", "mmr1"\n\t"
-#define V_PMINUB_3DNOW(mmr1, mmr2, mmrw) V_PMINUB_MMX(mmr1, mmr2, mmrw) // use MMX version
-#define V_PMINUB(mmr1, mmr2, mmrw) V_PMINUB2(mmr1, mmr2, mmrw, SSE_TYPE)
-#define V_PMINUB2(mmr1, mmr2, mmrw, ssetyp) V_PMINUB3(mmr1, mmr2, mmrw, ssetyp)
-#define V_PMINUB3(mmr1, mmr2, mmrw, ssetyp) V_PMINUB_##ssetyp(mmr1, mmr2, mmrw)
+#define V_PMINUB_MMXEXT(mmr1, mmr2, mmrw) "pminub "mmr2", "mmr1"\n\t"
+#define V_PMINUB_TDNOW(mmr1, mmr2, mmrw) V_PMINUB_MMX(mmr1, mmr2, mmrw) // use MMX version
+#define V_PMINUB(mmr1, mmr2, mmrw) V_PMINUB2(mmr1, mmr2, mmrw, SIMD_TYPE)
+#define V_PMINUB2(mmr1, mmr2, mmrw, simdtype) V_PMINUB3(mmr1, mmr2, mmrw, simdtype)
+#define V_PMINUB3(mmr1, mmr2, mmrw, simdtype) V_PMINUB_##simdtype(mmr1, mmr2, mmrw)
// some macros for movntq instruction
// V_MOVNTQ(mmr1, mmr2)
+// mmr1 is the destination and mmr2 the source (the strings are emitted in
+// AT&T operand order). Only the MMXEXT variant uses the movntq store; the
+// MMX and TDNOW variants use a plain movq.
#define V_MOVNTQ_MMX(mmr1, mmr2) "movq "mmr2", "mmr1"\n\t"
-#define V_MOVNTQ_3DNOW(mmr1, mmr2) "movq "mmr2", "mmr1"\n\t"
-#define V_MOVNTQ_SSE(mmr1, mmr2) "movntq "mmr2", "mmr1"\n\t"
-#define V_MOVNTQ(mmr1, mmr2) V_MOVNTQ2(mmr1, mmr2, SSE_TYPE)
-#define V_MOVNTQ2(mmr1, mmr2, ssetyp) V_MOVNTQ3(mmr1, mmr2, ssetyp)
-#define V_MOVNTQ3(mmr1, mmr2, ssetyp) V_MOVNTQ_##ssetyp(mmr1, mmr2)
+#define V_MOVNTQ_TDNOW(mmr1, mmr2) "movq "mmr2", "mmr1"\n\t"
+#define V_MOVNTQ_MMXEXT(mmr1, mmr2) "movntq "mmr2", "mmr1"\n\t"
+#define V_MOVNTQ(mmr1, mmr2) V_MOVNTQ2(mmr1, mmr2, SIMD_TYPE)
+#define V_MOVNTQ2(mmr1, mmr2, simdtype) V_MOVNTQ3(mmr1, mmr2, simdtype)
+#define V_MOVNTQ3(mmr1, mmr2, simdtype) V_MOVNTQ_##simdtype(mmr1, mmr2)
// end of macros