1 files changed, 205 insertions, 270 deletions
diff --git a/gst/deinterlace2/tvtime/greedyh.asm b/gst/deinterlace2/tvtime/greedyh.asm
index 8fd0ab66..fcd3a647 100644
--- a/gst/deinterlace2/tvtime/greedyh.asm
+++ b/gst/deinterlace2/tvtime/greedyh.asm
@@ -28,281 +28,216 @@
 
 #include "x86-64_macros.inc"
 
-void FUNCT_NAME( GstDeinterlace2 *object)
+void
+FUNCT_NAME (uint8_t * L1, uint8_t * L2, uint8_t * L3, uint8_t * L2P,
+    uint8_t * Dest, int size)
 {
-    int64_t i;
-    int InfoIsOdd = 0;
 
-    // in tight loop some vars are accessed faster in local storage
-    int64_t YMask        = 0x00ff00ff00ff00ffull; // to keep only luma
-    int64_t UVMask       = 0xff00ff00ff00ff00ull; // to keep only chroma
-    int64_t ShiftMask    = 0xfefffefffefffeffull; // to avoid shifting chroma to luma
-    int64_t QW256        = 0x0100010001000100ull; // 4 256's
-
-    // Set up our two parms that are actually evaluated for each pixel
-    i=GreedyMaxComb;
-    int64_t MaxComb = i << 56 | i << 48 | i << 40 | i << 32 | i << 24 | i << 16 | i << 8 | i;
-    
-    i = GreedyMotionThreshold;		// scale to range of 0-257
-    int64_t MotionThreshold = i << 48 | i << 32 | i << 16 | i | UVMask;
-    
-    i = GreedyMotionSense;		// scale to range of 0-257
-    int64_t MotionSense = i << 48 | i << 32 | i << 16 | i;
-
-    int Line;
-    long LoopCtr;
-    unsigned int Pitch = object->field_stride;
-
-    unsigned char* L1;					// ptr to Line1, of 3
-    unsigned char* L2;					// ptr to Line2, the weave line
-    unsigned char* L3;					// ptr to Line3
-
-    unsigned char* L2P;					// ptr to prev Line2
-    unsigned char* Dest = GST_BUFFER_DATA(object->out_buf);
-
-    int64_t QW256B;
-    int64_t LastAvg=0;			//interp value from left qword
-
-    i = 0xffffffff - 256;
-    QW256B =  i << 48 |  i << 32 | i << 16 | i;  // save a couple instr on PMINSW instruct.
-
-
-    // copy first even line no matter what, and the first odd line if we're
-    // processing an EVEN field. (note diff from other deint rtns.)
-
-    if (object->field_history[object->history_count-1].flags == PICTURE_INTERLACED_BOTTOM) {
-      InfoIsOdd = 1;
-
-      L1 = GST_BUFFER_DATA(object->field_history[object->history_count-2].buf);
-      L2 = GST_BUFFER_DATA(object->field_history[object->history_count-1].buf); 
-      L3 = L1 + Pitch;
-      L2P = GST_BUFFER_DATA(object->field_history[object->history_count-3].buf);
-
-      // copy first even line
-      object->pMemcpy(Dest, L1, object->line_length);
-      Dest += object->output_stride;
-    } 
-    else {
-      InfoIsOdd = 0;
-      L1 = GST_BUFFER_DATA(object->field_history[object->history_count-2].buf);
-      L2 = GST_BUFFER_DATA(object->field_history[object->history_count-1].buf) + Pitch; 
-      L3 = L1 + Pitch;
-      L2P = GST_BUFFER_DATA(object->field_history[object->history_count-3].buf) + Pitch;
-
-      // copy first even line
-      object->pMemcpy(Dest, GST_BUFFER_DATA(object->field_history[0].buf), object->line_length);
-      Dest += object->output_stride;
-      // then first odd line
-      object->pMemcpy(Dest, L1, object->line_length);
-      Dest += object->output_stride;
-    }
-
-
-    long oldbx;
-
-    for (Line = 0; Line < (object->field_height - 1); ++Line) {
-        LoopCtr = object->line_length / 8 - 1; // there are LineLength / 8 qwords per line but do 1 less, adj at end of loop
-
-        // For ease of reading, the comments below assume that we're operating on an odd
-        // field (i.e., that InfoIsOdd is true).  Assume the obvious for even lines..
-        __asm__ __volatile__
-            (
-             // save ebx (-fPIC)
-	     MOVX" %%"XBX", %[oldbx]\n\t"
-
-             MOVX"  %[L1],          %%"XAX"\n\t"
-             LEAX"  8(%%"XAX"),     %%"XBX"\n\t"    // next qword needed by DJR
-             MOVX"  %[L3],          %%"XCX"\n\t"
-             SUBX"  %%"XAX",        %%"XCX"\n\t"    // carry L3 addr as an offset
-             MOVX"  %[L2P],         %%"XDX"\n\t"
-             MOVX"  %[L2],          %%"XSI"\n\t"
-             MOVX"  %[Dest],        %%"XDI"\n\t"    // DL1 if Odd or DL2 if Even
-
-             ".align 8\n\t"
-             "1:\n\t"
-
-             "movq  (%%"XSI"),      %%mm0\n\t"      // L2 - the newest weave pixel value
-             "movq  (%%"XAX"),      %%mm1\n\t"      // L1 - the top pixel
-             "movq  (%%"XDX"),      %%mm2\n\t"      // L2P - the prev weave pixel
-             "movq  (%%"XAX", %%"XCX"), %%mm3\n\t"  // L3, next odd row
-             "movq  %%mm1,          %%mm6\n\t"      // L1 - get simple single pixel interp
-             //	pavgb   mm6, mm3                    // use macro below
-             V_PAVGB ("%%mm6", "%%mm3", "%%mm4", "%[ShiftMask]")
-
-             // DJR - Diagonal Jaggie Reduction
-             // In the event that we are going to use an average (Bob) pixel we do not want a jagged
-             // stair step effect.  To combat this we avg in the 2 horizontally adjacen pixels into the
-             // interpolated Bob mix. This will do horizontal smoothing for only the Bob'd pixels.
-
-             "movq  %[LastAvg],     %%mm4\n\t"      // the bob value from prev qword in row
-             "movq  %%mm6,          %[LastAvg]\n\t" // save for next pass
-             "psrlq $48,            %%mm4\n\t"      // right justify 1 pixel
-             "movq  %%mm6,          %%mm7\n\t"      // copy of simple bob pixel
-             "psllq $16,            %%mm7\n\t"      // left justify 3 pixels
-             "por   %%mm7,          %%mm4\n\t"      // and combine
-
-             "movq  (%%"XBX"),      %%mm5\n\t"      // next horiz qword from L1
-             //			pavgb   mm5, qword ptr[ebx+ecx] // next horiz qword from L3, use macro below
-             V_PAVGB ("%%mm5", "(%%"XBX",%%"XCX")", "%%mm7", "%[ShiftMask]")
-             "psllq $48,            %%mm5\n\t"      // left just 1 pixel
-             "movq  %%mm6,          %%mm7\n\t"      // another copy of simple bob pixel
-             "psrlq $16,            %%mm7\n\t"      // right just 3 pixels
-             "por   %%mm7,          %%mm5\n\t"      // combine
-             //			pavgb	mm4, mm5			// avg of forward and prev by 1 pixel, use macro
-             V_PAVGB ("%%mm4", "%%mm5", "%%mm5", "%[ShiftMask]")   // mm5 gets modified if MMX
-             //			pavgb	mm6, mm4			// avg of center and surround interp vals, use macro
-             V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]")
-
-             // Don't do any more averaging than needed for mmx. It hurts performance and causes rounding errors.
+  // in tight loop some vars are accessed faster in local storage
+  int64_t YMask = 0x00ff00ff00ff00ffull;        // to keep only luma
+  int64_t UVMask = 0xff00ff00ff00ff00ull;       // to keep only chroma
+  int64_t ShiftMask = 0xfefefefefefefefeull;    // to avoid shifting chroma to luma
+  int64_t QW256 = 0x0100010001000100ull;        // 4 256's
+  int64_t MaxComb;
+  int64_t MotionThreshold;
+  int64_t MotionSense;
+  int64_t i;
+  long LoopCtr;
+  long oldbx;
+
+  int64_t QW256B;
+  int64_t LastAvg = 0;          //interp value from left qword
+
+  // Set up our two parms that are actually evaluated for each pixel
+  i = GreedyMaxComb;
+  MaxComb =
+      i << 56 | i << 48 | i << 40 | i << 32 | i << 24 | i << 16 | i << 8 | i;
+
+  i = GreedyMotionThreshold;    // scale to range of 0-257
+  MotionThreshold = i << 48 | i << 32 | i << 16 | i | UVMask;
+
+  i = GreedyMotionSense;        // scale to range of 0-257
+  MotionSense = i << 48 | i << 32 | i << 16 | i;
+
+  i = 0xffffffff - 256;
+  QW256B = i << 48 | i << 32 | i << 16 | i;     // save a couple instr on PMINSW instruct.
+
+  LoopCtr = size / 8 - 1;       // there are LineLength / 8 qwords per line but do 1 less, adj at end of loop
+
+  // For ease of reading, the comments below assume that we're operating on an odd
+  // field (i.e., that InfoIsOdd is true).  Assume the obvious for even lines..
+  __asm__ __volatile__ (
+      // save ebx (-fPIC)
+      MOVX " %%" XBX ", %[oldbx]\n\t"
+      MOVX "  %[L1],          %%" XAX "\n\t"
+      LEAX "  8(%%" XAX "),     %%" XBX "\n\t"   // next qword needed by DJR
+      MOVX "  %[L3],          %%" XCX "\n\t"
+      SUBX "  %%" XAX ",        %%" XCX "\n\t"   // carry L3 addr as an offset
+      MOVX "  %[L2P],         %%" XDX "\n\t"
+      MOVX "  %[L2],          %%" XSI "\n\t"
+      MOVX "  %[Dest],        %%" XDI "\n\t"      // DL1 if Odd or DL2 if Even
+
+      ".align 8\n\t"
+      "1:\n\t"
+      "movq  (%%" XSI "),      %%mm0\n\t"       // L2 - the newest weave pixel value
+      "movq  (%%" XAX "),      %%mm1\n\t"       // L1 - the top pixel
+      "movq  (%%" XDX "),      %%mm2\n\t"       // L2P - the prev weave pixel
+      "movq  (%%" XAX ", %%" XCX "), %%mm3\n\t" // L3, next odd row
+      "movq  %%mm1,          %%mm6\n\t"         // L1 - get simple single pixel interp
+
+      //        pavgb   mm6, mm3                    // use macro below
+      V_PAVGB ("%%mm6", "%%mm3", "%%mm4", "%[ShiftMask]")
+
+      // DJR - Diagonal Jaggie Reduction
+      // In the event that we are going to use an average (Bob) pixel we do not want a jagged
+      // stair step effect.  To combat this we avg in the 2 horizontally adjacen pixels into the
+      // interpolated Bob mix. This will do horizontal smoothing for only the Bob'd pixels.
+
+      "movq  %[LastAvg],     %%mm4\n\t" // the bob value from prev qword in row
+      "movq  %%mm6,          %[LastAvg]\n\t"    // save for next pass
+      "psrlq $48,            %%mm4\n\t" // right justify 1 pixel
+      "movq  %%mm6,          %%mm7\n\t" // copy of simple bob pixel
+      "psllq $16,            %%mm7\n\t" // left justify 3 pixels
+      "por   %%mm7,          %%mm4\n\t" // and combine
+      "movq  (%%" XBX "),      %%mm5\n\t"       // next horiz qword from L1
+      // pavgb   mm5, qword ptr[ebx+ecx] // next horiz qword from L3, use macro below
+
+      V_PAVGB ("%%mm5", "(%%" XBX ",%%" XCX ")", "%%mm7", "%[ShiftMask]")
+      "psllq $48,            %%mm5\n\t" // left just 1 pixel
+      "movq  %%mm6,          %%mm7\n\t" // another copy of simple bob pixel
+      "psrlq $16,            %%mm7\n\t" // right just 3 pixels
+      "por   %%mm7,          %%mm5\n\t" // combine
+      // pavgb        mm4, mm5                        // avg of forward and prev by 1 pixel, use macro
+      V_PAVGB ("%%mm4", "%%mm5", "%%mm5", "%[ShiftMask]")       // mm5 gets modified if MMX
+      //                        pavgb        mm6, mm4                        // avg of center and surround interp vals, use macro
+      V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]")
+
+      // Don't do any more averaging than needed for mmx. It hurts performance and causes rounding errors.
 #ifndef IS_MMX
-             //          pavgb	mm4, mm6			// 1/4 center, 3/4 adjacent
-             V_PAVGB ("%%mm4", "%%mm6", "%%mm7", "%[ShiftMask]")
-             //    		pavgb	mm6, mm4			// 3/8 center, 5/8 adjacent
-             V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]")
+      //          pavgb        mm4, mm6                        // 1/4 center, 3/4 adjacent
+      V_PAVGB ("%%mm4", "%%mm6", "%%mm7", "%[ShiftMask]")
+      //                    pavgb        mm6, mm4                        // 3/8 center, 5/8 adjacent
+      V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]")
 #endif
 
-             // get abs value of possible L2 comb
-             "movq    %%mm6,        %%mm4\n\t"      // work copy of interp val
-             "movq    %%mm2,        %%mm7\n\t"      // L2
-             "psubusb %%mm4,        %%mm7\n\t"      // L2 - avg
-             "movq    %%mm4,        %%mm5\n\t"      // avg
-             "psubusb %%mm2,        %%mm5\n\t"      // avg - L2
-             "por     %%mm7,        %%mm5\n\t"      // abs(avg-L2)
-
-             // get abs value of possible L2P comb
-             "movq    %%mm0,        %%mm7\n\t"      // L2P
-             "psubusb %%mm4,        %%mm7\n\t"      // L2P - avg
-             "psubusb %%mm0,        %%mm4\n\t"      // avg - L2P
-             "por     %%mm7,        %%mm4\n\t"      // abs(avg-L2P)
-
-             // use L2 or L2P depending upon which makes smaller comb
-             "psubusb %%mm5,        %%mm4\n\t"      // see if it goes to zero
-             "psubusb %%mm5,        %%mm5\n\t"      // 0
-             "pcmpeqb %%mm5,        %%mm4\n\t"      // if (mm4=0) then FF else 0
-             "pcmpeqb %%mm4,        %%mm5\n\t"      // opposite of mm4
-
-             // if Comb(L2P) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55
-             "pand    %%mm2,        %%mm5\n\t"      // use L2 if mm5 == ff, else 0
-             "pand    %%mm0,        %%mm4\n\t"      // use L2P if mm4 = ff, else 0
-             "por     %%mm5,        %%mm4\n\t"      // may the best win
-
-             // Inventory: at this point we have the following values:
-             // mm0 = L2P (or L2)
-             // mm1 = L1
-             // mm2 = L2 (or L2P)
-             // mm3 = L3
-             // mm4 = the best of L2,L2P weave pixel, base upon comb
-             // mm6 = the avg interpolated value, if we need to use it
-
-             // Let's measure movement, as how much the weave pixel has changed
-             "movq    %%mm2,        %%mm7\n\t"
-             "psubusb %%mm0,        %%mm2\n\t"
-             "psubusb %%mm7,        %%mm0\n\t"
-             "por     %%mm2,        %%mm0\n\t"      // abs value of change, used later
-
-             // Now lets clip our chosen value to be not outside of the range
-             // of the high/low range L1-L3 by more than MaxComb.
-             // This allows some comb but limits the damages and also allows more
-             // detail than a boring oversmoothed clip.
-             "movq    %%mm1,        %%mm2\n\t"      // copy L1
-             //	pmaxub mm2, mm3                     // use macro
-             V_PMAXUB ("%%mm2", "%%mm3")            // now = Max(L1,L3)
-             "movq    %%mm1,        %%mm5\n\t"      // copy L1
-             // pminub	mm5, mm3                    // now = Min(L1,L3), use macro
-             V_PMINUB ("%%mm5", "%%mm3", "%%mm7")
-             // allow the value to be above the high or below the low by amt of MaxComb
-             "psubusb %[MaxComb],   %%mm5\n\t"      // lower min by diff
-             "paddusb %[MaxComb],   %%mm2\n\t"      // increase max by diff
-             // pmaxub	mm4, mm5                    // now = Max(best,Min(L1,L3) use macro
-             V_PMAXUB ("%%mm4", "%%mm5")
-             // pminub	mm4, mm2                    // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped
-             V_PMINUB ("%%mm4", "%%mm2", "%%mm7")
-
-             // Blend weave pixel with bob pixel, depending on motion val in mm0
-             "psubusb %[MotionThreshold], %%mm0\n\t"// test Threshold, clear chroma change >>>??
-             "pmullw  %[MotionSense], %%mm0\n\t"    // mul by user factor, keep low 16 bits
-             "movq    %[QW256], %%mm7\n\t"
-#ifdef HAVE_SSE
-             "pminsw  %%mm7,        %%mm0\n\t"      // max = 256
+      // get abs value of possible L2 comb
+      "movq    %%mm6,        %%mm4\n\t" // work copy of interp val
+      "movq    %%mm2,        %%mm7\n\t" // L2
+      "psubusb %%mm4,        %%mm7\n\t" // L2 - avg
+      "movq    %%mm4,        %%mm5\n\t" // avg
+      "psubusb %%mm2,        %%mm5\n\t" // avg - L2
+      "por     %%mm7,        %%mm5\n\t" // abs(avg-L2)
+
+      // get abs value of possible L2P comb
+      "movq    %%mm0,        %%mm7\n\t" // L2P
+      "psubusb %%mm4,        %%mm7\n\t" // L2P - avg
+      "psubusb %%mm0,        %%mm4\n\t" // avg - L2P
+      "por     %%mm7,        %%mm4\n\t" // abs(avg-L2P)
+
+      // use L2 or L2P depending upon which makes smaller comb
+      "psubusb %%mm5,        %%mm4\n\t" // see if it goes to zero
+      "psubusb %%mm5,        %%mm5\n\t" // 0
+      "pcmpeqb %%mm5,        %%mm4\n\t" // if (mm4=0) then FF else 0
+      "pcmpeqb %%mm4,        %%mm5\n\t" // opposite of mm4
+
+      // if Comb(L2P) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55
+      "pand    %%mm2,        %%mm5\n\t" // use L2 if mm5 == ff, else 0
+      "pand    %%mm0,        %%mm4\n\t" // use L2P if mm4 = ff, else 0
+      "por     %%mm5,        %%mm4\n\t" // may the best win
+
+      // Inventory: at this point we have the following values:
+      // mm0 = L2P (or L2)
+      // mm1 = L1
+      // mm2 = L2 (or L2P)
+      // mm3 = L3
+      // mm4 = the best of L2,L2P weave pixel, base upon comb
+      // mm6 = the avg interpolated value, if we need to use it
+      // Let's measure movement, as how much the weave pixel has changed
+
+      "movq    %%mm2,        %%mm7\n\t"
+      "psubusb %%mm0,        %%mm2\n\t"
+      "psubusb %%mm7,        %%mm0\n\t"
+      "por     %%mm2,        %%mm0\n\t"   // abs value of change, used later
+
+      // Now lets clip our chosen value to be not outside of the range
+      // of the high/low range L1-L3 by more than MaxComb.
+      // This allows some comb but limits the damages and also allows more
+      // detail than a boring oversmoothed clip.
+
+      "movq    %%mm1,        %%mm2\n\t" // copy L1
+      // pmaxub mm2, mm3                     // use macro
+      V_PMAXUB ("%%mm2", "%%mm3")       // now = Max(L1,L3)
+      "movq    %%mm1,        %%mm5\n\t" // copy L1
+      // pminub        mm5, mm3                    // now = Min(L1,L3), use macro
+      V_PMINUB ("%%mm5", "%%mm3", "%%mm7")
+
+      // allow the value to be above the high or below the low by amt of MaxComb
+      "psubusb %[MaxComb],   %%mm5\n\t" // lower min by diff
+      "paddusb %[MaxComb],   %%mm2\n\t" // increase max by diff
+      // pmaxub        mm4, mm5         // now = Max(best,Min(L1,L3) use macro
+      V_PMAXUB ("%%mm4", "%%mm5")
+      // pminub        mm4, mm2         // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped
+      V_PMINUB ("%%mm4", "%%mm2", "%%mm7")
+
+      // Blend weave pixel with bob pixel, depending on motion val in mm0
+      "psubusb %[MotionThreshold], %%mm0\n\t"   // test Threshold, clear chroma change >>>??
+      "pmullw  %[MotionSense], %%mm0\n\t"       // mul by user factor, keep low 16 bits
+      "movq    %[QW256], %%mm7\n\t"
+#if SIMD_TYPE == MMXEXT
+      "pminsw  %%mm7,        %%mm0\n\t" // max = 256
 #else
-             "paddusw %[QW256B],    %%mm0\n\t"      // add, may sat at fff..
-             "psubusw %[QW256B],    %%mm0\n\t"      // now = Min(L1,256)
-#endif
-             "psubusw %%mm0,        %%mm7\n\t"      // so the 2 sum to 256, weighted avg
-             "movq    %%mm4,        %%mm2\n\t"      // save weave chroma info before trashing
-             "pand    %[YMask],     %%mm4\n\t"      // keep only luma from calc'd value
-             "pmullw  %%mm7,        %%mm4\n\t"      // use more weave for less motion
-             "pand    %[YMask],     %%mm6\n\t"      // keep only luma from calc'd value
-             "pmullw  %%mm0,        %%mm6\n\t"      // use more bob for large motion
-             "paddusw %%mm6,        %%mm4\n\t"      // combine
-             "psrlw   $8,           %%mm4\n\t"      // div by 256 to get weighted avg
-
-             // chroma comes from weave pixel
-             "pand    %[UVMask],    %%mm2\n\t"      // keep chroma
-             "por     %%mm4,        %%mm2\n\t"      // and combine
-
-             V_MOVNTQ ("(%%"XDI")", "%%mm2")        // move in our clipped best, use macro
-
-             // bump ptrs and loop
-             LEAX"    8(%%"XAX"),   %%"XAX"\n\t"
-             LEAX"    8(%%"XBX"),   %%"XBX"\n\t"
-             LEAX"    8(%%"XDX"),   %%"XDX"\n\t"
-             LEAX"    8(%%"XDI"),   %%"XDI"\n\t"
-             LEAX"    8(%%"XSI"),   %%"XSI"\n\t"
-             DECX"    %[LoopCtr]\n\t"
-             "jg      1b\n\t"                       // loop if not to last line
-                                                    // note P-III default assumes backward branches taken
-             "jl      1f\n\t"                       // done
-             MOVX"    %%"XAX",      %%"XBX"\n\t"  // sharpness lookahead 1 byte only, be wrong on 1
-             "jmp     1b\n\t"
-
-             "1:\n\t"
-	     MOVX" %[oldbx], %%"XBX"\n\t"
-
-             : /* no outputs */
-
-             : [LastAvg]         "m"(LastAvg),
-               [L1]              "m"(L1),
-               [L3]              "m"(L3),
-               [L2P]             "m"(L2P),
-               [L2]              "m"(L2),
-               [Dest]            "m"(Dest),
-               [ShiftMask]       "m"(ShiftMask),
-               [MaxComb]         "m"(MaxComb),
-               [MotionThreshold] "m"(MotionThreshold),
-               [MotionSense]     "m"(MotionSense),
-               [QW256B]          "m"(QW256B),
-               [YMask]           "m"(YMask),
-               [UVMask]          "m"(UVMask),
-               [LoopCtr]         "m"(LoopCtr),
-               [QW256]           "m"(QW256),
-	       [oldbx]           "m"(oldbx)
-
-             : XAX, XCX, XDX, XSI, XDI,
-#ifdef HAVE_CPU_I386
-               "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)",
-#endif
-               /* FIXME: breaks unless compiling with -mmmx
-	          "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", */
-               "memory", "cc"
-            );
-
-        Dest += object->output_stride;
-        object->pMemcpy(Dest, L3, object->line_length);
-        Dest += object->output_stride;
-
-        L1  += Pitch;
-        L2  += Pitch;
-        L3  += Pitch;
-        L2P += Pitch;
-	}
-
-    if (InfoIsOdd) {
-        object->pMemcpy(Dest, L2, object->line_length);
-    }
-
-    // clear out the MMX registers ready for doing floating point again
-#ifdef HAVE_CPU_I386
-    __asm__ __volatile__ ("emms\n\t");
+      "paddusw %[QW256B],    %%mm0\n\t" // add, may sat at fff..
+      "psubusw %[QW256B],    %%mm0\n\t" // now = Min(L1,256)
 #endif
+      "psubusw %%mm0,        %%mm7\n\t" // so the 2 sum to 256, weighted avg
+      "movq    %%mm4,        %%mm2\n\t" // save weave chroma info before trashing
+      "pand    %[YMask],     %%mm4\n\t" // keep only luma from calc'd value
+      "pmullw  %%mm7,        %%mm4\n\t" // use more weave for less motion
+      "pand    %[YMask],     %%mm6\n\t" // keep only luma from calc'd value
+      "pmullw  %%mm0,        %%mm6\n\t" // use more bob for large motion
+      "paddusw %%mm6,        %%mm4\n\t" // combine
+      "psrlw   $8,           %%mm4\n\t" // div by 256 to get weighted avg
+      // chroma comes from weave pixel
+      "pand    %[UVMask],    %%mm2\n\t" // keep chroma
+      "por     %%mm4,        %%mm2\n\t" // and combine
+      V_MOVNTQ ("(%%" XDI ")", "%%mm2") // move in our clipped best, use macro
+      // bump ptrs and loop
+      LEAX "    8(%%" XAX "),   %%" XAX "\n\t"
+      LEAX "    8(%%" XBX "),   %%" XBX "\n\t"
+      LEAX "    8(%%" XDX "),   %%" XDX "\n\t"
+      LEAX "    8(%%" XDI "),   %%" XDI "\n\t"
+      LEAX "    8(%%" XSI "),   %%" XSI "\n\t"
+      DECX "    %[LoopCtr]\n\t"
+      
+      "jg      1b\n\t"   // loop if not to last line
+      // note P-III default assumes backward branches taken
+      "jl      1f\n\t"          // done
+      MOVX "    %%" XAX ",      %%" XBX "\n\t"  // sharpness lookahead 1 byte only, be wrong on 1
+      "jmp     1b\n\t"
+      
+      "1:\n\t"      
+      MOVX " %[oldbx], %%" XBX "\n\t"
+      "emms\n\t":     /* no outputs */
+
+      :[LastAvg] "m" (LastAvg),
+       [L1] "m" (L1),
+       [L3] "m" (L3),
+       [L2P] "m" (L2P),
+       [L2] "m" (L2),
+       [Dest] "m" (Dest),
+       [ShiftMask] "m" (ShiftMask),
+       [MaxComb] "m" (MaxComb),
+       [MotionThreshold] "m" (MotionThreshold),
+       [MotionSense] "m" (MotionSense),
+       [QW256B] "m" (QW256B),
+       [YMask] "m" (YMask),
+       [UVMask] "m" (UVMask),
+       [LoopCtr] "m" (LoopCtr),
+       [QW256] "m" (QW256),
+       [oldbx] "m" (oldbx)
+      : XAX, XCX, XDX, XSI, XDI,
+      "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)",
+      /* FIXME: breaks unless compiling with -mmmx
+         "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", */
+      "memory", "cc");
 }