summaryrefslogtreecommitdiffstats
path: root/gst/deinterlace2/tvtime/greedyh.asm
diff options
context:
space:
mode:
authorMartin Eikermann <meiker@upb.de>2008-06-11 11:12:14 +0000
committerSebastian Dröge <slomo@circular-chaos.org>2008-06-11 11:12:14 +0000
commit70ff21342117866ee939f9f7597ee487bae31757 (patch)
tree84c75fd8ce74451d34424c41a793e5462fb86a6b /gst/deinterlace2/tvtime/greedyh.asm
parenta14d311c96c21ae1c700917d4007ae0d3f9a02a8 (diff)
downloadgst-plugins-bad-70ff21342117866ee939f9f7597ee487bae31757.tar.gz
gst-plugins-bad-70ff21342117866ee939f9f7597ee487bae31757.tar.bz2
gst-plugins-bad-70ff21342117866ee939f9f7597ee487bae31757.zip
gst/deinterlace2/: Add a deinterlacer plugin based on the tvtime/DScaler deinterlacer, which was relicensed to LGPL f...
Original commit message from CVS: Based on a patch by: Martin Eikermann <meiker at upb dot de> * gst/deinterlace2/Makefile.am: * gst/deinterlace2/gstdeinterlace2.c: (gst_deinterlace2_method_get_type), (gst_deinterlace2_fields_get_type), (gst_deinterlace2_field_layout_get_type), (gst_deinterlace2_base_init), (gst_deinterlace2_class_init), (gst_deinterlace2_init), (gst_deinterlace2_set_method), (gst_deinterlace2_set_property), (gst_deinterlace2_get_property), (gst_deinterlace2_finalize), (gst_deinterlace2_pop_history), (gst_deinterlace2_head_history), (gst_deinterlace2_push_history), (gst_deinterlace2_deinterlace_scanlines), (gst_deinterlace2_chain), (gst_deinterlace2_setcaps), (gst_deinterlace2_sink_event), (gst_deinterlace2_change_state), (gst_deinterlace2_src_event), (gst_deinterlace2_src_query), (gst_deinterlace2_src_query_types), (plugin_init): * gst/deinterlace2/gstdeinterlace2.h: * gst/deinterlace2/tvtime/greedy.c: (copy_scanline), (deinterlace_greedy_packed422_scanline_mmxext), (dscaler_greedyl_get_method): * gst/deinterlace2/tvtime/greedyh.asm: * gst/deinterlace2/tvtime/greedyh.c: (deinterlace_frame_di_greedyh), (dscaler_greedyh_get_method), (greedyh_init), (greedyh_filter_mmx), (greedyh_filter_3dnow), (greedyh_filter_sse): * gst/deinterlace2/tvtime/greedyh.h: * gst/deinterlace2/tvtime/greedyhmacros.h: * gst/deinterlace2/tvtime/mmx.h: * gst/deinterlace2/tvtime/plugins.h: * gst/deinterlace2/tvtime/speedtools.h: * gst/deinterlace2/tvtime/speedy.c: (multiply_alpha), (clip255), (comb_factor_packed422_scanline_mmx), (diff_factor_packed422_scanline_c), (diff_factor_packed422_scanline_mmx), (diff_packed422_block8x8_mmx), (diff_packed422_block8x8_c), (packed444_to_packed422_scanline_c), (packed422_to_packed444_scanline_c), (packed422_to_packed444_rec601_scanline_c), (vfilter_chroma_121_packed422_scanline_mmx), (vfilter_chroma_121_packed422_scanline_c), (vfilter_chroma_332_packed422_scanline_mmx), (vfilter_chroma_332_packed422_scanline_c), (kill_chroma_packed422_inplace_scanline_mmx), (kill_chroma_packed422_inplace_scanline_c), (invert_colour_packed422_inplace_scanline_mmx), (invert_colour_packed422_inplace_scanline_c), (mirror_packed422_inplace_scanline_c), (interpolate_packed422_scanline_c), (convert_uyvy_to_yuyv_scanline_mmx), (convert_uyvy_to_yuyv_scanline_c), (interpolate_packed422_scanline_mmx), (interpolate_packed422_scanline_mmxext), (blit_colour_packed422_scanline_c), (blit_colour_packed422_scanline_mmx), (blit_colour_packed422_scanline_mmxext), (blit_colour_packed4444_scanline_c), (blit_colour_packed4444_scanline_mmx), (blit_colour_packed4444_scanline_mmxext), (small_memcpy), (speedy_memcpy_c), (speedy_memcpy_mmx), (speedy_memcpy_mmxext), (blit_packed422_scanline_c), (blit_packed422_scanline_mmx), (blit_packed422_scanline_mmxext), (composite_colour4444_alpha_to_packed422_scanline_c), (composite_colour4444_alpha_to_packed422_scanline_mmxext), (composite_packed4444_alpha_to_packed422_scanline_c), (composite_packed4444_alpha_to_packed422_scanline_mmxext), (composite_packed4444_to_packed422_scanline_c), (composite_packed4444_to_packed422_scanline_mmxext), (composite_alphamask_to_packed4444_scanline_c), (composite_alphamask_to_packed4444_scanline_mmxext), (composite_alphamask_alpha_to_packed4444_scanline_c), (premultiply_packed4444_scanline_c), (premultiply_packed4444_scanline_mmxext), (blend_packed422_scanline_c), (blend_packed422_scanline_mmxext), (quarter_blit_vertical_packed422_scanline_mmxext), (quarter_blit_vertical_packed422_scanline_c), (subpix_blit_vertical_packed422_scanline_c), (a8_subpix_blit_scanline_c), (myround), (init_RGB_to_YCbCr_tables), (init_YCbCr_to_RGB_tables), (rgb24_to_packed444_rec601_scanline_c), (rgba32_to_packed4444_rec601_scanline_c), (packed444_to_rgb24_rec601_scanline_c), (packed444_to_nonpremultiplied_packed4444_scanline_c), (aspect_adjust_packed4444_scanline_c), (setup_speedy_calls), (speedy_get_accel): * gst/deinterlace2/tvtime/speedy.h: * gst/deinterlace2/tvtime/sse.h: * gst/deinterlace2/tvtime/tomsmocomp.c: (Fieldcopy), (deinterlace_frame_di_tomsmocomp), (dscaler_tomsmocomp_get_method), (tomsmocomp_init), (tomsmocomp_filter_mmx), (tomsmocomp_filter_3dnow), (tomsmocomp_filter_sse): * gst/deinterlace2/tvtime/tomsmocomp.h: * gst/deinterlace2/tvtime/tomsmocomp/SearchLoop0A.inc: * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopBottom.inc: * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopEdgeA.inc: * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopEdgeA8.inc: * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopOddA.inc: * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopOddA2.inc: * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopOddA6.inc: * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopOddAH.inc: * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopOddAH2.inc: * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopTop.inc: * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopVA.inc: * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopVAH.inc: * gst/deinterlace2/tvtime/tomsmocomp/StrangeBob.inc: * gst/deinterlace2/tvtime/tomsmocomp/TomsMoCompAll.inc: * gst/deinterlace2/tvtime/tomsmocomp/TomsMoCompAll2.inc: * gst/deinterlace2/tvtime/tomsmocomp/WierdBob.inc: * gst/deinterlace2/tvtime/vfir.c: (deinterlace_line), (deinterlace_scanline_vfir), (copy_scanline), (dscaler_vfir_get_method): * gst/deinterlace2/tvtime/x86-64_macros.inc: Add a deinterlacer plugin based on the tvtime/DScaler deinterlacer, which was relicensed to LGPL for GStreamer and in theory provides better and faster results than the simple deinterlace element. Fixes bug #163578. Ported to GStreamer 0.10 but still not enabled or included in the build system by default because of bad artefacts caused by a bug somewhere and as it can be only build on x86/amd64 ATM and requires special CFLAGS. Will be fixed soon.
Diffstat (limited to 'gst/deinterlace2/tvtime/greedyh.asm')
-rw-r--r--gst/deinterlace2/tvtime/greedyh.asm307
1 files changed, 307 insertions, 0 deletions
diff --git a/gst/deinterlace2/tvtime/greedyh.asm b/gst/deinterlace2/tvtime/greedyh.asm
new file mode 100644
index 00000000..92ad1fe1
--- /dev/null
+++ b/gst/deinterlace2/tvtime/greedyh.asm
@@ -0,0 +1,307 @@
+/*
+ *
+ * GStreamer
+ * Copyright (c) 2001 Tom Barry. All rights reserved.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+
+/*
+ * Relicensed for GStreamer from GPL to LGPL with permit from Tom Barry.
+ * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578
+ */
+
+
+#include "x86-64_macros.inc"
+
+void FUNCT_NAME( GstDeinterlace2 *object)
+{
+ int64_t i;
+ int InfoIsOdd = 0;
+
+ // in tight loop some vars are accessed faster in local storage
+ int64_t YMask = 0x00ff00ff00ff00ffull; // to keep only luma
+ int64_t UVMask = 0xff00ff00ff00ff00ull; // to keep only chroma
+ int64_t ShiftMask = 0xfefffefffefffeffull; // to avoid shifting chroma to luma
+ int64_t QW256 = 0x0100010001000100ull; // 4 256's
+
+ // Set up our two parms that are actually evaluated for each pixel
+ i=GreedyMaxComb;
+ int64_t MaxComb = i << 56 | i << 48 | i << 40 | i << 32 | i << 24 | i << 16 | i << 8 | i;
+
+ i = GreedyMotionThreshold; // scale to range of 0-257
+ int64_t MotionThreshold = i << 48 | i << 32 | i << 16 | i | UVMask;
+
+ i = GreedyMotionSense; // scale to range of 0-257
+ int64_t MotionSense = i << 48 | i << 32 | i << 16 | i;
+
+ int Line;
+ long LoopCtr;
+ unsigned int Pitch = object->field_stride;
+
+ unsigned char* L1; // ptr to Line1, of 3
+ unsigned char* L2; // ptr to Line2, the weave line
+ unsigned char* L3; // ptr to Line3
+
+ unsigned char* L2P; // ptr to prev Line2
+ unsigned char* Dest = GST_BUFFER_DATA(object->out_buf);
+
+ int64_t QW256B;
+ int64_t LastAvg=0; //interp value from left qword
+
+ i = 0xffffffff - 256;
+ QW256B = i << 48 | i << 32 | i << 16 | i; // save a couple instr on PMINSW instruct.
+
+
+ // copy first even line no matter what, and the first odd line if we're
+ // processing an EVEN field. (note diff from other deint rtns.)
+
+ if (object->field_history[object->history_count-1].flags == PICTURE_INTERLACED_BOTTOM) {
+ InfoIsOdd = 1;
+
+ L1 = GST_BUFFER_DATA(object->field_history[object->history_count-2].buf);
+ L2 = GST_BUFFER_DATA(object->field_history[object->history_count-1].buf);
+ L3 = L1 + Pitch;
+ L2P = GST_BUFFER_DATA(object->field_history[object->history_count-3].buf);
+
+ // copy first even line
+ object->pMemcpy(Dest, L1, object->line_length);
+ Dest += object->output_stride;
+ }
+ else {
+ InfoIsOdd = 0;
+ L1 = GST_BUFFER_DATA(object->field_history[object->history_count-2].buf);
+ L2 = GST_BUFFER_DATA(object->field_history[object->history_count-1].buf) + Pitch;
+ L3 = L1 + Pitch;
+ L2P = GST_BUFFER_DATA(object->field_history[object->history_count-3].buf) + Pitch;
+
+ // copy first even line
+ object->pMemcpy(Dest, GST_BUFFER_DATA(object->field_history[0].buf), object->line_length);
+ Dest += object->output_stride;
+ // then first odd line
+ object->pMemcpy(Dest, L1, object->line_length);
+ Dest += object->output_stride;
+ }
+
+
+ long oldbx;
+
+ for (Line = 0; Line < (object->field_height - 1); ++Line) {
+ LoopCtr = object->line_length / 8 - 1; // there are LineLength / 8 qwords per line but do 1 less, adj at end of loop
+
+ // For ease of reading, the comments below assume that we're operating on an odd
+ // field (i.e., that InfoIsOdd is true). Assume the obvious for even lines..
+ __asm__ __volatile__
+ (
+ // save ebx (-fPIC)
+ MOVX" %%"XBX", %[oldbx]\n\t"
+
+ MOVX" %[L1], %%"XAX"\n\t"
+ LEAX" 8(%%"XAX"), %%"XBX"\n\t" // next qword needed by DJR
+ MOVX" %[L3], %%"XCX"\n\t"
+ SUBX" %%"XAX", %%"XCX"\n\t" // carry L3 addr as an offset
+ MOVX" %[L2P], %%"XDX"\n\t"
+ MOVX" %[L2], %%"XSI"\n\t"
+ MOVX" %[Dest], %%"XDI"\n\t" // DL1 if Odd or DL2 if Even
+
+ ".align 8\n\t"
+ "1:\n\t"
+
+ "movq (%%"XSI"), %%mm0\n\t" // L2 - the newest weave pixel value
+ "movq (%%"XAX"), %%mm1\n\t" // L1 - the top pixel
+ "movq (%%"XDX"), %%mm2\n\t" // L2P - the prev weave pixel
+ "movq (%%"XAX", %%"XCX"), %%mm3\n\t" // L3, next odd row
+ "movq %%mm1, %%mm6\n\t" // L1 - get simple single pixel interp
+ // pavgb mm6, mm3 // use macro below
+ V_PAVGB ("%%mm6", "%%mm3", "%%mm4", "%[ShiftMask]")
+
+ // DJR - Diagonal Jaggie Reduction
+ // In the event that we are going to use an average (Bob) pixel we do not want a jagged
+ // stair step effect. To combat this we avg in the 2 horizontally adjacen pixels into the
+ // interpolated Bob mix. This will do horizontal smoothing for only the Bob'd pixels.
+
+ "movq %[LastAvg], %%mm4\n\t" // the bob value from prev qword in row
+ "movq %%mm6, %[LastAvg]\n\t" // save for next pass
+ "psrlq $48, %%mm4\n\t" // right justify 1 pixel
+ "movq %%mm6, %%mm7\n\t" // copy of simple bob pixel
+ "psllq $16, %%mm7\n\t" // left justify 3 pixels
+ "por %%mm7, %%mm4\n\t" // and combine
+
+ "movq (%%"XBX"), %%mm5\n\t" // next horiz qword from L1
+ // pavgb mm5, qword ptr[ebx+ecx] // next horiz qword from L3, use macro below
+ V_PAVGB ("%%mm5", "(%%"XBX",%%"XCX")", "%%mm7", "%[ShiftMask]")
+ "psllq $48, %%mm5\n\t" // left just 1 pixel
+ "movq %%mm6, %%mm7\n\t" // another copy of simple bob pixel
+ "psrlq $16, %%mm7\n\t" // right just 3 pixels
+ "por %%mm7, %%mm5\n\t" // combine
+ // pavgb mm4, mm5 // avg of forward and prev by 1 pixel, use macro
+ V_PAVGB ("%%mm4", "%%mm5", "%%mm5", "%[ShiftMask]") // mm5 gets modified if MMX
+ // pavgb mm6, mm4 // avg of center and surround interp vals, use macro
+ V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]")
+
+ // Don't do any more averaging than needed for mmx. It hurts performance and causes rounding errors.
+#ifndef IS_MMX
+ // pavgb mm4, mm6 // 1/4 center, 3/4 adjacent
+ V_PAVGB ("%%mm4", "%%mm6", "%%mm7", "%[ShiftMask]")
+ // pavgb mm6, mm4 // 3/8 center, 5/8 adjacent
+ V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]")
+#endif
+
+ // get abs value of possible L2 comb
+ "movq %%mm6, %%mm4\n\t" // work copy of interp val
+ "movq %%mm2, %%mm7\n\t" // L2
+ "psubusb %%mm4, %%mm7\n\t" // L2 - avg
+ "movq %%mm4, %%mm5\n\t" // avg
+ "psubusb %%mm2, %%mm5\n\t" // avg - L2
+ "por %%mm7, %%mm5\n\t" // abs(avg-L2)
+
+ // get abs value of possible L2P comb
+ "movq %%mm0, %%mm7\n\t" // L2P
+ "psubusb %%mm4, %%mm7\n\t" // L2P - avg
+ "psubusb %%mm0, %%mm4\n\t" // avg - L2P
+ "por %%mm7, %%mm4\n\t" // abs(avg-L2P)
+
+ // use L2 or L2P depending upon which makes smaller comb
+ "psubusb %%mm5, %%mm4\n\t" // see if it goes to zero
+ "psubusb %%mm5, %%mm5\n\t" // 0
+ "pcmpeqb %%mm5, %%mm4\n\t" // if (mm4=0) then FF else 0
+ "pcmpeqb %%mm4, %%mm5\n\t" // opposite of mm4
+
+ // if Comb(L2P) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55
+ "pand %%mm2, %%mm5\n\t" // use L2 if mm5 == ff, else 0
+ "pand %%mm0, %%mm4\n\t" // use L2P if mm4 = ff, else 0
+ "por %%mm5, %%mm4\n\t" // may the best win
+
+ // Inventory: at this point we have the following values:
+ // mm0 = L2P (or L2)
+ // mm1 = L1
+ // mm2 = L2 (or L2P)
+ // mm3 = L3
+ // mm4 = the best of L2,L2P weave pixel, base upon comb
+ // mm6 = the avg interpolated value, if we need to use it
+
+ // Let's measure movement, as how much the weave pixel has changed
+ "movq %%mm2, %%mm7\n\t"
+ "psubusb %%mm0, %%mm2\n\t"
+ "psubusb %%mm7, %%mm0\n\t"
+ "por %%mm2, %%mm0\n\t" // abs value of change, used later
+
+ // Now lets clip our chosen value to be not outside of the range
+ // of the high/low range L1-L3 by more than MaxComb.
+ // This allows some comb but limits the damages and also allows more
+ // detail than a boring oversmoothed clip.
+ "movq %%mm1, %%mm2\n\t" // copy L1
+ // pmaxub mm2, mm3 // use macro
+ V_PMAXUB ("%%mm2", "%%mm3") // now = Max(L1,L3)
+ "movq %%mm1, %%mm5\n\t" // copy L1
+ // pminub mm5, mm3 // now = Min(L1,L3), use macro
+ V_PMINUB ("%%mm5", "%%mm3", "%%mm7")
+ // allow the value to be above the high or below the low by amt of MaxComb
+ "psubusb %[MaxComb], %%mm5\n\t" // lower min by diff
+ "paddusb %[MaxComb], %%mm2\n\t" // increase max by diff
+ // pmaxub mm4, mm5 // now = Max(best,Min(L1,L3) use macro
+ V_PMAXUB ("%%mm4", "%%mm5")
+ // pminub mm4, mm2 // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped
+ V_PMINUB ("%%mm4", "%%mm2", "%%mm7")
+
+ // Blend weave pixel with bob pixel, depending on motion val in mm0
+ "psubusb %[MotionThreshold], %%mm0\n\t"// test Threshold, clear chroma change >>>??
+ "pmullw %[MotionSense], %%mm0\n\t" // mul by user factor, keep low 16 bits
+ "movq %[QW256], %%mm7\n\t"
+#ifdef HAVE_SSE
+ "pminsw %%mm7, %%mm0\n\t" // max = 256
+#else
+ "paddusw %[QW256B], %%mm0\n\t" // add, may sat at fff..
+ "psubusw %[QW256B], %%mm0\n\t" // now = Min(L1,256)
+#endif
+ "psubusw %%mm0, %%mm7\n\t" // so the 2 sum to 256, weighted avg
+ "movq %%mm4, %%mm2\n\t" // save weave chroma info before trashing
+ "pand %[YMask], %%mm4\n\t" // keep only luma from calc'd value
+ "pmullw %%mm7, %%mm4\n\t" // use more weave for less motion
+ "pand %[YMask], %%mm6\n\t" // keep only luma from calc'd value
+ "pmullw %%mm0, %%mm6\n\t" // use more bob for large motion
+ "paddusw %%mm6, %%mm4\n\t" // combine
+ "psrlw $8, %%mm4\n\t" // div by 256 to get weighted avg
+
+ // chroma comes from weave pixel
+ "pand %[UVMask], %%mm2\n\t" // keep chroma
+ "por %%mm4, %%mm2\n\t" // and combine
+
+ V_MOVNTQ ("(%%"XDI")", "%%mm2") // move in our clipped best, use macro
+
+ // bump ptrs and loop
+ LEAX" 8(%%"XAX"), %%"XAX"\n\t"
+ LEAX" 8(%%"XBX"), %%"XBX"\n\t"
+ LEAX" 8(%%"XDX"), %%"XDX"\n\t"
+ LEAX" 8(%%"XDI"), %%"XDI"\n\t"
+ LEAX" 8(%%"XSI"), %%"XSI"\n\t"
+ DECX" %[LoopCtr]\n\t"
+ "jg 1b\n\t" // loop if not to last line
+ // note P-III default assumes backward branches taken
+ "jl 1f\n\t" // done
+ MOVX" %%"XAX", %%"XBX"\n\t" // sharpness lookahead 1 byte only, be wrong on 1
+ "jmp 1b\n\t"
+
+ "1:\n\t"
+ MOVX" %[oldbx], %%"XBX"\n\t"
+
+ : /* no outputs */
+
+ : [LastAvg] "m"(LastAvg),
+ [L1] "m"(L1),
+ [L3] "m"(L3),
+ [L2P] "m"(L2P),
+ [L2] "m"(L2),
+ [Dest] "m"(Dest),
+ [ShiftMask] "m"(ShiftMask),
+ [MaxComb] "m"(MaxComb),
+ [MotionThreshold] "m"(MotionThreshold),
+ [MotionSense] "m"(MotionSense),
+ [QW256B] "m"(QW256B),
+ [YMask] "m"(YMask),
+ [UVMask] "m"(UVMask),
+ [LoopCtr] "m"(LoopCtr),
+ [QW256] "m"(QW256),
+ [oldbx] "m"(oldbx)
+
+ : XAX, XCX, XDX, XSI, XDI,
+#ifdef HAVE_CPU_I386
+ "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)",
+#endif
+ "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
+ "memory", "cc"
+ );
+
+ Dest += object->output_stride;
+ object->pMemcpy(Dest, L3, object->line_length);
+ Dest += object->output_stride;
+
+ L1 += Pitch;
+ L2 += Pitch;
+ L3 += Pitch;
+ L2P += Pitch;
+ }
+
+ if (InfoIsOdd) {
+ object->pMemcpy(Dest, L2, object->line_length);
+ }
+
+ // clear out the MMX registers ready for doing floating point again
+#ifdef HAVE_CPU_I386
+ __asm__ __volatile__ ("emms\n\t");
+#endif
+}