summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog123
-rw-r--r--gst/deinterlace2/Makefile.am46
-rw-r--r--gst/deinterlace2/gstdeinterlace2.c893
-rw-r--r--gst/deinterlace2/gstdeinterlace2.h267
-rw-r--r--gst/deinterlace2/tvtime/greedy.c207
-rw-r--r--gst/deinterlace2/tvtime/greedyh.asm307
-rw-r--r--gst/deinterlace2/tvtime/greedyh.c148
-rw-r--r--gst/deinterlace2/tvtime/greedyh.h45
-rw-r--r--gst/deinterlace2/tvtime/greedyhmacros.h74
-rw-r--r--gst/deinterlace2/tvtime/mmx.h723
-rw-r--r--gst/deinterlace2/tvtime/plugins.h42
-rw-r--r--gst/deinterlace2/tvtime/speedtools.h54
-rw-r--r--gst/deinterlace2/tvtime/speedy.c2791
-rw-r--r--gst/deinterlace2/tvtime/speedy.h308
-rw-r--r--gst/deinterlace2/tvtime/sse.h992
-rw-r--r--gst/deinterlace2/tvtime/tomsmocomp.c187
-rw-r--r--gst/deinterlace2/tvtime/tomsmocomp.h61
-rw-r--r--gst/deinterlace2/tvtime/vfir.c184
-rw-r--r--gst/deinterlace2/tvtime/x86-64_macros.inc82
19 files changed, 7534 insertions, 0 deletions
diff --git a/ChangeLog b/ChangeLog
index f29cc8d6..9bd8d513 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,128 @@
2008-06-11 Sebastian Dröge <slomo@circular-chaos.org>
+ Based on a patch by: Martin Eikermann <meiker at upb dot de>
+
+ * gst/deinterlace2/Makefile.am:
+ * gst/deinterlace2/gstdeinterlace2.c:
+ (gst_deinterlace2_method_get_type),
+ (gst_deinterlace2_fields_get_type),
+ (gst_deinterlace2_field_layout_get_type),
+ (gst_deinterlace2_base_init), (gst_deinterlace2_class_init),
+ (gst_deinterlace2_init), (gst_deinterlace2_set_method),
+ (gst_deinterlace2_set_property), (gst_deinterlace2_get_property),
+ (gst_deinterlace2_finalize), (gst_deinterlace2_pop_history),
+ (gst_deinterlace2_head_history), (gst_deinterlace2_push_history),
+ (gst_deinterlace2_deinterlace_scanlines), (gst_deinterlace2_chain),
+ (gst_deinterlace2_setcaps), (gst_deinterlace2_sink_event),
+ (gst_deinterlace2_change_state), (gst_deinterlace2_src_event),
+ (gst_deinterlace2_src_query), (gst_deinterlace2_src_query_types),
+ (plugin_init):
+ * gst/deinterlace2/gstdeinterlace2.h:
+ * gst/deinterlace2/tvtime/greedy.c: (copy_scanline),
+ (deinterlace_greedy_packed422_scanline_mmxext),
+ (dscaler_greedyl_get_method):
+ * gst/deinterlace2/tvtime/greedyh.asm:
+ * gst/deinterlace2/tvtime/greedyh.c:
+ (deinterlace_frame_di_greedyh), (dscaler_greedyh_get_method),
+ (greedyh_init), (greedyh_filter_mmx), (greedyh_filter_3dnow),
+ (greedyh_filter_sse):
+ * gst/deinterlace2/tvtime/greedyh.h:
+ * gst/deinterlace2/tvtime/greedyhmacros.h:
+ * gst/deinterlace2/tvtime/mmx.h:
+ * gst/deinterlace2/tvtime/plugins.h:
+ * gst/deinterlace2/tvtime/speedtools.h:
+ * gst/deinterlace2/tvtime/speedy.c: (multiply_alpha), (clip255),
+ (comb_factor_packed422_scanline_mmx),
+ (diff_factor_packed422_scanline_c),
+ (diff_factor_packed422_scanline_mmx),
+ (diff_packed422_block8x8_mmx), (diff_packed422_block8x8_c),
+ (packed444_to_packed422_scanline_c),
+ (packed422_to_packed444_scanline_c),
+ (packed422_to_packed444_rec601_scanline_c),
+ (vfilter_chroma_121_packed422_scanline_mmx),
+ (vfilter_chroma_121_packed422_scanline_c),
+ (vfilter_chroma_332_packed422_scanline_mmx),
+ (vfilter_chroma_332_packed422_scanline_c),
+ (kill_chroma_packed422_inplace_scanline_mmx),
+ (kill_chroma_packed422_inplace_scanline_c),
+ (invert_colour_packed422_inplace_scanline_mmx),
+ (invert_colour_packed422_inplace_scanline_c),
+ (mirror_packed422_inplace_scanline_c),
+ (interpolate_packed422_scanline_c),
+ (convert_uyvy_to_yuyv_scanline_mmx),
+ (convert_uyvy_to_yuyv_scanline_c),
+ (interpolate_packed422_scanline_mmx),
+ (interpolate_packed422_scanline_mmxext),
+ (blit_colour_packed422_scanline_c),
+ (blit_colour_packed422_scanline_mmx),
+ (blit_colour_packed422_scanline_mmxext),
+ (blit_colour_packed4444_scanline_c),
+ (blit_colour_packed4444_scanline_mmx),
+ (blit_colour_packed4444_scanline_mmxext), (small_memcpy),
+ (speedy_memcpy_c), (speedy_memcpy_mmx), (speedy_memcpy_mmxext),
+ (blit_packed422_scanline_c), (blit_packed422_scanline_mmx),
+ (blit_packed422_scanline_mmxext),
+ (composite_colour4444_alpha_to_packed422_scanline_c),
+ (composite_colour4444_alpha_to_packed422_scanline_mmxext),
+ (composite_packed4444_alpha_to_packed422_scanline_c),
+ (composite_packed4444_alpha_to_packed422_scanline_mmxext),
+ (composite_packed4444_to_packed422_scanline_c),
+ (composite_packed4444_to_packed422_scanline_mmxext),
+ (composite_alphamask_to_packed4444_scanline_c),
+ (composite_alphamask_to_packed4444_scanline_mmxext),
+ (composite_alphamask_alpha_to_packed4444_scanline_c),
+ (premultiply_packed4444_scanline_c),
+ (premultiply_packed4444_scanline_mmxext),
+ (blend_packed422_scanline_c), (blend_packed422_scanline_mmxext),
+ (quarter_blit_vertical_packed422_scanline_mmxext),
+ (quarter_blit_vertical_packed422_scanline_c),
+ (subpix_blit_vertical_packed422_scanline_c),
+ (a8_subpix_blit_scanline_c), (myround), (init_RGB_to_YCbCr_tables),
+ (init_YCbCr_to_RGB_tables), (rgb24_to_packed444_rec601_scanline_c),
+ (rgba32_to_packed4444_rec601_scanline_c),
+ (packed444_to_rgb24_rec601_scanline_c),
+ (packed444_to_nonpremultiplied_packed4444_scanline_c),
+ (aspect_adjust_packed4444_scanline_c), (setup_speedy_calls),
+ (speedy_get_accel):
+ * gst/deinterlace2/tvtime/speedy.h:
+ * gst/deinterlace2/tvtime/sse.h:
+ * gst/deinterlace2/tvtime/tomsmocomp.c: (Fieldcopy),
+ (deinterlace_frame_di_tomsmocomp), (dscaler_tomsmocomp_get_method),
+ (tomsmocomp_init), (tomsmocomp_filter_mmx),
+ (tomsmocomp_filter_3dnow), (tomsmocomp_filter_sse):
+ * gst/deinterlace2/tvtime/tomsmocomp.h:
+ * gst/deinterlace2/tvtime/tomsmocomp/SearchLoop0A.inc:
+ * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopBottom.inc:
+ * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopEdgeA.inc:
+ * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopEdgeA8.inc:
+ * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopOddA.inc:
+ * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopOddA2.inc:
+ * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopOddA6.inc:
+ * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopOddAH.inc:
+ * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopOddAH2.inc:
+ * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopTop.inc:
+ * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopVA.inc:
+ * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopVAH.inc:
+ * gst/deinterlace2/tvtime/tomsmocomp/StrangeBob.inc:
+ * gst/deinterlace2/tvtime/tomsmocomp/TomsMoCompAll.inc:
+ * gst/deinterlace2/tvtime/tomsmocomp/TomsMoCompAll2.inc:
+ * gst/deinterlace2/tvtime/tomsmocomp/WierdBob.inc:
+ * gst/deinterlace2/tvtime/vfir.c: (deinterlace_line),
+ (deinterlace_scanline_vfir), (copy_scanline),
+ (dscaler_vfir_get_method):
+ * gst/deinterlace2/tvtime/x86-64_macros.inc:
+ Add a deinterlacer plugin based on the tvtime/DScaler deinterlacer,
+ which was relicensed to LGPL for GStreamer and in theory provides
+ better and faster results than the simple deinterlace element.
+ Fixes bug #163578.
+
+ Ported to GStreamer 0.10 but still not enabled or included in the
+ build system by default because of bad artefacts caused by a bug
+ somewhere and as it can only be built on x86/amd64 ATM and requires
+ special CFLAGS. Will be fixed soon.
+
+2008-06-11 Sebastian Dröge <slomo@circular-chaos.org>
+
Based on a patch by: Sonicadvance1 at GMAIL dot COM
* ext/timidity/gstwildmidi.c: (gst_wildmidi_init),
diff --git a/gst/deinterlace2/Makefile.am b/gst/deinterlace2/Makefile.am
new file mode 100644
index 00000000..7d625f73
--- /dev/null
+++ b/gst/deinterlace2/Makefile.am
@@ -0,0 +1,46 @@
+# Automake rules for the deinterlace2 plugin (libgstdeinterlace2.la).
+plugin_LTLIBRARIES = libgstdeinterlace2.la
+
+# Plugin sources: the element itself plus the deinterlacing methods under
+# tvtime/, including the .asm/.inc fragments listed alongside them.
+libgstdeinterlace2_la_SOURCES = \
+ gstdeinterlace2.c \
+ tvtime/greedy.c \
+ tvtime/greedyh.asm \
+ tvtime/greedyh.c \
+ tvtime/speedy.c \
+ tvtime/vfir.c \
+ tvtime/x86-64_macros.inc \
+ tvtime/tomsmocomp.c \
+ tvtime/tomsmocomp/SearchLoop0A.inc \
+ tvtime/tomsmocomp/SearchLoopBottom.inc \
+ tvtime/tomsmocomp/SearchLoopEdgeA8.inc \
+ tvtime/tomsmocomp/SearchLoopEdgeA.inc \
+ tvtime/tomsmocomp/SearchLoopOddA2.inc \
+ tvtime/tomsmocomp/SearchLoopOddA6.inc \
+ tvtime/tomsmocomp/SearchLoopOddAH2.inc \
+ tvtime/tomsmocomp/SearchLoopOddAH.inc \
+ tvtime/tomsmocomp/SearchLoopOddA.inc \
+ tvtime/tomsmocomp/SearchLoopTop.inc \
+ tvtime/tomsmocomp/SearchLoopVAH.inc \
+ tvtime/tomsmocomp/SearchLoopVA.inc \
+ tvtime/tomsmocomp/StrangeBob.inc \
+ tvtime/tomsmocomp/TomsMoCompAll2.inc \
+ tvtime/tomsmocomp/TomsMoCompAll.inc \
+ tvtime/tomsmocomp/tomsmocompmacros.h \
+ tvtime/tomsmocomp/WierdBob.inc
+
+# NOTE(review): -march=athlon-xp is hard-coded below, so the plugin is built
+# for that x86 CPU family regardless of the configured host -- confirm this is
+# intended before enabling the directory in the default build.
+libgstdeinterlace2_la_CFLAGS = $(GST_CFLAGS) \
+ $(GST_PLUGINS_BASE_CFLAGS) $(GST_BASE_CFLAGS) $(LIBOIL_CFLAGS) -march=athlon-xp
+libgstdeinterlace2_la_LIBADD = $(GST_LIBS) \
+ $(GST_PLUGINS_BASE_LIBS) -lgstvideo-$(GST_MAJORMINOR) $(GST_BASE_LIBS) $(LIBOIL_LIBS)
+libgstdeinterlace2_la_LDFLAGS = $(GST_PLUGIN_LDFLAGS)
+
+# Headers that are part of the sources but are not installed.
+noinst_HEADERS = \
+ gstdeinterlace2.h \
+ tvtime/mmx.h \
+ tvtime/sse.h \
+ tvtime/greedyh.h \
+ tvtime/greedyhmacros.h \
+ tvtime/plugins.h \
+ tvtime/speedtools.h \
+ tvtime/speedy.h \
+ tvtime/tomsmocomp.h
+
diff --git a/gst/deinterlace2/gstdeinterlace2.c b/gst/deinterlace2/gstdeinterlace2.c
new file mode 100644
index 00000000..450a31c9
--- /dev/null
+++ b/gst/deinterlace2/gstdeinterlace2.c
@@ -0,0 +1,893 @@
+/*
+ * GStreamer
+ * Copyright (C) 2005 Martin Eikermann <meiker@upb.de>
+ * Copyright (C) 2008 Sebastian Dröge <slomo@circular-chaos.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "gstdeinterlace2.h"
+#include <gst/gst.h>
+#include <gst/video/video.h>
+
+#include "tvtime/plugins.h"
+#include "tvtime/speedy.h"
+
+GST_DEBUG_CATEGORY_STATIC (deinterlace2_debug);
+#define GST_CAT_DEFAULT (deinterlace2_debug)
+
+/* Object signals and args */
+/* No signals are declared; LAST_SIGNAL is the only enumerator. */
+enum
+{
+ LAST_SIGNAL
+};
+
+/* Arguments */
+/* Property IDs, i.e. the prop_id values switched on in
+ * gst_deinterlace2_set_property() and gst_deinterlace2_get_property(). */
+enum
+{
+ ARG_0,
+ ARG_METHOD, /* selects the deinterlacing method (object->method_id) */
+ ARG_FIELDS, /* which fields to deinterlace (object->fields) */
+ ARG_FIELD_LAYOUT /* field order of the input (object->field_layout) */
+};
+
+#define GST_TYPE_DEINTERLACE2_METHOD (gst_deinterlace2_method_get_type ())
+/* Returns the GType of the enumeration used by the "method" property,
+ * registering it with the GLib type system on the first call. */
+static GType
+gst_deinterlace2_method_get_type (void)
+{
+  static const GEnumValue method_types[] = {
+    {GST_DEINTERLACE2_TOM, "Toms Motion Compensation", "tomsmc"},
+    {GST_DEINTERLACE2_GREEDY_H, "Greedy High Motion", "greedyh"},
+    {GST_DEINTERLACE2_GREEDY_L, "Greedy Low Motion", "greedyl"},
+    {GST_DEINTERLACE2_VFIR, "Vertical Blur", "vfir"},
+    {0, NULL, NULL},
+  };
+  static GType registered_type = 0;
+
+  /* Register only once; later calls hand back the cached id. */
+  if (registered_type == 0)
+    registered_type =
+        g_enum_register_static ("GstDeinterlace2Methods", method_types);
+
+  return registered_type;
+}
+
+#define GST_TYPE_DEINTERLACE2_FIELDS (gst_deinterlace2_fields_get_type ())
+/* Returns the GType of the enumeration used by the "fields" property,
+ * registering it with the GLib type system on the first call. */
+static GType
+gst_deinterlace2_fields_get_type (void)
+{
+  static const GEnumValue fields_types[] = {
+    {GST_DEINTERLACE2_ALL, "All fields", "all"},
+    {GST_DEINTERLACE2_TF, "Top fields only", "top"},
+    {GST_DEINTERLACE2_BF, "Bottom fields only", "bottom"},
+    {0, NULL, NULL},
+  };
+  static GType registered_type = 0;
+
+  /* Register only once; later calls hand back the cached id. */
+  if (registered_type == 0)
+    registered_type =
+        g_enum_register_static ("GstDeinterlace2Fields", fields_types);
+
+  return registered_type;
+}
+
+#define GST_TYPE_DEINTERLACE2_FIELD_LAYOUT (gst_deinterlace2_field_layout_get_type ())
+/* Returns the GType of the field-layout enumeration (auto / top field first /
+ * bottom field first), registering it with the GLib type system on the first
+ * call. */
+static GType
+gst_deinterlace2_field_layout_get_type (void)
+{
+  static const GEnumValue field_layout_types[] = {
+    {GST_DEINTERLACE2_LAYOUT_AUTO, "Auto detection", "auto"},
+    {GST_DEINTERLACE2_LAYOUT_TFF, "Top field first", "tff"},
+    {GST_DEINTERLACE2_LAYOUT_BFF, "Bottom field first", "bff"},
+    {0, NULL, NULL},
+  };
+  static GType registered_type = 0;
+
+  /* Register only once; later calls hand back the cached id. */
+  if (registered_type == 0)
+    registered_type =
+        g_enum_register_static ("GstDeinterlace2FieldLayout",
+        field_layout_types);
+
+  return registered_type;
+}
+
+/* Source pad template: always present, restricted to YUY2 video caps. */
+static GstStaticPadTemplate src_templ = GST_STATIC_PAD_TEMPLATE ("src",
+ GST_PAD_SRC,
+ GST_PAD_ALWAYS,
+ GST_STATIC_CAPS (GST_VIDEO_CAPS_YUV ("YUY2"))
+ );
+
+/* Sink pad template: always present, restricted to the same YUY2 video caps
+ * as the source pad. */
+static GstStaticPadTemplate sink_templ = GST_STATIC_PAD_TEMPLATE ("sink",
+ GST_PAD_SINK,
+ GST_PAD_ALWAYS,
+ GST_STATIC_CAPS (GST_VIDEO_CAPS_YUV ("YUY2"))
+ );
+
+/* Forward declarations of the GObject vfuncs and pad callbacks defined
+ * further down in this file. */
+static void gst_deinterlace2_finalize (GObject * object);
+
+static void gst_deinterlace2_set_property (GObject * object, guint prop_id,
+ const GValue * value, GParamSpec * pspec);
+static void gst_deinterlace2_get_property (GObject * object, guint prop_id,
+ GValue * value, GParamSpec * pspec);
+
+/* Installed as setcaps function on both the sink and the source pad. */
+static gboolean gst_deinterlace2_setcaps (GstPad * pad, GstCaps * caps);
+
+static gboolean gst_deinterlace2_sink_event (GstPad * pad, GstEvent * event);
+
+static GstFlowReturn gst_deinterlace2_chain (GstPad * pad, GstBuffer * buffer);
+
+static GstStateChangeReturn gst_deinterlace2_change_state (GstElement * element,
+ GstStateChange transition);
+
+static gboolean gst_deinterlace2_src_event (GstPad * pad, GstEvent * event);
+
+static gboolean gst_deinterlace2_src_query (GstPad * pad, GstQuery * query);
+
+static const GstQueryType *gst_deinterlace2_src_query_types (GstPad * pad);
+
+/* Scanline-based fallback used for methods without a deinterlace_frame
+ * callback (see gst_deinterlace2_set_method()). */
+static void gst_deinterlace2_deinterlace_scanlines (GstDeinterlace2 * object);
+
+/* Type boilerplate for GstDeinterlace2, derived from GstElement; the
+ * parent_class pointer used by finalize and change_state presumably comes
+ * from this macro. */
+GST_BOILERPLATE (GstDeinterlace2, gst_deinterlace2, GstElement,
+ GST_TYPE_ELEMENT);
+
+/* Base initialiser of the element class: sets the element details and adds
+ * the static source and sink pad templates to the class. */
+static void
+gst_deinterlace2_base_init (gpointer klass)
+{
+  GstElementClass *eklass = GST_ELEMENT_CLASS (klass);
+  GstPadTemplate *templ;
+
+  gst_element_class_set_details_simple (eklass,
+      "Deinterlacer",
+      "Filter/Video",
+      "Deinterlace Methods ported from DScaler/TvTime",
+      "Martin Eikermann <meiker@upb.de>, "
+      "Sebastian Dröge <slomo@circular-chaos.org>");
+
+  /* Source template first, then sink, as before. */
+  templ = gst_static_pad_template_get (&src_templ);
+  gst_element_class_add_pad_template (eklass, templ);
+
+  templ = gst_static_pad_template_get (&sink_templ);
+  gst_element_class_add_pad_template (eklass, templ);
+}
+
+/* Class initialiser: hooks up the GObject property/finalize handlers, installs
+ * the "method", "fields" and "tff" properties and overrides the element's
+ * change_state vfunc. */
+static void
+gst_deinterlace2_class_init (GstDeinterlace2Class * klass)
+{
+  GObjectClass *gobject_class = (GObjectClass *) klass;
+
+  GstElementClass *element_class = (GstElementClass *) klass;
+
+  gobject_class->set_property = gst_deinterlace2_set_property;
+  gobject_class->get_property = gst_deinterlace2_get_property;
+  gobject_class->finalize = gst_deinterlace2_finalize;
+
+  g_object_class_install_property (gobject_class, ARG_METHOD,
+      g_param_spec_enum ("method",
+          "Method",
+          "Deinterlace Method",
+          GST_TYPE_DEINTERLACE2_METHOD,
+          GST_DEINTERLACE2_GREEDY_H, G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)
+      );
+
+  g_object_class_install_property (gobject_class, ARG_FIELDS,
+      g_param_spec_enum ("fields",
+          "fields",
+          "Fields to use for deinterlacing",
+          GST_TYPE_DEINTERLACE2_FIELDS,
+          GST_DEINTERLACE2_ALL, G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)
+      );
+
+  /* The field layout needs its own property id: installing it under
+   * ARG_FIELDS would route reads and writes of "tff" to the "fields" member
+   * and leave the ARG_FIELD_LAYOUT cases of set/get_property unreachable. */
+  g_object_class_install_property (gobject_class, ARG_FIELD_LAYOUT,
+      g_param_spec_enum ("tff",
+          "tff",
+          "Deinterlace top field first",
+          GST_TYPE_DEINTERLACE2_FIELD_LAYOUT,
+          GST_DEINTERLACE2_LAYOUT_AUTO,
+          G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)
+      );
+
+  element_class->change_state =
+      GST_DEBUG_FUNCPTR (gst_deinterlace2_change_state);
+}
+
+/* Instance initialiser: creates and configures the sink and source pads,
+ * sets up the CPU-specific speedy routines and resets all per-stream state.
+ *
+ * NOTE(review): the instance starts with the tomsmocomp method although the
+ * "method" property is declared with GST_DEINTERLACE2_GREEDY_H as default --
+ * confirm which of the two is intended. */
+static void
+gst_deinterlace2_init (GstDeinterlace2 * object, GstDeinterlace2Class * klass)
+{
+  object->sinkpad = gst_pad_new_from_static_template (&sink_templ, "sink");
+  gst_pad_set_chain_function (object->sinkpad,
+      GST_DEBUG_FUNCPTR (gst_deinterlace2_chain));
+  gst_pad_set_event_function (object->sinkpad,
+      GST_DEBUG_FUNCPTR (gst_deinterlace2_sink_event));
+  gst_pad_set_setcaps_function (object->sinkpad,
+      GST_DEBUG_FUNCPTR (gst_deinterlace2_setcaps));
+  gst_pad_set_getcaps_function (object->sinkpad,
+      GST_DEBUG_FUNCPTR (gst_pad_proxy_getcaps));
+  gst_element_add_pad (GST_ELEMENT (object), object->sinkpad);
+
+  object->srcpad = gst_pad_new_from_static_template (&src_templ, "src");
+  gst_pad_set_event_function (object->srcpad,
+      GST_DEBUG_FUNCPTR (gst_deinterlace2_src_event));
+  gst_pad_set_query_type_function (object->srcpad,
+      GST_DEBUG_FUNCPTR (gst_deinterlace2_src_query_types));
+  gst_pad_set_query_function (object->srcpad,
+      GST_DEBUG_FUNCPTR (gst_deinterlace2_src_query));
+  gst_pad_set_setcaps_function (object->srcpad,
+      GST_DEBUG_FUNCPTR (gst_deinterlace2_setcaps));
+  gst_pad_set_getcaps_function (object->srcpad,
+      GST_DEBUG_FUNCPTR (gst_pad_proxy_getcaps));
+  gst_element_add_pad (GST_ELEMENT (object), object->srcpad);
+
+  gst_element_no_more_pads (GST_ELEMENT (object));
+
+  object->cpu_feature_flags = oil_cpu_get_flags ();
+
+  setup_speedy_calls (object->cpu_feature_flags, 0);
+  object->pMemcpy = speedy_memcpy;
+
+  /* Keep the id reported through the "method" property in sync with the
+   * method that is actually installed. */
+  object->method_id = GST_DEINTERLACE2_TOM;
+  object->method = dscaler_tomsmocomp_get_method ();
+
+  object->history_count = 0;
+
+  object->field_layout = GST_DEINTERLACE2_LAYOUT_AUTO;
+
+  object->out_buf = NULL;
+  object->output_stride = 0;
+  object->line_length = 0;
+  object->frame_width = 0;
+  object->frame_height = 0;
+  object->field_height = 0;
+  object->field_stride = 0;
+
+  object->fields = GST_DEINTERLACE2_ALL;
+
+  object->bottom_field = TRUE;
+}
+
+/* Switches the element to the given deinterlacing method.
+ *
+ * For a known method both object->method and object->method_id are updated;
+ * an unknown value only produces a warning and keeps the previous method.
+ * In either case a method without a deinterlace_frame callback afterwards
+ * gets the scanline-based gst_deinterlace2_deinterlace_scanlines(). */
+static void
+gst_deinterlace2_set_method (GstDeinterlace2 * object,
+    GstDeinterlace2Methods method)
+{
+  gboolean known = TRUE;
+
+  switch (method) {
+    case GST_DEINTERLACE2_TOM:
+      object->method = dscaler_tomsmocomp_get_method ();
+      break;
+    case GST_DEINTERLACE2_GREEDY_H:
+      object->method = dscaler_greedyh_get_method ();
+      break;
+    case GST_DEINTERLACE2_GREEDY_L:
+      object->method = dscaler_greedyl_get_method ();
+      break;
+    case GST_DEINTERLACE2_VFIR:
+      object->method = dscaler_vfir_get_method ();
+      break;
+    default:
+      known = FALSE;
+      GST_WARNING ("Invalid Deinterlacer Method");
+      break;
+  }
+
+  if (known)
+    object->method_id = method;
+
+  if (object->method->deinterlace_frame == NULL)
+    object->method->deinterlace_frame = gst_deinterlace2_deinterlace_scanlines;
+
+  /* TODO: if current method requires less fields in the history,
+     pop the diff from field_history.
+   */
+}
+
+/* GObject set_property handler: applies a new value for the method, fields or
+ * field-layout property and warns about any other property id. */
+static void
+gst_deinterlace2_set_property (GObject * _object, guint prop_id,
+    const GValue * value, GParamSpec * pspec)
+{
+  GstDeinterlace2 *self;
+
+  g_return_if_fail (GST_IS_DEINTERLACE2 (_object));
+  self = GST_DEINTERLACE2 (_object);
+
+  if (prop_id == ARG_METHOD) {
+    /* Changing the method also swaps the method implementation. */
+    gst_deinterlace2_set_method (self, g_value_get_enum (value));
+  } else if (prop_id == ARG_FIELDS) {
+    self->fields = g_value_get_enum (value);
+  } else if (prop_id == ARG_FIELD_LAYOUT) {
+    self->field_layout = g_value_get_enum (value);
+  } else {
+    G_OBJECT_WARN_INVALID_PROPERTY_ID (self, prop_id, pspec);
+  }
+}
+
+/* GObject get_property handler: reports the current method id, fields
+ * selection or field layout and warns about any other property id. */
+static void
+gst_deinterlace2_get_property (GObject * _object, guint prop_id,
+    GValue * value, GParamSpec * pspec)
+{
+  GstDeinterlace2 *self;
+
+  g_return_if_fail (GST_IS_DEINTERLACE2 (_object));
+  self = GST_DEINTERLACE2 (_object);
+
+  if (prop_id == ARG_METHOD) {
+    g_value_set_enum (value, self->method_id);
+  } else if (prop_id == ARG_FIELDS) {
+    g_value_set_enum (value, self->fields);
+  } else if (prop_id == ARG_FIELD_LAYOUT) {
+    g_value_set_enum (value, self->field_layout);
+  } else {
+    G_OBJECT_WARN_INVALID_PROPERTY_ID (self, prop_id, pspec);
+  }
+}
+
+/* GObject finalize handler.  The element has nothing of its own to release
+ * here yet, so it only chains up -- to the parent's finalize, not to its
+ * dispose, which is a different vfunc with different semantics. */
+static void
+gst_deinterlace2_finalize (GObject * object)
+{
+  G_OBJECT_CLASS (parent_class)->finalize (object);
+}
+
+/* Removes the last entry (index history_count - 1) from the field history
+ * and returns its buffer.  The history must not be empty. */
+static GstBuffer *
+gst_deinterlace2_pop_history (GstDeinterlace2 * object)
+{
+  GstBuffer *popped;
+
+  g_assert (object->history_count > 0);
+
+  /* After the decrement the counter is exactly the index of the entry that
+   * is being handed out. */
+  object->history_count--;
+  popped = object->field_history[object->history_count].buf;
+
+  GST_DEBUG ("pop, size(history): %d", object->history_count);
+
+  return popped;
+}
+
+/* Disabled helper: would return the buffer of the last history entry (the
+ * one gst_deinterlace2_pop_history() removes) without removing it. */
+#if 0
+static GstBuffer *
+gst_deinterlace2_head_history (GstDeinterlace2 * object)
+{
+ return object->field_history[object->history_count - 1].buf;
+}
+#endif
+
+
+/* Splits an interlaced frame into its two fields and stores them in the first
+ * two slots of the field history, shifting the existing entries back by two.
+ *
+ * One slot holds @buffer itself, the other a sub-buffer of it that starts one
+ * line (object->line_length bytes) further in.  Which slot gets which depends
+ * on the field layout; an undetected (AUTO) layout is switched to top field
+ * first with a warning.
+ *
+ * invariant: field with smallest timestamp is
+ * object->field_history[object->history_count-1]
+ */
+static void
+gst_deinterlace2_push_history (GstDeinterlace2 * object, GstBuffer * buffer)
+{
+  int i;
+
+  GstClockTime timestamp;
+
+  GstClockTime field_diff = 0;
+
+  g_assert (object->history_count < MAX_FIELD_HISTORY - 2);
+
+  /* Make room for the two new fields at the front. */
+  for (i = MAX_FIELD_HISTORY - 1; i >= 2; i--) {
+    object->field_history[i].buf = object->field_history[i - 2].buf;
+    object->field_history[i].flags = object->field_history[i - 2].flags;
+  }
+
+  if (object->field_layout == GST_DEINTERLACE2_LAYOUT_AUTO) {
+    GST_WARNING ("Could not detect field layout. Assuming top field first.");
+    object->field_layout = GST_DEINTERLACE2_LAYOUT_TFF;
+  }
+
+
+  if (object->field_layout == GST_DEINTERLACE2_LAYOUT_TFF) {
+    GST_DEBUG ("Top field first");
+    object->field_history[0].buf =
+        gst_buffer_create_sub (buffer, object->line_length,
+        GST_BUFFER_SIZE (buffer) - object->line_length);
+    object->field_history[0].flags = PICTURE_INTERLACED_BOTTOM;
+    object->field_history[1].buf = buffer;
+    object->field_history[1].flags = PICTURE_INTERLACED_TOP;
+  } else {
+    GST_DEBUG ("Bottom field first");
+    object->field_history[0].buf = buffer;
+    object->field_history[0].flags = PICTURE_INTERLACED_TOP;
+    object->field_history[1].buf =
+        gst_buffer_create_sub (buffer, object->line_length,
+        GST_BUFFER_SIZE (buffer) - object->line_length);
+    object->field_history[1].flags = PICTURE_INTERLACED_BOTTOM;
+  }
+
+  /* Timestamps are assigned to the field buffers under the assumption that
+     the timestamp of the buffer equals the first fields timestamp */
+
+  timestamp = GST_BUFFER_TIMESTAMP (buffer);
+
+  /* A field lasts half a frame: frame_rate_d / (2 * frame_rate_n) seconds.
+   * Without a usable framerate the second field simply keeps the frame's
+   * timestamp instead of dividing by zero. */
+  if (object->frame_rate_n > 0 && object->frame_rate_d > 0)
+    field_diff = gst_util_uint64_scale (GST_SECOND, object->frame_rate_d,
+        2 * (guint64) object->frame_rate_n);
+
+  /* Do not do arithmetic on an invalid timestamp, it would wrap around. */
+  if (GST_CLOCK_TIME_IS_VALID (timestamp))
+    GST_BUFFER_TIMESTAMP (object->field_history[0].buf) =
+        timestamp + field_diff;
+  else
+    GST_BUFFER_TIMESTAMP (object->field_history[0].buf) = GST_CLOCK_TIME_NONE;
+  GST_BUFFER_TIMESTAMP (object->field_history[1].buf) = timestamp;
+
+  object->history_count += 2;
+  GST_DEBUG ("push, size(history): %d", object->history_count);
+}
+
+/* Some methods support only deinterlace_/copy_scanline functions.
+ * This function calls them in the right manner.
+ *
+ * Starting at history index history_count - fields_required it walks the
+ * current field (and, if the method needs them, the next one or two history
+ * entries) line by line and alternately calls the method's
+ * interpolate_scanline and copy_scanline callbacks, writing consecutive
+ * output lines into object->out_buf.  At most 3 fields are supported.
+ */
+static void
+gst_deinterlace2_deinterlace_scanlines (GstDeinterlace2 * object)
+{
+  gint line = 1;
+
+  gint cur_field_idx = object->history_count - object->method->fields_required;
+
+  guint8 *out_data = GST_BUFFER_DATA (object->out_buf);
+
+  guint8 *cur_field =
+      GST_BUFFER_DATA (object->field_history[cur_field_idx].buf);
+  guint8 *last_field = NULL;
+
+  guint8 *second_last_field = NULL;
+
+  GST_INFO ("cur_field_idx: %d", cur_field_idx);
+
+  /* method can just handle up to 3 history fields,
+     bcs until now there isn't a plugin (with interp./copy scanline methods)
+     that uses more */
+  g_assert (object->method->fields_required <= 3);
+
+  if (object->method->fields_required >= 2) {
+    last_field = GST_BUFFER_DATA (object->field_history[cur_field_idx + 1].buf);
+  }
+  if (object->method->fields_required >= 3) {
+    second_last_field =
+        GST_BUFFER_DATA (object->field_history[cur_field_idx + 2].buf);
+  }
+
+  if (object->field_history[cur_field_idx].flags == PICTURE_INTERLACED_BOTTOM) {
+    /* double the first scanline of the bottom field */
+    blit_packed422_scanline (out_data, cur_field, object->frame_width);
+    out_data += object->output_stride;
+  }
+
+  blit_packed422_scanline (out_data, cur_field, object->frame_width);
+  out_data += object->output_stride;
+  line++;
+
+  for (; line <= object->field_height;) {
+    /* Zero-initialised so that the corner-case copies below never read
+     * indeterminate members when fewer than 2 or 3 fields are in use. */
+    deinterlace_scanline_data_t data = { 0 };
+
+    /* interp. scanline */
+    data.t0 = cur_field;
+    data.b0 = cur_field + object->field_stride;
+
+    if (last_field != NULL) {
+      data.tt1 = last_field;
+      data.m1 = last_field + object->field_stride;
+      data.bb1 = last_field + (object->field_stride * 2);
+
+      last_field += object->field_stride;
+    }
+
+    if (second_last_field != NULL) {
+      data.t2 = second_last_field;
+      data.b2 = second_last_field + object->field_stride;
+    }
+
+    /* set valid data for corner cases */
+    if (line == 2) {
+      data.tt1 = data.bb1;
+    } else if (line == object->field_height) {
+      data.bb1 = data.tt1;
+    }
+
+    object->method->interpolate_scanline (object, &data, out_data);
+    out_data += object->output_stride;
+
+    /* copy a scanline */
+    data.tt0 = cur_field;
+    data.m0 = cur_field + (object->field_stride);
+    data.bb0 = cur_field + (object->field_stride * 2);
+    cur_field += object->field_stride;
+
+    if (last_field != NULL) {
+      data.t1 = last_field;
+      data.b1 = last_field + object->field_stride;
+    }
+
+    if (second_last_field != NULL) {
+      data.tt2 = second_last_field;
+      data.m2 = second_last_field + (object->field_stride);
+      data.bb2 = second_last_field + (object->field_stride * 2);
+      second_last_field += object->field_stride;
+    }
+
+    /* set valid data for corner cases */
+    if (line == object->field_height) {
+      data.bb0 = data.tt0;
+      data.bb2 = data.tt2;
+      data.b1 = data.t1;
+    }
+
+    object->method->copy_scanline (object, &data, out_data);
+    out_data += object->output_stride;
+    line++;
+  }
+
+  if (object->field_history[cur_field_idx].flags == PICTURE_INTERLACED_TOP) {
+    /* double the last scanline of the top field */
+    blit_packed422_scanline (out_data, cur_field, object->frame_width);
+  }
+}
+
+/* Deinterlaces the field at the current history position into a newly
+ * allocated output frame, removes one field from the history and pushes the
+ * frame on the source pad.
+ *
+ * Returns the flow result of the push.  If the current method has no
+ * deinterlace_frame callback nothing is allocated, popped or pushed and
+ * GST_FLOW_OK is returned. */
+static GstFlowReturn
+gst_deinterlace2_output_frame (GstDeinterlace2 * object)
+{
+  GstBuffer *field;
+  GstClockTime timestamp;
+  GstFlowReturn ret;
+
+  if (object->method->deinterlace_frame == NULL)
+    return GST_FLOW_OK;
+
+  /* create new buffer */
+  object->out_buf = gst_buffer_new_and_alloc (object->frame_size);
+  gst_buffer_set_caps (object->out_buf, GST_PAD_CAPS (object->srcpad));
+
+  /* do magic calculus */
+  object->method->deinterlace_frame (object);
+
+  /* The output frame takes the timestamp of the field that leaves the
+   * history. */
+  field = gst_deinterlace2_pop_history (object);
+  timestamp = GST_BUFFER_TIMESTAMP (field);
+  gst_buffer_unref (field);
+
+  GST_BUFFER_TIMESTAMP (object->out_buf) = timestamp;
+  ret = gst_pad_push (object->srcpad, object->out_buf);
+
+  /* gst_pad_push() took ownership of the buffer; do not keep a stale
+   * pointer around. */
+  object->out_buf = NULL;
+
+  return ret;
+}
+
+/* Chain function of the sink pad.
+ *
+ * Pushes the incoming frame into the field history as two fields.  Once the
+ * history holds more than method->fields_required fields, up to two output
+ * frames are produced (or the unused field is dropped), depending on the
+ * "fields" setting.
+ *
+ * Both output steps are always carried out so the history stays balanced;
+ * the first non-OK result of pushing downstream is returned. */
+static GstFlowReturn
+gst_deinterlace2_chain (GstPad * pad, GstBuffer * buf)
+{
+  GstDeinterlace2 *object = NULL;
+
+  GstFlowReturn ret = GST_FLOW_OK;
+
+  object = GST_DEINTERLACE2 (GST_PAD_PARENT (pad));
+
+  /* The history now owns the buffer. */
+  gst_deinterlace2_push_history (object, buf);
+  buf = NULL;
+
+  if (object->method != NULL) {
+    GstFlowReturn second_ret = GST_FLOW_OK;
+
+    int cur_field_idx = 0;
+
+    /* Not enough fields in the history */
+    if (object->history_count < object->method->fields_required + 1) {
+      /* TODO: do bob or just forward frame */
+      GST_DEBUG ("HistoryCount=%d", object->history_count);
+      return GST_FLOW_OK;
+    }
+
+    if (object->fields == GST_DEINTERLACE2_ALL)
+      GST_DEBUG ("All fields");
+    if (object->fields == GST_DEINTERLACE2_TF)
+      GST_DEBUG ("Top fields");
+    if (object->fields == GST_DEINTERLACE2_BF)
+      GST_DEBUG ("Bottom fields");
+
+    cur_field_idx = object->history_count - object->method->fields_required;
+
+    if ((object->field_history[cur_field_idx].flags == PICTURE_INTERLACED_TOP
+            && object->fields == GST_DEINTERLACE2_TF) ||
+        object->fields == GST_DEINTERLACE2_ALL) {
+      GST_DEBUG ("deinterlacing top field");
+      ret = gst_deinterlace2_output_frame (object);
+    }
+    /* no calculation done: remove excess field */
+    else if (object->field_history[cur_field_idx].flags ==
+        PICTURE_INTERLACED_TOP && object->fields == GST_DEINTERLACE2_BF) {
+      GST_DEBUG ("Removing unused top field");
+      buf = gst_deinterlace2_pop_history (object);
+      gst_buffer_unref (buf);
+    }
+
+    /* The history may have shrunk, so look up the current field again. */
+    cur_field_idx = object->history_count - object->method->fields_required;
+
+    /* deinterlace bottom_field */
+    if ((object->field_history[cur_field_idx].flags == PICTURE_INTERLACED_BOTTOM
+            && object->fields == GST_DEINTERLACE2_BF) ||
+        object->fields == GST_DEINTERLACE2_ALL) {
+      GST_DEBUG ("deinterlacing bottom field");
+      second_ret = gst_deinterlace2_output_frame (object);
+    }
+    /* no calculation done: remove excess field */
+    else if (object->field_history[cur_field_idx].flags ==
+        PICTURE_INTERLACED_BOTTOM && object->fields == GST_DEINTERLACE2_TF) {
+      GST_DEBUG ("Removing unused bottom field");
+      buf = gst_deinterlace2_pop_history (object);
+      gst_buffer_unref (buf);
+    }
+
+    /* Report the first failure to upstream. */
+    if (ret == GST_FLOW_OK)
+      ret = second_ret;
+  } else {
+    /* No method set: forward one field buffer from the history as is. */
+    object->out_buf = gst_deinterlace2_pop_history (object);
+    ret = gst_pad_push (object->srcpad, object->out_buf);
+    object->out_buf = NULL;
+  }
+  GST_DEBUG ("----chain end ----\n\n");
+
+  return ret;
+}
+
+/* setcaps function shared by both pads.
+ *
+ * The caps are first offered to and set on the opposite pad.  Then width,
+ * height, framerate and format fourcc are read from the first caps structure
+ * and the derived sizes are stored: field_height (half the frame height),
+ * field_stride (twice the row stride), output_stride and line_length (both
+ * the row stride in bytes) and frame_size.
+ *
+ * Returns FALSE if the other pad does not take the caps or one of the fields
+ * is missing. */
+static gboolean
+gst_deinterlace2_setcaps (GstPad * pad, GstCaps * caps)
+{
+  GstDeinterlace2 *object = GST_DEINTERLACE2 (gst_pad_get_parent (pad));
+  GstPad *otherpad = (pad == object->srcpad) ? object->sinkpad : object->srcpad;
+  gboolean res;
+
+  if (!gst_pad_accept_caps (otherpad, caps)
+      || !gst_pad_set_caps (otherpad, caps)) {
+    res = FALSE;
+    GST_ERROR_OBJECT (object, "Caps not accepted: %" GST_PTR_FORMAT, caps);
+  } else {
+    GstStructure *structure = gst_caps_get_structure (caps, 0);
+    guint32 fourcc;
+
+    /* All four lookups run even if an earlier one failed. */
+    res = gst_structure_get_int (structure, "width", &object->frame_width);
+    res &= gst_structure_get_int (structure, "height", &object->frame_height);
+    res &=
+        gst_structure_get_fraction (structure, "framerate",
+        &object->frame_rate_n, &object->frame_rate_d);
+    res &= gst_structure_get_fourcc (structure, "format", &fourcc);
+    /* TODO: get interlaced, field_layout, field_order */
+
+    if (!res) {
+      res = FALSE;
+      GST_ERROR_OBJECT (object, "Invalid caps: %" GST_PTR_FORMAT, caps);
+    } else {
+      GstVideoFormat fmt = gst_video_format_from_fourcc (fourcc);
+      gint row_stride =
+          gst_video_format_get_row_stride (fmt, 0, object->frame_width);
+
+      /* TODO: introduce object->field_stride */
+      object->field_height = object->frame_height / 2;
+
+      /* TODO: only true if fields are subbuffers of interlaced frames,
+         change when the buffer-fields concept has landed */
+      object->field_stride = row_stride * 2;
+      object->output_stride = row_stride;
+
+      /* in bytes */
+      object->line_length = row_stride;
+      object->frame_size =
+          gst_video_format_get_size (fmt, object->frame_width,
+          object->frame_height);
+
+      GST_DEBUG_OBJECT (object, "Set caps: %" GST_PTR_FORMAT, caps);
+    }
+  }
+
+  gst_object_unref (object);
+  return res;
+}
+
+/* Event function of the sink pad.  Every event is currently handed to the
+ * default handler; flush-stop, EOS and newsegment are singled out only as the
+ * place where the history will have to be reset. */
+static gboolean
+gst_deinterlace2_sink_event (GstPad * pad, GstEvent * event)
+{
+  GstDeinterlace2 *self = GST_DEINTERLACE2 (gst_pad_get_parent (pad));
+  gboolean handled;
+
+  GST_LOG_OBJECT (pad, "received %s event", GST_EVENT_TYPE_NAME (event));
+
+  switch (GST_EVENT_TYPE (event)) {
+    case GST_EVENT_FLUSH_STOP:
+    case GST_EVENT_EOS:
+    case GST_EVENT_NEWSEGMENT:
+      /* TODO: reset history */
+
+      /* fall through */
+    default:
+      handled = gst_pad_event_default (pad, event);
+      break;
+  }
+
+  gst_object_unref (self);
+
+  return handled;
+}
+
+/* change_state vfunc: chains up to the parent class and returns its result.
+ * No transition needs work of its own yet; the PAUSED->READY hook below
+ * marks where the history will have to be cleaned up. */
+static GstStateChangeReturn
+gst_deinterlace2_change_state (GstElement * element, GstStateChange transition)
+{
+  GstStateChangeReturn ret =
+      GST_ELEMENT_CLASS (parent_class)->change_state (element, transition);
+
+  if (ret == GST_STATE_CHANGE_SUCCESS
+      && transition == GST_STATE_CHANGE_PAUSED_TO_READY) {
+    /* TODO: reset history, clean up, etc */
+  }
+
+  return ret;
+}
+
+/* Event function of the source pad: logs the event and hands it to the
+ * default handler. */
+static gboolean
+gst_deinterlace2_src_event (GstPad * pad, GstEvent * event)
+{
+  GstDeinterlace2 *self = GST_DEINTERLACE2 (gst_pad_get_parent (pad));
+  gboolean handled;
+
+  GST_DEBUG_OBJECT (pad, "received %s event", GST_EVENT_TYPE_NAME (event));
+
+  /* No event type is treated specially so far. */
+  handled = gst_pad_event_default (pad, event);
+
+  gst_object_unref (self);
+
+  return handled;
+}
+
+/* Query function of the source pad.
+ *
+ * Latency queries are forwarded to the peer of the sink pad; on success the
+ * element adds fields_required * frame_rate_d / frame_rate_n seconds to the
+ * minimum and maximum latency (a peer maximum of GST_CLOCK_TIME_NONE is
+ * replaced by that amount).  Without a peer, or if the peer query fails,
+ * FALSE is returned.  All other queries go to the default handler. */
+static gboolean
+gst_deinterlace2_src_query (GstPad * pad, GstQuery * query)
+{
+  GstDeinterlace2 *object = GST_DEINTERLACE2 (gst_pad_get_parent (pad));
+  gboolean res = FALSE;
+
+  GST_LOG_OBJECT (object, "%s query", GST_QUERY_TYPE_NAME (query));
+
+  if (GST_QUERY_TYPE (query) == GST_QUERY_LATENCY) {
+    GstPad *peer = gst_pad_get_peer (object->sinkpad);
+
+    if (peer != NULL) {
+      res = gst_pad_query (peer, query);
+
+      if (res) {
+        GstClockTime min, max, latency;
+        gboolean live;
+
+        gst_query_parse_latency (query, &live, &min, &max);
+
+        GST_DEBUG ("Peer latency: min %"
+            GST_TIME_FORMAT " max %" GST_TIME_FORMAT,
+            GST_TIME_ARGS (min), GST_TIME_ARGS (max));
+
+        /* TODO: calculate our own latency from framerate
+         * and object->method->fields_required */
+        /* add our own latency */
+        latency =
+            gst_util_uint64_scale (object->method->fields_required *
+            GST_SECOND, object->frame_rate_d, object->frame_rate_n);
+
+        GST_DEBUG ("Our latency: min %" GST_TIME_FORMAT
+            ", max %" GST_TIME_FORMAT,
+            GST_TIME_ARGS (latency), GST_TIME_ARGS (latency));
+
+        min += latency;
+        if (max == GST_CLOCK_TIME_NONE)
+          max = latency;
+        else
+          max += latency;
+
+        GST_DEBUG ("Calculated total latency : min %"
+            GST_TIME_FORMAT " max %" GST_TIME_FORMAT,
+            GST_TIME_ARGS (min), GST_TIME_ARGS (max));
+
+        gst_query_set_latency (query, live, min, max);
+      }
+
+      gst_object_unref (peer);
+    }
+  } else {
+    res = gst_pad_query_default (pad, query);
+  }
+
+  gst_object_unref (object);
+
+  return res;
+}
+
+/* Lists the query types of the source pad: only the latency query, followed
+ * by the GST_QUERY_NONE terminator. */
+static const GstQueryType *
+gst_deinterlace2_src_query_types (GstPad * pad)
+{
+  static const GstQueryType supported_queries[] = {
+    GST_QUERY_LATENCY,
+    GST_QUERY_NONE
+  };
+
+  return supported_queries;
+}
+
+/* Plugin entry point: sets up the debug category, initialises liboil and
+ * registers the "deinterlace2" element with rank NONE.  Returns whether the
+ * registration succeeded. */
+static gboolean
+plugin_init (GstPlugin * plugin)
+{
+  GST_DEBUG_CATEGORY_INIT (deinterlace2_debug, "deinterlace2", 0,
+      "Deinterlacer");
+
+  oil_init ();
+
+  return gst_element_register (plugin, "deinterlace2", GST_RANK_NONE,
+      GST_TYPE_DEINTERLACE2);
+}
+
+/* Plugin descriptor: name "deinterlace2", description "Deinterlacer", entry
+ * point plugin_init(), license LGPL.  VERSION, GST_PACKAGE_NAME and
+ * GST_PACKAGE_ORIGIN are defined outside this file (presumably in config.h). */
+GST_PLUGIN_DEFINE (GST_VERSION_MAJOR,
+ GST_VERSION_MINOR,
+ "deinterlace2",
+ "Deinterlacer", plugin_init, VERSION, "LGPL", GST_PACKAGE_NAME,
+ GST_PACKAGE_ORIGIN);
diff --git a/gst/deinterlace2/gstdeinterlace2.h b/gst/deinterlace2/gstdeinterlace2.h
new file mode 100644
index 00000000..0f19484f
--- /dev/null
+++ b/gst/deinterlace2/gstdeinterlace2.h
@@ -0,0 +1,267 @@
+/*
+ * GStreamer
+ * Copyright (C) 2005 Martin Eikermann <meiker@upb.de>
+ * Copyright (C) 2008 Sebastian Dröge <slomo@circular-chaos.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef __GST_DEINTERLACE_2_H__
+#define __GST_DEINTERLACE_2_H__
+
+#include <liboil/liboil.h>
+#include <liboil/liboilfunction.h>
+#include <liboil/liboilcpu.h>
+
+#include <gst/gst.h>
+#include <gst/base/gstbasetransform.h>
+
+G_BEGIN_DECLS
+
+/* Standard GObject boilerplate macros for the GstDeinterlace2 type. */
+#define GST_TYPE_DEINTERLACE2 \
+  (gst_deinterlace2_get_type())
+#define GST_DEINTERLACE2(obj) \
+  (G_TYPE_CHECK_INSTANCE_CAST((obj),GST_TYPE_DEINTERLACE2,GstDeinterlace2))
+/* A class cast has to yield the class structure, not the instance one. */
+#define GST_DEINTERLACE2_CLASS(klass) \
+  (G_TYPE_CHECK_CLASS_CAST((klass),GST_TYPE_DEINTERLACE2,GstDeinterlace2Class))
+#define GST_IS_DEINTERLACE2(obj) \
+  (G_TYPE_CHECK_INSTANCE_TYPE((obj),GST_TYPE_DEINTERLACE2))
+/* The parameter is named klass so that the expansion uses the macro argument
+ * instead of whatever identifier "klass" happens to be in scope. */
+#define GST_IS_DEINTERLACE2_CLASS(klass) \
+  (G_TYPE_CHECK_CLASS_TYPE((klass),GST_TYPE_DEINTERLACE2))
+
+typedef struct _GstDeinterlace2 GstDeinterlace2;
+typedef struct _GstDeinterlace2Class GstDeinterlace2Class;
+
+typedef struct deinterlace_setting_s deinterlace_setting_t;
+typedef struct deinterlace_method_s deinterlace_method_t;
+typedef struct deinterlace_scanline_data_s deinterlace_scanline_data_t;
+typedef struct deinterlace_frame_data_s deinterlace_frame_data_t;
+
+/**
+ * There are two scanline functions that every deinterlacer plugin
+ * must implement to do its work: one for a 'copy' and one for
+ * an 'interpolate' for the currently active field. This is so that
+ * while plugins may be delaying fields, the external API assumes that
+ * the plugin is completely realtime.
+ *
+ * Each deinterlacing routine can require data from up to four fields.
+ * The most recent field captured is field 0, and increasing numbers go
+ * backwards in time.
+ */
+struct deinterlace_scanline_data_s
+{
+ /* Scanline pointers, one row per field: the numeric suffix is the field
+ * index (0 = field 0 ... 3 = field 3) and tt/t/m/b/bb correspond to the
+ * TT/T/M/B/BB rows of the diagrams that accompany the interpolate and
+ * copy scanline callback typedefs in this header. */
+ guint8 *tt0, *t0, *m0, *b0, *bb0;
+ guint8 *tt1, *t1, *m1, *b1, *bb1;
+ guint8 *tt2, *t2, *m2, *b2, *bb2;
+ guint8 *tt3, *t3, *m3, *b3, *bb3;
+ /* NOTE(review): presumably non-zero when the scanline being produced
+ * belongs to a bottom field -- confirm against the code that fills this
+ * structure. */
+ int bottom_field;
+};
+
+/**
+ * | t-3 t-2 t-1 t
+ * | Field 3 | Field 2 | Field 1 | Field 0 |
+ * | TT3 | | TT1 | |
+ * | | T2 | | T0 |
+ * | M3 | | M1 | |
+ * | | B2 | | B0 |
+ * | BB3 | | BB1 | |
+ *
+ * While all pointers are passed in, each plugin is only guaranteed for
+ * the ones it indicates it requires (in the fields_required parameter)
+ * to be available.
+ *
+ * Pointers are always to scanlines in the standard packed 4:2:2 format.
+ */
+typedef void (*deinterlace_interp_scanline_t) (GstDeinterlace2 * object,
+ deinterlace_scanline_data_t * data, guint8 * output);
+/**
+ * For the copy scanline, the API is basically the same, except that
+ * we're given a scanline to 'copy'.
+ *
+ * | t-3 t-2 t-1 t
+ * | Field 3 | Field 2 | Field 1 | Field 0 |
+ * | | TT2 | | TT0 |
+ * | T3 | | T1 | |
+ * | | M2 | | M0 |
+ * | B3 | | B1 | |
+ * | | BB2 | | BB0 |
+ */
+typedef void (*deinterlace_copy_scanline_t) (GstDeinterlace2 * object,
+ deinterlace_scanline_data_t * data, guint8 * output);
+
+/**
+ * The frame function is for deinterlacing plugins that can only act
+ * on whole frames, rather than on a scanline at a time.
+ */
+struct deinterlace_frame_data_s
+{
+ /* NOTE(review): presumably one data pointer per field, f0 being field 0
+ * as in deinterlace_scanline_data_s -- confirm. The deinterlace_frame_t
+ * callback declared in this header takes only the GstDeinterlace2
+ * object, not this structure. */
+ guint8 *f0;
+ guint8 *f1;
+ guint8 *f2;
+ guint8 *f3;
+};
+
+typedef void (*deinterlace_frame_t) (GstDeinterlace2 * object);
+
+
+/**
+ * This structure defines the deinterlacer plugin.
+ */
+struct deinterlace_method_s
+{
+ /* Method API version. */
+ int version;
+ /* Human readable name and short identifier of the method. */
+ const char *name;
+ const char *short_name;
+ /* Number of fields the method needs to be available. */
+ int fields_required;
+ /* Required CPU features; the methods in this plugin initialize it with
+ * OIL_IMPL_FLAG_* values. */
+ int accelrequired;
+ /* NOTE(review): doscalerbob, numsettings and settings are not used by the
+ * code visible here; presumably numsettings is the element count of
+ * settings -- confirm. */
+ int doscalerbob;
+ int numsettings;
+ deinterlace_setting_t *settings;
+ /* NOTE(review): presumably non-zero when the method works one scanline at
+ * a time through interpolate_scanline/copy_scanline and zero when it
+ * works on whole frames through deinterlace_frame -- confirm. */
+ int scanlinemode;
+ deinterlace_interp_scanline_t interpolate_scanline;
+ deinterlace_copy_scanline_t copy_scanline;
+ deinterlace_frame_t deinterlace_frame;
+ /* Free-form description, up to 10 lines of text. */
+ const char *description[10];
+};
+
+/**
+ * Registers a new deinterlace method.
+ */
+void register_deinterlace_method (deinterlace_method_t * method);
+
+/**
+ * Returns how many deinterlacing methods are available.
+ */
+int get_num_deinterlace_methods (void);
+
+/**
+ * Returns the specified method in the list.
+ */
+deinterlace_method_t *get_deinterlace_method (int i);
+
+/**
+ * Builds the usable method list.
+ */
+void filter_deinterlace_methods (int accel, int fieldsavailable);
+
+#define MAX_FIELD_HISTORY 10
+
+#define PICTURE_PROGRESSIVE 0
+#define PICTURE_INTERLACED_BOTTOM 1
+#define PICTURE_INTERLACED_TOP 2
+#define PICTURE_INTERLACED_MASK (PICTURE_INTERLACED_BOTTOM | PICTURE_INTERLACED_TOP)
+
+typedef void (MEMCPY_FUNC) (void *pOutput, const void *pInput, size_t nSize);
+
+typedef struct
+{
+ /* pointer to the start of data for this field */
+ GstBuffer *buf;
+ /* see PICTURE_ flags */
+ guint flags;
+} GstPicture;
+
+typedef enum
+{
+ GST_DEINTERLACE2_TOM,
+ GST_DEINTERLACE2_GREEDY_H,
+ GST_DEINTERLACE2_GREEDY_L,
+ GST_DEINTERLACE2_VFIR
+} GstDeinterlace2Methods;
+
+typedef enum
+{
+ GST_DEINTERLACE2_ALL, /* All (missing data is interp.) */
+ GST_DEINTERLACE2_TF, /* Top Fields Only */
+ GST_DEINTERLACE2_BF /* Bottom Fields Only */
+} GstDeinterlace2Fields;
+
+typedef enum
+{
+ GST_DEINTERLACE2_LAYOUT_AUTO,
+ GST_DEINTERLACE2_LAYOUT_TFF,
+ GST_DEINTERLACE2_LAYOUT_BFF
+} GstDeinterlace2FieldLayout;
+
+struct _GstDeinterlace2
+{
+ GstElement parent;
+
+ GstPad *srcpad, *sinkpad;
+
+ /* NOTE(review): presumably the number of valid entries in field_history;
+ the greedyh filter reads field_history[history_count - 1] down to
+ field_history[history_count - 3] -- confirm.
+ */
+ guint history_count;
+
+ /* CPU capability flags; tested against OIL_IMPL_FLAG_* values when an
+ implementation is picked.
+ */
+ guint cpu_feature_flags;
+ GstDeinterlace2FieldLayout field_layout;
+
+ guint frame_size;
+ /* Frame rate as a fraction, frame_rate_n / frame_rate_d. */
+ gint frame_rate_n, frame_rate_d;
+
+ GstDeinterlace2Fields fields;
+
+ /* Selected method: its enum id and the matching method descriptor. */
+ GstDeinterlace2Methods method_id;
+ deinterlace_method_t *method;
+
+ /* The most recent pictures
+ field_history[0] is always the most recent.
+ Pointers are NULL if the picture in question isn't valid, e.g. because
+ the program just started or a picture was skipped.
+ */
+ GstPicture field_history[MAX_FIELD_HISTORY];
+
+ /* Current output buffer. */
+ GstBuffer *out_buf;
+
+ /* Output pitch (number of bytes between scanlines of the output). */
+ guint output_stride;
+
+ /* Number of bytes of actual data in each scanline. May be less than
+ output_stride since the output's scanlines might have alignment
+ requirements. Generally equal to frame_width * 2.
+ */
+ guint line_length;
+
+ /* Number of pixels in each scanline. */
+ gint frame_width;
+
+ /* Number of scanlines per frame. */
+ gint frame_height;
+
+ /* Number of scanlines per field. frame_height / 2, mostly for
+ cleanliness so we don't have to keep dividing frame_height by 2.
+ */
+ gint field_height;
+
+ /* Function pointer to optimized memcpy function */
+ MEMCPY_FUNC *pMemcpy;
+
+ /* distance between lines in image
+ need not match the pixel width
+ */
+ guint field_stride;
+
+ gboolean bottom_field;
+};
+
+struct _GstDeinterlace2Class
+{
+ GstElementClass parent_class;
+};
+
+GType gst_deinterlace2_get_type (void);
+
+G_END_DECLS
+#endif /* __GST_DEINTERLACE_2_H__ */
diff --git a/gst/deinterlace2/tvtime/greedy.c b/gst/deinterlace2/tvtime/greedy.c
new file mode 100644
index 00000000..578eb711
--- /dev/null
+++ b/gst/deinterlace2/tvtime/greedy.c
@@ -0,0 +1,207 @@
+/*
+ *
+ * GStreamer
+ * Copyright (c) 2000 Tom Barry All rights reserved.
+ * mmx.h port copyright (c) 2002 Billy Biggs <vektor@dumbterm.net>.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+ * Relicensed for GStreamer from GPL to LGPL with permit from Tom Barry
+ * and Billy Biggs.
+ * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578
+ */
+
+#include <stdio.h>
+#if defined (__SVR4) && defined (__sun)
+# include <sys/int_types.h>
+#else
+# include <stdint.h>
+#endif
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "mmx.h"
+#include "sse.h"
+#include "gstdeinterlace2.h"
+#include "speedtools.h"
+#include "speedy.h"
+
+// This is a simple lightweight DeInterlace method that uses little CPU time
+// but gives very good results for low or intermediate motion.
+// It defers frames by one field, but that does not seem to produce noticeable
+// lip sync problems.
+//
+// The method used is to take either the older or newer weave pixel depending
+// upon which gives the smaller comb factor, and then clip to avoid large damage
+// when wrong.
+//
+// I'd intended this to be part of a larger more elaborate method added to
+// Blended Clip but this gives too good results for the CPU to ignore here.
+
+/* Scanline callback of the greedy (low motion) method that performs a plain
+ * copy: blits the m1 scanline from DATA to OUTPUT, passing the object's
+ * frame width as the width. */
+static void
+copy_scanline (GstDeinterlace2 * object,
+    deinterlace_scanline_data_t * data, uint8_t * output)
+{
+  uint8_t *src_line = data->m1;
+  int pixels = object->frame_width;
+
+  blit_packed422_scanline (output, src_line, pixels);
+}
+
+static int GreedyMaxComb = 15;
+
+/* Build one scanline with the "Greedy: Low Motion" algorithm (MMXEXT).
+ *
+ * For each byte it picks either the older (m2) or the newer (m0) weave
+ * value, whichever is closer to the average of the line above (t1) and the
+ * line below (b1); m2 wins ties. The pick is then clipped to the range
+ * [min(t1,b1) - GreedyMaxComb, max(t1,b1) + GreedyMaxComb] (saturating byte
+ * arithmetic) and written to OUTPUT with a non-temporal store.
+ *
+ * When HAVE_CPU_I386 is not defined the whole body is compiled out and the
+ * function does nothing. */
+static void
+deinterlace_greedy_packed422_scanline_mmxext (GstDeinterlace2 * object,
+ deinterlace_scanline_data_t * data, uint8_t * output)
+{
+#ifdef HAVE_CPU_I386
+ mmx_t MaxComb;
+
+ uint8_t *m0 = data->m0;
+
+ uint8_t *t1 = data->t1;
+
+ uint8_t *b1 = data->b1;
+
+ uint8_t *m2 = data->m2;
+
+ int width = object->frame_width;
+
+ // How badly do we let it weave? 0-255
+ // (GreedyMaxComb replicated into all eight bytes of the qword)
+ MaxComb.ub[0] = GreedyMaxComb;
+ MaxComb.ub[1] = GreedyMaxComb;
+ MaxComb.ub[2] = GreedyMaxComb;
+ MaxComb.ub[3] = GreedyMaxComb;
+ MaxComb.ub[4] = GreedyMaxComb;
+ MaxComb.ub[5] = GreedyMaxComb;
+ MaxComb.ub[6] = GreedyMaxComb;
+ MaxComb.ub[7] = GreedyMaxComb;
+
+ // L2 == m0
+ // L1 == t1
+ // L3 == b1
+ // LP2 == m2
+
+ // Each iteration consumes 8 bytes from every line (the pointers advance by
+ // 8 below). NOTE(review): assuming 2 bytes per pixel that is 4 pixels per
+ // iteration, so a remainder of width % 4 pixels is left unprocessed.
+ width /= 4;
+ while (width--) {
+ movq_m2r (*t1, mm1); // L1
+ movq_m2r (*m0, mm2); // L2
+ movq_m2r (*b1, mm3); // L3
+ movq_m2r (*m2, mm0); // LP2
+
+ // average L1 and L3 leave result in mm4
+ movq_r2r (mm1, mm4); // L1
+ pavgb_r2r (mm3, mm4); // (L1 + L3)/2
+
+
+ // get abs value of possible L2 comb
+ movq_r2r (mm2, mm7); // L2
+ psubusb_r2r (mm4, mm7); // L2 - avg
+ movq_r2r (mm4, mm5); // avg
+ psubusb_r2r (mm2, mm5); // avg - L2
+ por_r2r (mm7, mm5); // abs(avg-L2)
+ movq_r2r (mm4, mm6); // copy of avg for later
+
+
+ // get abs value of possible LP2 comb
+ movq_r2r (mm0, mm7); // LP2
+ psubusb_r2r (mm4, mm7); // LP2 - avg
+ psubusb_r2r (mm0, mm4); // avg - LP2
+ por_r2r (mm7, mm4); // abs(avg-LP2)
+
+ // use L2 or LP2 depending upon which makes smaller comb
+ psubusb_r2r (mm5, mm4); // see if it goes to zero
+ psubusb_r2r (mm5, mm5); // 0
+ pcmpeqb_r2r (mm5, mm4); // if (mm4=0) then FF else 0
+ pcmpeqb_r2r (mm4, mm5); // opposite of mm4
+
+ // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = ff
+ pand_r2r (mm2, mm5); // use L2 if mm5 == ff, else 0
+ pand_r2r (mm0, mm4); // use LP2 if mm4 = ff, else 0
+ por_r2r (mm5, mm4); // may the best win
+
+ // Now lets clip our chosen value to be not outside of the range
+ // of the high/low range L1-L3 by more than MaxComb.
+ // This allows some comb but limits the damages and also allows more
+ // detail than a boring oversmoothed clip.
+
+ movq_r2r (mm1, mm2); // copy L1
+ psubusb_r2r (mm3, mm2); // - L3, with saturation
+ paddusb_r2r (mm3, mm2); // now = Max(L1,L3)
+
+ pcmpeqb_r2r (mm7, mm7); // all ffffffff
+ psubusb_r2r (mm1, mm7); // - L1
+ paddusb_r2r (mm7, mm3); // add, may sat at fff..
+ psubusb_r2r (mm7, mm3); // now = Min(L1,L3)
+
+ // allow the value to be above the high or below the low by amt of MaxComb
+ paddusb_m2r (MaxComb, mm2); // increase max by MaxComb
+ psubusb_m2r (MaxComb, mm3); // lower min by MaxComb
+
+ psubusb_r2r (mm3, mm4); // best - (Min - MaxComb)
+ paddusb_r2r (mm3, mm4); // now = Max(best, Min(L1,L3) - MaxComb)
+
+ pcmpeqb_r2r (mm7, mm7); // all ffffffff
+ psubusb_r2r (mm4, mm7); // ff - Max(best, Min(L1,L3) - MaxComb)
+ paddusb_r2r (mm7, mm2); // add may sat at FFF..
+ psubusb_r2r (mm7, mm2); // now = Min(Max(best, Min(L1,L3) - MaxComb), Max(L1,L3) + MaxComb) = clipped best
+
+ movntq_r2m (mm2, *output); // move in our clipped best
+
+ // Advance to the next set of pixels.
+ output += 8;
+ m0 += 8;
+ t1 += 8;
+ b1 += 8;
+ m2 += 8;
+ }
+ // make the non-temporal stores visible, then leave MMX state
+ sfence ();
+ emms ();
+#endif
+}
+
+
+/* Method descriptor for the "Greedy: Low Motion" deinterlacer. The
+ * initializer is positional; the trailing comments name the
+ * deinterlace_method_s member each value lands in.
+ *
+ * NOTE(review): copy_scanline lands in the interpolate_scanline slot and
+ * deinterlace_greedy_packed422_scanline_mmxext in the copy_scanline slot.
+ * Both callback types have the same signature, so this compiles either way
+ * -- confirm against the caller that the order is intentional.
+ * NOTE(review): the description mentions a two-field buffer while
+ * fields_required is 3 -- confirm which one is right. */
+static deinterlace_method_t greedyl_method = {
+ 0, //DEINTERLACE_PLUGIN_API_VERSION,
+ "Motion Adaptive: Simple Detection", /* name */
+ "AdaptiveSimple", /* short_name */
+ 3, /* fields_required */
+ OIL_IMPL_FLAG_MMXEXT, /* accelrequired */
+ 0, /* doscalerbob */
+ 0, /* numsettings */
+ 0, /* settings */
+ 1, /* scanlinemode */
+ copy_scanline, /* interpolate_scanline */
+ deinterlace_greedy_packed422_scanline_mmxext, /* copy_scanline */
+ 0, /* deinterlace_frame */
+ {"Uses heuristics to detect motion in the input",
+ "frames and reconstruct image detail where",
+ "possible. Use this for high quality output",
+ "even on monitors set to an arbitrary refresh",
+ "rate.",
+ "",
+ "Simple detection uses linear interpolation",
+ "where motion is detected, using a two-field",
+ "buffer. This is the Greedy: Low Motion",
+ "deinterlacer from DScaler."}
+};
+
+/* Returns the descriptor of the "Greedy: Low Motion" method. The descriptor
+ * is a file-scope static object; the caller must not free it. */
+deinterlace_method_t *
+dscaler_greedyl_get_method (void)
+{
+ return &greedyl_method;
+}
diff --git a/gst/deinterlace2/tvtime/greedyh.asm b/gst/deinterlace2/tvtime/greedyh.asm
new file mode 100644
index 00000000..92ad1fe1
--- /dev/null
+++ b/gst/deinterlace2/tvtime/greedyh.asm
@@ -0,0 +1,307 @@
+/*
+ *
+ * GStreamer
+ * Copyright (c) 2001 Tom Barry. All rights reserved.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+
+/*
+ * Relicensed for GStreamer from GPL to LGPL with permit from Tom Barry.
+ * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578
+ */
+
+
+#include "x86-64_macros.inc"
+
+/*
+ * Whole-frame "Greedy: High Motion" filter. This file is a template: the
+ * including file defines FUNCT_NAME (name of the generated function),
+ * SSE_TYPE (instruction flavour used by the V_* macros) and one of the IS_*
+ * symbols before each inclusion.
+ *
+ * The function reads the three fields field_history[history_count - 1],
+ * [history_count - 2] and [history_count - 3] and writes the result into the
+ * data of object->out_buf. For every pair of output lines the asm loop
+ * builds one line from L1/L3 (lines above/below), L2 (weave line) and L2P
+ * (previous weave line), and the following line is copied verbatim from L3.
+ * The tunables GreedyMaxComb, GreedyMotionThreshold and GreedyMotionSense
+ * are sampled once per call.
+ *
+ * NOTE(review): LastAvg, LoopCtr and oldbx are written inside the asm
+ * statement although they are listed as input operands, and XBX is modified
+ * and restored by hand without being declared; only the "memory" clobber
+ * keeps the compiler from caching them. Consider turning them into proper
+ * output operands.
+ */
+void FUNCT_NAME( GstDeinterlace2 *object)
+{
+ int64_t i;
+ int InfoIsOdd = 0;
+
+ // in tight loop some vars are accessed faster in local storage
+ int64_t YMask = 0x00ff00ff00ff00ffull; // to keep only luma
+ int64_t UVMask = 0xff00ff00ff00ff00ull; // to keep only chroma
+ int64_t ShiftMask = 0xfefffefffefffeffull; // to avoid shifting chroma to luma
+ int64_t QW256 = 0x0100010001000100ull; // 4 256's
+
+ // Set up our two parms that are actually evaluated for each pixel
+ i=GreedyMaxComb;
+ int64_t MaxComb = i << 56 | i << 48 | i << 40 | i << 32 | i << 24 | i << 16 | i << 8 | i;
+
+ i = GreedyMotionThreshold; // scale to range of 0-257
+ int64_t MotionThreshold = i << 48 | i << 32 | i << 16 | i | UVMask;
+
+ i = GreedyMotionSense; // scale to range of 0-257
+ int64_t MotionSense = i << 48 | i << 32 | i << 16 | i;
+
+ int Line;
+ long LoopCtr;
+ unsigned int Pitch = object->field_stride;
+
+ unsigned char* L1; // ptr to Line1, of 3
+ unsigned char* L2; // ptr to Line2, the weave line
+ unsigned char* L3; // ptr to Line3
+
+ unsigned char* L2P; // ptr to prev Line2
+ unsigned char* Dest = GST_BUFFER_DATA(object->out_buf);
+
+ int64_t QW256B;
+ int64_t LastAvg=0; //interp value from left qword
+
+ // Each 16-bit word of QW256B must be 0xffff - 256 so that the saturating
+ // paddusw/psubusw pair below computes min(x, 256) per word. Starting from a
+ // 32-bit value (0xffffffff - 256) made the shifted copies overlap and turned
+ // the three upper words into 0xffff, which forces those words to 0.
+ i = 0xffff - 256;
+ QW256B = i << 48 | i << 32 | i << 16 | i; // save a couple instr on PMINSW instruct.
+
+
+ // copy first even line no matter what, and the first odd line if we're
+ // processing an EVEN field. (note diff from other deint rtns.)
+
+ if (object->field_history[object->history_count-1].flags == PICTURE_INTERLACED_BOTTOM) {
+ InfoIsOdd = 1;
+
+ L1 = GST_BUFFER_DATA(object->field_history[object->history_count-2].buf);
+ L2 = GST_BUFFER_DATA(object->field_history[object->history_count-1].buf);
+ L3 = L1 + Pitch;
+ L2P = GST_BUFFER_DATA(object->field_history[object->history_count-3].buf);
+
+ // copy first even line
+ object->pMemcpy(Dest, L1, object->line_length);
+ Dest += object->output_stride;
+ }
+ else {
+ InfoIsOdd = 0;
+ L1 = GST_BUFFER_DATA(object->field_history[object->history_count-2].buf);
+ L2 = GST_BUFFER_DATA(object->field_history[object->history_count-1].buf) + Pitch;
+ L3 = L1 + Pitch;
+ L2P = GST_BUFFER_DATA(object->field_history[object->history_count-3].buf) + Pitch;
+
+ // copy first even line
+ object->pMemcpy(Dest, GST_BUFFER_DATA(object->field_history[0].buf), object->line_length);
+ Dest += object->output_stride;
+ // then first odd line
+ object->pMemcpy(Dest, L1, object->line_length);
+ Dest += object->output_stride;
+ }
+
+
+ // scratch slot used by the asm to save and restore the bx register
+ long oldbx;
+
+ for (Line = 0; Line < (object->field_height - 1); ++Line) {
+ LoopCtr = object->line_length / 8 - 1; // there are LineLength / 8 qwords per line but do 1 less, adj at end of loop
+
+ // For ease of reading, the comments below assume that we're operating on an odd
+ // field (i.e., that InfoIsOdd is true). Assume the obvious for even lines..
+ __asm__ __volatile__
+ (
+ // save ebx (-fPIC)
+ MOVX" %%"XBX", %[oldbx]\n\t"
+
+ MOVX" %[L1], %%"XAX"\n\t"
+ LEAX" 8(%%"XAX"), %%"XBX"\n\t" // next qword needed by DJR
+ MOVX" %[L3], %%"XCX"\n\t"
+ SUBX" %%"XAX", %%"XCX"\n\t" // carry L3 addr as an offset
+ MOVX" %[L2P], %%"XDX"\n\t"
+ MOVX" %[L2], %%"XSI"\n\t"
+ MOVX" %[Dest], %%"XDI"\n\t" // DL1 if Odd or DL2 if Even
+
+ ".align 8\n\t"
+ "1:\n\t"
+
+ "movq (%%"XSI"), %%mm0\n\t" // L2 - the newest weave pixel value
+ "movq (%%"XAX"), %%mm1\n\t" // L1 - the top pixel
+ "movq (%%"XDX"), %%mm2\n\t" // L2P - the prev weave pixel
+ "movq (%%"XAX", %%"XCX"), %%mm3\n\t" // L3, next odd row
+ "movq %%mm1, %%mm6\n\t" // L1 - get simple single pixel interp
+ // pavgb mm6, mm3 // use macro below
+ V_PAVGB ("%%mm6", "%%mm3", "%%mm4", "%[ShiftMask]")
+
+ // DJR - Diagonal Jaggie Reduction
+ // In the event that we are going to use an average (Bob) pixel we do not want a jagged
+ // stair step effect. To combat this we avg in the 2 horizontally adjacent pixels into the
+ // interpolated Bob mix. This will do horizontal smoothing for only the Bob'd pixels.
+
+ "movq %[LastAvg], %%mm4\n\t" // the bob value from prev qword in row
+ "movq %%mm6, %[LastAvg]\n\t" // save for next pass
+ "psrlq $48, %%mm4\n\t" // right justify 1 pixel
+ "movq %%mm6, %%mm7\n\t" // copy of simple bob pixel
+ "psllq $16, %%mm7\n\t" // left justify 3 pixels
+ "por %%mm7, %%mm4\n\t" // and combine
+
+ "movq (%%"XBX"), %%mm5\n\t" // next horiz qword from L1
+ // pavgb mm5, qword ptr[ebx+ecx] // next horiz qword from L3, use macro below
+ V_PAVGB ("%%mm5", "(%%"XBX",%%"XCX")", "%%mm7", "%[ShiftMask]")
+ "psllq $48, %%mm5\n\t" // left just 1 pixel
+ "movq %%mm6, %%mm7\n\t" // another copy of simple bob pixel
+ "psrlq $16, %%mm7\n\t" // right just 3 pixels
+ "por %%mm7, %%mm5\n\t" // combine
+ // pavgb mm4, mm5 // avg of forward and prev by 1 pixel, use macro
+ V_PAVGB ("%%mm4", "%%mm5", "%%mm5", "%[ShiftMask]") // mm5 gets modified if MMX
+ // pavgb mm6, mm4 // avg of center and surround interp vals, use macro
+ V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]")
+
+ // Don't do any more averaging than needed for mmx. It hurts performance and causes rounding errors.
+#ifndef IS_MMX
+ // pavgb mm4, mm6 // 1/4 center, 3/4 adjacent
+ V_PAVGB ("%%mm4", "%%mm6", "%%mm7", "%[ShiftMask]")
+ // pavgb mm6, mm4 // 3/8 center, 5/8 adjacent
+ V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]")
+#endif
+
+ // get abs value of possible L2P comb (mm2 was loaded from L2P above)
+ "movq %%mm6, %%mm4\n\t" // work copy of interp val
+ "movq %%mm2, %%mm7\n\t" // L2P
+ "psubusb %%mm4, %%mm7\n\t" // L2P - avg
+ "movq %%mm4, %%mm5\n\t" // avg
+ "psubusb %%mm2, %%mm5\n\t" // avg - L2P
+ "por %%mm7, %%mm5\n\t" // abs(avg-L2P)
+
+ // get abs value of possible L2 comb (mm0 was loaded from L2 above)
+ "movq %%mm0, %%mm7\n\t" // L2
+ "psubusb %%mm4, %%mm7\n\t" // L2 - avg
+ "psubusb %%mm0, %%mm4\n\t" // avg - L2
+ "por %%mm7, %%mm4\n\t" // abs(avg-L2)
+
+ // use L2 or L2P depending upon which makes smaller comb
+ "psubusb %%mm5, %%mm4\n\t" // see if it goes to zero
+ "psubusb %%mm5, %%mm5\n\t" // 0
+ "pcmpeqb %%mm5, %%mm4\n\t" // if (mm4=0) then FF else 0
+ "pcmpeqb %%mm4, %%mm5\n\t" // opposite of mm4
+
+ // if Comb(L2) <= Comb(L2P) then mm4=ff, mm5=0 else mm4=0, mm5 = ff
+ "pand %%mm2, %%mm5\n\t" // use L2P if mm5 == ff, else 0
+ "pand %%mm0, %%mm4\n\t" // use L2 if mm4 = ff, else 0
+ "por %%mm5, %%mm4\n\t" // may the best win
+
+ // Inventory: at this point we have the following values:
+ // mm0 = L2
+ // mm1 = L1
+ // mm2 = L2P
+ // mm3 = L3
+ // mm4 = the best of L2,L2P weave pixel, base upon comb
+ // mm6 = the avg interpolated value, if we need to use it
+
+ // Let's measure movement, as how much the weave pixel has changed
+ "movq %%mm2, %%mm7\n\t"
+ "psubusb %%mm0, %%mm2\n\t"
+ "psubusb %%mm7, %%mm0\n\t"
+ "por %%mm2, %%mm0\n\t" // abs value of change, used later
+
+ // Now lets clip our chosen value to be not outside of the range
+ // of the high/low range L1-L3 by more than MaxComb.
+ // This allows some comb but limits the damages and also allows more
+ // detail than a boring oversmoothed clip.
+ "movq %%mm1, %%mm2\n\t" // copy L1
+ // pmaxub mm2, mm3 // use macro
+ V_PMAXUB ("%%mm2", "%%mm3") // now = Max(L1,L3)
+ "movq %%mm1, %%mm5\n\t" // copy L1
+ // pminub mm5, mm3 // now = Min(L1,L3), use macro
+ V_PMINUB ("%%mm5", "%%mm3", "%%mm7")
+ // allow the value to be above the high or below the low by amt of MaxComb
+ "psubusb %[MaxComb], %%mm5\n\t" // lower min by diff
+ "paddusb %[MaxComb], %%mm2\n\t" // increase max by diff
+ // pmaxub mm4, mm5 // now = Max(best,Min(L1,L3) use macro
+ V_PMAXUB ("%%mm4", "%%mm5")
+ // pminub mm4, mm2 // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped
+ V_PMINUB ("%%mm4", "%%mm2", "%%mm7")
+
+ // Blend weave pixel with bob pixel, depending on motion val in mm0
+ "psubusb %[MotionThreshold], %%mm0\n\t"// test Threshold, clear chroma change >>>??
+ "pmullw %[MotionSense], %%mm0\n\t" // mul by user factor, keep low 16 bits
+ "movq %[QW256], %%mm7\n\t"
+ // pminsw may only be emitted in the SSE instantiation of this template:
+ // key on the per-variant IS_SSE symbol, not on a build-wide HAVE_SSE.
+#ifdef IS_SSE
+ "pminsw %%mm7, %%mm0\n\t" // max = 256
+#else
+ "paddusw %[QW256B], %%mm0\n\t" // add, may sat at fff..
+ "psubusw %[QW256B], %%mm0\n\t" // now = Min(L1,256)
+#endif
+ "psubusw %%mm0, %%mm7\n\t" // so the 2 sum to 256, weighted avg
+ "movq %%mm4, %%mm2\n\t" // save weave chroma info before trashing
+ "pand %[YMask], %%mm4\n\t" // keep only luma from calc'd value
+ "pmullw %%mm7, %%mm4\n\t" // use more weave for less motion
+ "pand %[YMask], %%mm6\n\t" // keep only luma from calc'd value
+ "pmullw %%mm0, %%mm6\n\t" // use more bob for large motion
+ "paddusw %%mm6, %%mm4\n\t" // combine
+ "psrlw $8, %%mm4\n\t" // div by 256 to get weighted avg
+
+ // chroma comes from weave pixel
+ "pand %[UVMask], %%mm2\n\t" // keep chroma
+ "por %%mm4, %%mm2\n\t" // and combine
+
+ V_MOVNTQ ("(%%"XDI")", "%%mm2") // move in our clipped best, use macro
+
+ // bump ptrs and loop
+ LEAX" 8(%%"XAX"), %%"XAX"\n\t"
+ LEAX" 8(%%"XBX"), %%"XBX"\n\t"
+ LEAX" 8(%%"XDX"), %%"XDX"\n\t"
+ LEAX" 8(%%"XDI"), %%"XDI"\n\t"
+ LEAX" 8(%%"XSI"), %%"XSI"\n\t"
+ DECX" %[LoopCtr]\n\t"
+ "jg 1b\n\t" // loop if not to last line
+ // note P-III default assumes backward branches taken
+ "jl 1f\n\t" // done
+ // counter hit zero: run one final pass with the lookahead pointer set to
+ // the current qword instead of the one after it
+ MOVX" %%"XAX", %%"XBX"\n\t" // sharpness lookahead 1 byte only, be wrong on 1
+ "jmp 1b\n\t"
+
+ "1:\n\t"
+ MOVX" %[oldbx], %%"XBX"\n\t"
+
+ : /* no outputs */
+
+ : [LastAvg] "m"(LastAvg),
+ [L1] "m"(L1),
+ [L3] "m"(L3),
+ [L2P] "m"(L2P),
+ [L2] "m"(L2),
+ [Dest] "m"(Dest),
+ [ShiftMask] "m"(ShiftMask),
+ [MaxComb] "m"(MaxComb),
+ [MotionThreshold] "m"(MotionThreshold),
+ [MotionSense] "m"(MotionSense),
+ [QW256B] "m"(QW256B),
+ [YMask] "m"(YMask),
+ [UVMask] "m"(UVMask),
+ [LoopCtr] "m"(LoopCtr),
+ [QW256] "m"(QW256),
+ [oldbx] "m"(oldbx)
+
+ : XAX, XCX, XDX, XSI, XDI,
+#ifdef HAVE_CPU_I386
+ "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)",
+#endif
+ "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
+ "memory", "cc"
+ );
+
+ // the asm wrote one line at Dest; the line after it is a straight copy of L3
+ Dest += object->output_stride;
+ object->pMemcpy(Dest, L3, object->line_length);
+ Dest += object->output_stride;
+
+ L1 += Pitch;
+ L2 += Pitch;
+ L3 += Pitch;
+ L2P += Pitch;
+ }
+
+ if (InfoIsOdd) {
+ object->pMemcpy(Dest, L2, object->line_length);
+ }
+
+ // clear out the MMX registers ready for doing floating point again
+#ifdef HAVE_CPU_I386
+ __asm__ __volatile__ ("emms\n\t");
+#endif
+}
diff --git a/gst/deinterlace2/tvtime/greedyh.c b/gst/deinterlace2/tvtime/greedyh.c
new file mode 100644
index 00000000..623c2d8b
--- /dev/null
+++ b/gst/deinterlace2/tvtime/greedyh.c
@@ -0,0 +1,148 @@
+/*
+ *
+ * GStreamer
+ * Copyright (C) 2004 Billy Biggs <vektor@dumbterm.net>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+ * Relicensed for GStreamer from GPL to LGPL with permit from Billy Biggs.
+ * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578
+ */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "greedyh.h"
+#include "greedyhmacros.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "gst/gst.h"
+#include "plugins.h"
+#include "gstdeinterlace2.h"
+#include "speedy.h"
+
+
+#define MAXCOMB_DEFAULT 5
+#define MOTIONTHRESHOLD_DEFAULT 25
+#define MOTIONSENSE_DEFAULT 30
+
+unsigned int GreedyMaxComb;
+
+unsigned int GreedyMotionThreshold;
+
+unsigned int GreedyMotionSense;
+
+
+/* greedyh.asm is a function template: each inclusion below defines one
+ * variant of the filter, named by FUNCT_NAME. SSE_TYPE selects the
+ * instruction flavour used by the V_* macros (see greedyhmacros.h) and the
+ * IS_* symbol is available for #ifdef tests inside the template. All three
+ * symbols are undefined again after each inclusion. */
+#define IS_SSE
+#define SSE_TYPE SSE
+#define FUNCT_NAME greedyDScaler_SSE
+#include "greedyh.asm"
+#undef SSE_TYPE
+#undef IS_SSE
+#undef FUNCT_NAME
+
+#define IS_3DNOW
+#define FUNCT_NAME greedyDScaler_3DNOW
+#define SSE_TYPE 3DNOW
+#include "greedyh.asm"
+#undef SSE_TYPE
+#undef IS_3DNOW
+#undef FUNCT_NAME
+
+#define IS_MMX
+#define SSE_TYPE MMX
+#define FUNCT_NAME greedyDScaler_MMX
+#include "greedyh.asm"
+#undef SSE_TYPE
+#undef IS_MMX
+#undef FUNCT_NAME
+
+/* Frame callback of the greedyh method: runs the SSE, 3DNow! or MMX build of
+ * the filter on OBJECT, in that order of preference, depending on which
+ * OIL_IMPL_FLAG_* bits are set in the object's cpu_feature_flags. */
+void
+deinterlace_frame_di_greedyh (GstDeinterlace2 * object)
+{
+  /* MMX is the fallback when neither of the flags below is set. */
+  void (*filter) (GstDeinterlace2 *) = greedyh_filter_mmx;
+
+  if (object->cpu_feature_flags & OIL_IMPL_FLAG_SSE) {
+    filter = greedyh_filter_sse;
+  } else if (object->cpu_feature_flags & OIL_IMPL_FLAG_3DNOW) {
+    filter = greedyh_filter_3dnow;
+  }
+
+  filter (object);
+}
+
+/* Method descriptor for the "Greedy: High Motion" deinterlacer. The
+ * initializer is positional; the trailing comments name the
+ * deinterlace_method_s member each value lands in. This method provides a
+ * whole-frame callback only; both scanline callbacks are left at 0. */
+static deinterlace_method_t greedyh_method = {
+ 0, //DEINTERLACE_PLUGIN_API_VERSION,
+ "Motion Adaptive: Advanced Detection", /* name */
+ "AdaptiveAdvanced", /* short_name */
+ 4, /* fields_required */
+ OIL_IMPL_FLAG_MMX, /* accelrequired */
+ 0, /* doscalerbob */
+ 0, /* numsettings */
+ 0, /* settings */
+ 0, /* scanlinemode */
+ 0, /* interpolate_scanline */
+ 0, /* copy_scanline */
+ deinterlace_frame_di_greedyh, /* deinterlace_frame */
+ {"Uses heuristics to detect motion in the input",
+ "frames and reconstruct image detail where",
+ "possible. Use this for high quality output",
+ "even on monitors set to an arbitrary refresh",
+ "rate.",
+ "",
+ "Advanced detection uses linear interpolation",
+ "where motion is detected, using a four-field",
+ "buffer. This is the Greedy: High Motion",
+ "deinterlacer from DScaler."}
+};
+
+/* Returns the descriptor of the "Greedy: High Motion" method. Every call
+ * also runs greedyh_init(), which puts the three Greedy* tunables back to
+ * their default values. The descriptor is a file-scope static object; the
+ * caller must not free it. */
+deinterlace_method_t *
+dscaler_greedyh_get_method (void)
+{
+ greedyh_init ();
+ return &greedyh_method;
+}
+
+/* Sets the global tunables read by the greedyh filter (GreedyMaxComb,
+ * GreedyMotionThreshold, GreedyMotionSense) to their *_DEFAULT values. */
+void
+greedyh_init (void)
+{
+ GreedyMaxComb = MAXCOMB_DEFAULT;
+ GreedyMotionThreshold = MOTIONTHRESHOLD_DEFAULT;
+ GreedyMotionSense = MOTIONSENSE_DEFAULT;
+}
+
+/* Public wrapper around the MMX instantiation of the greedyh template. */
+void
+greedyh_filter_mmx (GstDeinterlace2 * object)
+{
+ greedyDScaler_MMX (object);
+}
+
+/* Public wrapper around the 3DNow! instantiation of the greedyh template. */
+void
+greedyh_filter_3dnow (GstDeinterlace2 * object)
+{
+ greedyDScaler_3DNOW (object);
+}
+
+/* Public wrapper around the SSE instantiation of the greedyh template. */
+void
+greedyh_filter_sse (GstDeinterlace2 * object)
+{
+ greedyDScaler_SSE (object);
+}
diff --git a/gst/deinterlace2/tvtime/greedyh.h b/gst/deinterlace2/tvtime/greedyh.h
new file mode 100644
index 00000000..1156836a
--- /dev/null
+++ b/gst/deinterlace2/tvtime/greedyh.h
@@ -0,0 +1,45 @@
+/*
+ *
+ * GStreamer
+ * Copyright (C) 2004 Billy Biggs <vektor@dumbterm.net>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+ * Relicensed for GStreamer from GPL to LGPL with permit from Billy Biggs.
+ * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578
+ */
+
+#ifndef GREEDYH_H_INCLUDED
+#define GREEDYH_H_INCLUDED
+
+#include "gstdeinterlace2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void greedyh_init( void );
+void greedyh_filter_mmx( GstDeinterlace2 *object );
+void greedyh_filter_3dnow( GstDeinterlace2 *object );
+void greedyh_filter_sse( GstDeinterlace2 *object );
+
+#ifdef __cplusplus
+};
+#endif
+
+#endif /* GREEDYH_H_INCLUDED */
diff --git a/gst/deinterlace2/tvtime/greedyhmacros.h b/gst/deinterlace2/tvtime/greedyhmacros.h
new file mode 100644
index 00000000..5f65959c
--- /dev/null
+++ b/gst/deinterlace2/tvtime/greedyhmacros.h
@@ -0,0 +1,74 @@
+/////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2001 Tom Barry. All rights reserved.
+/////////////////////////////////////////////////////////////////////////////
+//
+// This file is subject to the terms of the GNU General Public License as
+// published by the Free Software Foundation. A copy of this license is
+// included with this software distribution in the file COPYING. If you
+// do not have a copy, you may obtain a copy by writing to the Free
+// Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+//
+// This software is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details
+//
+/////////////////////////////////////////////////////////////////////////////
+
+// Define a few macros for CPU dependent instructions.
+// I suspect I don't really understand how the C macro preprocessor works but
+// this seems to get the job done. // TRB 7/01
+
+// BEFORE USING THESE YOU MUST SET:
+
+// #define SSE_TYPE SSE (or MMX or 3DNOW)
+
+// V_PAVGB(mmr1, mmr2, mmrw, smask): byte-wise average of mmr1 and mmr2, left in mmr1.
+// mmrw is a work register (it may be mmr2 itself if mmr2 can be trashed). Only the MMX
+// flavour uses mmrw/smask: it masks both inputs with smask and halves them before adding.
+#define V_PAVGB_MMX(dst, src, tmp, mask) \
+    "movq " src ", " tmp "\n\t" \
+    "pand " mask ", " tmp "\n\t" \
+    "psrlw $1, " tmp "\n\t" \
+    "pand " mask ", " dst "\n\t" \
+    "psrlw $1, " dst "\n\t" \
+    "paddusb " tmp ", " dst "\n\t"
+#define V_PAVGB_SSE(dst, src, tmp, mask) "pavgb " src ", " dst "\n\t"
+#define V_PAVGB_3DNOW(dst, src, tmp, mask) "pavgusb " src ", " dst "\n\t"
+#define V_PAVGB(dst, src, tmp, mask) V_PAVGB2(dst, src, tmp, mask, SSE_TYPE)
+#define V_PAVGB2(dst, src, tmp, mask, cpu) V_PAVGB3(dst, src, tmp, mask, cpu)
+#define V_PAVGB3(dst, src, tmp, mask, cpu) V_PAVGB_##cpu(dst, src, tmp, mask)
+
+// V_PMAXUB(mmr1, mmr2): unsigned byte-wise maximum, result left in the first operand.
+#define V_PMAXUB_MMX(dst, src) \
+    "psubusb " src ", " dst "\n\t" \
+    "paddusb " src ", " dst "\n\t"
+#define V_PMAXUB_SSE(dst, src) "pmaxub " src ", " dst "\n\t"
+#define V_PMAXUB_3DNOW(dst, src) V_PMAXUB_MMX(dst, src) // use MMX version
+#define V_PMAXUB(dst, src) V_PMAXUB2(dst, src, SSE_TYPE)
+#define V_PMAXUB2(dst, src, cpu) V_PMAXUB3(dst, src, cpu)
+#define V_PMAXUB3(dst, src, cpu) V_PMAXUB_##cpu(dst, src)
+
+// V_PMINUB(mmr1, mmr2, mmrw): unsigned byte-wise minimum, result left in the first operand.
+// The MMX fallback overwrites the work register, so mmr2 may NOT be the same as mmrw.
+#define V_PMINUB_MMX(dst, src, tmp) \
+    "pcmpeqb " tmp ", " tmp "\n\t" \
+    "psubusb " src ", " tmp "\n\t" \
+    "paddusb " tmp ", " dst "\n\t" \
+    "psubusb " tmp ", " dst "\n\t"
+#define V_PMINUB_SSE(dst, src, tmp) "pminub " src ", " dst "\n\t"
+#define V_PMINUB_3DNOW(dst, src, tmp) V_PMINUB_MMX(dst, src, tmp) // use MMX version
+#define V_PMINUB(dst, src, tmp) V_PMINUB2(dst, src, tmp, SSE_TYPE)
+#define V_PMINUB2(dst, src, tmp, cpu) V_PMINUB3(dst, src, tmp, cpu)
+#define V_PMINUB3(dst, src, tmp, cpu) V_PMINUB_##cpu(dst, src, tmp)
+
+// V_MOVNTQ(mmr1, mmr2): store mmr2 into mmr1 (the first argument is the destination).
+// Only the SSE flavour emits movntq; the MMX and 3DNOW flavours emit a plain movq.
+#define V_MOVNTQ_MMX(dst, src) "movq " src ", " dst "\n\t"
+#define V_MOVNTQ_3DNOW(dst, src) "movq " src ", " dst "\n\t"
+#define V_MOVNTQ_SSE(dst, src) "movntq " src ", " dst "\n\t"
+#define V_MOVNTQ(dst, src) V_MOVNTQ2(dst, src, SSE_TYPE)
+#define V_MOVNTQ2(dst, src, cpu) V_MOVNTQ3(dst, src, cpu)
+#define V_MOVNTQ3(dst, src, cpu) V_MOVNTQ_##cpu(dst, src)
+
+// end of macros
diff --git a/gst/deinterlace2/tvtime/mmx.h b/gst/deinterlace2/tvtime/mmx.h
new file mode 100644
index 00000000..3627e61b
--- /dev/null
+++ b/gst/deinterlace2/tvtime/mmx.h
@@ -0,0 +1,723 @@
+/* mmx.h
+
+ MultiMedia eXtensions GCC interface library for IA32.
+
+ To use this library, simply include this header file
+ and compile with GCC. You MUST have inlining enabled
+ in order for mmx_ok() to work; this can be done by
+ simply using -O on the GCC command line.
+
+ Compiling with -DMMX_TRACE will cause detailed trace
+ output to be sent to stderr for each mmx operation.
+ This adds lots of code, and obviously slows execution to
+ a crawl, but can be very useful for debugging.
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY
+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
+ LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ AND FITNESS FOR ANY PARTICULAR PURPOSE.
+
+ 1997-98 by H. Dietz and R. Fisher
+
+ History:
+ 97-98* R.Fisher Early versions
+ 980501 R.Fisher Original Release
+ 980611* H.Dietz Rewrite, correctly implementing inlines, and
+ R.Fisher including direct register accesses.
+ 980616 R.Fisher Release of 980611 as 980616.
+ 980714 R.Fisher Minor corrections to Makefile, etc.
+ 980715 R.Fisher mmx_ok() now prevents optimizer from using
+ clobbered values.
+ mmx_ok() now checks if cpuid instruction is
+ available before trying to use it.
+ 980726* R.Fisher mm_support() searches for AMD 3DNow, Cyrix
+ Extended MMX, and standard MMX. It returns a
+ value which is positive if any of these are
+ supported, and can be masked with constants to
+ see which. mmx_ok() is now a call to this
+ 980726* R.Fisher Added i2r support for shift functions
+ 980919 R.Fisher Fixed AMD extended feature recognition bug.
+ 980921 R.Fisher Added definition/check for _MMX_H.
+ Added "float s[2]" to mmx_t for use with
+ 3DNow and EMMX. So same mmx_t can be used.
+ 981013 R.Fisher Fixed cpuid function 1 bug (looked at wrong reg)
+ Fixed psllq_i2r error in mmxtest.c
+
+ * Unreleased (internal or interim) versions
+
+ Notes:
+ It appears that the latest gas has the pand problem fixed, therefore
+ I'll undefine BROKEN_PAND by default.
+ String compares may be quicker than the multiple test/jumps in vendor
+ test sequence in mmx_ok(), but I'm not concerned with that right now.
+
+ Acknowledgments:
+ Jussi Laako for pointing out the errors ultimately found to be
+ connected to the failure to notify the optimizer of clobbered values.
+ Roger Hardiman for reminding us that CPUID isn't everywhere, and that
+ someone may actually try to use this on a machine without CPUID.
+ Also for suggesting code for checking this.
+ Robert Dale for pointing out the AMD recognition bug.
+ Jimmy Mayfield and Carl Witty for pointing out the Intel recognition
+ bug.
+ Carl Witty for pointing out the psllq_i2r test bug.
+*/
+
+#ifndef _MMX_H
+#define _MMX_H
+
+/*#define MMX_TRACE */
+
+/* Warning: at this writing, the version of GAS packaged
+ with most Linux distributions does not handle the
+ parallel AND operation mnemonic correctly. If the
+ symbol BROKEN_PAND is defined, a slower alternative
+ coding will be used. If execution of mmxtest results
+ in an illegal instruction fault, define this symbol.
+*/
+#undef BROKEN_PAND
+
+
+/* The type of a value that fits in an MMX register
+ (note that long long constant values MUST be suffixed
+ by LL and unsigned long long values by ULL, lest
+ they be truncated by the compiler)
+*/
+typedef union {
+ long long q; /* Quadword (64-bit) value */
+ unsigned long long uq; /* Unsigned Quadword */
+ int d[2]; /* 2 Doubleword (32-bit) values */
+ unsigned int ud[2]; /* 2 Unsigned Doubleword */
+ short w[4]; /* 4 Word (16-bit) values */
+ unsigned short uw[4]; /* 4 Unsigned Word */
+ char b[8]; /* 8 Byte (8-bit) values */
+ unsigned char ub[8]; /* 8 Unsigned Byte */
+ float s[2]; /* 2 Single-precision (32-bit) values, for 3DNow and EMMX */
+} mmx_t;
+
+
+/* Report which multimedia instruction sets the CPU supports (IA-32 only: uses popl).
+   static inline works under GNU89 and C99; %= gives each inlined copy unique labels. */
+static inline int
+mm_support(void)
+{
+  /* Returns 1 if MMX instructions are supported,
+     3 if Cyrix MMX and Extended MMX instructions are supported
+     5 if AMD MMX and 3DNow! instructions are supported
+     0 if hardware does not support any of these
+  */
+  int rval = 0;
+
+  __asm__ __volatile__ (
+    /* See if CPUID instruction is supported ... */
+    /* ... Get copies of EFLAGS into eax and ecx */
+    "pushf\n\t"
+    "popl %%eax\n\t"
+    "movl %%eax, %%ecx\n\t"
+
+    /* ... Toggle the ID bit in one copy and store */
+    /* to the EFLAGS reg */
+    "xorl $0x200000, %%eax\n\t"
+    "push %%eax\n\t"
+    "popf\n\t"
+
+    /* ... Get the (hopefully modified) EFLAGS */
+    "pushf\n\t"
+    "popl %%eax\n\t"
+
+    /* ... Compare and test result */
+    "xorl %%eax, %%ecx\n\t"
+    "testl $0x200000, %%ecx\n\t"
+    "jz NotSupported1_%=\n\t" /* Nothing supported */
+
+
+    /* Get standard CPUID information, and
+       go to a specific vendor section */
+    "movl $0, %%eax\n\t"
+    "cpuid\n\t"
+
+    /* Check for Intel ("GenuineIntel" in ebx:edx:ecx) */
+    "cmpl $0x756e6547, %%ebx\n\t"
+    "jne TryAMD_%=\n\t"
+    "cmpl $0x49656e69, %%edx\n\t"
+    "jne TryAMD_%=\n\t"
+    "cmpl $0x6c65746e, %%ecx\n\t"
+    "jne TryAMD_%=\n\t"
+    "jmp Intel_%=\n\t"
+
+    /* Check for AMD ("AuthenticAMD") */
+    "\nTryAMD_%=:\n\t"
+    "cmpl $0x68747541, %%ebx\n\t"
+    "jne TryCyrix_%=\n\t"
+    "cmpl $0x69746e65, %%edx\n\t"
+    "jne TryCyrix_%=\n\t"
+    "cmpl $0x444d4163, %%ecx\n\t"
+    "jne TryCyrix_%=\n\t"
+    "jmp AMD_%=\n\t"
+
+    /* Check for Cyrix ("CyrixInstead") */
+    "\nTryCyrix_%=:\n\t"
+    "cmpl $0x69727943, %%ebx\n\t"
+    "jne NotSupported2_%=\n\t"
+    "cmpl $0x736e4978, %%edx\n\t"
+    "jne NotSupported3_%=\n\t"
+    "cmpl $0x64616574, %%ecx\n\t"
+    "jne NotSupported4_%=\n\t"
+    /* Drop through to Cyrix... */
+
+
+    /* Cyrix Section */
+    /* See if extended CPUID is supported */
+    "movl $0x80000000, %%eax\n\t"
+    "cpuid\n\t"
+    "cmpl $0x80000000, %%eax\n\t"
+    "jb MMXtest_%=\n\t" /* Unsigned test (a signed jl never fires here): use standard CPUID */
+
+    /* Extended CPUID supported, so get extended features */
+    "movl $0x80000001, %%eax\n\t"
+    "cpuid\n\t"
+    "testl $0x00800000, %%eax\n\t" /* Test for MMX; NOTE(review): AMD path tests edx - confirm eax */
+    "jz NotSupported5_%=\n\t" /* MMX not supported */
+    "testl $0x01000000, %%eax\n\t" /* Test for Ext'd MMX */
+    "jnz EMMXSupported_%=\n\t"
+    "movl $1, %0\n\t" /* MMX Supported */
+    "jmp Return_%=\n\n"
+    "EMMXSupported_%=:\n\t"
+    "movl $3, %0\n\t" /* EMMX and MMX Supported */
+    "jmp Return_%=\n\t"
+
+
+    /* AMD Section */
+    "AMD_%=:\n\t"
+
+    /* See if extended CPUID is supported */
+    "movl $0x80000000, %%eax\n\t"
+    "cpuid\n\t"
+    "cmpl $0x80000000, %%eax\n\t"
+    "jb MMXtest_%=\n\t" /* Unsigned test: try standard CPUID instead */
+
+    /* Extended CPUID supported, so get extended features */
+    "movl $0x80000001, %%eax\n\t"
+    "cpuid\n\t"
+    "testl $0x00800000, %%edx\n\t" /* Test for MMX */
+    "jz NotSupported6_%=\n\t" /* MMX not supported */
+    "testl $0x80000000, %%edx\n\t" /* Test for 3DNow! */
+    "jnz ThreeDNowSupported_%=\n\t"
+    "movl $1, %0\n\t" /* MMX Supported */
+    "jmp Return_%=\n\n"
+    "ThreeDNowSupported_%=:\n\t"
+    "movl $5, %0\n\t" /* 3DNow! and MMX Supported */
+    "jmp Return_%=\n\t"
+
+
+    /* Intel Section */
+    "Intel_%=:\n\t"
+
+    /* Check for MMX */
+    "MMXtest_%=:\n\t"
+    "movl $1, %%eax\n\t"
+    "cpuid\n\t"
+    "testl $0x00800000, %%edx\n\t" /* Test for MMX */
+    "jz NotSupported7_%=\n\t" /* MMX Not supported */
+    "movl $1, %0\n\t" /* MMX Supported */
+    "jmp Return_%=\n\t"
+
+    /* Nothing supported ('#' lines are assembler comments kept for debugging) */
+    "\nNotSupported1_%=:\n\t"
+    "#movl $101, %0\n\t"
+    "\nNotSupported2_%=:\n\t"
+    "#movl $102, %0\n\t"
+    "\nNotSupported3_%=:\n\t"
+    "#movl $103, %0\n\t"
+    "\nNotSupported4_%=:\n\t"
+    "#movl $104, %0\n\t"
+    "\nNotSupported5_%=:\n\t"
+    "#movl $105, %0\n\t"
+    "\nNotSupported6_%=:\n\t"
+    "#movl $106, %0\n\t"
+    "\nNotSupported7_%=:\n\t"
+    "#movl $107, %0\n\t"
+    "movl $0, %0\n\t"
+
+    "Return_%=:\n\t"
+    : "=a" (rval)
+    : /* no input */
+    : "ebx", "ecx", "edx", "cc" /* eax is the output operand, so it must not be listed */
+  );
+
+  /* Return */
+  return(rval);
+}
+
+/* Function to test if mmx instructions are supported...
+*/
+static inline int
+mmx_ok(void)
+{
+  /* Returns 1 if MMX instructions are supported, 0 otherwise */
+  return ( mm_support() & 0x1 );
+}
+
+
+/* Helper functions for the instruction macros that follow...
+ (note that memory-to-register, m2r, instructions are nearly
+ as efficient as register-to-register, r2r, instructions;
+ however, memory-to-memory instructions are really simulated
+ as a convenience, and are only 1/3 as efficient)
+*/
+#ifdef MMX_TRACE
+
+/* Include the stuff for printing a trace to stderr...
+*/
+
+#include <stdio.h>
+
+#define mmx_i2r(op, imm, reg) \
+ { /* trace: log imm and reg, execute op, then log the new reg */ \
+ mmx_t mmx_trace; \
+ mmx_trace.q = (imm); /* imm is an integer constant, not an mmx_t */ \
+ fprintf(stderr, #op "_i2r(" #imm "=0x%016llx, ", mmx_trace.q); \
+ __asm__ __volatile__ ("movq %%" #reg ", %0" \
+ : "=X" (mmx_trace) \
+ : /* nothing */ ); \
+ fprintf(stderr, #reg "=0x%016llx) => ", mmx_trace.q); \
+ __asm__ __volatile__ (#op " %0, %%" #reg \
+ : /* nothing */ \
+ : "X" (imm)); \
+ __asm__ __volatile__ ("movq %%" #reg ", %0" \
+ : "=X" (mmx_trace) \
+ : /* nothing */ ); \
+ fprintf(stderr, #reg "=0x%016llx\n", mmx_trace.q); \
+ }
+
+#define mmx_m2r(op, mem, reg) \
+ { /* trace: log mem and reg, execute op, then log the new reg */ \
+ mmx_t mmx_trace; \
+ mmx_trace = (mem); \
+ fprintf(stderr, #op "_m2r(" #mem "=0x%016llx, ", mmx_trace.q); \
+ __asm__ __volatile__ ("movq %%" #reg ", %0" \
+ : "=X" (mmx_trace) \
+ : /* nothing */ ); \
+ fprintf(stderr, #reg "=0x%016llx) => ", mmx_trace.q); \
+ __asm__ __volatile__ (#op " %0, %%" #reg \
+ : /* nothing */ \
+ : "X" (mem)); /* NOTE(review): non-trace variant uses "m" here */ \
+ __asm__ __volatile__ ("movq %%" #reg ", %0" \
+ : "=X" (mmx_trace) \
+ : /* nothing */ ); \
+ fprintf(stderr, #reg "=0x%016llx\n", mmx_trace.q); \
+ }
+
+#define mmx_r2m(op, reg, mem) \
+ { /* trace: log reg and mem, execute op, then log the new mem */ \
+ mmx_t mmx_trace; \
+ __asm__ __volatile__ ("movq %%" #reg ", %0" \
+ : "=X" (mmx_trace) \
+ : /* nothing */ ); \
+ fprintf(stderr, #op "_r2m(" #reg "=0x%016llx, ", mmx_trace.q); \
+ mmx_trace = (mem); \
+ fprintf(stderr, #mem "=0x%016llx) => ", mmx_trace.q); \
+ __asm__ __volatile__ (#op " %%" #reg ", %0" \
+ : "=X" (mem) \
+ : /* nothing */ ); \
+ mmx_trace = (mem); \
+ fprintf(stderr, #mem "=0x%016llx\n", mmx_trace.q); \
+ }
+
+#define mmx_r2r(op, regs, regd) \
+ { /* trace: log regs and regd, execute op, then log the new regd */ \
+ mmx_t mmx_trace; \
+ __asm__ __volatile__ ("movq %%" #regs ", %0" \
+ : "=X" (mmx_trace) \
+ : /* nothing */ ); \
+ fprintf(stderr, #op "_r2r(" #regs "=0x%016llx, ", mmx_trace.q); \
+ __asm__ __volatile__ ("movq %%" #regd ", %0" \
+ : "=X" (mmx_trace) \
+ : /* nothing */ ); \
+ fprintf(stderr, #regd "=0x%016llx) => ", mmx_trace.q); \
+ __asm__ __volatile__ (#op " %" #regs ", %" #regd); /* no operands: single '%' */ \
+ __asm__ __volatile__ ("movq %%" #regd ", %0" \
+ : "=X" (mmx_trace) \
+ : /* nothing */ ); \
+ fprintf(stderr, #regd "=0x%016llx\n", mmx_trace.q); \
+ }
+
+#define mmx_m2m(op, mems, memd) \
+ { /* trace: log mems and memd, compute memd = memd op mems via mm0, log memd */ \
+ mmx_t mmx_trace; \
+ mmx_trace = (mems); \
+ fprintf(stderr, #op "_m2m(" #mems "=0x%016llx, ", mmx_trace.q); \
+ mmx_trace = (memd); \
+ fprintf(stderr, #memd "=0x%016llx) => ", mmx_trace.q); \
+ __asm__ __volatile__ ("movq %0, %%mm0\n\t" \
+ #op " %1, %%mm0\n\t" \
+ "movq %%mm0, %0" \
+ : "=X" (memd) /* NOTE(review): %0 is read first, yet declared output-only */ \
+ : "X" (mems)); \
+ mmx_trace = (memd); \
+ fprintf(stderr, #memd "=0x%016llx\n", mmx_trace.q); \
+ }
+
+#else
+
+/* Without tracing each helper is a single asm statement.
+   mmx_i2r: apply op with the compile-time constant imm to MMX register reg.
+   imm is an "i" operand, so constant expressions and macro constants work. */
+#define mmx_i2r(op, imm, reg) \
+ __asm__ __volatile__ (#op " %0, %%" #reg \
+ : /* nothing */ \
+ : "i" (imm))
+/* mmx_m2r: run insn with memory operand src as source and MMX register dst as destination. */
+#define mmx_m2r(insn, src, dst) \
+ __asm__ __volatile__ (#insn " %0, %%" #dst \
+ : /* no outputs */ \
+ : "m" (src))
+/* mmx_r2m: run insn from MMX register src into memory operand dst. */
+#define mmx_r2m(insn, src, dst) \
+ __asm__ __volatile__ (#insn " %%" #src ", %0" \
+ : "=m" (dst) \
+ : /* no inputs */ )
+/* mmx_r2r: run insn between two MMX registers (no operands, hence the single '%'). */
+#define mmx_r2r(insn, src, dst) \
+ __asm__ __volatile__ (#insn " %" #src ", %" #dst)
+/* mmx_m2m: memd = memd op mems, staged through %mm0 (whose old contents are lost). */
+#define mmx_m2m(op, mems, memd) \
+ __asm__ __volatile__ ("movq %0, %%mm0\n\t" \
+ #op " %1, %%mm0\n\t" \
+ "movq %%mm0, %0" \
+ : "+m" (memd) /* loaded before it is stored, so read-write */ \
+ : "m" (mems))
+
+#endif
+
+
+/* movq: move one 64-bit quadword.
+ Serves as both load and store;
+ it is the only way to store an MMX register.
+*/
+#define movq_m2r(mem, reg) mmx_m2r(movq, mem, reg)
+#define movq_r2m(reg, mem) mmx_r2m(movq, reg, mem)
+#define movq_r2r(src, dst) mmx_r2r(movq, src, dst)
+#define movq(src, dst) \
+ __asm__ __volatile__ ("movq %1, %%mm0\n\t" \
+ "movq %%mm0, %0" \
+ : "=X" (dst) \
+ : "X" (src))
+
+
+/* movd: move one 32-bit doubleword.
+ Like movq it is both a load and a store,
+ but it is most useful for moving values between
+ MMX registers and ordinary registers.
+*/
+#define movd_m2r(mem, reg) mmx_m2r(movd, mem, reg)
+#define movd_r2m(reg, mem) mmx_r2m(movd, reg, mem)
+#define movd_r2r(src, dst) mmx_r2r(movd, src, dst)
+#define movd(src, dst) \
+ __asm__ __volatile__ ("movd %1, %%mm0\n\t" \
+ "movd %%mm0, %0" \
+ : "=X" (dst) \
+ : "X" (src))
+
+
+/* Parallel ADDs on 2x32 (paddd), 4x16 (paddw) and 8x8 (paddb) fields
+*/
+#define paddd_m2r(mem, reg) mmx_m2r(paddd, mem, reg)
+#define paddd_r2r(src, dst) mmx_r2r(paddd, src, dst)
+#define paddd(src, dst) mmx_m2m(paddd, src, dst)
+
+#define paddw_m2r(mem, reg) mmx_m2r(paddw, mem, reg)
+#define paddw_r2r(src, dst) mmx_r2r(paddw, src, dst)
+#define paddw(src, dst) mmx_m2m(paddw, src, dst)
+
+#define paddb_m2r(mem, reg) mmx_m2r(paddb, mem, reg)
+#define paddb_r2r(src, dst) mmx_r2r(paddb, src, dst)
+#define paddb(src, dst) mmx_m2m(paddb, src, dst)
+
+
+/* Parallel ADDs with Saturation arithmetic on 4x16 (paddsw) and 8x8 (paddsb)
+*/
+#define paddsw_m2r(mem, reg) mmx_m2r(paddsw, mem, reg)
+#define paddsw_r2r(src, dst) mmx_r2r(paddsw, src, dst)
+#define paddsw(src, dst) mmx_m2m(paddsw, src, dst)
+
+#define paddsb_m2r(mem, reg) mmx_m2r(paddsb, mem, reg)
+#define paddsb_r2r(src, dst) mmx_r2r(paddsb, src, dst)
+#define paddsb(src, dst) mmx_m2m(paddsb, src, dst)
+
+
+/* Parallel ADDs with Unsigned Saturation on 4x16 (paddusw) and 8x8 (paddusb)
+*/
+#define paddusw_m2r(mem, reg) mmx_m2r(paddusw, mem, reg)
+#define paddusw_r2r(src, dst) mmx_r2r(paddusw, src, dst)
+#define paddusw(src, dst) mmx_m2m(paddusw, src, dst)
+
+#define paddusb_m2r(mem, reg) mmx_m2r(paddusb, mem, reg)
+#define paddusb_r2r(src, dst) mmx_r2r(paddusb, src, dst)
+#define paddusb(src, dst) mmx_m2m(paddusb, src, dst)
+
+
+/* Parallel SUBs on 2x32 (psubd), 4x16 (psubw) and 8x8 (psubb) fields
+*/
+#define psubd_m2r(mem, reg) mmx_m2r(psubd, mem, reg)
+#define psubd_r2r(src, dst) mmx_r2r(psubd, src, dst)
+#define psubd(src, dst) mmx_m2m(psubd, src, dst)
+
+#define psubw_m2r(mem, reg) mmx_m2r(psubw, mem, reg)
+#define psubw_r2r(src, dst) mmx_r2r(psubw, src, dst)
+#define psubw(src, dst) mmx_m2m(psubw, src, dst)
+
+#define psubb_m2r(mem, reg) mmx_m2r(psubb, mem, reg)
+#define psubb_r2r(src, dst) mmx_r2r(psubb, src, dst)
+#define psubb(src, dst) mmx_m2m(psubb, src, dst)
+
+
+/* Parallel SUBs with Saturation arithmetic on 4x16 (psubsw) and 8x8 (psubsb)
+*/
+#define psubsw_m2r(mem, reg) mmx_m2r(psubsw, mem, reg)
+#define psubsw_r2r(src, dst) mmx_r2r(psubsw, src, dst)
+#define psubsw(src, dst) mmx_m2m(psubsw, src, dst)
+
+#define psubsb_m2r(mem, reg) mmx_m2r(psubsb, mem, reg)
+#define psubsb_r2r(src, dst) mmx_r2r(psubsb, src, dst)
+#define psubsb(src, dst) mmx_m2m(psubsb, src, dst)
+
+
+/* Parallel SUBs with Unsigned Saturation on 4x16 (psubusw) and 8x8 (psubusb)
+*/
+#define psubusw_m2r(mem, reg) mmx_m2r(psubusw, mem, reg)
+#define psubusw_r2r(src, dst) mmx_r2r(psubusw, src, dst)
+#define psubusw(src, dst) mmx_m2m(psubusw, src, dst)
+
+#define psubusb_m2r(mem, reg) mmx_m2r(psubusb, mem, reg)
+#define psubusb_r2r(src, dst) mmx_r2r(psubusb, src, dst)
+#define psubusb(src, dst) mmx_m2m(psubusb, src, dst)
+
+
+/* pmullw: 4x16 Parallel MULs, keeping the Low 16 bits of each result
+*/
+#define pmullw_m2r(mem, reg) mmx_m2r(pmullw, mem, reg)
+#define pmullw_r2r(src, dst) mmx_r2r(pmullw, src, dst)
+#define pmullw(src, dst) mmx_m2m(pmullw, src, dst)
+
+
+/* pmulhw: 4x16 Parallel MULs, keeping the High 16 bits of each result
+*/
+#define pmulhw_m2r(mem, reg) mmx_m2r(pmulhw, mem, reg)
+#define pmulhw_r2r(src, dst) mmx_r2r(pmulhw, src, dst)
+#define pmulhw(src, dst) mmx_m2m(pmulhw, src, dst)
+
+
+/* pmaddwd: 4x16->2x32 Parallel Mul-ADD
+ (multiplies the 16-bit fields, then adds adjacent
+ products to make the final 2x32 result)
+*/
+#define pmaddwd_m2r(mem, reg) mmx_m2r(pmaddwd, mem, reg)
+#define pmaddwd_r2r(src, dst) mmx_r2r(pmaddwd, src, dst)
+#define pmaddwd(src, dst) mmx_m2m(pmaddwd, src, dst)
+
+
+/* pand: 1x64 bitwise AND (BROKEN_PAND selects an emulation built from two pandn)
+*/
+#ifndef BROKEN_PAND
+#define pand_m2r(mem, reg) mmx_m2r(pand, mem, reg)
+#define pand_r2r(src, dst) mmx_r2r(pand, src, dst)
+#define pand(src, dst) mmx_m2m(pand, src, dst)
+#else
+#define pand_m2r(mem, reg) \
+ { \
+ mmx_m2r(pandn, (mmx_t) -1LL, reg); \
+ mmx_m2r(pandn, mem, reg); \
+ }
+#define pand_r2r(src, dst) \
+ { \
+ mmx_m2r(pandn, (mmx_t) -1LL, dst); \
+ mmx_r2r(pandn, src, dst); \
+ }
+#define pand(src, dst) \
+ { \
+ movq_m2r(dst, mm0); \
+ mmx_m2r(pandn, (mmx_t) -1LL, mm0); \
+ mmx_m2r(pandn, src, mm0); \
+ movq_r2m(mm0, dst); \
+ }
+#endif
+
+
+/* pandn: 1x64 bitwise AND of the source with the complemented destination
+*/
+#define pandn_m2r(mem, reg) mmx_m2r(pandn, mem, reg)
+#define pandn_r2r(src, dst) mmx_r2r(pandn, src, dst)
+#define pandn(src, dst) mmx_m2m(pandn, src, dst)
+
+
+/* por: 1x64 bitwise OR
+*/
+#define por_m2r(mem, reg) mmx_m2r(por, mem, reg)
+#define por_r2r(src, dst) mmx_r2r(por, src, dst)
+#define por(src, dst) mmx_m2m(por, src, dst)
+
+
+/* pxor: 1x64 bitwise eXclusive OR
+*/
+#define pxor_m2r(mem, reg) mmx_m2r(pxor, mem, reg)
+#define pxor_r2r(src, dst) mmx_r2r(pxor, src, dst)
+#define pxor(src, dst) mmx_m2m(pxor, src, dst)
+
+
+/* Parallel CoMPare for EQuality on 2x32, 4x16 and 8x8 fields
+ (each resulting field is either 0 or -1)
+*/
+#define pcmpeqd_m2r(mem, reg) mmx_m2r(pcmpeqd, mem, reg)
+#define pcmpeqd_r2r(src, dst) mmx_r2r(pcmpeqd, src, dst)
+#define pcmpeqd(src, dst) mmx_m2m(pcmpeqd, src, dst)
+
+#define pcmpeqw_m2r(mem, reg) mmx_m2r(pcmpeqw, mem, reg)
+#define pcmpeqw_r2r(src, dst) mmx_r2r(pcmpeqw, src, dst)
+#define pcmpeqw(src, dst) mmx_m2m(pcmpeqw, src, dst)
+
+#define pcmpeqb_m2r(mem, reg) mmx_m2r(pcmpeqb, mem, reg)
+#define pcmpeqb_r2r(src, dst) mmx_r2r(pcmpeqb, src, dst)
+#define pcmpeqb(src, dst) mmx_m2m(pcmpeqb, src, dst)
+
+
+/* Parallel CoMPare for Greater Than on 2x32, 4x16 and 8x8 fields
+ (each resulting field is either 0 or -1)
+*/
+#define pcmpgtd_m2r(mem, reg) mmx_m2r(pcmpgtd, mem, reg)
+#define pcmpgtd_r2r(src, dst) mmx_r2r(pcmpgtd, src, dst)
+#define pcmpgtd(src, dst) mmx_m2m(pcmpgtd, src, dst)
+
+#define pcmpgtw_m2r(mem, reg) mmx_m2r(pcmpgtw, mem, reg)
+#define pcmpgtw_r2r(src, dst) mmx_r2r(pcmpgtw, src, dst)
+#define pcmpgtw(src, dst) mmx_m2m(pcmpgtw, src, dst)
+
+#define pcmpgtb_m2r(mem, reg) mmx_m2r(pcmpgtb, mem, reg)
+#define pcmpgtb_r2r(src, dst) mmx_r2r(pcmpgtb, src, dst)
+#define pcmpgtb(src, dst) mmx_m2m(pcmpgtb, src, dst)
+
+
+/* Parallel Shift Left Logical on 1x64 (psllq), 2x32 (pslld) and 4x16 (psllw)
+*/
+#define psllq_i2r(count, reg) mmx_i2r(psllq, count, reg)
+#define psllq_m2r(mem, reg) mmx_m2r(psllq, mem, reg)
+#define psllq_r2r(src, dst) mmx_r2r(psllq, src, dst)
+#define psllq(src, dst) mmx_m2m(psllq, src, dst)
+
+#define pslld_i2r(count, reg) mmx_i2r(pslld, count, reg)
+#define pslld_m2r(mem, reg) mmx_m2r(pslld, mem, reg)
+#define pslld_r2r(src, dst) mmx_r2r(pslld, src, dst)
+#define pslld(src, dst) mmx_m2m(pslld, src, dst)
+
+#define psllw_i2r(count, reg) mmx_i2r(psllw, count, reg)
+#define psllw_m2r(mem, reg) mmx_m2r(psllw, mem, reg)
+#define psllw_r2r(src, dst) mmx_r2r(psllw, src, dst)
+#define psllw(src, dst) mmx_m2m(psllw, src, dst)
+
+
+/* Parallel Shift Right Logical on 1x64 (psrlq), 2x32 (psrld) and 4x16 (psrlw)
+*/
+#define psrlq_i2r(count, reg) mmx_i2r(psrlq, count, reg)
+#define psrlq_m2r(mem, reg) mmx_m2r(psrlq, mem, reg)
+#define psrlq_r2r(src, dst) mmx_r2r(psrlq, src, dst)
+#define psrlq(src, dst) mmx_m2m(psrlq, src, dst)
+
+#define psrld_i2r(count, reg) mmx_i2r(psrld, count, reg)
+#define psrld_m2r(mem, reg) mmx_m2r(psrld, mem, reg)
+#define psrld_r2r(src, dst) mmx_r2r(psrld, src, dst)
+#define psrld(src, dst) mmx_m2m(psrld, src, dst)
+
+#define psrlw_i2r(count, reg) mmx_i2r(psrlw, count, reg)
+#define psrlw_m2r(mem, reg) mmx_m2r(psrlw, mem, reg)
+#define psrlw_r2r(src, dst) mmx_r2r(psrlw, src, dst)
+#define psrlw(src, dst) mmx_m2m(psrlw, src, dst)
+
+
+/* Parallel Shift Right Arithmetic on 2x32 (psrad) and 4x16 (psraw)
+*/
+#define psrad_i2r(count, reg) mmx_i2r(psrad, count, reg)
+#define psrad_m2r(mem, reg) mmx_m2r(psrad, mem, reg)
+#define psrad_r2r(src, dst) mmx_r2r(psrad, src, dst)
+#define psrad(src, dst) mmx_m2m(psrad, src, dst)
+
+#define psraw_i2r(count, reg) mmx_i2r(psraw, count, reg)
+#define psraw_m2r(mem, reg) mmx_m2r(psraw, mem, reg)
+#define psraw_r2r(src, dst) mmx_r2r(psraw, src, dst)
+#define psraw(src, dst) mmx_m2m(psraw, src, dst)
+
+
+/* PACK with Signed Saturation: 2x32->4x16 (packssdw) and 4x16->8x8 (packsswb)
+ (packs source and dest fields into dest in that order)
+*/
+#define packssdw_m2r(mem, reg) mmx_m2r(packssdw, mem, reg)
+#define packssdw_r2r(src, dst) mmx_r2r(packssdw, src, dst)
+#define packssdw(src, dst) mmx_m2m(packssdw, src, dst)
+
+#define packsswb_m2r(mem, reg) mmx_m2r(packsswb, mem, reg)
+#define packsswb_r2r(src, dst) mmx_r2r(packsswb, src, dst)
+#define packsswb(src, dst) mmx_m2m(packsswb, src, dst)
+
+
+/* PACK with Unsigned Saturation: 4x16->8x8 (packuswb)
+ (packs source and dest fields into dest in that order)
+*/
+#define packuswb_m2r(mem, reg) mmx_m2r(packuswb, mem, reg)
+#define packuswb_r2r(src, dst) mmx_r2r(packuswb, src, dst)
+#define packuswb(src, dst) mmx_m2m(packuswb, src, dst)
+
+
+/* UNPaCK Low: 2x32->1x64, 4x16->2x32 and 8x8->4x16
+ (interleaves the low half of dest with the low half
+ of source as padding in each result field)
+*/
+#define punpckldq_m2r(mem, reg) mmx_m2r(punpckldq, mem, reg)
+#define punpckldq_r2r(src, dst) mmx_r2r(punpckldq, src, dst)
+#define punpckldq(src, dst) mmx_m2m(punpckldq, src, dst)
+
+#define punpcklwd_m2r(mem, reg) mmx_m2r(punpcklwd, mem, reg)
+#define punpcklwd_r2r(src, dst) mmx_r2r(punpcklwd, src, dst)
+#define punpcklwd(src, dst) mmx_m2m(punpcklwd, src, dst)
+
+#define punpcklbw_m2r(mem, reg) mmx_m2r(punpcklbw, mem, reg)
+#define punpcklbw_r2r(src, dst) mmx_r2r(punpcklbw, src, dst)
+#define punpcklbw(src, dst) mmx_m2m(punpcklbw, src, dst)
+
+
+/* UNPaCK High: 2x32->1x64, 4x16->2x32 and 8x8->4x16
+ (interleaves the high half of dest with the high half
+ of source as padding in each result field)
+*/
+#define punpckhdq_m2r(mem, reg) mmx_m2r(punpckhdq, mem, reg)
+#define punpckhdq_r2r(src, dst) mmx_r2r(punpckhdq, src, dst)
+#define punpckhdq(src, dst) mmx_m2m(punpckhdq, src, dst)
+
+#define punpckhwd_m2r(mem, reg) mmx_m2r(punpckhwd, mem, reg)
+#define punpckhwd_r2r(src, dst) mmx_r2r(punpckhwd, src, dst)
+#define punpckhwd(src, dst) mmx_m2m(punpckhwd, src, dst)
+
+#define punpckhbw_m2r(mem, reg) mmx_m2r(punpckhbw, mem, reg)
+#define punpckhbw_r2r(src, dst) mmx_r2r(punpckhbw, src, dst)
+#define punpckhbw(src, dst) mmx_m2m(punpckhbw, src, dst)
+
+
+/* emms: Empty MMx State
+ (cleans up when switching from mmx to float use of
+ the registers shared by both; no float-to-mmx
+ operation is needed, because only the float tag
+ word info is corruptible)
+*/
+#ifndef MMX_TRACE
+
+#define emms() __asm__ __volatile__ ("emms")
+
+#else
+
+#define emms() \
+ { \
+ fprintf(stderr, "emms()\n"); \
+ __asm__ __volatile__ ("emms"); \
+ }
+
+#endif
+
+#endif
diff --git a/gst/deinterlace2/tvtime/plugins.h b/gst/deinterlace2/tvtime/plugins.h
new file mode 100644
index 00000000..0eb90c0b
--- /dev/null
+++ b/gst/deinterlace2/tvtime/plugins.h
@@ -0,0 +1,42 @@
+/*
+ *
+ * GStreamer
+ * Copyright (C) 2004 Billy Biggs <vektor@dumbterm.net>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+ * Relicensed for GStreamer from GPL to LGPL with permit from Billy Biggs.
+ * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578
+ */
+
+#ifndef TVTIME_PLUGINS_H_INCLUDED
+#define TVTIME_PLUGINS_H_INCLUDED
+/* One getter per deinterlacing method; each returns a deinterlace_method_t pointer. */
+deinterlace_method_t *dscaler_tomsmocomp_get_method (void);
+deinterlace_method_t *dscaler_greedyh_get_method (void);
+deinterlace_method_t *dscaler_greedyl_get_method (void);
+deinterlace_method_t *dscaler_vfir_get_method (void);
+
+//void linear_plugin_init( void );
+//void scalerbob_plugin_init( void );
+//void linearblend_plugin_init( void );
+//void weave_plugin_init( void );
+//void weavetff_plugin_init( void );
+//void weavebff_plugin_init( void );
+
+#endif /* TVTIME_PLUGINS_H_INCLUDED */
diff --git a/gst/deinterlace2/tvtime/speedtools.h b/gst/deinterlace2/tvtime/speedtools.h
new file mode 100644
index 00000000..677bb5e3
--- /dev/null
+++ b/gst/deinterlace2/tvtime/speedtools.h
@@ -0,0 +1,54 @@
+/*
+ *
+ * GStreamer
+ * Copyright (C) 2004 Billy Biggs <vektor@dumbterm.net>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+ * Relicensed for GStreamer from GPL to LGPL with permit from Billy Biggs.
+ * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578
+ */
+
+#ifndef SPEEDTOOLS_H_INCLUDED
+#define SPEEDTOOLS_H_INCLUDED
+/* Issue prefetchnta() (defined elsewhere) on 8 addresses, 64 ints apart, starting at x. */
+#define PREFETCH_2048(x) \
+ { int *pfetcha = (int *) (x); /* parenthesised: x may be an expression */ \
+ prefetchnta( pfetcha ); \
+ prefetchnta( pfetcha + 64 ); \
+ prefetchnta( pfetcha + 128 ); \
+ prefetchnta( pfetcha + 192 ); \
+ pfetcha += 256; \
+ prefetchnta( pfetcha ); \
+ prefetchnta( pfetcha + 64 ); \
+ prefetchnta( pfetcha + 128 ); \
+ prefetchnta( pfetcha + 192 ); }
+/* Read every 16th int of the 512 ints at x; NOTE(review): the sum is unused, so the loads may be optimised away. */
+#define READ_PREFETCH_2048(x) \
+ { int *pfetcha = (int *) (x); int pfetchtmp; /* parenthesised: x may be an expression */ \
+ pfetchtmp = pfetcha[ 0 ] + pfetcha[ 16 ] + pfetcha[ 32 ] + pfetcha[ 48 ] + \
+ pfetcha[ 64 ] + pfetcha[ 80 ] + pfetcha[ 96 ] + pfetcha[ 112 ] + \
+ pfetcha[ 128 ] + pfetcha[ 144 ] + pfetcha[ 160 ] + pfetcha[ 176 ] + \
+ pfetcha[ 192 ] + pfetcha[ 208 ] + pfetcha[ 224 ] + pfetcha[ 240 ]; \
+ pfetcha += 256; \
+ pfetchtmp = pfetcha[ 0 ] + pfetcha[ 16 ] + pfetcha[ 32 ] + pfetcha[ 48 ] + \
+ pfetcha[ 64 ] + pfetcha[ 80 ] + pfetcha[ 96 ] + pfetcha[ 112 ] + \
+ pfetcha[ 128 ] + pfetcha[ 144 ] + pfetcha[ 160 ] + pfetcha[ 176 ] + \
+ pfetcha[ 192 ] + pfetcha[ 208 ] + pfetcha[ 224 ] + pfetcha[ 240 ]; }
+
+#endif /* SPEEDTOOLS_H_INCLUDED */
diff --git a/gst/deinterlace2/tvtime/speedy.c b/gst/deinterlace2/tvtime/speedy.c
new file mode 100644
index 00000000..821cc254
--- /dev/null
+++ b/gst/deinterlace2/tvtime/speedy.c
@@ -0,0 +1,2791 @@
+/**
+ * Copyright (c) 2002, 2003 Billy Biggs <vektor@dumbterm.net>.
+ * Copyright (C) 2001 Matthew J. Marjanovic <maddog@mir.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+/**
+ * Includes 420to422, 422to444 scaling filters from the MPEG2 reference
+ * implementation. The v12 source code indicates that they were written
+ * by Cheung Auyeung <auyeung@mot.com>. The file they were in was:
+ *
+ * store.c, picture output routines
+ * Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved.
+ *
+ * Disclaimer of Warranty
+ *
+ * These software programs are available to the user without any license fee or
+ * royalty on an "as is" basis. The MPEG Software Simulation Group disclaims
+ * any and all warranties, whether express, implied, or statuary, including any
+ * implied warranties or merchantability or of fitness for a particular
+ * purpose. In no event shall the copyright-holder be liable for any
+ * incidental, punitive, or consequential damages of any kind whatsoever
+ * arising from the use of these programs.
+ *
+ * This disclaimer of warranty extends to the user of these programs and user's
+ * customers, employees, agents, transferees, successors, and assigns.
+ *
+ * The MPEG Software Simulation Group does not represent or warrant that the
+ * programs furnished hereunder are free of infringement of any third-party
+ * patents.
+ *
+ * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
+ * are subject to royalty fees to patent holders. Many of these patents are
+ * general enough such that they are unavoidable regardless of implementation
+ * design.
+ *
+ */
+
+/**
+ * Code for the UYVY to YUYV routine comes from rivatv:
+ *
+ * rivatv-convert.c video image conversion routines
+ *
+ * Copyright (C) 2002 Stefan Jahn <stefan@lkcc.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "gst/gst.h"
+#include "gstdeinterlace2.h"
+#include "speedy.h"
+#include "speedtools.h"
+#include "mmx.h"
+#include "sse.h"
+
+// TODO: remove includes
+//#include "attributes.h"
+//#include "mm_accel.h"
+
+/**
+ * Function pointer definitions.
+ *
+ * One global pointer per scanline/block operation.  No initialiser is
+ * given here, so each pointer starts out NULL.  The static functions in
+ * this file whose names carry a _c, _mmx or _mmxext suffix have matching
+ * signatures; presumably a setup routine elsewhere assigns one of them to
+ * each pointer (not visible in this part of the file -- confirm).
+ */
+void (*interpolate_packed422_scanline) (uint8_t * output, uint8_t * top,
+ uint8_t * bot, int width);
+void (*blit_colour_packed422_scanline) (uint8_t * output,
+ int width, int y, int cb, int cr);
+void (*blit_colour_packed4444_scanline) (uint8_t * output,
+ int width, int alpha, int luma, int cb, int cr);
+void (*blit_packed422_scanline) (uint8_t * dest, const uint8_t * src,
+ int width);
+void (*composite_packed4444_to_packed422_scanline) (uint8_t * output,
+ uint8_t * input, uint8_t * foreground, int width);
+void (*composite_packed4444_alpha_to_packed422_scanline) (uint8_t * output,
+ uint8_t * input, uint8_t * foreground, int width, int alpha);
+void (*composite_alphamask_to_packed4444_scanline) (uint8_t * output,
+ uint8_t * input, uint8_t * mask, int width, int textluma, int textcb,
+ int textcr);
+void (*composite_alphamask_alpha_to_packed4444_scanline) (uint8_t * output,
+ uint8_t * input, uint8_t * mask, int width, int textluma, int textcb,
+ int textcr, int alpha);
+void (*premultiply_packed4444_scanline) (uint8_t * output, uint8_t * input,
+ int width);
+void (*blend_packed422_scanline) (uint8_t * output, uint8_t * src1,
+ uint8_t * src2, int width, int pos);
+unsigned int (*diff_factor_packed422_scanline) (uint8_t * cur, uint8_t * old,
+ int width);
+unsigned int (*comb_factor_packed422_scanline) (uint8_t * top, uint8_t * mid,
+ uint8_t * bot, int width);
+void (*kill_chroma_packed422_inplace_scanline) (uint8_t * data, int width);
+
+void (*mirror_packed422_inplace_scanline) (uint8_t * data, int width);
+
+void (*speedy_memcpy) (void *output, const void *input, size_t size);
+
+void (*diff_packed422_block8x8) (pulldown_metrics_t * m, uint8_t * old,
+ uint8_t * new, int os, int ns);
+void (*a8_subpix_blit_scanline) (uint8_t * output, uint8_t * input,
+ int lasta, int startpos, int width);
+void (*quarter_blit_vertical_packed422_scanline) (uint8_t * output,
+ uint8_t * one, uint8_t * three, int width);
+void (*subpix_blit_vertical_packed422_scanline) (uint8_t * output,
+ uint8_t * top, uint8_t * bot, int subpixpos, int width);
+void (*packed444_to_nonpremultiplied_packed4444_scanline) (uint8_t * output,
+ uint8_t * input, int width, int alpha);
+void (*aspect_adjust_packed4444_scanline) (uint8_t * output, uint8_t * input,
+ int width, double pixel_aspect);
+void (*packed444_to_packed422_scanline) (uint8_t * output, uint8_t * input,
+ int width);
+void (*packed422_to_packed444_scanline) (uint8_t * output, uint8_t * input,
+ int width);
+void (*packed422_to_packed444_rec601_scanline) (uint8_t * dest, uint8_t * src,
+ int width);
+void (*packed444_to_rgb24_rec601_scanline) (uint8_t * output, uint8_t * input,
+ int width);
+void (*rgb24_to_packed444_rec601_scanline) (uint8_t * output, uint8_t * input,
+ int width);
+void (*rgba32_to_packed4444_rec601_scanline) (uint8_t * output, uint8_t * input,
+ int width);
+void (*invert_colour_packed422_inplace_scanline) (uint8_t * data, int width);
+
+void (*vfilter_chroma_121_packed422_scanline) (uint8_t * output, int width,
+ uint8_t * m, uint8_t * t, uint8_t * b);
+void (*vfilter_chroma_332_packed422_scanline) (uint8_t * output, int width,
+ uint8_t * m, uint8_t * t, uint8_t * b);
+void (*convert_uyvy_to_yuyv_scanline) (uint8_t * uyvy_buf, uint8_t * yuyv_buf,
+ int width);
+void (*composite_colour4444_alpha_to_packed422_scanline) (uint8_t * output,
+ uint8_t * input, int af, int y, int cb, int cr, int width, int alpha);
+
+/**
+ * result = (1 - alpha)B + alpha*F
+ * = B - alpha*B + alpha*F
+ * = B + alpha*(F - B)
+ */
+
+/**
+ * Scales r by a / 255 without dividing: 0x80 is added to the product as a
+ * rounding bias and the result is folded down with two right shifts.
+ * Intended for operands in the 8-bit range.
+ */
+static inline __attribute__ ((always_inline, const))
+     int multiply_alpha (int a, int r)
+{
+  const int biased = (a * r) + 0x80;
+
+  return (biased + (biased >> 8)) >> 8;
+}
+
+/**
+ * Clamps x to the range [0, 255] and returns it as a byte.
+ */
+static inline __attribute__ ((always_inline, const))
+     uint8_t clip255 (int x)
+{
+  return (x > 255) ? 255 : ((x < 0) ? 0 : x);
+}
+/* Threshold each per-sample comb metric is compared against in comb_factor_packed422_scanline_mmx(). */
+unsigned long CombJaggieThreshold = 73;
+/**
+ * Counts, over one packed 4:2:2 scanline, the luma samples whose comb
+ * metric exceeds CombJaggieThreshold.  With the luma of the three lines
+ * halved first, the metric per sample is
+ *   (top - mid) * (bot - mid) - ((top - bot)^2 >> 7)
+ * evaluated in 16-bit lanes.  Four pixels (8 bytes) are handled per
+ * iteration; width is in pixels and any remainder below four is ignored.
+ * Returns the total count.
+ */
+#ifdef HAVE_CPU_I386
+static unsigned int
+comb_factor_packed422_scanline_mmx (uint8_t * top, uint8_t * mid,
+ uint8_t * bot, int width)
+{
+ const mmx_t qwYMask = { 0x00ff00ff00ff00ffULL };
+ const mmx_t qwOnes = { 0x0001000100010001ULL };
+ mmx_t qwThreshold;
+
+ unsigned int temp1, temp2;
+
+ width /= 4; /* number of 4-pixel (8-byte) groups */
+
+ qwThreshold.uw[0] = CombJaggieThreshold; /* replicate the threshold into all four lanes */
+ qwThreshold.uw[1] = CombJaggieThreshold;
+ qwThreshold.uw[2] = CombJaggieThreshold;
+ qwThreshold.uw[3] = CombJaggieThreshold;
+
+ movq_m2r (qwThreshold, mm0);
+ movq_m2r (qwYMask, mm1);
+ movq_m2r (qwOnes, mm2);
+ pxor_r2r (mm7, mm7); /* mm7 = 0: four running 16-bit counters. */
+
+ while (width--) {
+ /* Load and keep just the luma. */
+ movq_m2r (*top, mm3);
+ movq_m2r (*mid, mm4);
+ movq_m2r (*bot, mm5);
+
+ pand_r2r (mm1, mm3);
+ pand_r2r (mm1, mm4);
+ pand_r2r (mm1, mm5);
+
+ /* Halve the luma, then work out
+  * mm6 = (top - mid) * (bot - mid) - ( (top - bot)^2 >> 7 ) */
+ psrlw_i2r (1, mm3);
+ psrlw_i2r (1, mm4);
+ psrlw_i2r (1, mm5);
+
+ /* mm6 = (top - mid) */
+ movq_r2r (mm3, mm6);
+ psubw_r2r (mm4, mm6);
+
+ /* mm3 = (top - bot) */
+ psubw_r2r (mm5, mm3);
+
+ /* mm5 = (bot - mid) */
+ psubw_r2r (mm4, mm5);
+
+ /* mm6 = (top - mid) * (bot - mid) */
+ pmullw_r2r (mm5, mm6);
+
+ /* mm3 = (top - bot)^2 >> 7 */
+ pmullw_r2r (mm3, mm3); /* mm3 = (top - bot)^2 */
+ psrlw_i2r (7, mm3); /* mm3 = ((top - bot)^2 >> 7) */
+
+ /* mm6 is what we want. */
+ psubw_r2r (mm3, mm6);
+
+ /* FF's if greater than qwThreshold */
+ pcmpgtw_r2r (mm0, mm6);
+
+ /* Add to count if we are greater than threshold */
+ pand_r2r (mm2, mm6);
+ paddw_r2r (mm6, mm7);
+
+ top += 8;
+ mid += 8;
+ bot += 8;
+ }
+
+ movd_r2m (mm7, temp1); /* low two counters */
+ psrlq_i2r (32, mm7);
+ movd_r2m (mm7, temp2); /* high two counters */
+ temp1 += temp2;
+ temp2 = temp1;
+ temp1 >>= 16; /* fold the upper 16-bit half onto the lower one */
+ temp1 += temp2 & 0xffff;
+
+ emms ();
+
+ return temp1;
+}
+#endif
+
+/* Right shift applied to every squared luma difference by the diff_factor routines. */
+static unsigned long BitShift = 6;
+
+/**
+ * Difference metric between two packed 4:2:2 scanlines.  For each group of
+ * four pixels (8 bytes) the four luma samples of each line are averaged
+ * with rounding, the two averages are subtracted, and the squared
+ * difference shifted right by BitShift is added to the result.
+ * width is in pixels; a remainder of fewer than four pixels is ignored.
+ */
+static unsigned int
+diff_factor_packed422_scanline_c (uint8_t * cur, uint8_t * old, int width)
+{
+  unsigned int total = 0;
+  int groups;
+
+  for (groups = width / 4; groups; groups--) {
+    const unsigned int cur_avg = (cur[0] + cur[2] + cur[4] + cur[6] + 2) >> 2;
+    const unsigned int old_avg = (old[0] + old[2] + old[4] + old[6] + 2) >> 2;
+    /* unsigned wrap-around is harmless: the square is the same either way */
+    const unsigned int delta = cur_avg - old_avg;
+
+    total += (delta * delta) >> BitShift;
+    cur += 8;
+    old += 8;
+  }
+
+  return total;
+}
+
+/*
+static unsigned int diff_factor_packed422_scanline_test_c( uint8_t *cur, uint8_t *old, int width )
+{
+ unsigned int ret = 0;
+
+ width /= 16;
+
+ while( width-- ) {
+ unsigned int tmp1 = (cur[ 0 ] + cur[ 2 ] + cur[ 4 ] + cur[ 6 ])>>2;
+ unsigned int tmp2 = (old[ 0 ] + old[ 2 ] + old[ 4 ] + old[ 6 ])>>2;
+ tmp1 = (tmp1 - tmp2);
+ tmp1 *= tmp1;
+ tmp1 >>= BitShift;
+ ret += tmp1;
+ cur += (8*4);
+ old += (8*4);
+ }
+
+ return ret;
+}
+*/
+/**
+ * MMX version of the scanline difference metric.  For each group of four
+ * luma samples (8 bytes) the per-sample differences cur - old are squared
+ * and summed pairwise by pmaddwd, shifted right by BitShift and
+ * accumulated.  Unlike diff_factor_packed422_scanline_c() the samples are
+ * not averaged before differencing, so the two variants do not compute
+ * the same number.  Returns the sum of both 32-bit accumulator halves.
+ */
+#ifdef HAVE_CPU_I386
+static unsigned int
+diff_factor_packed422_scanline_mmx (uint8_t * cur, uint8_t * old, int width)
+{
+ const mmx_t qwYMask = { 0x00ff00ff00ff00ffULL };
+ unsigned int temp1, temp2;
+
+ width /= 4; /* four pixels (8 bytes) per iteration */
+
+ movq_m2r (qwYMask, mm1);
+ movd_m2r (BitShift, mm7); /* shift count for psrld below */
+ pxor_r2r (mm0, mm0); /* mm0 = running total */
+
+ while (width--) {
+ movq_m2r (*cur, mm4);
+ movq_m2r (*old, mm5);
+
+ pand_r2r (mm1, mm4); /* keep luma only */
+ pand_r2r (mm1, mm5);
+
+ psubw_r2r (mm5, mm4); /* mm4 = Y1 - Y2 */
+ pmaddwd_r2r (mm4, mm4); /* mm4 = (Y1 - Y2)^2 */
+ psrld_r2r (mm7, mm4); /* divide mm4 by 2^BitShift */
+ paddd_r2r (mm4, mm0); /* keep total in mm0 */
+
+ cur += 8;
+ old += 8;
+ }
+
+ movd_r2m (mm0, temp1);
+ psrlq_i2r (32, mm0);
+ movd_r2m (mm0, temp2);
+ temp1 += temp2; /* add the two 32-bit partial sums */
+
+ emms ();
+
+ return temp1;
+}
+#endif
+
+// defined in glib/gmacros.h #define ABS(a) (((a) < 0)?-(a):(a))
+/**
+ * MMX version of diff_packed422_block8x8_c(): fills *m with difference
+ * metrics for an 8x8 luma block of two packed 4:2:2 images.
+ *   m->e / m->o  sum of |new - old| over the even / odd rows
+ *   m->d         m->e + m->o
+ *   m->p         sum over columns of |sum(old odd row - old even row)|
+ *   m->t         sum over columns of |sum(old odd row - new even row)|
+ *   m->s         sum over columns of |sum(new odd row - new even row)|
+ * old/new point at the top-left byte of the block, os/ns are the strides
+ * in bytes of the old and new image.
+ */
+#ifdef HAVE_CPU_I386
+static void
+diff_packed422_block8x8_mmx (pulldown_metrics_t * m, uint8_t * old,
+ uint8_t * new, int os, int ns)
+{
+ const mmx_t ymask = { 0x00ff00ff00ff00ffULL };
+ short out[24]; /* Output buffer for the partial metrics from the mmx code. */
+
+ uint8_t *outdata = (uint8_t *) out; /* byte view of out[] so stores can use byte offsets */
+
+ uint8_t *oldp, *newp;
+
+ int i;
+
+ pxor_r2r (mm4, mm4); // 4 even difference sums.
+ pxor_r2r (mm5, mm5); // 4 odd difference sums.
+ pxor_r2r (mm7, mm7); // zeros
+
+ oldp = old;
+ newp = new;
+ for (i = 4; i; --i) { /* each pass handles one even row and one odd row */
+ // Even difference.
+ movq_m2r (oldp[0], mm0);
+ movq_m2r (oldp[8], mm2);
+ pand_m2r (ymask, mm0);
+ pand_m2r (ymask, mm2);
+ oldp += os;
+
+ movq_m2r (newp[0], mm1);
+ movq_m2r (newp[8], mm3);
+ pand_m2r (ymask, mm1);
+ pand_m2r (ymask, mm3);
+ newp += ns;
+
+ movq_r2r (mm0, mm6); /* saturating subtract both ways: one of the two is |old - new|, the other 0 */
+ psubusb_r2r (mm1, mm0);
+ psubusb_r2r (mm6, mm1);
+ movq_r2r (mm2, mm6);
+ psubusb_r2r (mm3, mm2);
+ psubusb_r2r (mm6, mm3);
+
+ paddw_r2r (mm0, mm4);
+ paddw_r2r (mm1, mm4);
+ paddw_r2r (mm2, mm4);
+ paddw_r2r (mm3, mm4);
+
+ // Odd difference.
+ movq_m2r (oldp[0], mm0);
+ movq_m2r (oldp[8], mm2);
+ pand_m2r (ymask, mm0);
+ pand_m2r (ymask, mm2);
+ oldp += os;
+
+ movq_m2r (newp[0], mm1);
+ movq_m2r (newp[8], mm3);
+ pand_m2r (ymask, mm1);
+ pand_m2r (ymask, mm3);
+ newp += ns;
+
+ movq_r2r (mm0, mm6);
+ psubusb_r2r (mm1, mm0);
+ psubusb_r2r (mm6, mm1);
+ movq_r2r (mm2, mm6);
+ psubusb_r2r (mm3, mm2);
+ psubusb_r2r (mm6, mm3);
+
+ paddw_r2r (mm0, mm5);
+ paddw_r2r (mm1, mm5);
+ paddw_r2r (mm2, mm5);
+ paddw_r2r (mm3, mm5);
+ }
+ movq_r2m (mm4, outdata[0]); /* out[0..3] = even-row partial sums */
+ movq_r2m (mm5, outdata[8]); /* out[4..7] = odd-row partial sums */
+
+ m->e = out[0] + out[1] + out[2] + out[3];
+ m->o = out[4] + out[5] + out[6] + out[7];
+ m->d = m->e + m->o;
+
+ pxor_r2r (mm4, mm4); // Past spatial noise.
+ pxor_r2r (mm5, mm5); // Temporal noise.
+ pxor_r2r (mm6, mm6); // Current spatial noise.
+
+ // First loop to measure first four columns
+ oldp = old;
+ newp = new;
+ for (i = 4; i; --i) {
+ movq_m2r (oldp[0], mm0); /* old even row */
+ movq_m2r (oldp[os], mm1); /* old odd row */
+ pand_m2r (ymask, mm0);
+ pand_m2r (ymask, mm1);
+ oldp += (os * 2);
+
+ movq_m2r (newp[0], mm2); /* new even row */
+ movq_m2r (newp[ns], mm3); /* new odd row */
+ pand_m2r (ymask, mm2);
+ pand_m2r (ymask, mm3);
+ newp += (ns * 2);
+
+ paddw_r2r (mm1, mm4); /* mm4 += old odd - old even */
+ paddw_r2r (mm1, mm5); /* mm5 += old odd - new even */
+ paddw_r2r (mm3, mm6); /* mm6 += new odd - new even */
+ psubw_r2r (mm0, mm4);
+ psubw_r2r (mm2, mm5);
+ psubw_r2r (mm2, mm6);
+ }
+ movq_r2m (mm4, outdata[0]); /* out[0..3] */
+ movq_r2m (mm5, outdata[16]); /* out[8..11] */
+ movq_r2m (mm6, outdata[32]); /* out[16..19] */
+
+ pxor_r2r (mm4, mm4);
+ pxor_r2r (mm5, mm5);
+ pxor_r2r (mm6, mm6);
+
+ // Second loop for the last four columns
+ oldp = old;
+ newp = new;
+ for (i = 4; i; --i) {
+ movq_m2r (oldp[8], mm0);
+ movq_m2r (oldp[os + 8], mm1);
+ pand_m2r (ymask, mm0);
+ pand_m2r (ymask, mm1);
+ oldp += (os * 2);
+
+ movq_m2r (newp[8], mm2);
+ movq_m2r (newp[ns + 8], mm3);
+ pand_m2r (ymask, mm2);
+ pand_m2r (ymask, mm3);
+ newp += (ns * 2);
+
+ paddw_r2r (mm1, mm4);
+ paddw_r2r (mm1, mm5);
+ paddw_r2r (mm3, mm6);
+ psubw_r2r (mm0, mm4);
+ psubw_r2r (mm2, mm5);
+ psubw_r2r (mm2, mm6);
+ }
+ movq_r2m (mm4, outdata[8]); /* out[4..7] */
+ movq_r2m (mm5, outdata[24]); /* out[12..15] */
+ movq_r2m (mm6, outdata[40]); /* out[20..23] */
+
+ m->p = m->t = m->s = 0;
+ for (i = 0; i < 8; i++) { /* one signed column sum per i in each of the three groups */
+ // FIXME: move abs() into the mmx code!
+ m->p += ABS (out[i]);
+ m->t += ABS (out[8 + i]);
+ m->s += ABS (out[16 + i]);
+ }
+
+ emms ();
+}
+#endif
+
+/**
+ * Fills *m with difference metrics for an 8x8 luma block of two packed
+ * 4:2:2 images.  old/new point at the top-left byte of the block and
+ * os/ns are the byte strides of the old and new image.
+ *
+ * Rows are walked in (even, odd) pairs.  m->e and m->o are the sums of
+ * |new - old| over the even and odd rows and m->d is their total.  For
+ * every column a signed sum over the four row pairs is formed and its
+ * absolute value accumulated: m->s from new odd - new even, m->p from
+ * old odd - old even, m->t from old odd - new even.
+ */
+static void
+diff_packed422_block8x8_c (pulldown_metrics_t * m, uint8_t * old,
+    uint8_t * new, int os, int ns)
+{
+  int even_total = 0, odd_total = 0;
+  int col;
+
+  m->s = m->p = m->t = 0;
+
+  for (col = 0; col < 8; col++) {
+    /* luma samples are two bytes apart in packed 4:2:2 */
+    const uint8_t *old_row = old + 2 * col;
+    const uint8_t *new_row = new + 2 * col;
+    int spatial = 0, past = 0, temporal = 0;
+    int pair;
+
+    for (pair = 0; pair < 4; pair++) {
+      even_total += ABS (new_row[0] - old_row[0]);
+      odd_total += ABS (new_row[ns] - old_row[os]);
+      spatial += new_row[ns] - new_row[0];
+      past += old_row[os] - old_row[0];
+      temporal += old_row[os] - new_row[0];
+      old_row += os * 2;
+      new_row += ns * 2;
+    }
+
+    m->s += ABS (spatial);
+    m->p += ABS (past);
+    m->t += ABS (temporal);
+  }
+
+  m->e = even_total;
+  m->o = odd_total;
+  m->d = even_total + odd_total;
+}
+
+/**
+ * Converts packed 4:4:4 (3 bytes per pixel) to packed 4:2:2 (2 bytes per
+ * pixel), two pixels at a time.  Both luma samples are kept; the two
+ * chroma bytes are taken from the first pixel of each pair and the second
+ * pixel's chroma is dropped.  width is in pixels; an odd trailing pixel
+ * is not converted.
+ */
+static void
+packed444_to_packed422_scanline_c (uint8_t * output, uint8_t * input, int width)
+{
+  int pairs;
+
+  for (pairs = width / 2; pairs; pairs--) {
+    const uint8_t luma0 = input[0];
+    const uint8_t chroma_a = input[1];
+    const uint8_t chroma_b = input[2];
+    const uint8_t luma1 = input[3];
+
+    output[0] = luma0;
+    output[1] = chroma_a;
+    output[2] = luma1;
+    output[3] = chroma_b;
+
+    input += 6;
+    output += 4;
+  }
+}
+
+/**
+ * Converts packed 4:2:2 (2 bytes per pixel) to packed 4:4:4 (3 bytes per
+ * pixel), two pixels at a time.  The pair's two chroma bytes are simply
+ * repeated for the second pixel (no interpolation).  width is in pixels;
+ * an odd trailing pixel is not converted.
+ */
+static void
+packed422_to_packed444_scanline_c (uint8_t * output, uint8_t * input, int width)
+{
+  int pairs;
+
+  for (pairs = width / 2; pairs; pairs--) {
+    const uint8_t luma0 = input[0];
+    const uint8_t chroma_a = input[1];
+    const uint8_t luma1 = input[2];
+    const uint8_t chroma_b = input[3];
+
+    /* first pixel */
+    output[0] = luma0;
+    output[1] = chroma_a;
+    output[2] = chroma_b;
+    /* second pixel reuses the same chroma */
+    output[3] = luma1;
+    output[4] = chroma_a;
+    output[5] = chroma_b;
+
+    input += 4;
+    output += 6;
+  }
+}
+
+/**
+ * Converts a packed 4:2:2 scanline to packed 4:4:4 with filtered chroma
+ * for the second pixel of every input pair.  The first pixel of a pair
+ * copies the pair's two chroma bytes unchanged.
+ *
+ * For the middle pixels, the filter kernel is:
+ *
+ * [-1 3 -6 12 -24 80 80 -24 12 -6 3 -1]
+ *
+ * applied to six chroma samples on either side; the taps sum to 128,
+ * hence the rounding (+ 64) and the shift by 7.  Pairs too close to
+ * either end for the full kernel use the rounded mean of this pair's and
+ * the next pair's chroma, and the last pair repeats its own chroma.
+ * width is in pixels.
+ */
+static void
+packed422_to_packed444_rec601_scanline_c (uint8_t * dest, uint8_t * src,
+ int width)
+{
+ int i;
+
+ /* Process two input pixels at a time. Input is [Y'][Cb][Y'][Cr]. */
+ for (i = 0; i < width / 2; i++) {
+ dest[(i * 6) + 0] = src[(i * 4) + 0]; /* first pixel: luma, then both chroma bytes as-is */
+ dest[(i * 6) + 1] = src[(i * 4) + 1];
+ dest[(i * 6) + 2] = src[(i * 4) + 3];
+
+ dest[(i * 6) + 3] = src[(i * 4) + 2]; /* second pixel: luma */
+ if (i > (5 * 2) && i < ((width / 2) - (6 * 2))) { /* far enough from both ends for all twelve taps */
+ dest[(i * 6) + 4] =
+ clip255 ((((80 * (src[(i * 4) + 1] + src[(i * 4) + 5]))
+ - (24 * (src[(i * 4) - 3] + src[(i * 4) + 9]))
+ + (12 * (src[(i * 4) - 7] + src[(i * 4) + 13]))
+ - (6 * (src[(i * 4) - 11] + src[(i * 4) + 17]))
+ + (3 * (src[(i * 4) - 15] + src[(i * 4) + 21]))
+ - ((src[(i * 4) - 19] + src[(i * 4) + 25]))) + 64) >> 7);
+ dest[(i * 6) + 5] =
+ clip255 ((((80 * (src[(i * 4) + 3] + src[(i * 4) + 7]))
+ - (24 * (src[(i * 4) - 1] + src[(i * 4) + 11]))
+ + (12 * (src[(i * 4) - 5] + src[(i * 4) + 15]))
+ - (6 * (src[(i * 4) - 9] + src[(i * 4) + 19]))
+ + (3 * (src[(i * 4) - 13] + src[(i * 4) + 23]))
+ - ((src[(i * 4) - 17] + src[(i * 4) + 27]))) + 64) >> 7);
+ } else if (i < ((width / 2) - 1)) { /* near an edge: two-tap rounded average with the next pair */
+ dest[(i * 6) + 4] = (src[(i * 4) + 1] + src[(i * 4) + 5] + 1) >> 1;
+ dest[(i * 6) + 5] = (src[(i * 4) + 3] + src[(i * 4) + 7] + 1) >> 1;
+ } else { /* last pair: no next pair to read, repeat its own chroma */
+ dest[(i * 6) + 4] = src[(i * 4) + 1];
+ dest[(i * 6) + 5] = src[(i * 4) + 3];
+ }
+ }
+}
+
+#ifdef HAVE_CPU_I386
+/**
+ * Vertical [1 2 1] / 4 chroma filter for one packed 4:2:2 scanline, MMX
+ * version.  Each output chroma byte is (t + 2 * m + b) >> 2, taken from
+ * the lines above (t), at (m) and below (b) the current one.  In the MMX
+ * loop the output luma is copied from m; the scalar tail writes chroma
+ * bytes only.  width is in pixels.
+ *
+ * After the 8-byte MMX loop, width holds the number of leftover BYTES.
+ * The tail loop advances one pixel (two bytes) per iteration, so the byte
+ * count is halved first; without that the tail ran past the scanline.
+ */
+static void
+vfilter_chroma_121_packed422_scanline_mmx (uint8_t * output, int width,
+    uint8_t * m, uint8_t * t, uint8_t * b)
+{
+  int i;
+  const mmx_t ymask = { 0x00ff00ff00ff00ffULL };
+  const mmx_t cmask = { 0xff00ff00ff00ff00ULL };
+
+  // Get width in bytes.
+  width *= 2;
+  i = width / 8;
+  width -= i * 8;
+
+  movq_m2r (ymask, mm7);
+  movq_m2r (cmask, mm6);
+
+  while (i--) {
+    movq_m2r (*t, mm0);
+    movq_m2r (*b, mm1);
+    movq_m2r (*m, mm2);
+
+    /* mm3 = luma of the middle line, passed through untouched */
+    movq_r2r (mm2, mm3);
+    pand_r2r (mm7, mm3);
+
+    pand_r2r (mm6, mm0);
+    pand_r2r (mm6, mm1);
+    pand_r2r (mm6, mm2);
+
+    /* move chroma to the low byte of each word; m is shifted one bit less,
+     * which leaves it doubled */
+    psrlq_i2r (8, mm0);
+    psrlq_i2r (8, mm1);
+    psrlq_i2r (7, mm2);
+
+    paddw_r2r (mm0, mm2);
+    paddw_r2r (mm1, mm2);
+
+    /* << 6 then keeping the high byte is the same as >> 2 */
+    psllw_i2r (6, mm2);
+    pand_r2r (mm6, mm2);
+
+    por_r2r (mm3, mm2);
+
+    movq_r2m (mm2, *output);
+    output += 8;
+    t += 8;
+    b += 8;
+    m += 8;
+  }
+
+  /* leftover bytes -> leftover pixels */
+  width /= 2;
+
+  output++;
+  t++;
+  b++;
+  m++;
+  while (width--) {
+    *output = (*t + *b + (*m << 1)) >> 2;
+    output += 2;
+    t += 2;
+    b += 2;
+    m += 2;
+  }
+
+  emms ();
+}
+#endif
+
+/**
+ * Vertical [1 2 1] / 4 chroma filter for one packed 4:2:2 scanline.
+ * For each of the width pixels the chroma byte (odd byte offset) of
+ * output is set to (t + 2 * m + b) >> 2.  Luma bytes of output are not
+ * written.
+ */
+static void
+vfilter_chroma_121_packed422_scanline_c (uint8_t * output, int width,
+    uint8_t * m, uint8_t * t, uint8_t * b)
+{
+  int px;
+
+  for (px = 0; px < width; px++) {
+    const int c = 2 * px + 1;   /* chroma byte of pixel px */
+
+    output[c] = (t[c] + b[c] + (m[c] << 1)) >> 2;
+  }
+}
+
+#ifdef HAVE_CPU_I386
+/**
+ * Vertical [3 3 2] / 8 chroma filter for one packed 4:2:2 scanline, MMX
+ * version.  Each output chroma byte is (3 * t + 3 * m + 2 * b) >> 3.  In
+ * the MMX loop the output luma is copied from m; the scalar tail writes
+ * chroma bytes only.  width is in pixels.
+ *
+ * After the 8-byte MMX loop, width holds the number of leftover BYTES.
+ * The tail loop advances one pixel (two bytes) per iteration, so the byte
+ * count is halved first; without that the tail ran past the scanline.
+ */
+static void
+vfilter_chroma_332_packed422_scanline_mmx (uint8_t * output, int width,
+    uint8_t * m, uint8_t * t, uint8_t * b)
+{
+  int i;
+  const mmx_t ymask = { 0x00ff00ff00ff00ffULL };
+  const mmx_t cmask = { 0xff00ff00ff00ff00ULL };
+
+  // Get width in bytes.
+  width *= 2;
+  i = width / 8;
+  width -= i * 8;
+
+  movq_m2r (ymask, mm7);
+  movq_m2r (cmask, mm6);
+
+  while (i--) {
+    movq_m2r (*t, mm0);
+    movq_m2r (*b, mm1);
+    movq_m2r (*m, mm2);
+
+    /* mm3 = luma of the middle line, passed through untouched */
+    movq_r2r (mm2, mm3);
+    pand_r2r (mm7, mm3);
+
+    pand_r2r (mm6, mm0);
+    pand_r2r (mm6, mm1);
+    pand_r2r (mm6, mm2);
+
+    /* chroma to the low byte of each word; b is shifted one bit less,
+     * which leaves it doubled */
+    psrlq_i2r (8, mm0);
+    psrlq_i2r (7, mm1);
+    psrlq_i2r (8, mm2);
+
+    /* mm0 = 3 * t */
+    movq_r2r (mm0, mm4);
+    psllw_i2r (1, mm4);
+    paddw_r2r (mm4, mm0);
+
+    /* mm2 = 3 * m */
+    movq_r2r (mm2, mm4);
+    psllw_i2r (1, mm4);
+    paddw_r2r (mm4, mm2);
+
+    paddw_r2r (mm0, mm2);
+    paddw_r2r (mm1, mm2);
+
+    /* << 5 then keeping the high byte is the same as >> 3 */
+    psllw_i2r (5, mm2);
+    pand_r2r (mm6, mm2);
+
+    por_r2r (mm3, mm2);
+
+    movq_r2m (mm2, *output);
+    output += 8;
+    t += 8;
+    b += 8;
+    m += 8;
+  }
+
+  /* leftover bytes -> leftover pixels */
+  width /= 2;
+
+  output++;
+  t++;
+  b++;
+  m++;
+  while (width--) {
+    *output = (3 * *t + 3 * *m + 2 * *b) >> 3;
+    output += 2;
+    t += 2;
+    b += 2;
+    m += 2;
+  }
+
+  emms ();
+}
+#endif
+
+/**
+ * Vertical [3 3 2] / 8 chroma filter for one packed 4:2:2 scanline.
+ * For each of the width pixels the chroma byte (odd byte offset) of
+ * output is set to (3 * t + 3 * m + 2 * b) >> 3.  Luma bytes of output
+ * are not written.
+ */
+static void
+vfilter_chroma_332_packed422_scanline_c (uint8_t * output, int width,
+    uint8_t * m, uint8_t * t, uint8_t * b)
+{
+  int px;
+
+  for (px = 0; px < width; px++) {
+    const int c = 2 * px + 1;   /* chroma byte of pixel px */
+
+    output[c] = (3 * t[c] + 3 * m[c] + 2 * b[c]) >> 3;
+  }
+}
+/**
+ * Sets every chroma byte of a packed 4:2:2 scanline to 128 in place, MMX
+ * version.  Four pixels (8 bytes) are rewritten per MMX iteration while
+ * more than four pixels remain; the last one to four pixels are done by
+ * the scalar loop.  width is in pixels.
+ */
+#ifdef HAVE_CPU_I386
+static void
+kill_chroma_packed422_inplace_scanline_mmx (uint8_t * data, int width)
+{
+ const mmx_t ymask = { 0x00ff00ff00ff00ffULL };
+ const mmx_t nullchroma = { 0x8000800080008000ULL };
+
+ movq_m2r (ymask, mm7);
+ movq_m2r (nullchroma, mm6);
+ for (; width > 4; width -= 4) {
+ movq_m2r (*data, mm0);
+ pand_r2r (mm7, mm0); /* zero the chroma bytes */
+ paddb_r2r (mm6, mm0); /* then add 0x80 to each of them */
+ movq_r2m (mm0, *data);
+ data += 8;
+ }
+ emms ();
+
+ while (width--) { /* remaining pixels, one at a time */
+ data[1] = 128;
+ data += 2;
+ }
+}
+#endif
+
+/**
+ * Sets the chroma byte of each of the width pixels of a packed 4:2:2
+ * scanline to 128 in place; luma bytes are left alone.
+ */
+static void
+kill_chroma_packed422_inplace_scanline_c (uint8_t * data, int width)
+{
+  int px;
+
+  for (px = 0; px < width; px++) {
+    data[2 * px + 1] = 128;
+  }
+}
+/**
+ * Replaces every byte (luma and chroma) of a packed 4:2:2 scanline with
+ * 255 minus its value, in place, MMX version.  Four pixels (8 bytes) are
+ * handled per MMX iteration while more than four pixels remain; the rest
+ * is done byte by byte.  width is in pixels.
+ */
+#ifdef HAVE_CPU_I386
+static void
+invert_colour_packed422_inplace_scanline_mmx (uint8_t * data, int width)
+{
+ const mmx_t allones = { 0xffffffffffffffffULL };
+
+ movq_m2r (allones, mm1);
+ for (; width > 4; width -= 4) {
+ movq_r2r (mm1, mm2);
+ movq_m2r (*data, mm0);
+ psubb_r2r (mm0, mm2); /* mm2 = 0xff - byte, per byte */
+ movq_r2m (mm2, *data);
+ data += 8;
+ }
+ emms ();
+
+ width *= 2; /* remaining pixels -> remaining bytes */
+ while (width--) {
+ *data = 255 - *data;
+ data++;
+ }
+}
+#endif
+
+/**
+ * Replaces every byte (luma and chroma) of a packed 4:2:2 scanline of
+ * width pixels with 255 minus its value, in place.
+ */
+static void
+invert_colour_packed422_inplace_scanline_c (uint8_t * data, int width)
+{
+  const int nbytes = width * 2;
+  int n;
+
+  for (n = 0; n < nbytes; n++) {
+    data[n] = 255 - data[n];
+  }
+}
+
+/**
+ * Mirrors a packed 4:2:2 scanline of width pixels in place by swapping
+ * two-byte pixels from the two ends towards the middle.
+ *
+ * The partner of the pixel at byte offset x is the pixel at byte offset
+ * (2 * width - 2 - x).  The previous code used (2 * width - x), which for
+ * x == 0 read and wrote one whole pixel past the end of the scanline and
+ * left the result shifted by a pixel.
+ *
+ * NOTE(review): whole (luma, chroma) byte pairs are swapped, so for even
+ * widths a chroma byte ends up in the slot of the other chroma component;
+ * this matches the original behaviour -- confirm it is acceptable.
+ */
+static void
+mirror_packed422_inplace_scanline_c (uint8_t * data, int width)
+{
+  const int last = (width * 2) - 2;     /* byte offset of the last pixel */
+  int x;
+
+  for (x = 0; x < width; x += 2) {
+    const int mirror = last - x;
+    const uint8_t luma = data[x];
+    const uint8_t chroma = data[x + 1];
+
+    data[x] = data[mirror];
+    data[x + 1] = data[mirror + 1];
+    data[mirror] = luma;
+    data[mirror + 1] = chroma;
+  }
+}
+
+/**
+ * Writes to output the byte-wise average (truncating) of the scanlines
+ * top and bot.  width is in pixels; 2 * width bytes are produced.
+ */
+static void
+interpolate_packed422_scanline_c (uint8_t * output, uint8_t * top,
+    uint8_t * bot, int width)
+{
+  const int nbytes = width * 2;
+  int n;
+
+  for (n = 0; n < nbytes; n++) {
+    output[n] = (top[n] + bot[n]) >> 1;
+  }
+}
+
+#ifdef HAVE_CPU_I386
+/**
+ * Converts a UYVY scanline to YUYV by swapping the two bytes of every
+ * 16-bit word, MMX version.  Blocks of eight pixels (16 bytes) are done
+ * in inline assembly; the remaining (width & 7) pixels are done two at a
+ * time in C.  width is in pixels.
+ *
+ * Fixes over the previous revision:
+ *  - the assembly loop tests its counter only after the first pass, so it
+ *    is now skipped entirely when width < 8 (the counter would start at
+ *    zero and wrap);
+ *  - the C tail now starts after the blocks handled in assembly; it used
+ *    to restart at the beginning of the buffers, leaving the real tail
+ *    unconverted;
+ *  - "memory" is listed as clobbered because the assembly writes yuyv_buf.
+ */
+static void
+convert_uyvy_to_yuyv_scanline_mmx (uint8_t * uyvy_buf, uint8_t * yuyv_buf,
+    int width)
+{
+  if (width >= 8) {
+#if defined(HAVE_CPU_I386) && !defined(HAVE_CPU_X86_64)
+    __asm__ __volatile__ (" movl %0, %%esi \n"
+        " movl %1, %%edi \n"
+        " movl %2, %%edx \n" " shrl $3, %%edx \n"
+        /* Process 8 pixels at once */
+        "1: movq (%%esi), %%mm0 \n"     /* mm0 = Y3V2Y2U2Y1V0Y0U0 */
+        " movq 8(%%esi), %%mm2 \n"      /* mm2 = Y7V6Y6U6Y5V4Y4U4 */
+        " movq %%mm0, %%mm1 \n" /* mm1 = Y3V2Y2U2Y1V0Y0U0 */
+        " movq %%mm2, %%mm3 \n" /* mm3 = Y7V6Y6U6Y5V4Y4U4 */
+        " psllw $8, %%mm0 \n"   /* mm0 = V2__U2__V0__U0__ */
+        " psrlw $8, %%mm1 \n"   /* mm1 = __Y3__Y2__Y1__Y0 */
+        " psllw $8, %%mm2 \n"   /* mm2 = V6__U6__V4__U4__ */
+        " psrlw $8, %%mm3 \n"   /* mm3 = __Y7__Y6__Y5__Y4 */
+        " por %%mm1, %%mm0 \n"  /* mm0 = V2Y3U2Y2V0Y1U0Y0 */
+        " por %%mm3, %%mm2 \n"  /* mm2 = V6Y7U6Y6V4Y5U4Y4 */
+        " movq %%mm0, (%%edi) \n"
+        " movq %%mm2, 8(%%edi) \n"
+        " addl $16, %%esi \n"
+        " addl $16, %%edi \n"
+        " decl %%edx \n"
+        " jnz 1b \n" " emms \n"
+        /* output */ :
+        /* input */ :"g" (uyvy_buf), "g" (yuyv_buf), "g" (width)
+        /* clobber registers */
+        :"cc", "memory", "edx", "esi", "edi");
+#endif
+#ifdef HAVE_CPU_X86_64
+    __asm__ __volatile__ (" movq %0, %%rsi \n"
+        " movq %1, %%rdi \n"
+        " xorq %%rdx, %%rdx \n"
+        " movl %2, %%edx \n" " shrq $3, %%rdx \n"
+        /* Process 8 pixels at once */
+        "1: movq (%%rsi), %%mm0 \n"     /* mm0 = Y3V2Y2U2Y1V0Y0U0 */
+        " movq 8(%%rsi), %%mm2 \n"      /* mm2 = Y7V6Y6U6Y5V4Y4U4 */
+        " movq %%mm0, %%mm1 \n" /* mm1 = Y3V2Y2U2Y1V0Y0U0 */
+        " movq %%mm2, %%mm3 \n" /* mm3 = Y7V6Y6U6Y5V4Y4U4 */
+        " psllw $8, %%mm0 \n"   /* mm0 = V2__U2__V0__U0__ */
+        " psrlw $8, %%mm1 \n"   /* mm1 = __Y3__Y2__Y1__Y0 */
+        " psllw $8, %%mm2 \n"   /* mm2 = V6__U6__V4__U4__ */
+        " psrlw $8, %%mm3 \n"   /* mm3 = __Y7__Y6__Y5__Y4 */
+        " por %%mm1, %%mm0 \n"  /* mm0 = V2Y3U2Y2V0Y1U0Y0 */
+        " por %%mm3, %%mm2 \n"  /* mm2 = V6Y7U6Y6V4Y5U4Y4 */
+        " movq %%mm0, (%%rdi) \n"
+        " movq %%mm2, 8(%%rdi) \n"
+        " addq $16, %%rsi \n"
+        " addq $16, %%rdi \n"
+        " decq %%rdx \n"
+        " jnz 1b \n" " emms \n"
+        /* output */ :
+        /* input */ :"g" (uyvy_buf), "g" (yuyv_buf), "g" (width)
+        /* clobber registers */
+        :"cc", "memory", "rdx", "rsi", "rdi");
+#endif
+  }
+
+  if (width & 7) {
+    /* Skip the whole 8-pixel blocks (2 bytes per pixel) done above. */
+    const int done_bytes = (width & ~7) * 2;
+    uint32_t *uyvy = (uint32_t *) (uyvy_buf + done_bytes);
+    uint32_t *yuyv = (uint32_t *) (yuyv_buf + done_bytes);
+    uint32_t val;
+
+    width &= 7;
+    width >>= 1;                /* two pixels per 32-bit word */
+    while (width--) {
+      val = *uyvy++;
+      val = ((val << 8) & ~0x00FF0000) | ((val >> 8) & ~0x0000FF00);
+      *yuyv++ = val;
+    }
+  }
+}
+#endif
+
+/**
+ * Converts a UYVY scanline to YUYV by swapping the two bytes of every
+ * 16-bit word.  Two pixels (one 32-bit word) are handled per iteration;
+ * width is in pixels and an odd trailing pixel is not converted.  Both
+ * buffers are accessed as 32-bit words.
+ */
+static void
+convert_uyvy_to_yuyv_scanline_c (uint8_t * uyvy_buf, uint8_t * yuyv_buf,
+    int width)
+{
+  const uint32_t *in = (const uint32_t *) uyvy_buf;
+  uint32_t *out = (uint32_t *) yuyv_buf;
+  int pairs;
+
+  for (pairs = width >> 1; pairs; pairs--) {
+    const uint32_t word = *in++;
+    /* bytes 0 and 2 move up one position, bytes 1 and 3 move down one */
+    const uint32_t moved_up = (word & 0x00FF00FFu) << 8;
+    const uint32_t moved_down = (word >> 8) & 0x00FF00FFu;
+
+    *out++ = moved_up | moved_down;
+  }
+}
+
+
+#ifdef HAVE_CPU_I386
+/**
+ * Averages the scanlines top and bot into output, MMX version.  Each
+ * input byte is halved and the halves are added, so results can be one
+ * lower than the C version's (a + b) >> 1.  width is in pixels: 16 pixels
+ * (32 bytes) per iteration in the main loop, then 4 pixels (8 bytes) per
+ * iteration, then the last 0..3 pixels byte by byte.
+ *
+ * The remainder after the 4-pixel loop is (width & 3).  It was previously
+ * masked with 0x7, which made the byte loop run four pixels past the end
+ * of the scanline whenever (width % 8) >= 4.
+ */
+static void
+interpolate_packed422_scanline_mmx (uint8_t * output, uint8_t * top,
+    uint8_t * bot, int width)
+{
+  const mmx_t shiftmask = { 0xfefffefffefffeffULL };    /* To avoid shifting chroma to luma. */
+  int i;
+
+  for (i = width / 16; i; --i) {
+    movq_m2r (*bot, mm0);
+    movq_m2r (*top, mm1);
+    movq_m2r (*(bot + 8), mm2);
+    movq_m2r (*(top + 8), mm3);
+    movq_m2r (*(bot + 16), mm4);
+    movq_m2r (*(top + 16), mm5);
+    movq_m2r (*(bot + 24), mm6);
+    movq_m2r (*(top + 24), mm7);
+    pand_m2r (shiftmask, mm0);
+    pand_m2r (shiftmask, mm1);
+    pand_m2r (shiftmask, mm2);
+    pand_m2r (shiftmask, mm3);
+    pand_m2r (shiftmask, mm4);
+    pand_m2r (shiftmask, mm5);
+    pand_m2r (shiftmask, mm6);
+    pand_m2r (shiftmask, mm7);
+    psrlw_i2r (1, mm0);
+    psrlw_i2r (1, mm1);
+    psrlw_i2r (1, mm2);
+    psrlw_i2r (1, mm3);
+    psrlw_i2r (1, mm4);
+    psrlw_i2r (1, mm5);
+    psrlw_i2r (1, mm6);
+    psrlw_i2r (1, mm7);
+    paddb_r2r (mm1, mm0);
+    paddb_r2r (mm3, mm2);
+    paddb_r2r (mm5, mm4);
+    paddb_r2r (mm7, mm6);
+    movq_r2m (mm0, *output);
+    movq_r2m (mm2, *(output + 8));
+    movq_r2m (mm4, *(output + 16));
+    movq_r2m (mm6, *(output + 24));
+    output += 32;
+    top += 32;
+    bot += 32;
+  }
+  width = (width & 0xf);
+
+  for (i = width / 4; i; --i) {
+    movq_m2r (*bot, mm0);
+    movq_m2r (*top, mm1);
+    pand_m2r (shiftmask, mm0);
+    pand_m2r (shiftmask, mm1);
+    psrlw_i2r (1, mm0);
+    psrlw_i2r (1, mm1);
+    paddb_r2r (mm1, mm0);
+    movq_r2m (mm0, *output);
+    output += 8;
+    top += 8;
+    bot += 8;
+  }
+  /* the loop above consumed whole groups of four pixels */
+  width = width & 0x3;
+
+  /* Handle last few pixels. */
+  for (i = width * 2; i; --i) {
+    *output++ = ((*top++) + (*bot++)) >> 1;
+  }
+
+  emms ();
+}
+#endif
+
+#ifdef HAVE_CPU_I386
+/**
+ * Averages the scanlines top and bot into output using pavgb and
+ * non-temporal stores (movntq), followed by sfence.  width is in pixels:
+ * 16 pixels (32 bytes) per iteration in the main loop, then 4 pixels
+ * (8 bytes) per iteration, then the last 0..3 pixels byte by byte.
+ *
+ * The remainder after the 4-pixel loop is (width & 3).  It was previously
+ * masked with 0x7, which made the byte loop run four pixels past the end
+ * of the scanline whenever (width % 8) >= 4.
+ */
+static void
+interpolate_packed422_scanline_mmxext (uint8_t * output, uint8_t * top,
+    uint8_t * bot, int width)
+{
+  int i;
+
+  for (i = width / 16; i; --i) {
+    movq_m2r (*bot, mm0);
+    movq_m2r (*top, mm1);
+    movq_m2r (*(bot + 8), mm2);
+    movq_m2r (*(top + 8), mm3);
+    movq_m2r (*(bot + 16), mm4);
+    movq_m2r (*(top + 16), mm5);
+    movq_m2r (*(bot + 24), mm6);
+    movq_m2r (*(top + 24), mm7);
+    pavgb_r2r (mm1, mm0);
+    pavgb_r2r (mm3, mm2);
+    pavgb_r2r (mm5, mm4);
+    pavgb_r2r (mm7, mm6);
+    movntq_r2m (mm0, *output);
+    movntq_r2m (mm2, *(output + 8));
+    movntq_r2m (mm4, *(output + 16));
+    movntq_r2m (mm6, *(output + 24));
+    output += 32;
+    top += 32;
+    bot += 32;
+  }
+  width = (width & 0xf);
+
+  for (i = width / 4; i; --i) {
+    movq_m2r (*bot, mm0);
+    movq_m2r (*top, mm1);
+    pavgb_r2r (mm1, mm0);
+    movntq_r2m (mm0, *output);
+    output += 8;
+    top += 8;
+    bot += 8;
+  }
+  /* the loop above consumed whole groups of four pixels */
+  width = width & 0x3;
+
+  /* Handle last few pixels. */
+  for (i = width * 2; i; --i) {
+    *output++ = ((*top++) + (*bot++)) >> 1;
+  }
+
+  sfence ();
+  emms ();
+}
+#endif
+
+/**
+ * Fills a packed 4:2:2 scanline with one colour.  A 32-bit word holding
+ * two pixels (y, cb, y, cr from the least significant byte upwards) is
+ * stored width / 2 times; an odd trailing pixel is not written.  output
+ * is accessed as 32-bit words.
+ *
+ * The components are converted to uint32_t before shifting: shifting an
+ * int left by 24 is undefined once the value reaches 128.
+ */
+static void
+blit_colour_packed422_scanline_c (uint8_t * output, int width, int y, int cb,
+    int cr)
+{
+  const uint32_t colour = ((uint32_t) cr << 24) | ((uint32_t) y << 16)
+      | ((uint32_t) cb << 8) | (uint32_t) y;
+  uint32_t *o = (uint32_t *) output;
+
+  for (width /= 2; width; --width) {
+    *o++ = colour;
+  }
+}
+
+#ifdef HAVE_CPU_I386
+/**
+ * Fills a packed 4:2:2 scanline with one colour, MMX version.  width is
+ * in pixels: 16 pixels (32 bytes) per iteration, then 4 pixels (8 bytes),
+ * then 2 pixels (one 32-bit word), and finally a single (y, cb) pixel if
+ * width is odd.
+ *
+ * Fixes over the previous revision:
+ *  - the remainder after the 4-pixel loop is (width & 3); it was masked
+ *    with 0x7, which wrote four pixels too many when (width % 8) >= 4;
+ *  - components are converted to uint32_t before shifting, because
+ *    shifting an int left by 24 is undefined once the value reaches 128.
+ */
+static void
+blit_colour_packed422_scanline_mmx (uint8_t * output, int width, int y, int cb,
+    int cr)
+{
+  uint32_t colour = ((uint32_t) cr << 24) | ((uint32_t) y << 16)
+      | ((uint32_t) cb << 8) | (uint32_t) y;
+
+  int i;
+
+  /* mm2 = the 32-bit colour word replicated into both halves */
+  movd_m2r (colour, mm1);
+  movd_m2r (colour, mm2);
+  psllq_i2r (32, mm1);
+  por_r2r (mm1, mm2);
+
+  for (i = width / 16; i; --i) {
+    movq_r2m (mm2, *output);
+    movq_r2m (mm2, *(output + 8));
+    movq_r2m (mm2, *(output + 16));
+    movq_r2m (mm2, *(output + 24));
+    output += 32;
+  }
+  width = (width & 0xf);
+
+  for (i = width / 4; i; --i) {
+    movq_r2m (mm2, *output);
+    output += 8;
+  }
+  /* the loop above consumed whole groups of four pixels */
+  width = (width & 0x3);
+
+  for (i = width / 2; i; --i) {
+    *((uint32_t *) output) = colour;
+    output += 4;
+  }
+
+  if (width & 1) {
+    *output = y;
+    *(output + 1) = cb;
+  }
+
+  emms ();
+}
+#endif
+
+#ifdef HAVE_CPU_I386
+/**
+ * Fills a packed 4:2:2 scanline with one colour using non-temporal
+ * stores (movntq), followed by sfence.  width is in pixels: 16 pixels
+ * (32 bytes) per iteration, then 4 pixels (8 bytes), then 2 pixels (one
+ * 32-bit word), and finally a single (y, cb) pixel if width is odd.
+ *
+ * Fixes over the previous revision:
+ *  - the remainder after the 4-pixel loop is (width & 3); it was masked
+ *    with 0x7, which wrote four pixels too many when (width % 8) >= 4;
+ *  - components are converted to uint32_t before shifting, because
+ *    shifting an int left by 24 is undefined once the value reaches 128.
+ */
+static void
+blit_colour_packed422_scanline_mmxext (uint8_t * output, int width, int y,
+    int cb, int cr)
+{
+  uint32_t colour = ((uint32_t) cr << 24) | ((uint32_t) y << 16)
+      | ((uint32_t) cb << 8) | (uint32_t) y;
+
+  int i;
+
+  /* mm2 = the 32-bit colour word replicated into both halves */
+  movd_m2r (colour, mm1);
+  movd_m2r (colour, mm2);
+  psllq_i2r (32, mm1);
+  por_r2r (mm1, mm2);
+
+  for (i = width / 16; i; --i) {
+    movntq_r2m (mm2, *output);
+    movntq_r2m (mm2, *(output + 8));
+    movntq_r2m (mm2, *(output + 16));
+    movntq_r2m (mm2, *(output + 24));
+    output += 32;
+  }
+  width = (width & 0xf);
+
+  for (i = width / 4; i; --i) {
+    movntq_r2m (mm2, *output);
+    output += 8;
+  }
+  /* the loop above consumed whole groups of four pixels */
+  width = (width & 0x3);
+
+  for (i = width / 2; i; --i) {
+    *((uint32_t *) output) = colour;
+    output += 4;
+  }
+
+  if (width & 1) {
+    *output = y;
+    *(output + 1) = cb;
+  }
+
+  sfence ();
+  emms ();
+}
+#endif
+
+/**
+ * Fills a packed 4:4:4:4 scanline of width pixels with one colour; each
+ * pixel is written as the four bytes alpha, luma, cb, cr in that order.
+ */
+static void
+blit_colour_packed4444_scanline_c (uint8_t * output, int width,
+    int alpha, int luma, int cb, int cr)
+{
+  int px;
+
+  for (px = 0; px < width; px++) {
+    uint8_t *pixel = output + 4 * px;
+
+    pixel[0] = alpha;
+    pixel[1] = luma;
+    pixel[2] = cb;
+    pixel[3] = cr;
+  }
+}
+
+#ifdef HAVE_CPU_I386
+/**
+ * Fills a packed 4:4:4:4 scanline with one colour, MMX version.  One
+ * pixel is the 32-bit word alpha, luma, cb, cr (from the least
+ * significant byte upwards).  width is in pixels: 8 pixels (32 bytes) per
+ * iteration, then 2 pixels (8 bytes), then a final single pixel if width
+ * is odd.
+ *
+ * Components are converted to uint32_t before shifting, because shifting
+ * an int left by 24 is undefined once the value reaches 128.
+ */
+static void
+blit_colour_packed4444_scanline_mmx (uint8_t * output, int width,
+    int alpha, int luma, int cb, int cr)
+{
+  uint32_t colour = ((uint32_t) cr << 24) | ((uint32_t) cb << 16)
+      | ((uint32_t) luma << 8) | (uint32_t) alpha;
+
+  int i;
+
+  /* mm2 = the pixel word replicated into both halves */
+  movd_m2r (colour, mm1);
+  movd_m2r (colour, mm2);
+  psllq_i2r (32, mm1);
+  por_r2r (mm1, mm2);
+
+  for (i = width / 8; i; --i) {
+    movq_r2m (mm2, *output);
+    movq_r2m (mm2, *(output + 8));
+    movq_r2m (mm2, *(output + 16));
+    movq_r2m (mm2, *(output + 24));
+    output += 32;
+  }
+  width = (width & 0x7);
+
+  for (i = width / 2; i; --i) {
+    movq_r2m (mm2, *output);
+    output += 8;
+  }
+  width = (width & 0x1);
+
+  if (width) {
+    *((uint32_t *) output) = colour;
+  }
+
+  emms ();
+}
+#endif
+
+#ifdef HAVE_CPU_I386
+/**
+ * Fills a packed 4:4:4:4 scanline with one colour using non-temporal
+ * stores (movntq), followed by sfence.  One pixel is the 32-bit word
+ * alpha, luma, cb, cr (from the least significant byte upwards).  width
+ * is in pixels: 8 pixels (32 bytes) per iteration, then 2 pixels
+ * (8 bytes), then a final single pixel if width is odd.
+ *
+ * Components are converted to uint32_t before shifting, because shifting
+ * an int left by 24 is undefined once the value reaches 128.
+ */
+static void
+blit_colour_packed4444_scanline_mmxext (uint8_t * output, int width,
+    int alpha, int luma, int cb, int cr)
+{
+  uint32_t colour = ((uint32_t) cr << 24) | ((uint32_t) cb << 16)
+      | ((uint32_t) luma << 8) | (uint32_t) alpha;
+
+  int i;
+
+  /* mm2 = the pixel word replicated into both halves */
+  movd_m2r (colour, mm1);
+  movd_m2r (colour, mm2);
+  psllq_i2r (32, mm1);
+  por_r2r (mm1, mm2);
+
+  for (i = width / 8; i; --i) {
+    movntq_r2m (mm2, *output);
+    movntq_r2m (mm2, *(output + 8));
+    movntq_r2m (mm2, *(output + 16));
+    movntq_r2m (mm2, *(output + 24));
+    output += 32;
+  }
+  width = (width & 0x7);
+
+  for (i = width / 2; i; --i) {
+    movntq_r2m (mm2, *output);
+    output += 8;
+  }
+  width = (width & 0x1);
+
+  if (width) {
+    *((uint32_t *) output) = colour;
+  }
+
+  sfence ();
+  emms ();
+}
+#endif
+
+
+/**
+ * Some memcpy code inspired by the xine code which originally came
+ * from mplayer.
+ */
+
+/* linux kernel __memcpy (from: /include/asm/string.h) */
+#ifdef HAVE_CPU_I386
+/**
+ * Copies n bytes from `from` to `to` with x86 string instructions:
+ * n / 4 dwords via "rep movsl", then one word if bit 1 of n is set and
+ * one byte if bit 0 of n is set.
+ *
+ * The function is no longer marked __attribute__ ((const)).  That
+ * attribute promises a function has no effect other than its return
+ * value, which is false for a void function that writes memory and
+ * permits the compiler to drop calls to it.
+ */
+static inline __attribute__ ((always_inline))
+     void small_memcpy (void *to, const void *from, size_t n)
+{
+  /* dummy outputs: tell the compiler that ecx/edi/esi are modified */
+  int d0, d1, d2;
+
+  __asm__ __volatile__ ("rep ; movsl\n\t"
+      "testb $2,%b4\n\t"
+      "je 1f\n\t"
+      "movsw\n"
+      "1:\ttestb $1,%b4\n\t"
+      "je 2f\n\t" "movsb\n" "2:":"=&c" (d0), "=&D" (d1), "=&S" (d2)
+      :"0" (n / 4), "q" (n), "1" ((long) to), "2" ((long) from)
+      :"memory");
+}
+#endif
+
+/**
+ * Plain C copy routine: copies n bytes from src to dest with memcpy().
+ * Nothing is done when both pointers are equal.
+ */
+static void
+speedy_memcpy_c (void *dest, const void *src, size_t n)
+{
+  if (dest == src) {
+    return;
+  }
+
+  memcpy (dest, src, n);
+}
+/**
+ * MMX copy routine: copies n bytes from s to d.  While more than 64
+ * bytes remain, 64 bytes are moved per iteration through mm0..mm7; while
+ * more than 8 remain, 8 bytes per iteration; whatever is left (at most 8
+ * bytes) goes through small_memcpy().  Nothing is done when d == s.
+ */
+#ifdef HAVE_CPU_I386
+static void
+speedy_memcpy_mmx (void *d, const void *s, size_t n)
+{
+ const uint8_t *src = s;
+
+ uint8_t *dest = d;
+
+ if (dest != src) {
+ while (n > 64) { /* load all eight quadwords, then store them */
+ movq_m2r (src[0], mm0);
+ movq_m2r (src[8], mm1);
+ movq_m2r (src[16], mm2);
+ movq_m2r (src[24], mm3);
+ movq_m2r (src[32], mm4);
+ movq_m2r (src[40], mm5);
+ movq_m2r (src[48], mm6);
+ movq_m2r (src[56], mm7);
+ movq_r2m (mm0, dest[0]);
+ movq_r2m (mm1, dest[8]);
+ movq_r2m (mm2, dest[16]);
+ movq_r2m (mm3, dest[24]);
+ movq_r2m (mm4, dest[32]);
+ movq_r2m (mm5, dest[40]);
+ movq_r2m (mm6, dest[48]);
+ movq_r2m (mm7, dest[56]);
+ dest += 64;
+ src += 64;
+ n -= 64;
+ }
+
+ while (n > 8) { /* one quadword at a time */
+ movq_m2r (src[0], mm0);
+ movq_r2m (mm0, dest[0]);
+ dest += 8;
+ src += 8;
+ n -= 8;
+ }
+
+ if (n) /* 1..8 bytes left */
+ small_memcpy (dest, src, n);
+
+ emms ();
+ }
+}
+#endif
+
+#ifdef HAVE_CPU_I386
+/**
+ * Copy n bytes from s to d through the MMX registers, storing with the
+ * movntq macro instead of movq.  Nothing is done when d and s are the
+ * same pointer.
+ */
+static void
+speedy_memcpy_mmxext (void *d, const void *s, size_t n)
+{
+ const uint8_t *src = s;
+
+ uint8_t *dest = d;
+
+ if (dest != src) {
+ /* Bulk phase: 64 bytes per pass, loaded into mm0..mm7 then stored.
+ * The strict '>' leaves an exact 64-byte tail to the 8-byte loop. */
+ while (n > 64) {
+ movq_m2r (src[0], mm0);
+ movq_m2r (src[8], mm1);
+ movq_m2r (src[16], mm2);
+ movq_m2r (src[24], mm3);
+ movq_m2r (src[32], mm4);
+ movq_m2r (src[40], mm5);
+ movq_m2r (src[48], mm6);
+ movq_m2r (src[56], mm7);
+ movntq_r2m (mm0, dest[0]);
+ movntq_r2m (mm1, dest[8]);
+ movntq_r2m (mm2, dest[16]);
+ movntq_r2m (mm3, dest[24]);
+ movntq_r2m (mm4, dest[32]);
+ movntq_r2m (mm5, dest[40]);
+ movntq_r2m (mm6, dest[48]);
+ movntq_r2m (mm7, dest[56]);
+ dest += 64;
+ src += 64;
+ n -= 64;
+ }
+
+ /* Medium phase: 8 bytes per pass; leaves 1..8 bytes behind. */
+ while (n > 8) {
+ movq_m2r (src[0], mm0);
+ movntq_r2m (mm0, dest[0]);
+ dest += 8;
+ src += 8;
+ n -= 8;
+ }
+
+ /* Tail: whatever is left goes through small_memcpy(). */
+ if (n)
+ small_memcpy (dest, src, n);
+
+ /* NOTE(review): sfence presumably makes the movntq stores globally
+ * visible before returning -- confirm against mmx.h. */
+ sfence ();
+ emms ();
+ }
+}
+#endif
+
+/**
+ * Copy one packed 4:2:2 scanline of `width` pixels (two bytes per pixel)
+ * using the portable copy routine.
+ */
+static void
+blit_packed422_scanline_c (uint8_t * dest, const uint8_t * src, int width)
+{
+  const int scanline_bytes = width * 2;
+
+  speedy_memcpy_c (dest, src, scanline_bytes);
+}
+
+#ifdef HAVE_CPU_I386
+/**
+ * Copy one packed 4:2:2 scanline of `width` pixels (two bytes per pixel)
+ * using the MMX copy routine.
+ */
+static void
+blit_packed422_scanline_mmx (uint8_t * dest, const uint8_t * src, int width)
+{
+  const int scanline_bytes = width * 2;
+
+  speedy_memcpy_mmx (dest, src, scanline_bytes);
+}
+#endif
+
+#ifdef HAVE_CPU_I386
+/**
+ * Copy one packed 4:2:2 scanline of `width` pixels (two bytes per pixel)
+ * using the MMXEXT copy routine.
+ */
+static void
+blit_packed422_scanline_mmxext (uint8_t * dest, const uint8_t * src, int width)
+{
+  const int scanline_bytes = width * 2;
+
+  speedy_memcpy_mmxext (dest, src, scanline_bytes);
+}
+#endif
+
+/**
+ * Composite a single colour (af, y, cb, cr) over a packed 4:2:2 scanline,
+ * scaled by an additional `alpha`.
+ *
+ * The effective alpha is a = (af * alpha + 0x80) >> 8:
+ *   - a == 0xff: the whole scanline is filled with the colour;
+ *   - a == 0:    output is not written at all;
+ *   - otherwise: out = B + ((alpha * (F - af*B) + 0x80) >> 8) per component.
+ * Chroma (bytes 1 and 3 of a pixel pair) is only written on even pixels.
+ */
+static void
+composite_colour4444_alpha_to_packed422_scanline_c (uint8_t * output,
+    uint8_t * input, int af, int y, int cb, int cr, int width, int alpha)
+{
+  const int a = ((af * alpha) + 0x80) >> 8;
+  int px;
+
+  if (a == 0xff) {
+    blit_colour_packed422_scanline (output, width, y, cb, cr);
+    return;
+  }
+  if (!a)
+    return;
+
+  for (px = 0; px < width; px++, output += 2, input += 2) {
+    /* Luma is present for every pixel. */
+    output[0] =
+        input[0] + ((alpha * (y - multiply_alpha (af, input[0])) + 0x80) >> 8);
+
+    if ((px & 1) != 0)
+      continue;
+
+    /* Even pixel: cb sits at byte 1 and cr at byte 3 of the pair.  The 128
+     * chroma offset cancels out, so the luma formula applies unchanged. */
+    output[1] =
+        input[1] + ((alpha * (cb - multiply_alpha (af, input[1])) + 0x80) >> 8);
+    output[3] =
+        input[3] + ((alpha * (cr - multiply_alpha (af, input[3])) + 0x80) >> 8);
+  }
+}
+
+#ifdef HAVE_CPU_I386
+/**
+ * MMXEXT version of compositing a single colour (af, y, cb, cr) over a
+ * packed 4:2:2 scanline with an additional `alpha`.  Two pixels (4 bytes)
+ * are handled per pass, so width / 2 pairs are processed.  When alpha is 0
+ * the input scanline is simply copied to the output.
+ *
+ * For every 16-bit lane, with B the background byte and F the matching
+ * foreground component, the loop computes
+ *   B + ((F * alpha + 0x80) >> 8) - ((M * alpha + 0x80) >> 8)
+ * where t = af * B + 0x80 and M = (t + (t >> 8)) >> 8, using an unsigned
+ * saturating subtract and an unsigned saturating pack to bytes.
+ */
+static void
+composite_colour4444_alpha_to_packed422_scanline_mmxext (uint8_t * output,
+    uint8_t * input, int af, int y, int cb, int cr, int width, int alpha)
+{
+  /* alpha1 keeps words 0, 1 and 3; alpha2 keeps word 2 (second luma). */
+  const mmx_t alpha2 = { 0x0000FFFF00000000ULL };
+  const mmx_t alpha1 = { 0xFFFF0000FFFFFFFFULL };
+  const mmx_t round = { 0x0080008000800080ULL };
+  mmx_t foreground;
+
+  int i;
+
+  if (!alpha) {
+    blit_packed422_scanline (output, input, width);
+    return;
+  }
+
+  /* Two identical (a, y, cb, cr) pixels, laid out like a 4444 pixel pair. */
+  foreground.ub[0] = foreground.ub[4] = af;
+  foreground.ub[1] = foreground.ub[5] = y;
+  foreground.ub[2] = foreground.ub[6] = cb;
+  foreground.ub[3] = foreground.ub[7] = cr;
+
+  /* `alpha` is a 4-byte int: load it with movd (a movq would read 8 bytes
+   * from a 4-byte object), then broadcast its low word to all four lanes. */
+  movd_m2r (alpha, mm2);
+  pshufw_r2r (mm2, mm2, 0);
+  pxor_r2r (mm7, mm7);
+
+  for (i = width / 2; i; i--) {
+    /* mm1 = [ cr ][ y ][ cb ][ y ] */
+    movd_m2r (*input, mm1);
+    punpcklbw_r2r (mm7, mm1);
+
+    movq_m2r (foreground, mm3);
+    movq_r2r (mm3, mm4);
+    punpcklbw_r2r (mm7, mm3);
+    punpckhbw_r2r (mm7, mm4);
+    /* mm3 and mm4 will be the appropriate colours, mm5 and mm6 for alpha. */
+
+    /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 0 a ][ 0 a ][ 0 a ] */
+    pshufw_r2r (mm3, mm5, 0);
+    pshufw_r2r (mm4, mm6, 0);
+    /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 3 cr ][ 0 a ][ 2 cb ][ 1 y ] == 11001001 == 201 */
+    pshufw_r2r (mm3, mm3, 201);
+    /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 1 y ][ 0 a ][ 0 a ] == 00010000 == 16 */
+    pshufw_r2r (mm4, mm4, 16);
+
+    /* Merge: words 0, 1, 3 from the first pixel, word 2 from the second. */
+    pand_m2r (alpha1, mm3);
+    pand_m2r (alpha2, mm4);
+    pand_m2r (alpha1, mm5);
+    pand_m2r (alpha2, mm6);
+    por_r2r (mm4, mm3);
+    por_r2r (mm6, mm5);
+
+    /* now, mm5 is af and mm1 is B. Need to multiply them. */
+    pmullw_r2r (mm1, mm5);
+
+    /* Multiply by appalpha. */
+    pmullw_r2r (mm2, mm3);
+    paddw_m2r (round, mm3);
+    psrlw_i2r (8, mm3);
+    /* Result is now B + F. */
+    paddw_r2r (mm3, mm1);
+
+    /* Round up appropriately. */
+    paddw_m2r (round, mm5);
+
+    /* mm6 contains our i>>8; */
+    movq_r2r (mm5, mm6);
+    psrlw_i2r (8, mm6);
+
+    /* Add mm6 back into mm5. Now our result is in the high bytes. */
+    paddw_r2r (mm6, mm5);
+
+    /* Shift down. */
+    psrlw_i2r (8, mm5);
+
+    /* Multiply by appalpha. */
+    pmullw_r2r (mm2, mm5);
+    paddw_m2r (round, mm5);
+    psrlw_i2r (8, mm5);
+
+    psubusw_r2r (mm5, mm1);
+
+    /* mm1 = [ B + F - af*B ] */
+    packuswb_r2r (mm1, mm1);
+    movd_r2m (mm1, *output);
+
+    output += 4;
+    input += 4;
+  }
+  sfence ();
+  emms ();
+}
+#endif
+
+
+/**
+ * Composite a packed 4:4:4:4 foreground scanline (a, y, cb, cr per pixel)
+ * over a packed 4:2:2 background, scaled by an additional `alpha`.
+ *
+ * Per pixel, with af the foreground alpha and
+ * a = (af * alpha + 0x80) >> 8:
+ *   - af == 0 or a == 0: the output pixel is not written;
+ *   - a == 0xff:         the foreground components are copied;
+ *   - otherwise:         out = B + ((alpha * (F - af*B) + 0x80) >> 8).
+ * Chroma (bytes 1 and 3 of a pixel pair) is only written on even pixels.
+ */
+static void
+composite_packed4444_alpha_to_packed422_scanline_c (uint8_t * output,
+    uint8_t * input, uint8_t * foreground, int width, int alpha)
+{
+  int px;
+
+  for (px = 0; px < width;
+      px++, foreground += 4, output += 2, input += 2) {
+    const int af = foreground[0];
+    const int on_chroma_site = ((px & 1) == 0);
+    int a;
+
+    if (!af)
+      continue;
+
+    a = ((af * alpha) + 0x80) >> 8;
+    if (!a)
+      continue;
+
+    if (a == 0xff) {
+      /* Fully opaque: take the foreground as-is. */
+      output[0] = foreground[1];
+      if (on_chroma_site) {
+        output[1] = foreground[2];
+        output[3] = foreground[3];
+      }
+      continue;
+    }
+
+    /* Partial coverage: B + a*(F - af*B), see derivation
+     *   (1 - af*a)*B + af*a*F  ==  B - af*a*B + af*a*F. */
+    output[0] = input[0]
+        + ((alpha * (foreground[1]
+                - multiply_alpha (foreground[0], input[0])) + 0x80) >> 8);
+
+    if (on_chroma_site) {
+      /* The 128 chroma offset cancels, so the same formula applies. */
+      output[1] = input[1] + ((alpha * (foreground[2]
+                  - multiply_alpha (foreground[0], input[1])) + 0x80) >> 8);
+      output[3] = input[3] + ((alpha * (foreground[3]
+                  - multiply_alpha (foreground[0], input[3])) + 0x80) >> 8);
+    }
+  }
+}
+
+#ifdef HAVE_CPU_I386
+/**
+ * MMXEXT version of compositing a packed 4:4:4:4 foreground scanline over
+ * a packed 4:2:2 background with an additional `alpha`.
+ *
+ * alpha == 0 copies the input scanline; alpha == 256 delegates to
+ * composite_packed4444_to_packed422_scanline().  Otherwise two pixels are
+ * handled per pass (width / 2 pairs); a pair whose two foreground pixels
+ * are both entirely zero is skipped without writing the output.
+ *
+ * For every 16-bit lane, with B the background byte, F the foreground
+ * component and af the foreground alpha, the loop computes
+ *   B + ((F * alpha + 0x80) >> 8) - ((M * alpha + 0x80) >> 8)
+ * where t = af * B + 0x80 and M = (t + (t >> 8)) >> 8, using an unsigned
+ * saturating subtract and an unsigned saturating pack to bytes.
+ */
+static void
+composite_packed4444_alpha_to_packed422_scanline_mmxext (uint8_t * output,
+    uint8_t * input, uint8_t * foreground, int width, int alpha)
+{
+  /* alpha1 keeps words 0, 1 and 3; alpha2 keeps word 2 (second luma). */
+  const mmx_t alpha2 = { 0x0000FFFF00000000ULL };
+  const mmx_t alpha1 = { 0xFFFF0000FFFFFFFFULL };
+  const mmx_t round = { 0x0080008000800080ULL };
+  int i;
+
+  if (!alpha) {
+    blit_packed422_scanline (output, input, width);
+    return;
+  }
+
+  if (alpha == 256) {
+    composite_packed4444_to_packed422_scanline (output, input, foreground,
+        width);
+    return;
+  }
+
+  READ_PREFETCH_2048 (input);
+  READ_PREFETCH_2048 (foreground);
+
+  /* `alpha` is a 4-byte int: load it with movd (a movq would read 8 bytes
+   * from a 4-byte object), then broadcast its low word to all four lanes. */
+  movd_m2r (alpha, mm2);
+  pshufw_r2r (mm2, mm2, 0);
+  pxor_r2r (mm7, mm7);
+
+  for (i = width / 2; i; i--) {
+    int fg1 = *((uint32_t *) foreground);
+
+    int fg2 = *(((uint32_t *) foreground) + 1);
+
+    if (fg1 || fg2) {
+      /* mm1 = [ cr ][ y ][ cb ][ y ] */
+      movd_m2r (*input, mm1);
+      punpcklbw_r2r (mm7, mm1);
+
+      movq_m2r (*foreground, mm3);
+      movq_r2r (mm3, mm4);
+      punpcklbw_r2r (mm7, mm3);
+      punpckhbw_r2r (mm7, mm4);
+      /* mm3 and mm4 will be the appropriate colours, mm5 and mm6 for alpha. */
+
+      /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 0 a ][ 0 a ][ 0 a ] */
+      pshufw_r2r (mm3, mm5, 0);
+      pshufw_r2r (mm4, mm6, 0);
+      /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 3 cr ][ 0 a ][ 2 cb ][ 1 y ] == 11001001 == 201 */
+      pshufw_r2r (mm3, mm3, 201);
+      /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 1 y ][ 0 a ][ 0 a ] == 00010000 == 16 */
+      pshufw_r2r (mm4, mm4, 16);
+
+      /* Merge: words 0, 1, 3 from the first pixel, word 2 from the second. */
+      pand_m2r (alpha1, mm3);
+      pand_m2r (alpha2, mm4);
+      pand_m2r (alpha1, mm5);
+      pand_m2r (alpha2, mm6);
+      por_r2r (mm4, mm3);
+      por_r2r (mm6, mm5);
+
+      /* now, mm5 is af and mm1 is B. Need to multiply them. */
+      pmullw_r2r (mm1, mm5);
+
+      /* Multiply by appalpha. */
+      pmullw_r2r (mm2, mm3);
+      paddw_m2r (round, mm3);
+      psrlw_i2r (8, mm3);
+      /* Result is now B + F. */
+      paddw_r2r (mm3, mm1);
+
+      /* Round up appropriately. */
+      paddw_m2r (round, mm5);
+
+      /* mm6 contains our i>>8; */
+      movq_r2r (mm5, mm6);
+      psrlw_i2r (8, mm6);
+
+      /* Add mm6 back into mm5. Now our result is in the high bytes. */
+      paddw_r2r (mm6, mm5);
+
+      /* Shift down. */
+      psrlw_i2r (8, mm5);
+
+      /* Multiply by appalpha. */
+      pmullw_r2r (mm2, mm5);
+      paddw_m2r (round, mm5);
+      psrlw_i2r (8, mm5);
+
+      psubusw_r2r (mm5, mm1);
+
+      /* mm1 = [ B + F - af*B ] */
+      packuswb_r2r (mm1, mm1);
+      movd_r2m (mm1, *output);
+    }
+
+    foreground += 8;
+    output += 4;
+    input += 4;
+  }
+  sfence ();
+  emms ();
+}
+#endif
+
+/**
+ * Composite a packed 4:4:4:4 foreground scanline (a, y, cb, cr per pixel)
+ * over a packed 4:2:2 background.
+ *
+ * Per pixel, with af the foreground alpha:
+ *   - af == 0:    the output pixel is not written;
+ *   - af == 0xff: the foreground components are copied;
+ *   - otherwise:  out = B + F - af*B.
+ * Chroma (bytes 1 and 3 of a pixel pair) is only written on even pixels.
+ */
+static void
+composite_packed4444_to_packed422_scanline_c (uint8_t * output, uint8_t * input,
+    uint8_t * foreground, int width)
+{
+  int px;
+
+  for (px = 0; px < width;
+      px++, foreground += 4, output += 2, input += 2) {
+    const int coverage = foreground[0];
+    const int on_chroma_site = ((px & 1) == 0);
+
+    if (!coverage)
+      continue;
+
+    if (coverage == 0xff) {
+      /* Fully opaque: take the foreground as-is. */
+      output[0] = foreground[1];
+      if (on_chroma_site) {
+        output[1] = foreground[2];
+        output[3] = foreground[3];
+      }
+      continue;
+    }
+
+    /* (1 - af)*B + af*F  ==  B + af*F - af*B */
+    output[0] =
+        input[0] + foreground[1] - multiply_alpha (foreground[0], input[0]);
+
+    if (on_chroma_site) {
+      output[1] =
+          input[1] + foreground[2] - multiply_alpha (foreground[0], input[1]);
+      output[3] =
+          input[3] + foreground[3] - multiply_alpha (foreground[0], input[3]);
+    }
+  }
+}
+
+
+#ifdef HAVE_CPU_I386
+/**
+ * MMXEXT version of compositing a packed 4:4:4:4 foreground scanline over
+ * a packed 4:2:2 background.  Two pixels are handled per pass (width / 2
+ * pairs):
+ *   - both foreground alphas 0xff: the foreground colours are stored;
+ *   - both foreground pixels entirely zero: the output is not written;
+ *   - otherwise: B + F - M per 16-bit lane, where t = af * B + 0x80 and
+ *     M = (t + (t >> 8)) >> 8, with a saturating subtract and pack.
+ */
+static void
+composite_packed4444_to_packed422_scanline_mmxext (uint8_t * output,
+ uint8_t * input, uint8_t * foreground, int width)
+{
+ /* alpha1 keeps words 0, 1 and 3; alpha2 keeps word 2 (second luma). */
+ const mmx_t alpha2 = { 0x0000FFFF00000000ULL };
+ const mmx_t alpha1 = { 0xFFFF0000FFFFFFFFULL };
+ const mmx_t round = { 0x0080008000800080ULL };
+ int i;
+
+ READ_PREFETCH_2048 (input);
+ READ_PREFETCH_2048 (foreground);
+
+ /* mm7 = 0, used to widen bytes to words. */
+ pxor_r2r (mm7, mm7);
+ for (i = width / 2; i; i--) {
+ /* The two 32-bit foreground pixels of this pair; alpha is the low byte. */
+ int fg1 = *((uint32_t *) foreground);
+
+ int fg2 = *(((uint32_t *) foreground) + 1);
+
+ if ((fg1 & 0xff) == 0xff && (fg2 & 0xff) == 0xff) {
+ /* Both pixels opaque: repack the foreground into 4:2:2 order. */
+ movq_m2r (*foreground, mm3);
+ movq_r2r (mm3, mm4);
+ punpcklbw_r2r (mm7, mm3);
+ punpckhbw_r2r (mm7, mm4);
+ /* mm3 and mm4 will be the appropriate colours. */
+ /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 3 cr ][ 0 a ][ 2 cb ][ 1 y ] == 11001001 == 201 */
+ pshufw_r2r (mm3, mm3, 201);
+ /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 1 y ][ 0 a ][ 0 a ] == 00010000 == 16 */
+ pshufw_r2r (mm4, mm4, 16);
+ pand_m2r (alpha1, mm3);
+ pand_m2r (alpha2, mm4);
+ por_r2r (mm4, mm3);
+ /* mm3 = [ F ]: words 0, 1, 3 from pixel one, word 2 from pixel two. */
+ packuswb_r2r (mm3, mm3);
+ movd_r2m (mm3, *output);
+ } else if (fg1 || fg2) {
+
+ /* mm1 = [ cr ][ y ][ cb ][ y ] */
+ movd_m2r (*input, mm1);
+ punpcklbw_r2r (mm7, mm1);
+
+ movq_m2r (*foreground, mm3);
+ movq_r2r (mm3, mm4);
+ punpcklbw_r2r (mm7, mm3);
+ punpckhbw_r2r (mm7, mm4);
+ /* mm3 and mm4 will be the appropriate colours, mm5 and mm6 for alpha. */
+
+ /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 0 a ][ 0 a ][ 0 a ] */
+ pshufw_r2r (mm3, mm5, 0);
+ pshufw_r2r (mm4, mm6, 0);
+ /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 3 cr ][ 0 a ][ 2 cb ][ 1 y ] == 11001001 == 201 */
+ pshufw_r2r (mm3, mm3, 201);
+ /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 1 y ][ 0 a ][ 0 a ] == 00010000 == 16 */
+ pshufw_r2r (mm4, mm4, 16);
+
+ /* Merge: words 0, 1, 3 from the first pixel, word 2 from the second. */
+ pand_m2r (alpha1, mm3);
+ pand_m2r (alpha2, mm4);
+ pand_m2r (alpha1, mm5);
+ pand_m2r (alpha2, mm6);
+ por_r2r (mm4, mm3);
+ por_r2r (mm6, mm5);
+
+ /* now, mm5 is af and mm1 is B. Need to multiply them. */
+ pmullw_r2r (mm1, mm5);
+
+ /* Result is now B + F. */
+ paddw_r2r (mm3, mm1);
+
+ /* Round up appropriately. */
+ paddw_m2r (round, mm5);
+
+ /* mm6 contains our i>>8; */
+ movq_r2r (mm5, mm6);
+ psrlw_i2r (8, mm6);
+
+ /* Add mm6 back into mm5. Now our result is in the high bytes. */
+ paddw_r2r (mm6, mm5);
+
+ /* Shift down. */
+ psrlw_i2r (8, mm5);
+
+ psubusw_r2r (mm5, mm1);
+
+ /* mm1 = [ B + F - af*B ] */
+ packuswb_r2r (mm1, mm1);
+ movd_r2m (mm1, *output);
+ }
+
+ foreground += 8;
+ output += 4;
+ input += 4;
+ }
+ sfence ();
+ emms ();
+}
+#endif
+
+/**
+ * um... just need some scrap paper...
+ * D = (1 - alpha)*B + alpha*F
+ * D = (1 - a)*B + a*textluma
+ * = B - a*B + a*textluma
+ * = B + a*(textluma - B)
+ * Da = (1 - a)*b + a
+ */
+/**
+ * Composite a constant text colour through an 8-bit alpha mask onto a
+ * packed 4:4:4:4 scanline.  Each output pixel is written as one 32-bit
+ * word with alpha in the lowest byte, then luma, cb, cr.
+ *
+ * Per pixel, with a the mask value:
+ *   - a == 0xff:           the opaque text colour is stored;
+ *   - input alpha == 0:    the text colour scaled by a is stored, alpha = a;
+ *   - a != 0:              each component becomes in + a*(text - in), and
+ *                          alpha becomes in_a + a*(0xff - in_a);
+ *   - otherwise the output pixel is not written.
+ */
+static void
+composite_alphamask_to_packed4444_scanline_c (uint8_t * output,
+    uint8_t * input,
+    uint8_t * mask, int width, int textluma, int textcb, int textcr)
+{
+  const uint32_t opaque =
+      (textcr << 24) | (textcb << 16) | (textluma << 8) | 0xff;
+  int px;
+
+  for (px = 0; px < width; px++, mask++, output += 4, input += 4) {
+    const int a = *mask;
+    uint32_t *dst = (uint32_t *) output;
+
+    if (a == 0xff) {
+      *dst = opaque;
+      continue;
+    }
+
+    if (input[0] == 0x00) {
+      /* Transparent background: just scale the colour by the mask. */
+      *dst = (multiply_alpha (a, textcr) << 24)
+          | (multiply_alpha (a, textcb) << 16)
+          | (multiply_alpha (a, textluma) << 8) | a;
+      continue;
+    }
+
+    if (a) {
+      *dst = ((input[3] + multiply_alpha (a, textcr - input[3])) << 24)
+          | ((input[2] + multiply_alpha (a, textcb - input[2])) << 16)
+          | ((input[1] + multiply_alpha (a, textluma - input[1])) << 8)
+          | (input[0] + multiply_alpha (a, 0xff - input[0]));
+    }
+  }
+}
+
+#ifdef HAVE_CPU_I386
+/**
+ * MMXEXT version of compositing a constant text colour through an 8-bit
+ * alpha mask onto a packed 4:4:4:4 scanline (one pixel per pass).
+ *
+ * Register use across the whole loop:
+ *   mm0 = [ cr ][ cb ][ y ][ 0xff ]   text colour with full alpha
+ *   mm1 = [ cr ][ cb ][ y ][ 0 ]      text colour with zero alpha
+ *   mm6 = rounding constant (0x0080 per word)
+ *   mm7 = 0
+ * These four must survive every iteration; mm2..mm5 are scratch.  The
+ * transparent-background branch therefore uses mm4, not mm6, as its
+ * temporary: clobbering mm6 there would corrupt the rounding term added
+ * by the blend branch on every later pixel.
+ */
+static void
+composite_alphamask_to_packed4444_scanline_mmxext (uint8_t * output,
+    uint8_t * input,
+    uint8_t * mask, int width, int textluma, int textcb, int textcr)
+{
+  uint32_t opaque = (textcr << 24) | (textcb << 16) | (textluma << 8) | 0xff;
+  const mmx_t round = { 0x0080008000800080ULL };
+  const mmx_t fullalpha = { 0x00000000000000ffULL };
+  mmx_t colour;
+
+  colour.w[0] = 0x00;
+  colour.w[1] = textluma;
+  colour.w[2] = textcb;
+  colour.w[3] = textcr;
+
+  movq_m2r (colour, mm1);
+  movq_r2r (mm1, mm0);
+
+  /* mm0 = [ cr ][ cb ][ y ][ 0xff ] */
+  paddw_m2r (fullalpha, mm0);
+
+  /* mm7 = 0 */
+  pxor_r2r (mm7, mm7);
+
+  /* mm6 = round */
+  movq_m2r (round, mm6);
+
+  while (width--) {
+    int a = *mask;
+
+    if (a == 0xff) {
+      *((uint32_t *) output) = opaque;
+    } else if ((input[0] == 0x00)) {
+      /* We just need to multiply our colour by the alpha value. */
+
+      /* mm2 = [ a ][ a ][ a ][ a ] */
+      movd_m2r (a, mm2);
+      movq_r2r (mm2, mm3);
+      pshufw_r2r (mm2, mm2, 0);
+
+      /* mm5 = [ cr ][ cb ][ y ][ 0 ] */
+      movq_r2r (mm1, mm5);
+
+      /* Multiply by alpha; mm4 is the scratch copy (mm6 must stay intact). */
+      pmullw_r2r (mm2, mm5);
+      paddw_m2r (round, mm5);
+      movq_r2r (mm5, mm4);
+      psrlw_i2r (8, mm4);
+      paddw_r2r (mm4, mm5);
+      psrlw_i2r (8, mm5);
+
+      /* Set alpha to a. */
+      por_r2r (mm3, mm5);
+
+      /* Pack and write our result. */
+      packuswb_r2r (mm5, mm5);
+      movd_r2m (mm5, *output);
+    } else if (a) {
+      /* mm2 = [ a ][ a ][ a ][ a ] */
+      movd_m2r (a, mm2);
+      pshufw_r2r (mm2, mm2, 0);
+
+      /* mm3 = [ cr ][ cb ][ y ][ 0xff ] */
+      movq_r2r (mm0, mm3);
+
+      /* mm4 = [ i_cr ][ i_cb ][ i_y ][ i_a ] */
+      movd_m2r (*input, mm4);
+      punpcklbw_r2r (mm7, mm4);
+
+      /* Subtract input and colour. */
+      psubw_r2r (mm4, mm3);     /* mm3 = mm3 - mm4 */
+
+      /* Multiply alpha; mm6 supplies the rounding constant. */
+      pmullw_r2r (mm2, mm3);
+      paddw_r2r (mm6, mm3);
+      movq_r2r (mm3, mm2);
+      psrlw_i2r (8, mm3);
+      paddw_r2r (mm2, mm3);
+      psrlw_i2r (8, mm3);
+
+      /* Add back in the input. */
+      paddb_r2r (mm3, mm4);
+
+      /* Write result. */
+      packuswb_r2r (mm4, mm4);
+      movd_r2m (mm4, *output);
+    }
+    mask++;
+    output += 4;
+    input += 4;
+  }
+  sfence ();
+  emms ();
+}
+#endif
+
+/**
+ * Composite a constant text colour through an 8-bit alpha mask, scaled by
+ * an additional `alpha`, onto a packed 4:4:4:4 scanline.  Each output
+ * pixel is written as one 32-bit word with alpha in the lowest byte.
+ *
+ * Per pixel, with af the mask value and a = (af * alpha + 0x80) >> 8:
+ *   - af == 0:             the output pixel is not written;
+ *   - a == 0xff:           the opaque text colour is stored;
+ *   - input alpha == 0:    the text colour scaled by a is stored, alpha = a;
+ *   - a != 0:              each component becomes in + a*(text - in), and
+ *                          alpha becomes a + (0xff - a)*in_a;
+ *   - otherwise the output pixel is not written.
+ */
+static void
+composite_alphamask_alpha_to_packed4444_scanline_c (uint8_t * output,
+    uint8_t * input,
+    uint8_t * mask, int width, int textluma, int textcb, int textcr, int alpha)
+{
+  const uint32_t opaque =
+      (textcr << 24) | (textcb << 16) | (textluma << 8) | 0xff;
+  int px;
+
+  for (px = 0; px < width; px++, mask++, output += 4, input += 4) {
+    const int af = *mask;
+    uint32_t *dst = (uint32_t *) output;
+    int a;
+
+    if (!af)
+      continue;
+
+    a = ((af * alpha) + 0x80) >> 8;
+
+    if (a == 0xff) {
+      *dst = opaque;
+      continue;
+    }
+
+    if (input[0] == 0x00) {
+      /* Transparent background: just scale the colour by a. */
+      *dst = (multiply_alpha (a, textcr) << 24)
+          | (multiply_alpha (a, textcb) << 16)
+          | (multiply_alpha (a, textluma) << 8) | a;
+      continue;
+    }
+
+    if (a) {
+      *dst = ((input[3] + multiply_alpha (a, textcr - input[3])) << 24)
+          | ((input[2] + multiply_alpha (a, textcb - input[2])) << 16)
+          | ((input[1] + multiply_alpha (a, textluma - input[1])) << 8)
+          | (a + multiply_alpha (0xff - a, input[0]));
+    }
+  }
+}
+
+/**
+ * Premultiply a packed 4:4:4:4 scanline: every colour component of a
+ * pixel is scaled by that pixel's alpha (byte 0); alpha itself is kept.
+ * Each result is stored as one 32-bit word with alpha in the lowest byte.
+ */
+static void
+premultiply_packed4444_scanline_c (uint8_t * output, uint8_t * input, int width)
+{
+  for (; width--; output += 4, input += 4) {
+    const unsigned int cur_a = input[0];
+    uint32_t *dst = (uint32_t *) output;
+
+    *dst = (multiply_alpha (cur_a, input[3]) << 24)
+        | (multiply_alpha (cur_a, input[2]) << 16)
+        | (multiply_alpha (cur_a, input[1]) << 8)
+        | cur_a;
+  }
+}
+
+#ifdef HAVE_CPU_I386
+/**
+ * MMXEXT version of premultiplying a packed 4:4:4:4 scanline, one pixel
+ * per pass: the colour words are scaled by the pixel's alpha and the
+ * alpha word itself is carried through unchanged.
+ */
+static void
+premultiply_packed4444_scanline_mmxext (uint8_t * output, uint8_t * input,
+ int width)
+{
+ const mmx_t round = { 0x0080008000800080ULL };
+ /* `alpha` selects the low byte of word 0; `noalp` clears word 0. */
+ const mmx_t alpha = { 0x00000000000000ffULL };
+ const mmx_t noalp = { 0xffffffffffff0000ULL };
+
+ /* mm7 = 0, used to widen bytes to words. */
+ pxor_r2r (mm7, mm7);
+ while (width--) {
+ /* mm0 = the pixel's four bytes widened to words; word 0 is alpha. */
+ movd_m2r (*input, mm0);
+ punpcklbw_r2r (mm7, mm0);
+
+ /* mm2 = alpha broadcast to all words; mm4 = alpha in word 0 only. */
+ movq_r2r (mm0, mm2);
+ pshufw_r2r (mm2, mm2, 0);
+ movq_r2r (mm2, mm4);
+ pand_m2r (alpha, mm4);
+
+ /* t = component * alpha + 0x80 */
+ pmullw_r2r (mm2, mm0);
+ paddw_m2r (round, mm0);
+
+ /* mm0 = (t + (t >> 8)) >> 8 */
+ movq_r2r (mm0, mm3);
+ psrlw_i2r (8, mm3);
+ paddw_r2r (mm3, mm0);
+ psrlw_i2r (8, mm0);
+
+ /* Drop the scaled alpha word and put the original alpha back. */
+ pand_m2r (noalp, mm0);
+ paddw_r2r (mm4, mm0);
+
+ packuswb_r2r (mm0, mm0);
+ movd_r2m (mm0, *output);
+
+ output += 4;
+ input += 4;
+ }
+ sfence ();
+ emms ();
+}
+#endif
+
+/**
+ * Blend two packed 4:2:2 scanlines: out = (src1*(256 - pos) + src2*pos
+ * + 0x80) >> 8 for each of the width * 2 bytes.
+ *
+ * pos is clamped the same way as in the MMXEXT variant: pos <= 0 copies
+ * src1 and pos >= 256 copies src2, so an out-of-range position can no
+ * longer produce negative weights.  pos == 128 uses the plain
+ * interpolation routine.
+ */
+static void
+blend_packed422_scanline_c (uint8_t * output, uint8_t * src1,
+    uint8_t * src2, int width, int pos)
+{
+  if (pos <= 0) {
+    blit_packed422_scanline (output, src1, width);
+  } else if (pos >= 256) {
+    blit_packed422_scanline (output, src2, width);
+  } else if (pos == 128) {
+    interpolate_packed422_scanline (output, src1, src2, width);
+  } else {
+    /* Two bytes per pixel. */
+    width *= 2;
+    while (width--) {
+      *output++ = ((*src1++ * (256 - pos)) + (*src2++ * pos) + 0x80) >> 8;
+    }
+  }
+}
+
+#ifdef HAVE_CPU_I386
+/**
+ * MMXEXT version of blending two packed 4:2:2 scanlines.
+ * pos <= 0 copies src1, pos >= 256 copies src2 and pos == 128 uses the
+ * interpolation routine; otherwise two pixels (4 bytes) are blended per
+ * pass as (src1*(256 - pos) + src2*pos + 0x80) >> 8, for width / 2 pairs.
+ */
+static void
+blend_packed422_scanline_mmxext (uint8_t * output, uint8_t * src1,
+ uint8_t * src2, int width, int pos)
+{
+ if (pos <= 0) {
+ blit_packed422_scanline (output, src1, width);
+ } else if (pos >= 256) {
+ blit_packed422_scanline (output, src2, width);
+ } else if (pos == 128) {
+ interpolate_packed422_scanline (output, src1, src2, width);
+ } else {
+ const mmx_t all256 = { 0x0100010001000100ULL };
+ const mmx_t round = { 0x0080008000800080ULL };
+
+ /* mm0 = pos in every word, mm1 = 256 - pos in every word, mm7 = 0. */
+ movd_m2r (pos, mm0);
+ pshufw_r2r (mm0, mm0, 0);
+ movq_m2r (all256, mm1);
+ psubw_r2r (mm0, mm1);
+ pxor_r2r (mm7, mm7);
+
+ for (width /= 2; width; width--) {
+ /* Load 4 bytes from each source and widen them to words. */
+ movd_m2r (*src1, mm3);
+ movd_m2r (*src2, mm4);
+ punpcklbw_r2r (mm7, mm3);
+ punpcklbw_r2r (mm7, mm4);
+
+ /* Weighted sum plus rounding, then divide by 256. */
+ pmullw_r2r (mm1, mm3);
+ pmullw_r2r (mm0, mm4);
+ paddw_r2r (mm4, mm3);
+ paddw_m2r (round, mm3);
+ psrlw_i2r (8, mm3);
+
+ packuswb_r2r (mm3, mm3);
+ movd_r2m (mm3, *output);
+
+ output += 4;
+ src1 += 4;
+ src2 += 4;
+ }
+ sfence ();
+ emms ();
+ }
+}
+#endif
+
+#ifdef HAVE_CPU_I386
+/**
+ * MMXEXT version of the quarter blit: each output byte is roughly
+ * (one + 3 * three) / 4, obtained by averaging `one` with `three` twice.
+ *
+ * The scanline (width pixels, two bytes each) is handled in three stages:
+ * 16 pixels (32 bytes) per pass, then 4 pixels (8 bytes) per pass, then a
+ * scalar tail for the last 0..3 pixels.
+ */
+static void
+quarter_blit_vertical_packed422_scanline_mmxext (uint8_t * output,
+    uint8_t * one, uint8_t * three, int width)
+{
+  int i;
+
+  for (i = width / 16; i; --i) {
+    movq_m2r (*one, mm0);
+    movq_m2r (*three, mm1);
+    movq_m2r (*(one + 8), mm2);
+    movq_m2r (*(three + 8), mm3);
+    movq_m2r (*(one + 16), mm4);
+    movq_m2r (*(three + 16), mm5);
+    movq_m2r (*(one + 24), mm6);
+    movq_m2r (*(three + 24), mm7);
+    /* avg (avg (one, three), three) weights `three` three times. */
+    pavgb_r2r (mm1, mm0);
+    pavgb_r2r (mm1, mm0);
+    pavgb_r2r (mm3, mm2);
+    pavgb_r2r (mm3, mm2);
+    pavgb_r2r (mm5, mm4);
+    pavgb_r2r (mm5, mm4);
+    pavgb_r2r (mm7, mm6);
+    pavgb_r2r (mm7, mm6);
+    movntq_r2m (mm0, *output);
+    movntq_r2m (mm2, *(output + 8));
+    movntq_r2m (mm4, *(output + 16));
+    movntq_r2m (mm6, *(output + 24));
+    output += 32;
+    one += 32;
+    three += 32;
+  }
+  width = (width & 0xf);
+
+  for (i = width / 4; i; --i) {
+    movq_m2r (*one, mm0);
+    movq_m2r (*three, mm1);
+    pavgb_r2r (mm1, mm0);
+    pavgb_r2r (mm1, mm0);
+    movntq_r2m (mm0, *output);
+    output += 8;
+    one += 8;
+    three += 8;
+  }
+  /* The loop above consumed whole groups of 4 pixels, so only the low two
+   * bits of width remain.  Masking with 0x7 here would re-process pixels
+   * already written and run past the end of all three buffers. */
+  width = width & 0x3;
+
+  /* Handle last few pixels. */
+  for (i = width * 2; i; --i) {
+    *output++ = (*one + *three + *three + *three + 2) / 4;
+    one++;
+    three++;
+  }
+
+  sfence ();
+  emms ();
+}
+#endif
+
+
+/**
+ * Quarter blit of a packed 4:2:2 scanline: every one of the width * 2
+ * output bytes is (one + 3 * three + 2) / 4.
+ */
+static void
+quarter_blit_vertical_packed422_scanline_c (uint8_t * output, uint8_t * one,
+    uint8_t * three, int width)
+{
+  int bytes_left;
+
+  for (bytes_left = width * 2; bytes_left != 0; bytes_left--) {
+    const int light = *one++;
+    const int heavy = *three++;
+
+    *output++ = (light + heavy + heavy + heavy + 2) / 4;
+  }
+}
+
+/**
+ * Vertical sub-pixel blit between two packed 4:2:2 scanlines.
+ * subpixpos is a 16-bit fixed-point weight for `top`; `bot` receives
+ * 0xffff - subpixpos.  The positions 1/2, 1/4 and 3/4 are routed to the
+ * dedicated interpolate / quarter-blit routines.
+ */
+static void
+subpix_blit_vertical_packed422_scanline_c (uint8_t * output, uint8_t * top,
+    uint8_t * bot, int subpixpos, int width)
+{
+  switch (subpixpos) {
+    case 32768:
+      interpolate_packed422_scanline (output, top, bot, width);
+      break;
+    case 16384:
+      quarter_blit_vertical_packed422_scanline (output, top, bot, width);
+      break;
+    case 49152:
+      quarter_blit_vertical_packed422_scanline (output, bot, top, width);
+      break;
+    default:{
+      const int bot_weight = 0xffff - subpixpos;
+      const int nbytes = width * 2;     /* two bytes per pixel */
+      int idx;
+
+      for (idx = 0; idx < nbytes; idx++) {
+        output[idx] = ((top[idx] * subpixpos) + (bot[idx] * bot_weight)) >> 16;
+      }
+      break;
+    }
+  }
+}
+
+/**
+ * Horizontal sub-pixel shift of an 8-bit alpha scanline.  Each output
+ * sample mixes the previous input sample (initially `lasta`) and the
+ * current one, using 16-bit fixed-point weights taken from the low 16
+ * bits of startpos.
+ */
+static void
+a8_subpix_blit_scanline_c (uint8_t * output, uint8_t * input,
+    int lasta, int startpos, int width)
+{
+  const int prev_weight = 0xffff - (startpos & 0xffff);
+  const int cur_weight = 0xffff - prev_weight;
+  int carried = lasta;
+  int col;
+
+  for (col = 0; col < width; col++) {
+    output[col] = ((carried * prev_weight) + (input[col] * cur_weight)) >> 16;
+    carried = input[col];
+  }
+}
+
+/**
+ * These are from lavtools in mjpegtools:
+ *
+ * colorspace.c: Routines to perform colorspace conversions.
+ *
+ * Copyright (C) 2001 Matthew J. Marjanovic <maddog@mir.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+/* Number of fractional bits in the fixed-point table entries below. */
+#define FP_BITS 18
+
+/* precomputed tables */
+
+/* RGB -> Y'CbCr: one table per (output, input) channel pair, indexed by the
+ * 8-bit input value and filled by init_RGB_to_YCbCr_tables().  The *_B
+ * tables additionally carry the rounding term and the output offset. */
+static int Y_R[256];
+
+static int Y_G[256];
+
+static int Y_B[256];
+
+static int Cb_R[256];
+
+static int Cb_G[256];
+
+static int Cb_B[256];
+
+static int Cr_R[256];
+
+static int Cr_G[256];
+
+static int Cr_B[256];
+
+/* Set to 1 once the RGB -> Y'CbCr tables have been filled. */
+static int conv_RY_inited = 0;
+
+/* Y'CbCr -> RGB: indexed by the 8-bit Y, Cb or Cr value and filled by
+ * init_YCbCr_to_RGB_tables().  RGB_Y carries the rounding term. */
+static int RGB_Y[256];
+
+static int R_Cr[256];
+
+static int G_Cb[256];
+
+static int G_Cr[256];
+
+static int B_Cb[256];
+
+/* Set to 1 once the Y'CbCr -> RGB tables have been filled. */
+static int conv_YR_inited = 0;
+
+/**
+ * Round to the nearest integer, with halves rounded away from zero.
+ */
+static int
+myround (double n)
+{
+  const double bias = (n >= 0) ? 0.5 : -0.5;
+
+  return (int) (n + bias);
+}
+
+/**
+ * Fill the RGB -> Y'CbCr lookup tables and mark them initialised.
+ *
+ * Each entry is
+ *   coefficient * i * (Q-excursion) / (Z-excursion) * fixed-point-factor
+ * with 219 as the luma excursion, 224 as the chroma excursion and 255 as
+ * the RGB excursion.  One table per output channel (the *_B one) also
+ * carries fixed-point-factor / 2 for rounding plus the output offset
+ * (16 for Y, 128 for Cb and Cr) so the three lookups can simply be summed
+ * and shifted.
+ */
+static void
+init_RGB_to_YCbCr_tables (void)
+{
+  const double fp_one = (double) (1 << FP_BITS);
+  const double fp_half = (double) (1 << (FP_BITS - 1));
+  int level;
+
+  for (level = 0; level < 256; level++) {
+    const double v = (double) level;
+
+    Y_R[level] = myround (0.299 * v * 219.0 / 255.0 * fp_one);
+    Y_G[level] = myround (0.587 * v * 219.0 / 255.0 * fp_one);
+    Y_B[level] = myround ((0.114 * v * 219.0 / 255.0 * fp_one)
+        + fp_half + (16.0 * fp_one));
+
+    Cb_R[level] = myround (-0.168736 * v * 224.0 / 255.0 * fp_one);
+    Cb_G[level] = myround (-0.331264 * v * 224.0 / 255.0 * fp_one);
+    Cb_B[level] = myround ((0.500 * v * 224.0 / 255.0 * fp_one)
+        + fp_half + (128.0 * fp_one));
+
+    Cr_R[level] = myround (0.500 * v * 224.0 / 255.0 * fp_one);
+    Cr_G[level] = myround (-0.418688 * v * 224.0 / 255.0 * fp_one);
+    Cr_B[level] = myround ((-0.081312 * v * 224.0 / 255.0 * fp_one)
+        + fp_half + (128.0 * fp_one));
+  }
+  conv_RY_inited = 1;
+}
+
+/**
+ * Fill the Y'CbCr -> RGB lookup tables and mark them initialised.
+ *
+ * Each entry is
+ *   coefficient * (i - offset) * (Q-excursion) / (Z-excursion)
+ *       * fixed-point-factor
+ * and RGB_Y additionally carries fixed-point-factor / 2 for rounding.
+ *
+ * Out-of-range codes are clipped before the offset is removed: Y is
+ * clamped to 16..235 and Cb/Cr to 16..240.  Clamping the index first
+ * keeps every table continuous at the clip points: Y below 16 maps to the
+ * same value as Y == 16, and all four chroma tables (G_Cb included)
+ * saturate at 240.
+ */
+static void
+init_YCbCr_to_RGB_tables (void)
+{
+  const double fp_one = (double) (1 << FP_BITS);
+  const double fp_half = (double) (1 << (FP_BITS - 1));
+  int i;
+
+  for (i = 0; i < 256; i++) {
+    /* Clip the code value to its nominal range. */
+    const int y = (i < 16) ? 16 : ((i > 235) ? 235 : i);
+    const int c = (i < 16) ? 16 : ((i > 240) ? 240 : i);
+    const double luma = (double) (y - 16);
+    const double chroma = (double) (c - 128);
+
+    RGB_Y[i] = myround ((1.0 * luma * 255.0 / 219.0 * fp_one) + fp_half);
+
+    R_Cr[i] = myround (1.402 * chroma * 255.0 / 224.0 * fp_one);
+    G_Cr[i] = myround (-0.714136 * chroma * 255.0 / 224.0 * fp_one);
+    G_Cb[i] = myround (-0.344136 * chroma * 255.0 / 224.0 * fp_one);
+    B_Cb[i] = myround (1.772 * chroma * 255.0 / 224.0 * fp_one);
+  }
+  conv_YR_inited = 1;
+}
+
+/**
+ * Convert `width` RGB24 pixels to packed 4:4:4 Y'CbCr by summing three
+ * table lookups per output component and dropping the FP_BITS fractional
+ * bits.  The tables are built on first use.
+ */
+static void
+rgb24_to_packed444_rec601_scanline_c (uint8_t * output, uint8_t * input,
+    int width)
+{
+  if (!conv_RY_inited)
+    init_RGB_to_YCbCr_tables ();
+
+  for (; width--; output += 3, input += 3) {
+    const int r = input[0];
+    const int g = input[1];
+    const int b = input[2];
+
+    output[0] = (Y_R[r] + Y_G[g] + Y_B[b]) >> FP_BITS;
+    output[1] = (Cb_R[r] + Cb_G[g] + Cb_B[b]) >> FP_BITS;
+    output[2] = (Cr_R[r] + Cr_G[g] + Cr_B[b]) >> FP_BITS;
+  }
+}
+
+/**
+ * Convert `width` RGBA32 pixels (r, g, b, a) to packed 4:4:4:4
+ * (a, y, cb, cr).  Alpha is copied; colour goes through the same table
+ * lookups as the RGB24 converter.  The tables are built on first use.
+ */
+static void
+rgba32_to_packed4444_rec601_scanline_c (uint8_t * output, uint8_t * input,
+    int width)
+{
+  if (!conv_RY_inited)
+    init_RGB_to_YCbCr_tables ();
+
+  for (; width--; output += 4, input += 4) {
+    const int r = input[0];
+    const int g = input[1];
+    const int b = input[2];
+    const int a = input[3];
+
+    output[0] = a;
+    output[1] = (Y_R[r] + Y_G[g] + Y_B[b]) >> FP_BITS;
+    output[2] = (Cb_R[r] + Cb_G[g] + Cb_B[b]) >> FP_BITS;
+    output[3] = (Cr_R[r] + Cr_G[g] + Cr_B[b]) >> FP_BITS;
+  }
+}
+
+/**
+ * Convert `width` packed 4:4:4 Y'CbCr pixels to RGB24 using the
+ * fixed-point tables; every result passes through clip255().  The tables
+ * are built on first use.
+ */
+static void
+packed444_to_rgb24_rec601_scanline_c (uint8_t * output, uint8_t * input,
+    int width)
+{
+  if (!conv_YR_inited)
+    init_YCbCr_to_RGB_tables ();
+
+  for (; width--; output += 3, input += 3) {
+    const int luma = input[0];
+    const int cb = input[1];
+    const int cr = input[2];
+
+    output[0] = clip255 ((RGB_Y[luma] + R_Cr[cr]) >> FP_BITS);
+    output[1] = clip255 ((RGB_Y[luma] + G_Cb[cb] + G_Cr[cr]) >> FP_BITS);
+    output[2] = clip255 ((RGB_Y[luma] + B_Cb[cb]) >> FP_BITS);
+  }
+}
+
+/**
+ * 601 numbers:
+ *
+ * Y' = 0.299*R' + 0.587*G' + 0.114*B' (in 0.0 to 1.0)
+ * Cb = -0.169*R' - 0.331*G' + 0.500*B' (in -0.5 to +0.5)
+ * Cr = 0.500*R' - 0.419*G' - 0.081*B' (in -0.5 to +0.5)
+ *
+ * Inverse:
+ * Y Cb Cr
+ * R 1.0000 -0.0009 1.4017
+ * G 1.0000 -0.3437 -0.7142
+ * B 1.0000 1.7722 0.0010
+ *
+ * S170M numbers:
+ * Y' = 0.299*R' + 0.587*G' + 0.114*B' (in 0.0 to 1.0)
+ * B-Y' = -0.299*R' - 0.587*G' + 0.886*B'
+ * R-Y' = 0.701*R' - 0.587*G' - 0.114*B'
+ */
+/*
+static void packed444_to_rgb24_rec601_reference_scanline( uint8_t *output, uint8_t *input, int width )
+{
+ while( width-- ) {
+ double yp = (((double) input[ 0 ]) - 16.0) / 255.0;
+ double cb = (((double) input[ 1 ]) - 128.0) / 255.0;
+ double cr = (((double) input[ 2 ]) - 128.0) / 255.0;
+ double r, g, b;
+
+ r = yp - (0.0009*cb) + (1.4017*cr);
+ g = yp - (0.3437*cb) - (0.7142*cr);
+ b = yp + (1.7722*cb) + (0.0010*cr);
+
+ if( r > 1.0 ) r = 1.0; else if( r < 0.0 ) r = 0.0;
+ if( g > 1.0 ) g = 1.0; else if( g < 0.0 ) g = 0.0;
+ if( b > 1.0 ) b = 1.0; else if( b < 0.0 ) b = 0.0;
+
+ output[ 0 ] = (int) ((r * 255.0) + 0.5);
+ output[ 1 ] = (int) ((g * 255.0) + 0.5);
+ output[ 2 ] = (int) ((b * 255.0) + 0.5);
+
+ output += 3;
+ input += 3;
+ }
+}
+*/
+
+/**
+ * Expand `width` packed 4:4:4 pixels (3 bytes each) to packed 4:4:4:4
+ * (4 bytes each) by prepending the low 8 bits of `alpha` to every pixel.
+ * The colour components are copied unchanged.
+ */
+static void
+packed444_to_nonpremultiplied_packed4444_scanline_c (uint8_t * output,
+    uint8_t * input, int width, int alpha)
+{
+  int remaining = width;
+
+  while (remaining-- > 0) {
+    output[0] = alpha & 0xff;
+    output[1] = input[0] & 0xff;
+    output[2] = input[1] & 0xff;
+    output[3] = input[2] & 0xff;
+
+    input += 3;
+    output += 4;
+  }
+}
+
+/**
+ * Horizontally resample a packed 4:4:4:4 scanline for a pixel aspect
+ * ratio.  The input is walked in steps of 1 / pixel_aspect input pixels;
+ * one output pixel is produced per step.
+ *
+ * While the previously visited input index is still 0 the current input
+ * pixel is copied; afterwards each output pixel is the per-component
+ * average of the input pixels from the previous index up to and
+ * including the current one.
+ *
+ * A negative pixel_aspect is rejected up front: it would give a negative
+ * step, so the walk would never reach `width` and would index before the
+ * start of `input`.
+ */
+static void
+aspect_adjust_packed4444_scanline_c (uint8_t * output,
+    uint8_t * input, int width, double pixel_aspect)
+{
+  double i;
+
+  int prev_i = 0;
+
+  if (pixel_aspect < 0.0)
+    return;
+
+  /* From here on pixel_aspect is the step through the input. */
+  pixel_aspect = 1.0 / pixel_aspect;
+
+  for (i = 0.0; i < width; i += pixel_aspect) {
+    uint8_t *curin = input + ((int) i) * 4;
+
+    if (!prev_i) {
+      output[0] = curin[0];
+      output[1] = curin[1];
+      output[2] = curin[2];
+      output[3] = curin[3];
+    } else {
+      int avg_a = 0;
+      int avg_y = 0;
+      int avg_cb = 0;
+      int avg_cr = 0;
+      int pos = prev_i * 4;
+      int c = 0;
+      int j;
+
+      /* Sum every input pixel in [prev_i, (int) i], then average. */
+      for (j = prev_i; j <= (int) i; j++) {
+        avg_a += input[pos++];
+        avg_y += input[pos++];
+        avg_cb += input[pos++];
+        avg_cr += input[pos++];
+        c++;
+      }
+      output[0] = avg_a / c;
+      output[1] = avg_y / c;
+      output[2] = avg_cb / c;
+      output[3] = avg_cr / c;
+    }
+    output += 4;
+    prev_i = (int) i;
+  }
+}
+
+static uint32_t speedy_accel;
+
+/**
+ * Points every speedy function pointer at an implementation.
+ *
+ * The accel value is stored so that speedy_get_accel() can report it, and
+ * the pointers are first set to the portable C versions. When built with
+ * HAVE_CPU_I386, a subset of them is then overridden depending on the
+ * OIL_IMPL_FLAG_* bits in accel: the MMXEXT set if OIL_IMPL_FLAG_MMXEXT is
+ * present, otherwise the MMX set if OIL_IMPL_FLAG_MMX is present.
+ *
+ * If verbose is non-zero, the selected set is reported on stderr (only in
+ * HAVE_CPU_I386 builds).
+ */
+void
+setup_speedy_calls (uint32_t accel, int verbose)
+{
+ speedy_accel = accel;
+
+ /* Defaults: portable C implementations. */
+ interpolate_packed422_scanline = interpolate_packed422_scanline_c;
+ blit_colour_packed422_scanline = blit_colour_packed422_scanline_c;
+ blit_colour_packed4444_scanline = blit_colour_packed4444_scanline_c;
+ blit_packed422_scanline = blit_packed422_scanline_c;
+ composite_packed4444_to_packed422_scanline =
+ composite_packed4444_to_packed422_scanline_c;
+ composite_packed4444_alpha_to_packed422_scanline =
+ composite_packed4444_alpha_to_packed422_scanline_c;
+ composite_alphamask_to_packed4444_scanline =
+ composite_alphamask_to_packed4444_scanline_c;
+ composite_alphamask_alpha_to_packed4444_scanline =
+ composite_alphamask_alpha_to_packed4444_scanline_c;
+ premultiply_packed4444_scanline = premultiply_packed4444_scanline_c;
+ blend_packed422_scanline = blend_packed422_scanline_c;
+ /* No C implementation is assigned here: this pointer stays null unless one
+ * of the MMX branches below replaces it. */
+ comb_factor_packed422_scanline = 0;
+ diff_factor_packed422_scanline = diff_factor_packed422_scanline_c;
+ kill_chroma_packed422_inplace_scanline =
+ kill_chroma_packed422_inplace_scanline_c;
+ mirror_packed422_inplace_scanline = mirror_packed422_inplace_scanline_c;
+ speedy_memcpy = speedy_memcpy_c;
+ diff_packed422_block8x8 = diff_packed422_block8x8_c;
+ a8_subpix_blit_scanline = a8_subpix_blit_scanline_c;
+ quarter_blit_vertical_packed422_scanline =
+ quarter_blit_vertical_packed422_scanline_c;
+ subpix_blit_vertical_packed422_scanline =
+ subpix_blit_vertical_packed422_scanline_c;
+ packed444_to_nonpremultiplied_packed4444_scanline =
+ packed444_to_nonpremultiplied_packed4444_scanline_c;
+ aspect_adjust_packed4444_scanline = aspect_adjust_packed4444_scanline_c;
+ packed444_to_packed422_scanline = packed444_to_packed422_scanline_c;
+ packed422_to_packed444_scanline = packed422_to_packed444_scanline_c;
+ packed422_to_packed444_rec601_scanline =
+ packed422_to_packed444_rec601_scanline_c;
+ packed444_to_rgb24_rec601_scanline = packed444_to_rgb24_rec601_scanline_c;
+ rgb24_to_packed444_rec601_scanline = rgb24_to_packed444_rec601_scanline_c;
+ rgba32_to_packed4444_rec601_scanline = rgba32_to_packed4444_rec601_scanline_c;
+ invert_colour_packed422_inplace_scanline =
+ invert_colour_packed422_inplace_scanline_c;
+ vfilter_chroma_121_packed422_scanline =
+ vfilter_chroma_121_packed422_scanline_c;
+ vfilter_chroma_332_packed422_scanline =
+ vfilter_chroma_332_packed422_scanline_c;
+ convert_uyvy_to_yuyv_scanline = convert_uyvy_to_yuyv_scanline_c;
+ composite_colour4444_alpha_to_packed422_scanline =
+ composite_colour4444_alpha_to_packed422_scanline_c;
+
+#ifdef HAVE_CPU_I386
+ if (speedy_accel & OIL_IMPL_FLAG_MMXEXT) {
+ if (verbose) {
+ fprintf (stderr, "speedycode: Using MMXEXT optimized functions.\n");
+ }
+ /* MMXEXT set: a mix of _mmxext and plain _mmx implementations. */
+ interpolate_packed422_scanline = interpolate_packed422_scanline_mmxext;
+ blit_colour_packed422_scanline = blit_colour_packed422_scanline_mmxext;
+ blit_colour_packed4444_scanline = blit_colour_packed4444_scanline_mmxext;
+ blit_packed422_scanline = blit_packed422_scanline_mmxext;
+ composite_packed4444_to_packed422_scanline =
+ composite_packed4444_to_packed422_scanline_mmxext;
+ composite_packed4444_alpha_to_packed422_scanline =
+ composite_packed4444_alpha_to_packed422_scanline_mmxext;
+ composite_alphamask_to_packed4444_scanline =
+ composite_alphamask_to_packed4444_scanline_mmxext;
+ premultiply_packed4444_scanline = premultiply_packed4444_scanline_mmxext;
+ kill_chroma_packed422_inplace_scanline =
+ kill_chroma_packed422_inplace_scanline_mmx;
+ blend_packed422_scanline = blend_packed422_scanline_mmxext;
+ diff_factor_packed422_scanline = diff_factor_packed422_scanline_mmx;
+ comb_factor_packed422_scanline = comb_factor_packed422_scanline_mmx;
+ diff_packed422_block8x8 = diff_packed422_block8x8_mmx;
+ quarter_blit_vertical_packed422_scanline =
+ quarter_blit_vertical_packed422_scanline_mmxext;
+ invert_colour_packed422_inplace_scanline =
+ invert_colour_packed422_inplace_scanline_mmx;
+ vfilter_chroma_121_packed422_scanline =
+ vfilter_chroma_121_packed422_scanline_mmx;
+ vfilter_chroma_332_packed422_scanline =
+ vfilter_chroma_332_packed422_scanline_mmx;
+ convert_uyvy_to_yuyv_scanline = convert_uyvy_to_yuyv_scanline_mmx;
+ composite_colour4444_alpha_to_packed422_scanline =
+ composite_colour4444_alpha_to_packed422_scanline_mmxext;
+ speedy_memcpy = speedy_memcpy_mmxext;
+ } else if (speedy_accel & OIL_IMPL_FLAG_MMX) {
+ if (verbose) {
+ fprintf (stderr, "speedycode: Using MMX optimized functions.\n");
+ }
+ /* MMX set: only the pointers listed here are replaced; every other
+ * pointer keeps the C default assigned above. */
+ interpolate_packed422_scanline = interpolate_packed422_scanline_mmx;
+ blit_colour_packed422_scanline = blit_colour_packed422_scanline_mmx;
+ blit_colour_packed4444_scanline = blit_colour_packed4444_scanline_mmx;
+ blit_packed422_scanline = blit_packed422_scanline_mmx;
+ diff_factor_packed422_scanline = diff_factor_packed422_scanline_mmx;
+ comb_factor_packed422_scanline = comb_factor_packed422_scanline_mmx;
+ kill_chroma_packed422_inplace_scanline =
+ kill_chroma_packed422_inplace_scanline_mmx;
+ diff_packed422_block8x8 = diff_packed422_block8x8_mmx;
+ invert_colour_packed422_inplace_scanline =
+ invert_colour_packed422_inplace_scanline_mmx;
+ vfilter_chroma_121_packed422_scanline =
+ vfilter_chroma_121_packed422_scanline_mmx;
+ vfilter_chroma_332_packed422_scanline =
+ vfilter_chroma_332_packed422_scanline_mmx;
+ convert_uyvy_to_yuyv_scanline = convert_uyvy_to_yuyv_scanline_mmx;
+ speedy_memcpy = speedy_memcpy_mmx;
+ } else {
+ if (verbose) {
+ fprintf (stderr,
+ "speedycode: No MMX or MMXEXT support detected, using C fallbacks.\n");
+ }
+ }
+#endif
+}
+
+/**
+ * Returns the accel value most recently passed to setup_speedy_calls(),
+ * or 0 if it has not been called yet (the backing variable is a
+ * zero-initialised static).
+ */
+uint32_t
+speedy_get_accel (void)
+{
+ return speedy_accel;
+}
diff --git a/gst/deinterlace2/tvtime/speedy.h b/gst/deinterlace2/tvtime/speedy.h
new file mode 100644
index 00000000..fb833ff1
--- /dev/null
+++ b/gst/deinterlace2/tvtime/speedy.h
@@ -0,0 +1,308 @@
+/*
+ *
+ * GStreamer
+ * Copyright (C) 2004 Billy Biggs <vektor@dumbterm.net>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+ * Relicensed for GStreamer from GPL to LGPL with permit from Billy Biggs.
+ * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578
+ */
+
+#ifndef SPEEDY_H_INCLUDED
+#define SPEEDY_H_INCLUDED
+
+#include <stddef.h>
+#if defined (__SVR4) && defined (__sun)
+# include <sys/int_types.h>
+#else
+# include <stdint.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Speedy is a collection of optimized functions plus their C fallbacks.
+ * This includes a simple system to select which functions to use
+ * at runtime.
+ *
+ * The optimizations are done with the help of the mmx.h system, from
+ * libmpeg2 by Michel Lespinasse and Aaron Holtzman.
+ *
+ * The library is a collection of function pointers which must be first
+ * initialized by setup_speedy_calls() to point at the fastest available
+ * implementation of each function.
+ */
+
+/**
+ * Struct for pulldown detection metrics.
+ */
+typedef struct pulldown_metrics_s {
+ /* difference: total, even lines, odd lines */
+ int d, e, o;
+ /* noise: temporal, spatial (current), spatial (past) */
+ int t, s, p;
+} pulldown_metrics_t;
+
+/**
+ * Interpolates a packed 4:2:2 scanline using linear interpolation.
+ */
+extern void (*interpolate_packed422_scanline)( uint8_t *output, uint8_t *top,
+ uint8_t *bot, int width );
+
+/**
+ * Blits a colour to a packed 4:2:2 scanline.
+ */
+extern void (*blit_colour_packed422_scanline)( uint8_t *output,
+ int width, int y, int cb, int cr );
+
+/**
+ * Blits a colour to a packed 4:4:4:4 scanline. I use luma/cb/cr instead of
+ * RGB but this will of course work for either.
+ */
+extern void (*blit_colour_packed4444_scanline)( uint8_t *output,
+ int width, int alpha, int luma,
+ int cb, int cr );
+
+/**
+ * Blit from and to packed 4:2:2 scanline.
+ */
+extern void (*blit_packed422_scanline)( uint8_t *dest, const uint8_t *src, int width );
+
+/**
+ * Composites a premultiplied 4:4:4:4 pixel onto a packed 4:2:2 scanline.
+ */
+extern void (*composite_colour4444_alpha_to_packed422_scanline)( uint8_t *output, uint8_t *input,
+ int af, int y, int cb, int cr,
+ int width, int alpha );
+
+/**
+ * Composites a packed 4:4:4:4 scanline onto a packed 4:2:2 scanline.
+ * Chroma is downsampled by dropping samples (nearest neighbour).
+ */
+extern void (*composite_packed4444_to_packed422_scanline)( uint8_t *output,
+ uint8_t *input,
+ uint8_t *foreground,
+ int width );
+
+/**
+ * Composites a packed 4:4:4:4 scanline onto a packed 4:2:2 scanline.
+ * Chroma is downsampled by dropping samples (nearest neighbour). The
+ * alpha value provided is in the range 0-256 and is first applied to
+ * the input (for fadeouts).
+ */
+extern void (*composite_packed4444_alpha_to_packed422_scanline)( uint8_t *output,
+ uint8_t *input,
+ uint8_t *foreground,
+ int width, int alpha );
+
+/**
+ * Takes an alphamask and the given colour (in Y'CbCr) and composites it
+ * onto a packed 4:4:4:4 scanline.
+ */
+extern void (*composite_alphamask_to_packed4444_scanline)( uint8_t *output,
+ uint8_t *input,
+ uint8_t *mask, int width,
+ int textluma, int textcb,
+ int textcr );
+
+/**
+ * Takes an alphamask and the given colour (in Y'CbCr) and composites it
+ * onto a packed 4:4:4:4 scanline. The alpha value provided is in the
+ * range 0-256 and is first applied to the input (for fadeouts).
+ */
+extern void (*composite_alphamask_alpha_to_packed4444_scanline)( uint8_t *output,
+ uint8_t *input,
+ uint8_t *mask, int width,
+ int textluma, int textcb,
+ int textcr, int alpha );
+
+/**
+ * Premultiplies the colour by the alpha channel in a packed 4:4:4:4
+ * scanline.
+ */
+extern void (*premultiply_packed4444_scanline)( uint8_t *output, uint8_t *input, int width );
+
+/**
+ * Blend between two packed 4:2:2 scanlines. Pos is the fade value in
+ * the range 0-256. A value of 0 gives 100% src1, and a value of 256
+ * gives 100% src2. Anything in between gives the appropriate faded
+ * version.
+ */
+extern void (*blend_packed422_scanline)( uint8_t *output, uint8_t *src1,
+ uint8_t *src2, int width, int pos );
+
+/**
+ * Calculates the 'difference factor' for two scanlines. This is a
+ * metric where higher values indicate that the two scanlines are more
+ * different.
+ */
+extern unsigned int (*diff_factor_packed422_scanline)( uint8_t *cur, uint8_t *old, int width );
+
+/**
+ * Calculates the 'comb factor' for a set of three scanlines. This is a
+ * metric where higher values indicate a more likely chance that the two
+ * fields are at separate points in time.
+ */
+extern unsigned int (*comb_factor_packed422_scanline)( uint8_t *top, uint8_t *mid,
+ uint8_t *bot, int width );
+
+/**
+ * Vertical [1 2 1] chroma filter.
+ */
+extern void (*vfilter_chroma_121_packed422_scanline)( uint8_t *output, int width,
+ uint8_t *m, uint8_t *t, uint8_t *b );
+
+/**
+ * Vertical [3 3 2] chroma filter.
+ */
+extern void (*vfilter_chroma_332_packed422_scanline)( uint8_t *output, int width,
+ uint8_t *m, uint8_t *t, uint8_t *b );
+
+/**
+ * Sets the chroma of the scanline to neutral (128) in-place.
+ */
+extern void (*kill_chroma_packed422_inplace_scanline)( uint8_t *data, int width );
+
+/**
+ * Mirrors the scanline in-place.
+ */
+extern void (*mirror_packed422_inplace_scanline)( uint8_t *data, int width );
+
+/**
+ * Inverts the colours on a scanline in-place.
+ */
+extern void (*invert_colour_packed422_inplace_scanline)( uint8_t *data, int width );
+
+/**
+ * Fast memcpy function, used by all of the blit functions. Won't blit
+ * anything if dest == src.
+ */
+extern void (*speedy_memcpy)( void *output, const void *input, size_t size );
+
+/**
+ * Calculates the block difference metrics for dalias' pulldown
+ * detection algorithm.
+ */
+extern void (*diff_packed422_block8x8)( pulldown_metrics_t *m, uint8_t *old,
+ uint8_t *new, int os, int ns );
+
+/**
+ * Takes an alpha mask and subpixelly blits it using linear
+ * interpolation.
+ */
+extern void (*a8_subpix_blit_scanline)( uint8_t *output, uint8_t *input,
+ int lasta, int startpos, int width );
+
+/**
+ * 1/4 vertical subpixel blit for packed 4:2:2 scanlines using linear
+ * interpolation.
+ */
+extern void (*quarter_blit_vertical_packed422_scanline)( uint8_t *output, uint8_t *one,
+ uint8_t *three, int width );
+
+/**
+ * Vertical subpixel blit for packed 4:2:2 scanlines using linear
+ * interpolation.
+ */
+extern void (*subpix_blit_vertical_packed422_scanline)( uint8_t *output, uint8_t *top,
+ uint8_t *bot, int subpixpos, int width );
+
+/**
+ * Simple function to convert a 4:4:4 scanline to a 4:4:4:4 scanline by
+ * adding an alpha channel. Result is non-premultiplied.
+ */
+extern void (*packed444_to_nonpremultiplied_packed4444_scanline)( uint8_t *output,
+ uint8_t *input,
+ int width, int alpha );
+
+/**
+ * I think this function needs to be rethought and renamed, but here
+ * it is for now. This function horizontally resamples a scanline
+ * using linear interpolation to compensate for a change in pixel
+ * aspect ratio.
+ */
+extern void (*aspect_adjust_packed4444_scanline)( uint8_t *output,
+ uint8_t *input,
+ int width,
+ double pixel_aspect );
+
+/**
+ * Convert a packed 4:4:4 surface to a packed 4:2:2 surface using
+ * nearest neighbour chroma downsampling.
+ */
+extern void (*packed444_to_packed422_scanline)( uint8_t *output,
+ uint8_t *input,
+ int width );
+
+/**
+ * Converts packed 4:2:2 to packed 4:4:4 scanlines using nearest
+ * neighbour chroma upsampling.
+ */
+extern void (*packed422_to_packed444_scanline)( uint8_t *output,
+ uint8_t *input,
+ int width );
+
+/**
+ * This filter actually does not meet the spec so calling it rec601
+ * is a bit of a lie. I got the filter from Poynton's site. This
+ * converts a scanline from packed 4:2:2 to packed 4:4:4. But this
+ * function should point at some high quality to-the-spec resampler.
+ */
+extern void (*packed422_to_packed444_rec601_scanline)( uint8_t *dest,
+ uint8_t *src,
+ int width );
+
+/**
+ * Conversions between Y'CbCr and R'G'B'. We use Rec.601 numbers
+ * since our source is broadcast video, but I think there is an
+ * argument to be made for switching to Rec.709.
+ */
+extern void (*packed444_to_rgb24_rec601_scanline)( uint8_t *output,
+ uint8_t *input,
+ int width );
+extern void (*rgb24_to_packed444_rec601_scanline)( uint8_t *output,
+ uint8_t *input,
+ int width );
+extern void (*rgba32_to_packed4444_rec601_scanline)( uint8_t *output,
+ uint8_t *input,
+ int width );
+
+/**
+ * Convert from 4:2:2 with UYVY ordering to 4:2:2 with YUYV ordering.
+ */
+extern void (*convert_uyvy_to_yuyv_scanline)( uint8_t *uyvy_buf,
+ uint8_t *yuyv_buf, int width );
+
+/**
+ * Sets up the function pointers to point at the fastest function
+ * available. Requires acceleration settings (see mm_accel.h).
+ */
+void setup_speedy_calls( uint32_t accel, int verbose );
+
+/**
+ * Returns a bitfield of what accelerations were used when speedy was
+ * initialized. See mm_accel.h.
+ */
+uint32_t speedy_get_accel( void );
+
+#ifdef __cplusplus
+};
+#endif
+#endif /* SPEEDY_H_INCLUDED */
diff --git a/gst/deinterlace2/tvtime/sse.h b/gst/deinterlace2/tvtime/sse.h
new file mode 100644
index 00000000..2e00ee0c
--- /dev/null
+++ b/gst/deinterlace2/tvtime/sse.h
@@ -0,0 +1,992 @@
+/* sse.h
+
+ Streaming SIMD Extensions (a.k.a. Katmai New Instructions)
+ GCC interface library for IA32.
+
+ To use this library, simply include this header file
+ and compile with GCC. You MUST have inlining enabled
+ in order for sse_ok() to work; this can be done by
+ simply using -O on the GCC command line.
+
+ Compiling with -DSSE_TRACE will cause detailed trace
+ output to be sent to stderr for each sse operation.
+ This adds lots of code, and obviously slows execution to
+ a crawl, but can be very useful for debugging.
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY
+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
+ LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ AND FITNESS FOR ANY PARTICULAR PURPOSE.
+
+ 1999 by R. Fisher
+ Based on libmmx by H. Dietz and R. Fisher
+
+ Notes:
+ This is still extremely alpha.
+ Because this library depends on an assembler which understands the
+ SSE opcodes, you probably won't be able to use this yet.
+ For now, do not use TRACE versions. These both make use
+ of the MMX registers, not the SSE registers. This will be resolved
+ at a later date.
+ ToDo:
+ Rewrite TRACE macros
+ Major Debugging Work
+*/
+
+#ifndef _SSE_H
+#define _SSE_H
+
+
+
+/* The type of a value that fits in an SSE register
+ (note that long long constant values MUST be suffixed
+ by LL and unsigned long long values by ULL, lest
+ they be truncated by the compiler)
+*/
+typedef union {
+ float sf[4]; /* Single-precision (32-bit) value */
+} __attribute__ ((aligned (16))) sse_t; /* On a 16 byte (128-bit) boundary */
+
+
+#if 0
+/* Function to test if multimedia instructions are supported...
+*/
+inline extern int
+mm_support(void)
+{
+ /* Returns 1 if MMX instructions are supported,
+ 3 if Cyrix MMX and Extended MMX instructions are supported
+ 5 if AMD MMX and 3DNow! instructions are supported
+ 9 if MMX and SSE instructions are supported
+ 0 if hardware does not support any of these
+ */
+ register int rval = 0;
+
+ __asm__ __volatile__ (
+ /* See if CPUID instruction is supported ... */
+ /* ... Get copies of EFLAGS into eax and ecx */
+ "pushf\n\t"
+ "popl %%eax\n\t"
+ "movl %%eax, %%ecx\n\t"
+
+ /* ... Toggle the ID bit in one copy and store */
+ /* to the EFLAGS reg */
+ "xorl $0x200000, %%eax\n\t"
+ "push %%eax\n\t"
+ "popf\n\t"
+
+ /* ... Get the (hopefully modified) EFLAGS */
+ "pushf\n\t"
+ "popl %%eax\n\t"
+
+ /* ... Compare and test result */
+ "xorl %%eax, %%ecx\n\t"
+ "testl $0x200000, %%ecx\n\t"
+ "jz NotSupported1\n\t" /* CPUID not supported */
+
+
+ /* Get standard CPUID information, and
+ go to a specific vendor section */
+ "movl $0, %%eax\n\t"
+ "cpuid\n\t"
+
+ /* Check for Intel */
+ "cmpl $0x756e6547, %%ebx\n\t"
+ "jne TryAMD\n\t"
+ "cmpl $0x49656e69, %%edx\n\t"
+ "jne TryAMD\n\t"
+ "cmpl $0x6c65746e, %%ecx\n"
+ "jne TryAMD\n\t"
+ "jmp Intel\n\t"
+
+ /* Check for AMD */
+ "\nTryAMD:\n\t"
+ "cmpl $0x68747541, %%ebx\n\t"
+ "jne TryCyrix\n\t"
+ "cmpl $0x69746e65, %%edx\n\t"
+ "jne TryCyrix\n\t"
+ "cmpl $0x444d4163, %%ecx\n"
+ "jne TryCyrix\n\t"
+ "jmp AMD\n\t"
+
+ /* Check for Cyrix */
+ "\nTryCyrix:\n\t"
+ "cmpl $0x69727943, %%ebx\n\t"
+ "jne NotSupported2\n\t"
+ "cmpl $0x736e4978, %%edx\n\t"
+ "jne NotSupported3\n\t"
+ "cmpl $0x64616574, %%ecx\n\t"
+ "jne NotSupported4\n\t"
+ /* Drop through to Cyrix... */
+
+
+ /* Cyrix Section */
+ /* See if extended CPUID level 80000001 is supported */
+ /* The value of CPUID/80000001 for the 6x86MX is undefined
+ according to the Cyrix CPU Detection Guide (Preliminary
+ Rev. 1.01 table 1), so we'll check the value of eax for
+ CPUID/0 to see if standard CPUID level 2 is supported.
+ According to the table, the only CPU which supports level
+ 2 is also the only one which supports extended CPUID levels.
+ */
+ "cmpl $0x2, %%eax\n\t"
+ "jne MMXtest\n\t" /* Use standard CPUID instead */
+
+ /* Extended CPUID supported (in theory), so get extended
+ features */
+ "movl $0x80000001, %%eax\n\t"
+ "cpuid\n\t"
+ "testl $0x00800000, %%eax\n\t" /* Test for MMX */
+ "jz NotSupported5\n\t" /* MMX not supported */
+ "testl $0x01000000, %%eax\n\t" /* Test for Ext'd MMX */
+ "jnz EMMXSupported\n\t"
+ "movl $1, %0:\n\n\t" /* MMX Supported */
+ "jmp Return\n\n"
+ "EMMXSupported:\n\t"
+ "movl $3, %0:\n\n\t" /* EMMX and MMX Supported */
+ "jmp Return\n\t"
+
+
+ /* AMD Section */
+ "AMD:\n\t"
+
+ /* See if extended CPUID is supported */
+ "movl $0x80000000, %%eax\n\t"
+ "cpuid\n\t"
+ "cmpl $0x80000000, %%eax\n\t"
+ "jl MMXtest\n\t" /* Use standard CPUID instead */
+
+ /* Extended CPUID supported, so get extended features */
+ "movl $0x80000001, %%eax\n\t"
+ "cpuid\n\t"
+ "testl $0x00800000, %%edx\n\t" /* Test for MMX */
+ "jz NotSupported6\n\t" /* MMX not supported */
+ "testl $0x80000000, %%edx\n\t" /* Test for 3DNow! */
+ "jnz ThreeDNowSupported\n\t"
+ "movl $1, %0:\n\n\t" /* MMX Supported */
+ "jmp Return\n\n"
+ "ThreeDNowSupported:\n\t"
+ "movl $5, %0:\n\n\t" /* 3DNow! and MMX Supported */
+ "jmp Return\n\t"
+
+
+ /* Intel Section */
+ "Intel:\n\t"
+
+ /* Check for SSE */
+ "SSEtest:\n\t"
+ "movl $1, %%eax\n\t"
+ "cpuid\n\t"
+ "testl $0x02000000, %%edx\n\t" /* Test for SSE */
+ "jz MMXtest\n\t" /* SSE Not supported */
+ "movl $9, %0:\n\n\t" /* SSE Supported */
+ "jmp Return\n\t"
+
+ /* Check for MMX */
+ "MMXtest:\n\t"
+ "movl $1, %%eax\n\t"
+ "cpuid\n\t"
+ "testl $0x00800000, %%edx\n\t" /* Test for MMX */
+ "jz NotSupported7\n\t" /* MMX Not supported */
+ "movl $1, %0:\n\n\t" /* MMX Supported */
+ "jmp Return\n\t"
+
+ /* Nothing supported */
+ "\nNotSupported1:\n\t"
+ "#movl $101, %0:\n\n\t"
+ "\nNotSupported2:\n\t"
+ "#movl $102, %0:\n\n\t"
+ "\nNotSupported3:\n\t"
+ "#movl $103, %0:\n\n\t"
+ "\nNotSupported4:\n\t"
+ "#movl $104, %0:\n\n\t"
+ "\nNotSupported5:\n\t"
+ "#movl $105, %0:\n\n\t"
+ "\nNotSupported6:\n\t"
+ "#movl $106, %0:\n\n\t"
+ "\nNotSupported7:\n\t"
+ "#movl $107, %0:\n\n\t"
+ "movl $0, %0:\n\n\t"
+
+ "Return:\n\t"
+ : "=a" (rval)
+ : /* no input */
+ : "eax", "ebx", "ecx", "edx"
+ );
+
+ /* Return */
+ return(rval);
+}
+
+/* Function to test if sse instructions are supported...
+*/
+inline extern int
+sse_ok(void)
+{
+ /* Returns 1 if SSE instructions are supported, 0 otherwise */
+ return ( (mm_support() & 0x8) >> 3 );
+}
+#endif
+
+
+
+/* Helper functions for the instruction macros that follow...
+ (note that memory-to-register, m2r, instructions are nearly
+ as efficient as register-to-register, r2r, instructions;
+ however, memory-to-memory instructions are really simulated
+ as a convenience, and are only 1/3 as efficient)
+*/
+#ifdef SSE_TRACE
+
+/* Include the stuff for printing a trace to stderr...
+*/
+
+#include <stdio.h>
+
+#define sse_i2r(op, imm, reg) \
+ { \
+ sse_t sse_trace; \
+ sse_trace.uq = (imm); \
+ fprintf(stderr, #op "_i2r(" #imm "=0x%08x%08x, ", \
+ sse_trace.d[1], sse_trace.d[0]); \
+ __asm__ __volatile__ ("movq %%" #reg ", %0" \
+ : "=X" (sse_trace) \
+ : /* nothing */ ); \
+ fprintf(stderr, #reg "=0x%08x%08x) => ", \
+ sse_trace.d[1], sse_trace.d[0]); \
+ __asm__ __volatile__ (#op " %0, %%" #reg \
+ : /* nothing */ \
+ : "X" (imm)); \
+ __asm__ __volatile__ ("movq %%" #reg ", %0" \
+ : "=X" (sse_trace) \
+ : /* nothing */ ); \
+ fprintf(stderr, #reg "=0x%08x%08x\n", \
+ sse_trace.d[1], sse_trace.d[0]); \
+ }
+
+#define sse_m2r(op, mem, reg) \
+ { \
+ sse_t sse_trace; \
+ sse_trace = (mem); \
+ fprintf(stderr, #op "_m2r(" #mem "=0x%08x%08x, ", \
+ sse_trace.d[1], sse_trace.d[0]); \
+ __asm__ __volatile__ ("movq %%" #reg ", %0" \
+ : "=X" (sse_trace) \
+ : /* nothing */ ); \
+ fprintf(stderr, #reg "=0x%08x%08x) => ", \
+ sse_trace.d[1], sse_trace.d[0]); \
+ __asm__ __volatile__ (#op " %0, %%" #reg \
+ : /* nothing */ \
+ : "X" (mem)); \
+ __asm__ __volatile__ ("movq %%" #reg ", %0" \
+ : "=X" (sse_trace) \
+ : /* nothing */ ); \
+ fprintf(stderr, #reg "=0x%08x%08x\n", \
+ sse_trace.d[1], sse_trace.d[0]); \
+ }
+
+#define sse_r2m(op, reg, mem) \
+ { \
+ sse_t sse_trace; \
+ __asm__ __volatile__ ("movq %%" #reg ", %0" \
+ : "=X" (sse_trace) \
+ : /* nothing */ ); \
+ fprintf(stderr, #op "_r2m(" #reg "=0x%08x%08x, ", \
+ sse_trace.d[1], sse_trace.d[0]); \
+ sse_trace = (mem); \
+ fprintf(stderr, #mem "=0x%08x%08x) => ", \
+ sse_trace.d[1], sse_trace.d[0]); \
+ __asm__ __volatile__ (#op " %%" #reg ", %0" \
+ : "=X" (mem) \
+ : /* nothing */ ); \
+ sse_trace = (mem); \
+ fprintf(stderr, #mem "=0x%08x%08x\n", \
+ sse_trace.d[1], sse_trace.d[0]); \
+ }
+
+#define sse_r2r(op, regs, regd) \
+ { \
+ sse_t sse_trace; \
+ __asm__ __volatile__ ("movq %%" #regs ", %0" \
+ : "=X" (sse_trace) \
+ : /* nothing */ ); \
+ fprintf(stderr, #op "_r2r(" #regs "=0x%08x%08x, ", \
+ sse_trace.d[1], sse_trace.d[0]); \
+ __asm__ __volatile__ ("movq %%" #regd ", %0" \
+ : "=X" (sse_trace) \
+ : /* nothing */ ); \
+ fprintf(stderr, #regd "=0x%08x%08x) => ", \
+ sse_trace.d[1], sse_trace.d[0]); \
+ __asm__ __volatile__ (#op " %" #regs ", %" #regd); \
+ __asm__ __volatile__ ("movq %%" #regd ", %0" \
+ : "=X" (sse_trace) \
+ : /* nothing */ ); \
+ fprintf(stderr, #regd "=0x%08x%08x\n", \
+ sse_trace.d[1], sse_trace.d[0]); \
+ }
+
+#define sse_m2m(op, mems, memd) \
+ { \
+ sse_t sse_trace; \
+ sse_trace = (mems); \
+ fprintf(stderr, #op "_m2m(" #mems "=0x%08x%08x, ", \
+ sse_trace.d[1], sse_trace.d[0]); \
+ sse_trace = (memd); \
+ fprintf(stderr, #memd "=0x%08x%08x) => ", \
+ sse_trace.d[1], sse_trace.d[0]); \
+ __asm__ __volatile__ ("movq %0, %%mm0\n\t" \
+ #op " %1, %%mm0\n\t" \
+ "movq %%mm0, %0" \
+ : "=X" (memd) \
+ : "X" (mems)); \
+ sse_trace = (memd); \
+ fprintf(stderr, #memd "=0x%08x%08x\n", \
+ sse_trace.d[1], sse_trace.d[0]); \
+ }
+
+#else
+
+/* These macros are a lot simpler without the tracing...
+*/
+
+/* Immediate-to-register: emits "op <imm>, %reg", with imm supplied as an
+ "X"-constrained input operand and reg pasted in by name.
+*/
+#define sse_i2r(op, imm, reg) \
+ __asm__ __volatile__ (#op " %0, %%" #reg \
+ : /* nothing */ \
+ : "X" (imm) )
+
+/* Memory-to-register: emits "op <mem>, %reg", with mem supplied as an
+ "X"-constrained input operand and reg pasted in by name.
+*/
+#define sse_m2r(op, mem, reg) \
+ __asm__ __volatile__ (#op " %0, %%" #reg \
+ : /* nothing */ \
+ : "X" (mem))
+
+/* Register-to-memory: emits "op %reg, <mem>", with mem declared as an
+ "=X" output operand and reg pasted in by name.
+*/
+#define sse_r2m(op, reg, mem) \
+ __asm__ __volatile__ (#op " %%" #reg ", %0" \
+ : "=X" (mem) \
+ : /* nothing */ )
+
+/* Register-to-register: emits "op %regs, %regd" (source first, destination
+ second). The asm statement declares no operands, so both register names
+ are pasted into the template textually.
+*/
+#define sse_r2r(op, regs, regd) \
+ __asm__ __volatile__ (#op " %" #regs ", %" #regd)
+
+/* Register-to-register with an extra operand: emits
+ "op <imm>, %regs, %regd", with imm supplied as an "X"-constrained input.
+ Takes four arguments.
+*/
+#define sse_r2ri(op, regs, regd, imm) \
+ __asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \
+ : /* nothing */ \
+ : "X" (imm) )
+
+/* Memory-to-memory helper: loads memd into xmm0, applies op with mems as
+ the source operand, and stores xmm0 back into memd.
+ The xmmreg argument is accepted but not used; xmm0 is always the scratch
+ register. The final store must read xmm0 (the register the result was
+ computed in), not the MMX register mm0.
+*/
+#define sse_m2m(op, mems, memd, xmmreg) \
+ __asm__ __volatile__ ("movups %0, %%xmm0\n\t" \
+ #op " %1, %%xmm0\n\t" \
+ "movups %%xmm0, %0" \
+ : "=X" (memd) \
+ : "X" (mems))
+
+/* Memory-to-register with a sub-operation: emits "op <mem>, %reg, subop",
+ where mem is an "X"-constrained input and subop is pasted into the
+ template textually as the last operand.
+*/
+#define sse_m2ri(op, mem, reg, subop) \
+ __asm__ __volatile__ (#op " %0, %%" #reg ", " #subop \
+ : /* nothing */ \
+ : "X" (mem))
+
+/* Memory-to-memory helper with a sub-operation: loads memd into xmm0,
+ applies "op <mems>, %xmm0, subop", and stores xmm0 back into memd.
+ The xmmreg argument is accepted but not used; xmm0 is always the scratch
+ register. The final store must read xmm0, not the MMX register mm0.
+*/
+#define sse_m2mi(op, mems, memd, xmmreg, subop) \
+ __asm__ __volatile__ ("movups %0, %%xmm0\n\t" \
+ #op " %1, %%xmm0, " #subop "\n\t" \
+ "movups %%xmm0, %0" \
+ : "=X" (memd) \
+ : "X" (mems))
+#endif
+
+
+
+
+/* 1x128 MOVe Aligned four Packed Single-fp
+ The memory-to-memory form movaps(vars, vard) copies through xmm0;
+ movaps operates on XMM registers, so an MMX register cannot be used.
+*/
+#define movaps_m2r(var, reg) sse_m2r(movaps, var, reg)
+#define movaps_r2m(reg, var) sse_r2m(movaps, reg, var)
+#define movaps_r2r(regs, regd) sse_r2r(movaps, regs, regd)
+#define movaps(vars, vard) \
+ __asm__ __volatile__ ("movaps %1, %%xmm0\n\t" \
+ "movaps %%xmm0, %0" \
+ : "=X" (vard) \
+ : "X" (vars))
+
+
+/* 1x128 MOVe aligned Non-Temporal four Packed Single-fp
+*/
+#define movntps_r2m(xmmreg, var) sse_r2m(movntps, xmmreg, var)
+
+
+/* 1x64 MOVe Non-Temporal Quadword
+*/
+#define movntq_r2m(mmreg, var) sse_r2m(movntq, mmreg, var)
+
+
+/* 1x128 MOVe Unaligned four Packed Single-fp
+ The memory-to-memory form movups(vars, vard) copies through xmm0;
+ movups operates on XMM registers, so an MMX register cannot be used.
+*/
+#define movups_m2r(var, reg) sse_m2r(movups, var, reg)
+#define movups_r2m(reg, var) sse_r2m(movups, reg, var)
+#define movups_r2r(regs, regd) sse_r2r(movups, regs, regd)
+#define movups(vars, vard) \
+ __asm__ __volatile__ ("movups %1, %%xmm0\n\t" \
+ "movups %%xmm0, %0" \
+ : "=X" (vard) \
+ : "X" (vars))
+
+
+/* MOVe High to Low Packed Single-fp
+ high half of 4x32f (x) -> low half of 4x32f (y)
+*/
+#define movhlps_r2r(regs, regd) sse_r2r(movhlps, regs, regd)
+
+
+/* MOVe Low to High Packed Single-fp
+ low half of 4x32f (x) -> high half of 4x32f (y)
+*/
+#define movlhps_r2r(regs, regd) sse_r2r(movlhps, regs, regd)
+
+
+/* MOVe High Packed Single-fp
+ 2x32f -> high half of 4x32f
+ The memory-to-memory form movhps(vars, vard) copies through the high
+ half of xmm0; movhps operates on XMM registers, not MMX registers.
+*/
+#define movhps_m2r(var, reg) sse_m2r(movhps, var, reg)
+#define movhps_r2m(reg, var) sse_r2m(movhps, reg, var)
+#define movhps(vars, vard) \
+ __asm__ __volatile__ ("movhps %1, %%xmm0\n\t" \
+ "movhps %%xmm0, %0" \
+ : "=X" (vard) \
+ : "X" (vars))
+
+
+/* MOVe Low Packed Single-fp
+ 2x32f -> low half of 4x32f
+ The memory-to-memory form movlps(vars, vard) copies through the low
+ half of xmm0; movlps operates on XMM registers, not MMX registers.
+*/
+#define movlps_m2r(var, reg) sse_m2r(movlps, var, reg)
+#define movlps_r2m(reg, var) sse_r2m(movlps, reg, var)
+#define movlps(vars, vard) \
+ __asm__ __volatile__ ("movlps %1, %%xmm0\n\t" \
+ "movlps %%xmm0, %0" \
+ : "=X" (vard) \
+ : "X" (vars))
+
+
+/* MOVe Scalar Single-fp
+ lowest field of 4x32f (x) -> lowest field of 4x32f (y)
+ The memory-to-memory form movss(vars, vard) copies through the lowest
+ field of xmm0; movss operates on XMM registers, not MMX registers.
+*/
+#define movss_m2r(var, reg) sse_m2r(movss, var, reg)
+#define movss_r2m(reg, var) sse_r2m(movss, reg, var)
+#define movss_r2r(regs, regd) sse_r2r(movss, regs, regd)
+#define movss(vars, vard) \
+ __asm__ __volatile__ ("movss %1, %%xmm0\n\t" \
+ "movss %%xmm0, %0" \
+ : "=X" (vard) \
+ : "X" (vars))
+
+
+/* 4x16 Packed SHUFfle Word
+*/
+#define pshufw_m2r(var, reg, index) sse_m2ri(pshufw, var, reg, index)
+#define pshufw_r2r(regs, regd, index) sse_r2ri(pshufw, regs, regd, index)
+
+
+/* 1x128 SHUFfle Packed Single-fp
+*/
+#define shufps_m2r(var, reg, index) sse_m2ri(shufps, var, reg, index)
+#define shufps_r2r(regs, regd, index) sse_r2ri(shufps, regs, regd, index)
+
+
+/* ConVerT Packed signed Int32 to(2) Packed Single-fp
+*/
+#define cvtpi2ps_m2r(var, xmmreg) sse_m2r(cvtpi2ps, var, xmmreg)
+#define cvtpi2ps_r2r(mmreg, xmmreg) sse_r2r(cvtpi2ps, mmreg, xmmreg)
+
+
+/* ConVerT Packed Single-fp to(2) Packed signed Int32
+ The source is an XMM register (or memory) and the destination is an MMX
+ register; sse_r2r takes the source first and the destination second.
+*/
+#define cvtps2pi_m2r(var, mmreg) sse_m2r(cvtps2pi, var, mmreg)
+#define cvtps2pi_r2r(xmmreg, mmreg) sse_r2r(cvtps2pi, xmmreg, mmreg)
+
+
+/* ConVerT with Truncate Packed Single-fp to(2) Packed Int32
+ The source is an XMM register (or memory) and the destination is an MMX
+ register; sse_r2r takes the source first and the destination second.
+*/
+#define cvttps2pi_m2r(var, mmreg) sse_m2r(cvttps2pi, var, mmreg)
+#define cvttps2pi_r2r(xmmreg, mmreg) sse_r2r(cvttps2pi, xmmreg, mmreg)
+
+
+/* ConVerT Signed Int32 to(2) Single-fp (Scalar)
+*/
+#define cvtsi2ss_m2r(var, xmmreg) sse_m2r(cvtsi2ss, var, xmmreg)
+#define cvtsi2ss_r2r(reg, xmmreg) sse_r2r(cvtsi2ss, reg, xmmreg)
+
+
+/* ConVerT Scalar Single-fp to(2) Signed Int32
+*/
+#define cvtss2si_m2r(var, reg) sse_m2r(cvtss2si, var, reg)
+#define cvtss2si_r2r(xmmreg, reg) sse_r2r(cvtss2si, xmmreg, reg)
+
+
+/* ConVerT with Truncate Scalar Single-fp to(2) Signed Int32
+ Uses the truncating opcode cvttss2si; the rounding variant is provided
+ separately by the cvtss2si_* macros.
+*/
+#define cvttss2si_m2r(var, reg) sse_m2r(cvttss2si, var, reg)
+#define cvttss2si_r2r(xmmreg, reg) sse_r2r(cvttss2si, xmmreg, reg)
+
+
+/* Parallel EXTRact Word from 4x16
+*/
+#define pextrw_r2r(mmreg, reg, field) sse_r2ri(pextrw, mmreg, reg, field)
+
+
+/* Parallel INSeRt Word from 4x16
+*/
+#define pinsrw_r2r(reg, mmreg, field) sse_r2ri(pinsrw, reg, mmreg, field)
+
+
+
+/* MOVe MaSK from Packed Single-fp
+ The SSE_TRACE variant logs the call to stderr before executing it.
+*/
+#ifdef SSE_TRACE
+ #define movmskps(xmmreg, reg) \
+ { \
+ fprintf(stderr, "movmskps()\n"); \
+ __asm__ __volatile__ ("movmskps %" #xmmreg ", %" #reg); \
+ }
+#else
+ #define movmskps(xmmreg, reg) \
+ __asm__ __volatile__ ("movmskps %" #xmmreg ", %" #reg)
+#endif
+
+
+/* Parallel MOVe MaSK from mmx reg to 32-bit reg
+ Emits the pmovmskb opcode (byte mask of an MMX register), which is a
+ different instruction from movmskps (sign mask of packed floats).
+ The SSE_TRACE variant logs the call to stderr before executing it.
+*/
+#ifdef SSE_TRACE
+ #define pmovmskb(mmreg, reg) \
+ { \
+ fprintf(stderr, "pmovmskb()\n"); \
+ __asm__ __volatile__ ("pmovmskb %" #mmreg ", %" #reg); \
+ }
+#else
+ #define pmovmskb(mmreg, reg) \
+ __asm__ __volatile__ ("pmovmskb %" #mmreg ", %" #reg)
+#endif
+
+
+/* MASKed MOVe from 8x8 to memory pointed to by (e)di register
+ mmregs holds the data and fieldreg the byte mask. The instruction takes
+ two register operands and no immediate, so it goes through the
+ two-register helper; in AT&T operand order the mask register comes first
+ and the data register second.
+*/
+#define maskmovq(mmregs, fieldreg) sse_r2r(maskmovq, fieldreg, mmregs)
+
+
+
+
+/* addps: 4x32f parallel ADD. Naming used by every group below: op_m2r wraps sse_m2r, op_r2r wraps sse_r2r,
+   bare op wraps sse_m2m with a scratch register (presumably mem->reg, reg->reg, mem->mem; confirm at the helper definitions). */
+#define addps_m2r(var, reg) sse_m2r(addps, var, reg)
+#define addps_r2r(regs, regd) sse_r2r(addps, regs, regd)
+#define addps(vars, vard, xmmreg) sse_m2m(addps, vars, vard, xmmreg)
+
+
+/* addss: ADD on the lowest 32-bit float field only
+*/
+#define addss_m2r(var, reg) sse_m2r(addss, var, reg)
+#define addss_r2r(regs, regd) sse_r2r(addss, regs, regd)
+#define addss(vars, vard, xmmreg) sse_m2m(addss, vars, vard, xmmreg)
+
+
+/* subps: 4x32f parallel SUB
+*/
+#define subps_m2r(var, reg) sse_m2r(subps, var, reg)
+#define subps_r2r(regs, regd) sse_r2r(subps, regs, regd)
+#define subps(vars, vard, xmmreg) sse_m2m(subps, vars, vard, xmmreg)
+
+
+/* subss: SUB on the lowest 32-bit float field only
+*/
+#define subss_m2r(var, reg) sse_m2r(subss, var, reg)
+#define subss_r2r(regs, regd) sse_r2r(subss, regs, regd)
+#define subss(vars, vard, xmmreg) sse_m2m(subss, vars, vard, xmmreg)
+
+
+/* psadbw: 8x8u -> 4x16u Packed Sum of Absolute Differences (takes an MMX register, hence "mmreg")
+*/
+#define psadbw_m2r(var, reg) sse_m2r(psadbw, var, reg)
+#define psadbw_r2r(regs, regd) sse_r2r(psadbw, regs, regd)
+#define psadbw(vars, vard, mmreg) sse_m2m(psadbw, vars, vard, mmreg)
+
+
+/* pmulhuw: 4x16u parallel MULtiply keeping the High half, Unsigned
+*/
+#define pmulhuw_m2r(var, reg) sse_m2r(pmulhuw, var, reg)
+#define pmulhuw_r2r(regs, regd) sse_r2r(pmulhuw, regs, regd)
+#define pmulhuw(vars, vard, mmreg) sse_m2m(pmulhuw, vars, vard, mmreg)
+
+
+/* mulps: 4x32f parallel MUL
+*/
+#define mulps_m2r(var, reg) sse_m2r(mulps, var, reg)
+#define mulps_r2r(regs, regd) sse_r2r(mulps, regs, regd)
+#define mulps(vars, vard, xmmreg) sse_m2m(mulps, vars, vard, xmmreg)
+
+
+/* mulss: MUL on the lowest 32-bit float field only
+*/
+#define mulss_m2r(var, reg) sse_m2r(mulss, var, reg)
+#define mulss_r2r(regs, regd) sse_r2r(mulss, regs, regd)
+#define mulss(vars, vard, xmmreg) sse_m2m(mulss, vars, vard, xmmreg)
+
+
+/* divps: 4x32f parallel DIV
+*/
+#define divps_m2r(var, reg) sse_m2r(divps, var, reg)
+#define divps_r2r(regs, regd) sse_r2r(divps, regs, regd)
+#define divps(vars, vard, xmmreg) sse_m2m(divps, vars, vard, xmmreg)
+
+
+/* divss: DIV on the lowest 32-bit float field only
+*/
+#define divss_m2r(var, reg) sse_m2r(divss, var, reg)
+#define divss_r2r(regs, regd) sse_r2r(divss, regs, regd)
+#define divss(vars, vard, xmmreg) sse_m2m(divss, vars, vard, xmmreg)
+
+
+/* rcpps: 4x32f parallel reciprocal (1/x)
+*/
+#define rcpps_m2r(var, reg) sse_m2r(rcpps, var, reg)
+#define rcpps_r2r(regs, regd) sse_r2r(rcpps, regs, regd)
+#define rcpps(vars, vard, xmmreg) sse_m2m(rcpps, vars, vard, xmmreg)
+
+
+/* rcpss: reciprocal of the lowest 32-bit float field only
+*/
+#define rcpss_m2r(var, reg) sse_m2r(rcpss, var, reg)
+#define rcpss_r2r(regs, regd) sse_r2r(rcpss, regs, regd)
+#define rcpss(vars, vard, xmmreg) sse_m2m(rcpss, vars, vard, xmmreg)
+
+
+/* rsqrtps: 4x32f parallel reciprocal square root (1/sqrt(x))
+*/
+#define rsqrtps_m2r(var, reg) sse_m2r(rsqrtps, var, reg)
+#define rsqrtps_r2r(regs, regd) sse_r2r(rsqrtps, regs, regd)
+#define rsqrtps(vars, vard, xmmreg) sse_m2m(rsqrtps, vars, vard, xmmreg)
+
+
+/* rsqrtss: reciprocal square root of the lowest 32-bit float field only
+*/
+#define rsqrtss_m2r(var, reg) sse_m2r(rsqrtss, var, reg)
+#define rsqrtss_r2r(regs, regd) sse_r2r(rsqrtss, regs, regd)
+#define rsqrtss(vars, vard, xmmreg) sse_m2m(rsqrtss, vars, vard, xmmreg)
+
+
+/* sqrtps: 4x32f parallel square root
+*/
+#define sqrtps_m2r(var, reg) sse_m2r(sqrtps, var, reg)
+#define sqrtps_r2r(regs, regd) sse_r2r(sqrtps, regs, regd)
+#define sqrtps(vars, vard, xmmreg) sse_m2m(sqrtps, vars, vard, xmmreg)
+
+
+/* sqrtss: square root of the lowest 32-bit float field only
+*/
+#define sqrtss_m2r(var, reg) sse_m2r(sqrtss, var, reg)
+#define sqrtss_r2r(regs, regd) sse_r2r(sqrtss, regs, regd)
+#define sqrtss(vars, vard, xmmreg) sse_m2m(sqrtss, vars, vard, xmmreg)
+
+
+/* pavgb / pavgw: parallel AVeraGe of 8x8u (bytes) and 4x16u (words).
+   Same three forms per op: _m2r wraps sse_m2r, _r2r wraps sse_r2r, bare op wraps sse_m2m with a scratch register. */
+#define pavgb_m2r(var, reg) sse_m2r(pavgb, var, reg)
+#define pavgb_r2r(regs, regd) sse_r2r(pavgb, regs, regd)
+#define pavgb(vars, vard, mmreg) sse_m2m(pavgb, vars, vard, mmreg)
+
+#define pavgw_m2r(var, reg) sse_m2r(pavgw, var, reg)
+#define pavgw_r2r(regs, regd) sse_r2r(pavgw, regs, regd)
+#define pavgw(vars, vard, mmreg) sse_m2m(pavgw, vars, vard, mmreg)
+
+
+/* andps: 1x128 bitwise AND
+*/
+#define andps_m2r(var, reg) sse_m2r(andps, var, reg)
+#define andps_r2r(regs, regd) sse_r2r(andps, regs, regd)
+#define andps(vars, vard, xmmreg) sse_m2m(andps, vars, vard, xmmreg)
+
+
+/* andnps: 1x128 bitwise AND of the source with the complement (NOT) of the destination
+*/
+#define andnps_m2r(var, reg) sse_m2r(andnps, var, reg)
+#define andnps_r2r(regs, regd) sse_r2r(andnps, regs, regd)
+#define andnps(vars, vard, xmmreg) sse_m2m(andnps, vars, vard, xmmreg)
+
+
+/* orps: 1x128 bitwise OR
+*/
+#define orps_m2r(var, reg) sse_m2r(orps, var, reg)
+#define orps_r2r(regs, regd) sse_r2r(orps, regs, regd)
+#define orps(vars, vard, xmmreg) sse_m2m(orps, vars, vard, xmmreg)
+
+
+/* xorps: 1x128 bitwise eXclusive OR
+*/
+#define xorps_m2r(var, reg) sse_m2r(xorps, var, reg)
+#define xorps_r2r(regs, regd) sse_r2r(xorps, regs, regd)
+#define xorps(vars, vard, xmmreg) sse_m2m(xorps, vars, vard, xmmreg)
+
+
+/* Parallel maximum: pmaxub on 8x8u, pmaxsw on 4x16 signed, maxps on 4x32f
+*/
+#define pmaxub_m2r(var, reg) sse_m2r(pmaxub, var, reg)
+#define pmaxub_r2r(regs, regd) sse_r2r(pmaxub, regs, regd)
+#define pmaxub(vars, vard, mmreg) sse_m2m(pmaxub, vars, vard, mmreg)
+
+#define pmaxsw_m2r(var, reg) sse_m2r(pmaxsw, var, reg)
+#define pmaxsw_r2r(regs, regd) sse_r2r(pmaxsw, regs, regd)
+#define pmaxsw(vars, vard, mmreg) sse_m2m(pmaxsw, vars, vard, mmreg)
+
+#define maxps_m2r(var, reg) sse_m2r(maxps, var, reg)
+#define maxps_r2r(regs, regd) sse_r2r(maxps, regs, regd)
+#define maxps(vars, vard, xmmreg) sse_m2m(maxps, vars, vard, xmmreg)
+
+
+/* maxss: maximum of the lowest 32-bit float field only
+*/
+#define maxss_m2r(var, reg) sse_m2r(maxss, var, reg)
+#define maxss_r2r(regs, regd) sse_r2r(maxss, regs, regd)
+#define maxss(vars, vard, xmmreg) sse_m2m(maxss, vars, vard, xmmreg)
+
+
+/* Parallel minimum: pminub on 8x8u, pminsw on 4x16 signed, minps on 4x32f
+*/
+#define pminub_m2r(var, reg) sse_m2r(pminub, var, reg)
+#define pminub_r2r(regs, regd) sse_r2r(pminub, regs, regd)
+#define pminub(vars, vard, mmreg) sse_m2m(pminub, vars, vard, mmreg)
+
+#define pminsw_m2r(var, reg) sse_m2r(pminsw, var, reg)
+#define pminsw_r2r(regs, regd) sse_r2r(pminsw, regs, regd)
+#define pminsw(vars, vard, mmreg) sse_m2m(pminsw, vars, vard, mmreg)
+
+#define minps_m2r(var, reg) sse_m2r(minps, var, reg)
+#define minps_r2r(regs, regd) sse_r2r(minps, regs, regd)
+#define minps(vars, vard, xmmreg) sse_m2m(minps, vars, vard, xmmreg)
+
+
+/* minss: minimum of the lowest 32-bit float field only
+*/
+#define minss_m2r(var, reg) sse_m2r(minss, var, reg)
+#define minss_r2r(regs, regd) sse_r2r(minss, regs, regd)
+#define minss(vars, vard, xmmreg) sse_m2m(minss, vars, vard, xmmreg)
+
+
+/* cmpps: 4x32f parallel CoMPare (resulting fields are either 0 or -1).
+ The generic forms take the predicate as `op`; the named aliases below hard-code it as the last argument:
+ 0 = eq, 1 = lt, 2 = le, 3 = unord, 4 = neq, 5 = nlt, 6 = nle, 7 = ord. */
+#define cmpps_m2r(var, reg, op) sse_m2ri(cmpps, var, reg, op)
+#define cmpps_r2r(regs, regd, op) sse_r2ri(cmpps, regs, regd, op)
+#define cmpps(vars, vard, op, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, op)
+
+#define cmpeqps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 0)
+#define cmpeqps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 0)
+#define cmpeqps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, 0)
+
+#define cmpltps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 1)
+#define cmpltps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 1)
+#define cmpltps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, 1)
+
+#define cmpleps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 2)
+#define cmpleps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 2)
+#define cmpleps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, 2)
+
+#define cmpunordps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 3)
+#define cmpunordps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 3)
+#define cmpunordps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, 3)
+
+#define cmpneqps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 4)
+#define cmpneqps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 4)
+#define cmpneqps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, 4)
+
+#define cmpnltps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 5)
+#define cmpnltps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 5)
+#define cmpnltps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, 5)
+
+#define cmpnleps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 6)
+#define cmpnleps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 6)
+#define cmpnleps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, 6)
+
+#define cmpordps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 7)
+#define cmpordps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 7)
+#define cmpordps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, 7)
+
+
+/* cmpss: CoMPare on the lowest 32-bit float field only (that field becomes 0 or -1).
+ Same predicate numbering as the aliases show: 0 = eq, 1 = lt, 2 = le, 3 = unord,
+ 4 = neq, 5 = nlt, 6 = nle, 7 = ord. */
+#define cmpss_m2r(var, reg, op) sse_m2ri(cmpss, var, reg, op)
+#define cmpss_r2r(regs, regd, op) sse_r2ri(cmpss, regs, regd, op)
+#define cmpss(vars, vard, op, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, op)
+
+#define cmpeqss_m2r(var, reg) sse_m2ri(cmpss, var, reg, 0)
+#define cmpeqss_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 0)
+#define cmpeqss(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 0)
+
+#define cmpltss_m2r(var, reg) sse_m2ri(cmpss, var, reg, 1)
+#define cmpltss_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 1)
+#define cmpltss(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 1)
+
+#define cmpless_m2r(var, reg) sse_m2ri(cmpss, var, reg, 2)
+#define cmpless_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 2)
+#define cmpless(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 2)
+
+#define cmpunordss_m2r(var, reg) sse_m2ri(cmpss, var, reg, 3)
+#define cmpunordss_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 3)
+#define cmpunordss(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 3)
+
+#define cmpneqss_m2r(var, reg) sse_m2ri(cmpss, var, reg, 4)
+#define cmpneqss_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 4)
+#define cmpneqss(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 4)
+
+#define cmpnltss_m2r(var, reg) sse_m2ri(cmpss, var, reg, 5)
+#define cmpnltss_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 5)
+#define cmpnltss(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 5)
+
+#define cmpnless_m2r(var, reg) sse_m2ri(cmpss, var, reg, 6)
+#define cmpnless_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 6)
+#define cmpnless(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 6)
+
+#define cmpordss_m2r(var, reg) sse_m2ri(cmpss, var, reg, 7)
+#define cmpordss_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 7)
+#define cmpordss(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 7)
+
+
+/* comiss: CoMPare the lowest 32-bit float fields and set EFLAGS.
+ NOTE(review): the result is reported through EFLAGS, so the "fields are 0 or -1" remark
+ of the cmpps/cmpss groups presumably does not apply here; confirm against the instruction reference. */
+#define comiss_m2r(var, reg) sse_m2r(comiss, var, reg)
+#define comiss_r2r(regs, regd) sse_r2r(comiss, regs, regd)
+#define comiss(vars, vard, xmmreg) sse_m2m(comiss, vars, vard, xmmreg)
+
+
+/* ucomiss: Unordered CoMPare of the lowest 32-bit float fields, setting EFLAGS.
+ NOTE(review): as for comiss, the outcome is in EFLAGS rather than in 0 / -1 fields; confirm
+ against the instruction reference. */
+#define ucomiss_m2r(var, reg) sse_m2r(ucomiss, var, reg)
+#define ucomiss_r2r(regs, regd) sse_r2r(ucomiss, regs, regd)
+#define ucomiss(vars, vard, xmmreg) sse_m2m(ucomiss, vars, vard, xmmreg)
+
+
+/* unpcklps: 2-(4x32f) -> 4x32f UNPaCK Low Packed Single-fp
+ (interleaves the fields of the low half of dest with those of the low half of source).
+ Only the _m2r and _r2r forms are provided. */
+#define unpcklps_m2r(var, reg) sse_m2r(unpcklps, var, reg)
+#define unpcklps_r2r(regs, regd) sse_r2r(unpcklps, regs, regd)
+
+
+/* unpckhps: 2-(4x32f) -> 4x32f UNPaCK High Packed Single-fp
+ (interleaves the fields of the high half of dest with those of the high half of source).
+ Only the _m2r and _r2r forms are provided. */
+#define unpckhps_m2r(var, reg) sse_m2r(unpckhps, var, reg)
+#define unpckhps_r2r(regs, regd) sse_r2r(unpckhps, regs, regd)
+
+
+
+/* Fp and mmX ReSTORe state
+*/
+#ifdef SSE_TRACE
+ #define fxrstor(mem) \
+ { \
+ fprintf(stderr, "fxrstor()\n"); \
+ __asm__ __volatile__ ("fxrstor %0" \
+ : /* nothing */ \
+ : "X" (mem)) \
+ }
+#else
+ #define fxrstor(mem) \
+ __asm__ __volatile__ ("fxrstor %0" \
+ : /* nothing */ \
+ : "X" (mem))
+#endif
+
+
+/* Fp and mmX SAVE state
+*/
+#ifdef SSE_TRACE
+ #define fxsave(mem) \
+ { \
+ fprintf(stderr, "fxsave()\n"); \
+ __asm__ __volatile__ ("fxsave %0" \
+ : /* nothing */ \
+ : "X" (mem)) \
+ }
+#else
+ #define fxsave(mem) \
+ __asm__ __volatile__ ("fxsave %0" \
+ : /* nothing */ \
+ : "X" (mem))
+#endif
+
+
+/* STore streaMing simd eXtensions Control/Status Register
+*/
+#ifdef SSE_TRACE
+ #define stmxcsr(mem) \
+ { \
+ fprintf(stderr, "stmxcsr()\n"); \
+ __asm__ __volatile__ ("stmxcsr %0" \
+ : /* nothing */ \
+ : "X" (mem)) \
+ }
+#else
+ #define stmxcsr(mem) \
+ __asm__ __volatile__ ("stmxcsr %0" \
+ : /* nothing */ \
+ : "X" (mem))
+#endif
+
+
+/* LoaD streaMing simd eXtensions Control/Status Register
+*/
+#ifdef SSE_TRACE
+ #define ldmxcsr(mem) \
+ { \
+ fprintf(stderr, "ldmxcsr()\n"); \
+ __asm__ __volatile__ ("ldmxcsr %0" \
+ : /* nothing */ \
+ : "X" (mem)) \
+ }
+#else
+ #define ldmxcsr(mem) \
+ __asm__ __volatile__ ("ldmxcsr %0" \
+ : /* nothing */ \
+ : "X" (mem))
+#endif
+
+
+/* Store FENCE - enforce ordering of stores before fence vs. stores
+ occuring after fence in source code.
+*/
+#ifdef SSE_TRACE
+ #define sfence() \
+ { \
+ fprintf(stderr, "sfence()\n"); \
+ __asm__ __volatile__ ("sfence\n\t") \
+ }
+#else
+ #define sfence() \
+ __asm__ __volatile__ ("sfence\n\t")
+#endif
+
+
+/* PREFETCH data using T0, T1, T2, or NTA hint
+ T0 = Prefetch into all cache levels
+ T1 = Prefetch into all cache levels except 0th level
+ T2 = Prefetch into all cache levels except 0th and 1st levels
+ NTA = Prefetch data into non-temporal cache structure
+*/
+#ifdef SSE_TRACE
+#else
+ #define prefetch(mem, hint) \
+ __asm__ __volatile__ ("prefetch" #hint " %0" \
+ : /* nothing */ \
+ : "X" (mem))
+
+ #define prefetcht0(mem) prefetch(mem, t0)
+ #define prefetcht1(mem) prefetch(mem, t1)
+ #define prefetcht2(mem) prefetch(mem, t2)
+ #define prefetchnta(mem) prefetch(mem, nta)
+#endif
+
+
+
+#endif
diff --git a/gst/deinterlace2/tvtime/tomsmocomp.c b/gst/deinterlace2/tvtime/tomsmocomp.c
new file mode 100644
index 00000000..f0b73677
--- /dev/null
+++ b/gst/deinterlace2/tvtime/tomsmocomp.c
@@ -0,0 +1,187 @@
+/**
+ * Copyright (C) 2004 Billy Biggs <vektor@dumbterm.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "gst/gst.h"
+#include "gstdeinterlace2.h"
+#include "plugins.h"
+#include "speedy.h"
+
+#include "tomsmocomp.h"
+#include "tomsmocomp/tomsmocompmacros.h"
+#include "x86-64_macros.inc"
+
+
+#define SearchEffortDefault 5 /* value tomsmocomp_init() stores in SearchEffort */
+#define UseStrangeBobDefault 0 /* value tomsmocomp_init() stores in UseStrangeBob */
+
+long SearchEffort; /* tunable, reset by tomsmocomp_init(); presumably read by the included template -- confirm */
+
+int UseStrangeBob; /* tunable, reset by tomsmocomp_init(); presumably read by the included template -- confirm */
+
+MEMCPY_FUNC *pMyMemcpy; /* copy routine Fieldcopy() calls; not assigned in this file's own text */
+/* The remaining globals are not referenced in this file's own text; presumably working state of TomsMoCompAll.inc -- confirm. */
+int IsOdd;
+
+const unsigned char *pWeaveSrc;
+
+const unsigned char *pWeaveSrcP;
+
+unsigned char *pWeaveDest;
+
+const unsigned char *pCopySrc;
+
+const unsigned char *pCopySrcP;
+
+unsigned char *pCopyDest;
+
+int src_pitch; /* note: Fieldcopy() has a parameter of the same name that shadows this global */
+
+int dst_pitch; /* note: Fieldcopy() has a parameter of the same name that shadows this global */
+
+int rowsize;
+
+int height;
+
+int FldHeight;
+
+int
+Fieldcopy (void *dest, const void *src, size_t count,
+    int rows, int dst_pitch, int src_pitch)
+{
+  /* Copy `rows` lines of `count` bytes each through pMyMemcpy, stepping each
+   * cursor by its own pitch after every line.  Always returns 0. */
+  unsigned char *out = (unsigned char *) dest;
+  unsigned char *in = (unsigned char *) src;
+  int remaining;
+
+  for (remaining = rows; remaining > 0; remaining--) {
+    pMyMemcpy (out, in, count);
+    in += src_pitch;
+    out += dst_pitch;
+  }
+  return 0;
+}
+
+
+#define IS_MMX /* 1st inclusion of the shared template: MMX flavour */
+#define SSE_TYPE MMX
+#define FUNCT_NAME tomsmocompDScaler_MMX /* name called by tomsmocomp_filter_mmx(); presumably defined by the template -- confirm */
+#include "tomsmocomp/TomsMoCompAll.inc"
+#undef IS_MMX
+#undef SSE_TYPE
+#undef FUNCT_NAME
+
+#define IS_3DNOW /* 2nd inclusion: 3DNow! flavour */
+#define SSE_TYPE 3DNOW
+#define FUNCT_NAME tomsmocompDScaler_3DNOW /* name called by tomsmocomp_filter_3dnow() */
+#include "tomsmocomp/TomsMoCompAll.inc"
+#undef IS_3DNOW
+#undef SSE_TYPE
+#undef FUNCT_NAME
+
+#define IS_SSE /* 3rd inclusion: SSE flavour */
+#define SSE_TYPE SSE
+#define FUNCT_NAME tomsmocompDScaler_SSE /* name called by tomsmocomp_filter_sse() */
+#include "tomsmocomp/TomsMoCompAll.inc"
+#undef IS_SSE
+#undef SSE_TYPE
+#undef FUNCT_NAME
+
+
+
+void
+deinterlace_frame_di_tomsmocomp (GstDeinterlace2 * object)
+{
+  /* Choose a kernel from the CPU flags: SSE first, then 3DNow!, else MMX. */
+  void (*filter) (GstDeinterlace2 *) = tomsmocomp_filter_mmx;
+  if (object->cpu_feature_flags & OIL_IMPL_FLAG_SSE)
+    filter = tomsmocomp_filter_sse;
+  else if (object->cpu_feature_flags & OIL_IMPL_FLAG_3DNOW)
+    filter = tomsmocomp_filter_3dnow;
+  filter (object);
+}
+
+static deinterlace_method_t tomsmocompmethod = {
+ 0, //DEINTERLACE_PLUGIN_API_VERSION,
+ "Motion Adaptive: Motion Search", // NOTE(review): initializer is positional; field meanings must be checked against deinterlace_method_t
+ "AdaptiveSearch",
+ 4,
+ OIL_IMPL_FLAG_MMX, // same flag family that deinterlace_frame_di_tomsmocomp() tests at run time
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ deinterlace_frame_di_tomsmocomp, // the only function slot filled in here
+ {"Uses heuristics to detect motion in the input",
+ "frames and reconstruct image detail where",
+ "possible. Use this for high quality output",
+ "even on monitors set to an arbitrary refresh",
+ "rate.",
+ "",
+ "Motion search mode finds and follows motion",
+ "vectors for accurate interpolation. This is",
+ "the TomsMoComp deinterlacer from DScaler.",
+ ""}
+};
+
+
+
+deinterlace_method_t *
+dscaler_tomsmocomp_get_method (void)
+{
+ tomsmocomp_init (); /* every lookup resets SearchEffort / UseStrangeBob to their defaults */
+ return &tomsmocompmethod; /* pointer to the file-static descriptor; caller must not free it */
+}
+
+
+
+void
+tomsmocomp_init (void)
+{
+ SearchEffort = SearchEffortDefault; /* 5 */
+ UseStrangeBob = UseStrangeBobDefault; /* 0 */
+}
+
+void
+tomsmocomp_filter_mmx (GstDeinterlace2 * object)
+{
+ tomsmocompDScaler_MMX (object); /* forwards to the FUNCT_NAME used for the IS_MMX template inclusion */
+}
+
+void
+tomsmocomp_filter_3dnow (GstDeinterlace2 * object)
+{
+ tomsmocompDScaler_3DNOW (object); /* forwards to the FUNCT_NAME used for the IS_3DNOW template inclusion */
+}
+
+void
+tomsmocomp_filter_sse (GstDeinterlace2 * object)
+{
+ tomsmocompDScaler_SSE (object); /* forwards to the FUNCT_NAME used for the IS_SSE template inclusion */
+}
diff --git a/gst/deinterlace2/tvtime/tomsmocomp.h b/gst/deinterlace2/tvtime/tomsmocomp.h
new file mode 100644
index 00000000..12127800
--- /dev/null
+++ b/gst/deinterlace2/tvtime/tomsmocomp.h
@@ -0,0 +1,61 @@
+/**
+ * Copyright (C) 2004 Billy Biggs <vektor@dumbterm.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+#ifndef TOMSMOCOMP_H_INCLUDED
+#define TOMSMOCOMP_H_INCLUDED
+
+#include "gstdeinterlace2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Per-effort-level routines. NOTE(review): not defined in tomsmocomp.c's own text; presumably supplied by the tomsmocomp templates -- confirm. */
+int Search_Effort_0();
+int Search_Effort_1();
+int Search_Effort_3();
+int Search_Effort_5();
+int Search_Effort_9();
+int Search_Effort_11();
+int Search_Effort_13();
+int Search_Effort_15();
+int Search_Effort_19();
+int Search_Effort_21();
+int Search_Effort_Max();
+/* "_SB" counterparts of the routines above (presumably the UseStrangeBob variants -- confirm). */
+int Search_Effort_0_SB();
+int Search_Effort_1_SB();
+int Search_Effort_3_SB();
+int Search_Effort_5_SB();
+int Search_Effort_9_SB();
+int Search_Effort_11_SB();
+int Search_Effort_13_SB();
+int Search_Effort_15_SB();
+int Search_Effort_19_SB();
+int Search_Effort_21_SB();
+int Search_Effort_Max_SB();
+/* Entry points implemented in tomsmocomp.c. */
+void tomsmocomp_init( void );
+void tomsmocomp_filter_mmx( GstDeinterlace2 *object );
+void tomsmocomp_filter_3dnow( GstDeinterlace2 *object );
+void tomsmocomp_filter_sse( GstDeinterlace2 *object );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* TOMSMOCOMP_H_INCLUDED */
diff --git a/gst/deinterlace2/tvtime/vfir.c b/gst/deinterlace2/tvtime/vfir.c
new file mode 100644
index 00000000..bb42f5d3
--- /dev/null
+++ b/gst/deinterlace2/tvtime/vfir.c
@@ -0,0 +1,184 @@
+/*
+ *
+ * GStreamer
+ * Copyright (C) 2004 Billy Biggs <vektor@dumbterm.net>
+ * Copyright (c) 2001, 2002, 2003 Fabrice Bellard.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+ * This file contains code from ffmpeg, see http://ffmpeg.org/ (LGPL)
+ * and modifications by Billy Biggs.
+ *
+ * Relicensed for GStreamer from GPL to LGPL with permit from Billy Biggs.
+ * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578
+ */
+
+#include <stdio.h>
+#if defined (__SVR4) && defined (__sun)
+# include <sys/int_types.h>
+#else
+# include <stdint.h>
+#endif
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "mmx.h"
+#include "speedy.h"
+#include "gstdeinterlace2.h"
+
+/**
+ * The MPEG2 spec uses a slightly harsher filter, they specify
+ * [-1 8 2 8 -1]. ffmpeg uses a similar filter but with more of
+ * a tendency to blur than to use the local information. The
+ * filter taps here are: [-1 4 2 4 -1].
+ */
+
+static void
+deinterlace_line (uint8_t * dst, uint8_t * lum_m4,
+    uint8_t * lum_m3, uint8_t * lum_m2,
+    uint8_t * lum_m1, uint8_t * lum, int size)
+{
+  /* Writes `size` bytes to dst: taps [-1 4 2 4 -1] over the five input lines,
+   * plus 4 for rounding, >> 3, clamped to 0..255 on every path. */
+  int sum;
+
+#ifdef HAVE_CPU_I386
+  mmx_t rounder;
+  rounder.uw[0] = 4;
+  rounder.uw[1] = 4;
+  rounder.uw[2] = 4;
+  rounder.uw[3] = 4;
+  pxor_r2r (mm7, mm7);          /* mm7 = 0, used to widen bytes to words */
+  movq_m2r (rounder, mm6);
+
+  for (; size > 3; size -= 4) { /* four output bytes per iteration */
+    movd_m2r (lum_m4[0], mm0);
+    movd_m2r (lum_m3[0], mm1);
+    movd_m2r (lum_m2[0], mm2);
+    movd_m2r (lum_m1[0], mm3);
+    movd_m2r (lum[0], mm4);
+    punpcklbw_r2r (mm7, mm0);
+    punpcklbw_r2r (mm7, mm1);
+    punpcklbw_r2r (mm7, mm2);
+    punpcklbw_r2r (mm7, mm3);
+    punpcklbw_r2r (mm7, mm4);
+    paddw_r2r (mm3, mm1);
+    psllw_i2r (1, mm2);
+    paddw_r2r (mm4, mm0);
+    psllw_i2r (2, mm1);         // 2
+    paddw_r2r (mm6, mm2);
+    paddw_r2r (mm2, mm1);
+    psubusw_r2r (mm0, mm1);     /* saturating subtract: floors at 0 */
+    psrlw_i2r (3, mm1);         // 3
+    packuswb_r2r (mm7, mm1);    /* saturating pack: caps at 255 */
+    movd_r2m (mm1, dst[0]);
+    lum_m4 += 4;
+    lum_m3 += 4;
+    lum_m2 += 4;
+    lum_m1 += 4;
+    lum += 4;
+    dst += 4;
+  }
+  emms ();
+#endif
+
+  /* C path; on MMX builds it finishes the size % 4 bytes the loop above skips. */
+  for (; size > 0; size--) {
+    sum = (lum_m3[0] << 2) + (lum_m2[0] << 1) + (lum_m1[0] << 2) + 4;
+    sum -= lum_m4[0] + lum[0];
+    /* Clamp like psubusw/packuswb do: negative -> 0, above 255 -> 255. */
+    if (sum < 0)
+      sum = 0;
+    sum >>= 3;
+    dst[0] = (sum > 255) ? 255 : sum;
+    lum_m4++;
+    lum_m3++;
+    lum_m2++;
+    lum_m1++;
+    lum++;
+    dst++;
+  }
+}
+
+
+/**
+ * The commented-out method below that uses the bottom_field member is more
+ * like the filter as specified in the MPEG2 spec, but it doesn't seem to
+ * have the desired effect.
+ */
+
+static void
+deinterlace_scanline_vfir (GstDeinterlace2 * object,
+ deinterlace_scanline_data_t * data, uint8_t * output)
+{
+ deinterlace_line (output, data->tt1, data->t0, data->m1, data->b0, data->bb1,
+ object->frame_width * 2);
+ // Filters lines tt1, t0, m1, b0, bb1 over frame_width * 2 bytes. Disabled alternative: blit_packed422_scanline( output, data->m1, width );
+}
+
+static void
+copy_scanline (GstDeinterlace2 * object,
+ deinterlace_scanline_data_t * data, uint8_t * output)
+{
+ blit_packed422_scanline (output, data->m0, object->frame_width);
+ /* Disabled alternative: filter this line as well, picking the inputs by field parity.
+ if( data->bottom_field ) {
+ deinterlace_line( output, data->tt2, data->t1, data->m2, data->b1, data->bb2, width*2 );
+ } else {
+ deinterlace_line( output, data->tt0, data->t1, data->m0, data->b1, data->bb0, width*2 );
+ }
+ */
+}
+
+
+static deinterlace_method_t vfirmethod = {
+ 0, //DEINTERLACE_PLUGIN_API_VERSION,
+ "Blur: Vertical", // NOTE(review): initializer is positional; field meanings must be checked against deinterlace_method_t
+ "BlurVertical",
+ 2,
+#ifdef HAVE_CPU_I386
+ OIL_IMPL_FLAG_MMXEXT, // NOTE(review): deinterlace_line() only uses macros from mmx.h here; confirm MMXEXT is really needed
+#else
+ 0, // no CPU flag entry when the C path of deinterlace_line() is compiled
+#endif
+ 0,
+ 0,
+ 0,
+ 1,
+ deinterlace_scanline_vfir, // filters a line from five neighbouring lines
+ copy_scanline, // plain blit of data->m0
+ 0,
+ {"Avoids flicker by blurring consecutive frames",
+ "of input. Use this if you want to run your",
+ "monitor at an arbitrary refresh rate and not",
+ "use much CPU, and are willing to sacrifice",
+ "detail.",
+ "",
+ "Vertical mode blurs favouring the most recent",
+ "field for less visible trails. From the",
+ "deinterlacer filter in ffmpeg.",
+ ""}
+};
+
+deinterlace_method_t *
+dscaler_vfir_get_method (void)
+{
+ return &vfirmethod; /* pointer to the file-static descriptor; caller must not free it */
+}
diff --git a/gst/deinterlace2/tvtime/x86-64_macros.inc b/gst/deinterlace2/tvtime/x86-64_macros.inc
new file mode 100644
index 00000000..2e9df758
--- /dev/null
+++ b/gst/deinterlace2/tvtime/x86-64_macros.inc
@@ -0,0 +1,82 @@
+/*
+ *
+ * GStreamer
+ * Copyright (C) 2004 Dirk Ziegelmeier <dziegel@gmx.de>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+ *
+ * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578
+ */
+
+/*
+ * This file is copied from TVTIME's sources.
+ * Original author: Achim Schneider <batchall@mordor.ch>
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef XAX
+
+#if defined (HAVE_CPU_I386) && !defined(HAVE_CPU_X86_64)
+
+#define XAX "eax"
+#define XBX "ebx"
+#define XCX "ecx"
+#define XDX "edx"
+#define XSI "esi"
+#define XDI "edi"
+#define XSP "esp"
+#define MOVX "movl"
+#define LEAX "leal"
+#define DECX "decl"
+#define PUSHX "pushl"
+#define POPX "popl"
+#define CMPX "cmpl"
+#define ADDX "addl"
+#define SHLX "shll"
+#define SHRX "shrl"
+#define SUBX "subl"
+
+#elif defined (HAVE_CPU_X86_64)
+
+#define XAX "rax"
+#define XBX "rbx"
+#define XCX "rcx"
+#define XDX "rdx"
+#define XSI "rsi"
+#define XDI "rdi"
+#define XSP "rsp"
+#define MOVX "movq"
+#define LEAX "leaq"
+#define DECX "decq"
+#define PUSHX "pushq"
+#define POPX "popq"
+#define CMPX "cmpq"
+#define ADDX "addq"
+#define SHLX "shlq"
+#define SHRX "shrq"
+#define SUBX "subq"
+
+#else
+#error Undefined architecture. Define either ARCH_X86 or ARCH_X86_64.
+#endif
+
+#endif