author    | Martin Eikermann <meiker@upb.de> | 2008-06-11 11:12:14 +0000
committer | Sebastian Dröge <slomo@circular-chaos.org> | 2008-06-11 11:12:14 +0000
commit    | 70ff21342117866ee939f9f7597ee487bae31757 (patch)
tree      | 84c75fd8ce74451d34424c41a793e5462fb86a6b /gst/deinterlace2/tvtime
parent    | a14d311c96c21ae1c700917d4007ae0d3f9a02a8 (diff)
gst/deinterlace2/: Add a deinterlacer plugin based on the tvtime/DScaler deinterlacer, which was relicensed to LGPL f...
Original commit message from CVS:
Based on a patch by: Martin Eikermann <meiker at upb dot de>
* gst/deinterlace2/Makefile.am:
* gst/deinterlace2/gstdeinterlace2.c:
(gst_deinterlace2_method_get_type),
(gst_deinterlace2_fields_get_type),
(gst_deinterlace2_field_layout_get_type),
(gst_deinterlace2_base_init), (gst_deinterlace2_class_init),
(gst_deinterlace2_init), (gst_deinterlace2_set_method),
(gst_deinterlace2_set_property), (gst_deinterlace2_get_property),
(gst_deinterlace2_finalize), (gst_deinterlace2_pop_history),
(gst_deinterlace2_head_history), (gst_deinterlace2_push_history),
(gst_deinterlace2_deinterlace_scanlines), (gst_deinterlace2_chain),
(gst_deinterlace2_setcaps), (gst_deinterlace2_sink_event),
(gst_deinterlace2_change_state), (gst_deinterlace2_src_event),
(gst_deinterlace2_src_query), (gst_deinterlace2_src_query_types),
(plugin_init):
* gst/deinterlace2/gstdeinterlace2.h:
* gst/deinterlace2/tvtime/greedy.c: (copy_scanline),
(deinterlace_greedy_packed422_scanline_mmxext),
(dscaler_greedyl_get_method):
* gst/deinterlace2/tvtime/greedyh.asm:
* gst/deinterlace2/tvtime/greedyh.c:
(deinterlace_frame_di_greedyh), (dscaler_greedyh_get_method),
(greedyh_init), (greedyh_filter_mmx), (greedyh_filter_3dnow),
(greedyh_filter_sse):
* gst/deinterlace2/tvtime/greedyh.h:
* gst/deinterlace2/tvtime/greedyhmacros.h:
* gst/deinterlace2/tvtime/mmx.h:
* gst/deinterlace2/tvtime/plugins.h:
* gst/deinterlace2/tvtime/speedtools.h:
* gst/deinterlace2/tvtime/speedy.c: (multiply_alpha), (clip255),
(comb_factor_packed422_scanline_mmx),
(diff_factor_packed422_scanline_c),
(diff_factor_packed422_scanline_mmx),
(diff_packed422_block8x8_mmx), (diff_packed422_block8x8_c),
(packed444_to_packed422_scanline_c),
(packed422_to_packed444_scanline_c),
(packed422_to_packed444_rec601_scanline_c),
(vfilter_chroma_121_packed422_scanline_mmx),
(vfilter_chroma_121_packed422_scanline_c),
(vfilter_chroma_332_packed422_scanline_mmx),
(vfilter_chroma_332_packed422_scanline_c),
(kill_chroma_packed422_inplace_scanline_mmx),
(kill_chroma_packed422_inplace_scanline_c),
(invert_colour_packed422_inplace_scanline_mmx),
(invert_colour_packed422_inplace_scanline_c),
(mirror_packed422_inplace_scanline_c),
(interpolate_packed422_scanline_c),
(convert_uyvy_to_yuyv_scanline_mmx),
(convert_uyvy_to_yuyv_scanline_c),
(interpolate_packed422_scanline_mmx),
(interpolate_packed422_scanline_mmxext),
(blit_colour_packed422_scanline_c),
(blit_colour_packed422_scanline_mmx),
(blit_colour_packed422_scanline_mmxext),
(blit_colour_packed4444_scanline_c),
(blit_colour_packed4444_scanline_mmx),
(blit_colour_packed4444_scanline_mmxext), (small_memcpy),
(speedy_memcpy_c), (speedy_memcpy_mmx), (speedy_memcpy_mmxext),
(blit_packed422_scanline_c), (blit_packed422_scanline_mmx),
(blit_packed422_scanline_mmxext),
(composite_colour4444_alpha_to_packed422_scanline_c),
(composite_colour4444_alpha_to_packed422_scanline_mmxext),
(composite_packed4444_alpha_to_packed422_scanline_c),
(composite_packed4444_alpha_to_packed422_scanline_mmxext),
(composite_packed4444_to_packed422_scanline_c),
(composite_packed4444_to_packed422_scanline_mmxext),
(composite_alphamask_to_packed4444_scanline_c),
(composite_alphamask_to_packed4444_scanline_mmxext),
(composite_alphamask_alpha_to_packed4444_scanline_c),
(premultiply_packed4444_scanline_c),
(premultiply_packed4444_scanline_mmxext),
(blend_packed422_scanline_c), (blend_packed422_scanline_mmxext),
(quarter_blit_vertical_packed422_scanline_mmxext),
(quarter_blit_vertical_packed422_scanline_c),
(subpix_blit_vertical_packed422_scanline_c),
(a8_subpix_blit_scanline_c), (myround), (init_RGB_to_YCbCr_tables),
(init_YCbCr_to_RGB_tables), (rgb24_to_packed444_rec601_scanline_c),
(rgba32_to_packed4444_rec601_scanline_c),
(packed444_to_rgb24_rec601_scanline_c),
(packed444_to_nonpremultiplied_packed4444_scanline_c),
(aspect_adjust_packed4444_scanline_c), (setup_speedy_calls),
(speedy_get_accel):
* gst/deinterlace2/tvtime/speedy.h:
* gst/deinterlace2/tvtime/sse.h:
* gst/deinterlace2/tvtime/tomsmocomp.c: (Fieldcopy),
(deinterlace_frame_di_tomsmocomp), (dscaler_tomsmocomp_get_method),
(tomsmocomp_init), (tomsmocomp_filter_mmx),
(tomsmocomp_filter_3dnow), (tomsmocomp_filter_sse):
* gst/deinterlace2/tvtime/tomsmocomp.h:
* gst/deinterlace2/tvtime/tomsmocomp/SearchLoop0A.inc:
* gst/deinterlace2/tvtime/tomsmocomp/SearchLoopBottom.inc:
* gst/deinterlace2/tvtime/tomsmocomp/SearchLoopEdgeA.inc:
* gst/deinterlace2/tvtime/tomsmocomp/SearchLoopEdgeA8.inc:
* gst/deinterlace2/tvtime/tomsmocomp/SearchLoopOddA.inc:
* gst/deinterlace2/tvtime/tomsmocomp/SearchLoopOddA2.inc:
* gst/deinterlace2/tvtime/tomsmocomp/SearchLoopOddA6.inc:
* gst/deinterlace2/tvtime/tomsmocomp/SearchLoopOddAH.inc:
* gst/deinterlace2/tvtime/tomsmocomp/SearchLoopOddAH2.inc:
* gst/deinterlace2/tvtime/tomsmocomp/SearchLoopTop.inc:
* gst/deinterlace2/tvtime/tomsmocomp/SearchLoopVA.inc:
* gst/deinterlace2/tvtime/tomsmocomp/SearchLoopVAH.inc:
* gst/deinterlace2/tvtime/tomsmocomp/StrangeBob.inc:
* gst/deinterlace2/tvtime/tomsmocomp/TomsMoCompAll.inc:
* gst/deinterlace2/tvtime/tomsmocomp/TomsMoCompAll2.inc:
* gst/deinterlace2/tvtime/tomsmocomp/WierdBob.inc:
* gst/deinterlace2/tvtime/vfir.c: (deinterlace_line),
(deinterlace_scanline_vfir), (copy_scanline),
(dscaler_vfir_get_method):
* gst/deinterlace2/tvtime/x86-64_macros.inc:
Add a deinterlacer plugin based on the tvtime/DScaler deinterlacer,
which was relicensed to LGPL for GStreamer and in theory provides
better and faster results than the simple deinterlace element.
Fixes bug #163578.
Ported to GStreamer 0.10 but still not enabled or included in the
build system by default because of bad artefacts caused by a bug
somewhere, and because it can currently only be built on x86/amd64 and
requires special CFLAGS. Will be fixed soon.
Diffstat (limited to 'gst/deinterlace2/tvtime')
-rw-r--r-- | gst/deinterlace2/tvtime/greedy.c          | 207
-rw-r--r-- | gst/deinterlace2/tvtime/greedyh.asm       | 307
-rw-r--r-- | gst/deinterlace2/tvtime/greedyh.c         | 148
-rw-r--r-- | gst/deinterlace2/tvtime/greedyh.h         | 45
-rw-r--r-- | gst/deinterlace2/tvtime/greedyhmacros.h   | 74
-rw-r--r-- | gst/deinterlace2/tvtime/mmx.h             | 723
-rw-r--r-- | gst/deinterlace2/tvtime/plugins.h         | 42
-rw-r--r-- | gst/deinterlace2/tvtime/speedtools.h      | 54
-rw-r--r-- | gst/deinterlace2/tvtime/speedy.c          | 2791
-rw-r--r-- | gst/deinterlace2/tvtime/speedy.h          | 308
-rw-r--r-- | gst/deinterlace2/tvtime/sse.h             | 992
-rw-r--r-- | gst/deinterlace2/tvtime/tomsmocomp.c      | 187
-rw-r--r-- | gst/deinterlace2/tvtime/tomsmocomp.h      | 61
-rw-r--r-- | gst/deinterlace2/tvtime/vfir.c            | 184
-rw-r--r-- | gst/deinterlace2/tvtime/x86-64_macros.inc | 82
15 files changed, 6205 insertions, 0 deletions
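The greedy low-motion kernel in greedy.c below picks, for each pixel, whichever of the two weave candidates (the newest field or the previous one) has the smaller comb factor against the average of the lines above and below, and then clips the winner to that line pair's range widened by GreedyMaxComb. A rough scalar C sketch of that per-byte rule follows; it is a hypothetical reference helper, not part of the patch, which implements the same logic with MMX/MMXEXT instructions:

#include <stdint.h>

/* Hypothetical scalar reference for the per-byte rule used in
 * deinterlace_greedy_packed422_scanline_mmxext():
 *   l1, l3   - bytes from the lines above/below in the current field
 *   l2       - newest weave value, lp2 - previous weave value
 *   max_comb - corresponds to GreedyMaxComb (0-255)
 */
static inline uint8_t
greedy_low_motion_pixel (uint8_t l1, uint8_t l2, uint8_t l3,
    uint8_t lp2, uint8_t max_comb)
{
  uint8_t avg = (l1 + l3 + 1) / 2;      /* rounded average, like pavgb (L1, L3) */
  uint8_t comb_l2 = (l2 > avg) ? (l2 - avg) : (avg - l2);       /* |avg - L2|  */
  uint8_t comb_lp2 = (lp2 > avg) ? (lp2 - avg) : (avg - lp2);   /* |avg - LP2| */
  uint8_t best = (comb_lp2 <= comb_l2) ? lp2 : l2;      /* smaller comb wins   */

  /* clip to [min(L1,L3) - max_comb, max(L1,L3) + max_comb], saturating at 0/255 */
  uint8_t hi = (l1 > l3) ? l1 : l3;
  uint8_t lo = (l1 < l3) ? l1 : l3;
  hi = (hi > (uint8_t) (255 - max_comb)) ? 255 : (uint8_t) (hi + max_comb);
  lo = (lo < max_comb) ? 0 : (uint8_t) (lo - max_comb);

  if (best > hi)
    best = hi;
  else if (best < lo)
    best = lo;
  return best;
}

The MMX routine in the diff applies this rule to eight packed 4:2:2 bytes at a time (luma and chroma alike), which is why it needs no per-component branching.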
diff --git a/gst/deinterlace2/tvtime/greedy.c b/gst/deinterlace2/tvtime/greedy.c new file mode 100644 index 00000000..578eb711 --- /dev/null +++ b/gst/deinterlace2/tvtime/greedy.c @@ -0,0 +1,207 @@ +/* + * + * GStreamer + * Copyright (c) 2000 Tom Barry All rights reserved. + * mmx.h port copyright (c) 2002 Billy Biggs <vektor@dumbterm.net>. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + * Relicensed for GStreamer from GPL to LGPL with permit from Tom Barry + * and Billy Biggs. + * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578 + */ + +#include <stdio.h> +#if defined (__SVR4) && defined (__sun) +# include <sys/int_types.h> +#else +# include <stdint.h> +#endif + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "mmx.h" +#include "sse.h" +#include "gstdeinterlace2.h" +#include "speedtools.h" +#include "speedy.h" + +// This is a simple lightweight DeInterlace method that uses little CPU time +// but gives very good results for low or intermedite motion. +// It defers frames by one field, but that does not seem to produce noticeable +// lip sync problems. +// +// The method used is to take either the older or newer weave pixel depending +// upon which give the smaller comb factor, and then clip to avoid large damage +// when wrong. +// +// I'd intended this to be part of a larger more elaborate method added to +// Blended Clip but this give too good results for the CPU to ignore here. + +static void +copy_scanline (GstDeinterlace2 * object, + deinterlace_scanline_data_t * data, uint8_t * output) +{ + blit_packed422_scanline (output, data->m1, object->frame_width); +} + +static int GreedyMaxComb = 15; + +static void +deinterlace_greedy_packed422_scanline_mmxext (GstDeinterlace2 * object, + deinterlace_scanline_data_t * data, uint8_t * output) +{ +#ifdef HAVE_CPU_I386 + mmx_t MaxComb; + + uint8_t *m0 = data->m0; + + uint8_t *t1 = data->t1; + + uint8_t *b1 = data->b1; + + uint8_t *m2 = data->m2; + + int width = object->frame_width; + + // How badly do we let it weave? 
0-255 + MaxComb.ub[0] = GreedyMaxComb; + MaxComb.ub[1] = GreedyMaxComb; + MaxComb.ub[2] = GreedyMaxComb; + MaxComb.ub[3] = GreedyMaxComb; + MaxComb.ub[4] = GreedyMaxComb; + MaxComb.ub[5] = GreedyMaxComb; + MaxComb.ub[6] = GreedyMaxComb; + MaxComb.ub[7] = GreedyMaxComb; + + // L2 == m0 + // L1 == t1 + // L3 == b1 + // LP2 == m2 + + width /= 4; + while (width--) { + movq_m2r (*t1, mm1); // L1 + movq_m2r (*m0, mm2); // L2 + movq_m2r (*b1, mm3); // L3 + movq_m2r (*m2, mm0); // LP2 + + // average L1 and L3 leave result in mm4 + movq_r2r (mm1, mm4); // L1 + pavgb_r2r (mm3, mm4); // (L1 + L3)/2 + + + // get abs value of possible L2 comb + movq_r2r (mm2, mm7); // L2 + psubusb_r2r (mm4, mm7); // L2 - avg + movq_r2r (mm4, mm5); // avg + psubusb_r2r (mm2, mm5); // avg - L2 + por_r2r (mm7, mm5); // abs(avg-L2) + movq_r2r (mm4, mm6); // copy of avg for later + + + // get abs value of possible LP2 comb + movq_r2r (mm0, mm7); // LP2 + psubusb_r2r (mm4, mm7); // LP2 - avg + psubusb_r2r (mm0, mm4); // avg - LP2 + por_r2r (mm7, mm4); // abs(avg-LP2) + + // use L2 or LP2 depending upon which makes smaller comb + psubusb_r2r (mm5, mm4); // see if it goes to zero + psubusb_r2r (mm5, mm5); // 0 + pcmpeqb_r2r (mm5, mm4); // if (mm4=0) then FF else 0 + pcmpeqb_r2r (mm4, mm5); // opposite of mm4 + + // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55 + pand_r2r (mm2, mm5); // use L2 if mm5 == ff, else 0 + pand_r2r (mm0, mm4); // use LP2 if mm4 = ff, else 0 + por_r2r (mm5, mm4); // may the best win + + // Now lets clip our chosen value to be not outside of the range + // of the high/low range L1-L3 by more than abs(L1-L3) + // This allows some comb but limits the damages and also allows more + // detail than a boring oversmoothed clip. + + movq_r2r (mm1, mm2); // copy L1 + psubusb_r2r (mm3, mm2); // - L3, with saturation + paddusb_r2r (mm3, mm2); // now = Max(L1,L3) + + pcmpeqb_r2r (mm7, mm7); // all ffffffff + psubusb_r2r (mm1, mm7); // - L1 + paddusb_r2r (mm7, mm3); // add, may sat at fff.. + psubusb_r2r (mm7, mm3); // now = Min(L1,L3) + + // allow the value to be above the high or below the low by amt of MaxComb + paddusb_m2r (MaxComb, mm2); // increase max by diff + psubusb_m2r (MaxComb, mm3); // lower min by diff + + psubusb_r2r (mm3, mm4); // best - Min + paddusb_r2r (mm3, mm4); // now = Max(best,Min(L1,L3) + + pcmpeqb_r2r (mm7, mm7); // all ffffffff + psubusb_r2r (mm4, mm7); // - Max(best,Min(best,L3) + paddusb_r2r (mm7, mm2); // add may sat at FFF.. + psubusb_r2r (mm7, mm2); // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped + + movntq_r2m (mm2, *output); // move in our clipped best + + // Advance to the next set of pixels. + output += 8; + m0 += 8; + t1 += 8; + b1 += 8; + m2 += 8; + } + sfence (); + emms (); +#endif +} + + +static deinterlace_method_t greedyl_method = { + 0, //DEINTERLACE_PLUGIN_API_VERSION, + "Motion Adaptive: Simple Detection", + "AdaptiveSimple", + 3, + OIL_IMPL_FLAG_MMXEXT, + 0, + 0, + 0, + 1, + copy_scanline, + deinterlace_greedy_packed422_scanline_mmxext, + 0, + {"Uses heuristics to detect motion in the input", + "frames and reconstruct image detail where", + "possible. Use this for high quality output", + "even on monitors set to an arbitrary refresh", + "rate.", + "", + "Simple detection uses linear interpolation", + "where motion is detected, using a two-field", + "buffer. 
This is the Greedy: Low Motion", + "deinterlacer from DScaler."} +}; + +deinterlace_method_t * +dscaler_greedyl_get_method (void) +{ + return &greedyl_method; +} diff --git a/gst/deinterlace2/tvtime/greedyh.asm b/gst/deinterlace2/tvtime/greedyh.asm new file mode 100644 index 00000000..92ad1fe1 --- /dev/null +++ b/gst/deinterlace2/tvtime/greedyh.asm @@ -0,0 +1,307 @@ +/* + * + * GStreamer + * Copyright (c) 2001 Tom Barry. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + + +/* + * Relicensed for GStreamer from GPL to LGPL with permit from Tom Barry. + * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578 + */ + + +#include "x86-64_macros.inc" + +void FUNCT_NAME( GstDeinterlace2 *object) +{ + int64_t i; + int InfoIsOdd = 0; + + // in tight loop some vars are accessed faster in local storage + int64_t YMask = 0x00ff00ff00ff00ffull; // to keep only luma + int64_t UVMask = 0xff00ff00ff00ff00ull; // to keep only chroma + int64_t ShiftMask = 0xfefffefffefffeffull; // to avoid shifting chroma to luma + int64_t QW256 = 0x0100010001000100ull; // 4 256's + + // Set up our two parms that are actually evaluated for each pixel + i=GreedyMaxComb; + int64_t MaxComb = i << 56 | i << 48 | i << 40 | i << 32 | i << 24 | i << 16 | i << 8 | i; + + i = GreedyMotionThreshold; // scale to range of 0-257 + int64_t MotionThreshold = i << 48 | i << 32 | i << 16 | i | UVMask; + + i = GreedyMotionSense; // scale to range of 0-257 + int64_t MotionSense = i << 48 | i << 32 | i << 16 | i; + + int Line; + long LoopCtr; + unsigned int Pitch = object->field_stride; + + unsigned char* L1; // ptr to Line1, of 3 + unsigned char* L2; // ptr to Line2, the weave line + unsigned char* L3; // ptr to Line3 + + unsigned char* L2P; // ptr to prev Line2 + unsigned char* Dest = GST_BUFFER_DATA(object->out_buf); + + int64_t QW256B; + int64_t LastAvg=0; //interp value from left qword + + i = 0xffffffff - 256; + QW256B = i << 48 | i << 32 | i << 16 | i; // save a couple instr on PMINSW instruct. + + + // copy first even line no matter what, and the first odd line if we're + // processing an EVEN field. (note diff from other deint rtns.) 
+ + if (object->field_history[object->history_count-1].flags == PICTURE_INTERLACED_BOTTOM) { + InfoIsOdd = 1; + + L1 = GST_BUFFER_DATA(object->field_history[object->history_count-2].buf); + L2 = GST_BUFFER_DATA(object->field_history[object->history_count-1].buf); + L3 = L1 + Pitch; + L2P = GST_BUFFER_DATA(object->field_history[object->history_count-3].buf); + + // copy first even line + object->pMemcpy(Dest, L1, object->line_length); + Dest += object->output_stride; + } + else { + InfoIsOdd = 0; + L1 = GST_BUFFER_DATA(object->field_history[object->history_count-2].buf); + L2 = GST_BUFFER_DATA(object->field_history[object->history_count-1].buf) + Pitch; + L3 = L1 + Pitch; + L2P = GST_BUFFER_DATA(object->field_history[object->history_count-3].buf) + Pitch; + + // copy first even line + object->pMemcpy(Dest, GST_BUFFER_DATA(object->field_history[0].buf), object->line_length); + Dest += object->output_stride; + // then first odd line + object->pMemcpy(Dest, L1, object->line_length); + Dest += object->output_stride; + } + + + long oldbx; + + for (Line = 0; Line < (object->field_height - 1); ++Line) { + LoopCtr = object->line_length / 8 - 1; // there are LineLength / 8 qwords per line but do 1 less, adj at end of loop + + // For ease of reading, the comments below assume that we're operating on an odd + // field (i.e., that InfoIsOdd is true). Assume the obvious for even lines.. + __asm__ __volatile__ + ( + // save ebx (-fPIC) + MOVX" %%"XBX", %[oldbx]\n\t" + + MOVX" %[L1], %%"XAX"\n\t" + LEAX" 8(%%"XAX"), %%"XBX"\n\t" // next qword needed by DJR + MOVX" %[L3], %%"XCX"\n\t" + SUBX" %%"XAX", %%"XCX"\n\t" // carry L3 addr as an offset + MOVX" %[L2P], %%"XDX"\n\t" + MOVX" %[L2], %%"XSI"\n\t" + MOVX" %[Dest], %%"XDI"\n\t" // DL1 if Odd or DL2 if Even + + ".align 8\n\t" + "1:\n\t" + + "movq (%%"XSI"), %%mm0\n\t" // L2 - the newest weave pixel value + "movq (%%"XAX"), %%mm1\n\t" // L1 - the top pixel + "movq (%%"XDX"), %%mm2\n\t" // L2P - the prev weave pixel + "movq (%%"XAX", %%"XCX"), %%mm3\n\t" // L3, next odd row + "movq %%mm1, %%mm6\n\t" // L1 - get simple single pixel interp + // pavgb mm6, mm3 // use macro below + V_PAVGB ("%%mm6", "%%mm3", "%%mm4", "%[ShiftMask]") + + // DJR - Diagonal Jaggie Reduction + // In the event that we are going to use an average (Bob) pixel we do not want a jagged + // stair step effect. To combat this we avg in the 2 horizontally adjacen pixels into the + // interpolated Bob mix. This will do horizontal smoothing for only the Bob'd pixels. 
+ + "movq %[LastAvg], %%mm4\n\t" // the bob value from prev qword in row + "movq %%mm6, %[LastAvg]\n\t" // save for next pass + "psrlq $48, %%mm4\n\t" // right justify 1 pixel + "movq %%mm6, %%mm7\n\t" // copy of simple bob pixel + "psllq $16, %%mm7\n\t" // left justify 3 pixels + "por %%mm7, %%mm4\n\t" // and combine + + "movq (%%"XBX"), %%mm5\n\t" // next horiz qword from L1 + // pavgb mm5, qword ptr[ebx+ecx] // next horiz qword from L3, use macro below + V_PAVGB ("%%mm5", "(%%"XBX",%%"XCX")", "%%mm7", "%[ShiftMask]") + "psllq $48, %%mm5\n\t" // left just 1 pixel + "movq %%mm6, %%mm7\n\t" // another copy of simple bob pixel + "psrlq $16, %%mm7\n\t" // right just 3 pixels + "por %%mm7, %%mm5\n\t" // combine + // pavgb mm4, mm5 // avg of forward and prev by 1 pixel, use macro + V_PAVGB ("%%mm4", "%%mm5", "%%mm5", "%[ShiftMask]") // mm5 gets modified if MMX + // pavgb mm6, mm4 // avg of center and surround interp vals, use macro + V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]") + + // Don't do any more averaging than needed for mmx. It hurts performance and causes rounding errors. +#ifndef IS_MMX + // pavgb mm4, mm6 // 1/4 center, 3/4 adjacent + V_PAVGB ("%%mm4", "%%mm6", "%%mm7", "%[ShiftMask]") + // pavgb mm6, mm4 // 3/8 center, 5/8 adjacent + V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]") +#endif + + // get abs value of possible L2 comb + "movq %%mm6, %%mm4\n\t" // work copy of interp val + "movq %%mm2, %%mm7\n\t" // L2 + "psubusb %%mm4, %%mm7\n\t" // L2 - avg + "movq %%mm4, %%mm5\n\t" // avg + "psubusb %%mm2, %%mm5\n\t" // avg - L2 + "por %%mm7, %%mm5\n\t" // abs(avg-L2) + + // get abs value of possible L2P comb + "movq %%mm0, %%mm7\n\t" // L2P + "psubusb %%mm4, %%mm7\n\t" // L2P - avg + "psubusb %%mm0, %%mm4\n\t" // avg - L2P + "por %%mm7, %%mm4\n\t" // abs(avg-L2P) + + // use L2 or L2P depending upon which makes smaller comb + "psubusb %%mm5, %%mm4\n\t" // see if it goes to zero + "psubusb %%mm5, %%mm5\n\t" // 0 + "pcmpeqb %%mm5, %%mm4\n\t" // if (mm4=0) then FF else 0 + "pcmpeqb %%mm4, %%mm5\n\t" // opposite of mm4 + + // if Comb(L2P) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55 + "pand %%mm2, %%mm5\n\t" // use L2 if mm5 == ff, else 0 + "pand %%mm0, %%mm4\n\t" // use L2P if mm4 = ff, else 0 + "por %%mm5, %%mm4\n\t" // may the best win + + // Inventory: at this point we have the following values: + // mm0 = L2P (or L2) + // mm1 = L1 + // mm2 = L2 (or L2P) + // mm3 = L3 + // mm4 = the best of L2,L2P weave pixel, base upon comb + // mm6 = the avg interpolated value, if we need to use it + + // Let's measure movement, as how much the weave pixel has changed + "movq %%mm2, %%mm7\n\t" + "psubusb %%mm0, %%mm2\n\t" + "psubusb %%mm7, %%mm0\n\t" + "por %%mm2, %%mm0\n\t" // abs value of change, used later + + // Now lets clip our chosen value to be not outside of the range + // of the high/low range L1-L3 by more than MaxComb. + // This allows some comb but limits the damages and also allows more + // detail than a boring oversmoothed clip. 
+ "movq %%mm1, %%mm2\n\t" // copy L1 + // pmaxub mm2, mm3 // use macro + V_PMAXUB ("%%mm2", "%%mm3") // now = Max(L1,L3) + "movq %%mm1, %%mm5\n\t" // copy L1 + // pminub mm5, mm3 // now = Min(L1,L3), use macro + V_PMINUB ("%%mm5", "%%mm3", "%%mm7") + // allow the value to be above the high or below the low by amt of MaxComb + "psubusb %[MaxComb], %%mm5\n\t" // lower min by diff + "paddusb %[MaxComb], %%mm2\n\t" // increase max by diff + // pmaxub mm4, mm5 // now = Max(best,Min(L1,L3) use macro + V_PMAXUB ("%%mm4", "%%mm5") + // pminub mm4, mm2 // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped + V_PMINUB ("%%mm4", "%%mm2", "%%mm7") + + // Blend weave pixel with bob pixel, depending on motion val in mm0 + "psubusb %[MotionThreshold], %%mm0\n\t"// test Threshold, clear chroma change >>>?? + "pmullw %[MotionSense], %%mm0\n\t" // mul by user factor, keep low 16 bits + "movq %[QW256], %%mm7\n\t" +#ifdef HAVE_SSE + "pminsw %%mm7, %%mm0\n\t" // max = 256 +#else + "paddusw %[QW256B], %%mm0\n\t" // add, may sat at fff.. + "psubusw %[QW256B], %%mm0\n\t" // now = Min(L1,256) +#endif + "psubusw %%mm0, %%mm7\n\t" // so the 2 sum to 256, weighted avg + "movq %%mm4, %%mm2\n\t" // save weave chroma info before trashing + "pand %[YMask], %%mm4\n\t" // keep only luma from calc'd value + "pmullw %%mm7, %%mm4\n\t" // use more weave for less motion + "pand %[YMask], %%mm6\n\t" // keep only luma from calc'd value + "pmullw %%mm0, %%mm6\n\t" // use more bob for large motion + "paddusw %%mm6, %%mm4\n\t" // combine + "psrlw $8, %%mm4\n\t" // div by 256 to get weighted avg + + // chroma comes from weave pixel + "pand %[UVMask], %%mm2\n\t" // keep chroma + "por %%mm4, %%mm2\n\t" // and combine + + V_MOVNTQ ("(%%"XDI")", "%%mm2") // move in our clipped best, use macro + + // bump ptrs and loop + LEAX" 8(%%"XAX"), %%"XAX"\n\t" + LEAX" 8(%%"XBX"), %%"XBX"\n\t" + LEAX" 8(%%"XDX"), %%"XDX"\n\t" + LEAX" 8(%%"XDI"), %%"XDI"\n\t" + LEAX" 8(%%"XSI"), %%"XSI"\n\t" + DECX" %[LoopCtr]\n\t" + "jg 1b\n\t" // loop if not to last line + // note P-III default assumes backward branches taken + "jl 1f\n\t" // done + MOVX" %%"XAX", %%"XBX"\n\t" // sharpness lookahead 1 byte only, be wrong on 1 + "jmp 1b\n\t" + + "1:\n\t" + MOVX" %[oldbx], %%"XBX"\n\t" + + : /* no outputs */ + + : [LastAvg] "m"(LastAvg), + [L1] "m"(L1), + [L3] "m"(L3), + [L2P] "m"(L2P), + [L2] "m"(L2), + [Dest] "m"(Dest), + [ShiftMask] "m"(ShiftMask), + [MaxComb] "m"(MaxComb), + [MotionThreshold] "m"(MotionThreshold), + [MotionSense] "m"(MotionSense), + [QW256B] "m"(QW256B), + [YMask] "m"(YMask), + [UVMask] "m"(UVMask), + [LoopCtr] "m"(LoopCtr), + [QW256] "m"(QW256), + [oldbx] "m"(oldbx) + + : XAX, XCX, XDX, XSI, XDI, +#ifdef HAVE_CPU_I386 + "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)", +#endif + "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", + "memory", "cc" + ); + + Dest += object->output_stride; + object->pMemcpy(Dest, L3, object->line_length); + Dest += object->output_stride; + + L1 += Pitch; + L2 += Pitch; + L3 += Pitch; + L2P += Pitch; + } + + if (InfoIsOdd) { + object->pMemcpy(Dest, L2, object->line_length); + } + + // clear out the MMX registers ready for doing floating point again +#ifdef HAVE_CPU_I386 + __asm__ __volatile__ ("emms\n\t"); +#endif +} diff --git a/gst/deinterlace2/tvtime/greedyh.c b/gst/deinterlace2/tvtime/greedyh.c new file mode 100644 index 00000000..623c2d8b --- /dev/null +++ b/gst/deinterlace2/tvtime/greedyh.c @@ -0,0 +1,148 @@ +/* + * + * GStreamer + * Copyright (C) 2004 Billy Biggs <vektor@dumbterm.net> + * 
+ * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + * Relicensed for GStreamer from GPL to LGPL with permit from Billy Biggs. + * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578 + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "greedyh.h" +#include "greedyhmacros.h" + +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <string.h> + +#include "gst/gst.h" +#include "plugins.h" +#include "gstdeinterlace2.h" +#include "speedy.h" + + +#define MAXCOMB_DEFAULT 5 +#define MOTIONTHRESHOLD_DEFAULT 25 +#define MOTIONSENSE_DEFAULT 30 + +unsigned int GreedyMaxComb; + +unsigned int GreedyMotionThreshold; + +unsigned int GreedyMotionSense; + + +#define IS_SSE +#define SSE_TYPE SSE +#define FUNCT_NAME greedyDScaler_SSE +#include "greedyh.asm" +#undef SSE_TYPE +#undef IS_SSE +#undef FUNCT_NAME + +#define IS_3DNOW +#define FUNCT_NAME greedyDScaler_3DNOW +#define SSE_TYPE 3DNOW +#include "greedyh.asm" +#undef SSE_TYPE +#undef IS_3DNOW +#undef FUNCT_NAME + +#define IS_MMX +#define SSE_TYPE MMX +#define FUNCT_NAME greedyDScaler_MMX +#include "greedyh.asm" +#undef SSE_TYPE +#undef IS_MMX +#undef FUNCT_NAME + +void +deinterlace_frame_di_greedyh (GstDeinterlace2 * object) +{ + if (object->cpu_feature_flags & OIL_IMPL_FLAG_SSE) { + greedyh_filter_sse (object); + } else if (object->cpu_feature_flags & OIL_IMPL_FLAG_3DNOW) { + greedyh_filter_3dnow (object); + } else { + greedyh_filter_mmx (object); + } +} + +static deinterlace_method_t greedyh_method = { + 0, //DEINTERLACE_PLUGIN_API_VERSION, + "Motion Adaptive: Advanced Detection", + "AdaptiveAdvanced", + 4, + OIL_IMPL_FLAG_MMX, + 0, + 0, + 0, + 0, + 0, + 0, + deinterlace_frame_di_greedyh, + {"Uses heuristics to detect motion in the input", + "frames and reconstruct image detail where", + "possible. Use this for high quality output", + "even on monitors set to an arbitrary refresh", + "rate.", + "", + "Advanced detection uses linear interpolation", + "where motion is detected, using a four-field", + "buffer. 
This is the Greedy: High Motion", + "deinterlacer from DScaler."} +}; + +deinterlace_method_t * +dscaler_greedyh_get_method (void) +{ + greedyh_init (); + return &greedyh_method; +} + +void +greedyh_init (void) +{ + GreedyMaxComb = MAXCOMB_DEFAULT; + GreedyMotionThreshold = MOTIONTHRESHOLD_DEFAULT; + GreedyMotionSense = MOTIONSENSE_DEFAULT; +} + +void +greedyh_filter_mmx (GstDeinterlace2 * object) +{ + greedyDScaler_MMX (object); +} + +void +greedyh_filter_3dnow (GstDeinterlace2 * object) +{ + greedyDScaler_3DNOW (object); +} + +void +greedyh_filter_sse (GstDeinterlace2 * object) +{ + greedyDScaler_SSE (object); +} diff --git a/gst/deinterlace2/tvtime/greedyh.h b/gst/deinterlace2/tvtime/greedyh.h new file mode 100644 index 00000000..1156836a --- /dev/null +++ b/gst/deinterlace2/tvtime/greedyh.h @@ -0,0 +1,45 @@ +/* + * + * GStreamer + * Copyright (C) 2004 Billy Biggs <vektor@dumbterm.net> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + * Relicensed for GStreamer from GPL to LGPL with permit from Billy Biggs. + * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578 + */ + +#ifndef GREEDYH_H_INCLUDED +#define GREEDYH_H_INCLUDED + +#include "gstdeinterlace2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void greedyh_init( void ); +void greedyh_filter_mmx( GstDeinterlace2 *object ); +void greedyh_filter_3dnow( GstDeinterlace2 *object ); +void greedyh_filter_sse( GstDeinterlace2 *object ); + +#ifdef __cplusplus +}; +#endif + +#endif /* GREEDYH_H_INCLUDED */ diff --git a/gst/deinterlace2/tvtime/greedyhmacros.h b/gst/deinterlace2/tvtime/greedyhmacros.h new file mode 100644 index 00000000..5f65959c --- /dev/null +++ b/gst/deinterlace2/tvtime/greedyhmacros.h @@ -0,0 +1,74 @@ +///////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2001 Tom Barry. All rights reserved. +///////////////////////////////////////////////////////////////////////////// +// +// This file is subject to the terms of the GNU General Public License as +// published by the Free Software Foundation. A copy of this license is +// included with this software distribution in the file COPYING. If you +// do not have a copy, you may obtain a copy by writing to the Free +// Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. +// +// This software is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details +// +///////////////////////////////////////////////////////////////////////////// + +// Define a few macros for CPU dependent instructions. +// I suspect I don't really understand how the C macro preprocessor works but +// this seems to get the job done. 
// TRB 7/01 + +// BEFORE USING THESE YOU MUST SET: + +// #define SSE_TYPE SSE (or MMX or 3DNOW) + +// some macros for pavgb instruction +// V_PAVGB(mmr1, mmr2, mmr work register, smask) mmr2 may = mmrw if you can trash it + +#define V_PAVGB_MMX(mmr1, mmr2, mmrw, smask) \ + "movq "mmr2", "mmrw"\n\t" \ + "pand "smask", "mmrw"\n\t" \ + "psrlw $1, "mmrw"\n\t" \ + "pand "smask", "mmr1"\n\t" \ + "psrlw $1, "mmr1"\n\t" \ + "paddusb "mmrw", "mmr1"\n\t" +#define V_PAVGB_SSE(mmr1, mmr2, mmrw, smask) "pavgb "mmr2", "mmr1"\n\t" +#define V_PAVGB_3DNOW(mmr1, mmr2, mmrw, smask) "pavgusb "mmr2", "mmr1"\n\t" +#define V_PAVGB(mmr1, mmr2, mmrw, smask) V_PAVGB2(mmr1, mmr2, mmrw, smask, SSE_TYPE) +#define V_PAVGB2(mmr1, mmr2, mmrw, smask, ssetyp) V_PAVGB3(mmr1, mmr2, mmrw, smask, ssetyp) +#define V_PAVGB3(mmr1, mmr2, mmrw, smask, ssetyp) V_PAVGB_##ssetyp(mmr1, mmr2, mmrw, smask) + +// some macros for pmaxub instruction +#define V_PMAXUB_MMX(mmr1, mmr2) \ + "psubusb "mmr2", "mmr1"\n\t" \ + "paddusb "mmr2", "mmr1"\n\t" +#define V_PMAXUB_SSE(mmr1, mmr2) "pmaxub "mmr2", "mmr1"\n\t" +#define V_PMAXUB_3DNOW(mmr1, mmr2) V_PMAXUB_MMX(mmr1, mmr2) // use MMX version +#define V_PMAXUB(mmr1, mmr2) V_PMAXUB2(mmr1, mmr2, SSE_TYPE) +#define V_PMAXUB2(mmr1, mmr2, ssetyp) V_PMAXUB3(mmr1, mmr2, ssetyp) +#define V_PMAXUB3(mmr1, mmr2, ssetyp) V_PMAXUB_##ssetyp(mmr1, mmr2) + +// some macros for pminub instruction +// V_PMINUB(mmr1, mmr2, mmr work register) mmr2 may NOT = mmrw +#define V_PMINUB_MMX(mmr1, mmr2, mmrw) \ + "pcmpeqb "mmrw", "mmrw"\n\t" \ + "psubusb "mmr2", "mmrw"\n\t" \ + "paddusb "mmrw", "mmr1"\n\t" \ + "psubusb "mmrw", "mmr1"\n\t" +#define V_PMINUB_SSE(mmr1, mmr2, mmrw) "pminub "mmr2", "mmr1"\n\t" +#define V_PMINUB_3DNOW(mmr1, mmr2, mmrw) V_PMINUB_MMX(mmr1, mmr2, mmrw) // use MMX version +#define V_PMINUB(mmr1, mmr2, mmrw) V_PMINUB2(mmr1, mmr2, mmrw, SSE_TYPE) +#define V_PMINUB2(mmr1, mmr2, mmrw, ssetyp) V_PMINUB3(mmr1, mmr2, mmrw, ssetyp) +#define V_PMINUB3(mmr1, mmr2, mmrw, ssetyp) V_PMINUB_##ssetyp(mmr1, mmr2, mmrw) + +// some macros for movntq instruction +// V_MOVNTQ(mmr1, mmr2) +#define V_MOVNTQ_MMX(mmr1, mmr2) "movq "mmr2", "mmr1"\n\t" +#define V_MOVNTQ_3DNOW(mmr1, mmr2) "movq "mmr2", "mmr1"\n\t" +#define V_MOVNTQ_SSE(mmr1, mmr2) "movntq "mmr2", "mmr1"\n\t" +#define V_MOVNTQ(mmr1, mmr2) V_MOVNTQ2(mmr1, mmr2, SSE_TYPE) +#define V_MOVNTQ2(mmr1, mmr2, ssetyp) V_MOVNTQ3(mmr1, mmr2, ssetyp) +#define V_MOVNTQ3(mmr1, mmr2, ssetyp) V_MOVNTQ_##ssetyp(mmr1, mmr2) + +// end of macros diff --git a/gst/deinterlace2/tvtime/mmx.h b/gst/deinterlace2/tvtime/mmx.h new file mode 100644 index 00000000..3627e61b --- /dev/null +++ b/gst/deinterlace2/tvtime/mmx.h @@ -0,0 +1,723 @@ +/* mmx.h + + MultiMedia eXtensions GCC interface library for IA32. + + To use this library, simply include this header file + and compile with GCC. You MUST have inlining enabled + in order for mmx_ok() to work; this can be done by + simply using -O on the GCC command line. + + Compiling with -DMMX_TRACE will cause detailed trace + output to be sent to stderr for each mmx operation. + This adds lots of code, and obviously slows execution to + a crawl, but can be very useful for debugging. + + THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY + EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT + LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY + AND FITNESS FOR ANY PARTICULAR PURPOSE. + + 1997-98 by H. Dietz and R. 
Fisher + + History: + 97-98* R.Fisher Early versions + 980501 R.Fisher Original Release + 980611* H.Dietz Rewrite, correctly implementing inlines, and + R.Fisher including direct register accesses. + 980616 R.Fisher Release of 980611 as 980616. + 980714 R.Fisher Minor corrections to Makefile, etc. + 980715 R.Fisher mmx_ok() now prevents optimizer from using + clobbered values. + mmx_ok() now checks if cpuid instruction is + available before trying to use it. + 980726* R.Fisher mm_support() searches for AMD 3DNow, Cyrix + Extended MMX, and standard MMX. It returns a + value which is positive if any of these are + supported, and can be masked with constants to + see which. mmx_ok() is now a call to this + 980726* R.Fisher Added i2r support for shift functions + 980919 R.Fisher Fixed AMD extended feature recognition bug. + 980921 R.Fisher Added definition/check for _MMX_H. + Added "float s[2]" to mmx_t for use with + 3DNow and EMMX. So same mmx_t can be used. + 981013 R.Fisher Fixed cpuid function 1 bug (looked at wrong reg) + Fixed psllq_i2r error in mmxtest.c + + * Unreleased (internal or interim) versions + + Notes: + It appears that the latest gas has the pand problem fixed, therefore + I'll undefine BROKEN_PAND by default. + String compares may be quicker than the multiple test/jumps in vendor + test sequence in mmx_ok(), but I'm not concerned with that right now. + + Acknowledgments: + Jussi Laako for pointing out the errors ultimately found to be + connected to the failure to notify the optimizer of clobbered values. + Roger Hardiman for reminding us that CPUID isn't everywhere, and that + someone may actually try to use this on a machine without CPUID. + Also for suggesting code for checking this. + Robert Dale for pointing out the AMD recognition bug. + Jimmy Mayfield and Carl Witty for pointing out the Intel recognition + bug. + Carl Witty for pointing out the psllq_i2r test bug. +*/ + +#ifndef _MMX_H +#define _MMX_H + +/*#define MMX_TRACE */ + +/* Warning: at this writing, the version of GAS packaged + with most Linux distributions does not handle the + parallel AND operation mnemonic correctly. If the + symbol BROKEN_PAND is defined, a slower alternative + coding will be used. If execution of mmxtest results + in an illegal instruction fault, define this symbol. +*/ +#undef BROKEN_PAND + + +/* The type of an value that fits in an MMX register + (note that long long constant values MUST be suffixed + by LL and unsigned long long values by ULL, lest + they be truncated by the compiler) +*/ +typedef union { + long long q; /* Quadword (64-bit) value */ + unsigned long long uq; /* Unsigned Quadword */ + int d[2]; /* 2 Doubleword (32-bit) values */ + unsigned int ud[2]; /* 2 Unsigned Doubleword */ + short w[4]; /* 4 Word (16-bit) values */ + unsigned short uw[4]; /* 4 Unsigned Word */ + char b[8]; /* 8 Byte (8-bit) values */ + unsigned char ub[8]; /* 8 Unsigned Byte */ + float s[2]; /* Single-precision (32-bit) value */ +} mmx_t; + + +/* Function to test if multimedia instructions are supported... +*/ +inline extern int +mm_support(void) +{ + /* Returns 1 if MMX instructions are supported, + 3 if Cyrix MMX and Extended MMX instructions are supported + 5 if AMD MMX and 3DNow! instructions are supported + 0 if hardware does not support any of these + */ + register int rval = 0; + + __asm__ __volatile__ ( + /* See if CPUID instruction is supported ... */ + /* ... Get copies of EFLAGS into eax and ecx */ + "pushf\n\t" + "popl %%eax\n\t" + "movl %%eax, %%ecx\n\t" + + /* ... 
Toggle the ID bit in one copy and store */ + /* to the EFLAGS reg */ + "xorl $0x200000, %%eax\n\t" + "push %%eax\n\t" + "popf\n\t" + + /* ... Get the (hopefully modified) EFLAGS */ + "pushf\n\t" + "popl %%eax\n\t" + + /* ... Compare and test result */ + "xorl %%eax, %%ecx\n\t" + "testl $0x200000, %%ecx\n\t" + "jz NotSupported1\n\t" /* Nothing supported */ + + + /* Get standard CPUID information, and + go to a specific vendor section */ + "movl $0, %%eax\n\t" + "cpuid\n\t" + + /* Check for Intel */ + "cmpl $0x756e6547, %%ebx\n\t" + "jne TryAMD\n\t" + "cmpl $0x49656e69, %%edx\n\t" + "jne TryAMD\n\t" + "cmpl $0x6c65746e, %%ecx\n" + "jne TryAMD\n\t" + "jmp Intel\n\t" + + /* Check for AMD */ + "\nTryAMD:\n\t" + "cmpl $0x68747541, %%ebx\n\t" + "jne TryCyrix\n\t" + "cmpl $0x69746e65, %%edx\n\t" + "jne TryCyrix\n\t" + "cmpl $0x444d4163, %%ecx\n" + "jne TryCyrix\n\t" + "jmp AMD\n\t" + + /* Check for Cyrix */ + "\nTryCyrix:\n\t" + "cmpl $0x69727943, %%ebx\n\t" + "jne NotSupported2\n\t" + "cmpl $0x736e4978, %%edx\n\t" + "jne NotSupported3\n\t" + "cmpl $0x64616574, %%ecx\n\t" + "jne NotSupported4\n\t" + /* Drop through to Cyrix... */ + + + /* Cyrix Section */ + /* See if extended CPUID is supported */ + "movl $0x80000000, %%eax\n\t" + "cpuid\n\t" + "cmpl $0x80000000, %%eax\n\t" + "jl MMXtest\n\t" /* Try standard CPUID instead */ + + /* Extended CPUID supported, so get extended features */ + "movl $0x80000001, %%eax\n\t" + "cpuid\n\t" + "testl $0x00800000, %%eax\n\t" /* Test for MMX */ + "jz NotSupported5\n\t" /* MMX not supported */ + "testl $0x01000000, %%eax\n\t" /* Test for Ext'd MMX */ + "jnz EMMXSupported\n\t" + "movl $1, %0:\n\n\t" /* MMX Supported */ + "jmp Return\n\n" + "EMMXSupported:\n\t" + "movl $3, %0:\n\n\t" /* EMMX and MMX Supported */ + "jmp Return\n\t" + + + /* AMD Section */ + "AMD:\n\t" + + /* See if extended CPUID is supported */ + "movl $0x80000000, %%eax\n\t" + "cpuid\n\t" + "cmpl $0x80000000, %%eax\n\t" + "jl MMXtest\n\t" /* Try standard CPUID instead */ + + /* Extended CPUID supported, so get extended features */ + "movl $0x80000001, %%eax\n\t" + "cpuid\n\t" + "testl $0x00800000, %%edx\n\t" /* Test for MMX */ + "jz NotSupported6\n\t" /* MMX not supported */ + "testl $0x80000000, %%edx\n\t" /* Test for 3DNow! */ + "jnz ThreeDNowSupported\n\t" + "movl $1, %0:\n\n\t" /* MMX Supported */ + "jmp Return\n\n" + "ThreeDNowSupported:\n\t" + "movl $5, %0:\n\n\t" /* 3DNow! and MMX Supported */ + "jmp Return\n\t" + + + /* Intel Section */ + "Intel:\n\t" + + /* Check for MMX */ + "MMXtest:\n\t" + "movl $1, %%eax\n\t" + "cpuid\n\t" + "testl $0x00800000, %%edx\n\t" /* Test for MMX */ + "jz NotSupported7\n\t" /* MMX Not supported */ + "movl $1, %0:\n\n\t" /* MMX Supported */ + "jmp Return\n\t" + + /* Nothing supported */ + "\nNotSupported1:\n\t" + "#movl $101, %0:\n\n\t" + "\nNotSupported2:\n\t" + "#movl $102, %0:\n\n\t" + "\nNotSupported3:\n\t" + "#movl $103, %0:\n\n\t" + "\nNotSupported4:\n\t" + "#movl $104, %0:\n\n\t" + "\nNotSupported5:\n\t" + "#movl $105, %0:\n\n\t" + "\nNotSupported6:\n\t" + "#movl $106, %0:\n\n\t" + "\nNotSupported7:\n\t" + "#movl $107, %0:\n\n\t" + "movl $0, %0:\n\n\t" + + "Return:\n\t" + : "=a" (rval) + : /* no input */ + : "eax", "ebx", "ecx", "edx" + ); + + /* Return */ + return(rval); +} + +/* Function to test if mmx instructions are supported... +*/ +inline extern int +mmx_ok(void) +{ + /* Returns 1 if MMX instructions are supported, 0 otherwise */ + return ( mm_support() & 0x1 ); +} + + +/* Helper functions for the instruction macros that follow... 
+ (note that memory-to-register, m2r, instructions are nearly + as efficient as register-to-register, r2r, instructions; + however, memory-to-memory instructions are really simulated + as a convenience, and are only 1/3 as efficient) +*/ +#ifdef MMX_TRACE + +/* Include the stuff for printing a trace to stderr... +*/ + +#include <stdio.h> + +#define mmx_i2r(op, imm, reg) \ + { \ + mmx_t mmx_trace; \ + mmx_trace = (imm); \ + fprintf(stderr, #op "_i2r(" #imm "=0x%016llx, ", mmx_trace.q); \ + __asm__ __volatile__ ("movq %%" #reg ", %0" \ + : "=X" (mmx_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #reg "=0x%016llx) => ", mmx_trace.q); \ + __asm__ __volatile__ (#op " %0, %%" #reg \ + : /* nothing */ \ + : "X" (imm)); \ + __asm__ __volatile__ ("movq %%" #reg ", %0" \ + : "=X" (mmx_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #reg "=0x%016llx\n", mmx_trace.q); \ + } + +#define mmx_m2r(op, mem, reg) \ + { \ + mmx_t mmx_trace; \ + mmx_trace = (mem); \ + fprintf(stderr, #op "_m2r(" #mem "=0x%016llx, ", mmx_trace.q); \ + __asm__ __volatile__ ("movq %%" #reg ", %0" \ + : "=X" (mmx_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #reg "=0x%016llx) => ", mmx_trace.q); \ + __asm__ __volatile__ (#op " %0, %%" #reg \ + : /* nothing */ \ + : "X" (mem)); \ + __asm__ __volatile__ ("movq %%" #reg ", %0" \ + : "=X" (mmx_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #reg "=0x%016llx\n", mmx_trace.q); \ + } + +#define mmx_r2m(op, reg, mem) \ + { \ + mmx_t mmx_trace; \ + __asm__ __volatile__ ("movq %%" #reg ", %0" \ + : "=X" (mmx_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #op "_r2m(" #reg "=0x%016llx, ", mmx_trace.q); \ + mmx_trace = (mem); \ + fprintf(stderr, #mem "=0x%016llx) => ", mmx_trace.q); \ + __asm__ __volatile__ (#op " %%" #reg ", %0" \ + : "=X" (mem) \ + : /* nothing */ ); \ + mmx_trace = (mem); \ + fprintf(stderr, #mem "=0x%016llx\n", mmx_trace.q); \ + } + +#define mmx_r2r(op, regs, regd) \ + { \ + mmx_t mmx_trace; \ + __asm__ __volatile__ ("movq %%" #regs ", %0" \ + : "=X" (mmx_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #op "_r2r(" #regs "=0x%016llx, ", mmx_trace.q); \ + __asm__ __volatile__ ("movq %%" #regd ", %0" \ + : "=X" (mmx_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #regd "=0x%016llx) => ", mmx_trace.q); \ + __asm__ __volatile__ (#op " %" #regs ", %" #regd); \ + __asm__ __volatile__ ("movq %%" #regd ", %0" \ + : "=X" (mmx_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #regd "=0x%016llx\n", mmx_trace.q); \ + } + +#define mmx_m2m(op, mems, memd) \ + { \ + mmx_t mmx_trace; \ + mmx_trace = (mems); \ + fprintf(stderr, #op "_m2m(" #mems "=0x%016llx, ", mmx_trace.q); \ + mmx_trace = (memd); \ + fprintf(stderr, #memd "=0x%016llx) => ", mmx_trace.q); \ + __asm__ __volatile__ ("movq %0, %%mm0\n\t" \ + #op " %1, %%mm0\n\t" \ + "movq %%mm0, %0" \ + : "=X" (memd) \ + : "X" (mems)); \ + mmx_trace = (memd); \ + fprintf(stderr, #memd "=0x%016llx\n", mmx_trace.q); \ + } + +#else + +/* These macros are a lot simpler without the tracing... 
+*/ + +#define mmx_i2r(op, imm, reg) \ + __asm__ __volatile__ (#op " $" #imm ", %%" #reg \ + : /* nothing */ \ + : /* nothing */); + +#define mmx_m2r(op, mem, reg) \ + __asm__ __volatile__ (#op " %0, %%" #reg \ + : /* nothing */ \ + : "m" (mem)) + +#define mmx_r2m(op, reg, mem) \ + __asm__ __volatile__ (#op " %%" #reg ", %0" \ + : "=m" (mem) \ + : /* nothing */ ) + +#define mmx_r2r(op, regs, regd) \ + __asm__ __volatile__ (#op " %" #regs ", %" #regd) + +#define mmx_m2m(op, mems, memd) \ + __asm__ __volatile__ ("movq %0, %%mm0\n\t" \ + #op " %1, %%mm0\n\t" \ + "movq %%mm0, %0" \ + : "=m" (memd) \ + : "m" (mems)) + +#endif + + +/* 1x64 MOVe Quadword + (this is both a load and a store... + in fact, it is the only way to store) +*/ +#define movq_m2r(var, reg) mmx_m2r(movq, var, reg) +#define movq_r2m(reg, var) mmx_r2m(movq, reg, var) +#define movq_r2r(regs, regd) mmx_r2r(movq, regs, regd) +#define movq(vars, vard) \ + __asm__ __volatile__ ("movq %1, %%mm0\n\t" \ + "movq %%mm0, %0" \ + : "=X" (vard) \ + : "X" (vars)) + + +/* 1x32 MOVe Doubleword + (like movq, this is both load and store... + but is most useful for moving things between + mmx registers and ordinary registers) +*/ +#define movd_m2r(var, reg) mmx_m2r(movd, var, reg) +#define movd_r2m(reg, var) mmx_r2m(movd, reg, var) +#define movd_r2r(regs, regd) mmx_r2r(movd, regs, regd) +#define movd(vars, vard) \ + __asm__ __volatile__ ("movd %1, %%mm0\n\t" \ + "movd %%mm0, %0" \ + : "=X" (vard) \ + : "X" (vars)) + + +/* 2x32, 4x16, and 8x8 Parallel ADDs +*/ +#define paddd_m2r(var, reg) mmx_m2r(paddd, var, reg) +#define paddd_r2r(regs, regd) mmx_r2r(paddd, regs, regd) +#define paddd(vars, vard) mmx_m2m(paddd, vars, vard) + +#define paddw_m2r(var, reg) mmx_m2r(paddw, var, reg) +#define paddw_r2r(regs, regd) mmx_r2r(paddw, regs, regd) +#define paddw(vars, vard) mmx_m2m(paddw, vars, vard) + +#define paddb_m2r(var, reg) mmx_m2r(paddb, var, reg) +#define paddb_r2r(regs, regd) mmx_r2r(paddb, regs, regd) +#define paddb(vars, vard) mmx_m2m(paddb, vars, vard) + + +/* 4x16 and 8x8 Parallel ADDs using Saturation arithmetic +*/ +#define paddsw_m2r(var, reg) mmx_m2r(paddsw, var, reg) +#define paddsw_r2r(regs, regd) mmx_r2r(paddsw, regs, regd) +#define paddsw(vars, vard) mmx_m2m(paddsw, vars, vard) + +#define paddsb_m2r(var, reg) mmx_m2r(paddsb, var, reg) +#define paddsb_r2r(regs, regd) mmx_r2r(paddsb, regs, regd) +#define paddsb(vars, vard) mmx_m2m(paddsb, vars, vard) + + +/* 4x16 and 8x8 Parallel ADDs using Unsigned Saturation arithmetic +*/ +#define paddusw_m2r(var, reg) mmx_m2r(paddusw, var, reg) +#define paddusw_r2r(regs, regd) mmx_r2r(paddusw, regs, regd) +#define paddusw(vars, vard) mmx_m2m(paddusw, vars, vard) + +#define paddusb_m2r(var, reg) mmx_m2r(paddusb, var, reg) +#define paddusb_r2r(regs, regd) mmx_r2r(paddusb, regs, regd) +#define paddusb(vars, vard) mmx_m2m(paddusb, vars, vard) + + +/* 2x32, 4x16, and 8x8 Parallel SUBs +*/ +#define psubd_m2r(var, reg) mmx_m2r(psubd, var, reg) +#define psubd_r2r(regs, regd) mmx_r2r(psubd, regs, regd) +#define psubd(vars, vard) mmx_m2m(psubd, vars, vard) + +#define psubw_m2r(var, reg) mmx_m2r(psubw, var, reg) +#define psubw_r2r(regs, regd) mmx_r2r(psubw, regs, regd) +#define psubw(vars, vard) mmx_m2m(psubw, vars, vard) + +#define psubb_m2r(var, reg) mmx_m2r(psubb, var, reg) +#define psubb_r2r(regs, regd) mmx_r2r(psubb, regs, regd) +#define psubb(vars, vard) mmx_m2m(psubb, vars, vard) + + +/* 4x16 and 8x8 Parallel SUBs using Saturation arithmetic +*/ +#define psubsw_m2r(var, reg) mmx_m2r(psubsw, var, reg) 
+#define psubsw_r2r(regs, regd) mmx_r2r(psubsw, regs, regd) +#define psubsw(vars, vard) mmx_m2m(psubsw, vars, vard) + +#define psubsb_m2r(var, reg) mmx_m2r(psubsb, var, reg) +#define psubsb_r2r(regs, regd) mmx_r2r(psubsb, regs, regd) +#define psubsb(vars, vard) mmx_m2m(psubsb, vars, vard) + + +/* 4x16 and 8x8 Parallel SUBs using Unsigned Saturation arithmetic +*/ +#define psubusw_m2r(var, reg) mmx_m2r(psubusw, var, reg) +#define psubusw_r2r(regs, regd) mmx_r2r(psubusw, regs, regd) +#define psubusw(vars, vard) mmx_m2m(psubusw, vars, vard) + +#define psubusb_m2r(var, reg) mmx_m2r(psubusb, var, reg) +#define psubusb_r2r(regs, regd) mmx_r2r(psubusb, regs, regd) +#define psubusb(vars, vard) mmx_m2m(psubusb, vars, vard) + + +/* 4x16 Parallel MULs giving Low 4x16 portions of results +*/ +#define pmullw_m2r(var, reg) mmx_m2r(pmullw, var, reg) +#define pmullw_r2r(regs, regd) mmx_r2r(pmullw, regs, regd) +#define pmullw(vars, vard) mmx_m2m(pmullw, vars, vard) + + +/* 4x16 Parallel MULs giving High 4x16 portions of results +*/ +#define pmulhw_m2r(var, reg) mmx_m2r(pmulhw, var, reg) +#define pmulhw_r2r(regs, regd) mmx_r2r(pmulhw, regs, regd) +#define pmulhw(vars, vard) mmx_m2m(pmulhw, vars, vard) + + +/* 4x16->2x32 Parallel Mul-ADD + (muls like pmullw, then adds adjacent 16-bit fields + in the multiply result to make the final 2x32 result) +*/ +#define pmaddwd_m2r(var, reg) mmx_m2r(pmaddwd, var, reg) +#define pmaddwd_r2r(regs, regd) mmx_r2r(pmaddwd, regs, regd) +#define pmaddwd(vars, vard) mmx_m2m(pmaddwd, vars, vard) + + +/* 1x64 bitwise AND +*/ +#ifdef BROKEN_PAND +#define pand_m2r(var, reg) \ + { \ + mmx_m2r(pandn, (mmx_t) -1LL, reg); \ + mmx_m2r(pandn, var, reg); \ + } +#define pand_r2r(regs, regd) \ + { \ + mmx_m2r(pandn, (mmx_t) -1LL, regd); \ + mmx_r2r(pandn, regs, regd); \ + } +#define pand(vars, vard) \ + { \ + movq_m2r(vard, mm0); \ + mmx_m2r(pandn, (mmx_t) -1LL, mm0); \ + mmx_m2r(pandn, vars, mm0); \ + movq_r2m(mm0, vard); \ + } +#else +#define pand_m2r(var, reg) mmx_m2r(pand, var, reg) +#define pand_r2r(regs, regd) mmx_r2r(pand, regs, regd) +#define pand(vars, vard) mmx_m2m(pand, vars, vard) +#endif + + +/* 1x64 bitwise AND with Not the destination +*/ +#define pandn_m2r(var, reg) mmx_m2r(pandn, var, reg) +#define pandn_r2r(regs, regd) mmx_r2r(pandn, regs, regd) +#define pandn(vars, vard) mmx_m2m(pandn, vars, vard) + + +/* 1x64 bitwise OR +*/ +#define por_m2r(var, reg) mmx_m2r(por, var, reg) +#define por_r2r(regs, regd) mmx_r2r(por, regs, regd) +#define por(vars, vard) mmx_m2m(por, vars, vard) + + +/* 1x64 bitwise eXclusive OR +*/ +#define pxor_m2r(var, reg) mmx_m2r(pxor, var, reg) +#define pxor_r2r(regs, regd) mmx_r2r(pxor, regs, regd) +#define pxor(vars, vard) mmx_m2m(pxor, vars, vard) + + +/* 2x32, 4x16, and 8x8 Parallel CoMPare for EQuality + (resulting fields are either 0 or -1) +*/ +#define pcmpeqd_m2r(var, reg) mmx_m2r(pcmpeqd, var, reg) +#define pcmpeqd_r2r(regs, regd) mmx_r2r(pcmpeqd, regs, regd) +#define pcmpeqd(vars, vard) mmx_m2m(pcmpeqd, vars, vard) + +#define pcmpeqw_m2r(var, reg) mmx_m2r(pcmpeqw, var, reg) +#define pcmpeqw_r2r(regs, regd) mmx_r2r(pcmpeqw, regs, regd) +#define pcmpeqw(vars, vard) mmx_m2m(pcmpeqw, vars, vard) + +#define pcmpeqb_m2r(var, reg) mmx_m2r(pcmpeqb, var, reg) +#define pcmpeqb_r2r(regs, regd) mmx_r2r(pcmpeqb, regs, regd) +#define pcmpeqb(vars, vard) mmx_m2m(pcmpeqb, vars, vard) + + +/* 2x32, 4x16, and 8x8 Parallel CoMPare for Greater Than + (resulting fields are either 0 or -1) +*/ +#define pcmpgtd_m2r(var, reg) mmx_m2r(pcmpgtd, var, reg) +#define 
pcmpgtd_r2r(regs, regd) mmx_r2r(pcmpgtd, regs, regd) +#define pcmpgtd(vars, vard) mmx_m2m(pcmpgtd, vars, vard) + +#define pcmpgtw_m2r(var, reg) mmx_m2r(pcmpgtw, var, reg) +#define pcmpgtw_r2r(regs, regd) mmx_r2r(pcmpgtw, regs, regd) +#define pcmpgtw(vars, vard) mmx_m2m(pcmpgtw, vars, vard) + +#define pcmpgtb_m2r(var, reg) mmx_m2r(pcmpgtb, var, reg) +#define pcmpgtb_r2r(regs, regd) mmx_r2r(pcmpgtb, regs, regd) +#define pcmpgtb(vars, vard) mmx_m2m(pcmpgtb, vars, vard) + + +/* 1x64, 2x32, and 4x16 Parallel Shift Left Logical +*/ +#define psllq_i2r(imm, reg) mmx_i2r(psllq, imm, reg) +#define psllq_m2r(var, reg) mmx_m2r(psllq, var, reg) +#define psllq_r2r(regs, regd) mmx_r2r(psllq, regs, regd) +#define psllq(vars, vard) mmx_m2m(psllq, vars, vard) + +#define pslld_i2r(imm, reg) mmx_i2r(pslld, imm, reg) +#define pslld_m2r(var, reg) mmx_m2r(pslld, var, reg) +#define pslld_r2r(regs, regd) mmx_r2r(pslld, regs, regd) +#define pslld(vars, vard) mmx_m2m(pslld, vars, vard) + +#define psllw_i2r(imm, reg) mmx_i2r(psllw, imm, reg) +#define psllw_m2r(var, reg) mmx_m2r(psllw, var, reg) +#define psllw_r2r(regs, regd) mmx_r2r(psllw, regs, regd) +#define psllw(vars, vard) mmx_m2m(psllw, vars, vard) + + +/* 1x64, 2x32, and 4x16 Parallel Shift Right Logical +*/ +#define psrlq_i2r(imm, reg) mmx_i2r(psrlq, imm, reg) +#define psrlq_m2r(var, reg) mmx_m2r(psrlq, var, reg) +#define psrlq_r2r(regs, regd) mmx_r2r(psrlq, regs, regd) +#define psrlq(vars, vard) mmx_m2m(psrlq, vars, vard) + +#define psrld_i2r(imm, reg) mmx_i2r(psrld, imm, reg) +#define psrld_m2r(var, reg) mmx_m2r(psrld, var, reg) +#define psrld_r2r(regs, regd) mmx_r2r(psrld, regs, regd) +#define psrld(vars, vard) mmx_m2m(psrld, vars, vard) + +#define psrlw_i2r(imm, reg) mmx_i2r(psrlw, imm, reg) +#define psrlw_m2r(var, reg) mmx_m2r(psrlw, var, reg) +#define psrlw_r2r(regs, regd) mmx_r2r(psrlw, regs, regd) +#define psrlw(vars, vard) mmx_m2m(psrlw, vars, vard) + + +/* 2x32 and 4x16 Parallel Shift Right Arithmetic +*/ +#define psrad_i2r(imm, reg) mmx_i2r(psrad, imm, reg) +#define psrad_m2r(var, reg) mmx_m2r(psrad, var, reg) +#define psrad_r2r(regs, regd) mmx_r2r(psrad, regs, regd) +#define psrad(vars, vard) mmx_m2m(psrad, vars, vard) + +#define psraw_i2r(imm, reg) mmx_i2r(psraw, imm, reg) +#define psraw_m2r(var, reg) mmx_m2r(psraw, var, reg) +#define psraw_r2r(regs, regd) mmx_r2r(psraw, regs, regd) +#define psraw(vars, vard) mmx_m2m(psraw, vars, vard) + + +/* 2x32->4x16 and 4x16->8x8 PACK and Signed Saturate + (packs source and dest fields into dest in that order) +*/ +#define packssdw_m2r(var, reg) mmx_m2r(packssdw, var, reg) +#define packssdw_r2r(regs, regd) mmx_r2r(packssdw, regs, regd) +#define packssdw(vars, vard) mmx_m2m(packssdw, vars, vard) + +#define packsswb_m2r(var, reg) mmx_m2r(packsswb, var, reg) +#define packsswb_r2r(regs, regd) mmx_r2r(packsswb, regs, regd) +#define packsswb(vars, vard) mmx_m2m(packsswb, vars, vard) + + +/* 4x16->8x8 PACK and Unsigned Saturate + (packs source and dest fields into dest in that order) +*/ +#define packuswb_m2r(var, reg) mmx_m2r(packuswb, var, reg) +#define packuswb_r2r(regs, regd) mmx_r2r(packuswb, regs, regd) +#define packuswb(vars, vard) mmx_m2m(packuswb, vars, vard) + + +/* 2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK Low + (interleaves low half of dest with low half of source + as padding in each result field) +*/ +#define punpckldq_m2r(var, reg) mmx_m2r(punpckldq, var, reg) +#define punpckldq_r2r(regs, regd) mmx_r2r(punpckldq, regs, regd) +#define punpckldq(vars, vard) mmx_m2m(punpckldq, vars, vard) + +#define 
punpcklwd_m2r(var, reg) mmx_m2r(punpcklwd, var, reg) +#define punpcklwd_r2r(regs, regd) mmx_r2r(punpcklwd, regs, regd) +#define punpcklwd(vars, vard) mmx_m2m(punpcklwd, vars, vard) + +#define punpcklbw_m2r(var, reg) mmx_m2r(punpcklbw, var, reg) +#define punpcklbw_r2r(regs, regd) mmx_r2r(punpcklbw, regs, regd) +#define punpcklbw(vars, vard) mmx_m2m(punpcklbw, vars, vard) + + +/* 2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK High + (interleaves high half of dest with high half of source + as padding in each result field) +*/ +#define punpckhdq_m2r(var, reg) mmx_m2r(punpckhdq, var, reg) +#define punpckhdq_r2r(regs, regd) mmx_r2r(punpckhdq, regs, regd) +#define punpckhdq(vars, vard) mmx_m2m(punpckhdq, vars, vard) + +#define punpckhwd_m2r(var, reg) mmx_m2r(punpckhwd, var, reg) +#define punpckhwd_r2r(regs, regd) mmx_r2r(punpckhwd, regs, regd) +#define punpckhwd(vars, vard) mmx_m2m(punpckhwd, vars, vard) + +#define punpckhbw_m2r(var, reg) mmx_m2r(punpckhbw, var, reg) +#define punpckhbw_r2r(regs, regd) mmx_r2r(punpckhbw, regs, regd) +#define punpckhbw(vars, vard) mmx_m2m(punpckhbw, vars, vard) + + +/* Empty MMx State + (used to clean-up when going from mmx to float use + of the registers that are shared by both; note that + there is no float-to-mmx operation needed, because + only the float tag word info is corruptible) +*/ +#ifdef MMX_TRACE + +#define emms() \ + { \ + fprintf(stderr, "emms()\n"); \ + __asm__ __volatile__ ("emms"); \ + } + +#else + +#define emms() __asm__ __volatile__ ("emms") + +#endif + +#endif diff --git a/gst/deinterlace2/tvtime/plugins.h b/gst/deinterlace2/tvtime/plugins.h new file mode 100644 index 00000000..0eb90c0b --- /dev/null +++ b/gst/deinterlace2/tvtime/plugins.h @@ -0,0 +1,42 @@ +/* + * + * GStreamer + * Copyright (C) 2004 Billy Biggs <vektor@dumbterm.net> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + * Relicensed for GStreamer from GPL to LGPL with permit from Billy Biggs. 
+ * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578 + */ + +#ifndef TVTIME_PLUGINS_H_INCLUDED +#define TVTIME_PLUGINS_H_INCLUDED + +deinterlace_method_t* dscaler_tomsmocomp_get_method( void ); +deinterlace_method_t* dscaler_greedyh_get_method( void ); +deinterlace_method_t* dscaler_greedyl_get_method( void ); +deinterlace_method_t* dscaler_vfir_get_method( void ); + +//void linear_plugin_init( void ); +//void scalerbob_plugin_init( void ); +//void linearblend_plugin_init( void ); +//void weave_plugin_init( void ); +//void weavetff_plugin_init( void ); +//void weavebff_plugin_init( void ); + +#endif /* TVTIME_PLUGINS_H_INCLUDED */ diff --git a/gst/deinterlace2/tvtime/speedtools.h b/gst/deinterlace2/tvtime/speedtools.h new file mode 100644 index 00000000..677bb5e3 --- /dev/null +++ b/gst/deinterlace2/tvtime/speedtools.h @@ -0,0 +1,54 @@ +/* + * + * GStreamer + * Copyright (C) 2004 Billy Biggs <vektor@dumbterm.net> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + * Relicensed for GStreamer from GPL to LGPL with permit from Billy Biggs. + * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578 + */ + +#ifndef SPEEDTOOLS_H_INCLUDED +#define SPEEDTOOLS_H_INCLUDED + +#define PREFETCH_2048(x) \ + { int *pfetcha = (int *) x; \ + prefetchnta( pfetcha ); \ + prefetchnta( pfetcha + 64 ); \ + prefetchnta( pfetcha + 128 ); \ + prefetchnta( pfetcha + 192 ); \ + pfetcha += 256; \ + prefetchnta( pfetcha ); \ + prefetchnta( pfetcha + 64 ); \ + prefetchnta( pfetcha + 128 ); \ + prefetchnta( pfetcha + 192 ); } + +#define READ_PREFETCH_2048(x) \ + { int *pfetcha = (int *) x; int pfetchtmp; \ + pfetchtmp = pfetcha[ 0 ] + pfetcha[ 16 ] + pfetcha[ 32 ] + pfetcha[ 48 ] + \ + pfetcha[ 64 ] + pfetcha[ 80 ] + pfetcha[ 96 ] + pfetcha[ 112 ] + \ + pfetcha[ 128 ] + pfetcha[ 144 ] + pfetcha[ 160 ] + pfetcha[ 176 ] + \ + pfetcha[ 192 ] + pfetcha[ 208 ] + pfetcha[ 224 ] + pfetcha[ 240 ]; \ + pfetcha += 256; \ + pfetchtmp = pfetcha[ 0 ] + pfetcha[ 16 ] + pfetcha[ 32 ] + pfetcha[ 48 ] + \ + pfetcha[ 64 ] + pfetcha[ 80 ] + pfetcha[ 96 ] + pfetcha[ 112 ] + \ + pfetcha[ 128 ] + pfetcha[ 144 ] + pfetcha[ 160 ] + pfetcha[ 176 ] + \ + pfetcha[ 192 ] + pfetcha[ 208 ] + pfetcha[ 224 ] + pfetcha[ 240 ]; } + +#endif /* SPEEDTOOLS_H_INCLUDED */ diff --git a/gst/deinterlace2/tvtime/speedy.c b/gst/deinterlace2/tvtime/speedy.c new file mode 100644 index 00000000..821cc254 --- /dev/null +++ b/gst/deinterlace2/tvtime/speedy.c @@ -0,0 +1,2791 @@ +/** + * Copyright (c) 2002, 2003 Billy Biggs <vektor@dumbterm.net>. + * Copyright (C) 2001 Matthew J. Marjanovic <maddog@mir.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +/** + * Includes 420to422, 422to444 scaling filters from the MPEG2 reference + * implementation. The v12 source code indicates that they were written + * by Cheung Auyeung <auyeung@mot.com>. The file they were in was: + * + * store.c, picture output routines + * Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. + * + * Disclaimer of Warranty + * + * These software programs are available to the user without any license fee or + * royalty on an "as is" basis. The MPEG Software Simulation Group disclaims + * any and all warranties, whether express, implied, or statuary, including any + * implied warranties or merchantability or of fitness for a particular + * purpose. In no event shall the copyright-holder be liable for any + * incidental, punitive, or consequential damages of any kind whatsoever + * arising from the use of these programs. + * + * This disclaimer of warranty extends to the user of these programs and user's + * customers, employees, agents, transferees, successors, and assigns. + * + * The MPEG Software Simulation Group does not represent or warrant that the + * programs furnished hereunder are free of infringement of any third-party + * patents. + * + * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware, + * are subject to royalty fees to patent holders. Many of these patents are + * general enough such that they are unavoidable regardless of implementation + * design. + * + */ + +/** + * Code for the UYVY to YUYV routine comes from rivatv: + * + * rivatv-convert.c video image conversion routines + * + * Copyright (C) 2002 Stefan Jahn <stefan@lkcc.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <stdio.h> +#include <string.h> + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "gst/gst.h" +#include "gstdeinterlace2.h" +#include "speedy.h" +#include "speedtools.h" +#include "mmx.h" +#include "sse.h" + +// TODO: remove includes +//#include "attributes.h" +//#include "mm_accel.h" + +/* Function pointer definitions. 
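   Each of these pointers is presumably bound once at initialisation to
   either the plain C reference routine or an MMX/MMXEXT variant of the
   same name, depending on the CPU features detected at runtime; callers
   always go through the pointer, for example (buffer names illustrative):

       speedy_memcpy (dest, src, n);
       interpolate_packed422_scanline (out, top, bot, width);

   so the per-architecture implementations below never have to be called
   directly.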
*/ +void (*interpolate_packed422_scanline) (uint8_t * output, uint8_t * top, + uint8_t * bot, int width); +void (*blit_colour_packed422_scanline) (uint8_t * output, + int width, int y, int cb, int cr); +void (*blit_colour_packed4444_scanline) (uint8_t * output, + int width, int alpha, int luma, int cb, int cr); +void (*blit_packed422_scanline) (uint8_t * dest, const uint8_t * src, + int width); +void (*composite_packed4444_to_packed422_scanline) (uint8_t * output, + uint8_t * input, uint8_t * foreground, int width); +void (*composite_packed4444_alpha_to_packed422_scanline) (uint8_t * output, + uint8_t * input, uint8_t * foreground, int width, int alpha); +void (*composite_alphamask_to_packed4444_scanline) (uint8_t * output, + uint8_t * input, uint8_t * mask, int width, int textluma, int textcb, + int textcr); +void (*composite_alphamask_alpha_to_packed4444_scanline) (uint8_t * output, + uint8_t * input, uint8_t * mask, int width, int textluma, int textcb, + int textcr, int alpha); +void (*premultiply_packed4444_scanline) (uint8_t * output, uint8_t * input, + int width); +void (*blend_packed422_scanline) (uint8_t * output, uint8_t * src1, + uint8_t * src2, int width, int pos); +unsigned int (*diff_factor_packed422_scanline) (uint8_t * cur, uint8_t * old, + int width); +unsigned int (*comb_factor_packed422_scanline) (uint8_t * top, uint8_t * mid, + uint8_t * bot, int width); +void (*kill_chroma_packed422_inplace_scanline) (uint8_t * data, int width); + +void (*mirror_packed422_inplace_scanline) (uint8_t * data, int width); + +void (*speedy_memcpy) (void *output, const void *input, size_t size); + +void (*diff_packed422_block8x8) (pulldown_metrics_t * m, uint8_t * old, + uint8_t * new, int os, int ns); +void (*a8_subpix_blit_scanline) (uint8_t * output, uint8_t * input, + int lasta, int startpos, int width); +void (*quarter_blit_vertical_packed422_scanline) (uint8_t * output, + uint8_t * one, uint8_t * three, int width); +void (*subpix_blit_vertical_packed422_scanline) (uint8_t * output, + uint8_t * top, uint8_t * bot, int subpixpos, int width); +void (*packed444_to_nonpremultiplied_packed4444_scanline) (uint8_t * output, + uint8_t * input, int width, int alpha); +void (*aspect_adjust_packed4444_scanline) (uint8_t * output, uint8_t * input, + int width, double pixel_aspect); +void (*packed444_to_packed422_scanline) (uint8_t * output, uint8_t * input, + int width); +void (*packed422_to_packed444_scanline) (uint8_t * output, uint8_t * input, + int width); +void (*packed422_to_packed444_rec601_scanline) (uint8_t * dest, uint8_t * src, + int width); +void (*packed444_to_rgb24_rec601_scanline) (uint8_t * output, uint8_t * input, + int width); +void (*rgb24_to_packed444_rec601_scanline) (uint8_t * output, uint8_t * input, + int width); +void (*rgba32_to_packed4444_rec601_scanline) (uint8_t * output, uint8_t * input, + int width); +void (*invert_colour_packed422_inplace_scanline) (uint8_t * data, int width); + +void (*vfilter_chroma_121_packed422_scanline) (uint8_t * output, int width, + uint8_t * m, uint8_t * t, uint8_t * b); +void (*vfilter_chroma_332_packed422_scanline) (uint8_t * output, int width, + uint8_t * m, uint8_t * t, uint8_t * b); +void (*convert_uyvy_to_yuyv_scanline) (uint8_t * uyvy_buf, uint8_t * yuyv_buf, + int width); +void (*composite_colour4444_alpha_to_packed422_scanline) (uint8_t * output, + uint8_t * input, int af, int y, int cb, int cr, int width, int alpha); + +/** + * result = (1 - alpha)B + alpha*F + * = B - alpha*B + alpha*F + * = B + alpha*(F - B) + */ + +static inline 
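/* multiply_alpha (a, r) below approximates (a * r) / 255 with rounding via
   the usual integer identity: with t = a * r + 0x80 the result is
   (t + (t >> 8)) >> 8, so e.g. multiply_alpha (255, x) == x and
   multiply_alpha (128, 200) == 100. The same add-the-high-byte trick shows
   up again in the MMX/MMXEXT compositing loops further down. */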
__attribute__ ((always_inline, const)) + int multiply_alpha (int a, int r) +{ + int temp; + + temp = (r * a) + 0x80; + return ((temp + (temp >> 8)) >> 8); +} + +static inline __attribute__ ((always_inline, const)) + uint8_t clip255 (int x) +{ + if (x > 255) { + return 255; + } else if (x < 0) { + return 0; + } else { + return x; + } +} + +unsigned long CombJaggieThreshold = 73; + +#ifdef HAVE_CPU_I386 +static unsigned int +comb_factor_packed422_scanline_mmx (uint8_t * top, uint8_t * mid, + uint8_t * bot, int width) +{ + const mmx_t qwYMask = { 0x00ff00ff00ff00ffULL }; + const mmx_t qwOnes = { 0x0001000100010001ULL }; + mmx_t qwThreshold; + + unsigned int temp1, temp2; + + width /= 4; + + qwThreshold.uw[0] = CombJaggieThreshold; + qwThreshold.uw[1] = CombJaggieThreshold; + qwThreshold.uw[2] = CombJaggieThreshold; + qwThreshold.uw[3] = CombJaggieThreshold; + + movq_m2r (qwThreshold, mm0); + movq_m2r (qwYMask, mm1); + movq_m2r (qwOnes, mm2); + pxor_r2r (mm7, mm7); /* mm7 = 0. */ + + while (width--) { + /* Load and keep just the luma. */ + movq_m2r (*top, mm3); + movq_m2r (*mid, mm4); + movq_m2r (*bot, mm5); + + pand_r2r (mm1, mm3); + pand_r2r (mm1, mm4); + pand_r2r (mm1, mm5); + + /* Work out mm6 = (top - mid) * (bot - mid) - ( (top - mid)^2 >> 7 ) */ + psrlw_i2r (1, mm3); + psrlw_i2r (1, mm4); + psrlw_i2r (1, mm5); + + /* mm6 = (top - mid) */ + movq_r2r (mm3, mm6); + psubw_r2r (mm4, mm6); + + /* mm3 = (top - bot) */ + psubw_r2r (mm5, mm3); + + /* mm5 = (bot - mid) */ + psubw_r2r (mm4, mm5); + + /* mm6 = (top - mid) * (bot - mid) */ + pmullw_r2r (mm5, mm6); + + /* mm3 = (top - bot)^2 >> 7 */ + pmullw_r2r (mm3, mm3); /* mm3 = (top - bot)^2 */ + psrlw_i2r (7, mm3); /* mm3 = ((top - bot)^2 >> 7) */ + + /* mm6 is what we want. */ + psubw_r2r (mm3, mm6); + + /* FF's if greater than qwTheshold */ + pcmpgtw_r2r (mm0, mm6); + + /* Add to count if we are greater than threshold */ + pand_r2r (mm2, mm6); + paddw_r2r (mm6, mm7); + + top += 8; + mid += 8; + bot += 8; + } + + movd_r2m (mm7, temp1); + psrlq_i2r (32, mm7); + movd_r2m (mm7, temp2); + temp1 += temp2; + temp2 = temp1; + temp1 >>= 16; + temp1 += temp2 & 0xffff; + + emms (); + + return temp1; +} +#endif + +static unsigned long BitShift = 6; + +static unsigned int +diff_factor_packed422_scanline_c (uint8_t * cur, uint8_t * old, int width) +{ + unsigned int ret = 0; + + width /= 4; + + while (width--) { + unsigned int tmp1 = (cur[0] + cur[2] + cur[4] + cur[6] + 2) >> 2; + + unsigned int tmp2 = (old[0] + old[2] + old[4] + old[6] + 2) >> 2; + + tmp1 = (tmp1 - tmp2); + tmp1 *= tmp1; + tmp1 >>= BitShift; + ret += tmp1; + cur += 8; + old += 8; + } + + return ret; +} + +/* +static unsigned int diff_factor_packed422_scanline_test_c( uint8_t *cur, uint8_t *old, int width ) +{ + unsigned int ret = 0; + + width /= 16; + + while( width-- ) { + unsigned int tmp1 = (cur[ 0 ] + cur[ 2 ] + cur[ 4 ] + cur[ 6 ])>>2; + unsigned int tmp2 = (old[ 0 ] + old[ 2 ] + old[ 4 ] + old[ 6 ])>>2; + tmp1 = (tmp1 - tmp2); + tmp1 *= tmp1; + tmp1 >>= BitShift; + ret += tmp1; + cur += (8*4); + old += (8*4); + } + + return ret; +} +*/ + +#ifdef HAVE_CPU_I386 +static unsigned int +diff_factor_packed422_scanline_mmx (uint8_t * cur, uint8_t * old, int width) +{ + const mmx_t qwYMask = { 0x00ff00ff00ff00ffULL }; + unsigned int temp1, temp2; + + width /= 4; + + movq_m2r (qwYMask, mm1); + movd_m2r (BitShift, mm7); + pxor_r2r (mm0, mm0); + + while (width--) { + movq_m2r (*cur, mm4); + movq_m2r (*old, mm5); + + pand_r2r (mm1, mm4); + pand_r2r (mm1, mm5); + + psubw_r2r (mm5, mm4); /* mm4 = 
Y1 - Y2 */ + pmaddwd_r2r (mm4, mm4); /* mm4 = (Y1 - Y2)^2 */ + psrld_r2r (mm7, mm4); /* divide mm4 by 2^BitShift */ + paddd_r2r (mm4, mm0); /* keep total in mm0 */ + + cur += 8; + old += 8; + } + + movd_r2m (mm0, temp1); + psrlq_i2r (32, mm0); + movd_r2m (mm0, temp2); + temp1 += temp2; + + emms (); + + return temp1; +} +#endif + +// defined in glib/gmacros.h #define ABS(a) (((a) < 0)?-(a):(a)) + +#ifdef HAVE_CPU_I386 +static void +diff_packed422_block8x8_mmx (pulldown_metrics_t * m, uint8_t * old, + uint8_t * new, int os, int ns) +{ + const mmx_t ymask = { 0x00ff00ff00ff00ffULL }; + short out[24]; /* Output buffer for the partial metrics from the mmx code. */ + + uint8_t *outdata = (uint8_t *) out; + + uint8_t *oldp, *newp; + + int i; + + pxor_r2r (mm4, mm4); // 4 even difference sums. + pxor_r2r (mm5, mm5); // 4 odd difference sums. + pxor_r2r (mm7, mm7); // zeros + + oldp = old; + newp = new; + for (i = 4; i; --i) { + // Even difference. + movq_m2r (oldp[0], mm0); + movq_m2r (oldp[8], mm2); + pand_m2r (ymask, mm0); + pand_m2r (ymask, mm2); + oldp += os; + + movq_m2r (newp[0], mm1); + movq_m2r (newp[8], mm3); + pand_m2r (ymask, mm1); + pand_m2r (ymask, mm3); + newp += ns; + + movq_r2r (mm0, mm6); + psubusb_r2r (mm1, mm0); + psubusb_r2r (mm6, mm1); + movq_r2r (mm2, mm6); + psubusb_r2r (mm3, mm2); + psubusb_r2r (mm6, mm3); + + paddw_r2r (mm0, mm4); + paddw_r2r (mm1, mm4); + paddw_r2r (mm2, mm4); + paddw_r2r (mm3, mm4); + + // Odd difference. + movq_m2r (oldp[0], mm0); + movq_m2r (oldp[8], mm2); + pand_m2r (ymask, mm0); + pand_m2r (ymask, mm2); + oldp += os; + + movq_m2r (newp[0], mm1); + movq_m2r (newp[8], mm3); + pand_m2r (ymask, mm1); + pand_m2r (ymask, mm3); + newp += ns; + + movq_r2r (mm0, mm6); + psubusb_r2r (mm1, mm0); + psubusb_r2r (mm6, mm1); + movq_r2r (mm2, mm6); + psubusb_r2r (mm3, mm2); + psubusb_r2r (mm6, mm3); + + paddw_r2r (mm0, mm5); + paddw_r2r (mm1, mm5); + paddw_r2r (mm2, mm5); + paddw_r2r (mm3, mm5); + } + movq_r2m (mm4, outdata[0]); + movq_r2m (mm5, outdata[8]); + + m->e = out[0] + out[1] + out[2] + out[3]; + m->o = out[4] + out[5] + out[6] + out[7]; + m->d = m->e + m->o; + + pxor_r2r (mm4, mm4); // Past spacial noise. + pxor_r2r (mm5, mm5); // Temporal noise. + pxor_r2r (mm6, mm6); // Current spacial noise. 
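/* Roughly (cf. the C fallback below): for this 8x8 block, e and o
   accumulate the luma differences between the old and new frames on even
   and odd lines (d = e + o), while p, t and s are per-column measures of
   combing in the old frame, old-vs-new temporal change, and combing in the
   new frame, as used for pulldown detection. */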
+ + // First loop to measure first four columns + oldp = old; + newp = new; + for (i = 4; i; --i) { + movq_m2r (oldp[0], mm0); + movq_m2r (oldp[os], mm1); + pand_m2r (ymask, mm0); + pand_m2r (ymask, mm1); + oldp += (os * 2); + + movq_m2r (newp[0], mm2); + movq_m2r (newp[ns], mm3); + pand_m2r (ymask, mm2); + pand_m2r (ymask, mm3); + newp += (ns * 2); + + paddw_r2r (mm1, mm4); + paddw_r2r (mm1, mm5); + paddw_r2r (mm3, mm6); + psubw_r2r (mm0, mm4); + psubw_r2r (mm2, mm5); + psubw_r2r (mm2, mm6); + } + movq_r2m (mm4, outdata[0]); + movq_r2m (mm5, outdata[16]); + movq_r2m (mm6, outdata[32]); + + pxor_r2r (mm4, mm4); + pxor_r2r (mm5, mm5); + pxor_r2r (mm6, mm6); + + // Second loop for the last four columns + oldp = old; + newp = new; + for (i = 4; i; --i) { + movq_m2r (oldp[8], mm0); + movq_m2r (oldp[os + 8], mm1); + pand_m2r (ymask, mm0); + pand_m2r (ymask, mm1); + oldp += (os * 2); + + movq_m2r (newp[8], mm2); + movq_m2r (newp[ns + 8], mm3); + pand_m2r (ymask, mm2); + pand_m2r (ymask, mm3); + newp += (ns * 2); + + paddw_r2r (mm1, mm4); + paddw_r2r (mm1, mm5); + paddw_r2r (mm3, mm6); + psubw_r2r (mm0, mm4); + psubw_r2r (mm2, mm5); + psubw_r2r (mm2, mm6); + } + movq_r2m (mm4, outdata[8]); + movq_r2m (mm5, outdata[24]); + movq_r2m (mm6, outdata[40]); + + m->p = m->t = m->s = 0; + for (i = 0; i < 8; i++) { + // FIXME: move abs() into the mmx code! + m->p += ABS (out[i]); + m->t += ABS (out[8 + i]); + m->s += ABS (out[16 + i]); + } + + emms (); +} +#endif + +static void +diff_packed422_block8x8_c (pulldown_metrics_t * m, uint8_t * old, + uint8_t * new, int os, int ns) +{ + int x, y, e = 0, o = 0, s = 0, p = 0, t = 0; + + uint8_t *oldp, *newp; + + m->s = m->p = m->t = 0; + for (x = 8; x; x--) { + oldp = old; + old += 2; + newp = new; + new += 2; + s = p = t = 0; + for (y = 4; y; y--) { + e += ABS (newp[0] - oldp[0]); + o += ABS (newp[ns] - oldp[os]); + s += newp[ns] - newp[0]; + p += oldp[os] - oldp[0]; + t += oldp[os] - newp[0]; + oldp += os << 1; + newp += ns << 1; + } + m->s += ABS (s); + m->p += ABS (p); + m->t += ABS (t); + } + m->e = e; + m->o = o; + m->d = e + o; +} + +static void +packed444_to_packed422_scanline_c (uint8_t * output, uint8_t * input, int width) +{ + width /= 2; + while (width--) { + output[0] = input[0]; + output[1] = input[1]; + output[2] = input[3]; + output[3] = input[2]; + output += 4; + input += 6; + } +} + +static void +packed422_to_packed444_scanline_c (uint8_t * output, uint8_t * input, int width) +{ + width /= 2; + while (width--) { + output[0] = input[0]; + output[1] = input[1]; + output[2] = input[3]; + output[3] = input[2]; + output[4] = input[1]; + output[5] = input[3]; + output += 6; + input += 4; + } +} + +/** + * For the middle pixels, the filter kernel is: + * + * [-1 3 -6 12 -24 80 80 -24 12 -6 3 -1] + */ +static void +packed422_to_packed444_rec601_scanline_c (uint8_t * dest, uint8_t * src, + int width) +{ + int i; + + /* Process two input pixels at a time. Input is [Y'][Cb][Y'][Cr]. 
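   The luma samples and the co-sited chroma are copied through; the missing
   chroma samples in between are interpolated with the 12-tap kernel above
   (its taps sum to 128, so the + 64 and >> 7 give a rounded divide by 128),
   falling back to a simple two-tap chroma average near the left and right
   edges where the full kernel would run off the scanline.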
*/ + for (i = 0; i < width / 2; i++) { + dest[(i * 6) + 0] = src[(i * 4) + 0]; + dest[(i * 6) + 1] = src[(i * 4) + 1]; + dest[(i * 6) + 2] = src[(i * 4) + 3]; + + dest[(i * 6) + 3] = src[(i * 4) + 2]; + if (i > (5 * 2) && i < ((width / 2) - (6 * 2))) { + dest[(i * 6) + 4] = + clip255 ((((80 * (src[(i * 4) + 1] + src[(i * 4) + 5])) + - (24 * (src[(i * 4) - 3] + src[(i * 4) + 9])) + + (12 * (src[(i * 4) - 7] + src[(i * 4) + 13])) + - (6 * (src[(i * 4) - 11] + src[(i * 4) + 17])) + + (3 * (src[(i * 4) - 15] + src[(i * 4) + 21])) + - ((src[(i * 4) - 19] + src[(i * 4) + 25]))) + 64) >> 7); + dest[(i * 6) + 5] = + clip255 ((((80 * (src[(i * 4) + 3] + src[(i * 4) + 7])) + - (24 * (src[(i * 4) - 1] + src[(i * 4) + 11])) + + (12 * (src[(i * 4) - 5] + src[(i * 4) + 15])) + - (6 * (src[(i * 4) - 9] + src[(i * 4) + 19])) + + (3 * (src[(i * 4) - 13] + src[(i * 4) + 23])) + - ((src[(i * 4) - 17] + src[(i * 4) + 27]))) + 64) >> 7); + } else if (i < ((width / 2) - 1)) { + dest[(i * 6) + 4] = (src[(i * 4) + 1] + src[(i * 4) + 5] + 1) >> 1; + dest[(i * 6) + 5] = (src[(i * 4) + 3] + src[(i * 4) + 7] + 1) >> 1; + } else { + dest[(i * 6) + 4] = src[(i * 4) + 1]; + dest[(i * 6) + 5] = src[(i * 4) + 3]; + } + } +} + +#ifdef HAVE_CPU_I386 +static void +vfilter_chroma_121_packed422_scanline_mmx (uint8_t * output, int width, + uint8_t * m, uint8_t * t, uint8_t * b) +{ + int i; + const mmx_t ymask = { 0x00ff00ff00ff00ffULL }; + const mmx_t cmask = { 0xff00ff00ff00ff00ULL }; + + // Get width in bytes. + width *= 2; + i = width / 8; + width -= i * 8; + + movq_m2r (ymask, mm7); + movq_m2r (cmask, mm6); + + while (i--) { + movq_m2r (*t, mm0); + movq_m2r (*b, mm1); + movq_m2r (*m, mm2); + + movq_r2r (mm2, mm3); + pand_r2r (mm7, mm3); + + pand_r2r (mm6, mm0); + pand_r2r (mm6, mm1); + pand_r2r (mm6, mm2); + + psrlq_i2r (8, mm0); + psrlq_i2r (8, mm1); + psrlq_i2r (7, mm2); + + paddw_r2r (mm0, mm2); + paddw_r2r (mm1, mm2); + + psllw_i2r (6, mm2); + pand_r2r (mm6, mm2); + + por_r2r (mm3, mm2); + + movq_r2m (mm2, *output); + output += 8; + t += 8; + b += 8; + m += 8; + } + output++; + t++; + b++; + m++; + while (width--) { + *output = (*t + *b + (*m << 1)) >> 2; + output += 2; + t += 2; + b += 2; + m += 2; + } + + emms (); +} +#endif + +static void +vfilter_chroma_121_packed422_scanline_c (uint8_t * output, int width, + uint8_t * m, uint8_t * t, uint8_t * b) +{ + output++; + t++; + b++; + m++; + while (width--) { + *output = (*t + *b + (*m << 1)) >> 2; + output += 2; + t += 2; + b += 2; + m += 2; + } +} + +#ifdef HAVE_CPU_I386 +static void +vfilter_chroma_332_packed422_scanline_mmx (uint8_t * output, int width, + uint8_t * m, uint8_t * t, uint8_t * b) +{ + int i; + const mmx_t ymask = { 0x00ff00ff00ff00ffULL }; + const mmx_t cmask = { 0xff00ff00ff00ff00ULL }; + + // Get width in bytes. 
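/* MMX path of the 3-3-2 vertical chroma filter: for every chroma byte it
   computes (3*t + 3*m + 2*b) >> 3 while passing the luma bytes of m through
   unchanged (ymask keeps luma, cmask keeps chroma), matching the C fallback
   further down; whatever is left after the 8-byte-wide loop is handled by
   the scalar tail. */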
+ width *= 2; + i = width / 8; + width -= i * 8; + + movq_m2r (ymask, mm7); + movq_m2r (cmask, mm6); + + while (i--) { + movq_m2r (*t, mm0); + movq_m2r (*b, mm1); + movq_m2r (*m, mm2); + + movq_r2r (mm2, mm3); + pand_r2r (mm7, mm3); + + pand_r2r (mm6, mm0); + pand_r2r (mm6, mm1); + pand_r2r (mm6, mm2); + + psrlq_i2r (8, mm0); + psrlq_i2r (7, mm1); + psrlq_i2r (8, mm2); + + movq_r2r (mm0, mm4); + psllw_i2r (1, mm4); + paddw_r2r (mm4, mm0); + + movq_r2r (mm2, mm4); + psllw_i2r (1, mm4); + paddw_r2r (mm4, mm2); + + paddw_r2r (mm0, mm2); + paddw_r2r (mm1, mm2); + + psllw_i2r (5, mm2); + pand_r2r (mm6, mm2); + + por_r2r (mm3, mm2); + + movq_r2m (mm2, *output); + output += 8; + t += 8; + b += 8; + m += 8; + } + output++; + t++; + b++; + m++; + while (width--) { + *output = (3 * *t + 3 * *m + 2 * *b) >> 3; + output += 2; + t += 2; + b += 2; + m += 2; + } + + emms (); +} +#endif + +static void +vfilter_chroma_332_packed422_scanline_c (uint8_t * output, int width, + uint8_t * m, uint8_t * t, uint8_t * b) +{ + output++; + t++; + b++; + m++; + while (width--) { + *output = (3 * *t + 3 * *m + 2 * *b) >> 3; + output += 2; + t += 2; + b += 2; + m += 2; + } +} + +#ifdef HAVE_CPU_I386 +static void +kill_chroma_packed422_inplace_scanline_mmx (uint8_t * data, int width) +{ + const mmx_t ymask = { 0x00ff00ff00ff00ffULL }; + const mmx_t nullchroma = { 0x8000800080008000ULL }; + + movq_m2r (ymask, mm7); + movq_m2r (nullchroma, mm6); + for (; width > 4; width -= 4) { + movq_m2r (*data, mm0); + pand_r2r (mm7, mm0); + paddb_r2r (mm6, mm0); + movq_r2m (mm0, *data); + data += 8; + } + emms (); + + while (width--) { + data[1] = 128; + data += 2; + } +} +#endif + +static void +kill_chroma_packed422_inplace_scanline_c (uint8_t * data, int width) +{ + while (width--) { + data[1] = 128; + data += 2; + } +} + +#ifdef HAVE_CPU_I386 +static void +invert_colour_packed422_inplace_scanline_mmx (uint8_t * data, int width) +{ + const mmx_t allones = { 0xffffffffffffffffULL }; + + movq_m2r (allones, mm1); + for (; width > 4; width -= 4) { + movq_r2r (mm1, mm2); + movq_m2r (*data, mm0); + psubb_r2r (mm0, mm2); + movq_r2m (mm2, *data); + data += 8; + } + emms (); + + width *= 2; + while (width--) { + *data = 255 - *data; + data++; + } +} +#endif + +static void +invert_colour_packed422_inplace_scanline_c (uint8_t * data, int width) +{ + width *= 2; + while (width--) { + *data = 255 - *data; + data++; + } +} + +static void +mirror_packed422_inplace_scanline_c (uint8_t * data, int width) +{ + int x, tmp1, tmp2; + + int width2 = width * 2; + + for (x = 0; x < width; x += 2) { + tmp1 = data[x]; + tmp2 = data[x + 1]; + data[x] = data[width2 - x]; + data[x + 1] = data[width2 - x + 1]; + data[width2 - x] = tmp1; + data[width2 - x + 1] = tmp2; + } +} + +static void +interpolate_packed422_scanline_c (uint8_t * output, uint8_t * top, + uint8_t * bot, int width) +{ + int i; + + for (i = width * 2; i; --i) { + *output++ = ((*top++) + (*bot++)) >> 1; + } +} + +#ifdef HAVE_CPU_I386 +static void +convert_uyvy_to_yuyv_scanline_mmx (uint8_t * uyvy_buf, uint8_t * yuyv_buf, + int width) +{ +#if defined(HAVE_CPU_I386) && !defined(HAVE_CPU_X86_64) + __asm__ __volatile__ (" movl %0, %%esi \n" + " movl %1, %%edi \n" + " movl %2, %%edx \n" " shrl $3, %%edx \n" + /* Process 8 pixels at once */ + "1: movq (%%esi), %%mm0 \n" /* mm0 = Y3V2Y2U2Y1V0Y0U0 */ + " movq 8(%%esi), %%mm2 \n" /* mm2 = Y7V6Y6U6Y5V4Y4U4 */ + " movq %%mm0, %%mm1 \n" /* mm1 = Y3V2Y2U2Y1V0Y0U0 */ + " movq %%mm2, %%mm3 \n" /* mm3 = Y7V6Y6U6Y5V4Y4U4 */ + " psllw $8, %%mm0 \n" /* mm0 = 
V2__U2__V0__U0__ */ + " psrlw $8, %%mm1 \n" /* mm1 = __Y3__Y2__Y1__Y0 */ + " psllw $8, %%mm2 \n" /* mm2 = V6__U6__V4__U4__ */ + " psrlw $8, %%mm3 \n" /* mm3 = __Y7__Y6__Y5__Y4 */ + " por %%mm1, %%mm0 \n" /* mm0 = V2Y3U2Y2V0Y1U0Y0 */ + " por %%mm3, %%mm2 \n" /* mm2 = V6Y7U6Y6V4Y5U4Y4 */ + " movq %%mm0, (%%edi) \n" + " movq %%mm2, 8(%%edi) \n" + " addl $16, %%esi \n" + " addl $16, %%edi \n" + " decl %%edx \n" + " jnz 1b \n" " emms \n" + /* output */ : + /* input */ :"g" (uyvy_buf), "g" (yuyv_buf), "g" (width) + /* clobber registers */ + :"cc", "edx", "esi", "edi"); +#endif +#ifdef HAVE_CPU_X86_64 + __asm__ __volatile__ (" movq %0, %%rsi \n" + " movq %1, %%rdi \n" + " xorq %%rdx, %%rdx \n" + " movl %2, %%edx \n" " shrq $3, %%rdx \n" + /* Process 8 pixels at once */ + "1: movq (%%rsi), %%mm0 \n" /* mm0 = Y3V2Y2U2Y1V0Y0U0 */ + " movq 8(%%rsi), %%mm2 \n" /* mm2 = Y7V6Y6U6Y5V4Y4U4 */ + " movq %%mm0, %%mm1 \n" /* mm1 = Y3V2Y2U2Y1V0Y0U0 */ + " movq %%mm2, %%mm3 \n" /* mm3 = Y7V6Y6U6Y5V4Y4U4 */ + " psllw $8, %%mm0 \n" /* mm0 = V2__U2__V0__U0__ */ + " psrlw $8, %%mm1 \n" /* mm1 = __Y3__Y2__Y1__Y0 */ + " psllw $8, %%mm2 \n" /* mm2 = V6__U6__V4__U4__ */ + " psrlw $8, %%mm3 \n" /* mm3 = __Y7__Y6__Y5__Y4 */ + " por %%mm1, %%mm0 \n" /* mm0 = V2Y3U2Y2V0Y1U0Y0 */ + " por %%mm3, %%mm2 \n" /* mm2 = V6Y7U6Y6V4Y5U4Y4 */ + " movq %%mm0, (%%rdi) \n" + " movq %%mm2, 8(%%rdi) \n" + " addq $16, %%rsi \n" + " addq $16, %%rdi \n" + " decq %%rdx \n" + " jnz 1b \n" " emms \n" + /* output */ : + /* input */ :"g" (uyvy_buf), "g" (yuyv_buf), "g" (width) + /* clobber registers */ + :"cc", "rdx", "rsi", "rdi"); +#endif + if (width & 7) { + uint32_t *uyvy = (uint32_t *) uyvy_buf; + + uint32_t *yuyv = (uint32_t *) yuyv_buf; + + uint32_t val; + + width &= 7; + width >>= 1; + while (width--) { + val = *uyvy++; + val = ((val << 8) & ~0x00FF0000) | ((val >> 8) & ~0x0000FF00); + *yuyv++ = val; + } + } +} +#endif + +static void +convert_uyvy_to_yuyv_scanline_c (uint8_t * uyvy_buf, uint8_t * yuyv_buf, + int width) +{ + uint32_t *uyvy = (uint32_t *) uyvy_buf; + + uint32_t *yuyv = (uint32_t *) yuyv_buf; + + uint32_t val; + + width >>= 1; + while (width--) { + val = *uyvy++; + val = ((val << 8) & ~0x00FF0000) | ((val >> 8) & ~0x0000FF00); + *yuyv++ = val; + } +} + + +#ifdef HAVE_CPU_I386 +static void +interpolate_packed422_scanline_mmx (uint8_t * output, uint8_t * top, + uint8_t * bot, int width) +{ + const mmx_t shiftmask = { 0xfefffefffefffeffULL }; /* To avoid shifting chroma to luma. 
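   The data is packed luma/chroma byte pairs, so the average is computed as
   (top >> 1) + (bot >> 1) on 16-bit words; clearing the low bit of each
   high byte with this mask first guarantees that psrlw cannot shift a
   chroma bit down into the neighbouring luma byte. The price is a
   truncated rather than rounded average, which the pavgb-based MMXEXT
   version below avoids.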
*/ + int i; + + for (i = width / 16; i; --i) { + movq_m2r (*bot, mm0); + movq_m2r (*top, mm1); + movq_m2r (*(bot + 8), mm2); + movq_m2r (*(top + 8), mm3); + movq_m2r (*(bot + 16), mm4); + movq_m2r (*(top + 16), mm5); + movq_m2r (*(bot + 24), mm6); + movq_m2r (*(top + 24), mm7); + pand_m2r (shiftmask, mm0); + pand_m2r (shiftmask, mm1); + pand_m2r (shiftmask, mm2); + pand_m2r (shiftmask, mm3); + pand_m2r (shiftmask, mm4); + pand_m2r (shiftmask, mm5); + pand_m2r (shiftmask, mm6); + pand_m2r (shiftmask, mm7); + psrlw_i2r (1, mm0); + psrlw_i2r (1, mm1); + psrlw_i2r (1, mm2); + psrlw_i2r (1, mm3); + psrlw_i2r (1, mm4); + psrlw_i2r (1, mm5); + psrlw_i2r (1, mm6); + psrlw_i2r (1, mm7); + paddb_r2r (mm1, mm0); + paddb_r2r (mm3, mm2); + paddb_r2r (mm5, mm4); + paddb_r2r (mm7, mm6); + movq_r2m (mm0, *output); + movq_r2m (mm2, *(output + 8)); + movq_r2m (mm4, *(output + 16)); + movq_r2m (mm6, *(output + 24)); + output += 32; + top += 32; + bot += 32; + } + width = (width & 0xf); + + for (i = width / 4; i; --i) { + movq_m2r (*bot, mm0); + movq_m2r (*top, mm1); + pand_m2r (shiftmask, mm0); + pand_m2r (shiftmask, mm1); + psrlw_i2r (1, mm0); + psrlw_i2r (1, mm1); + paddb_r2r (mm1, mm0); + movq_r2m (mm0, *output); + output += 8; + top += 8; + bot += 8; + } + width = width & 0x7; + + /* Handle last few pixels. */ + for (i = width * 2; i; --i) { + *output++ = ((*top++) + (*bot++)) >> 1; + } + + emms (); +} +#endif + +#ifdef HAVE_CPU_I386 +static void +interpolate_packed422_scanline_mmxext (uint8_t * output, uint8_t * top, + uint8_t * bot, int width) +{ + int i; + + for (i = width / 16; i; --i) { + movq_m2r (*bot, mm0); + movq_m2r (*top, mm1); + movq_m2r (*(bot + 8), mm2); + movq_m2r (*(top + 8), mm3); + movq_m2r (*(bot + 16), mm4); + movq_m2r (*(top + 16), mm5); + movq_m2r (*(bot + 24), mm6); + movq_m2r (*(top + 24), mm7); + pavgb_r2r (mm1, mm0); + pavgb_r2r (mm3, mm2); + pavgb_r2r (mm5, mm4); + pavgb_r2r (mm7, mm6); + movntq_r2m (mm0, *output); + movntq_r2m (mm2, *(output + 8)); + movntq_r2m (mm4, *(output + 16)); + movntq_r2m (mm6, *(output + 24)); + output += 32; + top += 32; + bot += 32; + } + width = (width & 0xf); + + for (i = width / 4; i; --i) { + movq_m2r (*bot, mm0); + movq_m2r (*top, mm1); + pavgb_r2r (mm1, mm0); + movntq_r2m (mm0, *output); + output += 8; + top += 8; + bot += 8; + } + width = width & 0x7; + + /* Handle last few pixels. 
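   (each packed 4:2:2 pixel is two bytes, hence width * 2 iterations; note
   that pavgb above rounds the average up while this scalar tail truncates,
   so trailing pixels can differ by one code value from the MMX-covered
   part of the scanline)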
*/ + for (i = width * 2; i; --i) { + *output++ = ((*top++) + (*bot++)) >> 1; + } + + sfence (); + emms (); +} +#endif + +static void +blit_colour_packed422_scanline_c (uint8_t * output, int width, int y, int cb, + int cr) +{ + uint32_t colour = cr << 24 | y << 16 | cb << 8 | y; + + uint32_t *o = (uint32_t *) output; + + for (width /= 2; width; --width) { + *o++ = colour; + } +} + +#ifdef HAVE_CPU_I386 +static void +blit_colour_packed422_scanline_mmx (uint8_t * output, int width, int y, int cb, + int cr) +{ + uint32_t colour = cr << 24 | y << 16 | cb << 8 | y; + + int i; + + movd_m2r (colour, mm1); + movd_m2r (colour, mm2); + psllq_i2r (32, mm1); + por_r2r (mm1, mm2); + + for (i = width / 16; i; --i) { + movq_r2m (mm2, *output); + movq_r2m (mm2, *(output + 8)); + movq_r2m (mm2, *(output + 16)); + movq_r2m (mm2, *(output + 24)); + output += 32; + } + width = (width & 0xf); + + for (i = width / 4; i; --i) { + movq_r2m (mm2, *output); + output += 8; + } + width = (width & 0x7); + + for (i = width / 2; i; --i) { + *((uint32_t *) output) = colour; + output += 4; + } + + if (width & 1) { + *output = y; + *(output + 1) = cb; + } + + emms (); +} +#endif + +#ifdef HAVE_CPU_I386 +static void +blit_colour_packed422_scanline_mmxext (uint8_t * output, int width, int y, + int cb, int cr) +{ + uint32_t colour = cr << 24 | y << 16 | cb << 8 | y; + + int i; + + movd_m2r (colour, mm1); + movd_m2r (colour, mm2); + psllq_i2r (32, mm1); + por_r2r (mm1, mm2); + + for (i = width / 16; i; --i) { + movntq_r2m (mm2, *output); + movntq_r2m (mm2, *(output + 8)); + movntq_r2m (mm2, *(output + 16)); + movntq_r2m (mm2, *(output + 24)); + output += 32; + } + width = (width & 0xf); + + for (i = width / 4; i; --i) { + movntq_r2m (mm2, *output); + output += 8; + } + width = (width & 0x7); + + for (i = width / 2; i; --i) { + *((uint32_t *) output) = colour; + output += 4; + } + + if (width & 1) { + *output = y; + *(output + 1) = cb; + } + + sfence (); + emms (); +} +#endif + +static void +blit_colour_packed4444_scanline_c (uint8_t * output, int width, + int alpha, int luma, int cb, int cr) +{ + int j; + + for (j = 0; j < width; j++) { + *output++ = alpha; + *output++ = luma; + *output++ = cb; + *output++ = cr; + } +} + +#ifdef HAVE_CPU_I386 +static void +blit_colour_packed4444_scanline_mmx (uint8_t * output, int width, + int alpha, int luma, int cb, int cr) +{ + uint32_t colour = (cr << 24) | (cb << 16) | (luma << 8) | alpha; + + int i; + + movd_m2r (colour, mm1); + movd_m2r (colour, mm2); + psllq_i2r (32, mm1); + por_r2r (mm1, mm2); + + for (i = width / 8; i; --i) { + movq_r2m (mm2, *output); + movq_r2m (mm2, *(output + 8)); + movq_r2m (mm2, *(output + 16)); + movq_r2m (mm2, *(output + 24)); + output += 32; + } + width = (width & 0x7); + + for (i = width / 2; i; --i) { + movq_r2m (mm2, *output); + output += 8; + } + width = (width & 0x1); + + if (width) { + *((uint32_t *) output) = colour; + output += 4; + } + + emms (); +} +#endif + +#ifdef HAVE_CPU_I386 +static void +blit_colour_packed4444_scanline_mmxext (uint8_t * output, int width, + int alpha, int luma, int cb, int cr) +{ + uint32_t colour = (cr << 24) | (cb << 16) | (luma << 8) | alpha; + + int i; + + movd_m2r (colour, mm1); + movd_m2r (colour, mm2); + psllq_i2r (32, mm1); + por_r2r (mm1, mm2); + + for (i = width / 8; i; --i) { + movntq_r2m (mm2, *output); + movntq_r2m (mm2, *(output + 8)); + movntq_r2m (mm2, *(output + 16)); + movntq_r2m (mm2, *(output + 24)); + output += 32; + } + width = (width & 0x7); + + for (i = width / 2; i; --i) { + movntq_r2m (mm2, *output); 
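/* movntq is a non-temporal store that streams the pixels past the cache;
   that is why the MMXEXT routines in this file pair it with an sfence ()
   before returning, so the streamed writes are visible to later readers. */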
+ output += 8; + } + width = (width & 0x1); + + if (width) { + *((uint32_t *) output) = colour; + output += 4; + } + + sfence (); + emms (); +} +#endif + + +/** + * Some memcpy code inspired by the xine code which originally came + * from mplayer. + */ + +/* linux kernel __memcpy (from: /include/asm/string.h) */ +#ifdef HAVE_CPU_I386 +static inline __attribute__ ((always_inline, const)) + void small_memcpy (void *to, const void *from, size_t n) +{ + int d0, d1, d2; + + __asm__ __volatile__ ("rep ; movsl\n\t" + "testb $2,%b4\n\t" + "je 1f\n\t" + "movsw\n" + "1:\ttestb $1,%b4\n\t" + "je 2f\n\t" "movsb\n" "2:":"=&c" (d0), "=&D" (d1), "=&S" (d2) + :"0" (n / 4), "q" (n), "1" ((long) to), "2" ((long) from) + :"memory"); +} +#endif + +static void +speedy_memcpy_c (void *dest, const void *src, size_t n) +{ + if (dest != src) { + memcpy (dest, src, n); + } +} + +#ifdef HAVE_CPU_I386 +static void +speedy_memcpy_mmx (void *d, const void *s, size_t n) +{ + const uint8_t *src = s; + + uint8_t *dest = d; + + if (dest != src) { + while (n > 64) { + movq_m2r (src[0], mm0); + movq_m2r (src[8], mm1); + movq_m2r (src[16], mm2); + movq_m2r (src[24], mm3); + movq_m2r (src[32], mm4); + movq_m2r (src[40], mm5); + movq_m2r (src[48], mm6); + movq_m2r (src[56], mm7); + movq_r2m (mm0, dest[0]); + movq_r2m (mm1, dest[8]); + movq_r2m (mm2, dest[16]); + movq_r2m (mm3, dest[24]); + movq_r2m (mm4, dest[32]); + movq_r2m (mm5, dest[40]); + movq_r2m (mm6, dest[48]); + movq_r2m (mm7, dest[56]); + dest += 64; + src += 64; + n -= 64; + } + + while (n > 8) { + movq_m2r (src[0], mm0); + movq_r2m (mm0, dest[0]); + dest += 8; + src += 8; + n -= 8; + } + + if (n) + small_memcpy (dest, src, n); + + emms (); + } +} +#endif + +#ifdef HAVE_CPU_I386 +static void +speedy_memcpy_mmxext (void *d, const void *s, size_t n) +{ + const uint8_t *src = s; + + uint8_t *dest = d; + + if (dest != src) { + while (n > 64) { + movq_m2r (src[0], mm0); + movq_m2r (src[8], mm1); + movq_m2r (src[16], mm2); + movq_m2r (src[24], mm3); + movq_m2r (src[32], mm4); + movq_m2r (src[40], mm5); + movq_m2r (src[48], mm6); + movq_m2r (src[56], mm7); + movntq_r2m (mm0, dest[0]); + movntq_r2m (mm1, dest[8]); + movntq_r2m (mm2, dest[16]); + movntq_r2m (mm3, dest[24]); + movntq_r2m (mm4, dest[32]); + movntq_r2m (mm5, dest[40]); + movntq_r2m (mm6, dest[48]); + movntq_r2m (mm7, dest[56]); + dest += 64; + src += 64; + n -= 64; + } + + while (n > 8) { + movq_m2r (src[0], mm0); + movntq_r2m (mm0, dest[0]); + dest += 8; + src += 8; + n -= 8; + } + + if (n) + small_memcpy (dest, src, n); + + sfence (); + emms (); + } +} +#endif + +static void +blit_packed422_scanline_c (uint8_t * dest, const uint8_t * src, int width) +{ + speedy_memcpy_c (dest, src, width * 2); +} + +#ifdef HAVE_CPU_I386 +static void +blit_packed422_scanline_mmx (uint8_t * dest, const uint8_t * src, int width) +{ + speedy_memcpy_mmx (dest, src, width * 2); +} +#endif + +#ifdef HAVE_CPU_I386 +static void +blit_packed422_scanline_mmxext (uint8_t * dest, const uint8_t * src, int width) +{ + speedy_memcpy_mmxext (dest, src, width * 2); +} +#endif + +static void +composite_colour4444_alpha_to_packed422_scanline_c (uint8_t * output, + uint8_t * input, int af, int y, int cb, int cr, int width, int alpha) +{ + int a = ((af * alpha) + 0x80) >> 8; + + if (a == 0xff) { + blit_colour_packed422_scanline (output, width, y, cb, cr); + } else if (a) { + int i; + + for (i = 0; i < width; i++) { + /** + * (1 - alpha)*B + alpha*F + * (1 - af*a)*B + af*a*F + * B - af*a*B + af*a*F + * B + a*(af*F - af*B) + */ + + output[0] = + 
input[0] + ((alpha * (y - multiply_alpha (af, + input[0])) + 0x80) >> 8); + + if ((i & 1) == 0) { + + /** + * At first I thought I was doing this incorrectly, but + * the following math has convinced me otherwise. + * + * C_r = (1 - alpha)*B + alpha*F + * C_r = B - af*a*B + af*a*F + * + * C_r = 128 + ((1 - af*a)*(B - 128) + a*af*(F - 128)) + * C_r = 128 + (B - af*a*B - 128 + af*a*128 + a*af*F - a*af*128) + * C_r = B - af*a*B + a*af*F + */ + + output[1] = + input[1] + ((alpha * (cb - multiply_alpha (af, + input[1])) + 0x80) >> 8); + output[3] = + input[3] + ((alpha * (cr - multiply_alpha (af, + input[3])) + 0x80) >> 8); + } + output += 2; + input += 2; + } + } +} + +#ifdef HAVE_CPU_I386 +static void +composite_colour4444_alpha_to_packed422_scanline_mmxext (uint8_t * output, + uint8_t * input, int af, int y, int cb, int cr, int width, int alpha) +{ + const mmx_t alpha2 = { 0x0000FFFF00000000ULL }; + const mmx_t alpha1 = { 0xFFFF0000FFFFFFFFULL }; + const mmx_t round = { 0x0080008000800080ULL }; + mmx_t foreground; + + int i; + + if (!alpha) { + blit_packed422_scanline (output, input, width); + return; + } + + foreground.ub[0] = foreground.ub[4] = af; + foreground.ub[1] = foreground.ub[5] = y; + foreground.ub[2] = foreground.ub[6] = cb; + foreground.ub[3] = foreground.ub[7] = cr; + + movq_m2r (alpha, mm2); + pshufw_r2r (mm2, mm2, 0); + pxor_r2r (mm7, mm7); + + for (i = width / 2; i; i--) { + /* mm1 = [ cr ][ y ][ cb ][ y ] */ + movd_m2r (*input, mm1); + punpcklbw_r2r (mm7, mm1); + + movq_m2r (foreground, mm3); + movq_r2r (mm3, mm4); + punpcklbw_r2r (mm7, mm3); + punpckhbw_r2r (mm7, mm4); + /* mm3 and mm4 will be the appropriate colours, mm5 and mm6 for alpha. */ + + /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 0 a ][ 0 a ][ 0 a ] */ + pshufw_r2r (mm3, mm5, 0); + pshufw_r2r (mm4, mm6, 0); + /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 3 cr ][ 0 a ][ 2 cb ][ 1 y ] == 11001000 == 201 */ + pshufw_r2r (mm3, mm3, 201); + /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 1 y ][ 0 a ][ 0 a ] == 00010000 == 16 */ + pshufw_r2r (mm4, mm4, 16); + + pand_m2r (alpha1, mm3); + pand_m2r (alpha2, mm4); + pand_m2r (alpha1, mm5); + pand_m2r (alpha2, mm6); + por_r2r (mm4, mm3); + por_r2r (mm6, mm5); + + /* now, mm5 is af and mm1 is B. Need to multiply them. */ + pmullw_r2r (mm1, mm5); + + /* Multiply by appalpha. */ + pmullw_r2r (mm2, mm3); + paddw_m2r (round, mm3); + psrlw_i2r (8, mm3); + /* Result is now B + F. */ + paddw_r2r (mm3, mm1); + + /* Round up appropriately. */ + paddw_m2r (round, mm5); + + /* mm6 contains our i>>8; */ + movq_r2r (mm5, mm6); + psrlw_i2r (8, mm6); + + /* Add mm6 back into mm5. Now our result is in the high bytes. */ + paddw_r2r (mm6, mm5); + + /* Shift down. */ + psrlw_i2r (8, mm5); + + /* Multiply by appalpha. 
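   appalpha is the per-call opacity parameter `alpha' (nominally 0..256)
   broadcast into mm2 with pshufw, so a plain (x * alpha + 0x80) >> 8 scales
   by alpha/256, mirroring a = ((af * alpha) + 0x80) >> 8 in the C fallback;
   only the per-pixel 0..255 alpha factors need the extra add-the-high-byte
   step used below to approximate division by 255.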
*/ + pmullw_r2r (mm2, mm5); + paddw_m2r (round, mm5); + psrlw_i2r (8, mm5); + + psubusw_r2r (mm5, mm1); + + /* mm1 = [ B + F - af*B ] */ + packuswb_r2r (mm1, mm1); + movd_r2m (mm1, *output); + + output += 4; + input += 4; + } + sfence (); + emms (); +} +#endif + + +static void +composite_packed4444_alpha_to_packed422_scanline_c (uint8_t * output, + uint8_t * input, uint8_t * foreground, int width, int alpha) +{ + int i; + + for (i = 0; i < width; i++) { + int af = foreground[0]; + + if (af) { + int a = ((af * alpha) + 0x80) >> 8; + + + if (a == 0xff) { + output[0] = foreground[1]; + + if ((i & 1) == 0) { + output[1] = foreground[2]; + output[3] = foreground[3]; + } + } else if (a) { + /** + * (1 - alpha)*B + alpha*F + * (1 - af*a)*B + af*a*F + * B - af*a*B + af*a*F + * B + a*(af*F - af*B) + */ + + output[0] = input[0] + + ((alpha * (foreground[1] + - multiply_alpha (foreground[0], input[0])) + 0x80) >> 8); + + if ((i & 1) == 0) { + + /** + * At first I thought I was doing this incorrectly, but + * the following math has convinced me otherwise. + * + * C_r = (1 - alpha)*B + alpha*F + * C_r = B - af*a*B + af*a*F + * + * C_r = 128 + ((1 - af*a)*(B - 128) + a*af*(F - 128)) + * C_r = 128 + (B - af*a*B - 128 + af*a*128 + a*af*F - a*af*128) + * C_r = B - af*a*B + a*af*F + */ + + output[1] = input[1] + ((alpha * (foreground[2] + - multiply_alpha (foreground[0], input[1])) + 0x80) >> 8); + output[3] = input[3] + ((alpha * (foreground[3] + - multiply_alpha (foreground[0], input[3])) + 0x80) >> 8); + } + } + } + foreground += 4; + output += 2; + input += 2; + } +} + +#ifdef HAVE_CPU_I386 +static void +composite_packed4444_alpha_to_packed422_scanline_mmxext (uint8_t * output, + uint8_t * input, uint8_t * foreground, int width, int alpha) +{ + const mmx_t alpha2 = { 0x0000FFFF00000000ULL }; + const mmx_t alpha1 = { 0xFFFF0000FFFFFFFFULL }; + const mmx_t round = { 0x0080008000800080ULL }; + int i; + + if (!alpha) { + blit_packed422_scanline (output, input, width); + return; + } + + if (alpha == 256) { + composite_packed4444_to_packed422_scanline (output, input, foreground, + width); + return; + } + + READ_PREFETCH_2048 (input); + READ_PREFETCH_2048 (foreground); + + movq_m2r (alpha, mm2); + pshufw_r2r (mm2, mm2, 0); + pxor_r2r (mm7, mm7); + + for (i = width / 2; i; i--) { + int fg1 = *((uint32_t *) foreground); + + int fg2 = *(((uint32_t *) foreground) + 1); + + if (fg1 || fg2) { + /* mm1 = [ cr ][ y ][ cb ][ y ] */ + movd_m2r (*input, mm1); + punpcklbw_r2r (mm7, mm1); + + movq_m2r (*foreground, mm3); + movq_r2r (mm3, mm4); + punpcklbw_r2r (mm7, mm3); + punpckhbw_r2r (mm7, mm4); + /* mm3 and mm4 will be the appropriate colours, mm5 and mm6 for alpha. */ + + /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 0 a ][ 0 a ][ 0 a ] */ + pshufw_r2r (mm3, mm5, 0); + pshufw_r2r (mm4, mm6, 0); + /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 3 cr ][ 0 a ][ 2 cb ][ 1 y ] == 11001000 == 201 */ + pshufw_r2r (mm3, mm3, 201); + /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 1 y ][ 0 a ][ 0 a ] == 00010000 == 16 */ + pshufw_r2r (mm4, mm4, 16); + + pand_m2r (alpha1, mm3); + pand_m2r (alpha2, mm4); + pand_m2r (alpha1, mm5); + pand_m2r (alpha2, mm6); + por_r2r (mm4, mm3); + por_r2r (mm6, mm5); + + /* now, mm5 is af and mm1 is B. Need to multiply them. */ + pmullw_r2r (mm1, mm5); + + /* Multiply by appalpha. */ + pmullw_r2r (mm2, mm3); + paddw_m2r (round, mm3); + psrlw_i2r (8, mm3); + /* Result is now B + F. */ + paddw_r2r (mm3, mm1); + + /* Round up appropriately. 
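   i.e. the usual divide-by-255 fixup: mm5 holds the 16-bit products af*B,
   and adding 0x0080 per word, folding the high byte back in and shifting
   right by 8 yields approximately af*B/255 per component, the MMX
   counterpart of multiply_alpha () in the C fallback.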
*/ + paddw_m2r (round, mm5); + + /* mm6 contains our i>>8; */ + movq_r2r (mm5, mm6); + psrlw_i2r (8, mm6); + + /* Add mm6 back into mm5. Now our result is in the high bytes. */ + paddw_r2r (mm6, mm5); + + /* Shift down. */ + psrlw_i2r (8, mm5); + + /* Multiply by appalpha. */ + pmullw_r2r (mm2, mm5); + paddw_m2r (round, mm5); + psrlw_i2r (8, mm5); + + psubusw_r2r (mm5, mm1); + + /* mm1 = [ B + F - af*B ] */ + packuswb_r2r (mm1, mm1); + movd_r2m (mm1, *output); + } + + foreground += 8; + output += 4; + input += 4; + } + sfence (); + emms (); +} +#endif + +static void +composite_packed4444_to_packed422_scanline_c (uint8_t * output, uint8_t * input, + uint8_t * foreground, int width) +{ + int i; + + for (i = 0; i < width; i++) { + int a = foreground[0]; + + if (a == 0xff) { + output[0] = foreground[1]; + + if ((i & 1) == 0) { + output[1] = foreground[2]; + output[3] = foreground[3]; + } + } else if (a) { + /** + * (1 - alpha)*B + alpha*F + * B + af*F - af*B + */ + + output[0] = + input[0] + foreground[1] - multiply_alpha (foreground[0], input[0]); + + if ((i & 1) == 0) { + + /** + * C_r = (1 - af)*B + af*F + * C_r = B - af*B + af*F + */ + + output[1] = + input[1] + foreground[2] - multiply_alpha (foreground[0], input[1]); + output[3] = + input[3] + foreground[3] - multiply_alpha (foreground[0], input[3]); + } + } + foreground += 4; + output += 2; + input += 2; + } +} + + +#ifdef HAVE_CPU_I386 +static void +composite_packed4444_to_packed422_scanline_mmxext (uint8_t * output, + uint8_t * input, uint8_t * foreground, int width) +{ + const mmx_t alpha2 = { 0x0000FFFF00000000ULL }; + const mmx_t alpha1 = { 0xFFFF0000FFFFFFFFULL }; + const mmx_t round = { 0x0080008000800080ULL }; + int i; + + READ_PREFETCH_2048 (input); + READ_PREFETCH_2048 (foreground); + + pxor_r2r (mm7, mm7); + for (i = width / 2; i; i--) { + int fg1 = *((uint32_t *) foreground); + + int fg2 = *(((uint32_t *) foreground) + 1); + + if ((fg1 & 0xff) == 0xff && (fg2 & 0xff) == 0xff) { + movq_m2r (*foreground, mm3); + movq_r2r (mm3, mm4); + punpcklbw_r2r (mm7, mm3); + punpckhbw_r2r (mm7, mm4); + /* mm3 and mm4 will be the appropriate colours, mm5 and mm6 for alpha. */ + /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 3 cr ][ 0 a ][ 2 cb ][ 1 y ] == 11001000 == 201 */ + pshufw_r2r (mm3, mm3, 201); + /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 1 y ][ 0 a ][ 0 a ] == 00010000 == 16 */ + pshufw_r2r (mm4, mm4, 16); + pand_m2r (alpha1, mm3); + pand_m2r (alpha2, mm4); + por_r2r (mm4, mm3); + /* mm1 = [ B + F - af*B ] */ + packuswb_r2r (mm3, mm3); + movd_r2m (mm3, *output); + } else if (fg1 || fg2) { + + /* mm1 = [ cr ][ y ][ cb ][ y ] */ + movd_m2r (*input, mm1); + punpcklbw_r2r (mm7, mm1); + + movq_m2r (*foreground, mm3); + movq_r2r (mm3, mm4); + punpcklbw_r2r (mm7, mm3); + punpckhbw_r2r (mm7, mm4); + /* mm3 and mm4 will be the appropriate colours, mm5 and mm6 for alpha. */ + + /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 0 a ][ 0 a ][ 0 a ] */ + pshufw_r2r (mm3, mm5, 0); + pshufw_r2r (mm4, mm6, 0); + /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 3 cr ][ 0 a ][ 2 cb ][ 1 y ] == 11001000 == 201 */ + pshufw_r2r (mm3, mm3, 201); + /* [ 3 cr ][ 2 cb ][ 1 y ][ 0 a ] -> [ 0 a ][ 1 y ][ 0 a ][ 0 a ] == 00010000 == 16 */ + pshufw_r2r (mm4, mm4, 16); + + pand_m2r (alpha1, mm3); + pand_m2r (alpha2, mm4); + pand_m2r (alpha1, mm5); + pand_m2r (alpha2, mm6); + por_r2r (mm4, mm3); + por_r2r (mm6, mm5); + + /* now, mm5 is af and mm1 is B. Need to multiply them. */ + pmullw_r2r (mm1, mm5); + + /* Result is now B + F. 
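   more precisely, after the paddw below mm1 = B + F, where F is the
   foreground colour treated as already premultiplied by its alpha; mm5
   holds af*B, which is rounded, divided by (roughly) 255 and finally
   subtracted with psubusw, giving B + F - af*B exactly as in the C version
   above.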
*/ + paddw_r2r (mm3, mm1); + + /* Round up appropriately. */ + paddw_m2r (round, mm5); + + /* mm6 contains our i>>8; */ + movq_r2r (mm5, mm6); + psrlw_i2r (8, mm6); + + /* Add mm6 back into mm5. Now our result is in the high bytes. */ + paddw_r2r (mm6, mm5); + + /* Shift down. */ + psrlw_i2r (8, mm5); + + psubusw_r2r (mm5, mm1); + + /* mm1 = [ B + F - af*B ] */ + packuswb_r2r (mm1, mm1); + movd_r2m (mm1, *output); + } + + foreground += 8; + output += 4; + input += 4; + } + sfence (); + emms (); +} +#endif + +/** + * um... just need some scrap paper... + * D = (1 - alpha)*B + alpha*F + * D = (1 - a)*B + a*textluma + * = B - a*B + a*textluma + * = B + a*(textluma - B) + * Da = (1 - a)*b + a + */ +static void +composite_alphamask_to_packed4444_scanline_c (uint8_t * output, + uint8_t * input, + uint8_t * mask, int width, int textluma, int textcb, int textcr) +{ + uint32_t opaque = (textcr << 24) | (textcb << 16) | (textluma << 8) | 0xff; + + int i; + + for (i = 0; i < width; i++) { + int a = *mask; + + if (a == 0xff) { + *((uint32_t *) output) = opaque; + } else if ((input[0] == 0x00)) { + *((uint32_t *) output) = (multiply_alpha (a, textcr) << 24) + | (multiply_alpha (a, textcb) << 16) + | (multiply_alpha (a, textluma) << 8) | a; + } else if (a) { + *((uint32_t *) output) = + ((input[3] + multiply_alpha (a, textcr - input[3])) << 24) + | ((input[2] + multiply_alpha (a, textcb - input[2])) << 16) + | ((input[1] + multiply_alpha (a, textluma - input[1])) << 8) + | (input[0] + multiply_alpha (a, 0xff - input[0])); + } + mask++; + output += 4; + input += 4; + } +} + +#ifdef HAVE_CPU_I386 +static void +composite_alphamask_to_packed4444_scanline_mmxext (uint8_t * output, + uint8_t * input, + uint8_t * mask, int width, int textluma, int textcb, int textcr) +{ + uint32_t opaque = (textcr << 24) | (textcb << 16) | (textluma << 8) | 0xff; + const mmx_t round = { 0x0080008000800080ULL }; + const mmx_t fullalpha = { 0x00000000000000ffULL }; + mmx_t colour; + + colour.w[0] = 0x00; + colour.w[1] = textluma; + colour.w[2] = textcb; + colour.w[3] = textcr; + + movq_m2r (colour, mm1); + movq_r2r (mm1, mm0); + + /* mm0 = [ cr ][ cb ][ y ][ 0xff ] */ + paddw_m2r (fullalpha, mm0); + + /* mm7 = 0 */ + pxor_r2r (mm7, mm7); + + /* mm6 = round */ + movq_m2r (round, mm6); + + while (width--) { + int a = *mask; + + if (a == 0xff) { + *((uint32_t *) output) = opaque; + } else if ((input[0] == 0x00)) { + /* We just need to multiply our colour by the alpha value. */ + + /* mm2 = [ a ][ a ][ a ][ a ] */ + movd_m2r (a, mm2); + movq_r2r (mm2, mm3); + pshufw_r2r (mm2, mm2, 0); + + /* mm5 = [ cr ][ cb ][ y ][ 0 ] */ + movq_r2r (mm1, mm5); + + /* Multiply by alpha. */ + pmullw_r2r (mm2, mm5); + paddw_m2r (round, mm5); + movq_r2r (mm5, mm6); + psrlw_i2r (8, mm6); + paddw_r2r (mm6, mm5); + psrlw_i2r (8, mm5); + + /* Set alpha to a. */ + por_r2r (mm3, mm5); + + /* Pack and write our result. */ + packuswb_r2r (mm5, mm5); + movd_r2m (mm5, *output); + } else if (a) { + /* mm2 = [ a ][ a ][ a ][ a ] */ + movd_m2r (a, mm2); + pshufw_r2r (mm2, mm2, 0); + + /* mm3 = [ cr ][ cb ][ y ][ 0xff ] */ + movq_r2r (mm0, mm3); + + /* mm4 = [ i_cr ][ i_cb ][ i_y ][ i_a ] */ + movd_m2r (*input, mm4); + punpcklbw_r2r (mm7, mm4); + + /* Subtract input and colour. */ + psubw_r2r (mm4, mm3); /* mm3 = mm3 - mm4 */ + + /* Multiply alpha. */ + pmullw_r2r (mm2, mm3); + paddw_r2r (mm6, mm3); + movq_r2r (mm3, mm2); + psrlw_i2r (8, mm3); + paddw_r2r (mm2, mm3); + psrlw_i2r (8, mm3); + + /* Add back in the input. */ + paddb_r2r (mm3, mm4); + + /* Write result. 
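   packuswb packs the four 16-bit components back to bytes with unsigned
   saturation before the 32-bit store; per component the result is
   input + a * (text colour - input) / 255 (rounded), the same blend the C
   fallback composite_alphamask_to_packed4444_scanline_c produces.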
*/ + packuswb_r2r (mm4, mm4); + movd_r2m (mm4, *output); + } + mask++; + output += 4; + input += 4; + } + sfence (); + emms (); +} +#endif + +static void +composite_alphamask_alpha_to_packed4444_scanline_c (uint8_t * output, + uint8_t * input, + uint8_t * mask, int width, int textluma, int textcb, int textcr, int alpha) +{ + uint32_t opaque = (textcr << 24) | (textcb << 16) | (textluma << 8) | 0xff; + + int i; + + for (i = 0; i < width; i++) { + int af = *mask; + + if (af) { + int a = ((af * alpha) + 0x80) >> 8; + + if (a == 0xff) { + *((uint32_t *) output) = opaque; + } else if (input[0] == 0x00) { + *((uint32_t *) output) = (multiply_alpha (a, textcr) << 24) + | (multiply_alpha (a, textcb) << 16) + | (multiply_alpha (a, textluma) << 8) | a; + } else if (a) { + *((uint32_t *) output) = + ((input[3] + multiply_alpha (a, textcr - input[3])) << 24) + | ((input[2] + multiply_alpha (a, textcb - input[2])) << 16) + | ((input[1] + multiply_alpha (a, textluma - input[1])) << 8) + | (a + multiply_alpha (0xff - a, input[0])); + } + } + mask++; + output += 4; + input += 4; + } +} + +static void +premultiply_packed4444_scanline_c (uint8_t * output, uint8_t * input, int width) +{ + while (width--) { + unsigned int cur_a = input[0]; + + *((uint32_t *) output) = (multiply_alpha (cur_a, input[3]) << 24) + | (multiply_alpha (cur_a, input[2]) << 16) + | (multiply_alpha (cur_a, input[1]) << 8) + | cur_a; + + output += 4; + input += 4; + } +} + +#ifdef HAVE_CPU_I386 +static void +premultiply_packed4444_scanline_mmxext (uint8_t * output, uint8_t * input, + int width) +{ + const mmx_t round = { 0x0080008000800080ULL }; + const mmx_t alpha = { 0x00000000000000ffULL }; + const mmx_t noalp = { 0xffffffffffff0000ULL }; + + pxor_r2r (mm7, mm7); + while (width--) { + movd_m2r (*input, mm0); + punpcklbw_r2r (mm7, mm0); + + movq_r2r (mm0, mm2); + pshufw_r2r (mm2, mm2, 0); + movq_r2r (mm2, mm4); + pand_m2r (alpha, mm4); + + pmullw_r2r (mm2, mm0); + paddw_m2r (round, mm0); + + movq_r2r (mm0, mm3); + psrlw_i2r (8, mm3); + paddw_r2r (mm3, mm0); + psrlw_i2r (8, mm0); + + pand_m2r (noalp, mm0); + paddw_r2r (mm4, mm0); + + packuswb_r2r (mm0, mm0); + movd_r2m (mm0, *output); + + output += 4; + input += 4; + } + sfence (); + emms (); +} +#endif + +static void +blend_packed422_scanline_c (uint8_t * output, uint8_t * src1, + uint8_t * src2, int width, int pos) +{ + if (pos == 0) { + blit_packed422_scanline (output, src1, width); + } else if (pos == 256) { + blit_packed422_scanline (output, src2, width); + } else if (pos == 128) { + interpolate_packed422_scanline (output, src1, src2, width); + } else { + width *= 2; + while (width--) { + *output++ = ((*src1++ * (256 - pos)) + (*src2++ * pos) + 0x80) >> 8; + } + } +} + +#ifdef HAVE_CPU_I386 +static void +blend_packed422_scanline_mmxext (uint8_t * output, uint8_t * src1, + uint8_t * src2, int width, int pos) +{ + if (pos <= 0) { + blit_packed422_scanline (output, src1, width); + } else if (pos >= 256) { + blit_packed422_scanline (output, src2, width); + } else if (pos == 128) { + interpolate_packed422_scanline (output, src1, src2, width); + } else { + const mmx_t all256 = { 0x0100010001000100ULL }; + const mmx_t round = { 0x0080008000800080ULL }; + + movd_m2r (pos, mm0); + pshufw_r2r (mm0, mm0, 0); + movq_m2r (all256, mm1); + psubw_r2r (mm0, mm1); + pxor_r2r (mm7, mm7); + + for (width /= 2; width; width--) { + movd_m2r (*src1, mm3); + movd_m2r (*src2, mm4); + punpcklbw_r2r (mm7, mm3); + punpcklbw_r2r (mm7, mm4); + + pmullw_r2r (mm1, mm3); + pmullw_r2r (mm0, mm4); + paddw_r2r (mm4, 
mm3); + paddw_m2r (round, mm3); + psrlw_i2r (8, mm3); + + packuswb_r2r (mm3, mm3); + movd_r2m (mm3, *output); + + output += 4; + src1 += 4; + src2 += 4; + } + sfence (); + emms (); + } +} +#endif + +#ifdef HAVE_CPU_I386 +static void +quarter_blit_vertical_packed422_scanline_mmxext (uint8_t * output, + uint8_t * one, uint8_t * three, int width) +{ + int i; + + for (i = width / 16; i; --i) { + movq_m2r (*one, mm0); + movq_m2r (*three, mm1); + movq_m2r (*(one + 8), mm2); + movq_m2r (*(three + 8), mm3); + movq_m2r (*(one + 16), mm4); + movq_m2r (*(three + 16), mm5); + movq_m2r (*(one + 24), mm6); + movq_m2r (*(three + 24), mm7); + pavgb_r2r (mm1, mm0); + pavgb_r2r (mm1, mm0); + pavgb_r2r (mm3, mm2); + pavgb_r2r (mm3, mm2); + pavgb_r2r (mm5, mm4); + pavgb_r2r (mm5, mm4); + pavgb_r2r (mm7, mm6); + pavgb_r2r (mm7, mm6); + movntq_r2m (mm0, *output); + movntq_r2m (mm2, *(output + 8)); + movntq_r2m (mm4, *(output + 16)); + movntq_r2m (mm6, *(output + 24)); + output += 32; + one += 32; + three += 32; + } + width = (width & 0xf); + + for (i = width / 4; i; --i) { + movq_m2r (*one, mm0); + movq_m2r (*three, mm1); + pavgb_r2r (mm1, mm0); + pavgb_r2r (mm1, mm0); + movntq_r2m (mm0, *output); + output += 8; + one += 8; + three += 8; + } + width = width & 0x7; + + /* Handle last few pixels. */ + for (i = width * 2; i; --i) { + *output++ = (*one + *three + *three + *three + 2) / 4; + one++; + three++; + } + + sfence (); + emms (); +} +#endif + + +static void +quarter_blit_vertical_packed422_scanline_c (uint8_t * output, uint8_t * one, + uint8_t * three, int width) +{ + width *= 2; + while (width--) { + *output++ = (*one + *three + *three + *three + 2) / 4; + one++; + three++; + } +} + +static void +subpix_blit_vertical_packed422_scanline_c (uint8_t * output, uint8_t * top, + uint8_t * bot, int subpixpos, int width) +{ + if (subpixpos == 32768) { + interpolate_packed422_scanline (output, top, bot, width); + } else if (subpixpos == 16384) { + quarter_blit_vertical_packed422_scanline (output, top, bot, width); + } else if (subpixpos == 49152) { + quarter_blit_vertical_packed422_scanline (output, bot, top, width); + } else { + int x; + + width *= 2; + for (x = 0; x < width; x++) { + output[x] = + ((top[x] * subpixpos) + (bot[x] * (0xffff - subpixpos))) >> 16; + } + } +} + +static void +a8_subpix_blit_scanline_c (uint8_t * output, uint8_t * input, + int lasta, int startpos, int width) +{ + int pos = 0xffff - (startpos & 0xffff); + + int prev = lasta; + + int x; + + for (x = 0; x < width; x++) { + output[x] = ((prev * pos) + (input[x] * (0xffff - pos))) >> 16; + prev = input[x]; + } +} + +/** + * These are from lavtools in mjpegtools: + * + * colorspace.c: Routines to perform colorspace conversions. + * + * Copyright (C) 2001 Matthew J. Marjanovic <maddog@mir.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ + +#define FP_BITS 18 + +/* precomputed tables */ + +static int Y_R[256]; + +static int Y_G[256]; + +static int Y_B[256]; + +static int Cb_R[256]; + +static int Cb_G[256]; + +static int Cb_B[256]; + +static int Cr_R[256]; + +static int Cr_G[256]; + +static int Cr_B[256]; + +static int conv_RY_inited = 0; + +static int RGB_Y[256]; + +static int R_Cr[256]; + +static int G_Cb[256]; + +static int G_Cr[256]; + +static int B_Cb[256]; + +static int conv_YR_inited = 0; + +static int +myround (double n) +{ + if (n >= 0) + return (int) (n + 0.5); + else + return (int) (n - 0.5); +} + +static void +init_RGB_to_YCbCr_tables (void) +{ + int i; + + /* + * Q_Z[i] = (coefficient * i + * * (Q-excursion) / (Z-excursion) * fixed-point-factor) + * + * to one of each, add the following: + * + (fixed-point-factor / 2) --- for rounding later + * + (Q-offset * fixed-point-factor) --- to add the offset + * + */ + for (i = 0; i < 256; i++) { + Y_R[i] = + myround (0.299 * (double) i * 219.0 / 255.0 * (double) (1 << FP_BITS)); + Y_G[i] = + myround (0.587 * (double) i * 219.0 / 255.0 * (double) (1 << FP_BITS)); + Y_B[i] = + myround ((0.114 * (double) i * 219.0 / 255.0 * (double) (1 << FP_BITS)) + + (double) (1 << (FP_BITS - 1)) + (16.0 * (double) (1 << FP_BITS))); + + Cb_R[i] = + myround (-0.168736 * (double) i * 224.0 / 255.0 * + (double) (1 << FP_BITS)); + Cb_G[i] = + myround (-0.331264 * (double) i * 224.0 / 255.0 * + (double) (1 << FP_BITS)); + Cb_B[i] = + myround ((0.500 * (double) i * 224.0 / 255.0 * (double) (1 << FP_BITS)) + + (double) (1 << (FP_BITS - 1)) + (128.0 * (double) (1 << FP_BITS))); + + Cr_R[i] = + myround (0.500 * (double) i * 224.0 / 255.0 * (double) (1 << FP_BITS)); + Cr_G[i] = + myround (-0.418688 * (double) i * 224.0 / 255.0 * + (double) (1 << FP_BITS)); + Cr_B[i] = + myround ((-0.081312 * (double) i * 224.0 / 255.0 * + (double) (1 << FP_BITS)) + + (double) (1 << (FP_BITS - 1)) + (128.0 * (double) (1 << FP_BITS))); + } + conv_RY_inited = 1; +} + +static void +init_YCbCr_to_RGB_tables (void) +{ + int i; + + /* + * Q_Z[i] = (coefficient * i + * * (Q-excursion) / (Z-excursion) * fixed-point-factor) + * + * to one of each, add the following: + * + (fixed-point-factor / 2) --- for rounding later + * + (Q-offset * fixed-point-factor) --- to add the offset + * + */ + + /* clip Y values under 16 */ + for (i = 0; i < 16; i++) { + RGB_Y[i] = + myround ((1.0 * (double) (16) * 255.0 / 219.0 * (double) (1 << FP_BITS)) + + (double) (1 << (FP_BITS - 1))); + } + for (i = 16; i < 236; i++) { + RGB_Y[i] = + myround ((1.0 * (double) (i - + 16) * 255.0 / 219.0 * (double) (1 << FP_BITS)) + + (double) (1 << (FP_BITS - 1))); + } + /* clip Y values above 235 */ + for (i = 236; i < 256; i++) { + RGB_Y[i] = + myround ((1.0 * (double) (235) * 255.0 / 219.0 * + (double) (1 << FP_BITS)) + + (double) (1 << (FP_BITS - 1))); + } + + /* clip Cb/Cr values below 16 */ + for (i = 0; i < 16; i++) { + R_Cr[i] = + myround (1.402 * (double) (-112) * 255.0 / 224.0 * + (double) (1 << FP_BITS)); + G_Cr[i] = + myround (-0.714136 * (double) (-112) * 255.0 / 224.0 * + (double) (1 << FP_BITS)); + G_Cb[i] = + myround (-0.344136 * (double) (-112) * 255.0 / 224.0 * + (double) (1 << FP_BITS)); + B_Cb[i] = + myround (1.772 * (double) (-112) * 255.0 / 224.0 * + (double) (1 << FP_BITS)); + } + for (i = 16; i < 241; i++) { + R_Cr[i] = + myround (1.402 * (double) (i - + 128) * 255.0 / 224.0 * (double) (1 << FP_BITS)); + G_Cr[i] = + myround (-0.714136 * (double) (i - + 128) * 255.0 / 224.0 * (double) (1 << FP_BITS)); + G_Cb[i] = + myround 
(-0.344136 * (double) (i - + 128) * 255.0 / 224.0 * (double) (1 << FP_BITS)); + B_Cb[i] = + myround (1.772 * (double) (i - + 128) * 255.0 / 224.0 * (double) (1 << FP_BITS)); + } + /* clip Cb/Cr values above 240 */ + for (i = 241; i < 256; i++) { + R_Cr[i] = + myround (1.402 * (double) (112) * 255.0 / 224.0 * + (double) (1 << FP_BITS)); + G_Cr[i] = + myround (-0.714136 * (double) (112) * 255.0 / 224.0 * + (double) (1 << FP_BITS)); + G_Cb[i] = + myround (-0.344136 * (double) (i - + 128) * 255.0 / 224.0 * (double) (1 << FP_BITS)); + B_Cb[i] = + myround (1.772 * (double) (112) * 255.0 / 224.0 * + (double) (1 << FP_BITS)); + } + conv_YR_inited = 1; +} + +static void +rgb24_to_packed444_rec601_scanline_c (uint8_t * output, uint8_t * input, + int width) +{ + if (!conv_RY_inited) + init_RGB_to_YCbCr_tables (); + + while (width--) { + int r = input[0]; + + int g = input[1]; + + int b = input[2]; + + output[0] = (Y_R[r] + Y_G[g] + Y_B[b]) >> FP_BITS; + output[1] = (Cb_R[r] + Cb_G[g] + Cb_B[b]) >> FP_BITS; + output[2] = (Cr_R[r] + Cr_G[g] + Cr_B[b]) >> FP_BITS; + output += 3; + input += 3; + } +} + +static void +rgba32_to_packed4444_rec601_scanline_c (uint8_t * output, uint8_t * input, + int width) +{ + if (!conv_RY_inited) + init_RGB_to_YCbCr_tables (); + + while (width--) { + int r = input[0]; + + int g = input[1]; + + int b = input[2]; + + int a = input[3]; + + output[0] = a; + output[1] = (Y_R[r] + Y_G[g] + Y_B[b]) >> FP_BITS; + output[2] = (Cb_R[r] + Cb_G[g] + Cb_B[b]) >> FP_BITS; + output[3] = (Cr_R[r] + Cr_G[g] + Cr_B[b]) >> FP_BITS; + output += 4; + input += 4; + } +} + +static void +packed444_to_rgb24_rec601_scanline_c (uint8_t * output, uint8_t * input, + int width) +{ + if (!conv_YR_inited) + init_YCbCr_to_RGB_tables (); + + while (width--) { + int luma = input[0]; + + int cb = input[1]; + + int cr = input[2]; + + output[0] = clip255 ((RGB_Y[luma] + R_Cr[cr]) >> FP_BITS); + output[1] = clip255 ((RGB_Y[luma] + G_Cb[cb] + G_Cr[cr]) >> FP_BITS); + output[2] = clip255 ((RGB_Y[luma] + B_Cb[cb]) >> FP_BITS); + + output += 3; + input += 3; + } +} + +/** + * 601 numbers: + * + * Y' = 0.299*R' + 0.587*G' + 0.114*B' (in 0.0 to 1.0) + * Cb = -0.169*R' - 0.331*G' + 0.500*B' (in -0.5 to +0.5) + * Cr = 0.500*R' - 0.419*G' - 0.081*B' (in -0.5 to +0.5) + * + * Inverse: + * Y Cb Cr + * R 1.0000 -0.0009 1.4017 + * G 1.0000 -0.3437 -0.7142 + * B 1.0000 1.7722 0.0010 + * + * S170M numbers: + * Y' = 0.299*R' + 0.587*G' + 0.114*B' (in 0.0 to 1.0) + * B-Y' = -0.299*R' - 0.587*G' + 0.886*B' + * R-Y' = 0.701*R' - 0.587*G' - 0.114*B' + */ +/* +static void packed444_to_rgb24_rec601_reference_scanline( uint8_t *output, uint8_t *input, int width ) +{ + while( width-- ) { + double yp = (((double) input[ 0 ]) - 16.0) / 255.0; + double cb = (((double) input[ 1 ]) - 128.0) / 255.0; + double cr = (((double) input[ 2 ]) - 128.0) / 255.0; + double r, g, b; + + r = yp - (0.0009*cb) + (1.4017*cr); + g = yp - (0.3437*cb) - (0.7142*cr); + b = yp + (1.7722*cb) + (0.0010*cr); + + if( r > 1.0 ) r = 1.0; else if( r < 0.0 ) r = 0.0; + if( g > 1.0 ) g = 1.0; else if( g < 0.0 ) g = 0.0; + if( b > 1.0 ) b = 1.0; else if( b < 0.0 ) b = 0.0; + + output[ 0 ] = (int) ((r * 255.0) + 0.5); + output[ 1 ] = (int) ((g * 255.0) + 0.5); + output[ 2 ] = (int) ((b * 255.0) + 0.5); + + output += 3; + input += 3; + } +} +*/ + +static void +packed444_to_nonpremultiplied_packed4444_scanline_c (uint8_t * output, + uint8_t * input, int width, int alpha) +{ + int i; + + for (i = 0; i < width; i++) { + output[0] = alpha & 0xff; + output[1] = input[0] & 
0xff; + output[2] = input[1] & 0xff; + output[3] = input[2] & 0xff; + + output += 4; + input += 3; + } +} + +static void +aspect_adjust_packed4444_scanline_c (uint8_t * output, + uint8_t * input, int width, double pixel_aspect) +{ + double i; + + int prev_i = 0; + + int w = 0; + + pixel_aspect = 1.0 / pixel_aspect; + + for (i = 0.0; i < width; i += pixel_aspect) { + uint8_t *curin = input + ((int) i) * 4; + + if (!prev_i) { + output[0] = curin[0]; + output[1] = curin[1]; + output[2] = curin[2]; + output[3] = curin[3]; + } else { + int avg_a = 0; + + int avg_y = 0; + + int avg_cb = 0; + + int avg_cr = 0; + + int pos = prev_i * 4; + + int c = 0; + + int j; + + for (j = prev_i; j <= (int) i; j++) { + avg_a += input[pos++]; + avg_y += input[pos++]; + avg_cb += input[pos++]; + avg_cr += input[pos++]; + c++; + } + output[0] = avg_a / c; + output[1] = avg_y / c; + output[2] = avg_cb / c; + output[3] = avg_cr / c; + } + output += 4; + prev_i = (int) i; + w++; + } +} + +static uint32_t speedy_accel; + +void +setup_speedy_calls (uint32_t accel, int verbose) +{ + speedy_accel = accel; + + interpolate_packed422_scanline = interpolate_packed422_scanline_c; + blit_colour_packed422_scanline = blit_colour_packed422_scanline_c; + blit_colour_packed4444_scanline = blit_colour_packed4444_scanline_c; + blit_packed422_scanline = blit_packed422_scanline_c; + composite_packed4444_to_packed422_scanline = + composite_packed4444_to_packed422_scanline_c; + composite_packed4444_alpha_to_packed422_scanline = + composite_packed4444_alpha_to_packed422_scanline_c; + composite_alphamask_to_packed4444_scanline = + composite_alphamask_to_packed4444_scanline_c; + composite_alphamask_alpha_to_packed4444_scanline = + composite_alphamask_alpha_to_packed4444_scanline_c; + premultiply_packed4444_scanline = premultiply_packed4444_scanline_c; + blend_packed422_scanline = blend_packed422_scanline_c; + comb_factor_packed422_scanline = 0; + diff_factor_packed422_scanline = diff_factor_packed422_scanline_c; + kill_chroma_packed422_inplace_scanline = + kill_chroma_packed422_inplace_scanline_c; + mirror_packed422_inplace_scanline = mirror_packed422_inplace_scanline_c; + speedy_memcpy = speedy_memcpy_c; + diff_packed422_block8x8 = diff_packed422_block8x8_c; + a8_subpix_blit_scanline = a8_subpix_blit_scanline_c; + quarter_blit_vertical_packed422_scanline = + quarter_blit_vertical_packed422_scanline_c; + subpix_blit_vertical_packed422_scanline = + subpix_blit_vertical_packed422_scanline_c; + packed444_to_nonpremultiplied_packed4444_scanline = + packed444_to_nonpremultiplied_packed4444_scanline_c; + aspect_adjust_packed4444_scanline = aspect_adjust_packed4444_scanline_c; + packed444_to_packed422_scanline = packed444_to_packed422_scanline_c; + packed422_to_packed444_scanline = packed422_to_packed444_scanline_c; + packed422_to_packed444_rec601_scanline = + packed422_to_packed444_rec601_scanline_c; + packed444_to_rgb24_rec601_scanline = packed444_to_rgb24_rec601_scanline_c; + rgb24_to_packed444_rec601_scanline = rgb24_to_packed444_rec601_scanline_c; + rgba32_to_packed4444_rec601_scanline = rgba32_to_packed4444_rec601_scanline_c; + invert_colour_packed422_inplace_scanline = + invert_colour_packed422_inplace_scanline_c; + vfilter_chroma_121_packed422_scanline = + vfilter_chroma_121_packed422_scanline_c; + vfilter_chroma_332_packed422_scanline = + vfilter_chroma_332_packed422_scanline_c; + convert_uyvy_to_yuyv_scanline = convert_uyvy_to_yuyv_scanline_c; + composite_colour4444_alpha_to_packed422_scanline = + 
composite_colour4444_alpha_to_packed422_scanline_c; + +#ifdef HAVE_CPU_I386 + if (speedy_accel & OIL_IMPL_FLAG_MMXEXT) { + if (verbose) { + fprintf (stderr, "speedycode: Using MMXEXT optimized functions.\n"); + } + interpolate_packed422_scanline = interpolate_packed422_scanline_mmxext; + blit_colour_packed422_scanline = blit_colour_packed422_scanline_mmxext; + blit_colour_packed4444_scanline = blit_colour_packed4444_scanline_mmxext; + blit_packed422_scanline = blit_packed422_scanline_mmxext; + composite_packed4444_to_packed422_scanline = + composite_packed4444_to_packed422_scanline_mmxext; + composite_packed4444_alpha_to_packed422_scanline = + composite_packed4444_alpha_to_packed422_scanline_mmxext; + composite_alphamask_to_packed4444_scanline = + composite_alphamask_to_packed4444_scanline_mmxext; + premultiply_packed4444_scanline = premultiply_packed4444_scanline_mmxext; + kill_chroma_packed422_inplace_scanline = + kill_chroma_packed422_inplace_scanline_mmx; + blend_packed422_scanline = blend_packed422_scanline_mmxext; + diff_factor_packed422_scanline = diff_factor_packed422_scanline_mmx; + comb_factor_packed422_scanline = comb_factor_packed422_scanline_mmx; + diff_packed422_block8x8 = diff_packed422_block8x8_mmx; + quarter_blit_vertical_packed422_scanline = + quarter_blit_vertical_packed422_scanline_mmxext; + invert_colour_packed422_inplace_scanline = + invert_colour_packed422_inplace_scanline_mmx; + vfilter_chroma_121_packed422_scanline = + vfilter_chroma_121_packed422_scanline_mmx; + vfilter_chroma_332_packed422_scanline = + vfilter_chroma_332_packed422_scanline_mmx; + convert_uyvy_to_yuyv_scanline = convert_uyvy_to_yuyv_scanline_mmx; + composite_colour4444_alpha_to_packed422_scanline = + composite_colour4444_alpha_to_packed422_scanline_mmxext; + speedy_memcpy = speedy_memcpy_mmxext; + } else if (speedy_accel & OIL_IMPL_FLAG_MMX) { + if (verbose) { + fprintf (stderr, "speedycode: Using MMX optimized functions.\n"); + } + interpolate_packed422_scanline = interpolate_packed422_scanline_mmx; + blit_colour_packed422_scanline = blit_colour_packed422_scanline_mmx; + blit_colour_packed4444_scanline = blit_colour_packed4444_scanline_mmx; + blit_packed422_scanline = blit_packed422_scanline_mmx; + diff_factor_packed422_scanline = diff_factor_packed422_scanline_mmx; + comb_factor_packed422_scanline = comb_factor_packed422_scanline_mmx; + kill_chroma_packed422_inplace_scanline = + kill_chroma_packed422_inplace_scanline_mmx; + diff_packed422_block8x8 = diff_packed422_block8x8_mmx; + invert_colour_packed422_inplace_scanline = + invert_colour_packed422_inplace_scanline_mmx; + vfilter_chroma_121_packed422_scanline = + vfilter_chroma_121_packed422_scanline_mmx; + vfilter_chroma_332_packed422_scanline = + vfilter_chroma_332_packed422_scanline_mmx; + convert_uyvy_to_yuyv_scanline = convert_uyvy_to_yuyv_scanline_mmx; + speedy_memcpy = speedy_memcpy_mmx; + } else { + if (verbose) { + fprintf (stderr, + "speedycode: No MMX or MMXEXT support detected, using C fallbacks.\n"); + } + } +#endif +} + +uint32_t +speedy_get_accel (void) +{ + return speedy_accel; +} diff --git a/gst/deinterlace2/tvtime/speedy.h b/gst/deinterlace2/tvtime/speedy.h new file mode 100644 index 00000000..fb833ff1 --- /dev/null +++ b/gst/deinterlace2/tvtime/speedy.h @@ -0,0 +1,308 @@ +/* + * + * GStreamer + * Copyright (C) 2004 Billy Biggs <vektor@dumbterm.net> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software 
Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + * Relicensed for GStreamer from GPL to LGPL with permit from Billy Biggs. + * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578 + */ + +#ifndef SPEEDY_H_INCLUDED +#define SPEEDY_H_INCLUDED + +#if defined (__SVR4) && defined (__sun) +# include <sys/int_types.h> +#else +# include <stdint.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Speedy is a collection of optimized functions plus their C fallbacks. + * This includes a simple system to select which functions to use + * at runtime. + * + * The optimizations are done with the help of the mmx.h system, from + * libmpeg2 by Michel Lespinasse and Aaron Holtzman. + * + * The library is a collection of function pointers which must be first + * initialized by setup_speedy_calls() to point at the fastest available + * implementation of each function. + */ + +/** + * Struct for pulldown detection metrics. + */ +typedef struct pulldown_metrics_s { + /* difference: total, even lines, odd lines */ + int d, e, o; + /* noise: temporal, spacial (current), spacial (past) */ + int t, s, p; +} pulldown_metrics_t; + +/** + * Interpolates a packed 4:2:2 scanline using linear interpolation. + */ +extern void (*interpolate_packed422_scanline)( uint8_t *output, uint8_t *top, + uint8_t *bot, int width ); + +/** + * Blits a colour to a packed 4:2:2 scanline. + */ +extern void (*blit_colour_packed422_scanline)( uint8_t *output, + int width, int y, int cb, int cr ); + +/** + * Blits a colour to a packed 4:4:4:4 scanline. I use luma/cb/cr instead of + * RGB but this will of course work for either. + */ +extern void (*blit_colour_packed4444_scanline)( uint8_t *output, + int width, int alpha, int luma, + int cb, int cr ); + +/** + * Blit from and to packed 4:2:2 scanline. + */ +extern void (*blit_packed422_scanline)( uint8_t *dest, const uint8_t *src, int width ); + +/** + * Composites a premultiplied 4:4:4:4 pixel onto a packed 4:2:2 scanline. + */ +extern void (*composite_colour4444_alpha_to_packed422_scanline)( uint8_t *output, uint8_t *input, + int af, int y, int cb, int cr, + int width, int alpha ); + +/** + * Composites a packed 4:4:4:4 scanline onto a packed 4:2:2 scanline. + * Chroma is downsampled by dropping samples (nearest neighbour). + */ +extern void (*composite_packed4444_to_packed422_scanline)( uint8_t *output, + uint8_t *input, + uint8_t *foreground, + int width ); + +/** + * Composites a packed 4:4:4:4 scanline onto a packed 4:2:2 scanline. + * Chroma is downsampled by dropping samples (nearest neighbour). The + * alpha value provided is in the range 0-256 and is first applied to + * the input (for fadeouts). + */ +extern void (*composite_packed4444_alpha_to_packed422_scanline)( uint8_t *output, + uint8_t *input, + uint8_t *foreground, + int width, int alpha ); + +/** + * Takes an alphamask and the given colour (in Y'CbCr) and composites it + * onto a packed 4:4:4:4 scanline. 
+ */ +extern void (*composite_alphamask_to_packed4444_scanline)( uint8_t *output, + uint8_t *input, + uint8_t *mask, int width, + int textluma, int textcb, + int textcr ); + +/** + * Takes an alphamask and the given colour (in Y'CbCr) and composites it + * onto a packed 4:4:4:4 scanline. The alpha value provided is in the + * range 0-256 and is first applied to the input (for fadeouts). + */ +extern void (*composite_alphamask_alpha_to_packed4444_scanline)( uint8_t *output, + uint8_t *input, + uint8_t *mask, int width, + int textluma, int textcb, + int textcr, int alpha ); + +/** + * Premultiplies the colour by the alpha channel in a packed 4:4:4:4 + * scanline. + */ +extern void (*premultiply_packed4444_scanline)( uint8_t *output, uint8_t *input, int width ); + +/** + * Blend between two packed 4:2:2 scanline. Pos is the fade value in + * the range 0-256. A value of 0 gives 100% src1, and a value of 256 + * gives 100% src2. Anything in between gives the appropriate faded + * version. + */ +extern void (*blend_packed422_scanline)( uint8_t *output, uint8_t *src1, + uint8_t *src2, int width, int pos ); + +/** + * Calculates the 'difference factor' for two scanlines. This is a + * metric where higher values indicate that the two scanlines are more + * different. + */ +extern unsigned int (*diff_factor_packed422_scanline)( uint8_t *cur, uint8_t *old, int width ); + +/** + * Calculates the 'comb factor' for a set of three scanlines. This is a + * metric where higher values indicate a more likely chance that the two + * fields are at separate points in time. + */ +extern unsigned int (*comb_factor_packed422_scanline)( uint8_t *top, uint8_t *mid, + uint8_t *bot, int width ); + +/** + * Vertical [1 2 1] chroma filter. + */ +extern void (*vfilter_chroma_121_packed422_scanline)( uint8_t *output, int width, + uint8_t *m, uint8_t *t, uint8_t *b ); + +/** + * Vertical [3 3 2] chroma filter. + */ +extern void (*vfilter_chroma_332_packed422_scanline)( uint8_t *output, int width, + uint8_t *m, uint8_t *t, uint8_t *b ); + +/** + * Sets the chroma of the scanline to neutral (128) in-place. + */ +extern void (*kill_chroma_packed422_inplace_scanline)( uint8_t *data, int width ); + +/** + * Mirrors the scanline in-place. + */ +extern void (*mirror_packed422_inplace_scanline)( uint8_t *data, int width ); + +/** + * Inverts the colours on a scanline in-place. + */ +extern void (*invert_colour_packed422_inplace_scanline)( uint8_t *data, int width ); + +/** + * Fast memcpy function, used by all of the blit functions. Won't blit + * anything if dest == src. + */ +extern void (*speedy_memcpy)( void *output, const void *input, size_t size ); + +/** + * Calculates the block difference metrics for dalias' pulldown + * detection algorithm. + */ +extern void (*diff_packed422_block8x8)( pulldown_metrics_t *m, uint8_t *old, + uint8_t *new, int os, int ns ); + +/** + * Takes an alpha mask and subpixelly blits it using linear + * interpolation. + */ +extern void (*a8_subpix_blit_scanline)( uint8_t *output, uint8_t *input, + int lasta, int startpos, int width ); + +/** + * 1/4 vertical subpixel blit for packed 4:2:2 scanlines using linear + * interpolation. + */ +extern void (*quarter_blit_vertical_packed422_scanline)( uint8_t *output, uint8_t *one, + uint8_t *three, int width ); + +/** + * Vertical subpixel blit for packed 4:2:2 scanlines using linear + * interpolation. 
+ */ +extern void (*subpix_blit_vertical_packed422_scanline)( uint8_t *output, uint8_t *top, + uint8_t *bot, int subpixpos, int width ); + +/** + * Simple function to convert a 4:4:4 scanline to a 4:4:4:4 scanline by + * adding an alpha channel. Result is non-premultiplied. + */ +extern void (*packed444_to_nonpremultiplied_packed4444_scanline)( uint8_t *output, + uint8_t *input, + int width, int alpha ); + +/** + * I think this function needs to be rethought and renamed, but here + * it is for now. This function horizontally resamples a scanline + * using linear interpolation to compensate for a change in pixel + * aspect ratio. + */ +extern void (*aspect_adjust_packed4444_scanline)( uint8_t *output, + uint8_t *input, + int width, + double pixel_aspect ); + +/** + * Convert a packed 4:4:4 surface to a packed 4:2:2 surface using + * nearest neighbour chroma downsampling. + */ +extern void (*packed444_to_packed422_scanline)( uint8_t *output, + uint8_t *input, + int width ); + +/** + * Converts packed 4:2:2 to packed 4:4:4 scanlines using nearest + * neighbour chroma upsampling. + */ +extern void (*packed422_to_packed444_scanline)( uint8_t *output, + uint8_t *input, + int width ); + +/** + * This filter actually does not meet the spec so calling it rec601 + * is a bit of a lie. I got the filter from Poynton's site. This + * converts a scanline from packed 4:2:2 to packed 4:4:4. But this + * function should point at some high quality to-the-spec resampler. + */ +extern void (*packed422_to_packed444_rec601_scanline)( uint8_t *dest, + uint8_t *src, + int width ); + +/** + * Conversions between Y'CbCr and R'G'B'. We use Rec.601 numbers + * since our source is broadcast video, but I think there is an + * argument to be made for switching to Rec.709. + */ +extern void (*packed444_to_rgb24_rec601_scanline)( uint8_t *output, + uint8_t *input, + int width ); +extern void (*rgb24_to_packed444_rec601_scanline)( uint8_t *output, + uint8_t *input, + int width ); +extern void (*rgba32_to_packed4444_rec601_scanline)( uint8_t *output, + uint8_t *input, + int width ); + +/** + * Convert from 4:2:2 with UYVY ordering to 4:2:2 with YUYV ordering. + */ +extern void (*convert_uyvy_to_yuyv_scanline)( uint8_t *uyvy_buf, + uint8_t *yuyv_buf, int width ); + +/** + * Sets up the function pointers to point at the fastest function + * available. Requires accelleration settings (see mm_accel.h). + */ +void setup_speedy_calls( uint32_t accel, int verbose ); + +/** + * Returns a bitfield of what accellerations were used when speedy was + * initialized. See mm_accel.h. + */ +uint32_t speedy_get_accel( void ); + +#ifdef __cplusplus +}; +#endif +#endif /* SPEEDY_H_INCLUDED */ diff --git a/gst/deinterlace2/tvtime/sse.h b/gst/deinterlace2/tvtime/sse.h new file mode 100644 index 00000000..2e00ee0c --- /dev/null +++ b/gst/deinterlace2/tvtime/sse.h @@ -0,0 +1,992 @@ +/* sse.h + + Streaming SIMD Extenstions (a.k.a. Katmai New Instructions) + GCC interface library for IA32. + + To use this library, simply include this header file + and compile with GCC. You MUST have inlining enabled + in order for sse_ok() to work; this can be done by + simply using -O on the GCC command line. + + Compiling with -DSSE_TRACE will cause detailed trace + output to be sent to stderr for each sse operation. + This adds lots of code, and obviously slows execution to + a crawl, but can be very useful for debugging. 
+ + THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY + EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT + LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY + AND FITNESS FOR ANY PARTICULAR PURPOSE. + + 1999 by R. Fisher + Based on libmmx by H. Dietz and R. Fisher + + Notes: + This is still extremely alpha. + Because this library depends on an assembler which understands the + SSE opcodes, you probably won't be able to use this yet. + For now, do not use TRACE versions. These both make use + of the MMX registers, not the SSE registers. This will be resolved + at a later date. + ToDo: + Rewrite TRACE macros + Major Debugging Work +*/ + +#ifndef _SSE_H +#define _SSE_H + + + +/* The type of an value that fits in an SSE register + (note that long long constant values MUST be suffixed + by LL and unsigned long long values by ULL, lest + they be truncated by the compiler) +*/ +typedef union { + float sf[4]; /* Single-precision (32-bit) value */ +} __attribute__ ((aligned (16))) sse_t; /* On a 16 byte (128-bit) boundary */ + + +#if 0 +/* Function to test if multimedia instructions are supported... +*/ +inline extern int +mm_support(void) +{ + /* Returns 1 if MMX instructions are supported, + 3 if Cyrix MMX and Extended MMX instructions are supported + 5 if AMD MMX and 3DNow! instructions are supported + 9 if MMX and SSE instructions are supported + 0 if hardware does not support any of these + */ + register int rval = 0; + + __asm__ __volatile__ ( + /* See if CPUID instruction is supported ... */ + /* ... Get copies of EFLAGS into eax and ecx */ + "pushf\n\t" + "popl %%eax\n\t" + "movl %%eax, %%ecx\n\t" + + /* ... Toggle the ID bit in one copy and store */ + /* to the EFLAGS reg */ + "xorl $0x200000, %%eax\n\t" + "push %%eax\n\t" + "popf\n\t" + + /* ... Get the (hopefully modified) EFLAGS */ + "pushf\n\t" + "popl %%eax\n\t" + + /* ... Compare and test result */ + "xorl %%eax, %%ecx\n\t" + "testl $0x200000, %%ecx\n\t" + "jz NotSupported1\n\t" /* CPUID not supported */ + + + /* Get standard CPUID information, and + go to a specific vendor section */ + "movl $0, %%eax\n\t" + "cpuid\n\t" + + /* Check for Intel */ + "cmpl $0x756e6547, %%ebx\n\t" + "jne TryAMD\n\t" + "cmpl $0x49656e69, %%edx\n\t" + "jne TryAMD\n\t" + "cmpl $0x6c65746e, %%ecx\n" + "jne TryAMD\n\t" + "jmp Intel\n\t" + + /* Check for AMD */ + "\nTryAMD:\n\t" + "cmpl $0x68747541, %%ebx\n\t" + "jne TryCyrix\n\t" + "cmpl $0x69746e65, %%edx\n\t" + "jne TryCyrix\n\t" + "cmpl $0x444d4163, %%ecx\n" + "jne TryCyrix\n\t" + "jmp AMD\n\t" + + /* Check for Cyrix */ + "\nTryCyrix:\n\t" + "cmpl $0x69727943, %%ebx\n\t" + "jne NotSupported2\n\t" + "cmpl $0x736e4978, %%edx\n\t" + "jne NotSupported3\n\t" + "cmpl $0x64616574, %%ecx\n\t" + "jne NotSupported4\n\t" + /* Drop through to Cyrix... */ + + + /* Cyrix Section */ + /* See if extended CPUID level 80000001 is supported */ + /* The value of CPUID/80000001 for the 6x86MX is undefined + according to the Cyrix CPU Detection Guide (Preliminary + Rev. 1.01 table 1), so we'll check the value of eax for + CPUID/0 to see if standard CPUID level 2 is supported. + According to the table, the only CPU which supports level + 2 is also the only one which supports extended CPUID levels. 
+ */ + "cmpl $0x2, %%eax\n\t" + "jne MMXtest\n\t" /* Use standard CPUID instead */ + + /* Extended CPUID supported (in theory), so get extended + features */ + "movl $0x80000001, %%eax\n\t" + "cpuid\n\t" + "testl $0x00800000, %%eax\n\t" /* Test for MMX */ + "jz NotSupported5\n\t" /* MMX not supported */ + "testl $0x01000000, %%eax\n\t" /* Test for Ext'd MMX */ + "jnz EMMXSupported\n\t" + "movl $1, %0:\n\n\t" /* MMX Supported */ + "jmp Return\n\n" + "EMMXSupported:\n\t" + "movl $3, %0:\n\n\t" /* EMMX and MMX Supported */ + "jmp Return\n\t" + + + /* AMD Section */ + "AMD:\n\t" + + /* See if extended CPUID is supported */ + "movl $0x80000000, %%eax\n\t" + "cpuid\n\t" + "cmpl $0x80000000, %%eax\n\t" + "jl MMXtest\n\t" /* Use standard CPUID instead */ + + /* Extended CPUID supported, so get extended features */ + "movl $0x80000001, %%eax\n\t" + "cpuid\n\t" + "testl $0x00800000, %%edx\n\t" /* Test for MMX */ + "jz NotSupported6\n\t" /* MMX not supported */ + "testl $0x80000000, %%edx\n\t" /* Test for 3DNow! */ + "jnz ThreeDNowSupported\n\t" + "movl $1, %0:\n\n\t" /* MMX Supported */ + "jmp Return\n\n" + "ThreeDNowSupported:\n\t" + "movl $5, %0:\n\n\t" /* 3DNow! and MMX Supported */ + "jmp Return\n\t" + + + /* Intel Section */ + "Intel:\n\t" + + /* Check for SSE */ + "SSEtest:\n\t" + "movl $1, %%eax\n\t" + "cpuid\n\t" + "testl $0x02000000, %%edx\n\t" /* Test for SSE */ + "jz MMXtest\n\t" /* SSE Not supported */ + "movl $9, %0:\n\n\t" /* SSE Supported */ + "jmp Return\n\t" + + /* Check for MMX */ + "MMXtest:\n\t" + "movl $1, %%eax\n\t" + "cpuid\n\t" + "testl $0x00800000, %%edx\n\t" /* Test for MMX */ + "jz NotSupported7\n\t" /* MMX Not supported */ + "movl $1, %0:\n\n\t" /* MMX Supported */ + "jmp Return\n\t" + + /* Nothing supported */ + "\nNotSupported1:\n\t" + "#movl $101, %0:\n\n\t" + "\nNotSupported2:\n\t" + "#movl $102, %0:\n\n\t" + "\nNotSupported3:\n\t" + "#movl $103, %0:\n\n\t" + "\nNotSupported4:\n\t" + "#movl $104, %0:\n\n\t" + "\nNotSupported5:\n\t" + "#movl $105, %0:\n\n\t" + "\nNotSupported6:\n\t" + "#movl $106, %0:\n\n\t" + "\nNotSupported7:\n\t" + "#movl $107, %0:\n\n\t" + "movl $0, %0:\n\n\t" + + "Return:\n\t" + : "=a" (rval) + : /* no input */ + : "eax", "ebx", "ecx", "edx" + ); + + /* Return */ + return(rval); +} + +/* Function to test if sse instructions are supported... +*/ +inline extern int +sse_ok(void) +{ + /* Returns 1 if SSE instructions are supported, 0 otherwise */ + return ( (mm_support() & 0x8) >> 3 ); +} +#endif + + + +/* Helper functions for the instruction macros that follow... + (note that memory-to-register, m2r, instructions are nearly + as efficient as register-to-register, r2r, instructions; + however, memory-to-memory instructions are really simulated + as a convenience, and are only 1/3 as efficient) +*/ +#ifdef SSE_TRACE + +/* Include the stuff for printing a trace to stderr... 
+*/ + +#include <stdio.h> + +#define sse_i2r(op, imm, reg) \ + { \ + sse_t sse_trace; \ + sse_trace.uq = (imm); \ + fprintf(stderr, #op "_i2r(" #imm "=0x%08x%08x, ", \ + sse_trace.d[1], sse_trace.d[0]); \ + __asm__ __volatile__ ("movq %%" #reg ", %0" \ + : "=X" (sse_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #reg "=0x%08x%08x) => ", \ + sse_trace.d[1], sse_trace.d[0]); \ + __asm__ __volatile__ (#op " %0, %%" #reg \ + : /* nothing */ \ + : "X" (imm)); \ + __asm__ __volatile__ ("movq %%" #reg ", %0" \ + : "=X" (sse_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #reg "=0x%08x%08x\n", \ + sse_trace.d[1], sse_trace.d[0]); \ + } + +#define sse_m2r(op, mem, reg) \ + { \ + sse_t sse_trace; \ + sse_trace = (mem); \ + fprintf(stderr, #op "_m2r(" #mem "=0x%08x%08x, ", \ + sse_trace.d[1], sse_trace.d[0]); \ + __asm__ __volatile__ ("movq %%" #reg ", %0" \ + : "=X" (sse_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #reg "=0x%08x%08x) => ", \ + sse_trace.d[1], sse_trace.d[0]); \ + __asm__ __volatile__ (#op " %0, %%" #reg \ + : /* nothing */ \ + : "X" (mem)); \ + __asm__ __volatile__ ("movq %%" #reg ", %0" \ + : "=X" (sse_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #reg "=0x%08x%08x\n", \ + sse_trace.d[1], sse_trace.d[0]); \ + } + +#define sse_r2m(op, reg, mem) \ + { \ + sse_t sse_trace; \ + __asm__ __volatile__ ("movq %%" #reg ", %0" \ + : "=X" (sse_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #op "_r2m(" #reg "=0x%08x%08x, ", \ + sse_trace.d[1], sse_trace.d[0]); \ + sse_trace = (mem); \ + fprintf(stderr, #mem "=0x%08x%08x) => ", \ + sse_trace.d[1], sse_trace.d[0]); \ + __asm__ __volatile__ (#op " %%" #reg ", %0" \ + : "=X" (mem) \ + : /* nothing */ ); \ + sse_trace = (mem); \ + fprintf(stderr, #mem "=0x%08x%08x\n", \ + sse_trace.d[1], sse_trace.d[0]); \ + } + +#define sse_r2r(op, regs, regd) \ + { \ + sse_t sse_trace; \ + __asm__ __volatile__ ("movq %%" #regs ", %0" \ + : "=X" (sse_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #op "_r2r(" #regs "=0x%08x%08x, ", \ + sse_trace.d[1], sse_trace.d[0]); \ + __asm__ __volatile__ ("movq %%" #regd ", %0" \ + : "=X" (sse_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #regd "=0x%08x%08x) => ", \ + sse_trace.d[1], sse_trace.d[0]); \ + __asm__ __volatile__ (#op " %" #regs ", %" #regd); \ + __asm__ __volatile__ ("movq %%" #regd ", %0" \ + : "=X" (sse_trace) \ + : /* nothing */ ); \ + fprintf(stderr, #regd "=0x%08x%08x\n", \ + sse_trace.d[1], sse_trace.d[0]); \ + } + +#define sse_m2m(op, mems, memd) \ + { \ + sse_t sse_trace; \ + sse_trace = (mems); \ + fprintf(stderr, #op "_m2m(" #mems "=0x%08x%08x, ", \ + sse_trace.d[1], sse_trace.d[0]); \ + sse_trace = (memd); \ + fprintf(stderr, #memd "=0x%08x%08x) => ", \ + sse_trace.d[1], sse_trace.d[0]); \ + __asm__ __volatile__ ("movq %0, %%mm0\n\t" \ + #op " %1, %%mm0\n\t" \ + "movq %%mm0, %0" \ + : "=X" (memd) \ + : "X" (mems)); \ + sse_trace = (memd); \ + fprintf(stderr, #memd "=0x%08x%08x\n", \ + sse_trace.d[1], sse_trace.d[0]); \ + } + +#else + +/* These macros are a lot simpler without the tracing... 
+*/ + +#define sse_i2r(op, imm, reg) \ + __asm__ __volatile__ (#op " %0, %%" #reg \ + : /* nothing */ \ + : "X" (imm) ) + +#define sse_m2r(op, mem, reg) \ + __asm__ __volatile__ (#op " %0, %%" #reg \ + : /* nothing */ \ + : "X" (mem)) + +#define sse_r2m(op, reg, mem) \ + __asm__ __volatile__ (#op " %%" #reg ", %0" \ + : "=X" (mem) \ + : /* nothing */ ) + +#define sse_r2r(op, regs, regd) \ + __asm__ __volatile__ (#op " %" #regs ", %" #regd) + +#define sse_r2ri(op, regs, regd, imm) \ + __asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \ + : /* nothing */ \ + : "X" (imm) ) + +/* Load data from mems to xmmreg, operate on xmmreg, and store data to memd */ +#define sse_m2m(op, mems, memd, xmmreg) \ + __asm__ __volatile__ ("movups %0, %%xmm0\n\t" \ + #op " %1, %%xmm0\n\t" \ + "movups %%mm0, %0" \ + : "=X" (memd) \ + : "X" (mems)) + +#define sse_m2ri(op, mem, reg, subop) \ + __asm__ __volatile__ (#op " %0, %%" #reg ", " #subop \ + : /* nothing */ \ + : "X" (mem)) + +#define sse_m2mi(op, mems, memd, xmmreg, subop) \ + __asm__ __volatile__ ("movups %0, %%xmm0\n\t" \ + #op " %1, %%xmm0, " #subop "\n\t" \ + "movups %%mm0, %0" \ + : "=X" (memd) \ + : "X" (mems)) +#endif + + + + +/* 1x128 MOVe Aligned four Packed Single-fp +*/ +#define movaps_m2r(var, reg) sse_m2r(movaps, var, reg) +#define movaps_r2m(reg, var) sse_r2m(movaps, reg, var) +#define movaps_r2r(regs, regd) sse_r2r(movaps, regs, regd) +#define movaps(vars, vard) \ + __asm__ __volatile__ ("movaps %1, %%mm0\n\t" \ + "movaps %%mm0, %0" \ + : "=X" (vard) \ + : "X" (vars)) + + +/* 1x128 MOVe aligned Non-Temporal four Packed Single-fp +*/ +#define movntps_r2m(xmmreg, var) sse_r2m(movntps, xmmreg, var) + + +/* 1x64 MOVe Non-Temporal Quadword +*/ +#define movntq_r2m(mmreg, var) sse_r2m(movntq, mmreg, var) + + +/* 1x128 MOVe Unaligned four Packed Single-fp +*/ +#define movups_m2r(var, reg) sse_m2r(movups, var, reg) +#define movups_r2m(reg, var) sse_r2m(movups, reg, var) +#define movups_r2r(regs, regd) sse_r2r(movups, regs, regd) +#define movups(vars, vard) \ + __asm__ __volatile__ ("movups %1, %%mm0\n\t" \ + "movups %%mm0, %0" \ + : "=X" (vard) \ + : "X" (vars)) + + +/* MOVe High to Low Packed Single-fp + high half of 4x32f (x) -> low half of 4x32f (y) +*/ +#define movhlps_r2r(regs, regd) sse_r2r(movhlps, regs, regd) + + +/* MOVe Low to High Packed Single-fp + low half of 4x32f (x) -> high half of 4x32f (y) +*/ +#define movlhps_r2r(regs, regd) sse_r2r(movlhps, regs, regd) + + +/* MOVe High Packed Single-fp + 2x32f -> high half of 4x32f +*/ +#define movhps_m2r(var, reg) sse_m2r(movhps, var, reg) +#define movhps_r2m(reg, var) sse_r2m(movhps, reg, var) +#define movhps(vars, vard) \ + __asm__ __volatile__ ("movhps %1, %%mm0\n\t" \ + "movhps %%mm0, %0" \ + : "=X" (vard) \ + : "X" (vars)) + + +/* MOVe Low Packed Single-fp + 2x32f -> low half of 4x32f +*/ +#define movlps_m2r(var, reg) sse_m2r(movlps, var, reg) +#define movlps_r2m(reg, var) sse_r2m(movlps, reg, var) +#define movlps(vars, vard) \ + __asm__ __volatile__ ("movlps %1, %%mm0\n\t" \ + "movlps %%mm0, %0" \ + : "=X" (vard) \ + : "X" (vars)) + + +/* MOVe Scalar Single-fp + lowest field of 4x32f (x) -> lowest field of 4x32f (y) +*/ +#define movss_m2r(var, reg) sse_m2r(movss, var, reg) +#define movss_r2m(reg, var) sse_r2m(movss, reg, var) +#define movss_r2r(regs, regd) sse_r2r(movss, regs, regd) +#define movss(vars, vard) \ + __asm__ __volatile__ ("movss %1, %%mm0\n\t" \ + "movss %%mm0, %0" \ + : "=X" (vard) \ + : "X" (vars)) + + +/* 4x16 Packed SHUFfle Word +*/ +#define pshufw_m2r(var, reg, index) 
sse_m2ri(pshufw, var, reg, index) +#define pshufw_r2r(regs, regd, index) sse_r2ri(pshufw, regs, regd, index) + + +/* 1x128 SHUFfle Packed Single-fp +*/ +#define shufps_m2r(var, reg, index) sse_m2ri(shufps, var, reg, index) +#define shufps_r2r(regs, regd, index) sse_r2ri(shufps, regs, regd, index) + + +/* ConVerT Packed signed Int32 to(2) Packed Single-fp +*/ +#define cvtpi2ps_m2r(var, xmmreg) sse_m2r(cvtpi2ps, var, xmmreg) +#define cvtpi2ps_r2r(mmreg, xmmreg) sse_r2r(cvtpi2ps, mmreg, xmmreg) + + +/* ConVerT Packed Single-fp to(2) Packed signed Int32 +*/ +#define cvtps2pi_m2r(var, mmreg) sse_m2r(cvtps2pi, var, mmreg) +#define cvtps2pi_r2r(xmmreg, mmreg) sse_r2r(cvtps2pi, mmreg, xmmreg) + + +/* ConVerT with Truncate Packed Single-fp to(2) Packed Int32 +*/ +#define cvttps2pi_m2r(var, mmreg) sse_m2r(cvttps2pi, var, mmreg) +#define cvttps2pi_r2r(xmmreg, mmreg) sse_r2r(cvttps2pi, mmreg, xmmreg) + + +/* ConVerT Signed Int32 to(2) Single-fp (Scalar) +*/ +#define cvtsi2ss_m2r(var, xmmreg) sse_m2r(cvtsi2ss, var, xmmreg) +#define cvtsi2ss_r2r(reg, xmmreg) sse_r2r(cvtsi2ss, reg, xmmreg) + + +/* ConVerT Scalar Single-fp to(2) Signed Int32 +*/ +#define cvtss2si_m2r(var, reg) sse_m2r(cvtss2si, var, reg) +#define cvtss2si_r2r(xmmreg, reg) sse_r2r(cvtss2si, xmmreg, reg) + + +/* ConVerT with Truncate Scalar Single-fp to(2) Signed Int32 +*/ +#define cvttss2si_m2r(var, reg) sse_m2r(cvtss2si, var, reg) +#define cvttss2si_r2r(xmmreg, reg) sse_r2r(cvtss2si, xmmreg, reg) + + +/* Parallel EXTRact Word from 4x16 +*/ +#define pextrw_r2r(mmreg, reg, field) sse_r2ri(pextrw, mmreg, reg, field) + + +/* Parallel INSeRt Word from 4x16 +*/ +#define pinsrw_r2r(reg, mmreg, field) sse_r2ri(pinsrw, reg, mmreg, field) + + + +/* MOVe MaSK from Packed Single-fp +*/ +#ifdef SSE_TRACE + #define movmskps(xmmreg, reg) \ + { \ + fprintf(stderr, "movmskps()\n"); \ + __asm__ __volatile__ ("movmskps %" #xmmreg ", %" #reg) \ + } +#else + #define movmskps(xmmreg, reg) \ + __asm__ __volatile__ ("movmskps %" #xmmreg ", %" #reg) +#endif + + +/* Parallel MOVe MaSK from mmx reg to 32-bit reg +*/ +#ifdef SSE_TRACE + #define pmovmskb(mmreg, reg) \ + { \ + fprintf(stderr, "movmskps()\n"); \ + __asm__ __volatile__ ("movmskps %" #mmreg ", %" #reg) \ + } +#else + #define pmovmskb(mmreg, reg) \ + __asm__ __volatile__ ("movmskps %" #mmreg ", %" #reg) +#endif + + +/* MASKed MOVe from 8x8 to memory pointed to by (e)di register +*/ +#define maskmovq(mmregs, fieldreg) sse_r2ri(maskmovq, mmregs, fieldreg) + + + + +/* 4x32f Parallel ADDs +*/ +#define addps_m2r(var, reg) sse_m2r(addps, var, reg) +#define addps_r2r(regs, regd) sse_r2r(addps, regs, regd) +#define addps(vars, vard, xmmreg) sse_m2m(addps, vars, vard, xmmreg) + + +/* Lowest Field of 4x32f Parallel ADDs +*/ +#define addss_m2r(var, reg) sse_m2r(addss, var, reg) +#define addss_r2r(regs, regd) sse_r2r(addss, regs, regd) +#define addss(vars, vard, xmmreg) sse_m2m(addss, vars, vard, xmmreg) + + +/* 4x32f Parallel SUBs +*/ +#define subps_m2r(var, reg) sse_m2r(subps, var, reg) +#define subps_r2r(regs, regd) sse_r2r(subps, regs, regd) +#define subps(vars, vard, xmmreg) sse_m2m(subps, vars, vard, xmmreg) + + +/* Lowest Field of 4x32f Parallel SUBs +*/ +#define subss_m2r(var, reg) sse_m2r(subss, var, reg) +#define subss_r2r(regs, regd) sse_r2r(subss, regs, regd) +#define subss(vars, vard, xmmreg) sse_m2m(subss, vars, vard, xmmreg) + + +/* 8x8u -> 4x16u Packed Sum of Absolute Differences +*/ +#define psadbw_m2r(var, reg) sse_m2r(psadbw, var, reg) +#define psadbw_r2r(regs, regd) sse_r2r(psadbw, regs, regd) 
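/* For reference (plain C, not taken from the DScaler/tvtime sources): the
 * psadbw wrappers above expose the MMX/SSE "sum of absolute differences"
 * instruction, which adds up |a[i] - b[i]| over eight unsigned bytes and
 * zero-extends the 16-bit total into the destination register.  A scalar
 * sketch of the same computation, assuming <stdint.h> integer types:
 */
#include <stdint.h>

static uint16_t
psadbw_reference (const uint8_t a[8], const uint8_t b[8])
{
  uint16_t sum = 0;
  int i;

  for (i = 0; i < 8; i++)
    sum += (a[i] > b[i]) ? (uint16_t) (a[i] - b[i]) : (uint16_t) (b[i] - a[i]);

  return sum;
}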
+#define psadbw(vars, vard, mmreg) sse_m2m(psadbw, vars, vard, mmreg) + + +/* 4x16u Parallel MUL High Unsigned +*/ +#define pmulhuw_m2r(var, reg) sse_m2r(pmulhuw, var, reg) +#define pmulhuw_r2r(regs, regd) sse_r2r(pmulhuw, regs, regd) +#define pmulhuw(vars, vard, mmreg) sse_m2m(pmulhuw, vars, vard, mmreg) + + +/* 4x32f Parallel MULs +*/ +#define mulps_m2r(var, reg) sse_m2r(mulps, var, reg) +#define mulps_r2r(regs, regd) sse_r2r(mulps, regs, regd) +#define mulps(vars, vard, xmmreg) sse_m2m(mulps, vars, vard, xmmreg) + + +/* Lowest Field of 4x32f Parallel MULs +*/ +#define mulss_m2r(var, reg) sse_m2r(mulss, var, reg) +#define mulss_r2r(regs, regd) sse_r2r(mulss, regs, regd) +#define mulss(vars, vard, xmmreg) sse_m2m(mulss, vars, vard, xmmreg) + + +/* 4x32f Parallel DIVs +*/ +#define divps_m2r(var, reg) sse_m2r(divps, var, reg) +#define divps_r2r(regs, regd) sse_r2r(divps, regs, regd) +#define divps(vars, vard, xmmreg) sse_m2m(divps, vars, vard, xmmreg) + + +/* Lowest Field of 4x32f Parallel DIVs +*/ +#define divss_m2r(var, reg) sse_m2r(divss, var, reg) +#define divss_r2r(regs, regd) sse_r2r(divss, regs, regd) +#define divss(vars, vard, xmmreg) sse_m2m(divss, vars, vard, xmmreg) + + +/* 4x32f Parallel Reciprocals +*/ +#define rcpps_m2r(var, reg) sse_m2r(rcpps, var, reg) +#define rcpps_r2r(regs, regd) sse_r2r(rcpps, regs, regd) +#define rcpps(vars, vard, xmmreg) sse_m2m(rcpps, vars, vard, xmmreg) + + +/* Lowest Field of 4x32f Parallel Reciprocals +*/ +#define rcpss_m2r(var, reg) sse_m2r(rcpss, var, reg) +#define rcpss_r2r(regs, regd) sse_r2r(rcpss, regs, regd) +#define rcpss(vars, vard, xmmreg) sse_m2m(rcpss, vars, vard, xmmreg) + + +/* 4x32f Parallel Square Root of Reciprocals +*/ +#define rsqrtps_m2r(var, reg) sse_m2r(rsqrtps, var, reg) +#define rsqrtps_r2r(regs, regd) sse_r2r(rsqrtps, regs, regd) +#define rsqrtps(vars, vard, xmmreg) sse_m2m(rsqrtps, vars, vard, xmmreg) + + +/* Lowest Field of 4x32f Parallel Square Root of Reciprocals +*/ +#define rsqrtss_m2r(var, reg) sse_m2r(rsqrtss, var, reg) +#define rsqrtss_r2r(regs, regd) sse_r2r(rsqrtss, regs, regd) +#define rsqrtss(vars, vard, xmmreg) sse_m2m(rsqrtss, vars, vard, xmmreg) + + +/* 4x32f Parallel Square Roots +*/ +#define sqrtps_m2r(var, reg) sse_m2r(sqrtps, var, reg) +#define sqrtps_r2r(regs, regd) sse_r2r(sqrtps, regs, regd) +#define sqrtps(vars, vard, xmmreg) sse_m2m(sqrtps, vars, vard, xmmreg) + + +/* Lowest Field of 4x32f Parallel Square Roots +*/ +#define sqrtss_m2r(var, reg) sse_m2r(sqrtss, var, reg) +#define sqrtss_r2r(regs, regd) sse_r2r(sqrtss, regs, regd) +#define sqrtss(vars, vard, xmmreg) sse_m2m(sqrtss, vars, vard, xmmreg) + + +/* 8x8u and 4x16u Parallel AVeraGe +*/ +#define pavgb_m2r(var, reg) sse_m2r(pavgb, var, reg) +#define pavgb_r2r(regs, regd) sse_r2r(pavgb, regs, regd) +#define pavgb(vars, vard, mmreg) sse_m2m(pavgb, vars, vard, mmreg) + +#define pavgw_m2r(var, reg) sse_m2r(pavgw, var, reg) +#define pavgw_r2r(regs, regd) sse_r2r(pavgw, regs, regd) +#define pavgw(vars, vard, mmreg) sse_m2m(pavgw, vars, vard, mmreg) + + +/* 1x128 bitwise AND +*/ +#define andps_m2r(var, reg) sse_m2r(andps, var, reg) +#define andps_r2r(regs, regd) sse_r2r(andps, regs, regd) +#define andps(vars, vard, xmmreg) sse_m2m(andps, vars, vard, xmmreg) + + +/* 1x128 bitwise AND with Not the destination +*/ +#define andnps_m2r(var, reg) sse_m2r(andnps, var, reg) +#define andnps_r2r(regs, regd) sse_r2r(andnps, regs, regd) +#define andnps(vars, vard, xmmreg) sse_m2m(andnps, vars, vard, xmmreg) + + +/* 1x128 bitwise OR +*/ +#define orps_m2r(var, reg) 
sse_m2r(orps, var, reg) +#define orps_r2r(regs, regd) sse_r2r(orps, regs, regd) +#define orps(vars, vard, xmmreg) sse_m2m(orps, vars, vard, xmmreg) + + +/* 1x128 bitwise eXclusive OR +*/ +#define xorps_m2r(var, reg) sse_m2r(xorps, var, reg) +#define xorps_r2r(regs, regd) sse_r2r(xorps, regs, regd) +#define xorps(vars, vard, xmmreg) sse_m2m(xorps, vars, vard, xmmreg) + + +/* 8x8u, 4x16, and 4x32f Parallel Maximum +*/ +#define pmaxub_m2r(var, reg) sse_m2r(pmaxub, var, reg) +#define pmaxub_r2r(regs, regd) sse_r2r(pmaxub, regs, regd) +#define pmaxub(vars, vard, mmreg) sse_m2m(pmaxub, vars, vard, mmreg) + +#define pmaxsw_m2r(var, reg) sse_m2r(pmaxsw, var, reg) +#define pmaxsw_r2r(regs, regd) sse_r2r(pmaxsw, regs, regd) +#define pmaxsw(vars, vard, mmreg) sse_m2m(pmaxsw, vars, vard, mmreg) + +#define maxps_m2r(var, reg) sse_m2r(maxps, var, reg) +#define maxps_r2r(regs, regd) sse_r2r(maxps, regs, regd) +#define maxps(vars, vard, xmmreg) sse_m2m(maxps, vars, vard, xmmreg) + + +/* Lowest Field of 4x32f Parallel Maximum +*/ +#define maxss_m2r(var, reg) sse_m2r(maxss, var, reg) +#define maxss_r2r(regs, regd) sse_r2r(maxss, regs, regd) +#define maxss(vars, vard, xmmreg) sse_m2m(maxss, vars, vard, xmmreg) + + +/* 8x8u, 4x16, and 4x32f Parallel Minimum +*/ +#define pminub_m2r(var, reg) sse_m2r(pminub, var, reg) +#define pminub_r2r(regs, regd) sse_r2r(pminub, regs, regd) +#define pminub(vars, vard, mmreg) sse_m2m(pminub, vars, vard, mmreg) + +#define pminsw_m2r(var, reg) sse_m2r(pminsw, var, reg) +#define pminsw_r2r(regs, regd) sse_r2r(pminsw, regs, regd) +#define pminsw(vars, vard, mmreg) sse_m2m(pminsw, vars, vard, mmreg) + +#define minps_m2r(var, reg) sse_m2r(minps, var, reg) +#define minps_r2r(regs, regd) sse_r2r(minps, regs, regd) +#define minps(vars, vard, xmmreg) sse_m2m(minps, vars, vard, xmmreg) + + +/* Lowest Field of 4x32f Parallel Minimum +*/ +#define minss_m2r(var, reg) sse_m2r(minss, var, reg) +#define minss_r2r(regs, regd) sse_r2r(minss, regs, regd) +#define minss(vars, vard, xmmreg) sse_m2m(minss, vars, vard, xmmreg) + + +/* 4x32f Parallel CoMPares + (resulting fields are either 0 or -1) +*/ +#define cmpps_m2r(var, reg, op) sse_m2ri(cmpps, var, reg, op) +#define cmpps_r2r(regs, regd, op) sse_r2ri(cmpps, regs, regd, op) +#define cmpps(vars, vard, op, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, op) + +#define cmpeqps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 0) +#define cmpeqps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 0) +#define cmpeqps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, 0) + +#define cmpltps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 1) +#define cmpltps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 1) +#define cmpltps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, 1) + +#define cmpleps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 2) +#define cmpleps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 2) +#define cmpleps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, 2) + +#define cmpunordps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 3) +#define cmpunordps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 3) +#define cmpunordps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, 3) + +#define cmpneqps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 4) +#define cmpneqps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 4) +#define cmpneqps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, 4) + +#define cmpnltps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 5) +#define cmpnltps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 5) +#define cmpnltps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, 
xmmreg, 5) + +#define cmpnleps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 6) +#define cmpnleps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 6) +#define cmpnleps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, 6) + +#define cmpordps_m2r(var, reg) sse_m2ri(cmpps, var, reg, 7) +#define cmpordps_r2r(regs, regd) sse_r2ri(cmpps, regs, regd, 7) +#define cmpordps(vars, vard, xmmreg) sse_m2mi(cmpps, vars, vard, xmmreg, 7) + + +/* Lowest Field of 4x32f Parallel CoMPares + (resulting fields are either 0 or -1) +*/ +#define cmpss_m2r(var, reg, op) sse_m2ri(cmpss, var, reg, op) +#define cmpss_r2r(regs, regd, op) sse_r2ri(cmpss, regs, regd, op) +#define cmpss(vars, vard, op, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, op) + +#define cmpeqss_m2r(var, reg) sse_m2ri(cmpss, var, reg, 0) +#define cmpeqss_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 0) +#define cmpeqss(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 0) + +#define cmpltss_m2r(var, reg) sse_m2ri(cmpss, var, reg, 1) +#define cmpltss_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 1) +#define cmpltss(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 1) + +#define cmpless_m2r(var, reg) sse_m2ri(cmpss, var, reg, 2) +#define cmpless_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 2) +#define cmpless(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 2) + +#define cmpunordss_m2r(var, reg) sse_m2ri(cmpss, var, reg, 3) +#define cmpunordss_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 3) +#define cmpunordss(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 3) + +#define cmpneqss_m2r(var, reg) sse_m2ri(cmpss, var, reg, 4) +#define cmpneqss_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 4) +#define cmpneqss(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 4) + +#define cmpnltss_m2r(var, reg) sse_m2ri(cmpss, var, reg, 5) +#define cmpnltss_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 5) +#define cmpnltss(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 5) + +#define cmpnless_m2r(var, reg) sse_m2ri(cmpss, var, reg, 6) +#define cmpnless_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 6) +#define cmpnless(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 6) + +#define cmpordss_m2r(var, reg) sse_m2ri(cmpss, var, reg, 7) +#define cmpordss_r2r(regs, regd) sse_r2ri(cmpss, regs, regd, 7) +#define cmpordss(vars, vard, xmmreg) sse_m2mi(cmpss, vars, vard, xmmreg, 7) + + +/* Lowest Field of 4x32f Parallel CoMPares to set EFLAGS + (resulting fields are either 0 or -1) +*/ +#define comiss_m2r(var, reg) sse_m2r(comiss, var, reg) +#define comiss_r2r(regs, regd) sse_r2r(comiss, regs, regd) +#define comiss(vars, vard, xmmreg) sse_m2m(comiss, vars, vard, xmmreg) + + +/* Lowest Field of 4x32f Unordered Parallel CoMPares to set EFLAGS + (resulting fields are either 0 or -1) +*/ +#define ucomiss_m2r(var, reg) sse_m2r(ucomiss, var, reg) +#define ucomiss_r2r(regs, regd) sse_r2r(ucomiss, regs, regd) +#define ucomiss(vars, vard, xmmreg) sse_m2m(ucomiss, vars, vard, xmmreg) + + +/* 2-(4x32f) -> 4x32f UNPaCK Low Packed Single-fp + (interleaves low half of dest with low half of source + as padding in each result field) +*/ +#define unpcklps_m2r(var, reg) sse_m2r(unpcklps, var, reg) +#define unpcklps_r2r(regs, regd) sse_r2r(unpcklps, regs, regd) + + +/* 2-(4x32f) -> 4x32f UNPaCK High Packed Single-fp + (interleaves high half of dest with high half of source + as padding in each result field) +*/ +#define unpckhps_m2r(var, reg) sse_m2r(unpckhps, var, reg) +#define unpckhps_r2r(regs, regd) sse_r2r(unpckhps, regs, regd) + + + +/* Fp and mmX ReSTORe state +*/ 
+#ifdef SSE_TRACE + #define fxrstor(mem) \ + { \ + fprintf(stderr, "fxrstor()\n"); \ + __asm__ __volatile__ ("fxrstor %0" \ + : /* nothing */ \ + : "X" (mem)) \ + } +#else + #define fxrstor(mem) \ + __asm__ __volatile__ ("fxrstor %0" \ + : /* nothing */ \ + : "X" (mem)) +#endif + + +/* Fp and mmX SAVE state +*/ +#ifdef SSE_TRACE + #define fxsave(mem) \ + { \ + fprintf(stderr, "fxsave()\n"); \ + __asm__ __volatile__ ("fxsave %0" \ + : /* nothing */ \ + : "X" (mem)) \ + } +#else + #define fxsave(mem) \ + __asm__ __volatile__ ("fxsave %0" \ + : /* nothing */ \ + : "X" (mem)) +#endif + + +/* STore streaMing simd eXtensions Control/Status Register +*/ +#ifdef SSE_TRACE + #define stmxcsr(mem) \ + { \ + fprintf(stderr, "stmxcsr()\n"); \ + __asm__ __volatile__ ("stmxcsr %0" \ + : /* nothing */ \ + : "X" (mem)) \ + } +#else + #define stmxcsr(mem) \ + __asm__ __volatile__ ("stmxcsr %0" \ + : /* nothing */ \ + : "X" (mem)) +#endif + + +/* LoaD streaMing simd eXtensions Control/Status Register +*/ +#ifdef SSE_TRACE + #define ldmxcsr(mem) \ + { \ + fprintf(stderr, "ldmxcsr()\n"); \ + __asm__ __volatile__ ("ldmxcsr %0" \ + : /* nothing */ \ + : "X" (mem)) \ + } +#else + #define ldmxcsr(mem) \ + __asm__ __volatile__ ("ldmxcsr %0" \ + : /* nothing */ \ + : "X" (mem)) +#endif + + +/* Store FENCE - enforce ordering of stores before fence vs. stores + occuring after fence in source code. +*/ +#ifdef SSE_TRACE + #define sfence() \ + { \ + fprintf(stderr, "sfence()\n"); \ + __asm__ __volatile__ ("sfence\n\t") \ + } +#else + #define sfence() \ + __asm__ __volatile__ ("sfence\n\t") +#endif + + +/* PREFETCH data using T0, T1, T2, or NTA hint + T0 = Prefetch into all cache levels + T1 = Prefetch into all cache levels except 0th level + T2 = Prefetch into all cache levels except 0th and 1st levels + NTA = Prefetch data into non-temporal cache structure +*/ +#ifdef SSE_TRACE +#else + #define prefetch(mem, hint) \ + __asm__ __volatile__ ("prefetch" #hint " %0" \ + : /* nothing */ \ + : "X" (mem)) + + #define prefetcht0(mem) prefetch(mem, t0) + #define prefetcht1(mem) prefetch(mem, t1) + #define prefetcht2(mem) prefetch(mem, t2) + #define prefetchnta(mem) prefetch(mem, nta) +#endif + + + +#endif diff --git a/gst/deinterlace2/tvtime/tomsmocomp.c b/gst/deinterlace2/tvtime/tomsmocomp.c new file mode 100644 index 00000000..f0b73677 --- /dev/null +++ b/gst/deinterlace2/tvtime/tomsmocomp.c @@ -0,0 +1,187 @@ +/** + * Copyright (C) 2004 Billy Biggs <vektor@dumbterm.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <string.h> + +#include "gst/gst.h" +#include "gstdeinterlace2.h" +#include "plugins.h" +#include "speedy.h" + +#include "tomsmocomp.h" +#include "tomsmocomp/tomsmocompmacros.h" +#include "x86-64_macros.inc" + + +#define SearchEffortDefault 5 +#define UseStrangeBobDefault 0 + +long SearchEffort; + +int UseStrangeBob; + +MEMCPY_FUNC *pMyMemcpy; + +int IsOdd; + +const unsigned char *pWeaveSrc; + +const unsigned char *pWeaveSrcP; + +unsigned char *pWeaveDest; + +const unsigned char *pCopySrc; + +const unsigned char *pCopySrcP; + +unsigned char *pCopyDest; + +int src_pitch; + +int dst_pitch; + +int rowsize; + +int height; + +int FldHeight; + +int +Fieldcopy (void *dest, const void *src, size_t count, + int rows, int dst_pitch, int src_pitch) +{ + unsigned char *pDest = (unsigned char *) dest; + + unsigned char *pSrc = (unsigned char *) src; + + int i; + + for (i = 0; i < rows; i++) { + pMyMemcpy (pDest, pSrc, count); + pSrc += src_pitch; + pDest += dst_pitch; + } + return 0; +} + + +#define IS_MMX +#define SSE_TYPE MMX +#define FUNCT_NAME tomsmocompDScaler_MMX +#include "tomsmocomp/TomsMoCompAll.inc" +#undef IS_MMX +#undef SSE_TYPE +#undef FUNCT_NAME + +#define IS_3DNOW +#define SSE_TYPE 3DNOW +#define FUNCT_NAME tomsmocompDScaler_3DNOW +#include "tomsmocomp/TomsMoCompAll.inc" +#undef IS_3DNOW +#undef SSE_TYPE +#undef FUNCT_NAME + +#define IS_SSE +#define SSE_TYPE SSE +#define FUNCT_NAME tomsmocompDScaler_SSE +#include "tomsmocomp/TomsMoCompAll.inc" +#undef IS_SSE +#undef SSE_TYPE +#undef FUNCT_NAME + + + +void +deinterlace_frame_di_tomsmocomp (GstDeinterlace2 * object) +{ + if (object->cpu_feature_flags & OIL_IMPL_FLAG_SSE) { + tomsmocomp_filter_sse (object); + } else if (object->cpu_feature_flags & OIL_IMPL_FLAG_3DNOW) { + tomsmocomp_filter_3dnow (object); + } else { + tomsmocomp_filter_mmx (object); + } +} + +static deinterlace_method_t tomsmocompmethod = { + 0, //DEINTERLACE_PLUGIN_API_VERSION, + "Motion Adaptive: Motion Search", + "AdaptiveSearch", + 4, + OIL_IMPL_FLAG_MMX, + 0, + 0, + 0, + 0, + 0, + 0, + deinterlace_frame_di_tomsmocomp, + {"Uses heuristics to detect motion in the input", + "frames and reconstruct image detail where", + "possible. Use this for high quality output", + "even on monitors set to an arbitrary refresh", + "rate.", + "", + "Motion search mode finds and follows motion", + "vectors for accurate interpolation. 
This is", + "the TomsMoComp deinterlacer from DScaler.", + ""} +}; + + + +deinterlace_method_t * +dscaler_tomsmocomp_get_method (void) +{ + tomsmocomp_init (); + return &tomsmocompmethod; +} + + + +void +tomsmocomp_init (void) +{ + SearchEffort = SearchEffortDefault; + UseStrangeBob = UseStrangeBobDefault; +} + +void +tomsmocomp_filter_mmx (GstDeinterlace2 * object) +{ + tomsmocompDScaler_MMX (object); +} + +void +tomsmocomp_filter_3dnow (GstDeinterlace2 * object) +{ + tomsmocompDScaler_3DNOW (object); +} + +void +tomsmocomp_filter_sse (GstDeinterlace2 * object) +{ + tomsmocompDScaler_SSE (object); +} diff --git a/gst/deinterlace2/tvtime/tomsmocomp.h b/gst/deinterlace2/tvtime/tomsmocomp.h new file mode 100644 index 00000000..12127800 --- /dev/null +++ b/gst/deinterlace2/tvtime/tomsmocomp.h @@ -0,0 +1,61 @@ +/** + * Copyright (C) 2004 Billy Biggs <vektor@dumbterm.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef TOMSMOCOMP_H_INCLUDED +#define TOMSMOCOMP_H_INCLUDED + +#include "gstdeinterlace2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int Search_Effort_0(); +int Search_Effort_1(); +int Search_Effort_3(); +int Search_Effort_5(); +int Search_Effort_9(); +int Search_Effort_11(); +int Search_Effort_13(); +int Search_Effort_15(); +int Search_Effort_19(); +int Search_Effort_21(); +int Search_Effort_Max(); + +int Search_Effort_0_SB(); +int Search_Effort_1_SB(); +int Search_Effort_3_SB(); +int Search_Effort_5_SB(); +int Search_Effort_9_SB(); +int Search_Effort_11_SB(); +int Search_Effort_13_SB(); +int Search_Effort_15_SB(); +int Search_Effort_19_SB(); +int Search_Effort_21_SB(); +int Search_Effort_Max_SB(); + +void tomsmocomp_init( void ); +void tomsmocomp_filter_mmx( GstDeinterlace2 *object ); +void tomsmocomp_filter_3dnow( GstDeinterlace2 *object ); +void tomsmocomp_filter_sse( GstDeinterlace2 *object ); + +#ifdef __cplusplus +}; +#endif + +#endif /* TOMSMOCOMP_H_INCLUDED */ diff --git a/gst/deinterlace2/tvtime/vfir.c b/gst/deinterlace2/tvtime/vfir.c new file mode 100644 index 00000000..bb42f5d3 --- /dev/null +++ b/gst/deinterlace2/tvtime/vfir.c @@ -0,0 +1,184 @@ +/* + * + * GStreamer + * Copyright (C) 2004 Billy Biggs <vektor@dumbterm.net> + * Copyright (c) 2001, 2002, 2003 Fabrice Bellard. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/*
+ * This file contains code from ffmpeg, see http://ffmpeg.org/ (LGPL)
+ * and modifications by Billy Biggs.
+ *
+ * Relicensed for GStreamer from GPL to LGPL with permission from Billy Biggs.
+ * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578
+ */
+
+#include <stdio.h>
+#if defined (__SVR4) && defined (__sun)
+# include <sys/int_types.h>
+#else
+# include <stdint.h>
+#endif
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "mmx.h"
+#include "speedy.h"
+#include "gstdeinterlace2.h"
+
+/**
+ * The MPEG2 spec uses a slightly harsher filter; it specifies
+ * [-1 8 2 8 -1]. ffmpeg uses a similar filter but with more of
+ * a tendency to blur than to use the local information. The
+ * filter taps here are: [-1 4 2 4 -1].
+ */
+
+static void
+deinterlace_line (uint8_t * dst, uint8_t * lum_m4,
+    uint8_t * lum_m3, uint8_t * lum_m2,
+    uint8_t * lum_m1, uint8_t * lum, int size)
+{
+#ifdef HAVE_CPU_I386
+  mmx_t rounder;
+
+  rounder.uw[0] = 4;
+  rounder.uw[1] = 4;
+  rounder.uw[2] = 4;
+  rounder.uw[3] = 4;
+  pxor_r2r (mm7, mm7);
+  movq_m2r (rounder, mm6);
+
+  for (; size > 3; size -= 4) {
+    movd_m2r (lum_m4[0], mm0);
+    movd_m2r (lum_m3[0], mm1);
+    movd_m2r (lum_m2[0], mm2);
+    movd_m2r (lum_m1[0], mm3);
+    movd_m2r (lum[0], mm4);
+    punpcklbw_r2r (mm7, mm0);
+    punpcklbw_r2r (mm7, mm1);
+    punpcklbw_r2r (mm7, mm2);
+    punpcklbw_r2r (mm7, mm3);
+    punpcklbw_r2r (mm7, mm4);
+    paddw_r2r (mm3, mm1);
+    psllw_i2r (1, mm2);
+    paddw_r2r (mm4, mm0);
+    psllw_i2r (2, mm1);         // 2
+    paddw_r2r (mm6, mm2);
+    paddw_r2r (mm2, mm1);
+    psubusw_r2r (mm0, mm1);
+    psrlw_i2r (3, mm1);         // 3
+    packuswb_r2r (mm7, mm1);
+    movd_r2m (mm1, dst[0]);
+    lum_m4 += 4;
+    lum_m3 += 4;
+    lum_m2 += 4;
+    lum_m1 += 4;
+    lum += 4;
+    dst += 4;
+  }
+  emms ();
+#else
+  /**
+   * C implementation.
+   */
+  int sum;
+
+  for (; size > 0; size--) {
+    sum = -lum_m4[0];
+    sum += lum_m3[0] << 2;
+    sum += lum_m2[0] << 1;
+    sum += lum_m1[0] << 2;
+    sum += -lum[0];
+    dst[0] = (sum + 4) >> 3;    // This needs to be clipped at 0 and 255: cm[(sum + 4) >> 3];
+    lum_m4++;
+    lum_m3++;
+    lum_m2++;
+    lum_m1++;
+    lum++;
+    dst++;
+  }
+#endif
+}
+
+
+/**
+ * The commented-out method below that uses the bottom_field member is more
+ * like the filter as specified in the MPEG2 spec, but it doesn't seem to
+ * have the desired effect.
+ */ + +static void +deinterlace_scanline_vfir (GstDeinterlace2 * object, + deinterlace_scanline_data_t * data, uint8_t * output) +{ + deinterlace_line (output, data->tt1, data->t0, data->m1, data->b0, data->bb1, + object->frame_width * 2); + // blit_packed422_scanline( output, data->m1, width ); +} + +static void +copy_scanline (GstDeinterlace2 * object, + deinterlace_scanline_data_t * data, uint8_t * output) +{ + blit_packed422_scanline (output, data->m0, object->frame_width); + /* + if( data->bottom_field ) { + deinterlace_line( output, data->tt2, data->t1, data->m2, data->b1, data->bb2, width*2 ); + } else { + deinterlace_line( output, data->tt0, data->t1, data->m0, data->b1, data->bb0, width*2 ); + } + */ +} + + +static deinterlace_method_t vfirmethod = { + 0, //DEINTERLACE_PLUGIN_API_VERSION, + "Blur: Vertical", + "BlurVertical", + 2, +#ifdef HAVE_CPU_I386 + OIL_IMPL_FLAG_MMXEXT, +#else + 0, +#endif + 0, + 0, + 0, + 1, + deinterlace_scanline_vfir, + copy_scanline, + 0, + {"Avoids flicker by blurring consecutive frames", + "of input. Use this if you want to run your", + "monitor at an arbitrary refresh rate and not", + "use much CPU, and are willing to sacrifice", + "detail.", + "", + "Vertical mode blurs favouring the most recent", + "field for less visible trails. From the", + "deinterlacer filter in ffmpeg.", + ""} +}; + +deinterlace_method_t * +dscaler_vfir_get_method (void) +{ + return &vfirmethod; +} diff --git a/gst/deinterlace2/tvtime/x86-64_macros.inc b/gst/deinterlace2/tvtime/x86-64_macros.inc new file mode 100644 index 00000000..2e9df758 --- /dev/null +++ b/gst/deinterlace2/tvtime/x86-64_macros.inc @@ -0,0 +1,82 @@ +/* + * + * GStreamer + * Copyright (C) 2004 Dirk Ziegelmeier <dziegel@gmx.de> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + * + * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578 + */ + +/* + * This file is copied from TVTIME's sources. 
+ * Original author: Achim Schneider <batchall@mordor.ch>
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef XAX
+
+#if defined (HAVE_CPU_I386) && !defined(HAVE_CPU_X86_64)
+
+#define XAX "eax"
+#define XBX "ebx"
+#define XCX "ecx"
+#define XDX "edx"
+#define XSI "esi"
+#define XDI "edi"
+#define XSP "esp"
+#define MOVX "movl"
+#define LEAX "leal"
+#define DECX "decl"
+#define PUSHX "pushl"
+#define POPX "popl"
+#define CMPX "cmpl"
+#define ADDX "addl"
+#define SHLX "shll"
+#define SHRX "shrl"
+#define SUBX "subl"
+
+#elif defined (HAVE_CPU_X86_64)
+
+#define XAX "rax"
+#define XBX "rbx"
+#define XCX "rcx"
+#define XDX "rdx"
+#define XSI "rsi"
+#define XDI "rdi"
+#define XSP "rsp"
+#define MOVX "movq"
+#define LEAX "leaq"
+#define DECX "decq"
+#define PUSHX "pushq"
+#define POPX "popq"
+#define CMPX "cmpq"
+#define ADDX "addq"
+#define SHLX "shlq"
+#define SHRX "shrq"
+#define SUBX "subq"
+
+#else
+#error Undefined architecture. Define either HAVE_CPU_I386 or HAVE_CPU_X86_64.
+#endif
+
+#endif
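
The prefetch macros near the top of this patch (mmx.h) expose the T0/T1/T2/NTA hint levels described in the comment there. As a rough, self-contained sketch of how a non-temporal prefetch is typically used during a streaming copy, the snippet below relies on GCC's __builtin_prefetch rather than the patch's prefetchnta() macro; the function name and the 64-byte stride are illustrative assumptions, not code from the patch.

#include <stddef.h>
#include <string.h>

/* Sketch only: copy a buffer while hinting the next cache line with a
 * non-temporal prefetch, similar in spirit to prefetchnta() above. */
static void
copy_with_prefetch_example (unsigned char *dst, const unsigned char *src,
    size_t len)
{
  size_t i;

  for (i = 0; i + 64 <= len; i += 64) {
    /* rw = 0 (read), locality = 0 (non-temporal), like the NTA hint */
    __builtin_prefetch (src + i + 64, 0, 0);
    memcpy (dst + i, src + i, 64);
  }
  if (i < len)
    memcpy (dst + i, src + i, len - i);
}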
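
The comment in vfir.c above explains that the filter taps are [-1 4 2 4 -1]. A scalar sketch of one output sample may make the weighting and rounding easier to follow; the clamp at the end is an assumption that mirrors the saturation the MMX path gets from psubusw/packuswb (the C path in the patch leaves the value unclipped, as its own comment notes), and the function name is hypothetical.

#include <stdint.h>

/* Sketch only: one output luma sample of the [-1 4 2 4 -1] vertical filter,
 * applied to five vertically adjacent samples (lum_m4 .. lum in the patch's
 * naming). */
static uint8_t
vfir_tap_example (uint8_t m4, uint8_t m3, uint8_t m2, uint8_t m1, uint8_t m0)
{
  int sum = -m4 + (m3 << 2) + (m2 << 1) + (m1 << 2) - m0;

  sum = (sum + 4) >> 3;         /* taps sum to 8: add 4 and shift to round */
  if (sum < 0)
    sum = 0;                    /* clamp to the valid 0..255 range */
  else if (sum > 255)
    sum = 255;
  return (uint8_t) sum;
}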
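
x86-64_macros.inc exists so that one piece of inline assembly can be written once and assembled for both 32-bit and 64-bit x86: XAX..XSP name the registers and MOVX, LEAX, DECX and friends pick the matching instruction mnemonic. The sketch below is a made-up countdown loop, not code from the patch, assuming HAVE_CPU_I386 / HAVE_CPU_X86_64 come from config.h as in the rest of the plugin.

#include <stddef.h>
#include "x86-64_macros.inc"    /* the file added above; x86/x86-64 only */

/* Sketch only: the same asm text becomes eax/movl/decl on i386 and
 * rax/movq/decq on x86-64, because XCX, MOVX and DECX expand accordingly. */
static void
busy_count_example (size_t n)
{
  if (n == 0)
    return;
  __asm__ __volatile__ (
      MOVX " %0, %%" XCX "\n\t"   /* load the counter into ecx/rcx */
      "1:\n\t"
      DECX " %%" XCX "\n\t"       /* count down to zero */
      "jnz 1b\n\t"
      : /* no outputs */
      : "r" (n)
      : XCX, "cc");
}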