summaryrefslogtreecommitdiffstats
path: root/gst
diff options
context:
space:
mode:
authorSebastian Dröge <slomo@circular-chaos.org>2008-06-24 09:10:46 +0000
committerSebastian Dröge <slomo@circular-chaos.org>2008-06-24 09:10:46 +0000
commit6fd4ed3965ecfebdcba954ccb97d9464f3bd12d7 (patch)
treeb8831236c5d74c11990e649ea2c09c74107ade7f /gst
parentd7cca015530caeb2411db65d7a4ed283e60c36a6 (diff)
downloadgst-plugins-bad-6fd4ed3965ecfebdcba954ccb97d9464f3bd12d7.tar.gz
gst-plugins-bad-6fd4ed3965ecfebdcba954ccb97d9464f3bd12d7.tar.bz2
gst-plugins-bad-6fd4ed3965ecfebdcba954ccb97d9464f3bd12d7.zip
gst/deinterlace2/tvtime/greedy.c: Fix the C implementation to produce correct results and optimize the
Original commit message from CVS: * gst/deinterlace2/tvtime/greedy.c: (deinterlace_greedy_packed422_scanline_c), (deinterlace_greedy_packed422_scanline_mmxext), (deinterlace_greedy_packed422_scanline): Fix the C implementation to produce correct results and optimize the MMXEXT implementation. Handle odd widths and don't read over array boundaries in the MMXEXT implementation. * gst/deinterlace2/tvtime/vfir.c: (deinterlace_line_c), (deinterlace_line_mmx), (deinterlace_scanline_vfir): Fix a small rounding bug in the MMX implementation, the MMX implementation doesn't actually need MMXEXT instructions so don't mark it as such. Handle odd widths in both implementations.
Diffstat (limited to 'gst')
-rw-r--r--gst/deinterlace2/tvtime/greedy.c158
-rw-r--r--gst/deinterlace2/tvtime/vfir.c77
2 files changed, 112 insertions, 123 deletions
diff --git a/gst/deinterlace2/tvtime/greedy.c b/gst/deinterlace2/tvtime/greedy.c
index c25af036..66b8799d 100644
--- a/gst/deinterlace2/tvtime/greedy.c
+++ b/gst/deinterlace2/tvtime/greedy.c
@@ -60,26 +60,65 @@ copy_scanline (GstDeinterlace2 * object,
blit_packed422_scanline (output, data->m1, object->frame_width);
}
-static int GreedyMaxComb = 15;
+static const int GreedyMaxComb = 15;
-#ifdef HAVE_CPU_I386
-#include "mmx.h"
-#include "sse.h"
-static void
-deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object,
- deinterlace_scanline_data_t * data, uint8_t * output)
+static inline void
+deinterlace_greedy_packed422_scanline_c (GstDeinterlace2 * object,
+ uint8_t * m0, uint8_t * t1, uint8_t * b1, uint8_t * m2, uint8_t * output,
+ int width)
{
- mmx_t MaxComb;
+ int avg, l2_diff, lp2_diff, max, min, best;
+
+ // L2 == m0
+ // L1 == t1
+ // L3 == b1
+ // LP2 == m2
+
+ while (width--) {
+ avg = (*t1 + *b1) / 2;
+
+ l2_diff = ABS (*m0 - avg);
+ lp2_diff = ABS (*m2 - avg);
+
+ if (l2_diff > lp2_diff)
+ best = *m2;
+ else
+ best = *m0;
+
+ max = MAX (*t1, *b1);
+ min = MIN (*t1, *b1);
+
+ if (max < 256 - GreedyMaxComb)
+ max += GreedyMaxComb;
+ else
+ max = 255;
- uint8_t *m0 = data->m0;
+ if (min > GreedyMaxComb)
+ min -= GreedyMaxComb;
+ else
+ min = 0;
- uint8_t *t1 = data->t1;
+ *output = CLAMP (best, min, max);
- uint8_t *b1 = data->b1;
+ // Advance to the next set of pixels.
+ output += 1;
+ m0 += 1;
+ t1 += 1;
+ b1 += 1;
+ m2 += 1;
+ }
+}
- uint8_t *m2 = data->m2;
+#ifdef HAVE_CPU_I386
+#include "mmx.h"
+#include "sse.h"
- int width = object->frame_width;
+static void
+deinterlace_greedy_packed422_scanline_mmxext (GstDeinterlace2 * object,
+ uint8_t * m0, uint8_t * t1, uint8_t * b1, uint8_t * m2, uint8_t * output,
+ int width)
+{
+ mmx_t MaxComb;
// How badly do we let it weave? 0-255
MaxComb.ub[0] = GreedyMaxComb;
@@ -96,8 +135,7 @@ deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object,
// L3 == b1
// LP2 == m2
- width /= 4;
- while (width--) {
+ for (; width > 7; width -= 8) {
movq_m2r (*t1, mm1); // L1
movq_m2r (*m0, mm2); // L2
movq_m2r (*b1, mm3); // L3
@@ -107,15 +145,12 @@ deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object,
movq_r2r (mm1, mm4); // L1
pavgb_r2r (mm3, mm4); // (L1 + L3)/2
-
// get abs value of possible L2 comb
movq_r2r (mm2, mm7); // L2
psubusb_r2r (mm4, mm7); // L2 - avg
movq_r2r (mm4, mm5); // avg
psubusb_r2r (mm2, mm5); // avg - L2
por_r2r (mm7, mm5); // abs(avg-L2)
- movq_r2r (mm4, mm6); // copy of avg for later
-
// get abs value of possible LP2 comb
movq_r2r (mm0, mm7); // LP2
@@ -125,7 +160,7 @@ deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object,
// use L2 or LP2 depending upon which makes smaller comb
psubusb_r2r (mm5, mm4); // see if it goes to zero
- psubusb_r2r (mm5, mm5); // 0
+ pxor_r2r (mm5, mm5); // 0
pcmpeqb_r2r (mm5, mm4); // if (mm4=0) then FF else 0
pcmpeqb_r2r (mm4, mm5); // opposite of mm4
@@ -140,27 +175,19 @@ deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object,
// detail than a boring oversmoothed clip.
movq_r2r (mm1, mm2); // copy L1
- psubusb_r2r (mm3, mm2); // - L3, with saturation
- paddusb_r2r (mm3, mm2); // now = Max(L1,L3)
+ pmaxub_r2r (mm3, mm2); // now = Max(L1,L3)
- pcmpeqb_r2r (mm7, mm7); // all ffffffff
- psubusb_r2r (mm1, mm7); // - L1
- paddusb_r2r (mm7, mm3); // add, may sat at fff..
- psubusb_r2r (mm7, mm3); // now = Min(L1,L3)
+ pminub_r2r (mm1, mm3); // now = Min(L1,L3)
// allow the value to be above the high or below the low by amt of MaxComb
paddusb_m2r (MaxComb, mm2); // increase max by diff
psubusb_m2r (MaxComb, mm3); // lower min by diff
- psubusb_r2r (mm3, mm4); // best - Min
- paddusb_r2r (mm3, mm4); // now = Max(best,Min(L1,L3)
- pcmpeqb_r2r (mm7, mm7); // all ffffffff
- psubusb_r2r (mm4, mm7); // - Max(best,Min(best,L3)
- paddusb_r2r (mm7, mm2); // add may sat at FFF..
- psubusb_r2r (mm7, mm2); // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped
+ pmaxub_r2r (mm3, mm4); // now = Max(best,Min(L1,L3)
+ pminub_r2r (mm4, mm2); // now = Min( Max(best, Min(L1,L3)), L2 )=L2 clipped
- movntq_r2m (mm2, *output); // move in our clipped best
+ movq_r2m (mm2, *output); // move in our clipped best
// Advance to the next set of pixels.
output += 8;
@@ -171,72 +198,29 @@ deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object,
}
sfence ();
emms ();
-}
-#endif
-
-static void
-deinterlace_greedy_packed422_scanline_c (GstDeinterlace2 * object,
- deinterlace_scanline_data_t * data, uint8_t * output)
-{
- uint8_t *m0 = data->m0;
-
- uint8_t *t1 = data->t1;
-
- uint8_t *b1 = data->b1;
-
- uint8_t *m2 = data->m2;
-
- int width = 2 * object->frame_width;
-
- uint16_t avg, l2_diff, lp2_diff, max, min, best;
-
- // L2 == m0
- // L1 == t1
- // L3 == b1
- // LP2 == m2
-
- while (width--) {
- avg = (*t1 + *b1) / 2;
-
- l2_diff = ABS (*m0 - avg);
- lp2_diff = ABS (*m2 - avg);
-
- if (l2_diff > lp2_diff)
- best = *m2;
- else
- best = *m0;
-
- max = MAX (*t1, *b1);
- min = MIN (*t1, *b1);
-
- if (max < 256 - GreedyMaxComb)
- max += GreedyMaxComb;
- if (min > GreedyMaxComb)
- min -= GreedyMaxComb;
-
- *output = MIN (MAX (best, min), max);
- // Advance to the next set of pixels.
- output += 1;
- m0 += 1;
- t1 += 1;
- b1 += 1;
- m2 += 1;
- }
+ if (width > 0)
+ deinterlace_greedy_packed422_scanline_c (object, m0, t1, b1, m2, output,
+ width);
}
+#endif
+
static void
deinterlace_greedy_packed422_scanline (GstDeinterlace2 * object,
deinterlace_scanline_data_t * data, uint8_t * output)
{
#ifdef HAVE_CPU_I386
- if (object->cpu_feature_flags & OIL_IMPL_FLAG_SSE) {
- deinterlace_greedy_packed422_scanline_sse (object, data, output);
+ if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) {
+ deinterlace_greedy_packed422_scanline_mmxext (object, data->m0, data->t1,
+ data->b1, data->m2, output, 2 * object->frame_width);
} else {
- deinterlace_greedy_packed422_scanline_c (object, data, output);
+ deinterlace_greedy_packed422_scanline_c (object, data->m0, data->t1,
+ data->b1, data->m2, output, 2 * object->frame_width);
}
#else
- deinterlace_greedy_packed422_scanline_c (object, data, output);
+ deinterlace_greedy_packed422_scanline_c (object, data->m0, data->t1, data->b1,
+ data->m2, output, 2 * object->frame_width);
#endif
}
diff --git a/gst/deinterlace2/tvtime/vfir.c b/gst/deinterlace2/tvtime/vfir.c
index f32be654..479ee440 100644
--- a/gst/deinterlace2/tvtime/vfir.c
+++ b/gst/deinterlace2/tvtime/vfir.c
@@ -49,10 +49,36 @@
* filter taps here are: [-1 4 2 4 -1].
*/
+/**
+ * C implementation.
+ */
+static inline void
+deinterlace_line_c (uint8_t * dst, uint8_t * lum_m4,
+ uint8_t * lum_m3, uint8_t * lum_m2,
+ uint8_t * lum_m1, uint8_t * lum, int size)
+{
+ int sum;
+
+ for (; size > 0; size--) {
+ sum = -lum_m4[0];
+ sum += lum_m3[0] << 2;
+ sum += lum_m2[0] << 1;
+ sum += lum_m1[0] << 2;
+ sum += -lum[0];
+ dst[0] = (sum + 4) >> 3; // This needs to be clipped at 0 and 255: cm[(sum + 4) >> 3];
+ lum_m4++;
+ lum_m3++;
+ lum_m2++;
+ lum_m1++;
+ lum++;
+ dst++;
+ }
+}
+
#ifdef HAVE_CPU_I386
#include "mmx.h"
static void
-deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4,
+deinterlace_line_mmx (uint8_t * dst, uint8_t * lum_m4,
uint8_t * lum_m3, uint8_t * lum_m2,
uint8_t * lum_m1, uint8_t * lum, int size)
{
@@ -63,14 +89,15 @@ deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4,
rounder.uw[2] = 4;
rounder.uw[3] = 4;
pxor_r2r (mm7, mm7);
- movq_m2r (rounder, mm6);
+ movd_m2r (rounder, mm6);
+ punpcklbw_r2r (mm7, mm6);
for (; size > 3; size -= 4) {
- movd_m2r (lum_m4[0], mm0);
- movd_m2r (lum_m3[0], mm1);
- movd_m2r (lum_m2[0], mm2);
- movd_m2r (lum_m1[0], mm3);
- movd_m2r (lum[0], mm4);
+ movd_m2r (*lum_m4, mm0);
+ movd_m2r (*lum_m3, mm1);
+ movd_m2r (*lum_m2, mm2);
+ movd_m2r (*lum_m1, mm3);
+ movd_m2r (*lum, mm4);
punpcklbw_r2r (mm7, mm0);
punpcklbw_r2r (mm7, mm1);
punpcklbw_r2r (mm7, mm2);
@@ -85,7 +112,7 @@ deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4,
psubusw_r2r (mm0, mm1);
psrlw_i2r (3, mm1); // 3
packuswb_r2r (mm7, mm1);
- movd_r2m (mm1, dst[0]);
+ movd_r2m (mm1, *dst);
lum_m4 += 4;
lum_m3 += 4;
lum_m2 += 4;
@@ -94,34 +121,12 @@ deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4,
dst += 4;
}
emms ();
-}
-#endif
-/**
- * C implementation.
- */
-static void
-deinterlace_line_c (uint8_t * dst, uint8_t * lum_m4,
- uint8_t * lum_m3, uint8_t * lum_m2,
- uint8_t * lum_m1, uint8_t * lum, int size)
-{
- int sum;
-
- for (; size > 0; size--) {
- sum = -lum_m4[0];
- sum += lum_m3[0] << 2;
- sum += lum_m2[0] << 1;
- sum += lum_m1[0] << 2;
- sum += -lum[0];
- dst[0] = (sum + 4) >> 3; // This needs to be clipped at 0 and 255: cm[(sum + 4) >> 3];
- lum_m4++;
- lum_m3++;
- lum_m2++;
- lum_m1++;
- lum++;
- dst++;
- }
+ /* Handle odd widths */
+ if (size > 0)
+ deinterlace_line_c (dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size);
}
+#endif
/*
* The commented-out method below that uses the bottom_field member is more
@@ -134,8 +139,8 @@ deinterlace_scanline_vfir (GstDeinterlace2 * object,
deinterlace_scanline_data_t * data, uint8_t * output)
{
#ifdef HAVE_CPU_I386
- if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) {
- deinterlace_line_mmxext (output, data->tt1, data->t0, data->m1, data->b0,
+ if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMX) {
+ deinterlace_line_mmx (output, data->tt1, data->t0, data->m1, data->b0,
data->bb1, object->frame_width * 2);
} else {
deinterlace_line_c (output, data->tt1, data->t0, data->m1, data->b0,