diff options
author | Sebastian Dröge <slomo@circular-chaos.org> | 2008-06-24 09:10:46 +0000 |
---|---|---|
committer | Sebastian Dröge <slomo@circular-chaos.org> | 2008-06-24 09:10:46 +0000 |
commit | 6fd4ed3965ecfebdcba954ccb97d9464f3bd12d7 (patch) | |
tree | b8831236c5d74c11990e649ea2c09c74107ade7f /gst/deinterlace2/tvtime | |
parent | d7cca015530caeb2411db65d7a4ed283e60c36a6 (diff) | |
download | gst-plugins-bad-6fd4ed3965ecfebdcba954ccb97d9464f3bd12d7.tar.gz gst-plugins-bad-6fd4ed3965ecfebdcba954ccb97d9464f3bd12d7.tar.bz2 gst-plugins-bad-6fd4ed3965ecfebdcba954ccb97d9464f3bd12d7.zip |
gst/deinterlace2/tvtime/greedy.c: Fix the C implementation to produce correct results and optimize the MMXEXT implementation.
Original commit message from CVS:
* gst/deinterlace2/tvtime/greedy.c:
(deinterlace_greedy_packed422_scanline_c),
(deinterlace_greedy_packed422_scanline_mmxext),
(deinterlace_greedy_packed422_scanline):
Fix the C implementation to produce correct results and optimize the
MMXEXT implementation.
Handle odd widths and don't read over array boundaries in the MMXEXT
implementation.
* gst/deinterlace2/tvtime/vfir.c: (deinterlace_line_c),
(deinterlace_line_mmx), (deinterlace_scanline_vfir):
Fix a small rounding bug in the MMX implementation, the MMX
implementation doesn't actually need MMXEXT instructions so don't mark
it as such.
Handle odd widths in both implementations.
Diffstat (limited to 'gst/deinterlace2/tvtime')
-rw-r--r-- | gst/deinterlace2/tvtime/greedy.c | 158 | ||||
-rw-r--r-- | gst/deinterlace2/tvtime/vfir.c | 77 |
2 files changed, 112 insertions, 123 deletions
diff --git a/gst/deinterlace2/tvtime/greedy.c b/gst/deinterlace2/tvtime/greedy.c index c25af036..66b8799d 100644 --- a/gst/deinterlace2/tvtime/greedy.c +++ b/gst/deinterlace2/tvtime/greedy.c @@ -60,26 +60,65 @@ copy_scanline (GstDeinterlace2 * object, blit_packed422_scanline (output, data->m1, object->frame_width); } -static int GreedyMaxComb = 15; +static const int GreedyMaxComb = 15; -#ifdef HAVE_CPU_I386 -#include "mmx.h" -#include "sse.h" -static void -deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object, - deinterlace_scanline_data_t * data, uint8_t * output) +static inline void +deinterlace_greedy_packed422_scanline_c (GstDeinterlace2 * object, + uint8_t * m0, uint8_t * t1, uint8_t * b1, uint8_t * m2, uint8_t * output, + int width) { - mmx_t MaxComb; + int avg, l2_diff, lp2_diff, max, min, best; + + // L2 == m0 + // L1 == t1 + // L3 == b1 + // LP2 == m2 + + while (width--) { + avg = (*t1 + *b1) / 2; + + l2_diff = ABS (*m0 - avg); + lp2_diff = ABS (*m2 - avg); + + if (l2_diff > lp2_diff) + best = *m2; + else + best = *m0; + + max = MAX (*t1, *b1); + min = MIN (*t1, *b1); + + if (max < 256 - GreedyMaxComb) + max += GreedyMaxComb; + else + max = 255; - uint8_t *m0 = data->m0; + if (min > GreedyMaxComb) + min -= GreedyMaxComb; + else + min = 0; - uint8_t *t1 = data->t1; + *output = CLAMP (best, min, max); - uint8_t *b1 = data->b1; + // Advance to the next set of pixels. + output += 1; + m0 += 1; + t1 += 1; + b1 += 1; + m2 += 1; + } +} - uint8_t *m2 = data->m2; +#ifdef HAVE_CPU_I386 +#include "mmx.h" +#include "sse.h" - int width = object->frame_width; +static void +deinterlace_greedy_packed422_scanline_mmxext (GstDeinterlace2 * object, + uint8_t * m0, uint8_t * t1, uint8_t * b1, uint8_t * m2, uint8_t * output, + int width) +{ + mmx_t MaxComb; // How badly do we let it weave? 
0-255 MaxComb.ub[0] = GreedyMaxComb; @@ -96,8 +135,7 @@ deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object, // L3 == b1 // LP2 == m2 - width /= 4; - while (width--) { + for (; width > 7; width -= 8) { movq_m2r (*t1, mm1); // L1 movq_m2r (*m0, mm2); // L2 movq_m2r (*b1, mm3); // L3 @@ -107,15 +145,12 @@ deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object, movq_r2r (mm1, mm4); // L1 pavgb_r2r (mm3, mm4); // (L1 + L3)/2 - // get abs value of possible L2 comb movq_r2r (mm2, mm7); // L2 psubusb_r2r (mm4, mm7); // L2 - avg movq_r2r (mm4, mm5); // avg psubusb_r2r (mm2, mm5); // avg - L2 por_r2r (mm7, mm5); // abs(avg-L2) - movq_r2r (mm4, mm6); // copy of avg for later - // get abs value of possible LP2 comb movq_r2r (mm0, mm7); // LP2 @@ -125,7 +160,7 @@ deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object, // use L2 or LP2 depending upon which makes smaller comb psubusb_r2r (mm5, mm4); // see if it goes to zero - psubusb_r2r (mm5, mm5); // 0 + pxor_r2r (mm5, mm5); // 0 pcmpeqb_r2r (mm5, mm4); // if (mm4=0) then FF else 0 pcmpeqb_r2r (mm4, mm5); // opposite of mm4 @@ -140,27 +175,19 @@ deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object, // detail than a boring oversmoothed clip. movq_r2r (mm1, mm2); // copy L1 - psubusb_r2r (mm3, mm2); // - L3, with saturation - paddusb_r2r (mm3, mm2); // now = Max(L1,L3) + pmaxub_r2r (mm3, mm2); // now = Max(L1,L3) - pcmpeqb_r2r (mm7, mm7); // all ffffffff - psubusb_r2r (mm1, mm7); // - L1 - paddusb_r2r (mm7, mm3); // add, may sat at fff.. 
- psubusb_r2r (mm7, mm3); // now = Min(L1,L3) + pminub_r2r (mm1, mm3); // now = Min(L1,L3) // allow the value to be above the high or below the low by amt of MaxComb paddusb_m2r (MaxComb, mm2); // increase max by diff psubusb_m2r (MaxComb, mm3); // lower min by diff - psubusb_r2r (mm3, mm4); // best - Min - paddusb_r2r (mm3, mm4); // now = Max(best,Min(L1,L3) - pcmpeqb_r2r (mm7, mm7); // all ffffffff - psubusb_r2r (mm4, mm7); // - Max(best,Min(best,L3) - paddusb_r2r (mm7, mm2); // add may sat at FFF.. - psubusb_r2r (mm7, mm2); // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped + pmaxub_r2r (mm3, mm4); // now = Max(best,Min(L1,L3) + pminub_r2r (mm4, mm2); // now = Min( Max(best, Min(L1,L3)), L2 )=L2 clipped - movntq_r2m (mm2, *output); // move in our clipped best + movq_r2m (mm2, *output); // move in our clipped best // Advance to the next set of pixels. output += 8; @@ -171,72 +198,29 @@ deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object, } sfence (); emms (); -} -#endif - -static void -deinterlace_greedy_packed422_scanline_c (GstDeinterlace2 * object, - deinterlace_scanline_data_t * data, uint8_t * output) -{ - uint8_t *m0 = data->m0; - - uint8_t *t1 = data->t1; - - uint8_t *b1 = data->b1; - - uint8_t *m2 = data->m2; - - int width = 2 * object->frame_width; - - uint16_t avg, l2_diff, lp2_diff, max, min, best; - - // L2 == m0 - // L1 == t1 - // L3 == b1 - // LP2 == m2 - - while (width--) { - avg = (*t1 + *b1) / 2; - - l2_diff = ABS (*m0 - avg); - lp2_diff = ABS (*m2 - avg); - - if (l2_diff > lp2_diff) - best = *m2; - else - best = *m0; - - max = MAX (*t1, *b1); - min = MIN (*t1, *b1); - - if (max < 256 - GreedyMaxComb) - max += GreedyMaxComb; - if (min > GreedyMaxComb) - min -= GreedyMaxComb; - - *output = MIN (MAX (best, min), max); - // Advance to the next set of pixels. 
- output += 1; - m0 += 1; - t1 += 1; - b1 += 1; - m2 += 1; - } + if (width > 0) + deinterlace_greedy_packed422_scanline_c (object, m0, t1, b1, m2, output, + width); } +#endif + static void deinterlace_greedy_packed422_scanline (GstDeinterlace2 * object, deinterlace_scanline_data_t * data, uint8_t * output) { #ifdef HAVE_CPU_I386 - if (object->cpu_feature_flags & OIL_IMPL_FLAG_SSE) { - deinterlace_greedy_packed422_scanline_sse (object, data, output); + if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) { + deinterlace_greedy_packed422_scanline_mmxext (object, data->m0, data->t1, + data->b1, data->m2, output, 2 * object->frame_width); } else { - deinterlace_greedy_packed422_scanline_c (object, data, output); + deinterlace_greedy_packed422_scanline_c (object, data->m0, data->t1, + data->b1, data->m2, output, 2 * object->frame_width); } #else - deinterlace_greedy_packed422_scanline_c (object, data, output); + deinterlace_greedy_packed422_scanline_c (object, data->m0, data->t1, data->b1, + data->m2, output, 2 * object->frame_width); #endif } diff --git a/gst/deinterlace2/tvtime/vfir.c b/gst/deinterlace2/tvtime/vfir.c index f32be654..479ee440 100644 --- a/gst/deinterlace2/tvtime/vfir.c +++ b/gst/deinterlace2/tvtime/vfir.c @@ -49,10 +49,36 @@ * filter taps here are: [-1 4 2 4 -1]. */ +/** + * C implementation. 
+ */ +static inline void +deinterlace_line_c (uint8_t * dst, uint8_t * lum_m4, + uint8_t * lum_m3, uint8_t * lum_m2, + uint8_t * lum_m1, uint8_t * lum, int size) +{ + int sum; + + for (; size > 0; size--) { + sum = -lum_m4[0]; + sum += lum_m3[0] << 2; + sum += lum_m2[0] << 1; + sum += lum_m1[0] << 2; + sum += -lum[0]; + dst[0] = (sum + 4) >> 3; // This needs to be clipped at 0 and 255: cm[(sum + 4) >> 3]; + lum_m4++; + lum_m3++; + lum_m2++; + lum_m1++; + lum++; + dst++; + } +} + #ifdef HAVE_CPU_I386 #include "mmx.h" static void -deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4, +deinterlace_line_mmx (uint8_t * dst, uint8_t * lum_m4, uint8_t * lum_m3, uint8_t * lum_m2, uint8_t * lum_m1, uint8_t * lum, int size) { @@ -63,14 +89,15 @@ deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4, rounder.uw[2] = 4; rounder.uw[3] = 4; pxor_r2r (mm7, mm7); - movq_m2r (rounder, mm6); + movd_m2r (rounder, mm6); + punpcklbw_r2r (mm7, mm6); for (; size > 3; size -= 4) { - movd_m2r (lum_m4[0], mm0); - movd_m2r (lum_m3[0], mm1); - movd_m2r (lum_m2[0], mm2); - movd_m2r (lum_m1[0], mm3); - movd_m2r (lum[0], mm4); + movd_m2r (*lum_m4, mm0); + movd_m2r (*lum_m3, mm1); + movd_m2r (*lum_m2, mm2); + movd_m2r (*lum_m1, mm3); + movd_m2r (*lum, mm4); punpcklbw_r2r (mm7, mm0); punpcklbw_r2r (mm7, mm1); punpcklbw_r2r (mm7, mm2); @@ -85,7 +112,7 @@ deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4, psubusw_r2r (mm0, mm1); psrlw_i2r (3, mm1); // 3 packuswb_r2r (mm7, mm1); - movd_r2m (mm1, dst[0]); + movd_r2m (mm1, *dst); lum_m4 += 4; lum_m3 += 4; lum_m2 += 4; @@ -94,34 +121,12 @@ deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4, dst += 4; } emms (); -} -#endif -/** - * C implementation. 
- */ -static void -deinterlace_line_c (uint8_t * dst, uint8_t * lum_m4, - uint8_t * lum_m3, uint8_t * lum_m2, - uint8_t * lum_m1, uint8_t * lum, int size) -{ - int sum; - - for (; size > 0; size--) { - sum = -lum_m4[0]; - sum += lum_m3[0] << 2; - sum += lum_m2[0] << 1; - sum += lum_m1[0] << 2; - sum += -lum[0]; - dst[0] = (sum + 4) >> 3; // This needs to be clipped at 0 and 255: cm[(sum + 4) >> 3]; - lum_m4++; - lum_m3++; - lum_m2++; - lum_m1++; - lum++; - dst++; - } + /* Handle odd widths */ + if (size > 0) + deinterlace_line_c (dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size); } +#endif /* * The commented-out method below that uses the bottom_field member is more @@ -134,8 +139,8 @@ deinterlace_scanline_vfir (GstDeinterlace2 * object, deinterlace_scanline_data_t * data, uint8_t * output) { #ifdef HAVE_CPU_I386 - if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) { - deinterlace_line_mmxext (output, data->tt1, data->t0, data->m1, data->b0, + if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMX) { + deinterlace_line_mmx (output, data->tt1, data->t0, data->m1, data->b0, data->bb1, object->frame_width * 2); } else { deinterlace_line_c (output, data->tt1, data->t0, data->m1, data->b0, |