From 6fd4ed3965ecfebdcba954ccb97d9464f3bd12d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20Dr=C3=B6ge?= Date: Tue, 24 Jun 2008 09:10:46 +0000 Subject: gst/deinterlace2/tvtime/greedy.c: Fix the C implementation to produce correct results and optimize the Original commit message from CVS: * gst/deinterlace2/tvtime/greedy.c: (deinterlace_greedy_packed422_scanline_c), (deinterlace_greedy_packed422_scanline_mmxext), (deinterlace_greedy_packed422_scanline): Fix the C implementation to produce correct results and optimize the MMXEXT implementation. Handle odd widths and don't read over array boundaries in the MMXEXT implementation. * gst/deinterlace2/tvtime/vfir.c: (deinterlace_line_c), (deinterlace_line_mmx), (deinterlace_scanline_vfir): Fix a small rounding bug in the MMX implementation, the MMX implementation doesn't actually need MMXEXT instructions so don't mark it as such. Handle odd widths in both implementations. --- gst/deinterlace2/tvtime/vfir.c | 77 ++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 36 deletions(-) (limited to 'gst/deinterlace2/tvtime/vfir.c') diff --git a/gst/deinterlace2/tvtime/vfir.c b/gst/deinterlace2/tvtime/vfir.c index f32be654..479ee440 100644 --- a/gst/deinterlace2/tvtime/vfir.c +++ b/gst/deinterlace2/tvtime/vfir.c @@ -49,10 +49,36 @@ * filter taps here are: [-1 4 2 4 -1]. */ +/** + * C implementation. + */ +static inline void +deinterlace_line_c (uint8_t * dst, uint8_t * lum_m4, + uint8_t * lum_m3, uint8_t * lum_m2, + uint8_t * lum_m1, uint8_t * lum, int size) +{ + int sum; + + for (; size > 0; size--) { + sum = -lum_m4[0]; + sum += lum_m3[0] << 2; + sum += lum_m2[0] << 1; + sum += lum_m1[0] << 2; + sum += -lum[0]; + dst[0] = (sum + 4) >> 3; // This needs to be clipped at 0 and 255: cm[(sum + 4) >> 3]; + lum_m4++; + lum_m3++; + lum_m2++; + lum_m1++; + lum++; + dst++; + } +} + #ifdef HAVE_CPU_I386 #include "mmx.h" static void -deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4, +deinterlace_line_mmx (uint8_t * dst, uint8_t * lum_m4, uint8_t * lum_m3, uint8_t * lum_m2, uint8_t * lum_m1, uint8_t * lum, int size) { @@ -63,14 +89,15 @@ deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4, rounder.uw[2] = 4; rounder.uw[3] = 4; pxor_r2r (mm7, mm7); - movq_m2r (rounder, mm6); + movd_m2r (rounder, mm6); + punpcklbw_r2r (mm7, mm6); for (; size > 3; size -= 4) { - movd_m2r (lum_m4[0], mm0); - movd_m2r (lum_m3[0], mm1); - movd_m2r (lum_m2[0], mm2); - movd_m2r (lum_m1[0], mm3); - movd_m2r (lum[0], mm4); + movd_m2r (*lum_m4, mm0); + movd_m2r (*lum_m3, mm1); + movd_m2r (*lum_m2, mm2); + movd_m2r (*lum_m1, mm3); + movd_m2r (*lum, mm4); punpcklbw_r2r (mm7, mm0); punpcklbw_r2r (mm7, mm1); punpcklbw_r2r (mm7, mm2); @@ -85,7 +112,7 @@ deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4, psubusw_r2r (mm0, mm1); psrlw_i2r (3, mm1); // 3 packuswb_r2r (mm7, mm1); - movd_r2m (mm1, dst[0]); + movd_r2m (mm1, *dst); lum_m4 += 4; lum_m3 += 4; lum_m2 += 4; @@ -94,34 +121,12 @@ deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4, dst += 4; } emms (); -} -#endif -/** - * C implementation. - */ -static void -deinterlace_line_c (uint8_t * dst, uint8_t * lum_m4, - uint8_t * lum_m3, uint8_t * lum_m2, - uint8_t * lum_m1, uint8_t * lum, int size) -{ - int sum; - - for (; size > 0; size--) { - sum = -lum_m4[0]; - sum += lum_m3[0] << 2; - sum += lum_m2[0] << 1; - sum += lum_m1[0] << 2; - sum += -lum[0]; - dst[0] = (sum + 4) >> 3; // This needs to be clipped at 0 and 255: cm[(sum + 4) >> 3]; - lum_m4++; - lum_m3++; - lum_m2++; - lum_m1++; - lum++; - dst++; - } + /* Handle odd widths */ + if (size > 0) + deinterlace_line_c (dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size); } +#endif /* * The commented-out method below that uses the bottom_field member is more @@ -134,8 +139,8 @@ deinterlace_scanline_vfir (GstDeinterlace2 * object, deinterlace_scanline_data_t * data, uint8_t * output) { #ifdef HAVE_CPU_I386 - if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) { - deinterlace_line_mmxext (output, data->tt1, data->t0, data->m1, data->b0, + if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMX) { + deinterlace_line_mmx (output, data->tt1, data->t0, data->m1, data->b0, data->bb1, object->frame_width * 2); } else { deinterlace_line_c (output, data->tt1, data->t0, data->m1, data->b0, -- cgit v1.2.1