/*
 *
 * GStreamer
 * Copyright (c) 2000 Tom Barry  All rights reserved.
 * mmx.h port copyright (c) 2002 Billy Biggs <vektor@dumbterm.net>.
 *
 * Copyright (C) 2008 Sebastian Dröge <slomo@collabora.co.uk>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

/*
 * Relicensed for GStreamer from GPL to LGPL with permit from Tom Barry
 * and Billy Biggs.
 * See: http://bugzilla.gnome.org/show_bug.cgi?id=163578
 */

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include "_stdint.h"

#include "gstdeinterlace2.h"
#include <string.h>

/* GObject type-check and cast helper macros for GstDeinterlaceMethodGreedyL
 * and its class structure. */
#define GST_TYPE_DEINTERLACE_METHOD_GREEDY_L	(gst_deinterlace_method_greedy_l_get_type ())
#define GST_IS_DEINTERLACE_METHOD_GREEDY_L(obj)		(G_TYPE_CHECK_INSTANCE_TYPE ((obj), GST_TYPE_DEINTERLACE_METHOD_GREEDY_L))
#define GST_IS_DEINTERLACE_METHOD_GREEDY_L_CLASS(klass)	(G_TYPE_CHECK_CLASS_TYPE ((klass), GST_TYPE_DEINTERLACE_METHOD_GREEDY_L))
#define GST_DEINTERLACE_METHOD_GREEDY_L_GET_CLASS(obj)	(G_TYPE_INSTANCE_GET_CLASS ((obj), GST_TYPE_DEINTERLACE_METHOD_GREEDY_L, GstDeinterlaceMethodGreedyLClass))
#define GST_DEINTERLACE_METHOD_GREEDY_L(obj)		(G_TYPE_CHECK_INSTANCE_CAST ((obj), GST_TYPE_DEINTERLACE_METHOD_GREEDY_L, GstDeinterlaceMethodGreedyL))
#define GST_DEINTERLACE_METHOD_GREEDY_L_CLASS(klass)	(G_TYPE_CHECK_CLASS_CAST ((klass), GST_TYPE_DEINTERLACE_METHOD_GREEDY_L, GstDeinterlaceMethodGreedyLClass))
/* Unchecked cast variant: plain C cast without a runtime type check. */
#define GST_DEINTERLACE_METHOD_GREEDY_L_CAST(obj)	((GstDeinterlaceMethodGreedyL*)(obj))

/* Type accessor used by GST_TYPE_DEINTERLACE_METHOD_GREEDY_L. */
GType gst_deinterlace_method_greedy_l_get_type (void);

/* Instance structure of the greedy deinterlace method. */
typedef struct
{
  GstDeinterlaceMethod parent;

  /* How far the chosen weave pixel may lie outside the range spanned by the
   * pixels directly above and below it. Exposed as the "max-comb" property
   * (range 0-255, default 15). */
  guint max_comb;
} GstDeinterlaceMethodGreedyL;

/* Class structure; holds the scanline implementation selected in class_init. */
typedef struct
{
  GstDeinterlaceMethodClass parent_class;
  /* Produces one interpolated output line of `size` bytes in Dest.
   * L1 and L3 are the lines above and below the output line, L2 is the weave
   * line and L2P the previous weave line (names as used by
   * deinterlace_frame_di_greedy). */
  void (*scanline) (GstDeinterlaceMethodGreedyL * self, uint8_t * L2,
      uint8_t * L1, uint8_t * L3, uint8_t * L2P, uint8_t * Dest, int size);
} GstDeinterlaceMethodGreedyLClass;

// This is a simple lightweight DeInterlace method that uses little CPU time
// but gives very good results for low or intermediate motion.
// It defers frames by one field, but that does not seem to produce noticeable
// lip sync problems.
//
// The method used is to take either the older or newer weave pixel depending
// upon which gives the smaller comb factor, and then clip to avoid large damage
// when wrong.
//
// I'd intended this to be part of a larger more elaborate method added to
// Blended Clip but this gives too good results for the CPU to ignore here.

/*
 * Portable C implementation of the greedy scanline filter.
 *
 * For every byte position the function:
 *   1. averages the bytes above (t1) and below (b1),
 *   2. picks whichever of the two weave candidates (m0 or m2) is closer to
 *      that average; m0 wins on a tie,
 *   3. clamps the pick to [min(t1,b1) - max_comb, max(t1,b1) + max_comb],
 *      with the bounds themselves limited to 0..255.
 *
 * self:   supplies max_comb
 * m0:     weave line (L2)
 * t1:     line above (L1)
 * b1:     line below (L3)
 * m2:     previous weave line (LP2)
 * output: destination line
 * width:  number of bytes to process
 */
static inline void
deinterlace_greedy_packed422_scanline_c (GstDeinterlaceMethodGreedyL * self,
    uint8_t * m0, uint8_t * t1,
    uint8_t * b1, uint8_t * m2, uint8_t * output, int width)
{
  const guint comb_limit = self->max_comb;

  for (; width != 0; --width) {
    const int above = *t1++;
    const int below = *b1++;
    const int weave = *m0++;
    const int weave_prev = *m2++;
    const int mean = (above + below) / 2;
    int upper = MAX (above, below);
    int lower = MIN (above, below);
    int pick = weave;

    // Prefer the previous weave pixel only when it combs strictly less.
    if (ABS (weave - mean) > ABS (weave_prev - mean))
      pick = weave_prev;

    // Widen the allowed range by max_comb, saturating at 255 and 0.
    if (upper < 256 - comb_limit)
      upper += comb_limit;
    else
      upper = 255;

    if (lower > comb_limit)
      lower -= comb_limit;
    else
      lower = 0;

    *output++ = CLAMP (pick, lower, upper);
  }
}

#ifdef BUILD_X86_ASM
#include "mmx.h"
/*
 * MMX implementation of the greedy scanline filter.
 *
 * Handles 8 bytes per loop iteration. Once fewer than 8 bytes remain, emms()
 * is issued and the leftover bytes are passed to
 * deinterlace_greedy_packed422_scanline_c().
 *
 * Two details differ from the C version, as visible in the code below:
 *   - the average of L1 and L3 is formed as (L1 >> 1) + (L3 >> 1);
 *   - when both comb values are equal, LP2 is selected instead of L2.
 *
 * self:   supplies max_comb
 * m0:     weave line (L2)
 * t1:     line above (L1)
 * b1:     line below (L3)
 * m2:     previous weave line (LP2)
 * output: destination line
 * width:  number of bytes to process
 */
static void
deinterlace_greedy_packed422_scanline_mmx (GstDeinterlaceMethodGreedyL * self,
    uint8_t * m0, uint8_t * t1,
    uint8_t * b1, uint8_t * m2, uint8_t * output, int width)
{
  mmx_t MaxComb;
  mmx_t ShiftMask;

  // How badly do we let it weave? 0-255
  // max_comb is replicated into all 8 byte lanes
  // (ub is presumably the unsigned-byte view of mmx_t; see mmx.h).
  MaxComb.ub[0] = self->max_comb;
  MaxComb.ub[1] = self->max_comb;
  MaxComb.ub[2] = self->max_comb;
  MaxComb.ub[3] = self->max_comb;
  MaxComb.ub[4] = self->max_comb;
  MaxComb.ub[5] = self->max_comb;
  MaxComb.ub[6] = self->max_comb;
  MaxComb.ub[7] = self->max_comb;

  // 0x7f in every byte: clears the bit that a 16-bit right shift by one
  // carries from the high byte into the top of the low byte.
  ShiftMask.ub[0] = 0x7f;
  ShiftMask.ub[1] = 0x7f;
  ShiftMask.ub[2] = 0x7f;
  ShiftMask.ub[3] = 0x7f;
  ShiftMask.ub[4] = 0x7f;
  ShiftMask.ub[5] = 0x7f;
  ShiftMask.ub[6] = 0x7f;
  ShiftMask.ub[7] = 0x7f;

  // L2 == m0
  // L1 == t1
  // L3 == b1
  // LP2 == m2

  // mm6 holds MaxComb for the whole loop
  movq_m2r (MaxComb, mm6);

  for (; width > 7; width -= 8) {
    movq_m2r (*t1, mm1);        // L1
    movq_m2r (*m0, mm2);        // L2
    movq_m2r (*b1, mm3);        // L3
    movq_m2r (*m2, mm0);        // LP2

    // average L1 and L3 leave result in mm4
    movq_r2r (mm1, mm4);        // L1
    movq_r2r (mm3, mm5);        // L3
    psrlw_i2r (1, mm4);         // L1/2 (word shift, fixed up by the mask)
    pand_m2r (ShiftMask, mm4);
    psrlw_i2r (1, mm5);         // L3/2 (word shift, fixed up by the mask)
    pand_m2r (ShiftMask, mm5);
    paddusb_r2r (mm5, mm4);     // L1/2 + L3/2

    // get abs value of possible L2 comb
    movq_r2r (mm2, mm7);        // L2
    psubusb_r2r (mm4, mm7);     // L2 - avg, saturated at 0
    movq_r2r (mm4, mm5);        // avg
    psubusb_r2r (mm2, mm5);     // avg - L2, saturated at 0
    por_r2r (mm7, mm5);         // abs(avg-L2)

    // get abs value of possible LP2 comb
    movq_r2r (mm0, mm7);        // LP2
    psubusb_r2r (mm4, mm7);     // LP2 - avg, saturated at 0
    psubusb_r2r (mm0, mm4);     // avg - LP2, saturated at 0
    por_r2r (mm7, mm4);         // abs(avg-LP2)

    // use L2 or LP2 depending upon which makes smaller comb
    psubusb_r2r (mm5, mm4);     // Comb(LP2) - Comb(L2): zero if LP2 is no worse
    psubusb_r2r (mm5, mm5);     // 0
    pcmpeqb_r2r (mm5, mm4);     // if (mm4=0) then FF else 0
    pcmpeqb_r2r (mm4, mm5);     // opposite of mm4

    // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5=ff
    pand_r2r (mm2, mm5);        // use L2 if mm5 == ff, else 0
    pand_r2r (mm0, mm4);        // use LP2 if mm4 = ff, else 0
    por_r2r (mm5, mm4);         // may the best win

    // Now lets clip our chosen value to be not outside of the range
    // spanned by L1 and L3 by more than MaxComb.
    // This allows some comb but limits the damages and also allows more
    // detail than a boring oversmoothed clip.

    movq_r2r (mm1, mm2);        // copy L1
    psubusb_r2r (mm3, mm2);     // - L3, with saturation
    paddusb_r2r (mm3, mm2);     // now = Max(L1,L3)

    pcmpeqb_r2r (mm7, mm7);     // all ffffffff
    psubusb_r2r (mm1, mm7);     // ff - L1
    paddusb_r2r (mm7, mm3);     // add, may sat at fff..
    psubusb_r2r (mm7, mm3);     // now = Min(L1,L3)

    // allow the value to be above the high or below the low by amt of MaxComb
    paddusb_r2r (mm6, mm2);     // raise max by MaxComb (saturates at ff)
    psubusb_r2r (mm6, mm3);     // lower min by MaxComb (saturates at 0)

    psubusb_r2r (mm3, mm4);     // best - lowered Min, saturated at 0
    paddusb_r2r (mm3, mm4);     // now = Max(best, lowered Min)

    pcmpeqb_r2r (mm7, mm7);     // all ffffffff
    psubusb_r2r (mm4, mm7);     // ff - Max(best, lowered Min)
    paddusb_r2r (mm7, mm2);     // add may sat at FFF..
    psubusb_r2r (mm7, mm2);     // now = Min(Max(best, lowered Min), raised Max) = best clipped

    movq_r2m (mm2, *output);    // move in our clipped best

    // Advance to the next set of pixels.
    output += 8;
    m0 += 8;
    t1 += 8;
    b1 += 8;
    m2 += 8;
  }
  // Leave MMX state before running the scalar tail.
  emms ();
  if (width > 0)
    deinterlace_greedy_packed422_scanline_c (self, m0, t1, b1, m2, output,
        width);
}

#include "sse.h"

/*
 * MMXEXT implementation of the greedy scanline filter.
 *
 * Same structure as the plain MMX variant, but uses pavgb, pmaxub and pminub
 * instead of the saturating add/subtract sequences. Handles 8 bytes per loop
 * iteration; once fewer than 8 bytes remain, emms() is issued and the
 * leftover bytes are passed to deinterlace_greedy_packed422_scanline_c().
 *
 * Two details differ from the C version:
 *   - the average of L1 and L3 comes from pavgb (a rounded byte average per
 *     the instruction definition), whereas the C code uses (L1 + L3) / 2;
 *   - when both comb values are equal, LP2 is selected instead of L2.
 *
 * self:   supplies max_comb
 * m0:     weave line (L2)
 * t1:     line above (L1)
 * b1:     line below (L3)
 * m2:     previous weave line (LP2)
 * output: destination line
 * width:  number of bytes to process
 */
static void
deinterlace_greedy_packed422_scanline_mmxext (GstDeinterlaceMethodGreedyL *
    self, uint8_t * m0, uint8_t * t1, uint8_t * b1, uint8_t * m2,
    uint8_t * output, int width)
{
  mmx_t MaxComb;

  // How badly do we let it weave? 0-255
  // max_comb is replicated into all 8 byte lanes
  // (ub is presumably the unsigned-byte view of mmx_t; see mmx.h).
  MaxComb.ub[0] = self->max_comb;
  MaxComb.ub[1] = self->max_comb;
  MaxComb.ub[2] = self->max_comb;
  MaxComb.ub[3] = self->max_comb;
  MaxComb.ub[4] = self->max_comb;
  MaxComb.ub[5] = self->max_comb;
  MaxComb.ub[6] = self->max_comb;
  MaxComb.ub[7] = self->max_comb;

  // L2 == m0
  // L1 == t1
  // L3 == b1
  // LP2 == m2

  // mm6 holds MaxComb for the whole loop
  movq_m2r (MaxComb, mm6);

  for (; width > 7; width -= 8) {
    movq_m2r (*t1, mm1);        // L1
    movq_m2r (*m0, mm2);        // L2
    movq_m2r (*b1, mm3);        // L3
    movq_m2r (*m2, mm0);        // LP2

    // average L1 and L3 leave result in mm4
    movq_r2r (mm1, mm4);        // L1
    pavgb_r2r (mm3, mm4);       // byte average of L1 and L3

    // get abs value of possible L2 comb
    movq_r2r (mm2, mm7);        // L2
    psubusb_r2r (mm4, mm7);     // L2 - avg, saturated at 0
    movq_r2r (mm4, mm5);        // avg
    psubusb_r2r (mm2, mm5);     // avg - L2, saturated at 0
    por_r2r (mm7, mm5);         // abs(avg-L2)

    // get abs value of possible LP2 comb
    movq_r2r (mm0, mm7);        // LP2
    psubusb_r2r (mm4, mm7);     // LP2 - avg, saturated at 0
    psubusb_r2r (mm0, mm4);     // avg - LP2, saturated at 0
    por_r2r (mm7, mm4);         // abs(avg-LP2)

    // use L2 or LP2 depending upon which makes smaller comb
    psubusb_r2r (mm5, mm4);     // Comb(LP2) - Comb(L2): zero if LP2 is no worse
    pxor_r2r (mm5, mm5);        // 0
    pcmpeqb_r2r (mm5, mm4);     // if (mm4=0) then FF else 0
    pcmpeqb_r2r (mm4, mm5);     // opposite of mm4

    // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5=ff
    pand_r2r (mm2, mm5);        // use L2 if mm5 == ff, else 0
    pand_r2r (mm0, mm4);        // use LP2 if mm4 = ff, else 0
    por_r2r (mm5, mm4);         // may the best win

    // Now lets clip our chosen value to be not outside of the range
    // spanned by L1 and L3 by more than MaxComb.
    // This allows some comb but limits the damages and also allows more
    // detail than a boring oversmoothed clip.

    movq_r2r (mm1, mm2);        // copy L1
    pmaxub_r2r (mm3, mm2);      // now = Max(L1,L3)

    pminub_r2r (mm1, mm3);      // now = Min(L1,L3)

    // allow the value to be above the high or below the low by amt of MaxComb
    paddusb_r2r (mm6, mm2);     // raise max by MaxComb (saturates at ff)
    psubusb_r2r (mm6, mm3);     // lower min by MaxComb (saturates at 0)


    pmaxub_r2r (mm3, mm4);      // now = Max(best, lowered Min)
    pminub_r2r (mm4, mm2);      // now = Min(Max(best, lowered Min), raised Max) = best clipped

    movq_r2m (mm2, *output);    // move in our clipped best

    // Advance to the next set of pixels.
    output += 8;
    m0 += 8;
    t1 += 8;
    b1 += 8;
    m2 += 8;
  }
  // Leave MMX state before running the scalar tail.
  emms ();

  if (width > 0)
    deinterlace_greedy_packed422_scanline_c (self, m0, t1, b1, m2, output,
        width);
}

#endif

/*
 * Assembles one output frame in object->out_buf from the field history.
 *
 * Field usage (indices into object->field_history):
 *   history_count - 2 : supplies the lines that are copied unchanged
 *   history_count - 1 : weave line candidate (L2)
 *   history_count - 3 : previous weave line candidate (L2P)
 *
 * If the field at history_count - 1 is flagged PICTURE_INTERLACED_BOTTOM the
 * frame starts with one copied line and ends with a copy of the last weave
 * line. Otherwise the frame starts with the first line of field_history[0]
 * followed by one copied line, and both weave candidates start one field
 * stride further in.
 *
 * In between, (field_height - 1) pairs of lines are emitted: one line
 * computed by the class' scanline function followed by one copied line.
 */
static void
deinterlace_frame_di_greedy (GstDeinterlaceMethod * d_method,
    GstDeinterlace2 * object)
{
  GstDeinterlaceMethodGreedyL *self =
      GST_DEINTERLACE_METHOD_GREEDY_L (d_method);
  GstDeinterlaceMethodGreedyLClass *klass =
      GST_DEINTERLACE_METHOD_GREEDY_L_GET_CLASS (self);
  const unsigned int stride = object->field_stride;
  const int bottom_field =
      (object->field_history[object->history_count - 1].flags ==
      PICTURE_INTERLACED_BOTTOM);
  unsigned char *out = GST_BUFFER_DATA (object->out_buf);
  // line above the one being interpolated
  unsigned char *above =
      GST_BUFFER_DATA (object->field_history[object->history_count - 2].buf);
  // line below the one being interpolated
  unsigned char *below = above + stride;
  // weave line and its predecessor
  unsigned char *weave =
      GST_BUFFER_DATA (object->field_history[object->history_count - 1].buf);
  unsigned char *weave_prev =
      GST_BUFFER_DATA (object->field_history[object->history_count - 3].buf);
  int row;

  if (!bottom_field) {
    // Weave candidates start one line further in for this parity ...
    weave += stride;
    weave_prev += stride;

    // ... and the very first output line comes from field_history[0].
    oil_memcpy (out, GST_BUFFER_DATA (object->field_history[0].buf),
        object->line_length);
    out += object->output_stride;
  }

  // Both parities copy the first "above" line unchanged.
  oil_memcpy (out, above, object->line_length);
  out += object->output_stride;

  for (row = 0; row < (object->field_height - 1); ++row) {
    // interpolated line
    klass->scanline (self, weave, above, below, weave_prev, out,
        object->line_length);
    out += object->output_stride;

    // copied line
    oil_memcpy (out, below, object->line_length);
    out += object->output_stride;

    above += stride;
    weave += stride;
    below += stride;
    weave_prev += stride;
  }

  // Bottom-field frames finish with the last weave line as-is.
  if (bottom_field)
    oil_memcpy (out, weave, object->line_length);
}


/* Registers GstDeinterlaceMethodGreedyL as a subtype of
 * GST_TYPE_DEINTERLACE_METHOD, wiring up the _class_init and _init functions
 * defined below. */
G_DEFINE_TYPE (GstDeinterlaceMethodGreedyL, gst_deinterlace_method_greedy_l,
    GST_TYPE_DEINTERLACE_METHOD);

enum
{
  ARG_0,
  ARG_MAX_COMB
};

/*
 * GObject set_property handler.
 *
 * Stores the uint carried by `value` in max_comb when prop_id is
 * ARG_MAX_COMB; any other id is reported through
 * G_OBJECT_WARN_INVALID_PROPERTY_ID and leaves the object untouched.
 */
static void
gst_deinterlace_method_greedy_l_set_property (GObject * object, guint prop_id,
    const GValue * value, GParamSpec * pspec)
{
  GstDeinterlaceMethodGreedyL *self = GST_DEINTERLACE_METHOD_GREEDY_L (object);

  if (prop_id != ARG_MAX_COMB) {
    G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
    return;
  }

  self->max_comb = g_value_get_uint (value);
}

/*
 * GObject get_property handler.
 *
 * Writes max_comb into `value` as a uint when prop_id is ARG_MAX_COMB; any
 * other id is reported through G_OBJECT_WARN_INVALID_PROPERTY_ID and `value`
 * is left untouched.
 */
static void
gst_deinterlace_method_greedy_l_get_property (GObject * object, guint prop_id,
    GValue * value, GParamSpec * pspec)
{
  GstDeinterlaceMethodGreedyL *self = GST_DEINTERLACE_METHOD_GREEDY_L (object);

  if (prop_id != ARG_MAX_COMB) {
    G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
    return;
  }

  g_value_set_uint (value, self->max_comb);
}

/*
 * Class initializer.
 *
 * Installs the property accessors and the "max-comb" property, fills in the
 * GstDeinterlaceMethodClass fields (fields_required, deinterlace_frame, name,
 * nick, latency) and selects the scanline implementation: the C version by
 * default, replaced by the MMXEXT or MMX version when BUILD_X86_ASM is
 * defined and liboil reports the corresponding CPU flag.
 */
static void
gst_deinterlace_method_greedy_l_class_init (GstDeinterlaceMethodGreedyLClass *
    klass)
{
  GObjectClass *object_klass = (GObjectClass *) klass;
  GstDeinterlaceMethodClass *method_klass = (GstDeinterlaceMethodClass *) klass;

  object_klass->set_property = gst_deinterlace_method_greedy_l_set_property;
  object_klass->get_property = gst_deinterlace_method_greedy_l_get_property;

  g_object_class_install_property (object_klass, ARG_MAX_COMB,
      g_param_spec_uint ("max-comb", "Max comb", "Max Comb",
          0, 255, 15, G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS));

  method_klass->fields_required = 4;
  method_klass->deinterlace_frame = deinterlace_frame_di_greedy;
  method_klass->name = "Motion Adaptive: Simple Detection";
  method_klass->nick = "greedyl";
  method_klass->latency = 1;

  // Portable fallback; possibly overridden below.
  klass->scanline = deinterlace_greedy_packed422_scanline_c;

#ifdef BUILD_X86_ASM
  {
    guint cpu_flags = oil_cpu_get_flags ();

    // MMXEXT is checked first and wins when both flags are set.
    if (cpu_flags & OIL_IMPL_FLAG_MMXEXT)
      klass->scanline = deinterlace_greedy_packed422_scanline_mmxext;
    else if (cpu_flags & OIL_IMPL_FLAG_MMX)
      klass->scanline = deinterlace_greedy_packed422_scanline_mmx;
  }
#endif
}

/* Instance initializer: sets max_comb to 15, the same value the "max-comb"
 * property spec declares as its default. */
static void
gst_deinterlace_method_greedy_l_init (GstDeinterlaceMethodGreedyL * self)
{
  self->max_comb = 15;
}