/* GStreamer
 * Copyright (C) <1999> Erik Walthinsen <omega@cse.ogi.edu>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <math.h>
#include <stdlib.h>

#include "yuv2rgb.h"

#undef HAVE_LIBMMX

#ifdef HAVE_LIBMMX
#include <mmx.h>
#endif

/*
 * Chroma contribution tables, indexed by an 8-bit chroma sample.
 * gst_colorspace_table_init() fills them; the gst_colorspace_yuv_to_rgb*
 * converters add the looked-up terms to the luma sample:
 *   R = Y + V_r_tab[V]
 *   G = Y + U_g_tab[U] + V_g_tab[V]
 *   B = Y + U_b_tab[U]
 */
static int V_r_tab[256];
static int V_g_tab[256];
static int U_g_tab[256];
static int U_b_tab[256];

/*
 * NOTE(review): CR_BASE and LUM_BASE expand to CB_RANGE and CR_RANGE, which
 * are not defined in the visible part of this file, and LUM_BASE is not
 * referenced by any visible code.  These look like leftovers; confirm before
 * relying on them.
 */
#define CB_BASE 1
#define CR_BASE (CB_BASE*CB_RANGE)
#define LUM_BASE (CR_BASE*CR_RANGE)

/* Two-argument minimum / maximum.  Each argument may be evaluated twice. */
#define Min(x,y) (((x) < (y)) ? (x) : (y))
#define Max(x,y) (((x) > (y)) ? (x) : (y))

/*
 * Gamma and chroma correction helpers.  They read the identifiers
 * gammaCorrect and chromaCorrect, neither of which is defined in the visible
 * part of this file; the only visible mentions of these macros are inside
 * comments in the disabled part of gst_colorspace_table_init().
 *
 * The 256 variants take a value centred on 128, the 128 variants a value
 * centred on 0; the D variants keep the result in floating point instead of
 * converting the scaled term to int.
 */
#define GAMMA_CORRECTION(x) ((int)(pow((x) / 255.0, 1.0 / gammaCorrect) * 255.0))
#define CHROMA_CORRECTION256(x) ((x) >= 128 \
	                        ? 128 + Min(127, (int)(((x) - 128.0) * chromaCorrect)) \
	                        : 128 - Min(128, (int)((128.0 - (x)) * chromaCorrect)))
#define CHROMA_CORRECTION128(x) ((x) >= 0 \
	                        ? Min(127,  (int)(((x) * chromaCorrect))) \
	                        : Max(-128, (int)(((x) * chromaCorrect))))
#define CHROMA_CORRECTION256D(x) ((x) >= 128 \
	                        ? 128.0 + Min(127.0, (((x) - 128.0) * chromaCorrect)) \
	                        : 128.0 - Min(128.0, (((128.0 - (x)) * chromaCorrect))))
#define CHROMA_CORRECTION128D(x) ((x) >= 0 \
	                        ? Min(127.0,  ((x) * chromaCorrect)) \
	                        : Max(-128.0, ((x) * chromaCorrect)))


/*
 * Frame-level conversion entry points.
 *
 * The plain C converters take the output buffer first and the input frame
 * second, matching their definitions further down in this file.  The MMX
 * variants (only compiled when HAVE_LIBMMX is defined) take the input frame
 * first and the output buffer second.
 */
void gst_colorspace_I420_to_rgb16 (GstColorspace * space, unsigned char *dest,
    unsigned char *src);
void gst_colorspace_I420_to_rgb24 (GstColorspace * space, unsigned char *dest,
    unsigned char *src);
void gst_colorspace_I420_to_rgb32 (GstColorspace * space, unsigned char *dest,
    unsigned char *src);
#ifdef HAVE_LIBMMX
void gst_colorspace_I420_to_bgr16_mmx (GstColorspace * space,
    unsigned char *src, unsigned char *dest);
void gst_colorspace_I420_to_bgr32_mmx (GstColorspace * space,
    unsigned char *src, unsigned char *dest);
#endif

void gst_colorspace_YV12_to_rgb16 (GstColorspace * space, unsigned char *dest,
    unsigned char *src);
void gst_colorspace_YV12_to_rgb24 (GstColorspace * space, unsigned char *dest,
    unsigned char *src);
void gst_colorspace_YV12_to_rgb32 (GstColorspace * space, unsigned char *dest,
    unsigned char *src);
#ifdef HAVE_LIBMMX
void gst_colorspace_YV12_to_bgr16_mmx (GstColorspace * space,
    unsigned char *src, unsigned char *dest);
void gst_colorspace_YV12_to_bgr32_mmx (GstColorspace * space,
    unsigned char *src, unsigned char *dest);
#endif

/*
 * Plane-level converters shared by the I420 and YV12 entry points.
 * Parameter names match the definitions later in this file: dest is the
 * output pixel buffer, Y/U/V are the three source planes, and width/height
 * are the picture dimensions in pixels.
 */
static void
gst_colorspace_yuv_to_rgb16 (GstColorspace * space,
    unsigned char *dest,
    unsigned char *Y,
    unsigned char *U, unsigned char *V, int width, int height);
static void
gst_colorspace_yuv_to_rgb24 (GstColorspace * space,
    unsigned char *dest,
    unsigned char *Y,
    unsigned char *U, unsigned char *V, int width, int height);
static void
gst_colorspace_yuv_to_rgb32 (GstColorspace * space,
    unsigned char *dest,
    unsigned char *Y,
    unsigned char *U, unsigned char *V, int width, int height);
/*
 * Disabled legacy declarations.  They use a GstColorspaceYUVTables-based
 * signature that differs from the converters actually defined in this file
 * (those take a GstColorspace and put the output buffer first).  Nothing in
 * this region is compiled.
 */
#if 0
static void gst_colorspace_yuv_to_rgb16 (GstColorspaceYUVTables * tables,
    unsigned char *lum,
    unsigned char *cr,
    unsigned char *cb, unsigned char *out, int cols, int rows);
static void gst_colorspace_yuv_to_rgb24 (GstColorspaceYUVTables * tables,
    unsigned char *lum,
    unsigned char *cr,
    unsigned char *cb, unsigned char *out, int cols, int rows);
static void gst_colorspace_yuv_to_rgb32 (GstColorspaceYUVTables * tables,
    unsigned char *lum,
    unsigned char *cr,
    unsigned char *cb, unsigned char *out, int cols, int rows);
#ifdef HAVE_LIBMMX
void gst_colorspace_yuv_to_bgr32_mmx (GstColorspaceYUVTables * tables,
    unsigned char *lum,
    unsigned char *cr,
    unsigned char *cb, unsigned char *out, int cols, int rows);
extern void gst_colorspace_yuv_to_bgr16_mmx (GstColorspaceYUVTables * tables,
    unsigned char *lum,
    unsigned char *cr,
    unsigned char *cb, unsigned char *out, int cols, int rows);
#endif
#endif

/*
 * Round x up to the next multiple of 2, 4 or 8 by adding (multiple - 1) and
 * clearing the low bits.  Used below to compute padded plane and row sizes.
 */
#define ROUND_UP_2(x)  (((x)+1)&~1)
#define ROUND_UP_4(x)  (((x)+3)&~3)
#define ROUND_UP_8(x)  (((x)+7)&~7)


/*
 * Convert one I420 frame to 32-bit output pixels.
 *
 * The frame at src is read as a luma plane of
 * ROUND_UP_4 (width) * ROUND_UP_2 (height) bytes, followed by the U plane;
 * the V plane starts ROUND_UP_8 (width) / 2 * ROUND_UP_2 (height) / 2 bytes
 * after the U plane.
 *
 * space: supplies the picture width and height
 * dest:  output pixel buffer
 * src:   start of the input frame
 */
void
gst_colorspace_I420_to_rgb32 (GstColorspace * space, unsigned char *dest,
    unsigned char *src)
{
  unsigned char *const plane_u =
      src + ROUND_UP_4 (space->width) * ROUND_UP_2 (space->height);
  unsigned char *const plane_v =
      plane_u + ROUND_UP_8 (space->width) / 2 * ROUND_UP_2 (space->height) / 2;

  gst_colorspace_yuv_to_rgb32 (space, dest, src, plane_u, plane_v,
      space->width, space->height);
}

/*
 * Convert one I420 frame to 24-bit output pixels.
 *
 * The frame at src is read as a luma plane of
 * ROUND_UP_4 (width) * ROUND_UP_2 (height) bytes, followed by the U plane;
 * the V plane starts ROUND_UP_8 (width) / 2 * ROUND_UP_2 (height) / 2 bytes
 * after the U plane.
 *
 * space: supplies the picture width and height
 * dest:  output pixel buffer
 * src:   start of the input frame
 */
void
gst_colorspace_I420_to_rgb24 (GstColorspace * space, unsigned char *dest,
    unsigned char *src)
{
  unsigned char *const plane_u =
      src + ROUND_UP_4 (space->width) * ROUND_UP_2 (space->height);
  unsigned char *const plane_v =
      plane_u + ROUND_UP_8 (space->width) / 2 * ROUND_UP_2 (space->height) / 2;

  gst_colorspace_yuv_to_rgb24 (space, dest, src, plane_u, plane_v,
      space->width, space->height);
}

/*
 * Convert one I420 frame to 16-bit output pixels.
 *
 * The frame at src is read as a luma plane of
 * ROUND_UP_4 (width) * ROUND_UP_2 (height) bytes, followed by the U plane;
 * the V plane starts ROUND_UP_8 (width) / 2 * ROUND_UP_2 (height) / 2 bytes
 * after the U plane.
 *
 * space: supplies the picture width and height
 * dest:  output pixel buffer
 * src:   start of the input frame
 */
void
gst_colorspace_I420_to_rgb16 (GstColorspace * space, unsigned char *dest,
    unsigned char *src)
{
  unsigned char *const plane_u =
      src + ROUND_UP_4 (space->width) * ROUND_UP_2 (space->height);
  unsigned char *const plane_v =
      plane_u + ROUND_UP_8 (space->width) / 2 * ROUND_UP_2 (space->height) / 2;

  gst_colorspace_yuv_to_rgb16 (space, dest, src, plane_u, plane_v,
      space->width, space->height);
}

/*
 * Convert one YV12 frame to 32-bit output pixels.
 *
 * Same plane sizes as the I420 variant, but the chroma planes are taken in
 * the opposite order: the V plane directly follows the luma plane and the U
 * plane starts ROUND_UP_8 (width) / 2 * ROUND_UP_2 (height) / 2 bytes after
 * the V plane.
 *
 * space: supplies the picture width and height
 * dest:  output pixel buffer
 * src:   start of the input frame
 */
void
gst_colorspace_YV12_to_rgb32 (GstColorspace * space, unsigned char *dest,
    unsigned char *src)
{
  unsigned char *const plane_v =
      src + ROUND_UP_4 (space->width) * ROUND_UP_2 (space->height);
  unsigned char *const plane_u =
      plane_v + ROUND_UP_8 (space->width) / 2 * ROUND_UP_2 (space->height) / 2;

  gst_colorspace_yuv_to_rgb32 (space, dest, src, plane_u, plane_v,
      space->width, space->height);
}

/*
 * Convert one YV12 frame to 24-bit output pixels.
 *
 * Same plane sizes as the I420 variant, but the chroma planes are taken in
 * the opposite order: the V plane directly follows the luma plane and the U
 * plane starts ROUND_UP_8 (width) / 2 * ROUND_UP_2 (height) / 2 bytes after
 * the V plane.
 *
 * space: supplies the picture width and height
 * dest:  output pixel buffer
 * src:   start of the input frame
 */
void
gst_colorspace_YV12_to_rgb24 (GstColorspace * space, unsigned char *dest,
    unsigned char *src)
{
  unsigned char *const plane_v =
      src + ROUND_UP_4 (space->width) * ROUND_UP_2 (space->height);
  unsigned char *const plane_u =
      plane_v + ROUND_UP_8 (space->width) / 2 * ROUND_UP_2 (space->height) / 2;

  gst_colorspace_yuv_to_rgb24 (space, dest, src, plane_u, plane_v,
      space->width, space->height);
}

/*
 * Convert one YV12 frame to 16-bit output pixels.
 *
 * Same plane sizes as the I420 variant, but the chroma planes are taken in
 * the opposite order: the V plane directly follows the luma plane and the U
 * plane starts ROUND_UP_8 (width) / 2 * ROUND_UP_2 (height) / 2 bytes after
 * the V plane.
 *
 * space: supplies the picture width and height
 * dest:  output pixel buffer
 * src:   start of the input frame
 */
void
gst_colorspace_YV12_to_rgb16 (GstColorspace * space, unsigned char *dest,
    unsigned char *src)
{
  unsigned char *const plane_v =
      src + ROUND_UP_4 (space->width) * ROUND_UP_2 (space->height);
  unsigned char *const plane_u =
      plane_v + ROUND_UP_8 (space->width) / 2 * ROUND_UP_2 (space->height) / 2;

  gst_colorspace_yuv_to_rgb16 (space, dest, src, plane_u, plane_v,
      space->width, space->height);
}

#ifdef HAVE_LIBMMX
/*
 * MMX path: convert one I420 frame to 32-bit pixels.
 *
 * Only compiled when HAVE_LIBMMX is defined.  Unlike the plain C wrappers,
 * the planes are located from the unpadded width * height: the first chroma
 * plane starts right after width * height luma bytes and the second one a
 * quarter of that size later.
 *
 * space: supplies the picture width and height
 * src:   start of the input frame
 * dest:  output pixel buffer
 */
void
gst_colorspace_I420_to_bgr32_mmx (GstColorspace * space, unsigned char *src,
    unsigned char *dest)
{
  int size;

  /* Log this function's own name (the message used to say "rgb32"). */
  GST_DEBUG ("gst_colorspace_I420_to_bgr32_mmx");

  size = space->width * space->height;

  /* The converter takes (rows, cols) as its last two arguments. */
  gst_colorspace_yuv_to_bgr32_mmx (NULL, src,   /* luma plane */
      src + size,               /* first chroma plane */
      src + size + (size >> 2), /* second chroma plane */
      dest, space->height, space->width);

}

/*
 * MMX path: convert one I420 frame to 16-bit pixels.
 *
 * Only compiled when HAVE_LIBMMX is defined.  The planes are located from
 * the unpadded width * height: the first chroma plane starts right after
 * the luma bytes and the second one a quarter of that size later.
 *
 * space: supplies the picture width and height
 * src:   start of the input frame
 * dest:  output pixel buffer
 */
void
gst_colorspace_I420_to_bgr16_mmx (GstColorspace * space, unsigned char *src,
    unsigned char *dest)
{
  int luma_bytes;
  unsigned char *chroma_first;
  unsigned char *chroma_second;

  GST_DEBUG ("gst_colorspace_I420_to_bgr16_mmx ");

  luma_bytes = space->width * space->height;
  chroma_first = src + luma_bytes;
  chroma_second = chroma_first + (luma_bytes >> 2);

  /* Last two arguments are passed as height, then width. */
  gst_colorspace_yuv_to_bgr16_mmx (NULL, src, chroma_first, chroma_second,
      dest, space->height, space->width);
  GST_DEBUG ("gst_colorspace_I420_to_bgr16_mmx done");
}

/*
 * MMX path: convert one YV12 frame to 32-bit pixels.
 *
 * Only compiled when HAVE_LIBMMX is defined.  Same as the I420 MMX wrapper
 * except that the two chroma plane pointers are handed to the converter in
 * the opposite order.  Planes are located from the unpadded width * height.
 *
 * space: supplies the picture width and height
 * src:   start of the input frame
 * dest:  output pixel buffer
 */
void
gst_colorspace_YV12_to_bgr32_mmx (GstColorspace * space, unsigned char *src,
    unsigned char *dest)
{
  int size;

  /* Log this function's own name (the message used to say "rgb32"). */
  GST_DEBUG ("gst_colorspace_YV12_to_bgr32_mmx");

  size = space->width * space->height;

  /* The converter takes (rows, cols) as its last two arguments. */
  gst_colorspace_yuv_to_bgr32_mmx (NULL, src,   /* luma plane */
      src + size + (size >> 2), /* second chroma plane in the buffer */
      src + size,               /* first chroma plane in the buffer */
      dest, space->height, space->width);

}

/*
 * MMX path: convert one YV12 frame to 16-bit pixels.
 *
 * Only compiled when HAVE_LIBMMX is defined.  Same as the I420 MMX wrapper
 * except that the two chroma plane pointers are handed to the converter in
 * the opposite order.  Planes are located from the unpadded width * height.
 *
 * space: supplies the picture width and height
 * src:   start of the input frame
 * dest:  output pixel buffer
 */
void
gst_colorspace_YV12_to_bgr16_mmx (GstColorspace * space, unsigned char *src,
    unsigned char *dest)
{
  int luma_bytes;
  unsigned char *chroma_first;
  unsigned char *chroma_second;

  GST_DEBUG ("gst_colorspace_YV12_to_bgr16_mmx ");

  luma_bytes = space->width * space->height;
  chroma_first = src + luma_bytes;
  chroma_second = chroma_first + (luma_bytes >> 2);

  /* Second plane in the buffer goes first; then height, then width. */
  gst_colorspace_yuv_to_bgr16_mmx (NULL, src, chroma_second, chroma_first,
      dest, space->height, space->width);
  GST_DEBUG ("gst_colorspace_YV12_to_bgr16_mmx done");
}
#endif
/*
 * Count the 1 bits in a.
 *
 * Returns 0 for a == 0, otherwise the number of set bits.  Runs one loop
 * iteration per bit position up to the highest set bit.
 */
static int
number_of_bits_set (unsigned long a)
{
  int count = 0;

  while (a != 0) {
    count += (int) (a & 1UL);
    a >>= 1;
  }
  return count;
}

/*
 * Count the 0 bits at the most significant end of a.
 *
 * Returns the full width of unsigned long (assuming 8-bit chars) when a is
 * 0, and 0 when the top bit is set.  The top bit is tested with a mask
 * rather than by casting to a signed type.
 */
static int
free_bits_at_top (unsigned long a)
{
  /* assume char is 8 bits */
  const int width = (int) (sizeof (unsigned long) * 8);
  const unsigned long top_bit = 1UL << (width - 1);
  int count = 0;

  if (a == 0)
    return width;

  /* a is non-zero, so shifting left must eventually set the top bit. */
  while ((a & top_bit) == 0) {
    a <<= 1;
    count++;
  }
  return count;
}

/*
 * Count the 0 bits at the least significant end of a.
 *
 * Returns the full width of unsigned long (assuming 8-bit chars) when a is
 * 0, and 0 when the lowest bit is set.
 */
static int
free_bits_at_bottom (unsigned long a)
{
  int count = 0;

  /* assume char is 8 bits */
  if (a == 0)
    return (int) (sizeof (unsigned long) * 8);

  /* a is non-zero, so shifting right must eventually expose a set bit. */
  while ((a & 1UL) == 0) {
    a >>= 1;
    count++;
  }
  return count;
}

/*
 *--------------------------------------------------------------
 *
 * gst_colorspace_table_init --
 *
 *	To get rid of the multiply and other conversions in the
 *	per-pixel colour conversion, we use lookup tables.
 *
 * Results:
 *	None.
 *
 * Side effects:
 *	The file-level tables V_r_tab, V_g_tab, U_g_tab and U_b_tab
 *	are filled.  The space argument is not used by the enabled
 *	code.
 *
 *--------------------------------------------------------------
 */

void
gst_colorspace_table_init (GstColorspace * space)
{
  int i;

  /*
   * For every possible 8-bit chroma sample, store the term it contributes
   * to each colour channel.  The sample is re-centred around 0 (i - 128),
   * scaled by the channel coefficient, and converted to int on assignment.
   */
  for (i = 0; i < 256; i++) {
    V_r_tab[i] = (0.419 / 0.299) * (i - 128);
    V_g_tab[i] = -(0.299 / 0.419) * (i - 128);
    U_g_tab[i] = -(0.114 / 0.331) * (i - 128);
    U_b_tab[i] = (0.587 / 0.331) * (i - 128);
  }
  /*
   * Everything below is disabled.  It is an older implementation that
   * allocated per-instance tables on space and built mask-based
   * rgb-to-pixel tables; it is the only visible user of
   * number_of_bits_set() and free_bits_at_bottom().
   */
#if 0
  int CR, CB, i;
  int *L_tab, *Cr_r_tab, *Cr_g_tab, *Cb_g_tab, *Cb_b_tab;
  long *r_2_pix_alloc;
  long *g_2_pix_alloc;
  long *b_2_pix_alloc;
  long depth = 32;
  long red_mask = 0xff0000;
  long green_mask = 0x00ff00;
  long blue_mask = 0x0000ff;

  L_tab = space->L_tab = (int *) malloc (256 * sizeof (int));
  Cr_r_tab = space->Cr_r_tab = (int *) malloc (256 * sizeof (int));
  Cr_g_tab = space->Cr_g_tab = (int *) malloc (256 * sizeof (int));
  Cb_g_tab = space->Cb_g_tab = (int *) malloc (256 * sizeof (int));
  Cb_b_tab = space->Cb_b_tab = (int *) malloc (256 * sizeof (int));

  r_2_pix_alloc = (long *) malloc (768 * sizeof (long));
  g_2_pix_alloc = (long *) malloc (768 * sizeof (long));
  b_2_pix_alloc = (long *) malloc (768 * sizeof (long));

  if (L_tab == NULL ||
      Cr_r_tab == NULL ||
      Cr_g_tab == NULL ||
      Cb_g_tab == NULL ||
      Cb_b_tab == NULL ||
      r_2_pix_alloc == NULL || g_2_pix_alloc == NULL || b_2_pix_alloc == NULL) {
    fprintf (stderr, "Could not get enough memory in InitColorDither\n");
    exit (1);
  }

  for (i = 0; i < 256; i++) {
    L_tab[i] = i;
    /*
       if (gammaCorrectFlag) {
       L_tab[i] = GAMMA_CORRECTION(i);
       }
     */

    CB = CR = i;
    /*
       if (chromaCorrectFlag) {
       CB -= 128;
       CB = CHROMA_CORRECTION128(CB);
       CR -= 128;
       CR = CHROMA_CORRECTION128(CR);
       }
       else
     */
    {
      CB -= 128;
      CR -= 128;
    }
    Cr_r_tab[i] = (0.419 / 0.299) * CR;
    Cr_g_tab[i] = -(0.299 / 0.419) * CR;
    Cb_g_tab[i] = -(0.114 / 0.331) * CB;
    Cb_b_tab[i] = (0.587 / 0.331) * CB;

  }

  /*
   * Set up entries 0-255 in rgb-to-pixel value tables.
   */
  for (i = 0; i < 256; i++) {
    r_2_pix_alloc[i + 256] = i >> (8 - number_of_bits_set (red_mask));
    r_2_pix_alloc[i + 256] <<= free_bits_at_bottom (red_mask);
    g_2_pix_alloc[i + 256] = i >> (8 - number_of_bits_set (green_mask));
    g_2_pix_alloc[i + 256] <<= free_bits_at_bottom (green_mask);
    b_2_pix_alloc[i + 256] = i >> (8 - number_of_bits_set (blue_mask));
    b_2_pix_alloc[i + 256] <<= free_bits_at_bottom (blue_mask);
    /*
     * If we have 16-bit output depth, then we double the value
     * in the top word. This means that we can write out both
     * pixels in the pixel doubling mode with one op. It is
     * harmless in the normal case as storing a 32-bit value
     * through a short pointer will lose the top bits anyway.
     * A similar optimisation for Alpha for 64 bit has been
     * prepared for, but is not yet implemented.
     */
    if (!(depth == 32) && !(depth == 24)) {

      r_2_pix_alloc[i + 256] |= (r_2_pix_alloc[i + 256]) << 16;
      g_2_pix_alloc[i + 256] |= (g_2_pix_alloc[i + 256]) << 16;
      b_2_pix_alloc[i + 256] |= (b_2_pix_alloc[i + 256]) << 16;

    }
#ifdef SIXTYFOUR_BIT
    if (depth == 32) {

      r_2_pix_alloc[i + 256] |= (r_2_pix_alloc[i + 256]) << 32;
      g_2_pix_alloc[i + 256] |= (g_2_pix_alloc[i + 256]) << 32;
      b_2_pix_alloc[i + 256] |= (b_2_pix_alloc[i + 256]) << 32;

    }
#endif
  }

  /*
   * Spread out the values we have to the rest of the array so that
   * we do not need to check for overflow.
   */
  for (i = 0; i < 256; i++) {
    r_2_pix_alloc[i] = r_2_pix_alloc[256];
    r_2_pix_alloc[i + 512] = r_2_pix_alloc[511];
    g_2_pix_alloc[i] = g_2_pix_alloc[256];
    g_2_pix_alloc[i + 512] = g_2_pix_alloc[511];
    b_2_pix_alloc[i] = b_2_pix_alloc[256];
    b_2_pix_alloc[i + 512] = b_2_pix_alloc[511];
  }

  space->r_2_pix = r_2_pix_alloc + 256;
  space->g_2_pix = g_2_pix_alloc + 256;
  space->b_2_pix = b_2_pix_alloc + 256;
#endif
}

/*
 * Convert planar YUV with 2x2 subsampled chroma to packed 32-bit pixels.
 *
 * Every output pixel is four bytes: a zero byte followed by R, G and B.
 * Each channel is the luma sample plus the chroma terms looked up in the
 * tables filled by gst_colorspace_table_init(), clamped to 0..255.
 *
 * space:  supplies the picture width used for the source row strides
 * dest:   output buffer, width * 4 bytes per row
 * Y:      luma plane, ROUND_UP_4 (space->width) bytes per row
 * U, V:   chroma planes, one sample per 2x2 block of pixels
 * width:  number of pixels to convert per row
 * height: number of rows to convert
 */
static void
gst_colorspace_yuv_to_rgb32 (GstColorspace * space,
    unsigned char *dest,
    unsigned char *Y, unsigned char *U, unsigned char *V, int width, int height)
{
  int x, y;
  int luma_rowstride;
  int chroma_rowstride;
  int dest_rowstride;

  luma_rowstride = ROUND_UP_4 (space->width);
  /* A chroma row holds (width + 1) / 2 samples padded to a multiple of 4
   * bytes.  This equals ROUND_UP_8 (width) / 2, the row size the I420/YV12
   * wrappers use when locating the second chroma plane; stepping by half
   * the luma stride drifts off the chroma rows when the two differ. */
  chroma_rowstride = ROUND_UP_4 ((space->width + 1) / 2);
  dest_rowstride = width * 4;

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++) {
      /* All four bytes of a pixel live at x * 4; indexing the colour bytes
       * with x * 3 made neighbouring pixels overwrite each other. */
      unsigned char *pixel = dest + x * 4;

      pixel[0] = 0;
      pixel[1] = CLAMP (Y[x] + V_r_tab[V[x / 2]], 0, 255);
      pixel[2] = CLAMP (Y[x] + U_g_tab[U[x / 2]] + V_g_tab[V[x / 2]], 0, 255);
      pixel[3] = CLAMP (Y[x] + U_b_tab[U[x / 2]], 0, 255);
    }
    Y += luma_rowstride;
    dest += dest_rowstride;
    /* One chroma row serves two picture rows: advance after every odd row. */
    if (y & 1) {
      U += chroma_rowstride;
      V += chroma_rowstride;
    }
  }
}

/*
 * Convert planar YUV with 2x2 subsampled chroma to packed 24-bit pixels.
 *
 * Every output pixel is three bytes: R, G, B.  Each channel is the luma
 * sample plus the chroma terms looked up in the tables filled by
 * gst_colorspace_table_init(), clamped to 0..255.
 *
 * space:  supplies the picture width used for the source row strides
 * dest:   output buffer, ROUND_UP_4 (width * 3) bytes per row
 * Y:      luma plane, ROUND_UP_4 (space->width) bytes per row
 * U, V:   chroma planes, one sample per 2x2 block of pixels
 * width:  number of pixels to convert per row
 * height: number of rows to convert
 */
static void
gst_colorspace_yuv_to_rgb24 (GstColorspace * space,
    unsigned char *dest,
    unsigned char *Y, unsigned char *U, unsigned char *V, int width, int height)
{
  int x, y;
  int luma_rowstride;
  int chroma_rowstride;
  int dest_rowstride;

  luma_rowstride = ROUND_UP_4 (space->width);
  /* A chroma row holds (width + 1) / 2 samples padded to a multiple of 4
   * bytes.  This equals ROUND_UP_8 (width) / 2, the row size the I420/YV12
   * wrappers use when locating the second chroma plane; stepping by half
   * the luma stride drifts off the chroma rows when the two differ. */
  chroma_rowstride = ROUND_UP_4 ((space->width + 1) / 2);
  dest_rowstride = ROUND_UP_4 (width * 3);

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++) {
      unsigned char *pixel = dest + x * 3;

      pixel[0] = CLAMP (Y[x] + V_r_tab[V[x / 2]], 0, 255);
      pixel[1] = CLAMP (Y[x] + U_g_tab[U[x / 2]] + V_g_tab[V[x / 2]], 0, 255);
      pixel[2] = CLAMP (Y[x] + U_b_tab[U[x / 2]], 0, 255);
    }
    Y += luma_rowstride;
    dest += dest_rowstride;
    /* One chroma row serves two picture rows: advance after every odd row. */
    if (y & 1) {
      U += chroma_rowstride;
      V += chroma_rowstride;
    }
  }
}

/*
 * Convert planar YUV with 2x2 subsampled chroma to packed 16-bit pixels.
 *
 * Every output pixel is one unsigned short stored in host byte order with
 * the top 5 bits of R in bits 15..11, the top 6 bits of G in bits 10..5 and
 * the top 5 bits of B in bits 4..0.  Each channel is the luma sample plus
 * the chroma terms looked up in the tables filled by
 * gst_colorspace_table_init(), clamped to 0..255 before packing.
 *
 * space:  supplies the picture width used for the source row strides
 * dest:   output buffer, ROUND_UP_4 (width * 2) bytes per row
 * Y:      luma plane, ROUND_UP_4 (space->width) bytes per row
 * U, V:   chroma planes, one sample per 2x2 block of pixels
 * width:  number of pixels to convert per row
 * height: number of rows to convert
 */
static void
gst_colorspace_yuv_to_rgb16 (GstColorspace * space,
    unsigned char *dest,
    unsigned char *Y, unsigned char *U, unsigned char *V, int width, int height)
{
  int x, y;
  int luma_rowstride;
  int chroma_rowstride;
  int dest_rowstride;
  int r, g, b;

  luma_rowstride = ROUND_UP_4 (space->width);
  /* A chroma row holds (width + 1) / 2 samples padded to a multiple of 4
   * bytes.  This equals ROUND_UP_8 (width) / 2, the row size the I420/YV12
   * wrappers use when locating the second chroma plane; stepping by half
   * the luma stride drifts off the chroma rows when the two differ. */
  chroma_rowstride = ROUND_UP_4 ((space->width + 1) / 2);
  dest_rowstride = ROUND_UP_4 (width * 2);

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++) {
      r = CLAMP (Y[x] + V_r_tab[V[x / 2]], 0, 255);
      g = CLAMP (Y[x] + U_g_tab[U[x / 2]] + V_g_tab[V[x / 2]], 0, 255);
      b = CLAMP (Y[x] + U_b_tab[U[x / 2]], 0, 255);
      *(unsigned short *) (dest + x * 2) =
          ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3);
    }
    Y += luma_rowstride;
    dest += dest_rowstride;
    /* One chroma row serves two picture rows: advance after every odd row. */
    if (y & 1) {
      U += chroma_rowstride;
      V += chroma_rowstride;
    }
  }
}

#ifdef HAVE_LIBMMX
/*
 * 64-bit constants for the MMX converter below, each holding the same value
 * in four 16-bit lanes.
 */

/* 128 in every lane; subtracted from the unpacked chroma samples. */
static mmx_t MMX_80w = (mmx_t) (long long) 0x0080008000800080LL;        /*dd    00080 0080h, 000800080h */

/* Byte masks selecting the low / high byte of every 16-bit lane. */
static mmx_t MMX_00FFw = (mmx_t) (long long) 0x00ff00ff00ff00ffLL;      /*dd    000FF 00FFh, 000FF00FFh */
static mmx_t MMX_FF00w = (mmx_t) (long long) 0xff00ff00ff00ff00LL;      /*dd    0FF00 FF00h, 0FF00FF00h */

/*
 * Per-lane chroma multipliers.  The converter shifts each product right by
 * 6 bits, so the lane values act as coefficients scaled by 64: 0x0059 = 89,
 * 0x0072 = 114, 0xffea = -22 and 0xffd2 = -46 when read as signed 16-bit
 * values.
 */
static mmx_t MMX32_Vredcoeff = (mmx_t) (long long) 0x0059005900590059LL;
static mmx_t MMX32_Ubluecoeff = (mmx_t) (long long) 0x0072007200720072LL;
static mmx_t MMX32_Ugrncoeff = (mmx_t) (long long) 0xffeaffeaffeaffeaLL;
static mmx_t MMX32_Vgrncoeff = (mmx_t) (long long) 0xffd2ffd2ffd2ffd2LL;

/*
 * MMX converter from planar YUV to 32-bit pixels.
 *
 * Only compiled when HAVE_LIBMMX is defined; this file #undefs that symbol
 * near its top, so the function is currently disabled.
 *
 * Each inner iteration handles a 4x2 block of pixels: it reads 4 luma bytes
 * from the current row and from the row cols bytes further on, combines them
 * with chroma read through cb and cr, and stores 4 pixels at row1 and 4 at
 * row1 + cols.  After each pass over a row pair, lum and row1 skip the
 * second row that was already processed.
 *
 * tables: not referenced in the body
 * lum:    luma plane
 * cr, cb: chroma planes
 * out:    output buffer, accessed as 32-bit pixels
 * rows:   picture height in pixels (sixth argument)
 * cols:   picture width in pixels (seventh argument)
 *
 * NOTE(review): the data loaded through `cb` is labelled v* / "Cr" in the
 * comments below and the data loaded through `cr` is labelled u* / "Cb", so
 * the parameter names look swapped relative to their use -- confirm against
 * the callers.
 * NOTE(review): the old-style declaration list below names cols before rows,
 * but the positional order is the one in the parameter list (rows, cols).
 */
static void
gst_colorspace_yuv_to_bgr32_mmx (tables, lum, cr, cb, out, rows, cols)
     GstColorspaceYUVTables *tables;
     unsigned char *lum;
     unsigned char *cr;
     unsigned char *cb;
     unsigned char *out;
     int cols, rows;

{
  guint32 *row1 = (guint32 *) out;      /* 32 bit target */
  int cols4 = cols >> 2;                /* 4 pixels are handled per inner iteration */

  int y, x;

  /* two picture rows are handled per outer iteration */
  for (y = rows >> 1; y; y--) {
    for (x = cols4; x; x--) {

      /* create Cr (result in mm1) */
      movd_m2r (*(mmx_t *) cb, mm1);    /*         0  0  0  0  v3 v2 v1 v0 */
      pxor_r2r (mm7, mm7);      /*         00 00 00 00 00 00 00 00 */
      movd_m2r (*(mmx_t *) lum, mm2);   /*          0  0  0  0 l3 l2 l1 l0 */
      punpcklbw_r2r (mm7, mm1); /*         0  v3 0  v2 00 v1 00 v0 */
      punpckldq_r2r (mm1, mm1); /*         00 v1 00 v0 00 v1 00 v0 */
      psubw_m2r (MMX_80w, mm1); /* mm1-128:r1 r1 r0 r0 r1 r1 r0 r0  */

      /* create Cr_g (result in mm0) */
      movq_r2r (mm1, mm0);      /* r1 r1 r0 r0 r1 r1 r0 r0 */
      pmullw_m2r (MMX32_Vgrncoeff, mm0);        /* red*-46dec=0.7136*64 */
      pmullw_m2r (MMX32_Vredcoeff, mm1);        /* red*89dec=1.4013*64 */
      psraw_i2r (6, mm0);       /* red=red/64 */
      psraw_i2r (6, mm1);       /* red=red/64 */

      /* create L1 L2 (result in mm2,mm4) */
      /* L2=lum+cols */
      movq_m2r (*(mmx_t *) (lum + cols), mm3);  /*    0  0  0  0 L3 L2 L1 L0 */
      punpckldq_r2r (mm3, mm2); /*   L3 L2 L1 L0 l3 l2 l1 l0 */
      movq_r2r (mm2, mm4);      /*   L3 L2 L1 L0 l3 l2 l1 l0 */
      pand_m2r (MMX_FF00w, mm2);        /*   L3 0  L1  0 l3  0 l1  0 */
      pand_m2r (MMX_00FFw, mm4);        /*   0  L2  0 L0  0 l2  0 l0 */
      psrlw_i2r (8, mm2);       /*   0  L3  0 L1  0 l3  0 l1 */

      /* create R (result in mm6) */
      movq_r2r (mm2, mm5);      /*   0 L3  0 L1  0 l3  0 l1 */
      movq_r2r (mm4, mm6);      /*   0 L2  0 L0  0 l2  0 l0 */
      paddsw_r2r (mm1, mm5);    /* lum1+red:x R3 x R1 x r3 x r1 */
      paddsw_r2r (mm1, mm6);    /* lum1+red:x R2 x R0 x r2 x r0 */
      packuswb_r2r (mm5, mm5);  /*  R3 R1 r3 r1 R3 R1 r3 r1 */
      packuswb_r2r (mm6, mm6);  /*  R2 R0 r2 r0 R2 R0 r2 r0 */
      pxor_r2r (mm7, mm7);      /*  00 00 00 00 00 00 00 00 */
      punpcklbw_r2r (mm5, mm6); /*  R3 R2 R1 R0 r3 r2 r1 r0 */

      /* create Cb (result in mm1) */
      movd_m2r (*(mmx_t *) cr, mm1);    /*         0  0  0  0  u3 u2 u1 u0 */
      punpcklbw_r2r (mm7, mm1); /*         0  u3 0  u2 00 u1 00 u0 */
      punpckldq_r2r (mm1, mm1); /*         00 u1 00 u0 00 u1 00 u0 */
      psubw_m2r (MMX_80w, mm1); /* mm1-128:u1 u1 u0 u0 u1 u1 u0 u0  */
      /* create Cb_g (result in mm5) */
      movq_r2r (mm1, mm5);      /* u1 u1 u0 u0 u1 u1 u0 u0 */
      pmullw_m2r (MMX32_Ugrncoeff, mm5);        /* blue*-109dec=1.7129*64 */
      pmullw_m2r (MMX32_Ubluecoeff, mm1);       /* blue*114dec=1.78125*64 */
      psraw_i2r (6, mm5);       /* blue=red/64 */
      psraw_i2r (6, mm1);       /* blue=blue/64 */

      /* create G (result in mm7) */
      movq_r2r (mm2, mm3);      /*   0  L3  0 L1  0 l3  0 l1 */
      movq_r2r (mm4, mm7);      /*   0  L2  0 L0  0 l2  0 l1 */
      paddsw_r2r (mm5, mm3);    /* lum1+Cb_g:x G3t x G1t x g3t x g1t */
      paddsw_r2r (mm5, mm7);    /* lum1+Cb_g:x G2t x G0t x g2t x g0t */
      paddsw_r2r (mm0, mm3);    /* lum1+Cr_g:x G3  x G1  x g3  x g1 */
      paddsw_r2r (mm0, mm7);    /* lum1+blue:x G2  x G0  x g2  x g0 */
      packuswb_r2r (mm3, mm3);  /* G3 G1 g3 g1 G3 G1 g3 g1 */
      packuswb_r2r (mm7, mm7);  /* G2 G0 g2 g0 G2 G0 g2 g0 */
      punpcklbw_r2r (mm3, mm7); /* G3 G2 G1 G0 g3 g2 g1 g0 */

      /* create B (result in mm5) */
      movq_r2r (mm2, mm3);      /*   0  L3  0 L1  0 l3  0 l1 */
      movq_r2r (mm4, mm5);      /*   0  L2  0 L0  0 l2  0 l1 */
      paddsw_r2r (mm1, mm3);    /* lum1+blue:x B3 x B1 x b3 x b1 */
      paddsw_r2r (mm1, mm5);    /* lum1+blue:x B2 x B0 x b2 x b0 */
      packuswb_r2r (mm3, mm3);  /* B3 B1 b3 b1 B3 B1 b3 b1 */
      packuswb_r2r (mm5, mm5);  /* B2 B0 b2 b0 B2 B0 b2 b0 */
      punpcklbw_r2r (mm3, mm5); /* B3 B2 B1 B0 b3 b2 b1 b0 */

      /* fill destination row1 (needed are mm6=Rr,mm7=Gg,mm5=Bb) */

      pxor_r2r (mm2, mm2);      /*  0  0  0  0  0  0  0  0 */
      pxor_r2r (mm4, mm4);      /*  0  0  0  0  0  0  0  0 */
      movq_r2r (mm6, mm1);      /* R3 R2 R1 R0 r3 r2 r1 r0 */
      movq_r2r (mm5, mm3);      /* B3 B2 B1 B0 b3 b2 b1 b0 */
      /* process lower lum */
      punpcklbw_r2r (mm4, mm1); /*  0 r3  0 r2  0 r1  0 r0 */
      punpcklbw_r2r (mm4, mm3); /*  0 b3  0 b2  0 b1  0 b0 */
      movq_r2r (mm1, mm2);      /*  0 r3  0 r2  0 r1  0 r0 */
      movq_r2r (mm3, mm0);      /*  0 b3  0 b2  0 b1  0 b0 */
      punpcklwd_r2r (mm1, mm3); /*  0 r1  0 b1  0 r0  0 b0 */
      punpckhwd_r2r (mm2, mm0); /*  0 r3  0 b3  0 r2  0 b2 */

      pxor_r2r (mm2, mm2);      /*  0  0  0  0  0  0  0  0 */
      movq_r2r (mm7, mm1);      /* G3 G2 G1 G0 g3 g2 g1 g0 */
      punpcklbw_r2r (mm1, mm2); /* g3  0 g2  0 g1  0 g0  0 */
      punpcklwd_r2r (mm4, mm2); /*  0  0 g1  0  0  0 g0  0  */
      por_r2r (mm3, mm2);       /*  0 r1 g1 b1  0 r0 g0 b0 */
      movq_r2m (mm2, *(mmx_t *) row1);  /* wrote out ! row1 */

      pxor_r2r (mm2, mm2);      /*  0  0  0  0  0  0  0  0 */
      punpcklbw_r2r (mm1, mm4); /* g3  0 g2  0 g1  0 g0  0 */
      punpckhwd_r2r (mm2, mm4); /*  0  0 g3  0  0  0 g2  0  */
      por_r2r (mm0, mm4);       /*  0 r3 g3 b3  0 r2 g2 b2 */
      movq_r2m (mm4, *(mmx_t *) (row1 + 2));    /* wrote out ! row1 */

      /* fill destination row2 (needed are mm6=Rr,mm7=Gg,mm5=Bb) */
      /* this can be done "destructive" */
      pxor_r2r (mm2, mm2);      /*  0  0  0  0  0  0  0  0 */
      punpckhbw_r2r (mm2, mm6); /*  0 R3  0 R2  0 R1  0 R0 */
      punpckhbw_r2r (mm1, mm5); /* G3 B3 G2 B2 G1 B1 G0 B0 */
      movq_r2r (mm5, mm1);      /* G3 B3 G2 B2 G1 B1 G0 B0 */
      punpcklwd_r2r (mm6, mm1); /*  0 R1 G1 B1  0 R0 G0 B0 */
      movq_r2m (mm1, *(mmx_t *) (row1 + cols)); /* wrote out ! row2 */
      punpckhwd_r2r (mm6, mm5); /*  0 R3 G3 B3  0 R2 G2 B2 */
      movq_r2m (mm5, *(mmx_t *) (row1 + cols + 2));     /* wrote out ! row2 */

      /* advance by the 4 pixels / 2 chroma samples just consumed */
      lum += 4;
      cr += 2;
      cb += 2;
      row1 += 4;
    }
    /* skip the second row of the pair, which was converted above */
    lum += cols;
    row1 += cols;
  }

  emms ();

}
#endif