/* 
    This program is free software; you can redristibute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

   This file is a modified version of RTjpeg 0.1.2, (C) Justin Schoeman 1998

   (991101) Wim Taymans : added MMX dct and idct from intels site.
*/


/*

Main Routines

This file contains most of the initialisation and control functions

(C) Justin Schoeman 1998

*/

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include <sys/types.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef unsigned char __u8;
typedef signed char __s8;
typedef unsigned short __u16;
typedef signed short __s16;
typedef unsigned long __u32;
typedef signed long __s32;
typedef unsigned long long __u64;

/*#define MMX_TRACE */


#ifdef HAVE_LIBMMX
#include "mmx.h"
#endif

static const unsigned char RTjpeg_ZZ[64] = {
  0,
  8, 1,
  2, 9, 16,
  24, 17, 10, 3,
  4, 11, 18, 25, 32,
  40, 33, 26, 19, 12, 5,
  6, 13, 20, 27, 34, 41, 48,
  56, 49, 42, 35, 28, 21, 14, 7,
  15, 22, 29, 36, 43, 50, 57,
  58, 51, 44, 37, 30, 23,
  31, 38, 45, 52, 59,
  60, 53, 46, 39,
  47, 54, 61,
  62, 55,
  63
};

static const __u64 RTjpeg_aan_tab[64] = {
  4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL,
  3374581504ULL, 2324432128ULL, 1184891264ULL,
  5957222912ULL, 8263040512ULL, 7783580160ULL, 7005009920ULL, 5957222912ULL,
  4680582144ULL, 3224107520ULL, 1643641088ULL,
  5611718144ULL, 7783580160ULL, 7331904512ULL, 6598688768ULL, 5611718144ULL,
  4408998912ULL, 3036936960ULL, 1548224000ULL,
  5050464768ULL, 7005009920ULL, 6598688768ULL, 5938608128ULL, 5050464768ULL,
  3968072960ULL, 2733115392ULL, 1393296000ULL,
  4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL,
  3374581504ULL, 2324432128ULL, 1184891264ULL,
  3374581504ULL, 4680582144ULL, 4408998912ULL, 3968072960ULL, 3374581504ULL,
  2651326208ULL, 1826357504ULL, 931136000ULL,
  2324432128ULL, 3224107520ULL, 3036936960ULL, 2733115392ULL, 2324432128ULL,
  1826357504ULL, 1258030336ULL, 641204288ULL,
  1184891264ULL, 1643641088ULL, 1548224000ULL, 1393296000ULL, 1184891264ULL,
  931136000ULL, 641204288ULL, 326894240ULL,
};

#ifndef HAVE_LIBMMX
static __s32 RTjpeg_ws[64 + 31];
#endif
__u8 RTjpeg_alldata[2 * 64 + 4 * 64 + 4 * 64 + 4 * 64 + 4 * 64 + 32];

__s16 *RTjpeg_block;
__s32 *RTjpeg_lqt;
__s32 *RTjpeg_cqt;
__u32 *RTjpeg_liqt;
__u32 *RTjpeg_ciqt;

unsigned char RTjpeg_lb8;
unsigned char RTjpeg_cb8;
int RTjpeg_width, RTjpeg_height;
int RTjpeg_Ywidth, RTjpeg_Cwidth;
int RTjpeg_Ysize, RTjpeg_Csize;

__s16 *RTjpeg_old = NULL;

#ifdef HAVE_LIBMMX
mmx_t RTjpeg_lmask;
mmx_t RTjpeg_cmask;
#else
__u16 RTjpeg_lmask;
__u16 RTjpeg_cmask;
#endif
int RTjpeg_mtest = 0;

static const unsigned char RTjpeg_lum_quant_tbl[64] = {
  16, 11, 10, 16, 24, 40, 51, 61,
  12, 12, 14, 19, 26, 58, 60, 55,
  14, 13, 16, 24, 40, 57, 69, 56,
  14, 17, 22, 29, 51, 87, 80, 62,
  18, 22, 37, 56, 68, 109, 103, 77,
  24, 35, 55, 64, 81, 104, 113, 92,
  49, 64, 78, 87, 103, 121, 120, 101,
  72, 92, 95, 98, 112, 100, 103, 99
};

static const unsigned char RTjpeg_chrom_quant_tbl[64] = {
  17, 18, 24, 47, 99, 99, 99, 99,
  18, 21, 26, 66, 99, 99, 99, 99,
  24, 26, 56, 99, 99, 99, 99, 99,
  47, 66, 99, 99, 99, 99, 99, 99,
  99, 99, 99, 99, 99, 99, 99, 99,
  99, 99, 99, 99, 99, 99, 99, 99,
  99, 99, 99, 99, 99, 99, 99, 99,
  99, 99, 99, 99, 99, 99, 99, 99
};

int
RTjpeg_b2s (__s16 * data, __s8 * strm, __u8 bt8)
{
  register int ci, co = 1, tmp;
  register __s16 ZZvalue;

  strm[0] =
      (__u8) (data[RTjpeg_ZZ[0]] > 254) ? 254 : ((data[RTjpeg_ZZ[0]] <
          0) ? 0 : data[RTjpeg_ZZ[0]]);

  for (ci = 1; ci <= bt8; ci++) {
    ZZvalue = data[RTjpeg_ZZ[ci]];

    if (ZZvalue > 0) {
      strm[co++] = (__s8) (ZZvalue > 127) ? 127 : ZZvalue;
    } else {
      strm[co++] = (__s8) (ZZvalue < -128) ? -128 : ZZvalue;
    }
  }

  for (; ci < 64; ci++) {
    ZZvalue = data[RTjpeg_ZZ[ci]];

    if (ZZvalue > 0) {
      strm[co++] = (__s8) (ZZvalue > 63) ? 63 : ZZvalue;
    } else if (ZZvalue < 0) {
      strm[co++] = (__s8) (ZZvalue < -64) ? -64 : ZZvalue;
    } else {                    /* compress zeros */

      tmp = ci;
      do {
        ci++;
      }
      while ((ci < 64) && (data[RTjpeg_ZZ[ci]] == 0));

      strm[co++] = (__s8) (63 + (ci - tmp));
      ci--;
    }
  }
  return (int) co;
}

int
RTjpeg_s2b (__s16 * data, __s8 * strm, __u8 bt8, __u32 * qtbl)
{
  int ci = 1, co = 1, tmp;
  register int i;

  i = RTjpeg_ZZ[0];
  data[i] = ((__u8) strm[0]) * qtbl[i];

  for (co = 1; co <= bt8; co++) {
    i = RTjpeg_ZZ[co];
    data[i] = strm[ci++] * qtbl[i];
  }

  for (; co < 64; co++) {
    if (strm[ci] > 63) {
      tmp = co + strm[ci] - 63;
      for (; co < tmp; co++)
        data[RTjpeg_ZZ[co]] = 0;
      co--;
    } else {
      i = RTjpeg_ZZ[co];
      data[i] = strm[ci] * qtbl[i];
    }
    ci++;
  }
  return (int) ci;
}

#if defined(HAVE_LIBMMX)
void
RTjpeg_quant_init (void)
{
  int i;
  __s16 *qtbl;

  qtbl = (__s16 *) RTjpeg_lqt;
  for (i = 0; i < 64; i++)
    qtbl[i] = (__s16) RTjpeg_lqt[i];

  qtbl = (__s16 *) RTjpeg_cqt;
  for (i = 0; i < 64; i++)
    qtbl[i] = (__s16) RTjpeg_cqt[i];
}

static mmx_t RTjpeg_ones = (mmx_t) (long long) 0x0001000100010001LL;
static mmx_t RTjpeg_half = (mmx_t) (long long) 0x7fff7fff7fff7fffLL;

void
RTjpeg_quant (__s16 * block, __s32 * qtbl)
{
  int i;
  mmx_t *bl, *ql;

  ql = (mmx_t *) qtbl;
  bl = (mmx_t *) block;

  movq_m2r (RTjpeg_ones, mm6);
  movq_m2r (RTjpeg_half, mm7);

  for (i = 16; i; i--) {
    movq_m2r (*(ql++), mm0);    /* quant vals (4) */
    movq_m2r (*bl, mm2);        /* block vals (4) */
    movq_r2r (mm0, mm1);
    movq_r2r (mm2, mm3);

    punpcklwd_r2r (mm6, mm0);   /*           1 qb 1 qa */
    punpckhwd_r2r (mm6, mm1);   /* 1 qd 1 qc */

    punpcklwd_r2r (mm7, mm2);   /*                   32767 bb 32767 ba */
    punpckhwd_r2r (mm7, mm3);   /* 32767 bd 32767 bc */

    pmaddwd_r2r (mm2, mm0);     /*                         32767+bb*qb 32767+ba*qa */
    pmaddwd_r2r (mm3, mm1);     /* 32767+bd*qd 32767+bc*qc */

    psrad_i2r (16, mm0);
    psrad_i2r (16, mm1);

    packssdw_r2r (mm1, mm0);

    movq_r2m (mm0, *(bl++));

  }
}
#else
void
RTjpeg_quant_init (void)
{
}

void
RTjpeg_quant (__s16 * block, __s32 * qtbl)
{
  int i;

  for (i = 0; i < 64; i++)
    block[i] = (__s16) ((block[i] * qtbl[i] + 32767) >> 16);
}
#endif

/*
 * Perform the forward DCT on one block of samples.
 */
#ifdef HAVE_LIBMMX
static mmx_t RTjpeg_C4 = (mmx_t) (long long) 0x2D412D412D412D41LL;
static mmx_t RTjpeg_C6 = (mmx_t) (long long) 0x187E187E187E187ELL;
static mmx_t RTjpeg_C2mC6 = (mmx_t) (long long) 0x22A322A322A322A3LL;
static mmx_t RTjpeg_C2pC6 = (mmx_t) (long long) 0x539F539F539F539FLL;
static mmx_t RTjpeg_zero = (mmx_t) (long long) 0x0000000000000000LL;

#else

#define FIX_0_382683433  ((__s32)   98) /* FIX(0.382683433) */
#define FIX_0_541196100  ((__s32)  139) /* FIX(0.541196100) */
#define FIX_0_707106781  ((__s32)  181) /* FIX(0.707106781) */
#define FIX_1_306562965  ((__s32)  334) /* FIX(1.306562965) */

#define DESCALE10(x) (__s16)( ((x)+128) >> 8)
#define DESCALE20(x)  (__s16)(((x)+32768) >> 16)
#define D_MULTIPLY(var,const)  ((__s32) ((var) * (const)))
#endif

void
RTjpeg_dct_init (void)
{
  int i;

  for (i = 0; i < 64; i++) {
    RTjpeg_lqt[i] = (((__u64) RTjpeg_lqt[i] << 32) / RTjpeg_aan_tab[i]);
    RTjpeg_cqt[i] = (((__u64) RTjpeg_cqt[i] << 32) / RTjpeg_aan_tab[i]);
  }
}

void
RTjpeg_dctY (__u8 * idata, __s16 * odata, int rskip)
{
#ifndef HAVE_LIBMMX
  __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  __s32 tmp10, tmp11, tmp12, tmp13;
  __s32 z1, z2, z3, z4, z5, z11, z13;
  __u8 *idataptr;
  __s16 *odataptr;
  __s32 *wsptr;
  int ctr;

  idataptr = idata;
  wsptr = RTjpeg_ws;
  for (ctr = 7; ctr >= 0; ctr--) {
    tmp0 = idataptr[0] + idataptr[7];
    tmp7 = idataptr[0] - idataptr[7];
    tmp1 = idataptr[1] + idataptr[6];
    tmp6 = idataptr[1] - idataptr[6];
    tmp2 = idataptr[2] + idataptr[5];
    tmp5 = idataptr[2] - idataptr[5];
    tmp3 = idataptr[3] + idataptr[4];
    tmp4 = idataptr[3] - idataptr[4];

    tmp10 = (tmp0 + tmp3);      /* phase 2 */
    tmp13 = tmp0 - tmp3;
    tmp11 = (tmp1 + tmp2);
    tmp12 = tmp1 - tmp2;

    wsptr[0] = (tmp10 + tmp11) << 8;    /* phase 3 */
    wsptr[4] = (tmp10 - tmp11) << 8;

    z1 = D_MULTIPLY (tmp12 + tmp13, FIX_0_707106781);   /* c4 */
    wsptr[2] = (tmp13 << 8) + z1;       /* phase 5 */
    wsptr[6] = (tmp13 << 8) - z1;

    tmp10 = tmp4 + tmp5;        /* phase 2 */
    tmp11 = tmp5 + tmp6;
    tmp12 = tmp6 + tmp7;

    z5 = D_MULTIPLY (tmp10 - tmp12, FIX_0_382683433);   /* c6 */
    z2 = D_MULTIPLY (tmp10, FIX_0_541196100) + z5;      /* c2-c6 */
    z4 = D_MULTIPLY (tmp12, FIX_1_306562965) + z5;      /* c2+c6 */
    z3 = D_MULTIPLY (tmp11, FIX_0_707106781);   /* c4 */

    z11 = (tmp7 << 8) + z3;     /* phase 5 */
    z13 = (tmp7 << 8) - z3;

    wsptr[5] = z13 + z2;        /* phase 6 */
    wsptr[3] = z13 - z2;
    wsptr[1] = z11 + z4;
    wsptr[7] = z11 - z4;

    idataptr += rskip << 3;     /* advance pointer to next row */
    wsptr += 8;
  }

  wsptr = RTjpeg_ws;
  odataptr = odata;
  for (ctr = 7; ctr >= 0; ctr--) {
    tmp0 = wsptr[0] + wsptr[56];
    tmp7 = wsptr[0] - wsptr[56];
    tmp1 = wsptr[8] + wsptr[48];
    tmp6 = wsptr[8] - wsptr[48];
    tmp2 = wsptr[16] + wsptr[40];
    tmp5 = wsptr[16] - wsptr[40];
    tmp3 = wsptr[24] + wsptr[32];
    tmp4 = wsptr[24] - wsptr[32];

    tmp10 = tmp0 + tmp3;        /* phase 2 */
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;

    odataptr[0] = DESCALE10 (tmp10 + tmp11);    /* phase 3 */
    odataptr[32] = DESCALE10 (tmp10 - tmp11);

    z1 = D_MULTIPLY (tmp12 + tmp13, FIX_0_707106781);   /* c4 */
    odataptr[16] = DESCALE20 ((tmp13 << 8) + z1);       /* phase 5 */
    odataptr[48] = DESCALE20 ((tmp13 << 8) - z1);

    tmp10 = tmp4 + tmp5;        /* phase 2 */
    tmp11 = tmp5 + tmp6;
    tmp12 = tmp6 + tmp7;

    z5 = D_MULTIPLY (tmp10 - tmp12, FIX_0_382683433);   /* c6 */
    z2 = D_MULTIPLY (tmp10, FIX_0_541196100) + z5;      /* c2-c6 */
    z4 = D_MULTIPLY (tmp12, FIX_1_306562965) + z5;      /* c2+c6 */
    z3 = D_MULTIPLY (tmp11, FIX_0_707106781);   /* c4 */

    z11 = (tmp7 << 8) + z3;     /* phase 5 */
    z13 = (tmp7 << 8) - z3;

    odataptr[40] = DESCALE20 (z13 + z2);        /* phase 6 */
    odataptr[24] = DESCALE20 (z13 - z2);
    odataptr[8] = DESCALE20 (z11 + z4);
    odataptr[56] = DESCALE20 (z11 - z4);

    odataptr++;                 /* advance pointer to next column */
    wsptr++;
  }
#else
  mmx_t tmp6, tmp7;
  register mmx_t *dataptr = (mmx_t *) odata;
  mmx_t *idata2 = (mmx_t *) idata;

  /* first copy the input 8 bit to the destination 16 bits */

  movq_m2r (RTjpeg_zero, mm2);


  movq_m2r (*idata2, mm0);
  movq_r2r (mm0, mm1);

  punpcklbw_r2r (mm2, mm0);
  movq_r2m (mm0, *(dataptr));

  punpckhbw_r2r (mm2, mm1);
  movq_r2m (mm1, *(dataptr + 1));

  idata2 += rskip;

  movq_m2r (*idata2, mm0);
  movq_r2r (mm0, mm1);

  punpcklbw_r2r (mm2, mm0);
  movq_r2m (mm0, *(dataptr + 2));

  punpckhbw_r2r (mm2, mm1);
  movq_r2m (mm1, *(dataptr + 3));

  idata2 += rskip;

  movq_m2r (*idata2, mm0);
  movq_r2r (mm0, mm1);

  punpcklbw_r2r (mm2, mm0);
  movq_r2m (mm0, *(dataptr + 4));

  punpckhbw_r2r (mm2, mm1);
  movq_r2m (mm1, *(dataptr + 5));

  idata2 += rskip;

  movq_m2r (*idata2, mm0);
  movq_r2r (mm0, mm1);

  punpcklbw_r2r (mm2, mm0);
  movq_r2m (mm0, *(dataptr + 6));

  punpckhbw_r2r (mm2, mm1);
  movq_r2m (mm1, *(dataptr + 7));

  idata2 += rskip;

  movq_m2r (*idata2, mm0);
  movq_r2r (mm0, mm1);

  punpcklbw_r2r (mm2, mm0);
  movq_r2m (mm0, *(dataptr + 8));

  punpckhbw_r2r (mm2, mm1);
  movq_r2m (mm1, *(dataptr + 9));

  idata2 += rskip;

  movq_m2r (*idata2, mm0);
  movq_r2r (mm0, mm1);

  punpcklbw_r2r (mm2, mm0);
  movq_r2m (mm0, *(dataptr + 10));

  punpckhbw_r2r (mm2, mm1);
  movq_r2m (mm1, *(dataptr + 11));

  idata2 += rskip;

  movq_m2r (*idata2, mm0);
  movq_r2r (mm0, mm1);

  punpcklbw_r2r (mm2, mm0);
  movq_r2m (mm0, *(dataptr + 12));

  punpckhbw_r2r (mm2, mm1);
  movq_r2m (mm1, *(dataptr + 13));

  idata2 += rskip;

  movq_m2r (*idata2, mm0);
  movq_r2r (mm0, mm1);

  punpcklbw_r2r (mm2, mm0);
  movq_r2m (mm0, *(dataptr + 14));

  punpckhbw_r2r (mm2, mm1);
  movq_r2m (mm1, *(dataptr + 15));

/*  Start Transpose to do calculations on rows */

  movq_m2r (*(dataptr + 9), mm7);       /* m03:m02|m01:m00 - first line (line 4)and copy into m5 */

  movq_m2r (*(dataptr + 13), mm6);      /* m23:m22|m21:m20 - third line (line 6)and copy into m2 */
  movq_r2r (mm7, mm5);

  punpcklwd_m2r (*(dataptr + 11), mm7); /* m11:m01|m10:m00 - interleave first and second lines */
  movq_r2r (mm6, mm2);

  punpcklwd_m2r (*(dataptr + 15), mm6); /* m31:m21|m30:m20 - interleave third and fourth lines */
  movq_r2r (mm7, mm1);

  movq_m2r (*(dataptr + 11), mm3);      /* m13:m13|m11:m10 - second line   */
  punpckldq_r2r (mm6, mm7);     /* m30:m20|m10:m00 - interleave to produce result 1 */

  movq_m2r (*(dataptr + 15), mm0);      /* m13:m13|m11:m10 - fourth line */
  punpckhdq_r2r (mm6, mm1);     /* m31:m21|m11:m01 - interleave to produce result 2 */

  movq_r2m (mm7, *(dataptr + 9));       /* write result 1 */
  punpckhwd_r2r (mm3, mm5);     /* m13:m03|m12:m02 - interleave first and second lines */

  movq_r2m (mm1, *(dataptr + 11));      /* write result 2 */
  punpckhwd_r2r (mm0, mm2);     /* m33:m23|m32:m22 - interleave third and fourth lines */

  movq_r2r (mm5, mm1);
  punpckldq_r2r (mm2, mm5);     /* m32:m22|m12:m02 - interleave to produce result 3 */

  movq_m2r (*(dataptr + 1), mm0);       /* m03:m02|m01:m00 - first line, 4x4 */
  punpckhdq_r2r (mm2, mm1);     /* m33:m23|m13:m03 - interleave to produce result 4 */

  movq_r2m (mm5, *(dataptr + 13));      /* write result 3 */

  /* last 4x4 done */

  movq_r2m (mm1, *(dataptr + 15));      /* write result 4, last 4x4 */

  movq_m2r (*(dataptr + 5), mm2);       /* m23:m22|m21:m20 - third line */
  movq_r2r (mm0, mm6);

  punpcklwd_m2r (*(dataptr + 3), mm0);  /* m11:m01|m10:m00 - interleave first and second lines */
  movq_r2r (mm2, mm7);

  punpcklwd_m2r (*(dataptr + 7), mm2);  /* m31:m21|m30:m20 - interleave third and fourth lines */
  movq_r2r (mm0, mm4);


  movq_m2r (*(dataptr + 8), mm1);       /* n03:n02|n01:n00 - first line  */
  punpckldq_r2r (mm2, mm0);     /* m30:m20|m10:m00 - interleave to produce first result */

  movq_m2r (*(dataptr + 12), mm3);      /* n23:n22|n21:n20 - third line */
  punpckhdq_r2r (mm2, mm4);     /* m31:m21|m11:m01 - interleave to produce second result */

  punpckhwd_m2r (*(dataptr + 3), mm6);  /* m13:m03|m12:m02 - interleave first and second lines */
  movq_r2r (mm1, mm2);          /* copy first line */

  punpckhwd_m2r (*(dataptr + 7), mm7);  /* m33:m23|m32:m22 - interleave third and fourth lines */
  movq_r2r (mm6, mm5);          /* copy first intermediate result */

  movq_r2m (mm0, *(dataptr + 8));       /* write result 1 */
  punpckhdq_r2r (mm7, mm5);     /* m33:m23|m13:m03 - produce third result */

  punpcklwd_m2r (*(dataptr + 10), mm1); /* n11:n01|n10:n00 - interleave first and second lines */
  movq_r2r (mm3, mm0);          /* copy third line */

  punpckhwd_m2r (*(dataptr + 10), mm2); /* n13:n03|n12:n02 - interleave first and second lines */

  movq_r2m (mm4, *(dataptr + 10));      /* write result 2 out */
  punpckldq_r2r (mm7, mm6);     /* m32:m22|m12:m02 - produce fourth result */

  punpcklwd_m2r (*(dataptr + 14), mm3); /* n31:n21|n30:n20 - interleave third and fourth lines */
  movq_r2r (mm1, mm4);

  movq_r2m (mm6, *(dataptr + 12));      /* write result 3 out */
  punpckldq_r2r (mm3, mm1);     /* n30:n20|n10:n00 - produce first result */

  punpckhwd_m2r (*(dataptr + 14), mm0); /* n33:n23|n32:n22 - interleave third and fourth lines */
  movq_r2r (mm2, mm6);

  movq_r2m (mm5, *(dataptr + 14));      /* write result 4 out */
  punpckhdq_r2r (mm3, mm4);     /* n31:n21|n11:n01- produce second result */

  movq_r2m (mm1, *(dataptr + 1));       /* write result 5 out - (first result for other 4 x 4 block) */
  punpckldq_r2r (mm0, mm2);     /* n32:n22|n12:n02- produce third result */

  movq_r2m (mm4, *(dataptr + 3));       /* write result 6 out */
  punpckhdq_r2r (mm0, mm6);     /* n33:n23|n13:n03 - produce fourth result */

  movq_r2m (mm2, *(dataptr + 5));       /* write result 7 out */

  movq_m2r (*dataptr, mm0);     /* m03:m02|m01:m00 - first line, first 4x4 */

  movq_r2m (mm6, *(dataptr + 7));       /* write result 8 out */


/* Do first 4x4 quadrant, which is used in the beginning of the DCT: */

  movq_m2r (*(dataptr + 4), mm7);       /* m23:m22|m21:m20 - third line */
  movq_r2r (mm0, mm2);

  punpcklwd_m2r (*(dataptr + 2), mm0);  /* m11:m01|m10:m00 - interleave first and second lines */
  movq_r2r (mm7, mm4);

  punpcklwd_m2r (*(dataptr + 6), mm7);  /* m31:m21|m30:m20 - interleave third and fourth lines */
  movq_r2r (mm0, mm1);

  movq_m2r (*(dataptr + 2), mm6);       /* m13:m12|m11:m10 - second line */
  punpckldq_r2r (mm7, mm0);     /* m30:m20|m10:m00 - interleave to produce result 1 */

  movq_m2r (*(dataptr + 6), mm5);       /* m33:m32|m31:m30 - fourth line */
  punpckhdq_r2r (mm7, mm1);     /* m31:m21|m11:m01 - interleave to produce result 2 */

  movq_r2r (mm0, mm7);          /* write result 1 */
  punpckhwd_r2r (mm6, mm2);     /* m13:m03|m12:m02 - interleave first and second lines */

  psubw_m2r (*(dataptr + 14), mm7);     /* tmp07=x0-x7: Stage 1 */
  movq_r2r (mm1, mm6);          /* write result 2 */

  paddw_m2r (*(dataptr + 14), mm0);     /* tmp00=x0+x7: Stage 1 */
  punpckhwd_r2r (mm5, mm4);     /* m33:m23|m32:m22 - interleave third and fourth lines */

  paddw_m2r (*(dataptr + 12), mm1);     /* tmp01=x1+x6: Stage 1 */
  movq_r2r (mm2, mm3);          /* copy first intermediate result */

  psubw_m2r (*(dataptr + 12), mm6);     /* tmp06=x1-x6: Stage 1 */
  punpckldq_r2r (mm4, mm2);     /* m32:m22|m12:m02 - interleave to produce result 3 */

  movq_r2m (mm7, tmp7);
  movq_r2r (mm2, mm5);          /* write result 3 */

  movq_r2m (mm6, tmp6);
  punpckhdq_r2r (mm4, mm3);     /* m33:m23|m13:m03 - interleave to produce result 4 */

  paddw_m2r (*(dataptr + 10), mm2);     /* tmp02=x2+5: Stage 1 */
  movq_r2r (mm3, mm4);          /* write result 4 */

/************************************************************************************************
                                        End of Transpose
************************************************************************************************/


  paddw_m2r (*(dataptr + 8), mm3);      /* tmp03=x3+x4: stage 1 */
  movq_r2r (mm0, mm7);

  psubw_m2r (*(dataptr + 8), mm4);      /* tmp04=x3-x4: stage 1 */
  movq_r2r (mm1, mm6);

  paddw_r2r (mm3, mm0);         /* tmp10 = tmp00 + tmp03: even 2 */
  psubw_r2r (mm3, mm7);         /* tmp13 = tmp00 - tmp03: even 2 */

  psubw_r2r (mm2, mm6);         /* tmp12 = tmp01 - tmp02: even 2 */
  paddw_r2r (mm2, mm1);         /* tmp11 = tmp01 + tmp02: even 2 */

  psubw_m2r (*(dataptr + 10), mm5);     /* tmp05=x2-x5: stage 1 */
  paddw_r2r (mm7, mm6);         /* tmp12 + tmp13 */

  /* stage 3 */

  movq_m2r (tmp6, mm2);
  movq_r2r (mm0, mm3);

  psllw_i2r (2, mm6);           /* m8 * 2^2 */
  paddw_r2r (mm1, mm0);

  pmulhw_m2r (RTjpeg_C4, mm6);  /* z1 */
  psubw_r2r (mm1, mm3);

  movq_r2m (mm0, *dataptr);
  movq_r2r (mm7, mm0);

  /* Odd part */
  movq_r2m (mm3, *(dataptr + 8));
  paddw_r2r (mm5, mm4);         /* tmp10 */

  movq_m2r (tmp7, mm3);
  paddw_r2r (mm6, mm0);         /* tmp32 */

  paddw_r2r (mm2, mm5);         /* tmp11 */
  psubw_r2r (mm6, mm7);         /* tmp33 */

  movq_r2m (mm0, *(dataptr + 4));
  paddw_r2r (mm3, mm2);         /* tmp12 */

  /* stage 4 */

  movq_r2m (mm7, *(dataptr + 12));
  movq_r2r (mm4, mm1);          /* copy of tmp10 */

  psubw_r2r (mm2, mm1);         /* tmp10 - tmp12 */
  psllw_i2r (2, mm4);           /* m8 * 2^2 */

  movq_m2r (RTjpeg_C2mC6, mm0);
  psllw_i2r (2, mm1);

  pmulhw_m2r (RTjpeg_C6, mm1);  /* z5 */
  psllw_i2r (2, mm2);

  pmulhw_r2r (mm0, mm4);        /* z5 */

  /* stage 5 */

  pmulhw_m2r (RTjpeg_C2pC6, mm2);
  psllw_i2r (2, mm5);

  pmulhw_m2r (RTjpeg_C4, mm5);  /* z3 */
  movq_r2r (mm3, mm0);          /* copy tmp7 */

  movq_m2r (*(dataptr + 1), mm7);
  paddw_r2r (mm1, mm4);         /* z2 */

  paddw_r2r (mm1, mm2);         /* z4 */

  paddw_r2r (mm5, mm0);         /* z11 */
  psubw_r2r (mm5, mm3);         /* z13 */

  /* stage 6 */

  movq_r2r (mm3, mm5);          /* copy z13 */
  psubw_r2r (mm4, mm3);         /* y3=z13 - z2 */

  paddw_r2r (mm4, mm5);         /* y5=z13 + z2 */
  movq_r2r (mm0, mm6);          /* copy z11 */

  movq_r2m (mm3, *(dataptr + 6));       /*save y3 */
  psubw_r2r (mm2, mm0);         /* y7=z11 - z4 */

  movq_r2m (mm5, *(dataptr + 10));      /*save y5 */
  paddw_r2r (mm2, mm6);         /* y1=z11 + z4 */

  movq_r2m (mm0, *(dataptr + 14));      /*save y7 */

        /************************************************
         *  End of 1st 4 rows
         ************************************************/

  movq_m2r (*(dataptr + 3), mm1);       /* load x1: stage 1 */
  movq_r2r (mm7, mm0);          /* copy x0 */

  movq_r2m (mm6, *(dataptr + 2));       /*save y1 */

  movq_m2r (*(dataptr + 5), mm2);       /* load x2: stage 1 */
  movq_r2r (mm1, mm6);          /* copy x1 */

  paddw_m2r (*(dataptr + 15), mm0);     /* tmp00 = x0 + x7 */

  movq_m2r (*(dataptr + 7), mm3);       /* load x3 : stage 1 */
  movq_r2r (mm2, mm5);          /* copy x2 */

  psubw_m2r (*(dataptr + 15), mm7);     /* tmp07 = x0 - x7 */
  movq_r2r (mm3, mm4);          /* copy x3 */

  paddw_m2r (*(dataptr + 13), mm1);     /* tmp01 = x1 + x6 */

  movq_r2m (mm7, tmp7);         /* save tmp07 */
  movq_r2r (mm0, mm7);          /* copy tmp00 */

  psubw_m2r (*(dataptr + 13), mm6);     /* tmp06 = x1 - x6 */

  /* stage 2, Even Part */

  paddw_m2r (*(dataptr + 9), mm3);      /* tmp03 = x3 + x4 */

  movq_r2m (mm6, tmp6);         /* save tmp07 */
  movq_r2r (mm1, mm6);          /* copy tmp01 */

  paddw_m2r (*(dataptr + 11), mm2);     /* tmp02 = x2 + x5 */
  paddw_r2r (mm3, mm0);         /* tmp10 = tmp00 + tmp03 */

  psubw_r2r (mm3, mm7);         /* tmp13 = tmp00 - tmp03 */

  psubw_m2r (*(dataptr + 9), mm4);      /* tmp04 = x3 - x4 */
  psubw_r2r (mm2, mm6);         /* tmp12 = tmp01 - tmp02 */

  paddw_r2r (mm2, mm1);         /* tmp11 = tmp01 + tmp02 */

  psubw_m2r (*(dataptr + 11), mm5);     /* tmp05 = x2 - x5 */
  paddw_r2r (mm7, mm6);         /*  tmp12 + tmp13 */

  /* stage 3, Even and stage 4 & 5 even */

  movq_m2r (tmp6, mm2);         /* load tmp6 */
  movq_r2r (mm0, mm3);          /* copy tmp10 */

  psllw_i2r (2, mm6);           /* shift z1 */
  paddw_r2r (mm1, mm0);         /* y0=tmp10 + tmp11 */

  pmulhw_m2r (RTjpeg_C4, mm6);  /* z1 */
  psubw_r2r (mm1, mm3);         /* y4=tmp10 - tmp11 */

  movq_r2m (mm0, *(dataptr + 1));       /*save y0 */
  movq_r2r (mm7, mm0);          /* copy tmp13 */

  /* odd part */

  movq_r2m (mm3, *(dataptr + 9));       /*save y4 */
  paddw_r2r (mm5, mm4);         /* tmp10 = tmp4 + tmp5 */

  movq_m2r (tmp7, mm3);         /* load tmp7 */
  paddw_r2r (mm6, mm0);         /* tmp32 = tmp13 + z1 */

  paddw_r2r (mm2, mm5);         /* tmp11 = tmp5 + tmp6 */
  psubw_r2r (mm6, mm7);         /* tmp33 = tmp13 - z1 */

  movq_r2m (mm0, *(dataptr + 5));       /*save y2 */
  paddw_r2r (mm3, mm2);         /* tmp12 = tmp6 + tmp7 */

  /* stage 4 */

  movq_r2m (mm7, *(dataptr + 13));      /*save y6 */
  movq_r2r (mm4, mm1);          /* copy tmp10 */

  psubw_r2r (mm2, mm1);         /* tmp10 - tmp12 */
  psllw_i2r (2, mm4);           /* shift tmp10 */

  movq_m2r (RTjpeg_C2mC6, mm0); /* load C2mC6 */
  psllw_i2r (2, mm1);           /* shift (tmp10-tmp12) */

  pmulhw_m2r (RTjpeg_C6, mm1);  /* z5 */
  psllw_i2r (2, mm5);           /* prepare for multiply  */

  pmulhw_r2r (mm0, mm4);        /* multiply by converted real */

  /* stage 5 */

  pmulhw_m2r (RTjpeg_C4, mm5);  /* z3 */
  psllw_i2r (2, mm2);           /* prepare for multiply  */

  pmulhw_m2r (RTjpeg_C2pC6, mm2);       /* multiply */
  movq_r2r (mm3, mm0);          /* copy tmp7 */

  movq_m2r (*(dataptr + 9), mm7);       /* m03:m02|m01:m00 - first line (line 4)and copy into mm7 */
  paddw_r2r (mm1, mm4);         /* z2 */

  paddw_r2r (mm5, mm0);         /* z11 */
  psubw_r2r (mm5, mm3);         /* z13 */

  /* stage 6 */

  movq_r2r (mm3, mm5);          /* copy z13 */
  paddw_r2r (mm1, mm2);         /* z4 */

  movq_r2r (mm0, mm6);          /* copy z11 */
  psubw_r2r (mm4, mm5);         /* y3 */

  paddw_r2r (mm2, mm6);         /* y1 */
  paddw_r2r (mm4, mm3);         /* y5 */

  movq_r2m (mm5, *(dataptr + 7));       /*save y3 */

  movq_r2m (mm6, *(dataptr + 3));       /*save y1 */
  psubw_r2r (mm2, mm0);         /* y7 */

/************************************************************************************************
                                        Start of Transpose
************************************************************************************************/

  movq_m2r (*(dataptr + 13), mm6);      /* m23:m22|m21:m20 - third line (line 6)and copy into m2 */
  movq_r2r (mm7, mm5);          /* copy first line */

  punpcklwd_r2r (mm3, mm7);     /* m11:m01|m10:m00 - interleave first and second lines */
  movq_r2r (mm6, mm2);          /* copy third line */

  punpcklwd_r2r (mm0, mm6);     /* m31:m21|m30:m20 - interleave third and fourth lines */
  movq_r2r (mm7, mm1);          /* copy first intermediate result */

  punpckldq_r2r (mm6, mm7);     /* m30:m20|m10:m00 - interleave to produce result 1 */

  punpckhdq_r2r (mm6, mm1);     /* m31:m21|m11:m01 - interleave to produce result 2 */

  movq_r2m (mm7, *(dataptr + 9));       /* write result 1 */
  punpckhwd_r2r (mm3, mm5);     /* m13:m03|m12:m02 - interleave first and second lines */

  movq_r2m (mm1, *(dataptr + 11));      /* write result 2 */
  punpckhwd_r2r (mm0, mm2);     /* m33:m23|m32:m22 - interleave third and fourth lines */

  movq_r2r (mm5, mm1);          /* copy first intermediate result */
  punpckldq_r2r (mm2, mm5);     /* m32:m22|m12:m02 - interleave to produce result 3 */

  movq_m2r (*(dataptr + 1), mm0);       /* m03:m02|m01:m00 - first line, 4x4 */
  punpckhdq_r2r (mm2, mm1);     /* m33:m23|m13:m03 - interleave to produce result 4 */

  movq_r2m (mm5, *(dataptr + 13));      /* write result 3 */

        /****** last 4x4 done */

  movq_r2m (mm1, *(dataptr + 15));      /* write result 4, last 4x4 */

  movq_m2r (*(dataptr + 5), mm2);       /* m23:m22|m21:m20 - third line */
  movq_r2r (mm0, mm6);          /* copy first line */

  punpcklwd_m2r (*(dataptr + 3), mm0);  /* m11:m01|m10:m00 - interleave first and second lines */
  movq_r2r (mm2, mm7);          /* copy third line */

  punpcklwd_m2r (*(dataptr + 7), mm2);  /* m31:m21|m30:m20 - interleave third and fourth lines */
  movq_r2r (mm0, mm4);          /* copy first intermediate result */



  movq_m2r (*(dataptr + 8), mm1);       /* n03:n02|n01:n00 - first line  */
  punpckldq_r2r (mm2, mm0);     /* m30:m20|m10:m00 - interleave to produce first result */

  movq_m2r (*(dataptr + 12), mm3);      /* n23:n22|n21:n20 - third line */
  punpckhdq_r2r (mm2, mm4);     /* m31:m21|m11:m01 - interleave to produce second result */

  punpckhwd_m2r (*(dataptr + 3), mm6);  /* m13:m03|m12:m02 - interleave first and second lines */
  movq_r2r (mm1, mm2);          /* copy first line */

  punpckhwd_m2r (*(dataptr + 7), mm7);  /* m33:m23|m32:m22 - interleave third and fourth lines */
  movq_r2r (mm6, mm5);          /* copy first intermediate result */

  movq_r2m (mm0, *(dataptr + 8));       /* write result 1 */
  punpckhdq_r2r (mm7, mm5);     /* m33:m23|m13:m03 - produce third result */

  punpcklwd_m2r (*(dataptr + 10), mm1); /* n11:n01|n10:n00 - interleave first and second lines */
  movq_r2r (mm3, mm0);          /* copy third line */

  punpckhwd_m2r (*(dataptr + 10), mm2); /* n13:n03|n12:n02 - interleave first and second lines */

  movq_r2m (mm4, *(dataptr + 10));      /* write result 2 out */
  punpckldq_r2r (mm7, mm6);     /* m32:m22|m12:m02 - produce fourth result */

  punpcklwd_m2r (*(dataptr + 14), mm3); /* n33:n23|n32:n22 - interleave third and fourth lines */
  movq_r2r (mm1, mm4);          /* copy second intermediate result */

  movq_r2m (mm6, *(dataptr + 12));      /* write result 3 out */
  punpckldq_r2r (mm3, mm1);     /*  */

  punpckhwd_m2r (*(dataptr + 14), mm0); /* n33:n23|n32:n22 - interleave third and fourth lines */
  movq_r2r (mm2, mm6);          /* copy second intermediate result */

  movq_r2m (mm5, *(dataptr + 14));      /* write result 4 out */
  punpckhdq_r2r (mm3, mm4);     /* n31:n21|n11:n01- produce second result */

  movq_r2m (mm1, *(dataptr + 1));       /* write result 5 out - (first result for other 4 x 4 block) */
  punpckldq_r2r (mm0, mm2);     /* n32:n22|n12:n02- produce third result */

  movq_r2m (mm4, *(dataptr + 3));       /* write result 6 out */
  punpckhdq_r2r (mm0, mm6);     /* n33:n23|n13:n03 - produce fourth result */

  movq_r2m (mm2, *(dataptr + 5));       /* write result 7 out */

  movq_m2r (*dataptr, mm0);     /* m03:m02|m01:m00 - first line, first 4x4 */

  movq_r2m (mm6, *(dataptr + 7));       /* write result 8 out */

/* Do first 4x4 quadrant, which is used in the beginning of the DCT: */

  movq_m2r (*(dataptr + 4), mm7);       /* m23:m22|m21:m20 - third line */
  movq_r2r (mm0, mm2);          /* copy first line */

  punpcklwd_m2r (*(dataptr + 2), mm0);  /* m11:m01|m10:m00 - interleave first and second lines */
  movq_r2r (mm7, mm4);          /* copy third line */

  punpcklwd_m2r (*(dataptr + 6), mm7);  /* m31:m21|m30:m20 - interleave third and fourth lines */
  movq_r2r (mm0, mm1);          /* copy first intermediate result */

  movq_m2r (*(dataptr + 2), mm6);       /* m13:m12|m11:m10 - second line */
  punpckldq_r2r (mm7, mm0);     /* m30:m20|m10:m00 - interleave to produce result 1 */

  movq_m2r (*(dataptr + 6), mm5);       /* m33:m32|m31:m30 - fourth line */
  punpckhdq_r2r (mm7, mm1);     /* m31:m21|m11:m01 - interleave to produce result 2 */

  movq_r2r (mm0, mm7);          /* write result 1 */
  punpckhwd_r2r (mm6, mm2);     /* m13:m03|m12:m02 - interleave first and second lines */

  psubw_m2r (*(dataptr + 14), mm7);     /* tmp07=x0-x7: Stage 1 */
  movq_r2r (mm1, mm6);          /* write result 2 */

  paddw_m2r (*(dataptr + 14), mm0);     /* tmp00=x0+x7: Stage 1 */
  punpckhwd_r2r (mm5, mm4);     /* m33:m23|m32:m22 - interleave third and fourth lines */

  paddw_m2r (*(dataptr + 12), mm1);     /* tmp01=x1+x6: Stage 1 */
  movq_r2r (mm2, mm3);          /* copy first intermediate result */

  psubw_m2r (*(dataptr + 12), mm6);     /* tmp06=x1-x6: Stage 1 */
  punpckldq_r2r (mm4, mm2);     /* m32:m22|m12:m02 - interleave to produce result 3 */

  movq_r2m (mm7, tmp7);         /* save tmp07 */
  movq_r2r (mm2, mm5);          /* write result 3 */

  movq_r2m (mm6, tmp6);         /* save tmp06 */

  punpckhdq_r2r (mm4, mm3);     /* m33:m23|m13:m03 - interleave to produce result 4 */

  paddw_m2r (*(dataptr + 10), mm2);     /* tmp02=x2+x5: stage 1 */
  movq_r2r (mm3, mm4);          /* write result 4 */

/************************************************************************************************
                                        End of Transpose 2
************************************************************************************************/

  paddw_m2r (*(dataptr + 8), mm3);      /* tmp03=x3+x4: stage 1 */
  movq_r2r (mm0, mm7);

  psubw_m2r (*(dataptr + 8), mm4);      /* tmp04=x3-x4: stage 1 */
  movq_r2r (mm1, mm6);

  paddw_r2r (mm3, mm0);         /* tmp10 = tmp00 + tmp03: even 2 */
  psubw_r2r (mm3, mm7);         /* tmp13 = tmp00 - tmp03: even 2 */

  psubw_r2r (mm2, mm6);         /* tmp12 = tmp01 - tmp02: even 2 */
  paddw_r2r (mm2, mm1);         /* tmp11 = tmp01 + tmp02: even 2 */

  psubw_m2r (*(dataptr + 10), mm5);     /* tmp05=x2-x5: stage 1 */
  paddw_r2r (mm7, mm6);         /* tmp12 + tmp13 */

  /* stage 3 */

  movq_m2r (tmp6, mm2);
  movq_r2r (mm0, mm3);

  psllw_i2r (2, mm6);           /* m8 * 2^2 */
  paddw_r2r (mm1, mm0);

  pmulhw_m2r (RTjpeg_C4, mm6);  /* z1 */
  psubw_r2r (mm1, mm3);

  movq_r2m (mm0, *dataptr);
  movq_r2r (mm7, mm0);

  /* Odd part */
  movq_r2m (mm3, *(dataptr + 8));
  paddw_r2r (mm5, mm4);         /* tmp10 */

  movq_m2r (tmp7, mm3);
  paddw_r2r (mm6, mm0);         /* tmp32 */

  paddw_r2r (mm2, mm5);         /* tmp11 */
  psubw_r2r (mm6, mm7);         /* tmp33 */

  movq_r2m (mm0, *(dataptr + 4));
  paddw_r2r (mm3, mm2);         /* tmp12 */

  /* stage 4 */
  movq_r2m (mm7, *(dataptr + 12));
  movq_r2r (mm4, mm1);          /* copy of tmp10 */

  psubw_r2r (mm2, mm1);         /* tmp10 - tmp12 */
  psllw_i2r (2, mm4);           /* m8 * 2^2 */

  movq_m2r (RTjpeg_C2mC6, mm0);
  psllw_i2r (2, mm1);

  pmulhw_m2r (RTjpeg_C6, mm1);  /* z5 */
  psllw_i2r (2, mm2);

  pmulhw_r2r (mm0, mm4);        /* z5 */

  /* stage 5 */

  pmulhw_m2r (RTjpeg_C2pC6, mm2);
  psllw_i2r (2, mm5);

  pmulhw_m2r (RTjpeg_C4, mm5);  /* z3 */
  movq_r2r (mm3, mm0);          /* copy tmp7 */

  movq_m2r (*(dataptr + 1), mm7);
  paddw_r2r (mm1, mm4);         /* z2 */

  paddw_r2r (mm1, mm2);         /* z4 */

  paddw_r2r (mm5, mm0);         /* z11 */
  psubw_r2r (mm5, mm3);         /* z13 */

  /* stage 6 */

  movq_r2r (mm3, mm5);          /* copy z13 */
  psubw_r2r (mm4, mm3);         /* y3=z13 - z2 */

  paddw_r2r (mm4, mm5);         /* y5=z13 + z2 */
  movq_r2r (mm0, mm6);          /* copy z11 */

  movq_r2m (mm3, *(dataptr + 6));       /*save y3 */
  psubw_r2r (mm2, mm0);         /* y7=z11 - z4 */

  movq_r2m (mm5, *(dataptr + 10));      /*save y5 */
  paddw_r2r (mm2, mm6);         /* y1=z11 + z4 */

  movq_r2m (mm0, *(dataptr + 14));      /*save y7 */

        /************************************************
         *  End of 1st 4 rows
         ************************************************/

  movq_m2r (*(dataptr + 3), mm1);       /* load x1  : stage 1 */
  movq_r2r (mm7, mm0);          /* copy x0 */

  movq_r2m (mm6, *(dataptr + 2));       /*save y1 */

  movq_m2r (*(dataptr + 5), mm2);       /* load x2  : stage 1 */
  movq_r2r (mm1, mm6);          /* copy x1 */

  paddw_m2r (*(dataptr + 15), mm0);     /* tmp00 = x0 + x7 */

  movq_m2r (*(dataptr + 7), mm3);       /* load x3  : stage 1 */
  movq_r2r (mm2, mm5);          /* copy x2 */

  psubw_m2r (*(dataptr + 15), mm7);     /* tmp07 = x0 - x7 */
  movq_r2r (mm3, mm4);          /* copy x3 */

  paddw_m2r (*(dataptr + 13), mm1);     /* tmp01 = x1 + x6 */

  movq_r2m (mm7, tmp7);         /* save tmp07 */
  movq_r2r (mm0, mm7);          /* copy tmp00 */

  psubw_m2r (*(dataptr + 13), mm6);     /* tmp06 = x1 - x6 */

  /* stage 2, Even Part */

  paddw_m2r (*(dataptr + 9), mm3);      /* tmp03 = x3 + x4 */

  movq_r2m (mm6, tmp6);         /* save tmp07 */
  movq_r2r (mm1, mm6);          /* copy tmp01 */

  paddw_m2r (*(dataptr + 11), mm2);     /* tmp02 = x2 + x5 */
  paddw_r2r (mm3, mm0);         /* tmp10 = tmp00 + tmp03 */

  psubw_r2r (mm3, mm7);         /* tmp13 = tmp00 - tmp03 */

  psubw_m2r (*(dataptr + 9), mm4);      /* tmp04 = x3 - x4 */
  psubw_r2r (mm2, mm6);         /* tmp12 = tmp01 - tmp02 */

  paddw_r2r (mm2, mm1);         /* tmp11 = tmp01 + tmp02 */

  psubw_m2r (*(dataptr + 11), mm5);     /* tmp05 = x2 - x5 */
  paddw_r2r (mm7, mm6);         /*  tmp12 + tmp13 */

  /* stage 3, Even and stage 4 & 5 even */

  movq_m2r (tmp6, mm2);         /* load tmp6 */
  movq_r2r (mm0, mm3);          /* copy tmp10 */

  psllw_i2r (2, mm6);           /* shift z1 */
  paddw_r2r (mm1, mm0);         /* y0=tmp10 + tmp11 */

  pmulhw_m2r (RTjpeg_C4, mm6);  /* z1 */
  psubw_r2r (mm1, mm3);         /* y4=tmp10 - tmp11 */

  movq_r2m (mm0, *(dataptr + 1));       /*save y0 */
  movq_r2r (mm7, mm0);          /* copy tmp13 */

  /* odd part */

  movq_r2m (mm3, *(dataptr + 9));       /*save y4 */
  paddw_r2r (mm5, mm4);         /* tmp10 = tmp4 + tmp5 */

  movq_m2r (tmp7, mm3);         /* load tmp7 */
  paddw_r2r (mm6, mm0);         /* tmp32 = tmp13 + z1 */

  paddw_r2r (mm2, mm5);         /* tmp11 = tmp5 + tmp6 */
  psubw_r2r (mm6, mm7);         /* tmp33 = tmp13 - z1 */

  movq_r2m (mm0, *(dataptr + 5));       /*save y2 */
  paddw_r2r (mm3, mm2);         /* tmp12 = tmp6 + tmp7 */

  /* stage 4 */

  movq_r2m (mm7, *(dataptr + 13));      /*save y6 */
  movq_r2r (mm4, mm1);          /* copy tmp10 */

  psubw_r2r (mm2, mm1);         /* tmp10 - tmp12 */
  psllw_i2r (2, mm4);           /* shift tmp10 */

  movq_m2r (RTjpeg_C2mC6, mm0); /* load C2mC6 */
  psllw_i2r (2, mm1);           /* shift (tmp10-tmp12) */

  pmulhw_m2r (RTjpeg_C6, mm1);  /* z5 */
  psllw_i2r (2, mm5);           /* prepare for multiply  */

  pmulhw_r2r (mm0, mm4);        /* multiply by converted real */

  /* stage 5 */

  pmulhw_m2r (RTjpeg_C4, mm5);  /* z3 */
  psllw_i2r (2, mm2);           /* prepare for multiply  */

  pmulhw_m2r (RTjpeg_C2pC6, mm2);       /* multiply */
  movq_r2r (mm3, mm0);          /* copy tmp7 */

  movq_m2r (*(dataptr + 9), mm7);       /* m03:m02|m01:m00 - first line (line 4)and copy into mm7 */
  paddw_r2r (mm1, mm4);         /* z2 */

  paddw_r2r (mm5, mm0);         /* z11 */
  psubw_r2r (mm5, mm3);         /* z13 */

  /* stage 6 */

  movq_r2r (mm3, mm5);          /* copy z13 */
  paddw_r2r (mm1, mm2);         /* z4 */

  movq_r2r (mm0, mm6);          /* copy z11 */
  psubw_r2r (mm4, mm5);         /* y3 */

  paddw_r2r (mm2, mm6);         /* y1 */
  paddw_r2r (mm4, mm3);         /* y5 */

  movq_r2m (mm5, *(dataptr + 7));       /*save y3 */
  psubw_r2r (mm2, mm0);         /* y�=z11 - z4 */

  movq_r2m (mm3, *(dataptr + 11));      /*save y5 */

  movq_r2m (mm6, *(dataptr + 3));       /*save y1 */

  movq_r2m (mm0, *(dataptr + 15));      /*save y7 */


#endif
}

#define FIX_1_082392200  ((__s32)  277) /* FIX(1.082392200) */
#define FIX_1_414213562  ((__s32)  362) /* FIX(1.414213562) */
#define FIX_1_847759065  ((__s32)  473) /* FIX(1.847759065) */
#define FIX_2_613125930  ((__s32)  669) /* FIX(2.613125930) */

#define DESCALE(x) (__s16)( ((x)+4) >> 3)

/* clip yuv to 16..235 (should be 16..240 for cr/cb but ... */

#define RL(x) ((x)>235) ? 235 : (((x)<16) ? 16 : (x))
#define MULTIPLY(var,const)  (((__s32) ((var) * (const)) + 128)>>8)

void
RTjpeg_idct_init (void)
{
  int i;

  for (i = 0; i < 64; i++) {
    RTjpeg_liqt[i] = ((__u64) RTjpeg_liqt[i] * RTjpeg_aan_tab[i]) >> 32;
    RTjpeg_ciqt[i] = ((__u64) RTjpeg_ciqt[i] * RTjpeg_aan_tab[i]) >> 32;
  }
}

void
RTjpeg_idct (__u8 * odata, __s16 * data, int rskip)
{
#ifdef HAVE_LIBMMX

  static mmx_t fix_141 = (mmx_t) (long long) 0x5a825a825a825a82LL;
  static mmx_t fix_184n261 = (mmx_t) (long long) 0xcf04cf04cf04cf04LL;
  static mmx_t fix_184 = (mmx_t) (long long) 0x7641764176417641LL;
  static mmx_t fix_n184 = (mmx_t) (long long) 0x896f896f896f896fLL;
  static mmx_t fix_108n184 = (mmx_t) (long long) 0xcf04cf04cf04cf04LL;

  mmx_t workspace[64];
  mmx_t *wsptr = workspace;
  register mmx_t *dataptr = (mmx_t *) odata;
  mmx_t *idata = (mmx_t *) data;

  rskip = rskip >> 3;
/*
 * Perform inverse DCT on one block of coefficients.
 */

  /* Odd part */

  movq_m2r (*(idata + 10), mm1);        /* load idata[DCTSIZE*5] */

  movq_m2r (*(idata + 6), mm0); /* load idata[DCTSIZE*3] */

  movq_m2r (*(idata + 2), mm3); /* load idata[DCTSIZE*1] */

  movq_r2r (mm1, mm2);          /* copy tmp6    : phase 6 */
  */movq_m2r (*(idata + 14), mm4);      /* load idata[DCTSIZE*7] */

  paddw_r2r (mm0, mm1);         /* z13 = tmp6 + tmp5; */

  psubw_r2r (mm0, mm2);         /* z10 = tmp6 - tmp5    */

  psllw_i2r (2, mm2);           /* shift z10 */
  movq_r2r (mm2, mm0);          /* copy z10 */

  pmulhw_m2r (fix_184n261, mm2);        /* MULTIPLY( z12, FIX_1_847759065); : 2*c2 */
  movq_r2r (mm3, mm5);          /* copy tmp4 */

  pmulhw_m2r (fix_n184, mm0);   /* MULTIPLY(z10, -FIX_1_847759065); : 2*c2 */
  paddw_r2r (mm4, mm3);         /* z11 = tmp4 + tmp7; */

  movq_r2r (mm3, mm6);          /* copy z11                     : phase 5 */
  psubw_r2r (mm4, mm5);         /* z12 = tmp4 - tmp7; */

  psubw_r2r (mm1, mm6);         /* z11-z13 */
  psllw_i2r (2, mm5);           /*      shift z12 */

  movq_m2r (*(idata + 12), mm4);        /* load idata[DCTSIZE*6], even part */
  movq_r2r (mm5, mm7);          /*      copy z12 */

  pmulhw_m2r (fix_108n184, mm5);        /*        MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; 2*(c2-c6): even part */
  paddw_r2r (mm1, mm3);         /* tmp7 = z11 + z13;     */

  /*ok */

  /* Even part */
  pmulhw_m2r (fix_184, mm7);    /* MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; -2*(c2+c6) */
  psllw_i2r (2, mm6);

  movq_m2r (*(idata + 4), mm1); /* load idata[DCTSIZE*2] */

  paddw_r2r (mm5, mm0);         /*      tmp10 */

  paddw_r2r (mm7, mm2);         /* tmp12 */

  pmulhw_m2r (fix_141, mm6);    /* tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); 2*c4 */
  psubw_r2r (mm3, mm2);         /* tmp6 = tmp12 - tmp7 */

  movq_r2r (mm1, mm5);          /* copy tmp1 */
  paddw_r2r (mm4, mm1);         /* tmp13= tmp1 + tmp3; phases 5-3 */

  psubw_r2r (mm4, mm5);         /* tmp1-tmp3 */
  psubw_r2r (mm2, mm6);         /* tmp5 = tmp11 - tmp6; */

  movq_r2m (mm1, *(wsptr));     /* save tmp13 in workspace */
  psllw_i2r (2, mm5);           /* shift tmp1-tmp3 */

  movq_m2r (*(idata), mm7);     /* load idata[DCTSIZE*0] */

  pmulhw_m2r (fix_141, mm5);    /* MULTIPLY(tmp1 - tmp3, FIX_1_414213562) */
  paddw_r2r (mm6, mm0);         /* tmp4 = tmp10 + tmp5; */

  movq_m2r (*(idata + 8), mm4); /* load idata[DCTSIZE*4] */

  psubw_r2r (mm1, mm5);         /* tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; 2*c4 */

  movq_r2m (mm0, *(wsptr + 4)); /* save tmp4 in workspace */
  movq_r2r (mm7, mm1);          /* copy tmp0    : phase 3 */

  movq_r2m (mm5, *(wsptr + 2)); /* save tmp12 in workspace */
  psubw_r2r (mm4, mm1);         /* tmp11 = tmp0 - tmp2;  */

  paddw_r2r (mm4, mm7);         /* tmp10 = tmp0 + tmp2; */
  movq_r2r (mm1, mm5);          /* copy tmp11 */

  paddw_m2r (*(wsptr + 2), mm1);        /* tmp1 = tmp11 + tmp12; */
  movq_r2r (mm7, mm4);          /* copy tmp10           : phase 2 */

  paddw_m2r (*(wsptr), mm7);    /* tmp0 = tmp10 + tmp13;         */

  psubw_m2r (*(wsptr), mm4);    /* tmp3 = tmp10 - tmp13; */
  movq_r2r (mm7, mm0);          /*      copy tmp0 */

  psubw_m2r (*(wsptr + 2), mm5);        /* tmp2 = tmp11 - tmp12; */
  paddw_r2r (mm3, mm7);         /*      wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); */

  psubw_r2r (mm3, mm0);         /* wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); */

  movq_r2m (mm7, *(wsptr));     /*      wsptr[DCTSIZE*0] */
  movq_r2r (mm1, mm3);          /*      copy tmp1 */

  movq_r2m (mm0, *(wsptr + 14));        /* wsptr[DCTSIZE*7] */
  paddw_r2r (mm2, mm1);         /* wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); */

  psubw_r2r (mm2, mm3);         /* wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); */

  movq_r2m (mm1, *(wsptr + 2)); /* wsptr[DCTSIZE*1] */
  movq_r2r (mm4, mm1);          /*      copy tmp3 */

  movq_r2m (mm3, *(wsptr + 12));        /* wsptr[DCTSIZE*6] */

  paddw_m2r (*(wsptr + 4), mm4);        /* wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); */

  psubw_m2r (*(wsptr + 4), mm1);        /* wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); */

  movq_r2m (mm4, *(wsptr + 8));
  movq_r2r (mm5, mm7);          /* copy tmp2 */

  paddw_r2r (mm6, mm5);         /* wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5) */

  movq_r2m (mm1, *(wsptr + 6));
  psubw_r2r (mm6, mm7);         /*      wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); */

  movq_r2m (mm5, *(wsptr + 4));

  movq_r2m (mm7, *(wsptr + 10));

  /*ok */


/*****************************************************************/

  idata++;
  wsptr++;

/*****************************************************************/

  movq_m2r (*(idata + 10), mm1);        /* load idata[DCTSIZE*5] */

  movq_m2r (*(idata + 6), mm0); /* load idata[DCTSIZE*3] */

  movq_m2r (*(idata + 2), mm3); /* load idata[DCTSIZE*1] */
  movq_r2r (mm1, mm2);          /*      copy tmp6       : phase 6 */
  */movq_m2r (*(idata + 14), mm4);      /* load idata[DCTSIZE*7] */
  paddw_r2r (mm0, mm1);         /*      z13 = tmp6 + tmp5; */

  psubw_r2r (mm0, mm2);         /*      z10 = tmp6 - tmp5    */

  psllw_i2r (2, mm2);           /*      shift z10 */
  movq_r2r (mm2, mm0);          /*      copy z10 */

  pmulhw_m2r (fix_184n261, mm2);        /* MULTIPLY( z12, FIX_1_847759065); : 2*c2 */
  movq_r2r (mm3, mm5);          /*      copy tmp4 */

  pmulhw_m2r (fix_n184, mm0);   /* MULTIPLY(z10, -FIX_1_847759065); : 2*c2 */
  paddw_r2r (mm4, mm3);         /* z11 = tmp4 + tmp7; */

  movq_r2r (mm3, mm6);          /* copy z11                     : phase 5 */
  psubw_r2r (mm4, mm5);         /*      z12 = tmp4 - tmp7; */

  psubw_r2r (mm1, mm6);         /* z11-z13 */
  psllw_i2r (2, mm5);           /*      shift z12 */

  movq_m2r (*(idata + 12), mm4);        /* load idata[DCTSIZE*6], even part */
  movq_r2r (mm5, mm7);          /* copy z12 */

  pmulhw_m2r (fix_108n184, mm5);        /* MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; 2*(c2-c6) even part */
  paddw_r2r (mm1, mm3);         /* tmp7 = z11 + z13;     */

  /*ok */

  /* Even part */
  pmulhw_m2r (fix_184, mm7);    /* MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; -2*(c2+c6) */
  psllw_i2r (2, mm6);

  movq_m2r (*(idata + 4), mm1); /* load idata[DCTSIZE*2] */

  paddw_r2r (mm5, mm0);         /*      tmp10 */

  paddw_r2r (mm7, mm2);         /* tmp12 */

  pmulhw_m2r (fix_141, mm6);    /* tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); 2*c4 */
  psubw_r2r (mm3, mm2);         /* tmp6 = tmp12 - tmp7 */

  movq_r2r (mm1, mm5);          /* copy tmp1 */
  paddw_r2r (mm4, mm1);         /* tmp13= tmp1 + tmp3;  phases 5-3 */

  psubw_r2r (mm4, mm5);         /* tmp1-tmp3 */
  psubw_r2r (mm2, mm6);         /* tmp5 = tmp11 - tmp6; */

  movq_r2m (mm1, *(wsptr));     /* save tmp13 in workspace */
  psllw_i2r (2, mm5);           /* shift tmp1-tmp3 */

  movq_m2r (*(idata), mm7);     /* load idata[DCTSIZE*0] */
  paddw_r2r (mm6, mm0);         /* tmp4 = tmp10 + tmp5; */

  pmulhw_m2r (fix_141, mm5);    /* MULTIPLY(tmp1 - tmp3, FIX_1_414213562) */

  movq_m2r (*(idata + 8), mm4); /* load idata[DCTSIZE*4] */

  psubw_r2r (mm1, mm5);         /* tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; 2*c4 */

  movq_r2m (mm0, *(wsptr + 4)); /* save tmp4 in workspace */
  movq_r2r (mm7, mm1);          /* copy tmp0: phase 3 */

  movq_r2m (mm5, *(wsptr + 2)); /* save tmp12 in workspace */
  psubw_r2r (mm4, mm1);         /* tmp11 = tmp0 - tmp2;  */

  paddw_r2r (mm4, mm7);         /* tmp10 = tmp0 + tmp2; */
  movq_r2r (mm1, mm5);          /* copy tmp11 */

  paddw_m2r (*(wsptr + 2), mm1);        /* tmp1 = tmp11 + tmp12; */
  movq_r2r (mm7, mm4);          /* copy tmp10: phase 2 */

  paddw_m2r (*(wsptr), mm7);    /* tmp0 = tmp10 + tmp13;         */

  psubw_m2r (*(wsptr), mm4);    /* tmp3 = tmp10 - tmp13; */
  movq_r2r (mm7, mm0);          /* copy tmp0 */

  psubw_m2r (*(wsptr + 2), mm5);        /* tmp2 = tmp11 - tmp12; */
  paddw_r2r (mm3, mm7);         /* wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7); */

  psubw_r2r (mm3, mm0);         /* wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7); */

  movq_r2m (mm7, *(wsptr));     /* wsptr[DCTSIZE*0] */
  movq_r2r (mm1, mm3);          /* copy tmp1 */

  movq_r2m (mm0, *(wsptr + 14));        /* wsptr[DCTSIZE*7] */
  paddw_r2r (mm2, mm1);         /* wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6); */

  psubw_r2r (mm2, mm3);         /* wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6); */

  movq_r2m (mm1, *(wsptr + 2)); /* wsptr[DCTSIZE*1] */
  movq_r2r (mm4, mm1);          /* copy tmp3 */

  movq_r2m (mm3, *(wsptr + 12));        /* wsptr[DCTSIZE*6] */

  paddw_m2r (*(wsptr + 4), mm4);        /* wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4); */

  psubw_m2r (*(wsptr + 4), mm1);        /* wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4); */

  movq_r2m (mm4, *(wsptr + 8));
  movq_r2r (mm5, mm7);          /* copy tmp2 */

  paddw_r2r (mm6, mm5);         /* wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5) */

  movq_r2m (mm1, *(wsptr + 6));
  psubw_r2r (mm6, mm7);         /* wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5); */

  movq_r2m (mm5, *(wsptr + 4));

  movq_r2m (mm7, *(wsptr + 10));

/*****************************************************************/

  /* Pass 2: process rows from work array, store into output array. */
  /* Note that we must descale the results by a factor of 8 == 2**3, */
  /* and also undo the PASS1_BITS scaling. */

/*****************************************************************/
  /* Even part */

  wsptr--;

/*    tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); */
/*    tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); */
/*    tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); */
/*    tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]); */
  movq_m2r (*(wsptr), mm0);     /* wsptr[0,0],[0,1],[0,2],[0,3] */

  movq_m2r (*(wsptr + 1), mm1); /* wsptr[0,4],[0,5],[0,6],[0,7] */
  movq_r2r (mm0, mm2);

  movq_m2r (*(wsptr + 2), mm3); /* wsptr[1,0],[1,1],[1,2],[1,3] */
  paddw_r2r (mm1, mm0);         /* wsptr[0,tmp10],[xxx],[0,tmp13],[xxx] */

  movq_m2r (*(wsptr + 3), mm4); /* wsptr[1,4],[1,5],[1,6],[1,7] */
  psubw_r2r (mm1, mm2);         /* wsptr[0,tmp11],[xxx],[0,tmp14],[xxx] */

  movq_r2r (mm0, mm6);
  movq_r2r (mm3, mm5);

  paddw_r2r (mm4, mm3);         /* wsptr[1,tmp10],[xxx],[1,tmp13],[xxx] */
  movq_r2r (mm2, mm1);

  psubw_r2r (mm4, mm5);         /* wsptr[1,tmp11],[xxx],[1,tmp14],[xxx] */
  punpcklwd_r2r (mm3, mm0);     /* wsptr[0,tmp10],[1,tmp10],[xxx],[xxx] */

  movq_m2r (*(wsptr + 7), mm7); /* wsptr[3,4],[3,5],[3,6],[3,7] */
  punpckhwd_r2r (mm3, mm6);     /* wsptr[0,tmp13],[1,tmp13],[xxx],[xxx] */

  movq_m2r (*(wsptr + 4), mm3); /* wsptr[2,0],[2,1],[2,2],[2,3] */
  punpckldq_r2r (mm6, mm0);     /* wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */

  punpcklwd_r2r (mm5, mm1);     /* wsptr[0,tmp11],[1,tmp11],[xxx],[xxx] */
  movq_r2r (mm3, mm4);

  movq_m2r (*(wsptr + 6), mm6); /* wsptr[3,0],[3,1],[3,2],[3,3] */
  punpckhwd_r2r (mm5, mm2);     /* wsptr[0,tmp14],[1,tmp14],[xxx],[xxx] */

  movq_m2r (*(wsptr + 5), mm5); /* wsptr[2,4],[2,5],[2,6],[2,7] */
  punpckldq_r2r (mm2, mm1);     /* wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */


  paddw_r2r (mm5, mm3);         /* wsptr[2,tmp10],[xxx],[2,tmp13],[xxx] */
  movq_r2r (mm6, mm2);

  psubw_r2r (mm5, mm4);         /* wsptr[2,tmp11],[xxx],[2,tmp14],[xxx] */
  paddw_r2r (mm7, mm6);         /* wsptr[3,tmp10],[xxx],[3,tmp13],[xxx] */

  movq_r2r (mm3, mm5);
  punpcklwd_r2r (mm6, mm3);     /* wsptr[2,tmp10],[3,tmp10],[xxx],[xxx] */

  psubw_r2r (mm7, mm2);         /* wsptr[3,tmp11],[xxx],[3,tmp14],[xxx] */
  punpckhwd_r2r (mm6, mm5);     /* wsptr[2,tmp13],[3,tmp13],[xxx],[xxx] */

  movq_r2r (mm4, mm7);
  punpckldq_r2r (mm5, mm3);     /* wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13] */

  punpcklwd_r2r (mm2, mm4);     /* wsptr[2,tmp11],[3,tmp11],[xxx],[xxx] */

  punpckhwd_r2r (mm2, mm7);     /* wsptr[2,tmp14],[3,tmp14],[xxx],[xxx] */

  punpckldq_r2r (mm7, mm4);     /* wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14] */
  movq_r2r (mm1, mm6);

  /*ok */

/*      mm0 =   ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */
/*      mm1 =   ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */


  movq_r2r (mm0, mm2);
  punpckhdq_r2r (mm4, mm6);     /* wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14] */

  punpckldq_r2r (mm4, mm1);     /* wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11] */
  psllw_i2r (2, mm6);

  pmulhw_m2r (fix_141, mm6);
  punpckldq_r2r (mm3, mm0);     /* wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10] */

  punpckhdq_r2r (mm3, mm2);     /* wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13] */
  movq_r2r (mm0, mm7);

/*    tmp0 = tmp10 + tmp13; */
/*    tmp3 = tmp10 - tmp13; */
  paddw_r2r (mm2, mm0);         /* [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0] */
  psubw_r2r (mm2, mm7);         /* [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3] */

/*    tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13; */
  psubw_r2r (mm2, mm6);         /* wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12] */
/*    tmp1 = tmp11 + tmp12; */
/*    tmp2 = tmp11 - tmp12; */
  movq_r2r (mm1, mm5);

  /*OK */

  /* Odd part */

/*    z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; */
/*    z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; */
/*    z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; */
/*    z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; */
  movq_m2r (*(wsptr), mm3);     /* wsptr[0,0],[0,1],[0,2],[0,3] */
  paddw_r2r (mm6, mm1);         /* [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1] */

  movq_m2r (*(wsptr + 1), mm4); /* wsptr[0,4],[0,5],[0,6],[0,7] */
  psubw_r2r (mm6, mm5);         /* [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2] */

  movq_r2r (mm3, mm6);
  punpckldq_r2r (mm4, mm3);     /* wsptr[0,0],[0,1],[0,4],[0,5] */

  punpckhdq_r2r (mm6, mm4);     /* wsptr[0,6],[0,7],[0,2],[0,3] */
  movq_r2r (mm3, mm2);

/*Save tmp0 and tmp1 in wsptr */
  movq_r2m (mm0, *(wsptr));     /* save tmp0 */
  paddw_r2r (mm4, mm2);         /* wsptr[xxx],[0,z11],[xxx],[0,z13] */


/*Continue with z10 --- z13 */
  movq_m2r (*(wsptr + 2), mm6); /* wsptr[1,0],[1,1],[1,2],[1,3] */
  psubw_r2r (mm4, mm3);         /* wsptr[xxx],[0,z12],[xxx],[0,z10] */

  movq_m2r (*(wsptr + 3), mm0); /* wsptr[1,4],[1,5],[1,6],[1,7] */
  movq_r2r (mm6, mm4);

  movq_r2m (mm1, *(wsptr + 1)); /* save tmp1 */
  punpckldq_r2r (mm0, mm6);     /* wsptr[1,0],[1,1],[1,4],[1,5] */

  punpckhdq_r2r (mm4, mm0);     /* wsptr[1,6],[1,7],[1,2],[1,3] */
  movq_r2r (mm6, mm1);

/*Save tmp2 and tmp3 in wsptr */
  paddw_r2r (mm0, mm6);         /* wsptr[xxx],[1,z11],[xxx],[1,z13] */
  movq_r2r (mm2, mm4);

/*Continue with z10 --- z13 */
  movq_r2m (mm5, *(wsptr + 2)); /* save tmp2 */
  punpcklwd_r2r (mm6, mm2);     /* wsptr[xxx],[xxx],[0,z11],[1,z11] */

  psubw_r2r (mm0, mm1);         /* wsptr[xxx],[1,z12],[xxx],[1,z10] */
  punpckhwd_r2r (mm6, mm4);     /* wsptr[xxx],[xxx],[0,z13],[1,z13] */

  movq_r2r (mm3, mm0);
  punpcklwd_r2r (mm1, mm3);     /* wsptr[xxx],[xxx],[0,z12],[1,z12] */

  movq_r2m (mm7, *(wsptr + 3)); /* save tmp3 */
  punpckhwd_r2r (mm1, mm0);     /* wsptr[xxx],[xxx],[0,z10],[1,z10] */

  movq_m2r (*(wsptr + 4), mm6); /* wsptr[2,0],[2,1],[2,2],[2,3] */
  punpckhdq_r2r (mm2, mm0);     /* wsptr[0,z10],[1,z10],[0,z11],[1,z11] */

  movq_m2r (*(wsptr + 5), mm7); /* wsptr[2,4],[2,5],[2,6],[2,7] */
  punpckhdq_r2r (mm4, mm3);     /* wsptr[0,z12],[1,z12],[0,z13],[1,z13] */

  movq_m2r (*(wsptr + 6), mm1); /* wsptr[3,0],[3,1],[3,2],[3,3] */
  movq_r2r (mm6, mm4);

  punpckldq_r2r (mm7, mm6);     /* wsptr[2,0],[2,1],[2,4],[2,5] */
  movq_r2r (mm1, mm5);

  punpckhdq_r2r (mm4, mm7);     /* wsptr[2,6],[2,7],[2,2],[2,3] */
  movq_r2r (mm6, mm2);

  movq_m2r (*(wsptr + 7), mm4); /* wsptr[3,4],[3,5],[3,6],[3,7] */
  paddw_r2r (mm7, mm6);         /* wsptr[xxx],[2,z11],[xxx],[2,z13] */

  psubw_r2r (mm7, mm2);         /* wsptr[xxx],[2,z12],[xxx],[2,z10] */
  punpckldq_r2r (mm4, mm1);     /* wsptr[3,0],[3,1],[3,4],[3,5] */

  punpckhdq_r2r (mm5, mm4);     /* wsptr[3,6],[3,7],[3,2],[3,3] */
  movq_r2r (mm1, mm7);

  paddw_r2r (mm4, mm1);         /* wsptr[xxx],[3,z11],[xxx],[3,z13] */
  psubw_r2r (mm4, mm7);         /* wsptr[xxx],[3,z12],[xxx],[3,z10] */

  movq_r2r (mm6, mm5);
  punpcklwd_r2r (mm1, mm6);     /* wsptr[xxx],[xxx],[2,z11],[3,z11] */

  punpckhwd_r2r (mm1, mm5);     /* wsptr[xxx],[xxx],[2,z13],[3,z13] */
  movq_r2r (mm2, mm4);

  punpcklwd_r2r (mm7, mm2);     /* wsptr[xxx],[xxx],[2,z12],[3,z12] */

  punpckhwd_r2r (mm7, mm4);     /* wsptr[xxx],[xxx],[2,z10],[3,z10] */

  punpckhdq_r2r (mm6, mm4);     /*/ wsptr[2,z10],[3,z10],[2,z11],[3,z11] */

  punpckhdq_r2r (mm5, mm2);     /* wsptr[2,z12],[3,z12],[2,z13],[3,z13] */
  movq_r2r (mm0, mm5);

  punpckldq_r2r (mm4, mm0);     /* wsptr[0,z10],[1,z10],[2,z10],[3,z10] */

  punpckhdq_r2r (mm4, mm5);     /* wsptr[0,z11],[1,z11],[2,z11],[3,z11] */
  movq_r2r (mm3, mm4);

  punpckhdq_r2r (mm2, mm4);     /* wsptr[0,z13],[1,z13],[2,z13],[3,z13] */
  movq_r2r (mm5, mm1);

  punpckldq_r2r (mm2, mm3);     /* wsptr[0,z12],[1,z12],[2,z12],[3,z12] */
/*    tmp7 = z11 + z13;         : phase 5 */
/*    tmp8 = z11 - z13;         : phase 5 */
  psubw_r2r (mm4, mm1);         /* tmp8 */

  paddw_r2r (mm4, mm5);         /* tmp7 */
/*    tmp21 = MULTIPLY(tmp8, FIX_1_414213562); 2*c4  */
  psllw_i2r (2, mm1);

  psllw_i2r (2, mm0);

  pmulhw_m2r (fix_141, mm1);    /* tmp21 */
/*    tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065))  2*(c2-c6) */
/*                      + MULTIPLY(z10, - FIX_1_847759065); : 2*c2 */
  psllw_i2r (2, mm3);
  movq_r2r (mm0, mm7);

  pmulhw_m2r (fix_n184, mm7);
  movq_r2r (mm3, mm6);

  movq_m2r (*(wsptr), mm2);     /* tmp0,final1 */

  pmulhw_m2r (fix_108n184, mm6);
/*       tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) : -2*(c2+c6) */
/*                      + MULTIPLY(z12, FIX_1_847759065); 2*c2 */
  movq_r2r (mm2, mm4);          /* final1 */

  pmulhw_m2r (fix_184n261, mm0);
  paddw_r2r (mm5, mm2);         /* tmp0+tmp7,final1 */

  pmulhw_m2r (fix_184, mm3);
  psubw_r2r (mm5, mm4);         /* tmp0-tmp7,final1 */

/*    tmp6 = tmp22 - tmp7;      phase 2 */
  psraw_i2r (3, mm2);           /* outptr[0,0],[1,0],[2,0],[3,0],final1 */

  paddw_r2r (mm6, mm7);         /* tmp20 */
  psraw_i2r (3, mm4);           /* outptr[0,7],[1,7],[2,7],[3,7],final1 */

  paddw_r2r (mm0, mm3);         /* tmp22 */

/*    tmp5 = tmp21 - tmp6; */
  psubw_r2r (mm5, mm3);         /* tmp6 */

/*    tmp4 = tmp20 + tmp5; */
  movq_m2r (*(wsptr + 1), mm0); /* tmp1,final2 */
  psubw_r2r (mm3, mm1);         /* tmp5 */

  movq_r2r (mm0, mm6);          /* final2 */
  paddw_r2r (mm3, mm0);         /* tmp1+tmp6,final2 */

  /* Final output stage: scale down by a factor of 8 and range-limit */


/*    outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) */
/*                          & RANGE_MASK]; */
/*    outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) */
/*                          & RANGE_MASK];      final1 */


/*    outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) */
/*                          & RANGE_MASK]; */
/*    outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) */
/*                          & RANGE_MASK];      final2 */
  psubw_r2r (mm3, mm6);         /* tmp1-tmp6,final2 */
  psraw_i2r (3, mm0);           /* outptr[0,1],[1,1],[2,1],[3,1] */

  psraw_i2r (3, mm6);           /* outptr[0,6],[1,6],[2,6],[3,6] */

  packuswb_r2r (mm4, mm0);      /* out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7] */

  movq_m2r (*(wsptr + 2), mm5); /* tmp2,final3 */
  packuswb_r2r (mm6, mm2);      /* out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6] */

/*    outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) */
/*                          & RANGE_MASK]; */
/*    outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) */
/*                          & RANGE_MASK];      final3 */
  paddw_r2r (mm1, mm7);         /* tmp4 */
  movq_r2r (mm5, mm3);

  paddw_r2r (mm1, mm5);         /* tmp2+tmp5 */
  psubw_r2r (mm1, mm3);         /* tmp2-tmp5 */

  psraw_i2r (3, mm5);           /* outptr[0,2],[1,2],[2,2],[3,2] */

  movq_m2r (*(wsptr + 3), mm4); /* tmp3,final4 */
  psraw_i2r (3, mm3);           /* outptr[0,5],[1,5],[2,5],[3,5] */



/*    outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) */
/*                          & RANGE_MASK]; */
/*    outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) */
/*                          & RANGE_MASK];      final4 */
  movq_r2r (mm4, mm6);
  paddw_r2r (mm7, mm4);         /* tmp3+tmp4 */

  psubw_r2r (mm7, mm6);         /* tmp3-tmp4 */
  psraw_i2r (3, mm4);           /* outptr[0,4],[1,4],[2,4],[3,4] */

  /* mov                  ecx, [dataptr] */

  psraw_i2r (3, mm6);           /* outptr[0,3],[1,3],[2,3],[3,3] */

  packuswb_r2r (mm4, mm5);      /* out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4] */

  packuswb_r2r (mm3, mm6);      /* out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5] */
  movq_r2r (mm2, mm4);

  movq_r2r (mm5, mm7);
  punpcklbw_r2r (mm0, mm2);     /* out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1] */

  punpckhbw_r2r (mm0, mm4);     /* out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7] */
  movq_r2r (mm2, mm1);

  punpcklbw_r2r (mm6, mm5);     /* out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3] */

  /* add                  dataptr, 4 */

  punpckhbw_r2r (mm6, mm7);     /* out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5] */

  punpcklwd_r2r (mm5, mm2);     /* out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3] */

  /* add                  ecx, output_col */

  movq_r2r (mm7, mm6);
  punpckhwd_r2r (mm5, mm1);     /* out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3] */

  movq_r2r (mm2, mm0);
  punpcklwd_r2r (mm4, mm6);     /* out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7] */

  /* mov                  idata, [dataptr] */

  punpckldq_r2r (mm6, mm2);     /* out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7] */

  /* add                  dataptr, 4 */

  movq_r2r (mm1, mm3);

  /* add                  idata, output_col  */

  punpckhwd_r2r (mm4, mm7);     /* out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7] */

  movq_r2m (mm2, *(dataptr));

  punpckhdq_r2r (mm6, mm0);     /* out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7] */

  dataptr += rskip;
  movq_r2m (mm0, *(dataptr));

  punpckldq_r2r (mm7, mm1);     /* out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7] */
  punpckhdq_r2r (mm7, mm3);     /* out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7] */

  dataptr += rskip;
  movq_r2m (mm1, *(dataptr));

  dataptr += rskip;
  movq_r2m (mm3, *(dataptr));

/*******************************************************************/

  wsptr += 8;

/*******************************************************************/

/*    tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); */
/*    tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); */
/*    tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); */
/*    tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]); */
  movq_m2r (*(wsptr), mm0);     /* wsptr[0,0],[0,1],[0,2],[0,3] */

  movq_m2r (*(wsptr + 1), mm1); /* wsptr[0,4],[0,5],[0,6],[0,7] */
  movq_r2r (mm0, mm2);

  movq_m2r (*(wsptr + 2), mm3); /* wsptr[1,0],[1,1],[1,2],[1,3] */
  paddw_r2r (mm1, mm0);         /* wsptr[0,tmp10],[xxx],[0,tmp13],[xxx] */

  movq_m2r (*(wsptr + 3), mm4); /* wsptr[1,4],[1,5],[1,6],[1,7] */
  psubw_r2r (mm1, mm2);         /* wsptr[0,tmp11],[xxx],[0,tmp14],[xxx] */

  movq_r2r (mm0, mm6);
  movq_r2r (mm3, mm5);

  paddw_r2r (mm4, mm3);         /* wsptr[1,tmp10],[xxx],[1,tmp13],[xxx] */
  movq_r2r (mm2, mm1);

  psubw_r2r (mm4, mm5);         /* wsptr[1,tmp11],[xxx],[1,tmp14],[xxx] */
  punpcklwd_r2r (mm3, mm0);     /* wsptr[0,tmp10],[1,tmp10],[xxx],[xxx] */

  movq_m2r (*(wsptr + 7), mm7); /* wsptr[3,4],[3,5],[3,6],[3,7] */
  punpckhwd_r2r (mm3, mm6);     /* wsptr[0,tmp13],[1,tmp13],[xxx],[xxx] */

  movq_m2r (*(wsptr + 4), mm3); /* wsptr[2,0],[2,1],[2,2],[2,3] */
  punpckldq_r2r (mm6, mm0);     /* wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */

  punpcklwd_r2r (mm5, mm1);     /* wsptr[0,tmp11],[1,tmp11],[xxx],[xxx] */
  movq_r2r (mm3, mm4);

  movq_m2r (*(wsptr + 6), mm6); /* wsptr[3,0],[3,1],[3,2],[3,3] */
  punpckhwd_r2r (mm5, mm2);     /* wsptr[0,tmp14],[1,tmp14],[xxx],[xxx] */

  movq_m2r (*(wsptr + 5), mm5); /* wsptr[2,4],[2,5],[2,6],[2,7] */
  punpckldq_r2r (mm2, mm1);     /* wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */

  paddw_r2r (mm5, mm3);         /* wsptr[2,tmp10],[xxx],[2,tmp13],[xxx] */
  movq_r2r (mm6, mm2);

  psubw_r2r (mm5, mm4);         /* wsptr[2,tmp11],[xxx],[2,tmp14],[xxx] */
  paddw_r2r (mm7, mm6);         /* wsptr[3,tmp10],[xxx],[3,tmp13],[xxx] */

  movq_r2r (mm3, mm5);
  punpcklwd_r2r (mm6, mm3);     /* wsptr[2,tmp10],[3,tmp10],[xxx],[xxx] */

  psubw_r2r (mm7, mm2);         /* wsptr[3,tmp11],[xxx],[3,tmp14],[xxx] */
  punpckhwd_r2r (mm6, mm5);     /* wsptr[2,tmp13],[3,tmp13],[xxx],[xxx] */

  movq_r2r (mm4, mm7);
  punpckldq_r2r (mm5, mm3);     /* wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13] */

  punpcklwd_r2r (mm2, mm4);     /* wsptr[2,tmp11],[3,tmp11],[xxx],[xxx] */

  punpckhwd_r2r (mm2, mm7);     /* wsptr[2,tmp14],[3,tmp14],[xxx],[xxx] */

  punpckldq_r2r (mm7, mm4);     /* wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14] */
  movq_r2r (mm1, mm6);

  /*OK */

/*      mm0 =   ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13] */
/*      mm1 =   ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14] */

  movq_r2r (mm0, mm2);
  punpckhdq_r2r (mm4, mm6);     /* wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14] */

  punpckldq_r2r (mm4, mm1);     /* wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11] */
  psllw_i2r (2, mm6);

  pmulhw_m2r (fix_141, mm6);
  punpckldq_r2r (mm3, mm0);     /* wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10] */

  punpckhdq_r2r (mm3, mm2);     /* wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13] */
  movq_r2r (mm0, mm7);

/*    tmp0 = tmp10 + tmp13; */
/*    tmp3 = tmp10 - tmp13; */
  paddw_r2r (mm2, mm0);         /* [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0] */
  psubw_r2r (mm2, mm7);         /* [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3] */

/*    tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13; */
  psubw_r2r (mm2, mm6);         /* wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12] */
/*    tmp1 = tmp11 + tmp12; */
/*    tmp2 = tmp11 - tmp12; */
  movq_r2r (mm1, mm5);

  /*OK */


  /* Odd part */

/*    z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; */
/*    z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; */
/*    z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; */
/*    z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; */
  movq_m2r (*(wsptr), mm3);     /* wsptr[0,0],[0,1],[0,2],[0,3] */
  paddw_r2r (mm6, mm1);         /* [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1] */

  movq_m2r (*(wsptr + 1), mm4); /* wsptr[0,4],[0,5],[0,6],[0,7] */
  psubw_r2r (mm6, mm5);         /* [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2] */

  movq_r2r (mm3, mm6);
  punpckldq_r2r (mm4, mm3);     /* wsptr[0,0],[0,1],[0,4],[0,5] */

  punpckhdq_r2r (mm6, mm4);     /* wsptr[0,6],[0,7],[0,2],[0,3] */
  movq_r2r (mm3, mm2);

/*Save tmp0 and tmp1 in wsptr */
  movq_r2m (mm0, *(wsptr));     /* save tmp0 */
  paddw_r2r (mm4, mm2);         /* wsptr[xxx],[0,z11],[xxx],[0,z13] */


/*Continue with z10 --- z13 */
  movq_m2r (*(wsptr + 2), mm6); /* wsptr[1,0],[1,1],[1,2],[1,3] */
  psubw_r2r (mm4, mm3);         /* wsptr[xxx],[0,z12],[xxx],[0,z10] */

  movq_m2r (*(wsptr + 3), mm0); /* wsptr[1,4],[1,5],[1,6],[1,7] */
  movq_r2r (mm6, mm4);

  movq_r2m (mm1, *(wsptr + 1)); /* save tmp1 */
  punpckldq_r2r (mm0, mm6);     /* wsptr[1,0],[1,1],[1,4],[1,5] */

  punpckhdq_r2r (mm4, mm0);     /* wsptr[1,6],[1,7],[1,2],[1,3] */
  movq_r2r (mm6, mm1);

/*Save tmp2 and tmp3 in wsptr */
  paddw_r2r (mm0, mm6);         /* wsptr[xxx],[1,z11],[xxx],[1,z13] */
  movq_r2r (mm2, mm4);

/*Continue with z10 --- z13 */
  movq_r2m (mm5, *(wsptr + 2)); /* save tmp2 */
  punpcklwd_r2r (mm6, mm2);     /* wsptr[xxx],[xxx],[0,z11],[1,z11] */

  psubw_r2r (mm0, mm1);         /* wsptr[xxx],[1,z12],[xxx],[1,z10] */
  punpckhwd_r2r (mm6, mm4);     /* wsptr[xxx],[xxx],[0,z13],[1,z13] */

  movq_r2r (mm3, mm0);
  punpcklwd_r2r (mm1, mm3);     /* wsptr[xxx],[xxx],[0,z12],[1,z12] */

  movq_r2m (mm7, *(wsptr + 3)); /* save tmp3 */
  punpckhwd_r2r (mm1, mm0);     /* wsptr[xxx],[xxx],[0,z10],[1,z10] */

  movq_m2r (*(wsptr + 4), mm6); /* wsptr[2,0],[2,1],[2,2],[2,3] */
  punpckhdq_r2r (mm2, mm0);     /* wsptr[0,z10],[1,z10],[0,z11],[1,z11] */

  movq_m2r (*(wsptr + 5), mm7); /* wsptr[2,4],[2,5],[2,6],[2,7] */
  punpckhdq_r2r (mm4, mm3);     /* wsptr[0,z12],[1,z12],[0,z13],[1,z13] */

  movq_m2r (*(wsptr + 6), mm1); /* wsptr[3,0],[3,1],[3,2],[3,3] */
  movq_r2r (mm6, mm4);

  punpckldq_r2r (mm7, mm6);     /* wsptr[2,0],[2,1],[2,4],[2,5] */
  movq_r2r (mm1, mm5);

  punpckhdq_r2r (mm4, mm7);     /* wsptr[2,6],[2,7],[2,2],[2,3] */
  movq_r2r (mm6, mm2);

  movq_m2r (*(wsptr + 7), mm4); /* wsptr[3,4],[3,5],[3,6],[3,7] */
  paddw_r2r (mm7, mm6);         /* wsptr[xxx],[2,z11],[xxx],[2,z13] */

  psubw_r2r (mm7, mm2);         /* wsptr[xxx],[2,z12],[xxx],[2,z10] */
  punpckldq_r2r (mm4, mm1);     /* wsptr[3,0],[3,1],[3,4],[3,5] */

  punpckhdq_r2r (mm5, mm4);     /* wsptr[3,6],[3,7],[3,2],[3,3] */
  movq_r2r (mm1, mm7);

  paddw_r2r (mm4, mm1);         /* wsptr[xxx],[3,z11],[xxx],[3,z13] */
  psubw_r2r (mm4, mm7);         /* wsptr[xxx],[3,z12],[xxx],[3,z10] */

  movq_r2r (mm6, mm5);
  punpcklwd_r2r (mm1, mm6);     /* wsptr[xxx],[xxx],[2,z11],[3,z11] */

  punpckhwd_r2r (mm1, mm5);     /* wsptr[xxx],[xxx],[2,z13],[3,z13] */
  movq_r2r (mm2, mm4);

  punpcklwd_r2r (mm7, mm2);     /* wsptr[xxx],[xxx],[2,z12],[3,z12] */

  punpckhwd_r2r (mm7, mm4);     /* wsptr[xxx],[xxx],[2,z10],[3,z10] */

  punpckhdq_r2r (mm6, mm4);     /* wsptr[2,z10],[3,z10],[2,z11],[3,z11] */

  punpckhdq_r2r (mm5, mm2);     /* wsptr[2,z12],[3,z12],[2,z13],[3,z13] */
  movq_r2r (mm0, mm5);

  punpckldq_r2r (mm4, mm0);     /* wsptr[0,z10],[1,z10],[2,z10],[3,z10] */

  punpckhdq_r2r (mm4, mm5);     /* wsptr[0,z11],[1,z11],[2,z11],[3,z11] */
  movq_r2r (mm3, mm4);

  punpckhdq_r2r (mm2, mm4);     /* wsptr[0,z13],[1,z13],[2,z13],[3,z13] */
  movq_r2r (mm5, mm1);

  punpckldq_r2r (mm2, mm3);     /* wsptr[0,z12],[1,z12],[2,z12],[3,z12] */
/*    tmp7 = z11 + z13;         : phase 5 */
/*    tmp8 = z11 - z13;         : phase 5 */
  psubw_r2r (mm4, mm1);         /* tmp8 */

  paddw_r2r (mm4, mm5);         /* tmp7 */
/*    tmp21 = MULTIPLY(tmp8, FIX_1_414213562);  2*c4 */
  psllw_i2r (2, mm1);

  psllw_i2r (2, mm0);

  pmulhw_m2r (fix_141, mm1);    /* tmp21 */
/*    tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) :  2*(c2-c6) */
/*                      + MULTIPLY(z10, - FIX_1_847759065); : 2*c2 */
  psllw_i2r (2, mm3);
  movq_r2r (mm0, mm7);

  pmulhw_m2r (fix_n184, mm7);
  movq_r2r (mm3, mm6);

  movq_m2r (*(wsptr), mm2);     /* tmp0,final1 */

  pmulhw_m2r (fix_108n184, mm6);
/*       tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) : -2*(c2+c6) */
/*                      + MULTIPLY(z12, FIX_1_847759065); : 2*c2 */
  movq_r2r (mm2, mm4);          /* final1 */

  pmulhw_m2r (fix_184n261, mm0);
  paddw_r2r (mm5, mm2);         /* tmp0+tmp7,final1 */

  pmulhw_m2r (fix_184, mm3);
  psubw_r2r (mm5, mm4);         /* tmp0-tmp7,final1 */

/*    tmp6 = tmp22 - tmp7;      phase 2 */
  psraw_i2r (3, mm2);           /* outptr[0,0],[1,0],[2,0],[3,0],final1 */

  paddw_r2r (mm6, mm7);         /* tmp20 */
  psraw_i2r (3, mm4);           /* outptr[0,7],[1,7],[2,7],[3,7],final1 */

  paddw_r2r (mm0, mm3);         /* tmp22 */

/*    tmp5 = tmp21 - tmp6; */
  psubw_r2r (mm5, mm3);         /* tmp6 */

/*    tmp4 = tmp20 + tmp5; */
  movq_m2r (*(wsptr + 1), mm0); /* tmp1,final2 */
  psubw_r2r (mm3, mm1);         /* tmp5 */

  movq_r2r (mm0, mm6);          /* final2 */
  paddw_r2r (mm3, mm0);         /* tmp1+tmp6,final2 */

  /* Final output stage: scale down by a factor of 8 and range-limit */

/*    outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) */
/*                          & RANGE_MASK]; */
/*    outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) */
/*                          & RANGE_MASK];      final1 */


/*    outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) */
/*                          & RANGE_MASK]; */
/*    outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) */
/*                          & RANGE_MASK];      final2 */
  psubw_r2r (mm3, mm6);         /* tmp1-tmp6,final2 */
  psraw_i2r (3, mm0);           /* outptr[0,1],[1,1],[2,1],[3,1] */

  psraw_i2r (3, mm6);           /* outptr[0,6],[1,6],[2,6],[3,6] */

  packuswb_r2r (mm4, mm0);      /* out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7] */

  movq_m2r (*(wsptr + 2), mm5); /* tmp2,final3 */
  packuswb_r2r (mm6, mm2);      /* out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6] */

/*    outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) */
/*                          & RANGE_MASK]; */
/*    outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) */
/*                          & RANGE_MASK];      final3 */
  paddw_r2r (mm1, mm7);         /* tmp4 */
  movq_r2r (mm5, mm3);

  paddw_r2r (mm1, mm5);         /* tmp2+tmp5 */
  psubw_r2r (mm1, mm3);         /* tmp2-tmp5 */

  psraw_i2r (3, mm5);           /* outptr[0,2],[1,2],[2,2],[3,2] */

  movq_m2r (*(wsptr + 3), mm4); /* tmp3,final4 */
  psraw_i2r (3, mm3);           /* outptr[0,5],[1,5],[2,5],[3,5] */



/*    outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) */
/*                          & RANGE_MASK]; */
/*    outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) */
/*                          & RANGE_MASK];      final4 */
  movq_r2r (mm4, mm6);
  paddw_r2r (mm7, mm4);         /* tmp3+tmp4 */

  psubw_r2r (mm7, mm6);         /* tmp3-tmp4 */
  psraw_i2r (3, mm4);           /* outptr[0,4],[1,4],[2,4],[3,4] */

  psraw_i2r (3, mm6);           /* outptr[0,3],[1,3],[2,3],[3,3] */

  /*
     movq_r2m(mm4, *dummy);
     fprintf(stderr, "3-4 %016llx\n", dummy);
     movq_r2m(mm4, *dummy);
     fprintf(stderr, "3+4 %016llx\n", dummy);
   */


  packuswb_r2r (mm4, mm5);      /* out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4] */

  packuswb_r2r (mm3, mm6);      /* out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5] */
  movq_r2r (mm2, mm4);

  movq_r2r (mm5, mm7);
  punpcklbw_r2r (mm0, mm2);     /* out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1] */

  punpckhbw_r2r (mm0, mm4);     /* out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7] */
  movq_r2r (mm2, mm1);

  punpcklbw_r2r (mm6, mm5);     /* out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3] */

  punpckhbw_r2r (mm6, mm7);     /* out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5] */

  punpcklwd_r2r (mm5, mm2);     /* out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3] */

  movq_r2r (mm7, mm6);
  punpckhwd_r2r (mm5, mm1);     /* out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3] */

  movq_r2r (mm2, mm0);
  punpcklwd_r2r (mm4, mm6);     /* out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7] */

  punpckldq_r2r (mm6, mm2);     /* out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7] */

  movq_r2r (mm1, mm3);

  punpckhwd_r2r (mm4, mm7);     /* out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7] */

  dataptr += rskip;
  movq_r2m (mm2, *(dataptr));

  punpckhdq_r2r (mm6, mm0);     /* out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7] */

  dataptr += rskip;
  movq_r2m (mm0, *(dataptr));

  punpckldq_r2r (mm7, mm1);     /* out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7] */

  punpckhdq_r2r (mm7, mm3);     /* out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7] */

  dataptr += rskip;
  movq_r2m (mm1, *(dataptr));

  dataptr += rskip;
  movq_r2m (mm3, *(dataptr));

#else
  __s32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  __s32 tmp10, tmp11, tmp12, tmp13;
  __s32 z5, z10, z11, z12, z13;
  __s16 *inptr;
  __s32 *wsptr;
  __u8 *outptr;
  int ctr;
  __s32 dcval;
  __s32 workspace[64];

  inptr = data;
  wsptr = workspace;
  for (ctr = 8; ctr > 0; ctr--) {

    if ((inptr[8] | inptr[16] | inptr[24] |
            inptr[32] | inptr[40] | inptr[48] | inptr[56]) == 0) {
      dcval = inptr[0];
      wsptr[0] = dcval;
      wsptr[8] = dcval;
      wsptr[16] = dcval;
      wsptr[24] = dcval;
      wsptr[32] = dcval;
      wsptr[40] = dcval;
      wsptr[48] = dcval;
      wsptr[56] = dcval;

      inptr++;
      wsptr++;
      continue;
    }

    tmp0 = inptr[0];
    tmp1 = inptr[16];
    tmp2 = inptr[32];
    tmp3 = inptr[48];

    tmp10 = tmp0 + tmp2;
    tmp11 = tmp0 - tmp2;

    tmp13 = tmp1 + tmp3;
    tmp12 = MULTIPLY (tmp1 - tmp3, FIX_1_414213562) - tmp13;

    tmp0 = tmp10 + tmp13;
    tmp3 = tmp10 - tmp13;
    tmp1 = tmp11 + tmp12;
    tmp2 = tmp11 - tmp12;

    tmp4 = inptr[8];
    tmp5 = inptr[24];
    tmp6 = inptr[40];
    tmp7 = inptr[56];

    z13 = tmp6 + tmp5;
    z10 = tmp6 - tmp5;
    z11 = tmp4 + tmp7;
    z12 = tmp4 - tmp7;

    tmp7 = z11 + z13;
    tmp11 = MULTIPLY (z11 - z13, FIX_1_414213562);

    z5 = MULTIPLY (z10 + z12, FIX_1_847759065);
    tmp10 = MULTIPLY (z12, FIX_1_082392200) - z5;
    tmp12 = MULTIPLY (z10, -FIX_2_613125930) + z5;

    tmp6 = tmp12 - tmp7;
    tmp5 = tmp11 - tmp6;
    tmp4 = tmp10 + tmp5;

    wsptr[0] = (__s32) (tmp0 + tmp7);
    wsptr[56] = (__s32) (tmp0 - tmp7);
    wsptr[8] = (__s32) (tmp1 + tmp6);
    wsptr[48] = (__s32) (tmp1 - tmp6);
    wsptr[16] = (__s32) (tmp2 + tmp5);
    wsptr[40] = (__s32) (tmp2 - tmp5);
    wsptr[32] = (__s32) (tmp3 + tmp4);
    wsptr[24] = (__s32) (tmp3 - tmp4);

    inptr++;
    wsptr++;
  }

  wsptr = workspace;
  for (ctr = 0; ctr < 8; ctr++) {
    outptr = &(odata[ctr * rskip]);

    tmp10 = wsptr[0] + wsptr[4];
    tmp11 = wsptr[0] - wsptr[4];

    tmp13 = wsptr[2] + wsptr[6];
    tmp12 = MULTIPLY (wsptr[2] - wsptr[6], FIX_1_414213562) - tmp13;

    tmp0 = tmp10 + tmp13;
    tmp3 = tmp10 - tmp13;
    tmp1 = tmp11 + tmp12;
    tmp2 = tmp11 - tmp12;

    z13 = wsptr[5] + wsptr[3];
    z10 = wsptr[5] - wsptr[3];
    z11 = wsptr[1] + wsptr[7];
    z12 = wsptr[1] - wsptr[7];

    tmp7 = z11 + z13;
    tmp11 = MULTIPLY (z11 - z13, FIX_1_414213562);

    z5 = MULTIPLY (z10 + z12, FIX_1_847759065);
    tmp10 = MULTIPLY (z12, FIX_1_082392200) - z5;
    tmp12 = MULTIPLY (z10, -FIX_2_613125930) + z5;

    tmp6 = tmp12 - tmp7;
    tmp5 = tmp11 - tmp6;
    tmp4 = tmp10 + tmp5;

    outptr[0] = RL (DESCALE (tmp0 + tmp7));
    outptr[7] = RL (DESCALE (tmp0 - tmp7));
    outptr[1] = RL (DESCALE (tmp1 + tmp6));
    outptr[6] = RL (DESCALE (tmp1 - tmp6));
    outptr[2] = RL (DESCALE (tmp2 + tmp5));
    outptr[5] = RL (DESCALE (tmp2 - tmp5));
    outptr[4] = RL (DESCALE (tmp3 + tmp4));
    outptr[3] = RL (DESCALE (tmp3 - tmp4));

    wsptr += 8;
  }
#endif
}

/*

Main Routines

This file contains most of the initialisation and control functions

(C) Justin Schoeman 1998

*/

/*

Private function

Initialise all the cache-aliged data blocks

*/

void
RTjpeg_init_data (void)
{
  unsigned long dptr;

  dptr = (unsigned long) &(RTjpeg_alldata[0]);
  dptr += 32;
  dptr = dptr >> 5;
  dptr = dptr << 5;             /* cache align data */

  RTjpeg_block = (__s16 *) dptr;
  dptr += sizeof (__s16) * 64;
  RTjpeg_lqt = (__s32 *) dptr;
  dptr += sizeof (__s32) * 64;
  RTjpeg_cqt = (__s32 *) dptr;
  dptr += sizeof (__s32) * 64;
  RTjpeg_liqt = (__u32 *) dptr;
  dptr += sizeof (__u32) * 64;
  RTjpeg_ciqt = (__u32 *) dptr;
}

/*

External Function

Re-set quality factor

Input: buf -> pointer to 128 ints for quant values store to pass back to
              init_decompress.
       Q -> quality factor (192=best, 32=worst)
*/

void
RTjpeg_init_Q (__u8 Q)
{
  int i;
  __u64 qual;

  qual = (__u64) Q << (32 - 7); /* 32 bit FP, 255=2, 0=0 */

  for (i = 0; i < 64; i++) {
    RTjpeg_lqt[i] =
        (__s32) ((qual / ((__u64) RTjpeg_lum_quant_tbl[i] << 16)) >> 3);
    if (RTjpeg_lqt[i] == 0)
      RTjpeg_lqt[i] = 1;
    RTjpeg_cqt[i] =
        (__s32) ((qual / ((__u64) RTjpeg_chrom_quant_tbl[i] << 16)) >> 3);
    if (RTjpeg_cqt[i] == 0)
      RTjpeg_cqt[i] = 1;
    RTjpeg_liqt[i] = (1 << 16) / (RTjpeg_lqt[i] << 3);
    RTjpeg_ciqt[i] = (1 << 16) / (RTjpeg_cqt[i] << 3);
    RTjpeg_lqt[i] = ((1 << 16) / RTjpeg_liqt[i]) >> 3;
    RTjpeg_cqt[i] = ((1 << 16) / RTjpeg_ciqt[i]) >> 3;
  }

  RTjpeg_lb8 = 0;
  while (RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]] <= 8);
  RTjpeg_lb8--;
  RTjpeg_cb8 = 0;
  while (RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]] <= 8);
  RTjpeg_cb8--;

  RTjpeg_dct_init ();
  RTjpeg_idct_init ();
  RTjpeg_quant_init ();
}

/*

External Function

Initialise compression.

Input: buf -> pointer to 128 ints for quant values store to pass back to 
                init_decompress.
       width -> width of image
       height -> height of image
       Q -> quality factor (192=best, 32=worst)
       
*/

void
RTjpeg_init_compress (__u32 * buf, int width, int height, __u8 Q)
{
  int i;
  __u64 qual;

  RTjpeg_init_data ();

  RTjpeg_width = width;
  RTjpeg_height = height;
  RTjpeg_Ywidth = RTjpeg_width >> 3;
  RTjpeg_Ysize = width * height;
  RTjpeg_Cwidth = RTjpeg_width >> 4;
  RTjpeg_Csize = (width >> 1) * height;

  qual = (__u64) Q << (32 - 7); /* 32 bit FP, 255=2, 0=0 */

  for (i = 0; i < 64; i++) {
    RTjpeg_lqt[i] =
        (__s32) ((qual / ((__u64) RTjpeg_lum_quant_tbl[i] << 16)) >> 3);
    if (RTjpeg_lqt[i] == 0)
      RTjpeg_lqt[i] = 1;
    RTjpeg_cqt[i] =
        (__s32) ((qual / ((__u64) RTjpeg_chrom_quant_tbl[i] << 16)) >> 3);
    if (RTjpeg_cqt[i] == 0)
      RTjpeg_cqt[i] = 1;
    RTjpeg_liqt[i] = (1 << 16) / (RTjpeg_lqt[i] << 3);
    RTjpeg_ciqt[i] = (1 << 16) / (RTjpeg_cqt[i] << 3);
    RTjpeg_lqt[i] = ((1 << 16) / RTjpeg_liqt[i]) >> 3;
    RTjpeg_cqt[i] = ((1 << 16) / RTjpeg_ciqt[i]) >> 3;
  }

  RTjpeg_lb8 = 0;
  while (RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]] <= 8);
  RTjpeg_lb8--;
  RTjpeg_cb8 = 0;
  while (RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]] <= 8);
  RTjpeg_cb8--;

  RTjpeg_dct_init ();
  RTjpeg_quant_init ();

  for (i = 0; i < 64; i++)
    buf[i] = RTjpeg_liqt[i];
  for (i = 0; i < 64; i++)
    buf[64 + i] = RTjpeg_ciqt[i];
}

void
RTjpeg_init_decompress (__u32 * buf, int width, int height)
{
  int i;

  RTjpeg_init_data ();

  RTjpeg_width = width;
  RTjpeg_height = height;
  RTjpeg_Ywidth = RTjpeg_width >> 3;
  RTjpeg_Ysize = width * height;
  RTjpeg_Cwidth = RTjpeg_width >> 4;
  RTjpeg_Csize = (width >> 1) * height;

  for (i = 0; i < 64; i++) {
    RTjpeg_liqt[i] = buf[i];
    RTjpeg_ciqt[i] = buf[i + 64];
  }

  RTjpeg_lb8 = 0;
  while (RTjpeg_liqt[RTjpeg_ZZ[++RTjpeg_lb8]] <= 8);
  RTjpeg_lb8--;
  RTjpeg_cb8 = 0;
  while (RTjpeg_ciqt[RTjpeg_ZZ[++RTjpeg_cb8]] <= 8);
  RTjpeg_cb8--;

  RTjpeg_idct_init ();

/* RTjpeg_color_init(); */
}

int
RTjpeg_compressYUV420 (__s8 * sp, unsigned char *bp)
{
  __s8 *sb;
  register __s8 *bp1 = bp + (RTjpeg_width << 3);
  register __s8 *bp2 = bp + RTjpeg_Ysize;
  register __s8 *bp3 = bp2 + (RTjpeg_Csize >> 1);
  register int i, j, k;

#ifdef HAVE_LIBMMX
  emms ();
#endif
  sb = sp;
/* Y */
  for (i = RTjpeg_height >> 1; i; i -= 8) {
    for (j = 0, k = 0; j < RTjpeg_width; j += 16, k += 8) {
      RTjpeg_dctY (bp + j, RTjpeg_block, RTjpeg_Ywidth);
      RTjpeg_quant (RTjpeg_block, RTjpeg_lqt);
      sp += RTjpeg_b2s (RTjpeg_block, sp, RTjpeg_lb8);

      RTjpeg_dctY (bp + j + 8, RTjpeg_block, RTjpeg_Ywidth);
      RTjpeg_quant (RTjpeg_block, RTjpeg_lqt);
      sp += RTjpeg_b2s (RTjpeg_block, sp, RTjpeg_lb8);

      RTjpeg_dctY (bp1 + j, RTjpeg_block, RTjpeg_Ywidth);
      RTjpeg_quant (RTjpeg_block, RTjpeg_lqt);
      sp += RTjpeg_b2s (RTjpeg_block, sp, RTjpeg_lb8);

      RTjpeg_dctY (bp1 + j + 8, RTjpeg_block, RTjpeg_Ywidth);
      RTjpeg_quant (RTjpeg_block, RTjpeg_lqt);
      sp += RTjpeg_b2s (RTjpeg_block, sp, RTjpeg_lb8);

      RTjpeg_dctY (bp2 + k, RTjpeg_block, RTjpeg_Cwidth);
      RTjpeg_quant (RTjpeg_block, RTjpeg_cqt);
      sp += RTjpeg_b2s (RTjpeg_block, sp, RTjpeg_cb8);

      RTjpeg_dctY (bp3 + k, RTjpeg_block, RTjpeg_Cwidth);
      RTjpeg_quant (RTjpeg_block, RTjpeg_cqt);
      sp += RTjpeg_b2s (RTjpeg_block, sp, RTjpeg_cb8);

    }
    bp += RTjpeg_width << 4;
    bp1 += RTjpeg_width << 4;
    bp2 += RTjpeg_width << 2;
    bp3 += RTjpeg_width << 2;

  }
#ifdef HAVE_LIBMMX
  emms ();
#endif
  return (sp - sb);
}

int
RTjpeg_compressYUV422 (__s8 * sp, unsigned char *bp)
{
  __s8 *sb;
  register __s8 *bp2 = bp + RTjpeg_Ysize;
  register __s8 *bp3 = bp2 + RTjpeg_Csize;
  register int i, j, k;

#ifdef HAVE_LIBMMX
  emms ();
#endif
  sb = sp;
/* Y */
  for (i = RTjpeg_height; i; i -= 8) {
    for (j = 0, k = 0; j < RTjpeg_width; j += 16, k += 8) {
      RTjpeg_dctY (bp + j, RTjpeg_block, RTjpeg_Ywidth);
      RTjpeg_quant (RTjpeg_block, RTjpeg_lqt);
      sp += RTjpeg_b2s (RTjpeg_block, sp, RTjpeg_lb8);

      RTjpeg_dctY (bp + j + 8, RTjpeg_block, RTjpeg_Ywidth);
      RTjpeg_quant (RTjpeg_block, RTjpeg_lqt);
      sp += RTjpeg_b2s (RTjpeg_block, sp, RTjpeg_lb8);

      RTjpeg_dctY (bp2 + k, RTjpeg_block, RTjpeg_Cwidth);
      RTjpeg_quant (RTjpeg_block, RTjpeg_cqt);
      sp += RTjpeg_b2s (RTjpeg_block, sp, RTjpeg_cb8);

      RTjpeg_dctY (bp3 + k, RTjpeg_block, RTjpeg_Cwidth);
      RTjpeg_quant (RTjpeg_block, RTjpeg_cqt);
      sp += RTjpeg_b2s (RTjpeg_block, sp, RTjpeg_cb8);

    }
    bp += RTjpeg_width << 3;
    bp2 += RTjpeg_width << 2;
    bp3 += RTjpeg_width << 2;

  }
#ifdef HAVE_LIBMMX
  emms ();
#endif
  return (sp - sb);
}

int
RTjpeg_compress8 (__s8 * sp, unsigned char *bp)
{
  __s8 *sb;
  int i, j;

#ifdef HAVE_LIBMMX
  emms ();
#endif

  sb = sp;
/* Y */
  for (i = 0; i < RTjpeg_height; i += 8) {
    for (j = 0; j < RTjpeg_width; j += 8) {
      RTjpeg_dctY (bp + j, RTjpeg_block, RTjpeg_width);
      RTjpeg_quant (RTjpeg_block, RTjpeg_lqt);
      sp += RTjpeg_b2s (RTjpeg_block, sp, RTjpeg_lb8);
    }
    bp += RTjpeg_width;
  }

#ifdef HAVE_LIBMMX
  emms ();
#endif
  return (sp - sb);
}

void
RTjpeg_decompressYUV422 (__s8 * sp, __u8 * bp)
{
  register __s8 *bp2 = bp + RTjpeg_Ysize;
  register __s8 *bp3 = bp2 + (RTjpeg_Csize);
  int i, j, k;

#ifdef HAVE_LIBMMX
  emms ();
#endif

/* Y */
  for (i = RTjpeg_height; i; i -= 8) {
    for (k = 0, j = 0; j < RTjpeg_width; j += 16, k += 8) {
      if (*sp == -1)
        sp++;
      else {
        sp += RTjpeg_s2b (RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
        RTjpeg_idct (bp + j, RTjpeg_block, RTjpeg_width);
      }
      if (*sp == -1)
        sp++;
      else {
        sp += RTjpeg_s2b (RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
        RTjpeg_idct (bp + j + 8, RTjpeg_block, RTjpeg_width);
      }
      if (*sp == -1)
        sp++;
      else {
        sp += RTjpeg_s2b (RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
        RTjpeg_idct (bp2 + k, RTjpeg_block, RTjpeg_width >> 1);
      }
      if (*sp == -1)
        sp++;
      else {
        sp += RTjpeg_s2b (RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
        RTjpeg_idct (bp3 + k, RTjpeg_block, RTjpeg_width >> 1);
      }
    }
    bp += RTjpeg_width << 3;
    bp2 += RTjpeg_width << 2;
    bp3 += RTjpeg_width << 2;
  }
#ifdef HAVE_LIBMMX
  emms ();
#endif
}

void
RTjpeg_decompressYUV420 (__s8 * sp, __u8 * bp)
{
  register __s8 *bp1 = bp + (RTjpeg_width << 3);
  register __s8 *bp2 = bp + RTjpeg_Ysize;
  register __s8 *bp3 = bp2 + (RTjpeg_Csize >> 1);
  int i, j, k;

#ifdef HAVE_LIBMMX
  emms ();
#endif

/* Y */
  for (i = RTjpeg_height >> 1; i; i -= 8) {
    for (k = 0, j = 0; j < RTjpeg_width; j += 16, k += 8) {
      if (*sp == -1)
        sp++;
      else {
        sp += RTjpeg_s2b (RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
        RTjpeg_idct (bp + j, RTjpeg_block, RTjpeg_width);
      }
      if (*sp == -1)
        sp++;
      else {
        sp += RTjpeg_s2b (RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
        RTjpeg_idct (bp + j + 8, RTjpeg_block, RTjpeg_width);
      }
      if (*sp == -1)
        sp++;
      else {
        sp += RTjpeg_s2b (RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
        RTjpeg_idct (bp1 + j, RTjpeg_block, RTjpeg_width);
      }
      if (*sp == -1)
        sp++;
      else {
        sp += RTjpeg_s2b (RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
        RTjpeg_idct (bp1 + j + 8, RTjpeg_block, RTjpeg_width);
      }
      if (*sp == -1)
        sp++;
      else {
        sp += RTjpeg_s2b (RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
        RTjpeg_idct (bp2 + k, RTjpeg_block, RTjpeg_width >> 1);
      }
      if (*sp == -1)
        sp++;
      else {
        sp += RTjpeg_s2b (RTjpeg_block, sp, RTjpeg_cb8, RTjpeg_ciqt);
        RTjpeg_idct (bp3 + k, RTjpeg_block, RTjpeg_width >> 1);
      }
    }
    bp += RTjpeg_width << 4;
    bp1 += RTjpeg_width << 4;
    bp2 += RTjpeg_width << 2;
    bp3 += RTjpeg_width << 2;
  }
#ifdef HAVE_LIBMMX
  emms ();
#endif
}

void
RTjpeg_decompress8 (__s8 * sp, __u8 * bp)
{
  int i, j;

#ifdef HAVE_LIBMMX
  emms ();
#endif

/* Y */
  for (i = 0; i < RTjpeg_height; i += 8) {
    for (j = 0; j < RTjpeg_width; j += 8)
      if (*sp == -1)
        sp++;
      else {
        sp += RTjpeg_s2b (RTjpeg_block, sp, RTjpeg_lb8, RTjpeg_liqt);
        RTjpeg_idct (bp + j, RTjpeg_block, RTjpeg_width);
      }
    bp += RTjpeg_width << 3;
  }
}

/*
External Function

Initialise additional data structures for motion compensation

*/

void
RTjpeg_init_mcompress (void)
{
  unsigned long tmp;

  if (!RTjpeg_old) {
    RTjpeg_old = malloc ((4 * RTjpeg_width * RTjpeg_height) + 32);
    tmp = (unsigned long) RTjpeg_old;
    tmp += 32;
    tmp = tmp >> 5;
    RTjpeg_old = (__s16 *) (tmp << 5);
  }
  if (!RTjpeg_old) {
    fprintf (stderr, "RTjpeg: Could not allocate memory\n");
    exit (-1);
  }
  memset (RTjpeg_old, 0, ((4 * RTjpeg_width * RTjpeg_height)));
}

#ifdef HAVE_LIBMMX

int
RTjpeg_bcomp (__s16 * old, mmx_t * mask)
{
  int i;
  mmx_t *mold = (mmx_t *) old;
  mmx_t *mblock = (mmx_t *) RTjpeg_block;
  mmx_t result;
  static mmx_t neg = (mmx_t) (unsigned long long) 0xffffffffffffffffULL;

  movq_m2r (*mask, mm7);
  movq_m2r (neg, mm6);
  pxor_r2r (mm5, mm5);

  for (i = 0; i < 8; i++) {
    movq_m2r (*(mblock++), mm0);
    movq_m2r (*(mblock++), mm2);
    movq_m2r (*(mold++), mm1);
    movq_m2r (*(mold++), mm3);
    psubsw_r2r (mm1, mm0);
    psubsw_r2r (mm3, mm2);
    movq_r2r (mm0, mm1);
    movq_r2r (mm2, mm3);
    pcmpgtw_r2r (mm7, mm0);
    pcmpgtw_r2r (mm7, mm2);
    pxor_r2r (mm6, mm1);
    pxor_r2r (mm6, mm3);
    pcmpgtw_r2r (mm7, mm1);
    pcmpgtw_r2r (mm7, mm3);
    por_r2r (mm0, mm5);
    por_r2r (mm2, mm5);
    por_r2r (mm1, mm5);
    por_r2r (mm3, mm5);
  }
  movq_r2m (mm5, result);

  if (result.q) {
    if (!RTjpeg_mtest)
      for (i = 0; i < 16; i++)
        ((__u64 *) old)[i] = ((__u64 *) RTjpeg_block)[i];
    return 0;
  }
/* printf("."); */
  return 1;
}

#else
int
RTjpeg_bcomp (__s16 * old, __u16 * mask)
{
  int i;

  for (i = 0; i < 64; i++)
    if (abs (old[i] - RTjpeg_block[i]) > *mask) {
      if (!RTjpeg_mtest)
        for (i = 0; i < 16; i++)
          ((__u64 *) old)[i] = ((__u64 *) RTjpeg_block)[i];
      return 0;
    }
  return 1;
}
#endif

void
RTjpeg_set_test (int i)
{
  RTjpeg_mtest = i;
}

int
RTjpeg_mcompress (__s8 * sp, unsigned char *bp, __u16 lmask, __u16 cmask)
{
  __s8 *sb;
  __s16 *block;
  register __s8 *bp2;
  register __s8 *bp3;
  register int i, j, k;

#ifdef HAVE_LIBMMX
  emms ();
  RTjpeg_lmask =
      (mmx_t) (((__u64) lmask << 48) | ((__u64) lmask << 32) | ((__u64) lmask <<
          16) | lmask);
  RTjpeg_cmask =
      (mmx_t) (((__u64) cmask << 48) | ((__u64) cmask << 32) | ((__u64) cmask <<
          16) | cmask);
#else
  RTjpeg_lmask = lmask;
  RTjpeg_cmask = cmask;
#endif

  bp = bp - RTjpeg_width * 0;
  bp2 = bp + RTjpeg_Ysize - RTjpeg_width * 0;
  bp3 = bp2 + RTjpeg_Csize;

  sb = sp;
  block = RTjpeg_old;
/* Y */
  for (i = RTjpeg_height; i; i -= 8) {
    for (j = 0, k = 0; j < RTjpeg_width; j += 16, k += 8) {
      RTjpeg_dctY (bp + j, RTjpeg_block, RTjpeg_Ywidth);
      RTjpeg_quant (RTjpeg_block, RTjpeg_lqt);
      if (RTjpeg_bcomp (block, &RTjpeg_lmask)) {
        *((__u8 *) sp++) = 255;
      } else
        sp += RTjpeg_b2s (RTjpeg_block, sp, RTjpeg_lb8);
      block += 64;

      RTjpeg_dctY (bp + j + 8, RTjpeg_block, RTjpeg_Ywidth);
      RTjpeg_quant (RTjpeg_block, RTjpeg_lqt);
      if (RTjpeg_bcomp (block, &RTjpeg_lmask)) {
        *((__u8 *) sp++) = 255;
      } else
        sp += RTjpeg_b2s (RTjpeg_block, sp, RTjpeg_lb8);
      block += 64;

      RTjpeg_dctY (bp2 + k, RTjpeg_block, RTjpeg_Cwidth);
      RTjpeg_quant (RTjpeg_block, RTjpeg_cqt);
      if (RTjpeg_bcomp (block, &RTjpeg_cmask)) {
        *((__u8 *) sp++) = 255;
      } else
        sp += RTjpeg_b2s (RTjpeg_block, sp, RTjpeg_cb8);
      block += 64;

      RTjpeg_dctY (bp3 + k, RTjpeg_block, RTjpeg_Cwidth);
      RTjpeg_quant (RTjpeg_block, RTjpeg_cqt);
      if (RTjpeg_bcomp (block, &RTjpeg_cmask)) {
        *((__u8 *) sp++) = 255;
      } else
        sp += RTjpeg_b2s (RTjpeg_block, sp, RTjpeg_cb8);
      block += 64;

    }
    bp += RTjpeg_width << 3;
    bp2 += RTjpeg_width << 2;
    bp3 += RTjpeg_width << 2;
  }
  /*printf ("%d\n", block - RTjpeg_old); */
#ifdef HAVE_LIBMMX
  emms ();
#endif
  return (sp - sb);
}

int
RTjpeg_mcompress8 (__s8 * sp, unsigned char *bp, __u16 lmask)
{
  __s8 *sb;
  __s16 *block;
  int i, j;

#ifdef HAVE_LIBMMX
  emms ();
  RTjpeg_lmask =
      (mmx_t) (((__u64) lmask << 48) | ((__u64) lmask << 32) | ((__u64) lmask <<
          16) | lmask);
#else
  RTjpeg_lmask = lmask;
#endif


  sb = sp;
  block = RTjpeg_old;
/* Y */
  for (i = 0; i < RTjpeg_height; i += 8) {
    for (j = 0; j < RTjpeg_width; j += 8) {
      RTjpeg_dctY (bp + j, RTjpeg_block, RTjpeg_width);
      RTjpeg_quant (RTjpeg_block, RTjpeg_lqt);
      if (RTjpeg_bcomp (block, &RTjpeg_lmask)) {
        *((__u8 *) sp++) = 255;
/*    printf("* %d ", sp[-1]); */
      } else
        sp += RTjpeg_b2s (RTjpeg_block, sp, RTjpeg_lb8);
      block += 64;
    }
    bp += RTjpeg_width << 3;
  }
#ifdef HAVE_LIBMMX
  emms ();
#endif
  return (sp - sb);
}

void
RTjpeg_color_init (void)
{
}

#define KcrR 76284
#define KcrG 53281
#define KcbG 25625
#define KcbB 132252
#define Ky 76284

void
RTjpeg_yuv422rgb (__u8 * buf, __u8 * rgb)
{
  int tmp;
  int i, j;
  __s32 y, crR, crG, cbG, cbB;
  __u8 *bufcr, *bufcb, *bufy, *bufoute;
  int yskip;

  yskip = RTjpeg_width;

  bufcb = &buf[RTjpeg_width * RTjpeg_height];
  bufcr =
      &buf[RTjpeg_width * RTjpeg_height + (RTjpeg_width * RTjpeg_height) / 2];
  bufy = &buf[0];
  bufoute = rgb;

  for (i = 0; i < (RTjpeg_height); i++) {
    for (j = 0; j < RTjpeg_width; j += 2) {
      crR = (*bufcr - 128) * KcrR;
      crG = (*(bufcr++) - 128) * KcrG;
      cbG = (*bufcb - 128) * KcbG;
      cbB = (*(bufcb++) - 128) * KcbB;

      y = (bufy[j] - 16) * Ky;

      tmp = (y + crR) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y - crG - cbG) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y + cbB) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);

      y = (bufy[j + 1] - 16) * Ky;

      tmp = (y + crR) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y - crG - cbG) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y + cbB) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);

    }
    bufy += yskip;
  }
}


void
RTjpeg_yuv420rgb (__u8 * buf, __u8 * rgb)
{
  int tmp;
  int i, j;
  __s32 y, crR, crG, cbG, cbB;
  __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
  int oskip, yskip;

  oskip = RTjpeg_width * 3;
  yskip = RTjpeg_width;

  bufcb = &buf[RTjpeg_width * RTjpeg_height];
  bufcr =
      &buf[RTjpeg_width * RTjpeg_height + (RTjpeg_width * RTjpeg_height) / 4];
  bufy = &buf[0];
  bufoute = rgb;
  bufouto = rgb + oskip;

  for (i = 0; i < (RTjpeg_height >> 1); i++) {
    for (j = 0; j < RTjpeg_width; j += 2) {
      crR = (*bufcr - 128) * KcrR;
      crG = (*(bufcr++) - 128) * KcrG;
      cbG = (*bufcb - 128) * KcbG;
      cbB = (*(bufcb++) - 128) * KcbB;

      y = (bufy[j] - 16) * Ky;

      tmp = (y + crR) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y - crG - cbG) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y + cbB) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);

      y = (bufy[j + 1] - 16) * Ky;

      tmp = (y + crR) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y - crG - cbG) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y + cbB) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);

      y = (bufy[j + yskip] - 16) * Ky;

      tmp = (y + crR) >> 16;
      *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y - crG - cbG) >> 16;
      *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y + cbB) >> 16;
      *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);

      y = (bufy[j + 1 + yskip] - 16) * Ky;

      tmp = (y + crR) >> 16;
      *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y - crG - cbG) >> 16;
      *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y + cbB) >> 16;
      *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);

    }
    bufoute += oskip;
    bufouto += oskip;
    bufy += yskip << 1;
  }
}


void
RTjpeg_yuvrgb32 (__u8 * buf, __u8 * rgb)
{
  int tmp;
  int i, j;
  __s32 y, crR, crG, cbG, cbB;
  __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
  int oskip, yskip;

  oskip = RTjpeg_width * 4;
  yskip = RTjpeg_width;

  bufcb = &buf[RTjpeg_width * RTjpeg_height];
  bufcr =
      &buf[RTjpeg_width * RTjpeg_height + (RTjpeg_width * RTjpeg_height) / 2];
  bufy = &buf[0];
  bufoute = rgb;
  bufouto = rgb + oskip;

  for (i = 0; i < (RTjpeg_height >> 1); i++) {
    for (j = 0; j < RTjpeg_width; j += 2) {
      crR = (*bufcr - 128) * KcrR;
      crG = (*(bufcr++) - 128) * KcrG;
      cbG = (*bufcb - 128) * KcbG;
      cbB = (*(bufcb++) - 128) * KcbB;

      y = (bufy[j] - 16) * Ky;

      tmp = (y + cbB) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y - crG - cbG) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y + crR) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      bufoute++;

      y = (bufy[j + 1] - 16) * Ky;

      tmp = (y + cbB) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y - crG - cbG) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y + crR) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      bufoute++;

      y = (bufy[j + yskip] - 16) * Ky;

      tmp = (y + cbB) >> 16;
      *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y - crG - cbG) >> 16;
      *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y + crR) >> 16;
      *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      bufouto++;

      y = (bufy[j + 1 + yskip] - 16) * Ky;

      tmp = (y + cbB) >> 16;
      *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y - crG - cbG) >> 16;
      *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y + crR) >> 16;
      *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      bufouto++;

    }
    bufoute += oskip;
    bufouto += oskip;
    bufy += yskip << 1;
  }
}

void
RTjpeg_yuvrgb24 (__u8 * buf, __u8 * rgb)
{
  int tmp;
  int i, j;
  __s32 y, crR, crG, cbG, cbB;
  __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
  int oskip, yskip;

  oskip = RTjpeg_width * 3;
  yskip = RTjpeg_width;

  bufcb = &buf[RTjpeg_width * RTjpeg_height];
  bufcr =
      &buf[RTjpeg_width * RTjpeg_height + (RTjpeg_width * RTjpeg_height) / 4];
  bufy = &buf[0];
  bufoute = rgb;
  bufouto = rgb + oskip;

  for (i = 0; i < (RTjpeg_height >> 1); i++) {
    for (j = 0; j < RTjpeg_width; j += 2) {
      crR = (*bufcr - 128) * KcrR;
      crG = (*(bufcr++) - 128) * KcrG;
      cbG = (*bufcb - 128) * KcbG;
      cbB = (*(bufcb++) - 128) * KcbB;

      y = (bufy[j] - 16) * Ky;

      tmp = (y + cbB) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y - crG - cbG) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y + crR) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);

      y = (bufy[j + 1] - 16) * Ky;

      tmp = (y + cbB) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y - crG - cbG) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y + crR) >> 16;
      *(bufoute++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);

      y = (bufy[j + yskip] - 16) * Ky;

      tmp = (y + cbB) >> 16;
      *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y - crG - cbG) >> 16;
      *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y + crR) >> 16;
      *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);

      y = (bufy[j + 1 + yskip] - 16) * Ky;

      tmp = (y + cbB) >> 16;
      *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y - crG - cbG) >> 16;
      *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y + crR) >> 16;
      *(bufouto++) = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);

    }
    bufoute += oskip;
    bufouto += oskip;
    bufy += yskip << 1;
  }
}

void
RTjpeg_yuvrgb16 (__u8 * buf, __u8 * rgb)
{
  int tmp;
  int i, j;
  __s32 y, crR, crG, cbG, cbB;
  __u8 *bufcr, *bufcb, *bufy, *bufoute, *bufouto;
  int oskip, yskip;
  unsigned char r, g, b;

  oskip = RTjpeg_width * 2;
  yskip = RTjpeg_width;

  bufcb = &buf[RTjpeg_width * RTjpeg_height];
  bufcr =
      &buf[RTjpeg_width * RTjpeg_height + (RTjpeg_width * RTjpeg_height) / 4];
  bufy = &buf[0];
  bufoute = rgb;
  bufouto = rgb + oskip;

  for (i = 0; i < (RTjpeg_height >> 1); i++) {
    for (j = 0; j < RTjpeg_width; j += 2) {
      crR = (*bufcr - 128) * KcrR;
      crG = (*(bufcr++) - 128) * KcrG;
      cbG = (*bufcb - 128) * KcbG;
      cbB = (*(bufcb++) - 128) * KcbB;

      y = (bufy[j] - 16) * Ky;

      tmp = (y + cbB) >> 16;
      b = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y - crG - cbG) >> 16;
      g = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y + crR) >> 16;
      r = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (int) ((int) b >> 3);
      tmp |= (int) (((int) g >> 2) << 5);
      tmp |= (int) (((int) r >> 3) << 11);
      *(bufoute++) = tmp & 0xff;
      *(bufoute++) = tmp >> 8;


      y = (bufy[j + 1] - 16) * Ky;

      tmp = (y + cbB) >> 16;
      b = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y - crG - cbG) >> 16;
      g = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y + crR) >> 16;
      r = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (int) ((int) b >> 3);
      tmp |= (int) (((int) g >> 2) << 5);
      tmp |= (int) (((int) r >> 3) << 11);
      *(bufoute++) = tmp & 0xff;
      *(bufoute++) = tmp >> 8;

      y = (bufy[j + yskip] - 16) * Ky;

      tmp = (y + cbB) >> 16;
      b = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y - crG - cbG) >> 16;
      g = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y + crR) >> 16;
      r = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (int) ((int) b >> 3);
      tmp |= (int) (((int) g >> 2) << 5);
      tmp |= (int) (((int) r >> 3) << 11);
      *(bufouto++) = tmp & 0xff;
      *(bufouto++) = tmp >> 8;

      y = (bufy[j + 1 + yskip] - 16) * Ky;

      tmp = (y + cbB) >> 16;
      b = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y - crG - cbG) >> 16;
      g = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (y + crR) >> 16;
      r = (tmp > 255) ? 255 : ((tmp < 0) ? 0 : tmp);
      tmp = (int) ((int) b >> 3);
      tmp |= (int) (((int) g >> 2) << 5);
      tmp |= (int) (((int) r >> 3) << 11);
      *(bufouto++) = tmp & 0xff;
      *(bufouto++) = tmp >> 8;

    }
    bufoute += oskip;
    bufouto += oskip;
    bufy += yskip << 1;
  }
}

void
RTjpeg_yuvrgb8 (__u8 * buf, __u8 * rgb)
{
  memcpy (rgb, buf, RTjpeg_width * RTjpeg_height);
}

void
RTjpeg_double32 (__u32 * buf)
{
  int i, j;

  __u32 *iptr, *optr1, *optr2;

  iptr = buf + (RTjpeg_width * RTjpeg_height) - 1;
  optr1 = buf + (RTjpeg_width * RTjpeg_height * 4) - 1;
  optr2 = optr1 - (2 * RTjpeg_width);

  for (i = 0; i < RTjpeg_height; i++) {
    for (j = 0; j < RTjpeg_width; j++) {
      *(optr1--) = *iptr;
      *(optr1--) = *iptr;
      *(optr2--) = *iptr;
      *(optr2--) = *(iptr--);
    }
    optr2 = optr2 - 2 * RTjpeg_width;
    optr1 = optr1 - 2 * RTjpeg_width;
  }
}

void
RTjpeg_double24 (__u8 * buf)
{
}

void
RTjpeg_double16 (__u16 * buf)
{
  int i, j;

  __u16 *iptr, *optr1, *optr2;

  iptr = buf + (RTjpeg_width * RTjpeg_height) - 1;
  optr1 = buf + (RTjpeg_width * RTjpeg_height * 4) - 1;
  optr2 = optr1 - (2 * RTjpeg_width);

  for (i = 0; i < RTjpeg_height; i++) {
    for (j = 0; j < RTjpeg_width; j++) {
      *(optr1--) = *iptr;
      *(optr1--) = *iptr;
      *(optr2--) = *iptr;
      *(optr2--) = *(iptr--);
    }
    optr2 = optr2 - 2 * RTjpeg_width;
    optr1 = optr1 - 2 * RTjpeg_width;
  }
}

void
RTjpeg_double8 (__u8 * buf)
{
  int i, j;

  __u8 *iptr, *optr1, *optr2;

  iptr = buf + (RTjpeg_width * RTjpeg_height) - 1;
  optr1 = buf + (RTjpeg_width * RTjpeg_height * 4) - 1;
  optr2 = optr1 - (2 * RTjpeg_width);

  for (i = 0; i < RTjpeg_height; i++) {
    for (j = 0; j < RTjpeg_width; j++) {
      *(optr1--) = *iptr;
      *(optr1--) = *iptr;
      *(optr2--) = *iptr;
      *(optr2--) = *(iptr--);
    }
    optr2 = optr2 - 2 * RTjpeg_width;
    optr1 = optr1 - 2 * RTjpeg_width;
  }
}