[MOVED FROM BAD 16/56] gst/deinterlace2/tvtime/: Add a C implementation for the greedyh deinterlacing method, clean up the code a bit and mark the SSE version as MMXEXT as it doesn't require any SSE instructions.

Original commit message from CVS:
* gst/deinterlace2/tvtime/greedyh.asm:
* gst/deinterlace2/tvtime/greedyh.c: (greedyDScaler_C),
(deinterlace_frame_di_greedyh), (dscaler_greedyh_get_method):
* gst/deinterlace2/tvtime/greedyhmacros.h:
Add a C implementation for the greedyh deinterlacing method, clean
up the code a bit and mark the SSE version as MMXEXT as it doesn't
require any SSE instructions.
This commit is contained in:
Sebastian Dröge 2008-06-28 17:25:56 +00:00 committed by Sebastian Dröge
parent 07f408a3ea
commit 57dd0e85d1
3 changed files with 421 additions and 320 deletions

View file

@ -28,88 +28,47 @@
#include "x86-64_macros.inc"
void FUNCT_NAME( GstDeinterlace2 *object)
void
FUNCT_NAME (uint8_t * L1, uint8_t * L2, uint8_t * L3, uint8_t * L2P,
uint8_t * Dest, int size)
{
int64_t i;
int InfoIsOdd = 0;
// in tight loop some vars are accessed faster in local storage
int64_t YMask = 0x00ff00ff00ff00ffull; // to keep only luma
int64_t UVMask = 0xff00ff00ff00ff00ull; // to keep only chroma
int64_t ShiftMask = 0xfefffefffefffeffull; // to avoid shifting chroma to luma
int64_t ShiftMask = 0xfefefefefefefefeull; // to avoid shifting chroma to luma
int64_t QW256 = 0x0100010001000100ull; // 4 256's
// Set up our two parms that are actually evaluated for each pixel
i=GreedyMaxComb;
int64_t MaxComb = i << 56 | i << 48 | i << 40 | i << 32 | i << 24 | i << 16 | i << 8 | i;
i = GreedyMotionThreshold; // scale to range of 0-257
int64_t MotionThreshold = i << 48 | i << 32 | i << 16 | i | UVMask;
i = GreedyMotionSense; // scale to range of 0-257
int64_t MotionSense = i << 48 | i << 32 | i << 16 | i;
int Line;
int64_t MaxComb;
int64_t MotionThreshold;
int64_t MotionSense;
int64_t i;
long LoopCtr;
unsigned int Pitch = object->field_stride;
unsigned char* L1; // ptr to Line1, of 3
unsigned char* L2; // ptr to Line2, the weave line
unsigned char* L3; // ptr to Line3
unsigned char* L2P; // ptr to prev Line2
unsigned char* Dest = GST_BUFFER_DATA(object->out_buf);
long oldbx;
int64_t QW256B;
int64_t LastAvg = 0; //interp value from left qword
// Set up our two parms that are actually evaluated for each pixel
i = GreedyMaxComb;
MaxComb =
i << 56 | i << 48 | i << 40 | i << 32 | i << 24 | i << 16 | i << 8 | i;
i = GreedyMotionThreshold; // scale to range of 0-257
MotionThreshold = i << 48 | i << 32 | i << 16 | i | UVMask;
i = GreedyMotionSense; // scale to range of 0-257
MotionSense = i << 48 | i << 32 | i << 16 | i;
i = 0xffffffff - 256;
QW256B = i << 48 | i << 32 | i << 16 | i; // save a couple instr on PMINSW instruct.
// copy first even line no matter what, and the first odd line if we're
// processing an EVEN field. (note diff from other deint rtns.)
if (object->field_history[object->history_count-1].flags == PICTURE_INTERLACED_BOTTOM) {
InfoIsOdd = 1;
L1 = GST_BUFFER_DATA(object->field_history[object->history_count-2].buf);
L2 = GST_BUFFER_DATA(object->field_history[object->history_count-1].buf);
L3 = L1 + Pitch;
L2P = GST_BUFFER_DATA(object->field_history[object->history_count-3].buf);
// copy first even line
object->pMemcpy(Dest, L1, object->line_length);
Dest += object->output_stride;
}
else {
InfoIsOdd = 0;
L1 = GST_BUFFER_DATA(object->field_history[object->history_count-2].buf);
L2 = GST_BUFFER_DATA(object->field_history[object->history_count-1].buf) + Pitch;
L3 = L1 + Pitch;
L2P = GST_BUFFER_DATA(object->field_history[object->history_count-3].buf) + Pitch;
// copy first even line
object->pMemcpy(Dest, GST_BUFFER_DATA(object->field_history[0].buf), object->line_length);
Dest += object->output_stride;
// then first odd line
object->pMemcpy(Dest, L1, object->line_length);
Dest += object->output_stride;
}
long oldbx;
for (Line = 0; Line < (object->field_height - 1); ++Line) {
LoopCtr = object->line_length / 8 - 1; // there are LineLength / 8 qwords per line but do 1 less, adj at end of loop
LoopCtr = size / 8 - 1; // there are LineLength / 8 qwords per line but do 1 less, adj at end of loop
// For ease of reading, the comments below assume that we're operating on an odd
// field (i.e., that InfoIsOdd is true). Assume the obvious for even lines..
__asm__ __volatile__
(
__asm__ __volatile__ (
// save ebx (-fPIC)
MOVX " %%" XBX ", %[oldbx]\n\t"
MOVX " %[L1], %%" XAX "\n\t"
LEAX " 8(%%" XAX "), %%" XBX "\n\t" // next qword needed by DJR
MOVX " %[L3], %%" XCX "\n\t"
@ -120,12 +79,12 @@ void FUNCT_NAME( GstDeinterlace2 *object)
".align 8\n\t"
"1:\n\t"
"movq (%%" XSI "), %%mm0\n\t" // L2 - the newest weave pixel value
"movq (%%" XAX "), %%mm1\n\t" // L1 - the top pixel
"movq (%%" XDX "), %%mm2\n\t" // L2P - the prev weave pixel
"movq (%%" XAX ", %%" XCX "), %%mm3\n\t" // L3, next odd row
"movq %%mm1, %%mm6\n\t" // L1 - get simple single pixel interp
// pavgb mm6, mm3 // use macro below
V_PAVGB ("%%mm6", "%%mm3", "%%mm4", "%[ShiftMask]")
@ -140,9 +99,9 @@ void FUNCT_NAME( GstDeinterlace2 *object)
"movq %%mm6, %%mm7\n\t" // copy of simple bob pixel
"psllq $16, %%mm7\n\t" // left justify 3 pixels
"por %%mm7, %%mm4\n\t" // and combine
"movq (%%" XBX "), %%mm5\n\t" // next horiz qword from L1
// pavgb mm5, qword ptr[ebx+ecx] // next horiz qword from L3, use macro below
V_PAVGB ("%%mm5", "(%%" XBX ",%%" XCX ")", "%%mm7", "%[ShiftMask]")
"psllq $48, %%mm5\n\t" // left just 1 pixel
"movq %%mm6, %%mm7\n\t" // another copy of simple bob pixel
@ -193,8 +152,8 @@ void FUNCT_NAME( GstDeinterlace2 *object)
// mm3 = L3
// mm4 = the best of L2,L2P weave pixel, base upon comb
// mm6 = the avg interpolated value, if we need to use it
// Let's measure movement, as how much the weave pixel has changed
"movq %%mm2, %%mm7\n\t"
"psubusb %%mm0, %%mm2\n\t"
"psubusb %%mm7, %%mm0\n\t"
@ -204,12 +163,14 @@ void FUNCT_NAME( GstDeinterlace2 *object)
// of the high/low range L1-L3 by more than MaxComb.
// This allows some comb but limits the damages and also allows more
// detail than a boring oversmoothed clip.
"movq %%mm1, %%mm2\n\t" // copy L1
// pmaxub mm2, mm3 // use macro
V_PMAXUB ("%%mm2", "%%mm3") // now = Max(L1,L3)
"movq %%mm1, %%mm5\n\t" // copy L1
// pminub mm5, mm3 // now = Min(L1,L3), use macro
V_PMINUB ("%%mm5", "%%mm3", "%%mm7")
// allow the value to be above the high or below the low by amt of MaxComb
"psubusb %[MaxComb], %%mm5\n\t" // lower min by diff
"paddusb %[MaxComb], %%mm2\n\t" // increase max by diff
@ -222,7 +183,7 @@ void FUNCT_NAME( GstDeinterlace2 *object)
"psubusb %[MotionThreshold], %%mm0\n\t" // test Threshold, clear chroma change >>>??
"pmullw %[MotionSense], %%mm0\n\t" // mul by user factor, keep low 16 bits
"movq %[QW256], %%mm7\n\t"
#ifdef HAVE_SSE
#if SIMD_TYPE == MMXEXT
"pminsw %%mm7, %%mm0\n\t" // max = 256
#else
"paddusw %[QW256B], %%mm0\n\t" // add, may sat at fff..
@ -236,13 +197,10 @@ void FUNCT_NAME( GstDeinterlace2 *object)
"pmullw %%mm0, %%mm6\n\t" // use more bob for large motion
"paddusw %%mm6, %%mm4\n\t" // combine
"psrlw $8, %%mm4\n\t" // div by 256 to get weighted avg
// chroma comes from weave pixel
"pand %[UVMask], %%mm2\n\t" // keep chroma
"por %%mm4, %%mm2\n\t" // and combine
V_MOVNTQ ("(%%" XDI ")", "%%mm2") // move in our clipped best, use macro
// bump ptrs and loop
LEAX " 8(%%" XAX "), %%" XAX "\n\t"
LEAX " 8(%%" XBX "), %%" XBX "\n\t"
@ -250,6 +208,7 @@ void FUNCT_NAME( GstDeinterlace2 *object)
LEAX " 8(%%" XDI "), %%" XDI "\n\t"
LEAX " 8(%%" XSI "), %%" XSI "\n\t"
DECX " %[LoopCtr]\n\t"
"jg 1b\n\t" // loop if not to last line
// note P-III default assumes backward branches taken
"jl 1f\n\t" // done
@ -258,8 +217,7 @@ void FUNCT_NAME( GstDeinterlace2 *object)
"1:\n\t"
MOVX " %[oldbx], %%" XBX "\n\t"
: /* no outputs */
"emms\n\t": /* no outputs */
:[LastAvg] "m" (LastAvg),
[L1] "m" (L1),
@ -277,32 +235,9 @@ void FUNCT_NAME( GstDeinterlace2 *object)
[LoopCtr] "m" (LoopCtr),
[QW256] "m" (QW256),
[oldbx] "m" (oldbx)
: XAX, XCX, XDX, XSI, XDI,
#ifdef HAVE_CPU_I386
"st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)",
#endif
/* FIXME: breaks unless compiling with -mmmx
"mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", */
"memory", "cc"
);
Dest += object->output_stride;
object->pMemcpy(Dest, L3, object->line_length);
Dest += object->output_stride;
L1 += Pitch;
L2 += Pitch;
L3 += Pitch;
L2P += Pitch;
}
if (InfoIsOdd) {
object->pMemcpy(Dest, L2, object->line_length);
}
// clear out the MMX registers ready for doing floating point again
#ifdef HAVE_CPU_I386
__asm__ __volatile__ ("emms\n\t");
#endif
"memory", "cc");
}

View file

@ -41,51 +41,244 @@
#include "gstdeinterlace2.h"
#include "speedy.h"
static const unsigned int GreedyMaxComb = 5;
static const unsigned int GreedyMotionThreshold = 25;
static const unsigned int GreedyMotionSense = 30;
#define MAXCOMB_DEFAULT 5
#define MOTIONTHRESHOLD_DEFAULT 25
#define MOTIONSENSE_DEFAULT 30
void
greedyDScaler_C (uint8_t * L1, uint8_t * L2, uint8_t * L3, uint8_t * L2P,
uint8_t * Dest, int size)
{
int Pos;
uint8_t l1_l, l1_1_l, l3_l, l3_1_l;
uint8_t l1_c, l1_1_c, l3_c, l3_1_c;
uint8_t avg_l, avg_c, avg_l_1, avg_c_1;
uint8_t avg_l__1 = 0, avg_c__1 = 0;
uint8_t avg_s_l, avg_s_c;
uint8_t avg_sc_l, avg_sc_c;
uint8_t best_l, best_c;
uint16_t mov_l;
uint8_t out_l, out_c;
uint8_t l2_l, l2_c, lp2_l, lp2_c;
uint8_t l2_l_diff, l2_c_diff, lp2_l_diff, lp2_c_diff;
uint8_t min_l, min_c, max_l, max_c;
unsigned int GreedyMaxComb;
for (Pos = 0; Pos < size; Pos += 2) {
l1_l = L1[0];
l1_c = L1[1];
l3_l = L3[0];
l3_c = L3[1];
unsigned int GreedyMotionThreshold;
if (Pos == size - 1) {
l1_1_l = l1_l;
l1_1_c = l1_c;
l3_1_l = l3_l;
l3_1_c = l3_c;
} else {
l1_1_l = L1[2];
l1_1_c = L1[3];
l3_1_l = L3[2];
l3_1_c = L3[3];
}
unsigned int GreedyMotionSense;
/* Average of L1 and L3 */
avg_l = (l1_l + l3_l) / 2;
avg_c = (l1_c + l3_c) / 2;
/* Average of next L1 and next L3 */
avg_l_1 = (l1_1_l + l3_1_l) / 2;
avg_c_1 = (l1_1_c + l3_1_c) / 2;
#define IS_SSE
#define SSE_TYPE SSE
#define FUNCT_NAME greedyDScaler_SSE
/* Calculate average of one pixel forward and previous */
avg_s_l = (avg_l__1 + avg_l_1) / 2;
avg_s_c = (avg_c__1 + avg_c_1) / 2;
/* Calculate average of center and surrounding pixels */
avg_sc_l = (avg_l + avg_s_l) / 2;
avg_sc_c = (avg_c + avg_s_c) / 2;
/* move forward */
avg_l__1 = avg_l;
avg_c__1 = avg_c;
/* Get best L2/L2P, i.e. least diff from above average */
l2_l = L2[0];
l2_c = L2[1];
lp2_l = L2P[0];
lp2_c = L2P[1];
l2_l_diff = ABS (l2_l - avg_sc_l);
l2_c_diff = ABS (l2_c - avg_sc_c);
lp2_l_diff = ABS (lp2_l - avg_sc_l);
lp2_c_diff = ABS (lp2_c - avg_sc_c);
if (l2_l_diff > lp2_l_diff)
best_l = lp2_l;
else
best_l = l2_l;
if (l2_c_diff > lp2_c_diff)
best_c = lp2_c;
else
best_c = l2_c;
/* Clip this best L2/L2P by L1/L3 and allow to differ by GreedyMaxComb */
max_l = MAX (l1_l, l3_l);
min_l = MIN (l1_l, l3_l);
if (max_l < 256 - GreedyMaxComb)
max_l += GreedyMaxComb;
else
max_l = 255;
if (min_l > GreedyMaxComb)
min_l -= GreedyMaxComb;
else
min_l = 0;
max_c = MAX (l1_c, l3_c);
min_c = MIN (l1_c, l3_c);
if (max_c < 256 - GreedyMaxComb)
max_c += GreedyMaxComb;
else
max_c = 255;
if (min_c > GreedyMaxComb)
min_c -= GreedyMaxComb;
else
min_c = 0;
out_l = CLAMP (best_l, min_l, max_l);
out_c = CLAMP (best_c, min_c, max_c);
/* Do motion compensation for luma, i.e. how much
* the weave pixel differs */
mov_l = ABS (l2_l - lp2_l);
if (mov_l > GreedyMotionThreshold)
mov_l -= GreedyMotionThreshold;
else
mov_l = 0;
mov_l = mov_l * GreedyMotionSense;
if (mov_l > 256)
mov_l = 256;
/* Weighted sum on clipped weave pixel and average */
out_l = (out_l * (256 - mov_l) + avg_sc_l * mov_l) / 256;
Dest[0] = out_l;
Dest[1] = out_c;
Dest += 2;
L1 += 2;
L2 += 2;
L3 += 2;
L2P += 2;
}
}
#define IS_MMXEXT
#define SIMD_TYPE MMXEXT
#define FUNCT_NAME greedyDScaler_MMXEXT
#include "greedyh.asm"
#undef SSE_TYPE
#undef IS_SSE
#undef SIMD_TYPE
#undef IS_MMXEXT
#undef FUNCT_NAME
#define IS_3DNOW
#define IS_TDNOW
#define SIMD_TYPE TDNOW
#define FUNCT_NAME greedyDScaler_3DNOW
#define SSE_TYPE 3DNOW
#include "greedyh.asm"
#undef SSE_TYPE
#undef IS_3DNOW
#undef SIMD_TYPE
#undef IS_TDNOW
#undef FUNCT_NAME
#define IS_MMX
#define SSE_TYPE MMX
#define SIMD_TYPE MMX
#define FUNCT_NAME greedyDScaler_MMX
#include "greedyh.asm"
#undef SSE_TYPE
#undef SIMD_TYPE
#undef IS_MMX
#undef FUNCT_NAME
void
static void
deinterlace_frame_di_greedyh (GstDeinterlace2 * object)
{
if (object->cpu_feature_flags & OIL_IMPL_FLAG_SSE) {
greedyh_filter_sse (object);
void (*func) (uint8_t * L1, uint8_t * L2, uint8_t * L3, uint8_t * L2P,
uint8_t * Dest, int size);
int InfoIsOdd = 0;
int Line;
unsigned int Pitch = object->field_stride;
unsigned char *L1; // ptr to Line1, of 3
unsigned char *L2; // ptr to Line2, the weave line
unsigned char *L3; // ptr to Line3
unsigned char *L2P; // ptr to prev Line2
unsigned char *Dest = GST_BUFFER_DATA (object->out_buf);
if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) {
func = greedyDScaler_MMXEXT;
} else if (object->cpu_feature_flags & OIL_IMPL_FLAG_3DNOW) {
greedyh_filter_3dnow (object);
func = greedyDScaler_3DNOW;
} else if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMX) {
func = greedyDScaler_MMX;
} else {
greedyh_filter_mmx (object);
func = greedyDScaler_C;
}
// copy first even line no matter what, and the first odd line if we're
// processing an EVEN field. (note diff from other deint rtns.)
if (object->field_history[object->history_count - 1].flags ==
PICTURE_INTERLACED_BOTTOM) {
InfoIsOdd = 1;
L1 = GST_BUFFER_DATA (object->field_history[object->history_count - 2].buf);
L2 = GST_BUFFER_DATA (object->field_history[object->history_count - 1].buf);
L3 = L1 + Pitch;
L2P =
GST_BUFFER_DATA (object->field_history[object->history_count - 3].buf);
// copy first even line
object->pMemcpy (Dest, L1, object->line_length);
Dest += object->output_stride;
} else {
InfoIsOdd = 0;
L1 = GST_BUFFER_DATA (object->field_history[object->history_count - 2].buf);
L2 = GST_BUFFER_DATA (object->field_history[object->history_count -
1].buf) + Pitch;
L3 = L1 + Pitch;
L2P =
GST_BUFFER_DATA (object->field_history[object->history_count - 3].buf) +
Pitch;
// copy first even line
object->pMemcpy (Dest, GST_BUFFER_DATA (object->field_history[0].buf),
object->line_length);
Dest += object->output_stride;
// then first odd line
object->pMemcpy (Dest, L1, object->line_length);
Dest += object->output_stride;
}
for (Line = 0; Line < (object->field_height - 1); ++Line) {
func (L1, L2, L3, L2P, Dest, object->line_length);
Dest += object->output_stride;
object->pMemcpy (Dest, L3, object->line_length);
Dest += object->output_stride;
L1 += Pitch;
L2 += Pitch;
L3 += Pitch;
L2P += Pitch;
}
if (InfoIsOdd) {
object->pMemcpy (Dest, L2, object->line_length);
}
}
@ -94,7 +287,7 @@ static deinterlace_method_t greedyh_method = {
"Motion Adaptive: Advanced Detection",
"AdaptiveAdvanced",
4,
OIL_IMPL_FLAG_MMX,
0,
0,
0,
0,
@ -117,32 +310,5 @@ static deinterlace_method_t greedyh_method = {
deinterlace_method_t *
dscaler_greedyh_get_method (void)
{
greedyh_init ();
return &greedyh_method;
}
void
greedyh_init (void)
{
GreedyMaxComb = MAXCOMB_DEFAULT;
GreedyMotionThreshold = MOTIONTHRESHOLD_DEFAULT;
GreedyMotionSense = MOTIONSENSE_DEFAULT;
}
void
greedyh_filter_mmx (GstDeinterlace2 * object)
{
greedyDScaler_MMX (object);
}
void
greedyh_filter_3dnow (GstDeinterlace2 * object)
{
greedyDScaler_3DNOW (object);
}
void
greedyh_filter_sse (GstDeinterlace2 * object)
{
greedyDScaler_SSE (object);
}

View file

@ -21,7 +21,7 @@
// BEFORE USING THESE YOU MUST SET:
// #define SSE_TYPE SSE (or MMX or 3DNOW)
// #define SIMD_TYPE MMXEXT (or MMX or TDNOW)
// some macros for pavgb instruction
// V_PAVGB(mmr1, mmr2, mmr work register, smask) mmr2 may = mmrw if you can trash it
@ -33,21 +33,21 @@
"pand "smask", "mmr1"\n\t" \
"psrlw $1, "mmr1"\n\t" \
"paddusb "mmrw", "mmr1"\n\t"
#define V_PAVGB_SSE(mmr1, mmr2, mmrw, smask) "pavgb "mmr2", "mmr1"\n\t"
#define V_PAVGB_3DNOW(mmr1, mmr2, mmrw, smask) "pavgusb "mmr2", "mmr1"\n\t"
#define V_PAVGB(mmr1, mmr2, mmrw, smask) V_PAVGB2(mmr1, mmr2, mmrw, smask, SSE_TYPE)
#define V_PAVGB2(mmr1, mmr2, mmrw, smask, ssetyp) V_PAVGB3(mmr1, mmr2, mmrw, smask, ssetyp)
#define V_PAVGB3(mmr1, mmr2, mmrw, smask, ssetyp) V_PAVGB_##ssetyp(mmr1, mmr2, mmrw, smask)
#define V_PAVGB_MMXEXT(mmr1, mmr2, mmrw, smask) "pavgb "mmr2", "mmr1"\n\t"
#define V_PAVGB_TDNOW(mmr1, mmr2, mmrw, smask) "pavgusb "mmr2", "mmr1"\n\t"
#define V_PAVGB(mmr1, mmr2, mmrw, smask) V_PAVGB2(mmr1, mmr2, mmrw, smask, SIMD_TYPE)
#define V_PAVGB2(mmr1, mmr2, mmrw, smask, simdtype) V_PAVGB3(mmr1, mmr2, mmrw, smask, simdtype)
#define V_PAVGB3(mmr1, mmr2, mmrw, smask, simdtype) V_PAVGB_##simdtype(mmr1, mmr2, mmrw, smask)
// some macros for pmaxub instruction
#define V_PMAXUB_MMX(mmr1, mmr2) \
"psubusb "mmr2", "mmr1"\n\t" \
"paddusb "mmr2", "mmr1"\n\t"
#define V_PMAXUB_SSE(mmr1, mmr2) "pmaxub "mmr2", "mmr1"\n\t"
#define V_PMAXUB_3DNOW(mmr1, mmr2) V_PMAXUB_MMX(mmr1, mmr2) // use MMX version
#define V_PMAXUB(mmr1, mmr2) V_PMAXUB2(mmr1, mmr2, SSE_TYPE)
#define V_PMAXUB2(mmr1, mmr2, ssetyp) V_PMAXUB3(mmr1, mmr2, ssetyp)
#define V_PMAXUB3(mmr1, mmr2, ssetyp) V_PMAXUB_##ssetyp(mmr1, mmr2)
#define V_PMAXUB_MMXEXT(mmr1, mmr2) "pmaxub "mmr2", "mmr1"\n\t"
#define V_PMAXUB_TDNOW(mmr1, mmr2) V_PMAXUB_MMX(mmr1, mmr2) // use MMX version
#define V_PMAXUB(mmr1, mmr2) V_PMAXUB2(mmr1, mmr2, SIMD_TYPE)
#define V_PMAXUB2(mmr1, mmr2, simdtype) V_PMAXUB3(mmr1, mmr2, simdtype)
#define V_PMAXUB3(mmr1, mmr2, simdtype) V_PMAXUB_##simdtype(mmr1, mmr2)
// some macros for pminub instruction
// V_PMINUB(mmr1, mmr2, mmr work register) mmr2 may NOT = mmrw
@ -56,19 +56,19 @@
"psubusb "mmr2", "mmrw"\n\t" \
"paddusb "mmrw", "mmr1"\n\t" \
"psubusb "mmrw", "mmr1"\n\t"
#define V_PMINUB_SSE(mmr1, mmr2, mmrw) "pminub "mmr2", "mmr1"\n\t"
#define V_PMINUB_3DNOW(mmr1, mmr2, mmrw) V_PMINUB_MMX(mmr1, mmr2, mmrw) // use MMX version
#define V_PMINUB(mmr1, mmr2, mmrw) V_PMINUB2(mmr1, mmr2, mmrw, SSE_TYPE)
#define V_PMINUB2(mmr1, mmr2, mmrw, ssetyp) V_PMINUB3(mmr1, mmr2, mmrw, ssetyp)
#define V_PMINUB3(mmr1, mmr2, mmrw, ssetyp) V_PMINUB_##ssetyp(mmr1, mmr2, mmrw)
#define V_PMINUB_MMXEXT(mmr1, mmr2, mmrw) "pminub "mmr2", "mmr1"\n\t"
#define V_PMINUB_TDNOW(mmr1, mmr2, mmrw) V_PMINUB_MMX(mmr1, mmr2, mmrw) // use MMX version
#define V_PMINUB(mmr1, mmr2, mmrw) V_PMINUB2(mmr1, mmr2, mmrw, SIMD_TYPE)
#define V_PMINUB2(mmr1, mmr2, mmrw, simdtype) V_PMINUB3(mmr1, mmr2, mmrw, simdtype)
#define V_PMINUB3(mmr1, mmr2, mmrw, simdtype) V_PMINUB_##simdtype(mmr1, mmr2, mmrw)
// some macros for movntq instruction
// V_MOVNTQ(mmr1, mmr2)
#define V_MOVNTQ_MMX(mmr1, mmr2) "movq "mmr2", "mmr1"\n\t"
#define V_MOVNTQ_3DNOW(mmr1, mmr2) "movq "mmr2", "mmr1"\n\t"
#define V_MOVNTQ_SSE(mmr1, mmr2) "movntq "mmr2", "mmr1"\n\t"
#define V_MOVNTQ(mmr1, mmr2) V_MOVNTQ2(mmr1, mmr2, SSE_TYPE)
#define V_MOVNTQ2(mmr1, mmr2, ssetyp) V_MOVNTQ3(mmr1, mmr2, ssetyp)
#define V_MOVNTQ3(mmr1, mmr2, ssetyp) V_MOVNTQ_##ssetyp(mmr1, mmr2)
#define V_MOVNTQ_TDNOW(mmr1, mmr2) "movq "mmr2", "mmr1"\n\t"
#define V_MOVNTQ_MMXEXT(mmr1, mmr2) "movntq "mmr2", "mmr1"\n\t"
#define V_MOVNTQ(mmr1, mmr2) V_MOVNTQ2(mmr1, mmr2, SIMD_TYPE)
#define V_MOVNTQ2(mmr1, mmr2, simdtype) V_MOVNTQ3(mmr1, mmr2, simdtype)
#define V_MOVNTQ3(mmr1, mmr2, simdtype) V_MOVNTQ_##simdtype(mmr1, mmr2)
// end of macros