From 81f8895cb433d34da3478b45904536a365d88946 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20Dr=C3=B6ge?= Date: Tue, 26 Aug 2008 12:33:16 +0000 Subject: [PATCH] gst/deinterlace2/tvtime/tomsmocomp/: Unroll the loop to handle two bytes at once. This should give a small speedup an... Original commit message from CVS: * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopBottom.inc: * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopTop.inc: * gst/deinterlace2/tvtime/tomsmocomp/StrangeBob.inc: * gst/deinterlace2/tvtime/tomsmocomp/WierdBob.inc: Unroll the loop to handle two bytes at once. This should give a small speedup and makes it possible to handle chroma and luma different which is needed later. --- ChangeLog | 10 ++ .../tvtime/tomsmocomp/SearchLoopBottom.inc | 38 +++++--- .../tvtime/tomsmocomp/SearchLoopTop.inc | 72 ++++++-------- .../tvtime/tomsmocomp/StrangeBob.inc | 97 ++++++++++++++----- .../tvtime/tomsmocomp/WierdBob.inc | 93 ++++++++++++------ 5 files changed, 204 insertions(+), 106 deletions(-) diff --git a/ChangeLog b/ChangeLog index a2c531374b..cdc3d14e2a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +2008-08-26 Sebastian Dröge + + * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopBottom.inc: + * gst/deinterlace2/tvtime/tomsmocomp/SearchLoopTop.inc: + * gst/deinterlace2/tvtime/tomsmocomp/StrangeBob.inc: + * gst/deinterlace2/tvtime/tomsmocomp/WierdBob.inc: + Unroll the loop to handle two bytes at once. This should give + a small speedup and makes it possible to handle chroma and luma + different which is needed later. 
+ 2008-08-26 Edward Hervey * gst/dccp/gstdccpserversink.c: diff --git a/gst/deinterlace2/tvtime/tomsmocomp/SearchLoopBottom.inc b/gst/deinterlace2/tvtime/tomsmocomp/SearchLoopBottom.inc index ce6d25341a..e1560353e3 100644 --- a/gst/deinterlace2/tvtime/tomsmocomp/SearchLoopBottom.inc +++ b/gst/deinterlace2/tvtime/tomsmocomp/SearchLoopBottom.inc @@ -114,27 +114,39 @@ return 0; #else #ifdef SKIP_SEARCH - out = best; // just use the results of our wierd bob + out[0] = best[0]; // just use the results of our wierd bob + out[1] = best[1]; #else - diff = diff - MIN (diff, 10) - 4; - if (diff < 0) - out = weave; + diff[0] = diff[0] - MIN (diff[0], 10) - 4; + diff[1] = diff[1] - MIN (diff[1], 10) - 4; + if (diff[0] < 0) + out[0] = weave[0]; else - out = best; + out[0] = best[0]; + + if (diff[1] < 0) + out[1] = weave[1]; + else + out[1] = best[1]; - out = CLAMP (out, MinVals, MaxVals); + + out[0] = CLAMP (out[0], MinVals[0], MaxVals[0]); + out[1] = CLAMP (out[1], MinVals[1], MaxVals[1]); #endif #ifdef USE_VERTICAL_FILTER - pDest[x] = (out + pBob[0]) / 2; - pDest[x + dst_pitchw] = (pBob[src_pitch2] + out) / 2; + pDest[x] = (out[0] + pBob[0]) / 2; + pDest[x + dst_pitchw] = (pBob[src_pitch2] + out[0]) / 2; + pDest[x + 1] = (out[1] + pBob[1]) / 2; + pDest[x + 1 + dst_pitchw] = (pBob[src_pitch2 + 1] + out[1]) / 2; #else - pDest[x] = out; + pDest[x] = out[0]; + pDest[x+1] = out[1]; #endif - pBob += 1; - pBobP += 1; - pSrc += 1; - pSrcP += 1; + pBob += 2; + pBobP += 2; + pSrc += 2; + pSrcP += 2; } // adjust for next line pSrc = src_pitch2 * (y+1) + pWeaveSrc; diff --git a/gst/deinterlace2/tvtime/tomsmocomp/SearchLoopTop.inc b/gst/deinterlace2/tvtime/tomsmocomp/SearchLoopTop.inc index 9f42650b81..9d6a490f50 100644 --- a/gst/deinterlace2/tvtime/tomsmocomp/SearchLoopTop.inc +++ b/gst/deinterlace2/tvtime/tomsmocomp/SearchLoopTop.inc @@ -6,29 +6,6 @@ const unsigned char* pSrc; const unsigned char* pBob; const unsigned char* pBobP; -#ifndef IS_C - -int64_t Max_Mov = 
0x0404040404040404ull; -int64_t DiffThres = 0x0f0f0f0f0f0f0f0full; -int64_t YMask = 0x00ff00ff00ff00ffull; // keeps only luma -int64_t UVMask = 0xff00ff00ff00ff00ull; // keeps only chroma -int64_t TENS = 0x0a0a0a0a0a0a0a0aull; -int64_t FOURS = 0x0404040404040404ull; -int64_t ONES = 0x0101010101010101ull; -int64_t Min_Vals = 0x0000000000000000ull; -int64_t Max_Vals = 0x0000000000000000ull; -int64_t ShiftMask = 0xfefffefffefffeffull; - -long oldbx; - -#else - -#ifdef USE_STRANGE_BOB -int64_t DiffThres = 0x0f; -#endif - -#endif - // long is int32 on ARCH_368, int64 on ARCH_AMD64. Declaring it this way // saves a lot of xor's to delete 64bit garbage. @@ -40,23 +17,10 @@ long src_pitch2 = 2 * src_pitch; // even & odd lines are interleaved in Avi long dst_pitch2 = 2 * dst_pitch; -#ifdef IS_C +long y; -long x,best,diff,avg,diff2,out; -#endif -long y; +long Last8; -#if defined(IS_SSE2) -long Last8 = (rowsize-16); // ofs to last 16 bytes in row for SSE2 -#elif defined(IS_C) -long Last8 = (rowsize-4); // ofs to last two pixel in row -#else -long Last8 = (rowsize-8); // ofs to last 8 bytes in row -#endif - -#ifndef IS_C -long dst_pitchw = dst_pitch; // local stor so asm can ref -#endif pSrc = pWeaveSrc; // points 1 weave line above pSrcP = pWeaveSrcP; // " @@ -112,9 +76,24 @@ long dst_pitchw = dst_pitch; // local stor so asm can ref #define _YMask "%17" #define _oldbx "%18" #endif + Last8 = (rowsize-8); for (y=1; y < FldHeight-1; y++) - { + { + long dst_pitchw = dst_pitch; // local stor so asm can ref + int64_t Max_Mov = 0x0404040404040404ull; + int64_t DiffThres = 0x0f0f0f0f0f0f0f0full; + int64_t YMask = 0x00ff00ff00ff00ffull; // keeps only luma + int64_t UVMask = 0xff00ff00ff00ff00ull; // keeps only chroma + int64_t TENS = 0x0a0a0a0a0a0a0a0aull; + int64_t FOURS = 0x0404040404040404ull; + int64_t ONES = 0x0101010101010101ull; + int64_t Min_Vals = 0x0000000000000000ull; + int64_t Max_Vals = 0x0000000000000000ull; + int64_t ShiftMask = 0xfefffefffefffeffull; + + long oldbx; + 
// pretend it's indented -->> __asm__ __volatile__ ( @@ -206,9 +185,20 @@ long dst_pitchw = dst_pitch; // local stor so asm can ref "pcmpeqb %%mm7, %%mm7\n\t" // ffff, say we didn't find anything good yet #else + Last8 = (rowsize - 4); for (y=1; y < FldHeight-1; y++) { + #ifdef USE_STRANGE_BOB + long DiffThres = 0x0f; + #endif + + #ifndef SKIP_SEARCH + long weave[2], MaxVals[2], MinVals[2], mov[2]; + #endif + + long diff[2], best[2], avg[2], diff2[2], out[2], x; + #ifdef USE_VERTICAL_FILTER pDest[0] = (3 * pBob[0] + pBob[src_pitch2]) / 4; pDest[1] = (3 * pBob[1] + pBob[src_pitch2 + 1]) / 4; @@ -246,7 +236,7 @@ long dst_pitchw = dst_pitch; // local stor so asm can ref pSrc += 4; pSrcP += 4; - for (x=4; x < Last8; x += 1) { + for (x=4; x < Last8; x += 2) { #ifdef USE_STRANGE_BOB #include "StrangeBob.inc" @@ -258,7 +248,7 @@ long dst_pitchw = dst_pitch; // local stor so asm can ref // from the current location, by rating them by the min distance // from the Bob value instead of the avg distance from that value. 
// our best and only rating so far - diff = 255; + diff[0] = diff[1] = 255; #endif diff --git a/gst/deinterlace2/tvtime/tomsmocomp/StrangeBob.inc b/gst/deinterlace2/tvtime/tomsmocomp/StrangeBob.inc index 73ce706a70..45b4c8652c 100644 --- a/gst/deinterlace2/tvtime/tomsmocomp/StrangeBob.inc +++ b/gst/deinterlace2/tvtime/tomsmocomp/StrangeBob.inc @@ -324,65 +324,112 @@ #else - diff = -1; - best = 0; + diff[0] = -1; + diff[1] = -1; + best[0] = 0; + best[1] = 0; // j, n if (ABS (pBob[-2] - pBob[src_pitch2 - 4]) < DiffThres && ABS (pBob[-4] - pBob[src_pitch2 + 4]) > DiffThres) { - best = (pBob[-2] + pBob[src_pitch2 - 4]) / 2; - diff = ABS (pBob[-2] - pBob[src_pitch2 - 4]); + best[0] = (pBob[-2] + pBob[src_pitch2 - 4]) / 2; + diff[0] = ABS (pBob[-2] - pBob[src_pitch2 - 4]); + } + if (ABS (pBob[-1] - pBob[src_pitch2 - 3]) < DiffThres && + ABS (pBob[-3] - pBob[src_pitch2 + 5]) > DiffThres) { + best[1] = (pBob[-1] + pBob[src_pitch2 - 3]) / 2; + diff[1] = ABS (pBob[-1] - pBob[src_pitch2 - 3]); } // k & m if (ABS (pBob[2] - pBob[src_pitch2 + 4]) < DiffThres && ABS (pBob[4] - pBob[src_pitch2 - 4]) > DiffThres) { - best = (pBob[4] + pBob[src_pitch2 - 4]) / 2; - diff = ABS (pBob[4] - pBob[src_pitch2 - 4]); + best[0] = (pBob[4] + pBob[src_pitch2 - 4]) / 2; + diff[0] = ABS (pBob[4] - pBob[src_pitch2 - 4]); + } + + if (ABS (pBob[3] - pBob[src_pitch2 + 5]) < DiffThres && + ABS (pBob[5] - pBob[src_pitch2 - 3]) > DiffThres) { + best[1] = (pBob[5] + pBob[src_pitch2 - 3]) / 2; + diff[1] = ABS (pBob[5] - pBob[src_pitch2 - 3]); } // c & d if (ABS (pBob[0] - pBob[src_pitch2 + 2]) < DiffThres && ABS (pBob[2] - pBob[src_pitch2 - 2]) > DiffThres) { - best = (pBob[2] + pBob[src_pitch2 - 2]) / 2; - diff = ABS (pBob[2] - pBob[src_pitch2 - 2]); + best[0] = (pBob[2] + pBob[src_pitch2 - 2]) / 2; + diff[0] = ABS (pBob[2] - pBob[src_pitch2 - 2]); + } + + if (ABS (pBob[1] - pBob[src_pitch2 + 3]) < DiffThres && + ABS (pBob[3] - pBob[src_pitch2 - 1]) > DiffThres) { + best[1] = (pBob[3] + pBob[src_pitch2 
- 1]) / 2; + diff[1] = ABS (pBob[3] - pBob[src_pitch2 - 1]); } // a & f if (ABS (pBob[0] - pBob[src_pitch2 - 2]) < DiffThres && ABS (pBob[-2] - pBob[src_pitch2 + 2]) > DiffThres) { - best = (pBob[-2] + pBob[src_pitch2 + 2]) / 2; - diff = ABS (pBob[-2] - pBob[src_pitch2 + 2]); + best[0] = (pBob[-2] + pBob[src_pitch2 + 2]) / 2; + diff[0] = ABS (pBob[-2] - pBob[src_pitch2 + 2]); + } + + if (ABS (pBob[1] - pBob[src_pitch2 - 1]) < DiffThres && + ABS (pBob[-1] - pBob[src_pitch2 + 3]) > DiffThres) { + best[1] = (pBob[-1] + pBob[src_pitch2 + 3]) / 2; + diff[1] = ABS (pBob[-1] - pBob[src_pitch2 + 3]); } // b,e if (ABS (pBob[0] - pBob[src_pitch2]) < DiffThres) { - best = (pBob[0] + pBob[src_pitch2]) / 2; - diff = ABS (pBob[0] - pBob[src_pitch2]); + best[0] = (pBob[0] + pBob[src_pitch2]) / 2; + diff[0] = ABS (pBob[0] - pBob[src_pitch2]); } + if (ABS (pBob[1] - pBob[src_pitch2 + 1]) < DiffThres) { + best[1] = (pBob[1] + pBob[src_pitch2 + 1]) / 2; + diff[1] = ABS (pBob[1] - pBob[src_pitch2 + 1]); + } + + // We will also calc here the max/min values to later limit comb // so the max excursion will not exceed the Max_Comb constant #ifdef SKIP_SEARCH - best = CLAMP (best, MIN (pBob[src_pitch2], pBob[0]), MAX (pBob[src_pitch2], pBob[0])); + best[0] = CLAMP (best[0], MIN (pBob[src_pitch2], pBob[0]), MAX (pBob[src_pitch2], pBob[0])); + best[1] = CLAMP (best[1], MIN (pBob[src_pitch2 + 1], pBob[1]), MAX (pBob[src_pitch2 + 1], pBob[1])); #else - mov = MAX (ABS (pBob[0] - pBobP[0]), ABS (pBob[src_pitch2] - pBobP[src_pitch2])); + mov[0] = MAX (ABS (pBob[0] - pBobP[0]), ABS (pBob[src_pitch2] - pBobP[src_pitch2])); + mov[1] = MAX (ABS (pBob[1] - pBobP[1]), ABS (pBob[src_pitch2 + 1] - pBobP[src_pitch2 + 1])); - MinVals = 0; - MaxVals = 255; - if (mov > DiffThres) { - MinVals = MAX (MIN (pBob[0], pBob[src_pitch2]), best); - MaxVals = MIN (MAX (pBob[0], pBob[src_pitch2]), best); + MinVals[0] = 0; + MinVals[1] = 0; + MaxVals[0] = 255; + MaxVals[1] = 255; + if (mov[0] > DiffThres) { + MinVals[0] 
= MAX (MIN (pBob[0], pBob[src_pitch2]), best[0]); + MaxVals[0] = MIN (MAX (pBob[0], pBob[src_pitch2]), best[0]); + } + + if (mov[1] > DiffThres) { + MinVals[1] = MAX (MIN (pBob[1], pBob[src_pitch2+1]), best[1]); + MaxVals[1] = MIN (MAX (pBob[1], pBob[src_pitch2+1]), best[1]); } - best = CLAMP (best, MIN (pBob[src_pitch2], pBob[0]), MAX (pBob[src_pitch2], pBob[0])); + best[0] = CLAMP (best[0], MIN (pBob[src_pitch2], pBob[0]), MAX (pBob[src_pitch2], pBob[0])); + best[1] = CLAMP (best[1], MIN (pBob[src_pitch2 + 1], pBob[1]), MAX (pBob[src_pitch2 + 1], pBob[1])); #endif + avg[0] = (pBob[src_pitch2] + pBob[0]) / 2; + avg[1] = (pBob[src_pitch2 + 1] + pBob[1]) / 2; + diff2[0] = ABS (pBob[src_pitch2] - pBob[0]); + diff2[1] = ABS (pBob[src_pitch2 + 1] - pBob[1]); - avg = (pBob[src_pitch2] + pBob[0]) / 2; - diff2 = ABS (pBob[src_pitch2] - pBob[0]); + if (diff[0] == -1 || diff2[0] < diff[0]) { + best[0] = avg[0]; + diff[0] = diff2[0]; + } - if (diff == -1 || diff2 < diff) { - best = avg; - diff = diff2; + if (diff[1] == -1 || diff2[1] < diff[1]) { + best[1] = avg[1]; + diff[1] = diff2[1]; } #endif diff --git a/gst/deinterlace2/tvtime/tomsmocomp/WierdBob.inc b/gst/deinterlace2/tvtime/tomsmocomp/WierdBob.inc index 6cbd1b8dcb..f4bbb8307a 100644 --- a/gst/deinterlace2/tvtime/tomsmocomp/WierdBob.inc +++ b/gst/deinterlace2/tvtime/tomsmocomp/WierdBob.inc @@ -192,56 +192,95 @@ #else // a,f - best = (pBob[-2] + pBob[src_pitch2 + 2]) / 2; - diff = ABS (pBob[-2] - pBob[src_pitch2 + 2]); + best[0] = (pBob[-2] + pBob[src_pitch2 + 2]) / 2; + diff[0] = ABS (pBob[-2] - pBob[src_pitch2 + 2]); + best[1] = (pBob[-1] + pBob[src_pitch2 + 3]) / 2; + diff[1] = ABS (pBob[-1] - pBob[src_pitch2 + 3]); // c,d - if (ABS (pBob[2] - pBob[src_pitch2 - 2]) < diff) { - best = (pBob[2] + pBob[src_pitch2 - 2]) / 2; - diff = ABS (pBob[2] - pBob[src_pitch2 - 2]); + if (ABS (pBob[2] - pBob[src_pitch2 - 2]) < diff[0]) { + best[0] = (pBob[2] + pBob[src_pitch2 - 2]) / 2; + diff[0] = ABS (pBob[2] - 
pBob[src_pitch2 - 2]); + } + + if (ABS (pBob[3] - pBob[src_pitch2 - 1]) < diff[1]) { + best[1] = (pBob[3] + pBob[src_pitch2 - 1]) / 2; + diff[1] = ABS (pBob[3] - pBob[src_pitch2 - 1]); } // j,n - if (ABS (pBob[-4] - pBob[src_pitch2 + 4]) < diff) { - best = (pBob[-4] + pBob[src_pitch2 + 4]) / 2; - diff = ABS (pBob[-4] - pBob[src_pitch2 + 4]); + if (ABS (pBob[-4] - pBob[src_pitch2 + 4]) < diff[0]) { + best[0] = (pBob[-4] + pBob[src_pitch2 + 4]) / 2; + diff[0] = ABS (pBob[-4] - pBob[src_pitch2 + 4]); + } + + if (ABS (pBob[-3] - pBob[src_pitch2 + 5]) < diff[1]) { + best[1] = (pBob[-3] + pBob[src_pitch2 + 5]) / 2; + diff[1] = ABS (pBob[-3] - pBob[src_pitch2 + 5]); } // k,m - if (ABS (pBob[4] - pBob[src_pitch2 - 4]) < diff) { - best = (pBob[4] + pBob[src_pitch2 - 4]) / 2; - diff = ABS (pBob[-4] - pBob[src_pitch2 - 4]); + if (ABS (pBob[4] - pBob[src_pitch2 - 4]) < diff[0]) { + best[0] = (pBob[4] + pBob[src_pitch2 - 4]) / 2; + diff[0] = ABS (pBob[4] - pBob[src_pitch2 - 4]); } + if (ABS (pBob[5] - pBob[src_pitch2 - 3]) < diff[1]) { + best[1] = (pBob[5] + pBob[src_pitch2 - 3]) / 2; + diff[1] = ABS (pBob[5] - pBob[src_pitch2 - 3]); + } // k,m - if (ABS (pBob[4] - pBob[src_pitch2 - 4]) < diff) { - best = (pBob[4] + pBob[src_pitch2 - 4]) / 2; - diff = ABS (pBob[-4] - pBob[src_pitch2 - 4]); + if (ABS (pBob[4] - pBob[src_pitch2 - 4]) < diff[0]) { + best[0] = (pBob[4] + pBob[src_pitch2 - 4]) / 2; + diff[0] = ABS (pBob[4] - pBob[src_pitch2 - 4]); + } + + if (ABS (pBob[5] - pBob[src_pitch2 - 3]) < diff[1]) { + best[1] = (pBob[5] + pBob[src_pitch2 - 3]) / 2; + diff[1] = ABS (pBob[5] - pBob[src_pitch2 - 3]); } // We will also calc here the max/min values to later limit comb // so the max excursion will not exceed the Max_Comb constant #ifdef SKIP_SEARCH - best = CLAMP (best, MIN (pBob[src_pitch2], pBob[0]), MAX (pBob[src_pitch2], pBob[0])); + best[0] = CLAMP (best[0], MIN (pBob[src_pitch2], pBob[0]), MAX (pBob[src_pitch2], pBob[0])); + best[1] = CLAMP (best[1], MIN 
(pBob[src_pitch2 + 1], pBob[1]), MAX (pBob[src_pitch2 + 1], pBob[1])); #else - mov = MAX (ABS (pBob[0] - pBobP[0]), ABS (pBob[src_pitch2] - pBobP[src_pitch2])); + mov[0] = MAX (ABS (pBob[0] - pBobP[0]), ABS (pBob[src_pitch2] - pBobP[src_pitch2])); + mov[1] = MAX (ABS (pBob[1] - pBobP[1]), ABS (pBob[src_pitch2 + 1] - pBobP[src_pitch2 + 1])); - MinVals = 0; - MaxVals = 255; - if (mov > Max_Mov) { - MinVals = MAX (MIN (pBob[0], pBob[src_pitch2]), best); - MaxVals = MIN (MAX (pBob[0], pBob[src_pitch2]), best); + MinVals[0] = 0; + MinVals[1] = 0; + MaxVals[0] = 255; + MaxVals[1] = 255; + + if (mov[0] > Max_Mov[0]) { + MinVals[0] = MAX (MIN (pBob[0], pBob[src_pitch2]), best[0]); + MaxVals[0] = MIN (MAX (pBob[0], pBob[src_pitch2]), best[0]); + } + + if (mov[1] > Max_Mov[1]) { + MinVals[1] = MAX (MIN (pBob[1], pBob[src_pitch2 + 1]), best[1]); + MaxVals[1] = MIN (MAX (pBob[1], pBob[src_pitch2 + 1]), best[1]); } - best = CLAMP (best, MIN (pBob[src_pitch2], pBob[0]), MAX (pBob[src_pitch2], pBob[0])); + best[0] = CLAMP (best[0], MIN (pBob[src_pitch2], pBob[0]), MAX (pBob[src_pitch2], pBob[0])); + best[1] = CLAMP (best[1], MIN (pBob[src_pitch2 + 1], pBob[1]), MAX (pBob[src_pitch2 + 1], pBob[1])); #endif - avg = (pBob[src_pitch2] + pBob[0]) / 2; - diff2 = ABS (pBob[src_pitch2] - pBob[0]); + avg[0] = (pBob[src_pitch2] + pBob[0]) / 2; + avg[1] = (pBob[src_pitch2 + 1] + pBob[1]) / 2; + diff2[0] = ABS (pBob[src_pitch2] - pBob[0]); + diff2[1] = ABS (pBob[src_pitch2 + 1] - pBob[1]); - if (diff2 < diff) { - best = avg; - diff = diff2; + if (diff2[0] < diff[0]) { + best[0] = avg[0]; + diff[0] = diff2[0]; + } + + if (diff2[1] < diff[1]) { + best[1] = avg[1]; + diff[1] = diff2[1]; } #endif