gst/deinterlace2/tvtime/tomsmocomp/: Unroll the loop to handle two bytes at once. This should give a small speedup and make it possible to handle chroma and luma differently, which is needed later.

Original commit message from CVS:
* gst/deinterlace2/tvtime/tomsmocomp/SearchLoopBottom.inc:
* gst/deinterlace2/tvtime/tomsmocomp/SearchLoopTop.inc:
* gst/deinterlace2/tvtime/tomsmocomp/StrangeBob.inc:
* gst/deinterlace2/tvtime/tomsmocomp/WierdBob.inc:
Unroll the loop to handle two bytes at once. This should give
a small speedup and make it possible to handle chroma and luma
differently, which is needed later.
This commit is contained in:
Sebastian Dröge 2008-08-26 12:33:16 +00:00
parent 9fbc550ee1
commit 81f8895cb4
5 changed files with 204 additions and 106 deletions

View file

@ -1,3 +1,13 @@
2008-08-26 Sebastian Dröge <sebastian.droege@collabora.co.uk>
* gst/deinterlace2/tvtime/tomsmocomp/SearchLoopBottom.inc:
* gst/deinterlace2/tvtime/tomsmocomp/SearchLoopTop.inc:
* gst/deinterlace2/tvtime/tomsmocomp/StrangeBob.inc:
* gst/deinterlace2/tvtime/tomsmocomp/WierdBob.inc:
Unroll the loop to handle two bytes at once. This should give
a small speedup and make it possible to handle chroma and luma
differently, which is needed later.
2008-08-26 Edward Hervey <edward.hervey@collabora.co.uk>
* gst/dccp/gstdccpserversink.c:

View file

@ -114,27 +114,39 @@
return 0;
#else
#ifdef SKIP_SEARCH
out = best; // just use the results of our wierd bob
out[0] = best[0]; // just use the results of our wierd bob
out[1] = best[1];
#else
diff = diff - MIN (diff, 10) - 4;
if (diff < 0)
out = weave;
// Bias each byte's difference by subtracting min(diff, 10) and then 4, so
// any difference below 14 becomes negative. MIN takes two arguments; both
// bytes must be biased the same way.
diff[0] = diff[0] - MIN (diff[0], 10) - 4;
diff[1] = diff[1] - MIN (diff[1], 10) - 4;
if (diff[0] < 0)
out[0] = weave[0];
else
out = best;
out[0] = best[0];
if (diff[1] < 0)
out[1] = weave[1];
else
out[1] = best[1];
out = CLAMP (out, MinVals, MaxVals);
out[0] = CLAMP (out[0], MinVals[0], MaxVals[0]);
out[1] = CLAMP (out[1], MinVals[1], MaxVals[1]);
#endif
#ifdef USE_VERTICAL_FILTER
pDest[x] = (out + pBob[0]) / 2;
pDest[x + dst_pitchw] = (pBob[src_pitch2] + out) / 2;
pDest[x] = (out[0] + pBob[0]) / 2;
pDest[x + dst_pitchw] = (pBob[src_pitch2] + out[0]) / 2;
pDest[x + 1] = (out[1] + pBob[1]) / 2;
pDest[x + 1 + dst_pitchw] = (pBob[src_pitch2 + 1] + out[1]) / 2;
#else
pDest[x] = out;
pDest[x] = out[0];
pDest[x+1] = out[1];
#endif
pBob += 1;
pBobP += 1;
pSrc += 1;
pSrcP += 1;
pBob += 2;
pBobP += 2;
pSrc += 2;
pSrcP += 2;
}
// adjust for next line
pSrc = src_pitch2 * (y+1) + pWeaveSrc;

View file

@ -6,29 +6,6 @@ const unsigned char* pSrc;
const unsigned char* pBob;
const unsigned char* pBobP;
#ifndef IS_C
int64_t Max_Mov = 0x0404040404040404ull;
int64_t DiffThres = 0x0f0f0f0f0f0f0f0full;
int64_t YMask = 0x00ff00ff00ff00ffull; // keeps only luma
int64_t UVMask = 0xff00ff00ff00ff00ull; // keeps only chroma
int64_t TENS = 0x0a0a0a0a0a0a0a0aull;
int64_t FOURS = 0x0404040404040404ull;
int64_t ONES = 0x0101010101010101ull;
int64_t Min_Vals = 0x0000000000000000ull;
int64_t Max_Vals = 0x0000000000000000ull;
int64_t ShiftMask = 0xfefffefffefffeffull;
long oldbx;
#else
#ifdef USE_STRANGE_BOB
int64_t DiffThres = 0x0f;
#endif
#endif
// long is int32 on ARCH_368, int64 on ARCH_AMD64. Declaring it this way
// saves a lot of xor's to delete 64bit garbage.
@ -40,23 +17,10 @@ long src_pitch2 = 2 * src_pitch; // even & odd lines are interleaved in Avi
long dst_pitch2 = 2 * dst_pitch;
#ifdef IS_C
long y;
long x,best,diff,avg,diff2,out;
#endif
long y;
long Last8;
#if defined(IS_SSE2)
long Last8 = (rowsize-16); // ofs to last 16 bytes in row for SSE2
#elif defined(IS_C)
long Last8 = (rowsize-4); // ofs to last two pixel in row
#else
long Last8 = (rowsize-8); // ofs to last 8 bytes in row
#endif
#ifndef IS_C
long dst_pitchw = dst_pitch; // local stor so asm can ref
#endif
pSrc = pWeaveSrc; // points 1 weave line above
pSrcP = pWeaveSrcP; // "
@ -112,9 +76,24 @@ long dst_pitchw = dst_pitch; // local stor so asm can ref
#define _YMask "%17"
#define _oldbx "%18"
#endif
Last8 = (rowsize-8);
for (y=1; y < FldHeight-1; y++)
{
{
long dst_pitchw = dst_pitch; // local stor so asm can ref
int64_t Max_Mov = 0x0404040404040404ull;
int64_t DiffThres = 0x0f0f0f0f0f0f0f0full;
int64_t YMask = 0x00ff00ff00ff00ffull; // keeps only luma
int64_t UVMask = 0xff00ff00ff00ff00ull; // keeps only chroma
int64_t TENS = 0x0a0a0a0a0a0a0a0aull;
int64_t FOURS = 0x0404040404040404ull;
int64_t ONES = 0x0101010101010101ull;
int64_t Min_Vals = 0x0000000000000000ull;
int64_t Max_Vals = 0x0000000000000000ull;
int64_t ShiftMask = 0xfefffefffefffeffull;
long oldbx;
// pretend it's indented -->>
__asm__ __volatile__
(
@ -206,9 +185,20 @@ long dst_pitchw = dst_pitch; // local stor so asm can ref
"pcmpeqb %%mm7, %%mm7\n\t" // ffff, say we didn't find anything good yet
#else
Last8 = (rowsize - 4);
for (y=1; y < FldHeight-1; y++)
{
#ifdef USE_STRANGE_BOB
long DiffThres = 0x0f;
#endif
#ifndef SKIP_SEARCH
long weave[2], MaxVals[2], MinVals[2];
#endif
long diff[2], best[2], avg[2], diff2[2], out[2], x;
#ifdef USE_VERTICAL_FILTER
pDest[0] = (3 * pBob[0] + pBob[src_pitch2]) / 4;
pDest[1] = (3 * pBob[1] + pBob[src_pitch2 + 1]) / 4;
@ -246,7 +236,7 @@ long dst_pitchw = dst_pitch; // local stor so asm can ref
pSrc += 4;
pSrcP += 4;
for (x=4; x < Last8; x += 1) {
for (x=4; x < Last8; x += 2) {
#ifdef USE_STRANGE_BOB
#include "StrangeBob.inc"
@ -258,7 +248,7 @@ long dst_pitchw = dst_pitch; // local stor so asm can ref
// from the current location, by rating them by the min distance
// from the Bob value instead of the avg distance from that value.
// our best and only rating so far
diff = 255;
diff[0] = diff[1] = 255;
#endif

View file

@ -324,65 +324,112 @@
#else
diff = -1;
best = 0;
diff[0] = -1;
diff[1] = -1;
best[0] = 0;
best[1] = 0;
// j, n
if (ABS (pBob[-2] - pBob[src_pitch2 - 4]) < DiffThres &&
ABS (pBob[-4] - pBob[src_pitch2 + 4]) > DiffThres) {
best = (pBob[-2] + pBob[src_pitch2 - 4]) / 2;
diff = ABS (pBob[-2] - pBob[src_pitch2 - 4]);
best[0] = (pBob[-2] + pBob[src_pitch2 - 4]) / 2;
diff[0] = ABS (pBob[-2] - pBob[src_pitch2 - 4]);
}
if (ABS (pBob[-1] - pBob[src_pitch2 - 3]) < DiffThres &&
ABS (pBob[-3] - pBob[src_pitch2 + 5]) > DiffThres) {
best[1] = (pBob[-1] + pBob[src_pitch2 - 3]) / 2;
diff[1] = ABS (pBob[-1] - pBob[src_pitch2 - 3]);
}
// k & m
if (ABS (pBob[2] - pBob[src_pitch2 + 4]) < DiffThres &&
ABS (pBob[4] - pBob[src_pitch2 - 4]) > DiffThres) {
best = (pBob[4] + pBob[src_pitch2 - 4]) / 2;
diff = ABS (pBob[4] - pBob[src_pitch2 - 4]);
best[0] = (pBob[4] + pBob[src_pitch2 - 4]) / 2;
diff[0] = ABS (pBob[4] - pBob[src_pitch2 - 4]);
}
if (ABS (pBob[3] - pBob[src_pitch2 + 5]) < DiffThres &&
ABS (pBob[5] - pBob[src_pitch2 - 3]) > DiffThres) {
best[1] = (pBob[5] + pBob[src_pitch2 - 3]) / 2;
diff[1] = ABS (pBob[5] - pBob[src_pitch2 - 3]);
}
// c & d
if (ABS (pBob[0] - pBob[src_pitch2 + 2]) < DiffThres &&
ABS (pBob[2] - pBob[src_pitch2 - 2]) > DiffThres) {
best = (pBob[2] + pBob[src_pitch2 - 2]) / 2;
diff = ABS (pBob[2] - pBob[src_pitch2 - 2]);
best[0] = (pBob[2] + pBob[src_pitch2 - 2]) / 2;
diff[0] = ABS (pBob[2] - pBob[src_pitch2 - 2]);
}
if (ABS (pBob[1] - pBob[src_pitch2 + 3]) < DiffThres &&
ABS (pBob[3] - pBob[src_pitch2 - 1]) > DiffThres) {
best[1] = (pBob[3] + pBob[src_pitch2 - 1]) / 2;
diff[1] = ABS (pBob[3] - pBob[src_pitch2 - 1]);
}
// a & f
if (ABS (pBob[0] - pBob[src_pitch2 - 2]) < DiffThres &&
ABS (pBob[-2] - pBob[src_pitch2 + 2]) > DiffThres) {
best = (pBob[-2] + pBob[src_pitch2 + 2]) / 2;
diff = ABS (pBob[-2] - pBob[src_pitch2 + 2]);
best[0] = (pBob[-2] + pBob[src_pitch2 + 2]) / 2;
diff[0] = ABS (pBob[-2] - pBob[src_pitch2 + 2]);
}
if (ABS (pBob[1] - pBob[src_pitch2 - 1]) < DiffThres &&
ABS (pBob[-1] - pBob[src_pitch2 + 3]) > DiffThres) {
best[1] = (pBob[-1] + pBob[src_pitch2 + 3]) / 2;
diff[1] = ABS (pBob[-1] - pBob[src_pitch2 + 3]);
}
// b,e
if (ABS (pBob[0] - pBob[src_pitch2]) < DiffThres) {
best = (pBob[0] + pBob[src_pitch2]) / 2;
diff = ABS (pBob[0] - pBob[src_pitch2]);
best[0] = (pBob[0] + pBob[src_pitch2]) / 2;
diff[0] = ABS (pBob[0] - pBob[src_pitch2]);
}
if (ABS (pBob[1] - pBob[src_pitch2 + 1]) < DiffThres) {
best[1] = (pBob[1] + pBob[src_pitch2 + 1]) / 2;
diff[1] = ABS (pBob[1] - pBob[src_pitch2 + 1]);
}
// We will also calc here the max/min values to later limit comb
// so the max excursion will not exceed the Max_Comb constant
#ifdef SKIP_SEARCH
best = CLAMP (best, MIN (pBob[src_pitch2], pBob[0]), MAX (pBob[src_pitch2], pBob[0]));
best[0] = CLAMP (best[0], MIN (pBob[src_pitch2], pBob[0]), MAX (pBob[src_pitch2], pBob[0]));
best[1] = CLAMP (best[1], MIN (pBob[src_pitch2 + 1], pBob[1]), MAX (pBob[src_pitch2 + 1], pBob[1]));
#else
mov = MAX (ABS (pBob[0] - pBobP[0]), ABS (pBob[src_pitch2] - pBobP[src_pitch2]));
mov[0] = MAX (ABS (pBob[0] - pBobP[0]), ABS (pBob[src_pitch2] - pBobP[src_pitch2]));
mov[1] = MAX (ABS (pBob[1] - pBobP[1]), ABS (pBob[src_pitch2 + 1] - pBobP[src_pitch2 + 1]));
MinVals = 0;
MaxVals = 255;
if (mov > DiffThres) {
MinVals = MAX (MIN (pBob[0], pBob[src_pitch2]), best);
MaxVals = MIN (MAX (pBob[0], pBob[src_pitch2]), best);
MinVals[0] = 0;
MinVals[1] = 0;
MaxVals[0] = 255;
MaxVals[1] = 255;
if (mov[0] > DiffThres) {
MinVals[0] = MAX (MIN (pBob[0], pBob[src_pitch2]), best[0]);
MaxVals[0] = MIN (MAX (pBob[0], pBob[src_pitch2]), best[0]);
}
if (mov[1] > DiffThres) {
MinVals[1] = MAX (MIN (pBob[1], pBob[src_pitch2+1]), best[1]);
MaxVals[1] = MIN (MAX (pBob[1], pBob[src_pitch2+1]), best[1]);
}
best = CLAMP (best, MIN (pBob[src_pitch2], pBob[0]), MAX (pBob[src_pitch2], pBob[0]));
best[0] = CLAMP (best[0], MIN (pBob[src_pitch2], pBob[0]), MAX (pBob[src_pitch2], pBob[0]));
best[1] = CLAMP (best[1], MIN (pBob[src_pitch2 + 1], pBob[1]), MAX (pBob[src_pitch2 + 1], pBob[1]));
#endif
// For each of the two bytes n = 0, 1: avg[n] is the mean of pBob[n] and
// pBob[src_pitch2 + n], and diff2[n] is the absolute difference of that
// same pair. Byte 0 must use the byte-0 pixels, not those of byte 1.
avg[0] = (pBob[src_pitch2] + pBob[0]) / 2;
avg[1] = (pBob[src_pitch2 + 1] + pBob[1]) / 2;
diff2[0] = ABS (pBob[src_pitch2] - pBob[0]);
diff2[1] = ABS (pBob[src_pitch2 + 1] - pBob[1]);
avg = (pBob[src_pitch2] + pBob[0]) / 2;
diff2 = ABS (pBob[src_pitch2] - pBob[0]);
if (diff[0] == -1 || diff2[0] < diff[0]) {
best[0] = avg[0];
diff[0] = diff2[0];
}
if (diff == -1 || diff2 < diff) {
best = avg;
diff = diff2;
if (diff[1] == -1 || diff2[1] < diff[1]) {
best[1] = avg[1];
diff[1] = diff2[1];
}
#endif

View file

@ -192,56 +192,95 @@
#else
// a,f
best = (pBob[-2] + pBob[src_pitch2 + 2]) / 2;
diff = ABS (pBob[-2] - pBob[src_pitch2 + 2]);
best[0] = (pBob[-2] + pBob[src_pitch2 + 2]) / 2;
diff[0] = ABS (pBob[-2] - pBob[src_pitch2 + 2]);
best[1] = (pBob[-1] + pBob[src_pitch2 + 3]) / 2;
diff[1] = ABS (pBob[-1] - pBob[src_pitch2 + 3]);
// c,d
if (ABS (pBob[2] - pBob[src_pitch2 - 2]) < diff) {
best = (pBob[2] + pBob[src_pitch2 - 2]) / 2;
diff = ABS (pBob[2] - pBob[src_pitch2 - 2]);
if (ABS (pBob[2] - pBob[src_pitch2 - 2]) < diff[0]) {
best[0] = (pBob[2] + pBob[src_pitch2 - 2]) / 2;
diff[0] = ABS (pBob[2] - pBob[src_pitch2 - 2]);
}
if (ABS (pBob[3] - pBob[src_pitch2 - 1]) < diff[1]) {
best[1] = (pBob[3] + pBob[src_pitch2 - 1]) / 2;
diff[1] = ABS (pBob[3] - pBob[src_pitch2 - 1]);
}
// j,n
if (ABS (pBob[-4] - pBob[src_pitch2 + 4]) < diff) {
best = (pBob[-4] + pBob[src_pitch2 + 4]) / 2;
diff = ABS (pBob[-4] - pBob[src_pitch2 + 4]);
if (ABS (pBob[-4] - pBob[src_pitch2 + 4]) < diff[0]) {
best[0] = (pBob[-4] + pBob[src_pitch2 + 4]) / 2;
diff[0] = ABS (pBob[-4] - pBob[src_pitch2 + 4]);
}
if (ABS (pBob[-3] - pBob[src_pitch2 + 5]) < diff[1]) {
best[1] = (pBob[-3] + pBob[src_pitch2 + 5]) / 2;
diff[1] = ABS (pBob[-3] - pBob[src_pitch2 + 5]);
}
// k,m
if (ABS (pBob[4] - pBob[src_pitch2 - 4]) < diff) {
best = (pBob[4] + pBob[src_pitch2 - 4]) / 2;
diff = ABS (pBob[-4] - pBob[src_pitch2 - 4]);
// Candidate pair pBob[4] / pBob[src_pitch2 - 4] for byte 0 and
// pBob[5] / pBob[src_pitch2 - 3] for byte 1. When the pair's difference
// beats the current best, store its average and the difference of that
// same pair (not of the mirrored pBob[-4] / pBob[-3] pixel).
if (ABS (pBob[4] - pBob[src_pitch2 - 4]) < diff[0]) {
best[0] = (pBob[4] + pBob[src_pitch2 - 4]) / 2;
diff[0] = ABS (pBob[4] - pBob[src_pitch2 - 4]);
}
if (ABS (pBob[5] - pBob[src_pitch2 - 3]) < diff[1]) {
best[1] = (pBob[5] + pBob[src_pitch2 - 3]) / 2;
diff[1] = ABS (pBob[5] - pBob[src_pitch2 - 3]);
}
// k,m
if (ABS (pBob[4] - pBob[src_pitch2 - 4]) < diff) {
best = (pBob[4] + pBob[src_pitch2 - 4]) / 2;
diff = ABS (pBob[-4] - pBob[src_pitch2 - 4]);
// Candidate pair pBob[4] / pBob[src_pitch2 - 4] for byte 0 and
// pBob[5] / pBob[src_pitch2 - 3] for byte 1. When the pair's difference
// beats the current best, store its average and the difference of that
// same pair (not of the mirrored pBob[-4] / pBob[-3] pixel).
if (ABS (pBob[4] - pBob[src_pitch2 - 4]) < diff[0]) {
best[0] = (pBob[4] + pBob[src_pitch2 - 4]) / 2;
diff[0] = ABS (pBob[4] - pBob[src_pitch2 - 4]);
}
if (ABS (pBob[5] - pBob[src_pitch2 - 3]) < diff[1]) {
best[1] = (pBob[5] + pBob[src_pitch2 - 3]) / 2;
diff[1] = ABS (pBob[5] - pBob[src_pitch2 - 3]);
}
// We will also calc here the max/min values to later limit comb
// so the max excursion will not exceed the Max_Comb constant
#ifdef SKIP_SEARCH
best = CLAMP (best, MIN (pBob[src_pitch2], pBob[0]), MAX (pBob[src_pitch2], pBob[0]));
best[0] = CLAMP (best[0], MIN (pBob[src_pitch2], pBob[0]), MAX (pBob[src_pitch2], pBob[0]));
best[1] = CLAMP (best[1], MIN (pBob[src_pitch2 + 1], pBob[1]), MAX (pBob[src_pitch2 + 1], pBob[1]));
#else
mov = MAX (ABS (pBob[0] - pBobP[0]), ABS (pBob[src_pitch2] - pBobP[src_pitch2]));
mov[0] = MAX (ABS (pBob[0] - pBobP[0]), ABS (pBob[src_pitch2] - pBobP[src_pitch2]));
mov[1] = MAX (ABS (pBob[1] - pBobP[1]), ABS (pBob[src_pitch2 + 1] - pBobP[src_pitch2 + 1]));
MinVals = 0;
MaxVals = 255;
if (mov > Max_Mov) {
MinVals = MAX (MIN (pBob[0], pBob[src_pitch2]), best);
MaxVals = MIN (MAX (pBob[0], pBob[src_pitch2]), best);
MinVals[0] = 0;
MinVals[1] = 0;
MaxVals[0] = 255;
MaxVals[1] = 255;
if (mov[0] > Max_Mov[0]) {
MinVals[0] = MAX (MIN (pBob[0], pBob[src_pitch2]), best[0]);
MaxVals[0] = MIN (MAX (pBob[0], pBob[src_pitch2]), best[0]);
}
if (mov[1] > Max_Mov[1]) {
MinVals[1] = MAX (MIN (pBob[1], pBob[src_pitch2 + 1]), best[1]);
MaxVals[1] = MIN (MAX (pBob[1], pBob[src_pitch2 + 1]), best[1]);
}
best = CLAMP (best, MIN (pBob[src_pitch2], pBob[0]), MAX (pBob[src_pitch2], pBob[0]));
best[0] = CLAMP (best[0], MIN (pBob[src_pitch2], pBob[0]), MAX (pBob[src_pitch2], pBob[0]));
best[1] = CLAMP (best[1], MIN (pBob[src_pitch2 + 1], pBob[1]), MAX (pBob[src_pitch2 + 1], pBob[1]));
#endif
avg = (pBob[src_pitch2] + pBob[0]) / 2;
diff2 = ABS (pBob[src_pitch2] - pBob[0]);
avg[0] = (pBob[src_pitch2] + pBob[0]) / 2;
avg[1] = (pBob[src_pitch2 + 1] + pBob[1]) / 2;
diff2[0] = ABS (pBob[src_pitch2] - pBob[0]);
diff2[1] = ABS (pBob[src_pitch2 + 1] - pBob[1]);
if (diff2 < diff) {
best = avg;
diff = diff2;
if (diff2[0] < diff[0]) {
best[0] = avg[0];
diff[0] = diff2[0];
}
if (diff2[1] < diff[1]) {
best[1] = avg[1];
diff[1] = diff2[1];
}
#endif