gstreamer/subprojects/gst-plugins-good/gst/deinterlace/x86/yadif.asm
Vivia Nikolaidou 8f31e0c42a yadif.asm: Fix improper usage of LOAD macro
The LOAD macro relies on m7 being zero for interleaving purposes. Using LOAD
on the m7 register makes it interleave with m7's new content instead of
with 0.

The effect of this bug was bobbing on some static lines that appeared
over fast-moving content.

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/1823>
2022-03-01 15:13:15 +00:00

412 lines
9.9 KiB
NASM

;*****************************************************************************
;* x86-optimized functions for yadif filter
;* Copyright (C) 2020 Vivia Nikolaidou <vivia.nikolaidou@ltnglobal.com>
;*
;* Based on libav's vf_yadif.asm file
;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86inc.asm"
SECTION_RODATA

; pb_1: the byte value 1 repeated 16 times.
; CHECK uses it to isolate the low bit of each byte.
pb_1:   times 16 db 1

; pw_1: the word value 1 repeated 8 times.
; Used by CHECK2 (mask + 1) and by FILTER_HEAD (spatial_score - 1).
pw_1:   times 8 dw 1

SECTION .text
%macro ABS1 2
; ABS1 a, tmp
; Replaces every signed word of %1 with its absolute value.
; %2 is a scratch register; it is only written on the non-SSSE3 paths.
    %if cpuflag(ssse3)
        ; single-instruction absolute value
        pabsw     %1, %1
    %elif cpuflag(mmxext)
        ; tmp = 0 - a, then a = max(a, -a)
        pxor      %2, %2
        psubw     %2, %1
        pmaxsw    %1, %2
    %else
        ; tmp = all-ones where a < 0; a = (a xor tmp) - tmp
        pxor      %2, %2
        pcmpgtw   %2, %1
        pxor      %1, %2
        psubw     %1, %2
    %endif
%endmacro
%macro CHECK 2
; CHECK off_t, off_b
; Evaluates one diagonal candidate for the low half-register of pixels.
; %1 = byte offset added to tzeroq, %2 = byte offset added to bzeroq.
; The call sites in FILTER_HEAD pass (%1, %2) = (j-1, -j-1) for
; j = -1, -2, 1, 2, so byte 0 of each load is the left (x-1) tap of a
; 3-tap window centred on t0[x+j] / b0[x-j].
; In:  m7 = 0 (needed by every punpcklbw below to zero-extend bytes)
; Out: m2 = score  = |t0[x-1+j]-b0[x-1-j]| + |t0[x+j]-b0[x-j]|
;                    + |t0[x+1+j]-b0[x+1-j]|   (words)
;      m5 = candidate spatial_pred = (t0[x+j] + b0[x-j]) >> 1   (words)
; Clobbers m3, m4. Both loads are unaligned full-register reads.
; m2 = t0[x-1+j] (byte 0), followed by the next bytes of that line
movu m2, [tzeroq+%1]
; m3 = b0[x-1-j] (byte 0), followed by the next bytes of that line
movu m3, [bzeroq+%2]
; m4 = copy of the t0 bytes
mova m4, m2
; m5 = copy of the t0 bytes
mova m5, m2
; m4 = t0 xor b0; its low bit is set where the byte sum is odd
pxor m4, m3
; m5 = (t0 + b0 + 1) >> 1, i.e. the average rounded up
pavgb m5, m3
; keep only the low bit of each byte of the xor
pand m4, [pb_1]
; m5 = average rounded down: subtract 1 where the sum was odd
psubusb m5, m4
; shift the whole register right by 1 byte so that byte 0 becomes the
; centre tap: (t0[x+j] + b0[x-j]) >> 1
psrldq m5, 1
; m7 = 0
; Interleave low-order bytes with 0
; so one pixel doesn't spill into the next one
punpcklbw m5, m7
; m4 = t0 bytes again (m4 was used as scratch above)
mova m4, m2
; m2 = saturating t0 - b0 (0 where b0 > t0)
psubusb m2, m3
; m3 = saturating b0 - t0 (0 where t0 > b0)
psubusb m3, m4
; m2 = per-byte absolute difference; byte 0 = |t0[x-1+j] - b0[x-1-j]|
pmaxub m2, m3
; m3 = copy of the absolute differences
mova m3, m2
; m4 = copy of the absolute differences
mova m4, m2
; shift by 1 byte: byte 0 of m3 = |t0[x+j] - b0[x-j]|
psrldq m3, 1
; shift by 2 bytes: byte 0 of m4 = |t0[x+1+j] - b0[x+1-j]|
psrldq m4, 2
; zero-extend all three to words so the sums cannot overflow a byte
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
paddw m2, m3
; m2 = score (sum of the three absolute differences)
paddw m2, m4
%endmacro
%macro CHECK1 0
; Accepts the candidate produced by CHECK wherever it scores better.
; In:  m0 = spatial_score, m1 = spatial_pred,
;      m2 = candidate score, m5 = candidate prediction (all words)
; Out: m0 = min(spatial_score, score) (signed)
;      m1 = candidate prediction where score < spatial_score, else unchanged
;      m6 = per-word mask: all-ones where the candidate was accepted, else 0
;           (consumed by the following CHECK2)
; Clobbers m3 and m5.
    ; m3 = mask: all-ones where spatial_score > score
    mova      m3, m0
    pcmpgtw   m3, m2
    ; remember the mask for CHECK2
    mova      m6, m3
    ; m5 = candidate prediction in accepted lanes, 0 elsewhere
    pand      m5, m3
    ; m3 = old prediction in rejected lanes, 0 elsewhere
    pandn     m3, m1
    ; merge the two halves of the select
    por       m3, m5
    ; if (score < spatial_score) spatial_score = score
    pminsw    m0, m2
    ; spatial_pred = merged result
    mova      m1, m3
%endmacro
%macro CHECK2 0
; Like CHECK1, but only lanes that accepted the preceding CHECK1 candidate
; can accept this one.
; In:  m0, m1, m2, m5 as for CHECK1; m6 = mask left by CHECK1
; Out: m0 = updated spatial_score, m1 = updated spatial_pred
; Clobbers m2, m3, m5 and m6 (the mask is consumed).
    ; mask + 1: 0 in lanes CHECK1 accepted, 1 in lanes it rejected
    paddw     m6, [pw_1]
    ; scale to a penalty: 0 or 0x4000 per word
    psllw     m6, 14
    ; add the penalty to the score (signed saturating); scores are sums of
    ; three byte differences, so a penalised lane can no longer win
    paddsw    m2, m6
    ; m3 = mask: all-ones where spatial_score > penalised score
    mova      m3, m0
    pcmpgtw   m3, m2
    ; m5 = candidate prediction in accepted lanes, 0 elsewhere
    pand      m5, m3
    ; m3 = old prediction in rejected lanes, 0 elsewhere
    pandn     m3, m1
    por       m3, m5
    ; spatial_score = min(spatial_score, penalised score)
    pminsw    m0, m2
    ; spatial_pred = merged result
    mova      m1, m3
%endmacro
%macro LOAD 2
; LOAD dst, src
; Reads a half-register of bytes from src (%2) into dst (%1) and widens
; them to words by interleaving with m7.
; Precondition: m7 == 0. Consequently dst must never be m7, otherwise the
; bytes are interleaved with themselves instead of with zero.
    movh      %1, %2
    punpcklbw %1, m7
%endmacro
%macro FILTER_HEAD 0
; Common first half of both filter modes, for one half-register of pixels.
; Out: m0 = spatial_score, m1 = spatial_pred, m6 = diff (all words), m7 = 0
;      [rsp+ 0] = d, [rsp+16] = e, [rsp+32] = diff
; Clobbers m2-m5. Reads unaligned full registers starting at
; [tzeroq-3] / [bzeroq-3] (via CHECK).
; m7 = 0 (required by LOAD, CHECK and the punpcklbw instructions below)
pxor m7, m7
; m0 = c (pixels at tzeroq, widened to words)
LOAD m0, [tzeroq]
; m1 = e (pixels at bzeroq, widened to words)
LOAD m1, [bzeroq]
; m3 = pixels at mpq
LOAD m3, [mpq]
; m2 = pixels at moneq
LOAD m2, [moneq]
; m4 = copy of the mpq pixels
mova m4, m3
; m3 = [moneq] + [mpq]
paddw m3, m2
; m3 = d = ([moneq] + [mpq]) >> 1
psraw m3, 1
; rsp + 0 = d
mova [rsp+ 0], m3
; rsp + 16 = e
mova [rsp+16], m1
; m2 = [moneq] - [mpq]
psubw m2, m4
; m2 = temporal_diff0 = |[moneq] - [mpq]| (m4 is temporary)
ABS1 m2, m4
; m3 = t2
LOAD m3, [ttwoq]
; m4 = b2
LOAD m4, [btwoq]
; m3 = t2 - c
psubw m3, m0
; m4 = b2 - e
psubw m4, m1
; m3 = ABS(t2 - c)
ABS1 m3, m5
; m4 = ABS(b2 - e)
ABS1 m4, m5
; m3 = ABS(t2 - c) + ABS(b2 - e)
paddw m3, m4
; m2 = temporal_diff0 >> 1
psrlw m2, 1
; m3 = temporal_diff1 = (ABS(t2 - c) + ABS(b2 - e)) >> 1
psrlw m3, 1
; m2 = max(temporal_diff0 >> 1, temporal_diff1)
pmaxsw m2, m3
; m3 = tp2
LOAD m3, [tptwoq]
; m4 = bp2
LOAD m4, [bptwoq]
; m3 = tp2 - c
psubw m3, m0
; m4 = bp2 - e
psubw m4, m1
; m3 = ABS(tp2 - c)
ABS1 m3, m5
; m4 = ABS(bp2 - e)
ABS1 m4, m5
paddw m3, m4
; m3 = temporal_diff2 = (ABS(tp2 - c) + ABS(bp2 - e)) >> 1
psrlw m3, 1
; m2 = diff = max of the three temporal differences
pmaxsw m2, m3
; rsp + 32 = diff
mova [rsp+32], m2
; m1 = e + c
paddw m1, m0
; m0 = 2c
paddw m0, m0
; m0 = 2c - (e + c) = c - e
psubw m0, m1
; m1 = initial spatial_pred = (c + e) >> 1
psrlw m1, 1
; m0 = FFABS(c-e) (m2 is temporary)
ABS1 m0, m2
; m2 = t0 bytes starting at x-1 (unaligned full-register load)
movu m2, [tzeroq-1]
; m3 = b0 bytes starting at x-1
movu m3, [bzeroq-1]
; m4 = copy of the t0 bytes
mova m4, m2
; m2 = t0[x-1]-b0[x-1] unsigned packed
psubusb m2, m3
; m3 = m3 - m4 = b0[x-1]-t0[x-1] unsigned packed (saturating)
psubusb m3, m4
; m2 = per-byte abs(t0[x-1]-b0[x-1])
pmaxub m2, m3
%if mmsize == 16
; m3 = m2 shifted right by 2 bytes
; pixel jump: go from x-1 to x+1
mova m3, m2
psrldq m3, 2
%else
; 8-byte register path: word shuffle standing in for the 2-byte shift
; NOTE(review): only XMM variants are instantiated at the end of this file
pshufw m3, m2, q0021
%endif
; m7 = 0
; unpack and interleave low-order bytes
; to prevent pixel spilling when adding
punpcklbw m2, m7
punpcklbw m3, m7
; m0 = FFABS(c-e) + abs(t0[x-1]-b0[x-1])
paddw m0, m2
; m0 += abs(t0[x+1]-b0[x+1])
paddw m0, m3
; m0 = spatial_score = the sum above minus 1
psubw m0, [pw_1]
; try the four diagonal candidates; each CHECK2 can only accept in lanes
; where the CHECK1 just before it accepted
CHECK -2, 0
CHECK1
CHECK -3, 1
CHECK2
CHECK 0, -2
CHECK1
CHECK 1, -3
CHECK2
; now m0 = spatial_score, m1 = spatial_pred
; m6 = diff (reloaded because CHECK1/CHECK2 used m6 for their mask)
mova m6, [rsp+32]
%endmacro
%macro FILTER_TAIL 0
; Common second half of both filter modes: clamp, store, advance.
; In:  m1 = spatial_pred, m6 = diff (words); [rsp] = d as stored by
;      FILTER_HEAD
; Out: mmsize/2 output bytes written at [dstq]; every line pointer
;      advanced by mmsize/2
; Clobbers m1, m2, m3.

    ; m3 = d + diff (upper bound), m2 = d - diff (lower bound)
    mova      m3, [rsp]
    mova      m2, m3
    paddw     m3, m6
    psubw     m2, m6

    ; spatial_pred = min(d + diff, max(spatial_pred, d - diff))
    pmaxsw    m1, m2
    pminsw    m1, m3

    ; narrow the signed words to unsigned bytes with saturation and store
    ; the low half of the register
    packuswb  m1, m1
    movh      [dstq], m1

    ; step every pointer past the pixels just processed
    add       dstq,   mmsize/2
    add       tzeroq, mmsize/2
    add       bzeroq, mmsize/2
    add       moneq,  mmsize/2
    add       mpq,    mmsize/2
    add       ttwoq,  mmsize/2
    add       btwoq,  mmsize/2
    add       tptwoq, mmsize/2
    add       bptwoq, mmsize/2
    add       ttoneq, mmsize/2
    add       ttpq,   mmsize/2
    add       bboneq, mmsize/2
    add       bbpq,   mmsize/2
%endmacro
%macro FILTER_MODE0 0
; Line loop for mode 0: FILTER_HEAD, then widen diff using the rows at
; ttoneq/ttpq and bboneq/bbpq, then FILTER_TAIL. Handles mmsize/2 pixels
; per iteration and always runs at least once.
;
; After FILTER_HEAD: m1 = spatial_pred, m6 = diff, m7 = 0,
;                    [rsp] = d, [rsp+16] = e.
;
; e is subtracted straight from its stack slot instead of being copied
; into m7. That keeps m7 == 0 for the whole loop body, which is the
; invariant LOAD and CHECK depend on, so a LOAD added later in this macro
; cannot silently interleave with stale data. [rsp+16] was written with
; mova by FILTER_HEAD, so it is valid as an aligned memory operand.
.loop0:
    FILTER_HEAD
    ; m2 = tt1, m4 = ttp
    LOAD      m2, [ttoneq]
    LOAD      m4, [ttpq]
    ; m3 = bb1, m5 = bbp
    LOAD      m3, [bboneq]
    LOAD      m5, [bbpq]
    paddw     m2, m4
    paddw     m3, m5
    ; m2 = b = (tt1 + ttp) >> 1
    psrlw     m2, 1
    ; m3 = f = (bb1 + bbp) >> 1
    psrlw     m3, 1
    ; m4 = c
    LOAD      m4, [tzeroq]
    ; m5 = d
    mova      m5, [rsp]
    ; m2 = b - c
    psubw     m2, m4
    ; m3 = f - e
    psubw     m3, [rsp+16]
    ; m0 = d
    mova      m0, m5
    ; m5 = d - c
    psubw     m5, m4
    ; m0 = d - e
    psubw     m0, [rsp+16]
    ; m4 = b - c
    mova      m4, m2
    ; m2 = FFMIN(b-c, f-e)
    pminsw    m2, m3
    ; m3 = FFMAX(f-e, b-c)
    pmaxsw    m3, m4
    ; m2 = FFMAX(d-c, FFMIN(b-c, f-e))
    pmaxsw    m2, m5
    ; m3 = FFMIN(d-c, FFMAX(f-e, b-c))
    pminsw    m3, m5
    ; m2 = max = FFMAX(d-e, m2)
    pmaxsw    m2, m0
    ; m3 = min = FFMIN(d-e, m3)
    pminsw    m3, m0
    ; m4 = 0
    pxor      m4, m4
    ; m6 = FFMAX(diff, min)
    pmaxsw    m6, m3
    ; m4 = -max
    psubw     m4, m2
    ; m6 = diff = FFMAX(diff, min, -max)
    pmaxsw    m6, m4
    FILTER_TAIL
    ; r13m = w, kept in its argument slot; count down by the pixels done
    sub       DWORD r13m, mmsize/2
    jg        .loop0
%endmacro
%macro FILTER_MODE2 0
; Line loop for mode 2: FILTER_HEAD followed directly by FILTER_TAIL, so
; diff is used exactly as FILTER_HEAD computed it. Handles mmsize/2 pixels
; per iteration and always runs at least once.
.loop2:
    FILTER_HEAD
    FILTER_TAIL
    ; r13m = w, kept in its argument slot; count down by the pixels done
    sub       DWORD r13m, mmsize/2
    jg        .loop2
%endmacro
%macro YADIF_ADD3 0
; Advances the destination and all twelve source line pointers by the same
; fixed amount before the loop starts (start 3 pixels later).
; NOTE(review): presumably this keeps the widest tap, [tzeroq-3] and
; [bzeroq-3] in CHECK, from reading before the pointers passed in by the
; caller - confirm against the C caller.
    %assign %%skip 3

    add       dstq,   %%skip
    add       tzeroq, %%skip
    add       bzeroq, %%skip
    add       moneq,  %%skip
    add       mpq,    %%skip
    add       ttwoq,  %%skip
    add       btwoq,  %%skip
    add       tptwoq, %%skip
    add       bptwoq, %%skip
    add       ttoneq, %%skip
    add       ttpq,   %%skip
    add       bboneq, %%skip
    add       bbpq,   %%skip
%endmacro
; cglobal foo, 2,3,7,0x40, dst, src, tmp
; declares a function (foo) that automatically loads two arguments (dst and
; src) into registers, uses one additional register (tmp) plus 7 vector
; registers (m0-m6) and allocates 0x40 bytes of stack space.
%macro YADIF_MODE0 0
; Emits yadif_filter_line_mode0 for the currently selected instruction set.
; cglobal parameters: 13 arguments loaded into registers, 14 general
; registers, 8 vector registers (m0-m7), 80 bytes of stack.
; The 14th named argument (w) is not loaded; the loop accesses it as r13m.
cglobal yadif_filter_line_mode0, 13, 14, 8, 80, \
        dst, tzero, bzero, mone, mp, ttwo, btwo, \
        tptwo, bptwo, ttone, ttp, bbone, bbp, w
    YADIF_ADD3
    FILTER_MODE0
    RET
%endmacro
%macro YADIF_MODE2 0
; Emits yadif_filter_line_mode2 for the currently selected instruction set.
; cglobal parameters: 13 arguments loaded into registers, 14 general
; registers, 8 vector registers (m0-m7), 80 bytes of stack.
; The 14th named argument (w) is not loaded; the loop accesses it as r13m.
cglobal yadif_filter_line_mode2, 13, 14, 8, 80, \
        dst, tzero, bzero, mone, mp, ttwo, btwo, \
        tptwo, bptwo, ttone, ttp, bbone, bbp, w
    YADIF_ADD3
    FILTER_MODE2
    RET
%endmacro
; Instantiate both line filters twice: once with SSSE3 enabled (ABS1 then
; uses pabsw) and once for plain SSE2.

INIT_XMM ssse3
    YADIF_MODE0
    YADIF_MODE2

INIT_XMM sse2
    YADIF_MODE0
    YADIF_MODE2