mirror of
https://gitlab.freedesktop.org/gstreamer/gstreamer.git
synced 2025-01-21 22:58:16 +00:00
536ff4776f
Measured to be about 3.4x faster than C Part-of: <https://gitlab.freedesktop.org/gstreamer/gst-plugins-good/-/merge_requests/621>
410 lines
9.8 KiB
NASM
410 lines
9.8 KiB
NASM
;*****************************************************************************
|
|
;* x86-optimized functions for yadif filter
|
|
;* Copyright (C) 2020 Vivia Nikolaidou <vivia.nikolaidou@ltnglobal.com>
|
|
;*
|
|
;* Based on libav's vf_yadif.asm file
|
|
;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
|
|
;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
|
|
;*
|
|
;* This file is part of FFmpeg.
|
|
;*
|
|
;* FFmpeg is free software; you can redistribute it and/or
|
|
;* modify it under the terms of the GNU Lesser General Public
|
|
;* License as published by the Free Software Foundation; either
|
|
;* version 2.1 of the License, or (at your option) any later version.
|
|
;*
|
|
;* FFmpeg is distributed in the hope that it will be useful,
|
|
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
;* Lesser General Public License for more details.
|
|
;*
|
|
;* You should have received a copy of the GNU Lesser General Public
|
|
;* License along with FFmpeg; if not, write to the Free Software
|
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
;******************************************************************************
|
|
|
|
%include "x86inc.asm"
|
|
|
|
SECTION_RODATA
|
|
|
|
; 16 bytes of value 1
|
|
pb_1: times 16 db 1
|
|
; 8 words of value 1
|
|
pw_1: times 8 dw 1
|
|
|
|
SECTION .text
|
|
|
|
%macro ABS1 2
|
|
%if cpuflag(ssse3)
|
|
pabsw %1, %1
|
|
%elif cpuflag(mmxext) ; a, tmp
|
|
pxor %2, %2
|
|
psubw %2, %1
|
|
pmaxsw %1, %2
|
|
%else ; a, tmp
|
|
pxor %2, %2
|
|
pcmpgtw %2, %1
|
|
pxor %1, %2
|
|
psubw %1, %2
|
|
%endif
|
|
%endmacro
|
|
|
|
%macro CHECK 2
|
|
; %1 = 1+j, %2 = 1-j
|
|
; m2 = t0[x+1+j]
|
|
movu m2, [tzeroq+%1]
|
|
; m3 = b0[x+1-j]
|
|
movu m3, [bzeroq+%2]
|
|
; m4 = t0[x+1+j]
|
|
mova m4, m2
|
|
; m5 = t0[x+1+j]
|
|
mova m5, m2
|
|
; m4 = xor(t0[x+1+j], b0[x+1-j]
|
|
pxor m4, m3
|
|
pavgb m5, m3
|
|
; round down to 0
|
|
pand m4, [pb_1]
|
|
; m5 = rounded down average of the whole thing
|
|
psubusb m5, m4
|
|
; shift by 1 quadword to prepare for spatial_pred
|
|
psrldq m5, 1
|
|
; m7 = 0
|
|
; Interleave low-order bytes with 0
|
|
; so one pixel doesn't spill into the next one
|
|
punpcklbw m5, m7
|
|
; m4 = t0[x+1+j] (reset)
|
|
mova m4, m2
|
|
; m2 = t0[x+1+j] - b0[x+1-j]
|
|
psubusb m2, m3
|
|
; m3 = -m2
|
|
psubusb m3, m4
|
|
; m2 = FFABS(t0[x+1+j] - b0[x+1-j]);
|
|
pmaxub m2, m3
|
|
; m3 = FFABS(t0[x+1+j] - b0[x+1-j]);
|
|
mova m3, m2
|
|
; m4 = FFABS(FFABS(t0[x+1+j] - b0[x+1-j]);
|
|
mova m4, m2
|
|
; m3 = FFABS(t0[x+j] - b0[x-j])
|
|
psrldq m3, 1
|
|
; m4 = FFABS(t0[x-1+j] - b0[x-1-j])
|
|
psrldq m4, 2
|
|
; prevent pixel spilling for all of them
|
|
punpcklbw m2, m7
|
|
punpcklbw m3, m7
|
|
punpcklbw m4, m7
|
|
paddw m2, m3
|
|
; m2 = score
|
|
paddw m2, m4
|
|
%endmacro
|
|
|
|
%macro CHECK1 0
|
|
; m0 was spatial_score
|
|
; m1 was spatial_pred
|
|
mova m3, m0
|
|
; compare for greater than
|
|
; each word will be 1111 or 0000
|
|
pcmpgtw m3, m2
|
|
; if (score < spatial_score) spatial_score = score;
|
|
pminsw m0, m2
|
|
; m6 = the mask
|
|
mova m6, m3
|
|
; m5 = becomes 0 if it should change
|
|
pand m5, m3
|
|
; nand: m3 = becomes 0 if it should not change
|
|
pandn m3, m1
|
|
; m3 = put them together in an OR
|
|
por m3, m5
|
|
; and put it in spatial_pred
|
|
mova m1, m3
|
|
%endmacro
|
|
|
|
%macro CHECK2 0
|
|
; m6 was the mask from CHECK1 (we don't change it)
|
|
paddw m6, [pw_1]
|
|
; shift words left while shifting in 14 0s (16 - j)
|
|
; essentially to not recalculate the mask!
|
|
psllw m6, 14
|
|
; add it to score
|
|
paddsw m2, m6
|
|
; same as CHECK1
|
|
mova m3, m0
|
|
pcmpgtw m3, m2
|
|
pminsw m0, m2
|
|
pand m5, m3
|
|
pandn m3, m1
|
|
por m3, m5
|
|
mova m1, m3
|
|
%endmacro
|
|
|
|
%macro LOAD 2
|
|
movh %1, %2
|
|
punpcklbw %1, m7
|
|
%endmacro
|
|
|
|
%macro FILTER_HEAD 0
|
|
; m7 = 0
|
|
pxor m7, m7
|
|
; m0 = c
|
|
LOAD m0, [tzeroq]
|
|
; m1 = e
|
|
LOAD m1, [bzeroq]
|
|
; m3 = mp
|
|
LOAD m3, [mpq]
|
|
; m2 = m1
|
|
LOAD m2, [moneq]
|
|
; m4 = mp
|
|
mova m4, m3
|
|
; m3 = m1 + mp
|
|
paddw m3, m2
|
|
; m3 = d
|
|
psraw m3, 1
|
|
; rsp + 0 = d
|
|
mova [rsp+ 0], m3
|
|
; m2 = m1 - mp
|
|
psubw m2, m4
|
|
; m2 = temporal_diff0 (m4 is temporary)
|
|
ABS1 m2, m4
|
|
; m3 = t2
|
|
LOAD m3, [ttwoq]
|
|
; m4 = b2
|
|
LOAD m4, [btwoq]
|
|
; m3 = t2 - c
|
|
psubw m3, m0
|
|
; m4 = b2 - e
|
|
psubw m4, m1
|
|
; m3 = ABS(t2 - c)
|
|
ABS1 m3, m5
|
|
; m4 = ABS(b2 - e)
|
|
ABS1 m4, m5
|
|
paddw m3, m4
|
|
psrlw m2, 1
|
|
; m3 = temporal_diff1
|
|
psrlw m3, 1
|
|
; m2 = left part of diff
|
|
pmaxsw m2, m3
|
|
; m3 = tp2
|
|
LOAD m3, [tptwoq]
|
|
; m4 = bp2
|
|
LOAD m4, [bptwoq]
|
|
psubw m3, m0
|
|
psubw m4, m1
|
|
ABS1 m3, m5
|
|
ABS1 m4, m5
|
|
paddw m3, m4
|
|
; m3 = temporal_diff2
|
|
psrlw m3, 1
|
|
; m2 = diff (for real)
|
|
pmaxsw m2, m3
|
|
; rsp + 16 = diff
|
|
mova [rsp+16], m2
|
|
|
|
; m1 = e + c
|
|
paddw m1, m0
|
|
; m0 = 2c
|
|
paddw m0, m0
|
|
; m0 = c - e
|
|
psubw m0, m1
|
|
; m1 = spatial_pred
|
|
psrlw m1, 1
|
|
; m0 = FFABS(c-e)
|
|
ABS1 m0, m2
|
|
|
|
; m2 = t0[x-1]
|
|
; if it's unpacked it should contain 4 bytes
|
|
movu m2, [tzeroq-1]
|
|
; m3 = b0[x-1]
|
|
movu m3, [bzeroq-1]
|
|
; m4 = t0[x-1]
|
|
mova m4, m2
|
|
; m2 = t0[x-1]-b0[x-1] unsigned packed
|
|
psubusb m2, m3
|
|
; m3 = m3 - m4 = b0[x-1]-t0[x-1] = -m2 unsigned packed
|
|
psubusb m3, m4
|
|
; m2 = max(m2, -m2) = abs(t0[x-1]-b0[x-1])
|
|
pmaxub m2, m3
|
|
%if mmsize == 16
|
|
; m3 = m2 >> 2quadwords
|
|
; pixel jump: go from x-1 to x+1
|
|
mova m3, m2
|
|
psrldq m3, 2
|
|
%else
|
|
pshufw m3, m2, q0021
|
|
%endif
|
|
; m7 = 0
|
|
; unpack and interleave low-order bytes
|
|
; to prevent pixel spilling when adding
|
|
punpcklbw m2, m7
|
|
punpcklbw m3, m7
|
|
paddw m0, m2
|
|
paddw m0, m3
|
|
; m0 = spatial_score
|
|
psubw m0, [pw_1]
|
|
|
|
CHECK -2, 0
|
|
CHECK1
|
|
CHECK -3, 1
|
|
CHECK2
|
|
CHECK 0, -2
|
|
CHECK1
|
|
CHECK 1, -3
|
|
CHECK2
|
|
; now m0 = spatial_score, m1 = spatial_pred
|
|
|
|
; m6 = diff
|
|
mova m6, [rsp+16]
|
|
%endmacro
|
|
|
|
%macro FILTER_TAIL 0
|
|
; m2 = d
|
|
mova m2, [rsp]
|
|
; m3 = d
|
|
mova m3, m2
|
|
; m2 = d - diff
|
|
psubw m2, m6
|
|
; m3 = d + diff
|
|
paddw m3, m6
|
|
; m1 = max(spatial_pred, d-diff)
|
|
pmaxsw m1, m2
|
|
; m1 = min(d + diff, max(spatial_pred, d-diff))
|
|
; m1 = spatial_pred
|
|
pminsw m1, m3
|
|
; Converts 8 signed word integers into 16 unsigned byte integers with saturation
|
|
packuswb m1, m1
|
|
|
|
; dst = spatial_pred
|
|
movh [dstq], m1
|
|
; half the register size
|
|
add dstq, mmsize/2
|
|
add tzeroq, mmsize/2
|
|
add bzeroq, mmsize/2
|
|
add moneq, mmsize/2
|
|
add mpq, mmsize/2
|
|
add ttwoq, mmsize/2
|
|
add btwoq, mmsize/2
|
|
add tptwoq, mmsize/2
|
|
add bptwoq, mmsize/2
|
|
add ttoneq, mmsize/2
|
|
add ttpq, mmsize/2
|
|
add bboneq, mmsize/2
|
|
add bbpq, mmsize/2
|
|
%endmacro
|
|
|
|
%macro FILTER_MODE0 0
|
|
.loop0:
|
|
FILTER_HEAD
|
|
; m2 = tt1
|
|
LOAD m2, [ttoneq]
|
|
; m4 = ttp
|
|
LOAD m4, [ttpq]
|
|
; m3 = bb1
|
|
LOAD m3, [bboneq]
|
|
; m5 = bbp
|
|
LOAD m5, [bbpq]
|
|
paddw m2, m4
|
|
paddw m3, m5
|
|
; m2 = b
|
|
psrlw m2, 1
|
|
; m3 = f
|
|
psrlw m3, 1
|
|
; m4 = c
|
|
LOAD m4, [tzeroq]
|
|
; m5 = d
|
|
mova m5, [rsp]
|
|
; m7 = e
|
|
LOAD m7, [bzeroq]
|
|
; m2 = b - c
|
|
psubw m2, m4
|
|
; m3 = f - e
|
|
psubw m3, m7
|
|
; m0 = d
|
|
mova m0, m5
|
|
; m5 = d - c
|
|
psubw m5, m4
|
|
; m0 = d - e
|
|
psubw m0, m7
|
|
; m4 = b - c
|
|
mova m4, m2
|
|
; m2 = FFMIN(b-c, f-e)
|
|
pminsw m2, m3
|
|
; m3 = FFMAX(f-e, b-c)
|
|
pmaxsw m3, m4
|
|
; m2 = FFMAX(d-c, FFMIN(b-c, f-e))
|
|
pmaxsw m2, m5
|
|
; m3 = FFMIN(d-c, FFMAX(f-e, b-c))
|
|
pminsw m3, m5
|
|
; m2 = max
|
|
pmaxsw m2, m0
|
|
; m3 = min
|
|
pminsw m3, m0
|
|
; m4 = 0
|
|
pxor m4, m4
|
|
; m6 = MAX(diff, min)
|
|
pmaxsw m6, m3
|
|
; m4 = -max
|
|
psubw m4, m2
|
|
; m6 = diff
|
|
pmaxsw m6, m4
|
|
|
|
FILTER_TAIL
|
|
; r13m = w
|
|
sub DWORD r13m, mmsize/2
|
|
jg .loop0
|
|
%endmacro
|
|
|
|
%macro FILTER_MODE2 0
|
|
.loop2:
|
|
FILTER_HEAD
|
|
FILTER_TAIL
|
|
; r13m = w
|
|
sub DWORD r13m, mmsize/2
|
|
jg .loop2
|
|
%endmacro
|
|
|
|
%macro YADIF_ADD3 0
|
|
; start 3 pixels later
|
|
add dstq, 3
|
|
add tzeroq, 3
|
|
add bzeroq, 3
|
|
add moneq, 3
|
|
add mpq, 3
|
|
add ttwoq, 3
|
|
add btwoq, 3
|
|
add tptwoq, 3
|
|
add bptwoq, 3
|
|
add ttoneq, 3
|
|
add ttpq, 3
|
|
add bboneq, 3
|
|
add bbpq, 3
|
|
%endmacro
|
|
|
|
; cglobal foo, 2,3,7,0x40, dst, src, tmp
|
|
; declares a function (foo) that automatically loads two arguments (dst and
|
|
; src) into registers, uses one additional register (tmp) plus 7 vector
|
|
; registers (m0-m6) and allocates 0x40 bytes of stack space.
|
|
%macro YADIF_MODE0 0
|
|
cglobal yadif_filter_line_mode0, 13, 14, 8, 80, dst, tzero, bzero, mone, mp, \
|
|
ttwo, btwo, tptwo, bptwo, ttone, \
|
|
ttp, bbone, bbp, w
|
|
|
|
YADIF_ADD3
|
|
FILTER_MODE0
|
|
RET
|
|
%endmacro
|
|
|
|
%macro YADIF_MODE2 0
|
|
cglobal yadif_filter_line_mode2, 13, 14, 8, 80, dst, tzero, bzero, mone, mp, \
|
|
ttwo, btwo, tptwo, bptwo, ttone, \
|
|
ttp, bbone, bbp, w
|
|
|
|
YADIF_ADD3
|
|
FILTER_MODE2
|
|
RET
|
|
%endmacro
|
|
|
|
; declares two functions for ssse3, and two for sse2
|
|
INIT_XMM ssse3
|
|
YADIF_MODE0
|
|
YADIF_MODE2
|
|
INIT_XMM sse2
|
|
YADIF_MODE0
|
|
YADIF_MODE2
|