mirror of
https://gitlab.freedesktop.org/gstreamer/gstreamer.git
synced 2025-01-24 08:08:22 +00:00
deinterlace: Add yadif ASM optimisations
Measured to be about 3.4x faster than C. Part-of: <https://gitlab.freedesktop.org/gstreamer/gst-plugins-good/-/merge_requests/621>
This commit is contained in:
parent
ef78014d15
commit
536ff4776f
7 changed files with 2411 additions and 40 deletions
|
@ -33,8 +33,60 @@ else
|
|||
copy : true)
|
||||
endif
|
||||
|
||||
# Objects produced by nasm for the deinterlace plugin; stays an empty list
# when nasm is not available.
asm_gen_objs = []
|
||||
if have_nasm
|
||||
# Name every object after its source file, with the host's object suffix.
if host_system == 'windows'
|
||||
outputname = '@PLAINNAME@.obj'
|
||||
else
|
||||
outputname = '@PLAINNAME@.o'
|
||||
endif
|
||||
|
||||
# Define PIC for the assembler when the b_staticpic option is enabled,
# explicitly undefine it otherwise.
if get_option('b_staticpic')
|
||||
asm_pic_def = '-DPIC'
|
||||
else
|
||||
asm_pic_def = '-UPIC'
|
||||
endif
|
||||
|
||||
# Assembly has to be told when the symbols have to be prefixed with _
|
||||
if cc.symbols_have_underscore_prefix()
|
||||
asm_prefix_def = '-DPREFIX'
|
||||
else
|
||||
asm_prefix_def = '-UPREFIX'
|
||||
endif
|
||||
|
||||
asm_arch_def = '-DARCH_X86_64=1'
|
||||
# Pick the nasm output format (-f) for the host system.
if host_system == 'windows'
|
||||
asm_outformat = 'win64'
|
||||
elif ['darwin', 'ios'].contains(host_system)
|
||||
asm_outformat = 'macho64'
|
||||
elif host_system.endswith('bsd')
|
||||
# NOTE(review): nasm's aoutb is the BSD a.out format - confirm it can be
# used for the x86-64 code assembled here.
asm_outformat = 'aoutb'
|
||||
else
|
||||
asm_outformat = 'elf64'
|
||||
endif
|
||||
# NOTE(review): x86inc.asm is %include'd by yadif.asm and is also listed as
# an input here, so it is assembled on its own as well - confirm this is
# intended.
asm_x = files('x86/yadif.asm',
|
||||
'x86/x86inc.asm')
|
||||
|
||||
asm_stackalign_def = '-DSTACK_ALIGNMENT=64'
|
||||
asm_incdir = 'x86'
|
||||
|
||||
message('Nasm configured on x86-64')
|
||||
# One nasm invocation per input file: include paths, the defines chosen
# above, the output format, then the output and input file names.
asm_gen = generator(nasm,
|
||||
output: outputname,
|
||||
arguments: ['-I@CURRENT_SOURCE_DIR@',
|
||||
'-I@CURRENT_SOURCE_DIR@/@0@/'.format(asm_incdir),
|
||||
asm_arch_def,
|
||||
asm_stackalign_def,
|
||||
asm_pic_def,
|
||||
asm_prefix_def,
|
||||
'-f', asm_outformat,
|
||||
'-o', '@OUTPUT@',
|
||||
'@INPUT@'])
|
||||
asm_gen_objs = asm_gen.process(asm_x)
|
||||
endif
|
||||
|
||||
gstdeinterlace = library('gstdeinterlace',
|
||||
interlace_sources, orc_c, orc_h,
|
||||
interlace_sources, asm_gen_objs, orc_c, orc_h,
|
||||
c_args : gst_plugins_good_args,
|
||||
include_directories : [configinc],
|
||||
dependencies : [orc_dep, gstbase_dep, gstvideo_dep],
|
||||
|
|
1701
gst/deinterlace/x86/x86inc.asm
Normal file
1701
gst/deinterlace/x86/x86inc.asm
Normal file
File diff suppressed because it is too large
Load diff
410
gst/deinterlace/x86/yadif.asm
Normal file
410
gst/deinterlace/x86/yadif.asm
Normal file
|
@ -0,0 +1,410 @@
|
|||
;*****************************************************************************
|
||||
;* x86-optimized functions for yadif filter
|
||||
;* Copyright (C) 2020 Vivia Nikolaidou <vivia.nikolaidou@ltnglobal.com>
|
||||
;*
|
||||
;* Based on libav's vf_yadif.asm file
|
||||
;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
|
||||
;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "x86inc.asm"
|
||||
|
||||
SECTION_RODATA
|
||||
|
||||
; 16 bytes of value 1
|
||||
pb_1: times 16 db 1
|
||||
; 8 words of value 1
|
||||
pw_1: times 8 dw 1
|
||||
|
||||
SECTION .text
|
||||
|
||||
; ABS1 a, tmp: replace every word of %1 with its absolute value.
; %2 is a scratch register; it is overwritten on the two non-SSSE3 paths.
%macro ABS1 2
|
||||
%if cpuflag(ssse3)
|
||||
pabsw %1, %1
|
||||
%elif cpuflag(mmxext) ; a, tmp
|
||||
; tmp = 0 - a, then a = max(a, -a)
pxor %2, %2
|
||||
psubw %2, %1
|
||||
pmaxsw %1, %2
|
||||
%else ; a, tmp
|
||||
; tmp = 0xFFFF where a < 0, else 0; a = (a xor tmp) - tmp
pxor %2, %2
|
||||
pcmpgtw %2, %1
|
||||
pxor %1, %2
|
||||
psubw %1, %2
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; CHECK %1, %2: evaluate one diagonal direction j.
; Leaves the candidate score in m2 and the candidate prediction in m5, both
; as words. Needs m7 = 0; overwrites m3 and m4.
%macro CHECK 2
|
||||
; %1 = -1+j, %2 = -1-j: byte offsets of the leftmost of the three pixel pairs
|
||||
; m2 = t0[x-1+j] (byte k of the register holds t0[x-1+j+k])
|
||||
movu m2, [tzeroq+%1]
|
||||
; m3 = b0[x-1-j]
|
||||
movu m3, [bzeroq+%2]
|
||||
; m4 = t0[x-1+j]
|
||||
mova m4, m2
|
||||
; m5 = t0[x-1+j]
|
||||
mova m5, m2
|
||||
; m4 = xor(t0[x-1+j], b0[x-1-j])
|
||||
pxor m4, m3
|
||||
; m5 = byte average of t0 and b0, rounded up
pavgb m5, m3
|
||||
; keep only the lowest bit of the xor: 1 where pavgb rounded up
|
||||
pand m4, [pb_1]
|
||||
; m5 = rounded down average of the whole thing
|
||||
psubusb m5, m4
|
||||
; shift right by 1 byte so that the low bytes hold
; (t0[x+j] + b0[x-j]) >> 1, the candidate spatial_pred
|
||||
psrldq m5, 1
|
||||
; m7 = 0
|
||||
; Interleave low-order bytes with 0
|
||||
; so one pixel doesn't spill into the next one
|
||||
punpcklbw m5, m7
|
||||
; m4 = t0[x-1+j] (reset)
|
||||
mova m4, m2
|
||||
; m2 = t0[x-1+j] - b0[x-1-j] (unsigned saturating, 0 where negative)
|
||||
psubusb m2, m3
|
||||
; m3 = b0[x-1-j] - t0[x-1+j] (unsigned saturating, 0 where negative)
|
||||
psubusb m3, m4
|
||||
; m2 = FFABS(t0[x-1+j] - b0[x-1-j]);
|
||||
pmaxub m2, m3
|
||||
; m3 = FFABS(t0[x-1+j] - b0[x-1-j]);
|
||||
mova m3, m2
|
||||
; m4 = FFABS(t0[x-1+j] - b0[x-1-j]);
|
||||
mova m4, m2
|
||||
; m3 = FFABS(t0[x+j] - b0[x-j])
|
||||
psrldq m3, 1
|
||||
; m4 = FFABS(t0[x+1+j] - b0[x+1-j])
|
||||
psrldq m4, 2
|
||||
; prevent pixel spilling for all of them
|
||||
punpcklbw m2, m7
|
||||
punpcklbw m3, m7
|
||||
punpcklbw m4, m7
|
||||
paddw m2, m3
|
||||
; m2 = score
|
||||
paddw m2, m4
|
||||
%endmacro
|
||||
|
||||
; CHECK1: in every lane where the score in m2 is lower than spatial_score (m0),
; take that score and the prediction in m5. The comparison mask is left in m6.
; Overwrites m3 and m5.
%macro CHECK1 0
|
||||
; m0 was spatial_score
|
||||
; m1 was spatial_pred
|
||||
mova m3, m0
|
||||
; compare for greater than
|
||||
; each word will be 0xFFFF (spatial_score > score) or 0x0000
|
||||
pcmpgtw m3, m2
|
||||
; if (score < spatial_score) spatial_score = score;
|
||||
pminsw m0, m2
|
||||
; m6 = the mask
|
||||
mova m6, m3
|
||||
; m5 = new prediction where it should change, 0 where it should not
|
||||
pand m5, m3
|
||||
; pandn: m3 = (not mask) and m1 = old spatial_pred where it should not
; change, 0 where it should
|
||||
pandn m3, m1
|
||||
; m3 = put them together in an OR
|
||||
por m3, m5
|
||||
; and put it in spatial_pred
|
||||
mova m1, m3
|
||||
%endmacro
|
||||
|
||||
; CHECK2: same update as CHECK1, but restricted to the lanes in which the
; preceding CHECK1 lowered the score (mask still in m6).
; Overwrites m2, m3, m5 and m6.
%macro CHECK2 0
|
||||
; m6 = mask from CHECK1 (0xFFFF where it lowered the score); adding 1 gives
; 0 in those lanes and 1 in all others
|
||||
paddw m6, [pw_1]
|
||||
; shift each word left by 14: m6 = 0x4000 where CHECK1 did not lower the
; score, 0 where it did
|
||||
; essentially to not recalculate the mask!
|
||||
psllw m6, 14
|
||||
; add it to score (signed saturation): lanes rejected by CHECK1 get a score
; of at least 0x4000, so they do not pass the comparison below
|
||||
paddsw m2, m6
|
||||
; same as CHECK1
|
||||
mova m3, m0
|
||||
pcmpgtw m3, m2
|
||||
pminsw m0, m2
|
||||
pand m5, m3
|
||||
pandn m3, m1
|
||||
por m3, m5
|
||||
mova m1, m3
|
||||
%endmacro
|
||||
|
||||
; LOAD dst, src: load half a register of bytes from src and widen them to
; words by interleaving with m7. This zero-extends only while m7 = 0, so dst
; must not be m7 itself.
%macro LOAD 2
|
||||
movh %1, %2
|
||||
punpcklbw %1, m7
|
||||
%endmacro
|
||||
|
||||
; FILTER_HEAD: first half of one loop iteration, shared by both modes.
; For the mmsize/2 pixels at the current pointers it leaves
;   m0 = spatial_score, m1 = spatial_pred, m6 = diff, m7 = 0,
;   [rsp] = d and [rsp+16] = diff (all as words).
%macro FILTER_HEAD 0
|
||||
; m7 = 0
|
||||
pxor m7, m7
|
||||
; m0 = c
|
||||
LOAD m0, [tzeroq]
|
||||
; m1 = e
|
||||
LOAD m1, [bzeroq]
|
||||
; m3 = mp
|
||||
LOAD m3, [mpq]
|
||||
; m2 = mone (the scanline, not register m1)
|
||||
LOAD m2, [moneq]
|
||||
; m4 = mp
|
||||
mova m4, m3
|
||||
; m3 = mone + mp
|
||||
paddw m3, m2
|
||||
; m3 = d
|
||||
psraw m3, 1
|
||||
; rsp + 0 = d
|
||||
mova [rsp+ 0], m3
|
||||
; m2 = mone - mp
|
||||
psubw m2, m4
|
||||
; m2 = temporal_diff0 (m4 is temporary)
|
||||
ABS1 m2, m4
|
||||
; m3 = t2
|
||||
LOAD m3, [ttwoq]
|
||||
; m4 = b2
|
||||
LOAD m4, [btwoq]
|
||||
; m3 = t2 - c
|
||||
psubw m3, m0
|
||||
; m4 = b2 - e
|
||||
psubw m4, m1
|
||||
; m3 = ABS(t2 - c)
|
||||
ABS1 m3, m5
|
||||
; m4 = ABS(b2 - e)
|
||||
ABS1 m4, m5
|
||||
paddw m3, m4
|
||||
; m2 = temporal_diff0 >> 1
psrlw m2, 1
|
||||
; m3 = temporal_diff1
|
||||
psrlw m3, 1
|
||||
; m2 = max(temporal_diff0 >> 1, temporal_diff1)
|
||||
pmaxsw m2, m3
|
||||
; m3 = tp2
|
||||
LOAD m3, [tptwoq]
|
||||
; m4 = bp2
|
||||
LOAD m4, [bptwoq]
|
||||
psubw m3, m0
|
||||
psubw m4, m1
|
||||
ABS1 m3, m5
|
||||
ABS1 m4, m5
|
||||
paddw m3, m4
|
||||
; m3 = temporal_diff2
|
||||
psrlw m3, 1
|
||||
; m2 = diff = max(temporal_diff0 >> 1, temporal_diff1, temporal_diff2)
|
||||
pmaxsw m2, m3
|
||||
; rsp + 16 = diff
|
||||
mova [rsp+16], m2
|
||||
|
||||
; m1 = e + c
|
||||
paddw m1, m0
|
||||
; m0 = 2c
|
||||
paddw m0, m0
|
||||
; m0 = c - e
|
||||
psubw m0, m1
|
||||
; m1 = spatial_pred
|
||||
psrlw m1, 1
|
||||
; m0 = FFABS(c-e)
|
||||
ABS1 m0, m2
|
||||
|
||||
; m2 = t0[x-1]
|
||||
; unaligned load of a full register of bytes, starting one pixel to the left
|
||||
movu m2, [tzeroq-1]
|
||||
; m3 = b0[x-1]
|
||||
movu m3, [bzeroq-1]
|
||||
; m4 = t0[x-1]
|
||||
mova m4, m2
|
||||
; m2 = t0[x-1]-b0[x-1] unsigned packed
|
||||
psubusb m2, m3
|
||||
; m3 = m3 - m4 = b0[x-1]-t0[x-1] = -m2 unsigned packed
|
||||
psubusb m3, m4
|
||||
; m2 = max(m2, -m2) = abs(t0[x-1]-b0[x-1])
|
||||
pmaxub m2, m3
|
||||
%if mmsize == 16
|
||||
; m3 = m2 shifted right by 2 bytes
|
||||
; pixel jump: go from x-1 to x+1
|
||||
mova m3, m2
|
||||
psrldq m3, 2
|
||||
%else
|
||||
pshufw m3, m2, q0021
|
||||
%endif
|
||||
; m7 = 0
|
||||
; unpack and interleave low-order bytes
|
||||
; to prevent pixel spilling when adding
|
||||
punpcklbw m2, m7
|
||||
punpcklbw m3, m7
|
||||
paddw m0, m2
|
||||
paddw m0, m3
|
||||
; m0 = spatial_score
;    = abs(t0[x-1]-b0[x-1]) + abs(c-e) + abs(t0[x+1]-b0[x+1]) - 1
|
||||
psubw m0, [pw_1]
|
||||
|
||||
; directions j = -1, -2, 1, 2; each CHECK2 only applies where the CHECK1
; before it lowered the score
CHECK -2, 0
|
||||
CHECK1
|
||||
CHECK -3, 1
|
||||
CHECK2
|
||||
CHECK 0, -2
|
||||
CHECK1
|
||||
CHECK 1, -3
|
||||
CHECK2
|
||||
; now m0 = spatial_score, m1 = spatial_pred
|
||||
|
||||
; m6 = diff
|
||||
mova m6, [rsp+16]
|
||||
%endmacro
|
||||
|
||||
; FILTER_TAIL: second half of one loop iteration.
; Clamps spatial_pred (m1) to [d - diff, d + diff] with d taken from [rsp] and
; diff from m6, stores mmsize/2 output bytes and advances every pointer by
; mmsize/2.
%macro FILTER_TAIL 0
|
||||
; m2 = d
|
||||
mova m2, [rsp]
|
||||
; m3 = d
|
||||
mova m3, m2
|
||||
; m2 = d - diff
|
||||
psubw m2, m6
|
||||
; m3 = d + diff
|
||||
paddw m3, m6
|
||||
; m1 = max(spatial_pred, d-diff)
|
||||
pmaxsw m1, m2
|
||||
; m1 = min(d + diff, max(spatial_pred, d-diff))
|
||||
; m1 = spatial_pred
|
||||
pminsw m1, m3
|
||||
; Pack the signed words to unsigned bytes with saturation; source and
; destination are the same register, so both halves hold the same pixels
|
||||
packuswb m1, m1
|
||||
|
||||
; dst = spatial_pred
|
||||
movh [dstq], m1
|
||||
; half the register size
|
||||
add dstq, mmsize/2
|
||||
add tzeroq, mmsize/2
|
||||
add bzeroq, mmsize/2
|
||||
add moneq, mmsize/2
|
||||
add mpq, mmsize/2
|
||||
add ttwoq, mmsize/2
|
||||
add btwoq, mmsize/2
|
||||
add tptwoq, mmsize/2
|
||||
add bptwoq, mmsize/2
|
||||
add ttoneq, mmsize/2
|
||||
add ttpq, mmsize/2
|
||||
add bboneq, mmsize/2
|
||||
add bbpq, mmsize/2
|
||||
%endmacro
|
||||
|
||||
; FILTER_MODE0: loop body for mode 0 (with the extra spatial interlacing check).
; Runs FILTER_HEAD, widens diff (m6) to FFMAX3(diff, min, -max) using the
; tt1/ttp and bb1/bbp scanlines, then runs FILTER_TAIL. The loop repeats
; until the width counter in r13m is no longer positive.
%macro FILTER_MODE0 0
|
||||
.loop0:
|
||||
FILTER_HEAD
|
||||
; m2 = tt1
|
||||
LOAD m2, [ttoneq]
|
||||
; m4 = ttp
|
||||
LOAD m4, [ttpq]
|
||||
; m3 = bb1
|
||||
LOAD m3, [bboneq]
|
||||
; m5 = bbp
|
||||
LOAD m5, [bbpq]
|
||||
paddw m2, m4
|
||||
paddw m3, m5
|
||||
; m2 = b
|
||||
psrlw m2, 1
|
||||
; m3 = f
|
||||
psrlw m3, 1
|
||||
; m4 = c
|
||||
LOAD m4, [tzeroq]
|
||||
; m5 = d
|
||||
mova m5, [rsp]
|
||||
; m0 = e. LOAD zero-extends by interleaving with m7, which must still be 0,
; so e cannot be loaded straight into m7 (that would interleave e with
; itself). spatial_score in m0 is no longer needed at this point.
|
||||
LOAD m0, [bzeroq]
; m7 = e
mova m7, m0
|
||||
; m2 = b - c
|
||||
psubw m2, m4
|
||||
; m3 = f - e
|
||||
psubw m3, m7
|
||||
; m0 = d
|
||||
mova m0, m5
|
||||
; m5 = d - c
|
||||
psubw m5, m4
|
||||
; m0 = d - e
|
||||
psubw m0, m7
|
||||
; m4 = b - c
|
||||
mova m4, m2
|
||||
; m2 = FFMIN(b-c, f-e)
|
||||
pminsw m2, m3
|
||||
; m3 = FFMAX(f-e, b-c)
|
||||
pmaxsw m3, m4
|
||||
; m2 = FFMAX(d-c, FFMIN(b-c, f-e))
|
||||
pmaxsw m2, m5
|
||||
; m3 = FFMIN(d-c, FFMAX(f-e, b-c))
|
||||
pminsw m3, m5
|
||||
; m2 = max
|
||||
pmaxsw m2, m0
|
||||
; m3 = min
|
||||
pminsw m3, m0
|
||||
; m4 = 0
|
||||
pxor m4, m4
|
||||
; m6 = MAX(diff, min)
|
||||
pmaxsw m6, m3
|
||||
; m4 = -max
|
||||
psubw m4, m2
|
||||
; m6 = diff = FFMAX3(diff, min, -max)
|
||||
pmaxsw m6, m4
|
||||
|
||||
FILTER_TAIL
|
||||
; r13m = w: count down by the number of pixels written per iteration
|
||||
sub DWORD r13m, mmsize/2
|
||||
jg .loop0
|
||||
%endmacro
|
||||
|
||||
; FILTER_MODE2: loop body for mode 2. diff from FILTER_HEAD is used unchanged,
; so the tt1/ttp/bb1/bbp scanlines are never read here. The loop repeats
; until the width counter in r13m is no longer positive.
%macro FILTER_MODE2 0
|
||||
.loop2:
|
||||
FILTER_HEAD
|
||||
FILTER_TAIL
|
||||
; r13m = w: count down by the number of pixels written per iteration
|
||||
sub DWORD r13m, mmsize/2
|
||||
jg .loop2
|
||||
%endmacro
|
||||
|
||||
; YADIF_ADD3: advance the destination and all twelve source pointers by 3
; bytes before the loop starts.
%macro YADIF_ADD3 0
|
||||
; start 3 pixels later
|
||||
add dstq, 3
|
||||
add tzeroq, 3
|
||||
add bzeroq, 3
|
||||
add moneq, 3
|
||||
add mpq, 3
|
||||
add ttwoq, 3
|
||||
add btwoq, 3
|
||||
add tptwoq, 3
|
||||
add bptwoq, 3
|
||||
add ttoneq, 3
|
||||
add ttpq, 3
|
||||
add bboneq, 3
|
||||
add bbpq, 3
|
||||
%endmacro
|
||||
|
||||
; cglobal foo, 2,3,7,0x40, dst, src, tmp
|
||||
; declares a function (foo) that automatically loads two arguments (dst and
|
||||
; src) into registers, uses one additional register (tmp) plus 7 vector
|
||||
; registers (m0-m6) and allocates 0x40 bytes of stack space.
|
||||
; Below: 13 arguments are loaded into registers, 14 general purpose and 8
; vector registers are declared and 80 bytes of stack are reserved. w, the
; 14th argument, is not loaded; the loop decrements it in place through r13m.
%macro YADIF_MODE0 0
|
||||
cglobal yadif_filter_line_mode0, 13, 14, 8, 80, dst, tzero, bzero, mone, mp, \
|
||||
ttwo, btwo, tptwo, bptwo, ttone, \
|
||||
ttp, bbone, bbp, w
|
||||
|
||||
YADIF_ADD3
|
||||
FILTER_MODE0
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
; YADIF_MODE2: declares yadif_filter_line_mode2 with the same argument list as
; the mode 0 function; the body skips 3 pixels and runs the mode 2 loop.
%macro YADIF_MODE2 0
|
||||
cglobal yadif_filter_line_mode2, 13, 14, 8, 80, dst, tzero, bzero, mone, mp, \
|
||||
ttwo, btwo, tptwo, bptwo, ttone, \
|
||||
ttp, bbone, bbp, w
|
||||
|
||||
YADIF_ADD3
|
||||
FILTER_MODE2
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
; declares two functions for ssse3, and two for sse2
; (with ssse3 enabled ABS1 uses pabsw, otherwise one of its fallback paths)
|
||||
INIT_XMM ssse3
|
||||
YADIF_MODE0
|
||||
YADIF_MODE2
|
||||
INIT_XMM sse2
|
||||
YADIF_MODE0
|
||||
YADIF_MODE2
|
|
@ -31,6 +31,7 @@
|
|||
#include <gst/gst.h>
|
||||
#ifdef HAVE_ORC
|
||||
#include <orc/orc.h>
|
||||
#include <orc/orcsse.h>
|
||||
#endif
|
||||
#include "gstdeinterlacemethod.h"
|
||||
#include "yadif.h"
|
||||
|
@ -86,6 +87,41 @@ static void
|
|||
filter_scanline_yadif_packed_3 (GstDeinterlaceSimpleMethod * self,
|
||||
guint8 * out, const GstDeinterlaceScanlineData * scanlines, guint size);
|
||||
|
||||
/* C line filters for planar data. They take the same argument list as the
 * filter_mode0 / filter_mode2 function pointers declared below. */
static void
|
||||
filter_line_c_planar_mode0 (void *ORC_RESTRICT dst,
|
||||
const void *ORC_RESTRICT tzero, const void *ORC_RESTRICT bzero,
|
||||
const void *ORC_RESTRICT mone, const void *ORC_RESTRICT mp,
|
||||
const void *ORC_RESTRICT ttwo, const void *ORC_RESTRICT btwo,
|
||||
const void *ORC_RESTRICT tptwo, const void *ORC_RESTRICT bptwo,
|
||||
const void *ORC_RESTRICT ttone, const void *ORC_RESTRICT ttp,
|
||||
const void *ORC_RESTRICT bbone, const void *ORC_RESTRICT bbp, int w);
|
||||
|
||||
static void
|
||||
filter_line_c_planar_mode2 (void *ORC_RESTRICT dst,
|
||||
const void *ORC_RESTRICT tzero, const void *ORC_RESTRICT bzero,
|
||||
const void *ORC_RESTRICT mone, const void *ORC_RESTRICT mp,
|
||||
const void *ORC_RESTRICT ttwo, const void *ORC_RESTRICT btwo,
|
||||
const void *ORC_RESTRICT tptwo, const void *ORC_RESTRICT bptwo,
|
||||
const void *ORC_RESTRICT ttone, const void *ORC_RESTRICT ttp,
|
||||
const void *ORC_RESTRICT bbone, const void *ORC_RESTRICT bbp, int w);
|
||||
|
||||
/* Line filter used when mode is 2; assigned in
 * gst_deinterlace_method_yadif_init(). */
static void (*filter_mode2) (void *ORC_RESTRICT dst,
|
||||
const void *ORC_RESTRICT tzero, const void *ORC_RESTRICT bzero,
|
||||
const void *ORC_RESTRICT mone, const void *ORC_RESTRICT mp,
|
||||
const void *ORC_RESTRICT ttwo, const void *ORC_RESTRICT btwo,
|
||||
const void *ORC_RESTRICT tptwo, const void *ORC_RESTRICT bptwo,
|
||||
const void *ORC_RESTRICT ttone, const void *ORC_RESTRICT ttp,
|
||||
const void *ORC_RESTRICT bbone, const void *ORC_RESTRICT bbp, int w);
|
||||
|
||||
/* Line filter used when mode is 0; assigned in
 * gst_deinterlace_method_yadif_init(). */
static void (*filter_mode0) (void *ORC_RESTRICT dst,
|
||||
const void *ORC_RESTRICT tzero, const void *ORC_RESTRICT bzero,
|
||||
const void *ORC_RESTRICT mone, const void *ORC_RESTRICT mp,
|
||||
const void *ORC_RESTRICT ttwo, const void *ORC_RESTRICT btwo,
|
||||
const void *ORC_RESTRICT tptwo, const void *ORC_RESTRICT bptwo,
|
||||
const void *ORC_RESTRICT ttone, const void *ORC_RESTRICT ttp,
|
||||
const void *ORC_RESTRICT bbone, const void *ORC_RESTRICT bbp, int w);
|
||||
|
||||
|
||||
static void
|
||||
copy_scanline (GstDeinterlaceSimpleMethod * self, guint8 * out,
|
||||
const GstDeinterlaceScanlineData * scanlines, guint size)
|
||||
|
@ -139,36 +175,31 @@ static void
|
|||
dism_class->interpolate_scanline_nv21 = filter_scanline_yadif_semiplanar;
|
||||
}
|
||||
|
||||
static void
|
||||
gst_deinterlace_method_yadif_init (GstDeinterlaceMethodYadif * self)
|
||||
{
|
||||
}
|
||||
|
||||
#define FFABS(a) ABS(a)
|
||||
#define FFMIN(a,b) MIN(a,b)
|
||||
#define FFMAX(a,b) MAX(a,b)
|
||||
#define FFMAX3(a,b,c) FFMAX(FFMAX(a,b),c)
|
||||
#define FFMIN3(a,b,c) FFMIN(FFMIN(a,b),c)
|
||||
|
||||
#define CHECK(j)\
|
||||
{ int score = FFABS(s->t0[x - colors2 * (1 + (j))] - s->b0[x - colors2 * (1 - (j))])\
|
||||
+ FFABS(s->t0[x + colors2 * (j)] - s->b0[x -colors2 * (j)])\
|
||||
+ FFABS(s->t0[x + colors2 * (1 + (j))] - s->b0[x + colors2 * (1 - (j))]);\
|
||||
#define CHECK(j1, j2, j3)\
|
||||
{ int score = FFABS(stzero[x - j1] - sbzero[x - j2])\
|
||||
+ FFABS(stzero[x + j3] - sbzero[x - j3])\
|
||||
+ FFABS(stzero[x + j1] - sbzero[x + j2]);\
|
||||
if (score < spatial_score) {\
|
||||
spatial_score= score;\
|
||||
spatial_pred= (s->t0[x +colors2 * ((j))] + s->b0[x - colors2 * (j)])>>1;\
|
||||
spatial_pred= (stzero[x + j3] + sbzero[x - j3])>>1;\
|
||||
|
||||
/* The is_not_edge argument here controls when the code will enter a branch
|
||||
* which reads up to and including x-3 and x+3. */
|
||||
|
||||
#define FILTER(start, end, is_not_edge) \
|
||||
for (x = start; x < end; x++) { \
|
||||
int c = s->t0[x]; \
|
||||
int d = (s->m1[x] + s->mp[x])>>1; \
|
||||
int e = s->b0[x]; \
|
||||
int temporal_diff0 = FFABS(s->m1[x] - s->mp[x]); \
|
||||
int temporal_diff1 =(FFABS(s->t2[x] - c) + FFABS(s->b2[x] - e) )>>1; \
|
||||
int temporal_diff2 =(FFABS(s->tp2[x] - c) + FFABS(s->bp2[x] - e) )>>1; \
|
||||
int c = stzero[x]; \
|
||||
int d = (smone[x] + smp[x])>>1; \
|
||||
int e = sbzero[x]; \
|
||||
int temporal_diff0 = FFABS(smone[x] - smp[x]); \
|
||||
int temporal_diff1 =(FFABS(sttwo[x] - c) + FFABS(sbtwo[x] - e) )>>1; \
|
||||
int temporal_diff2 =(FFABS(stptwo[x] - c) + FFABS(sbptwo[x] - e) )>>1; \
|
||||
int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); \
|
||||
int spatial_pred = (c+e) >> 1; \
|
||||
int colors2 = colors; \
|
||||
|
@ -177,15 +208,21 @@ gst_deinterlace_method_yadif_init (GstDeinterlaceMethodYadif * self)
|
|||
colors2 = 2; \
|
||||
\
|
||||
if (is_not_edge) {\
|
||||
int spatial_score = FFABS(s->t0[x-colors2] - s->b0[x-colors2]) + FFABS(c-e) \
|
||||
+ FFABS(s->t0[x+colors2] - s->b0[x+colors2]); \
|
||||
CHECK(-1) CHECK(-2) }} }} \
|
||||
CHECK( 1) CHECK( 2) }} }} \
|
||||
int spatial_score = FFABS(stzero[x-colors2] - sbzero[x-colors2]) + FFABS(c-e) \
|
||||
+ FFABS(stzero[x+colors2] - sbzero[x+colors2]); \
|
||||
int twice_colors2 = colors2 << 1; \
|
||||
int minus_colors2 = -colors2; \
|
||||
int thrice_colors2 = colors2 * 3; \
|
||||
int minus2_colors2 = colors2 * -2; \
|
||||
CHECK(0, twice_colors2, minus_colors2) \
|
||||
CHECK(-colors2, thrice_colors2, minus2_colors2) }} }} \
|
||||
CHECK(twice_colors2, 0, colors2) \
|
||||
CHECK(thrice_colors2, minus_colors2, twice_colors2) }} }} \
|
||||
}\
|
||||
\
|
||||
if (!(mode&2)) { \
|
||||
int b = (s->tt1[x] + s->ttp[x])>>1; \
|
||||
int f = (s->bb1[x] + s->bbp[x])>>1; \
|
||||
int b = (sttone[x] + sttp[x])>>1; \
|
||||
int f = (sbbone[x] + sbbp[x])>>1; \
|
||||
int max = FFMAX3(d - e, d - c, FFMIN(b - c, f - e)); \
|
||||
int min = FFMIN3(d - e, d - c, FFMAX(b - c, f - e)); \
|
||||
\
|
||||
|
@ -197,16 +234,20 @@ gst_deinterlace_method_yadif_init (GstDeinterlaceMethodYadif * self)
|
|||
else if (spatial_pred < d - diff) \
|
||||
spatial_pred = d - diff; \
|
||||
\
|
||||
dst[x] = spatial_pred; \
|
||||
sdst[x] = spatial_pred; \
|
||||
\
|
||||
}
|
||||
|
||||
ALWAYS_INLINE static void
|
||||
filter_line_c (guint8 * dst,
|
||||
const GstDeinterlaceScanlineData * s, int start, int end, int mode,
|
||||
int colors, int y_alternates_every)
|
||||
filter_line_c (guint8 * sdst, const guint8 * stzero, const guint8 * sbzero,
|
||||
const guint8 * smone, const guint8 * smp, const guint8 * sttwo,
|
||||
const guint8 * sbtwo, const guint8 * stptwo, const guint8 * sbptwo,
|
||||
const guint8 * sttone, const guint8 * sttp, const guint8 * sbbone,
|
||||
const guint8 * sbbp, int w, int colors, int y_alternates_every, int start,
|
||||
int end, int mode)
|
||||
{
|
||||
int x;
|
||||
|
||||
/* The function is called for processing the middle
|
||||
* pixels of each line, excluding 3 at each end.
|
||||
* This allows the FILTER macro to be
|
||||
|
@ -218,9 +259,74 @@ filter_line_c (guint8 * dst,
|
|||
#define MAX_ALIGN 8
|
||||
|
||||
ALWAYS_INLINE static void
|
||||
filter_edges (guint8 * dst,
|
||||
const GstDeinterlaceScanlineData * s, int w, int mode, const int bpp,
|
||||
const int colors, int y_alternates_every)
|
||||
filter_line_c_planar (void *ORC_RESTRICT dst, const void *ORC_RESTRICT tzero,
|
||||
const void *ORC_RESTRICT bzero, const void *ORC_RESTRICT mone,
|
||||
const void *ORC_RESTRICT mp, const void *ORC_RESTRICT ttwo,
|
||||
const void *ORC_RESTRICT btwo, const void *ORC_RESTRICT tptwo,
|
||||
const void *ORC_RESTRICT bptwo, const void *ORC_RESTRICT ttone,
|
||||
const void *ORC_RESTRICT ttp, const void *ORC_RESTRICT bbone,
|
||||
const void *ORC_RESTRICT bbp, int w, int mode)
|
||||
{
|
||||
int x;
|
||||
const int start = 0;
|
||||
const int colors = 1;
|
||||
const int y_alternates_every = 0;
|
||||
/* hardcode colors = 1, bpp = 1 */
|
||||
const int end = w;
|
||||
guint8 *sdst = (guint8 *) dst + 3;
|
||||
guint8 *stzero = (guint8 *) tzero + 3;
|
||||
guint8 *sbzero = (guint8 *) bzero + 3;
|
||||
guint8 *smone = (guint8 *) mone + 3;
|
||||
guint8 *smp = (guint8 *) mp + 3;
|
||||
guint8 *sttwo = (guint8 *) ttwo + 3;
|
||||
guint8 *sbtwo = (guint8 *) btwo + 3;
|
||||
guint8 *stptwo = (guint8 *) tptwo + 3;
|
||||
guint8 *sbptwo = (guint8 *) bptwo + 3;
|
||||
guint8 *sttone = (guint8 *) ttone + 3;
|
||||
guint8 *sttp = (guint8 *) ttp + 3;
|
||||
guint8 *sbbone = (guint8 *) bbone + 3;
|
||||
guint8 *sbbp = (guint8 *) bbp + 3;
|
||||
/* The function is called for processing the middle
|
||||
* pixels of each line, excluding 3 at each end.
|
||||
* This allows the FILTER macro to be
|
||||
* called so that it processes all the pixels normally. A constant value of
|
||||
* true for is_not_edge lets the compiler ignore the if statement. */
|
||||
FILTER (start, end, 1)
|
||||
}
|
||||
|
||||
/* C implementation of the mode 0 planar line filter: forwards all arguments
 * to filter_line_c_planar() with mode fixed to 0. Marked G_GNUC_UNUSED
 * because it is only referenced when the assembly versions are not built. */
ALWAYS_INLINE G_GNUC_UNUSED static void
|
||||
filter_line_c_planar_mode0 (void *ORC_RESTRICT dst,
|
||||
const void *ORC_RESTRICT tzero, const void *ORC_RESTRICT bzero,
|
||||
const void *ORC_RESTRICT mone, const void *ORC_RESTRICT mp,
|
||||
const void *ORC_RESTRICT ttwo, const void *ORC_RESTRICT btwo,
|
||||
const void *ORC_RESTRICT tptwo, const void *ORC_RESTRICT bptwo,
|
||||
const void *ORC_RESTRICT ttone, const void *ORC_RESTRICT ttp,
|
||||
const void *ORC_RESTRICT bbone, const void *ORC_RESTRICT bbp, int w)
|
||||
{
|
||||
filter_line_c_planar (dst, tzero, bzero, mone, mp, ttwo, btwo, tptwo, bptwo,
|
||||
ttone, ttp, bbone, bbp, w, 0);
|
||||
}
|
||||
|
||||
/* C implementation of the mode 2 planar line filter: forwards all arguments
 * to filter_line_c_planar() with mode fixed to 2. Marked G_GNUC_UNUSED
 * because it is only referenced when the assembly versions are not built. */
ALWAYS_INLINE G_GNUC_UNUSED static void
|
||||
filter_line_c_planar_mode2 (void *ORC_RESTRICT dst,
|
||||
const void *ORC_RESTRICT tzero, const void *ORC_RESTRICT bzero,
|
||||
const void *ORC_RESTRICT mone, const void *ORC_RESTRICT mp,
|
||||
const void *ORC_RESTRICT ttwo, const void *ORC_RESTRICT btwo,
|
||||
const void *ORC_RESTRICT tptwo, const void *ORC_RESTRICT bptwo,
|
||||
const void *ORC_RESTRICT ttone, const void *ORC_RESTRICT ttp,
|
||||
const void *ORC_RESTRICT bbone, const void *ORC_RESTRICT bbp, int w)
|
||||
{
|
||||
filter_line_c_planar (dst, tzero, bzero, mone, mp, ttwo, btwo, tptwo, bptwo,
|
||||
ttone, ttp, bbone, bbp, w, 2);
|
||||
}
|
||||
|
||||
ALWAYS_INLINE static void
|
||||
filter_edges (guint8 * sdst, const guint8 * stzero, const guint8 * sbzero,
|
||||
const guint8 * smone, const guint8 * smp, const guint8 * sttwo,
|
||||
const guint8 * sbtwo, const guint8 * stptwo, const guint8 * sbptwo,
|
||||
const guint8 * sttone, const guint8 * sttp, const guint8 * sbbone,
|
||||
const guint8 * sbbp, int w, int colors, int y_alternates_every,
|
||||
int mode, const int bpp)
|
||||
{
|
||||
int x;
|
||||
const int edge = colors * (MAX_ALIGN / bpp);
|
||||
|
@ -233,13 +339,6 @@ filter_edges (guint8 * dst,
|
|||
FILTER (w - border, w, 0)
|
||||
}
|
||||
|
||||
static void
|
||||
filter_scanline_yadif_planar (GstDeinterlaceSimpleMethod * self,
|
||||
guint8 * out, const GstDeinterlaceScanlineData * s_orig, guint size)
|
||||
{
|
||||
filter_scanline_yadif (self, out, s_orig, size, 1, 0);
|
||||
}
|
||||
|
||||
static void
|
||||
filter_scanline_yadif_semiplanar (GstDeinterlaceSimpleMethod * self,
|
||||
guint8 * out, const GstDeinterlaceScanlineData * s_orig, guint size)
|
||||
|
@ -301,7 +400,78 @@ filter_scanline_yadif (GstDeinterlaceSimpleMethod * self,
|
|||
if (s.b2 == NULL)
|
||||
s.b2 = s.bp2;
|
||||
|
||||
filter_edges (dst, &s, w, mode, bpp, colors, y_alternates_every);
|
||||
filter_line_c (dst, &s, colors * 3, w - edge, mode, colors,
|
||||
y_alternates_every);
|
||||
filter_edges (dst, s.t0, s.b0, s.m1, s.mp, s.t2, s.b2, s.tp2, s.bp2, s.tt1,
|
||||
s.ttp, s.bb1, s.bbp, w, colors, y_alternates_every, mode, bpp);
|
||||
filter_line_c (dst, s.t0, s.b0, s.m1, s.mp, s.t2, s.b2, s.tp2, s.bp2, s.tt1,
|
||||
s.ttp, s.bb1, s.bbp, w, colors, y_alternates_every, colors * 3, w - edge,
|
||||
mode);
|
||||
}
|
||||
|
||||
/* Interpolates one scanline of a planar 8-bit component into @out.
 * filter_edges() is called for the full width with colors = 1; the line
 * filter selected at init time (filter_mode0 or filter_mode2) is then called
 * with a pixel count of w - edge. */
ALWAYS_INLINE static void
|
||||
filter_scanline_yadif_planar (GstDeinterlaceSimpleMethod * self,
|
||||
guint8 * out, const GstDeinterlaceScanlineData * s_orig, guint size)
|
||||
{
|
||||
guint8 *dst = out;
|
||||
const int bpp = 1; // Hard code 8-bit atm
|
||||
int w = size / bpp;
|
||||
int edge = MAX_ALIGN / bpp;
|
||||
/* Work on a copy so that missing scanlines can be substituted below. */
GstDeinterlaceScanlineData s = *s_orig;
|
||||
|
||||
/* Mode 0 needs all four of tt1, bb1, ttp and bbp; otherwise use mode 2. */
int mode = (s.tt1 == NULL || s.bb1 == NULL || s.ttp == NULL
|
||||
|| s.bbp == NULL) ? 2 : 0;
|
||||
|
||||
/* When starting up, some data might not yet be available, so use the current frame */
|
||||
if (s.m1 == NULL)
|
||||
s.m1 = s.mp;
|
||||
if (s.tt1 == NULL)
|
||||
s.tt1 = s.ttp;
|
||||
if (s.bb1 == NULL)
|
||||
s.bb1 = s.bbp;
|
||||
if (s.t2 == NULL)
|
||||
s.t2 = s.tp2;
|
||||
if (s.b2 == NULL)
|
||||
s.b2 = s.bp2;
|
||||
|
||||
filter_edges (dst, s.t0, s.b0, s.m1, s.mp, s.t2, s.b2, s.tp2, s.bp2, s.tt1,
|
||||
s.ttp, s.bb1, s.bbp, w, 1, 0, mode, bpp);
|
||||
/* The line filters skip the first 3 pixels themselves, so the unmodified
   * line start pointers are passed.
   * NOTE(review): the assembly versions write 8 pixels per iteration until
   * the count is used up, so a count that is not a multiple of 8 makes the
   * last iteration run past 3 + (w - edge) - confirm that callers guarantee
   * suitable widths or padding. */
if (mode == 0)
|
||||
filter_mode0 (dst, (void *) s.t0, (void *) s.b0, (void *) s.m1,
|
||||
(void *) s.mp, (void *) s.t2, (void *) s.b2, (void *) s.tp2,
|
||||
(void *) s.bp2, (void *) s.tt1, (void *) s.ttp, (void *) s.bb1,
|
||||
(void *) s.bbp, w - edge);
|
||||
else
|
||||
filter_mode2 (dst, (void *) s.t0, (void *) s.b0, (void *) s.m1,
|
||||
(void *) s.mp, (void *) s.t2, (void *) s.b2, (void *) s.tp2,
|
||||
(void *) s.bp2, (void *) s.tt1, (void *) s.ttp, (void *) s.bb1,
|
||||
(void *) s.bbp, w - edge);
|
||||
}
|
||||
|
||||
/* Instance init: selects the implementations behind the file-level
 * filter_mode0 / filter_mode2 pointers.
 * On x86-64 builds with NASM the assembly versions are used: SSSE3 when ORC
 * reports it at run time (or, without ORC, when the compiler targets SSSE3),
 * SSE2 otherwise. All other builds use the C versions. */
static void
|
||||
gst_deinterlace_method_yadif_init (GstDeinterlaceMethodYadif * self)
|
||||
{
|
||||
#if (defined __x86_64__ || defined _M_X64) && defined HAVE_NASM
|
||||
if (
|
||||
# if defined HAVE_ORC
|
||||
orc_sse_get_cpu_flags () & ORC_TARGET_SSE_SSSE3
|
||||
# elif defined __SSSE3__
|
||||
TRUE
|
||||
# else
|
||||
FALSE
|
||||
# endif
|
||||
) {
|
||||
GST_DEBUG ("SSSE3 optimization enabled");
|
||||
filter_mode0 = gst_yadif_filter_line_mode0_ssse3;
|
||||
filter_mode2 = gst_yadif_filter_line_mode2_ssse3;
|
||||
} else {
|
||||
GST_DEBUG ("SSE2 optimization enabled");
|
||||
filter_mode0 = gst_yadif_filter_line_mode0_sse2;
|
||||
filter_mode2 = gst_yadif_filter_line_mode2_sse2;
|
||||
}
|
||||
#else
|
||||
{
|
||||
GST_DEBUG ("SSE optimization disabled");
|
||||
filter_mode0 = filter_line_c_planar_mode0;
|
||||
filter_mode2 = filter_line_c_planar_mode2;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -25,4 +25,24 @@
|
|||
|
||||
GType gst_deinterlace_method_yadif_get_type (void);
|
||||
|
||||
/* Line filters implemented in x86/yadif.asm, one pair (mode 0 and mode 2)
 * per instruction set. dst is the output line, the twelve const pointers are
 * the input scanlines and w is the number of pixels to process. The
 * functions skip the first 3 pixels of every line themselves. */
void
|
||||
gst_yadif_filter_line_mode0_sse2 (void *dst, const void *tzero, const void *bzero,
|
||||
const void *mone, const void *mp, const void *ttwo, const void *btwo, const void *tptwo, const void *bptwo,
|
||||
const void *ttone, const void *ttp, const void *bbone, const void *bbp, int w);
|
||||
|
||||
void
|
||||
gst_yadif_filter_line_mode2_sse2 (void *dst, const void *tzero, const void *bzero,
|
||||
const void *mone, const void *mp, const void *ttwo, const void *btwo, const void *tptwo, const void *bptwo,
|
||||
const void *ttone, const void *ttp, const void *bbone, const void *bbp, int w);
|
||||
|
||||
void
|
||||
gst_yadif_filter_line_mode0_ssse3 (void *dst, const void *tzero, const void *bzero,
|
||||
const void *mone, const void *mp, const void *ttwo, const void *btwo, const void *tptwo, const void *bptwo,
|
||||
const void *ttone, const void *ttp, const void *bbone, const void *bbp, int w);
|
||||
|
||||
void
|
||||
gst_yadif_filter_line_mode2_ssse3 (void *dst, const void *tzero, const void *bzero,
|
||||
const void *mone, const void *mp, const void *ttwo, const void *btwo, const void *tptwo, const void *bptwo,
|
||||
const void *ttone, const void *ttp, const void *bbone, const void *bbp, int w);
|
||||
|
||||
#endif
|
||||
|
|
17
meson.build
17
meson.build
|
@ -335,6 +335,22 @@ else
|
|||
cdata.set('DISABLE_ORC', 1)
|
||||
endif
|
||||
|
||||
# nasm is only looked for on x86-64 hosts and never with MSVC. When it is
# found, HAVE_NASM is set in config.h and have_nasm becomes true.
have_nasm=false
|
||||
# FIXME: nasm path needs testing on non-Linux, esp. Windows
|
||||
host_cpu = host_machine.cpu_family()
|
||||
if host_cpu == 'x86_64'
|
||||
if cc.get_id() == 'msvc'
|
||||
message('Nasm disabled on MSVC')
|
||||
else
|
||||
# nasm 2.13 or newer is needed; the 'asm' feature option decides whether a
# missing nasm is an error.
nasm = find_program('nasm', native: true, version : '>= 2.13', required: get_option('asm'))
|
||||
if nasm.found()
|
||||
message('Nasm found on x86-64')
|
||||
cdata.set('HAVE_NASM', 1)
|
||||
have_nasm = true
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# Disable compiler warnings for unused variables and args if gst debug system is disabled
|
||||
if gst_dep.type_name() == 'internal'
|
||||
gst_debug_disabled = not subproject('gstreamer').get_variable('gst_debug')
|
||||
|
@ -378,6 +394,7 @@ if find_program('xgettext', required : get_option('nls')).found()
|
|||
subdir('po')
|
||||
endif
|
||||
|
||||
cdata.set10('ARCH_X86_64', host_cpu == 'x86_64')
|
||||
configure_file(output : 'config.h', configuration : cdata)
|
||||
|
||||
run_command(python3, '-c', 'import shutil; shutil.copy("hooks/pre-commit.hook", ".git/hooks/pre-commit")')
|
||||
|
|
|
@ -97,6 +97,7 @@ option('glib-asserts', type : 'feature', value : 'enabled', yield : true,
|
|||
description: 'Enable GLib assertion (auto = enabled for development, disabled for stable releases)')
|
||||
option('glib-checks', type : 'feature', value : 'enabled', yield : true,
|
||||
description: 'Enable GLib checks such as API guards (auto = enabled for development, disabled for stable releases)')
|
||||
option('asm', type : 'feature', value : 'auto', yield : true)
|
||||
|
||||
# Common options
|
||||
option('package-name', type : 'string', yield : true,
|
||||
|
|
Loading…
Reference in a new issue