deinterlace: Add yadif ASM optimisations

Measured to be about 3.4x faster than the C implementation.

Part-of: <https://gitlab.freedesktop.org/gstreamer/gst-plugins-good/-/merge_requests/621>
2025-04-13 19:44:10 +00:00 · 2020-06-16 11:52:38 +03:00 · 2020-06-16 11:52:38 +03:00 · 536ff4776f
commit 536ff4776f
parent ef78014d15
7 changed files with 2411 additions and 40 deletions
--- a/gst/deinterlace/meson.build
+++ b/gst/deinterlace/meson.build
@ -33,8 +33,60 @@ else
    copy : true)
 endif

+asm_gen_objs = []
+if have_nasm
+  if host_system == 'windows'
+    outputname = '@PLAINNAME@.obj'
+  else
+    outputname = '@PLAINNAME@.o'
+  endif
+
+  if get_option('b_staticpic')
+    asm_pic_def = '-DPIC'
+  else
+    asm_pic_def = '-UPIC'
+  endif
+
+  # Assembly has to be told when the symbols have to be prefixed with _
+  if cc.symbols_have_underscore_prefix()
+    asm_prefix_def = '-DPREFIX'
+  else
+    asm_prefix_def = '-UPREFIX'
+  endif
+
+  asm_arch_def = '-DARCH_X86_64=1'
+  if host_system == 'windows'
+    asm_outformat = 'win64'
+  elif ['darwin', 'ios'].contains(host_system)
+    asm_outformat = 'macho64'
+  elif host_system.endswith('bsd')
+    asm_outformat = 'aoutb'
+  else
+    asm_outformat = 'elf64'
+  endif
+  asm_x = files('x86/yadif.asm',
+                'x86/x86inc.asm')
+
+  asm_stackalign_def = '-DSTACK_ALIGNMENT=64'
+  asm_incdir = 'x86'
+
+  message('Nasm configured on x86-64')
+  asm_gen = generator(nasm,
+    output: outputname,
+    arguments: ['-I@CURRENT_SOURCE_DIR@',
+                '-I@CURRENT_SOURCE_DIR@/@0@/'.format(asm_incdir),
+                asm_arch_def,
+                asm_stackalign_def,
+                asm_pic_def,
+                asm_prefix_def,
+                '-f', asm_outformat,
+                '-o', '@OUTPUT@',
+                '@INPUT@'])
+  asm_gen_objs = asm_gen.process(asm_x)
+endif
+
 gstdeinterlace = library('gstdeinterlace',
-  interlace_sources, orc_c, orc_h,
+  interlace_sources, asm_gen_objs, orc_c, orc_h,
  c_args : gst_plugins_good_args,
  include_directories : [configinc],
  dependencies : [orc_dep, gstbase_dep, gstvideo_dep],
--- a/gst/deinterlace/x86/x86inc.asm
+++ b/gst/deinterlace/x86/x86inc.asm
--- a/gst/deinterlace/x86/yadif.asm
+++ b/gst/deinterlace/x86/yadif.asm
@ -0,0 +1,410 @@
+;*****************************************************************************
+;* x86-optimized functions for yadif filter
+;* Copyright (C) 2020 Vivia Nikolaidou <vivia.nikolaidou@ltnglobal.com>
+;*
+;* Based on libav's vf_yadif.asm file
+;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86inc.asm"
+
+SECTION_RODATA
+
+; 16 bytes of value 1
+pb_1: times 16 db 1
+; 8 words of value 1
+pw_1: times  8 dw 1
+
+SECTION .text
+
+%macro ABS1 2
+%if cpuflag(ssse3)
+    pabsw   %1, %1
+%elif cpuflag(mmxext) ; a, tmp
+    pxor    %2, %2
+    psubw   %2, %1
+    pmaxsw  %1, %2
+%else ; a, tmp
+    pxor       %2, %2
+    pcmpgtw    %2, %1
+    pxor       %1, %2
+    psubw      %1, %2
+%endif
+%endmacro
+
+%macro CHECK 2
+; %1 = j-1 (byte offset into t0), %2 = -j-1 (byte offset into b0)
+    ; m2 = bytes of t0 starting at t0[x-1+j]
+    movu      m2, [tzeroq+%1]
+    ; m3 = bytes of b0 starting at b0[x-1-j]
+    movu      m3, [bzeroq+%2]
+    ; m4 = copy of the t0 bytes
+    mova      m4, m2
+    ; m5 = copy of the t0 bytes
+    mova      m5, m2
+    ; m4 = xor(t0, b0): lowest bit is set where t0 + b0 is odd
+    pxor      m4, m3
+    pavgb     m5, m3
+    ; keep only that lowest bit (pavgb rounds up)
+    pand      m4, [pb_1]
+    ; m5 = average of t0 and b0, rounded down
+    psubusb   m5, m4
+    ; shift right by 1 byte so byte 0 is (t0[x+j] + b0[x-j]) >> 1
+    psrldq    m5, 1
+    ; m7 = 0
+    ; Interleave low-order bytes with 0
+    ; so one pixel doesn't spill into the next one
+    punpcklbw m5, m7
+    ; m4 = t0 bytes again (reset)
+    mova      m4, m2
+    ; m2 = t0 - b0, unsigned saturating (0 where b0 > t0)
+    psubusb   m2, m3
+    ; m3 = b0 - t0, unsigned saturating (0 where t0 > b0)
+    psubusb   m3, m4
+    ; m2 = FFABS(t0 - b0) per byte; byte 0 is FFABS(t0[x-1+j] - b0[x-1-j])
+    pmaxub    m2, m3
+    ; m3 = copy of the absolute differences
+    mova      m3, m2
+    ; m4 = copy of the absolute differences
+    mova      m4, m2
+    ; m3 = FFABS(t0[x+j] - b0[x-j])
+    psrldq    m3, 1
+    ; m4 = FFABS(t0[x+1+j] - b0[x+1-j])
+    psrldq    m4, 2
+    ; prevent pixel spilling for all of them
+    punpcklbw m2, m7
+    punpcklbw m3, m7
+    punpcklbw m4, m7
+    paddw     m2, m3
+    ; m2 = score
+    paddw     m2, m4
+%endmacro
+
+%macro CHECK1 0
+; m0 = spatial_score, m2 = score (from CHECK)
+; m1 = spatial_pred, m5 = candidate prediction (from CHECK)
+    mova    m3, m0
+    ; compare for greater than
+    ; each word becomes 0xFFFF where spatial_score > score, else 0
+    pcmpgtw m3, m2
+    ; if (score < spatial_score) spatial_score = score;
+    pminsw  m0, m2
+    ; m6 = the mask
+    mova    m6, m3
+    ; m5 = candidate prediction where the score improved, 0 elsewhere
+    pand    m5, m3
+    ; nand: m3 = old spatial_pred where the score did not improve, 0 elsewhere
+    pandn   m3, m1
+    ; m3 = put them together in an OR
+    por     m3, m5
+    ; and put it in spatial_pred
+    mova    m1, m3
+%endmacro
+
+%macro CHECK2 0
+; m6 = mask from the preceding CHECK1 (CHECK leaves it untouched)
+    paddw   m6, [pw_1]
+    ; m6 is now 0 where CHECK1 improved the score and 1 elsewhere;
+    ; shifting left by 14 turns that 1 into a 0x4000 penalty
+    psllw   m6, 14
+    ; add the penalty to score, signed saturating (presumably mirrors the nested CHECKs in C)
+    paddsw  m2, m6
+    ; same as CHECK1, except the mask is not saved in m6
+    mova    m3, m0
+    pcmpgtw m3, m2
+    pminsw  m0, m2
+    pand    m5, m3
+    pandn   m3, m1
+    por     m3, m5
+    mova    m1, m3
+%endmacro
+
+%macro LOAD 2
+    movh      %1, %2
+    punpcklbw %1, m7
+%endmacro
+
+%macro FILTER_HEAD 0
+    ; m7 = 0
+    pxor         m7, m7
+    ; m0 = c
+    LOAD         m0, [tzeroq]
+    ; m1 = e
+    LOAD         m1, [bzeroq]
+    ; m3 = mp
+    LOAD         m3, [mpq]
+    ; m2 = mone (the sample the C code calls m1, not register m1)
+    LOAD         m2, [moneq]
+    ; m4 = mp
+    mova         m4, m3
+    ; m3 = mone + mp
+    paddw        m3, m2
+    ; m3 = d
+    psraw        m3, 1
+    ; rsp + 0 = d
+    mova   [rsp+ 0], m3
+    ; m2 = mone - mp
+    psubw        m2, m4
+    ; m2 = temporal_diff0 (m4 is temporary)
+    ABS1         m2, m4
+    ; m3 = t2
+    LOAD         m3, [ttwoq]
+    ; m4 = b2
+    LOAD         m4, [btwoq]
+    ; m3 = t2 - c
+    psubw        m3, m0
+    ; m4 = b2 - e
+    psubw        m4, m1
+    ; m3 = ABS(t2 - c)
+    ABS1         m3, m5
+    ; m4 = ABS(b2 - e)
+    ABS1         m4, m5
+    paddw        m3, m4
+    psrlw        m2, 1
+    ; m3 = temporal_diff1
+    psrlw        m3, 1
+    ; m2 = max(temporal_diff0 >> 1, temporal_diff1)
+    pmaxsw       m2, m3
+    ; m3 = tp2
+    LOAD         m3, [tptwoq]
+    ; m4 = bp2
+    LOAD         m4, [bptwoq]
+    psubw        m3, m0
+    psubw        m4, m1
+    ABS1         m3, m5
+    ABS1         m4, m5
+    paddw        m3, m4
+    ; m3 = temporal_diff2
+    psrlw        m3, 1
+    ; m2 = diff = max(temporal_diff0 >> 1, temporal_diff1, temporal_diff2)
+    pmaxsw       m2, m3
+    ; rsp + 16 = diff
+    mova   [rsp+16], m2
+
+    ; m1 = e + c
+    paddw        m1, m0
+    ; m0 = 2c
+    paddw        m0, m0
+    ; m0 = c - e
+    psubw        m0, m1
+    ; m1 = spatial_pred
+    psrlw        m1, 1
+    ; m0 = FFABS(c-e)
+    ABS1         m0, m2
+
+    ; m2 = t0[x-1]
+    ; unaligned load of mmsize packed bytes starting at t0[x-1]
+    movu         m2, [tzeroq-1]
+    ; m3 = b0[x-1]
+    movu         m3, [bzeroq-1]
+    ; m4 = t0[x-1]
+    mova         m4, m2
+    ; m2 = t0[x-1]-b0[x-1] unsigned packed
+    psubusb      m2, m3
+    ; m3 = m3 - m4 = b0[x-1]-t0[x-1] = -m2 unsigned packed
+    psubusb      m3, m4
+    ; m2 = max(m2, -m2) = abs(t0[x-1]-b0[x-1])
+    pmaxub       m2, m3
+%if mmsize == 16
+    ; m3 = m2 shifted right by 2 bytes
+    ; pixel jump: go from x-1 to x+1
+    mova         m3, m2
+    psrldq       m3, 2
+%else
+    pshufw       m3, m2, q0021
+%endif
+    ; m7 = 0
+    ; unpack and interleave low-order bytes
+    ; to prevent pixel spilling when adding
+    punpcklbw    m2, m7
+    punpcklbw    m3, m7
+    paddw        m0, m2
+    paddw        m0, m3
+    ; m0 = spatial_score - 1 (NOTE(review): the C FILTER macro has no -1 -- confirm intended)
+    psubw        m0, [pw_1]
+
+    CHECK -2, 0
+    CHECK1
+    CHECK -3, 1
+    CHECK2
+    CHECK 0, -2
+    CHECK1
+    CHECK 1, -3
+    CHECK2
+    ; now m0 = spatial_score, m1 = spatial_pred
+
+    ; m6 = diff
+    mova         m6, [rsp+16]
+%endmacro
+
+%macro FILTER_TAIL 0
+    ; m2 = d
+    mova         m2, [rsp]
+    ; m3 = d
+    mova         m3, m2
+    ; m2 = d - diff
+    psubw        m2, m6
+    ; m3 = d + diff
+    paddw        m3, m6
+    ; m1 = max(spatial_pred, d-diff)
+    pmaxsw       m1, m2
+    ; m1 = min(d + diff, max(spatial_pred, d-diff))
+    ; m1 = spatial_pred
+    pminsw       m1, m3
+    ; Converts 8 signed word integers into 16 unsigned byte integers with saturation
+    packuswb     m1, m1
+
+    ; dst = spatial_pred
+    movh     [dstq], m1
+    ; half the register size
+    add        dstq, mmsize/2
+    add        tzeroq, mmsize/2
+    add        bzeroq, mmsize/2
+    add        moneq, mmsize/2
+    add        mpq, mmsize/2
+    add        ttwoq, mmsize/2
+    add        btwoq, mmsize/2
+    add        tptwoq, mmsize/2
+    add        bptwoq, mmsize/2
+    add        ttoneq, mmsize/2
+    add        ttpq, mmsize/2
+    add        bboneq, mmsize/2
+    add        bbpq, mmsize/2
+%endmacro
+
+%macro FILTER_MODE0 0
+.loop0:
+    FILTER_HEAD
+    ; m2 = tt1
+    LOAD         m2, [ttoneq]
+    ; m4 = ttp
+    LOAD         m4, [ttpq]
+    ; m3 = bb1
+    LOAD         m3, [bboneq]
+    ; m5 = bbp
+    LOAD         m5, [bbpq]
+    paddw        m2, m4
+    paddw        m3, m5
+    ; m2 = b
+    psrlw        m2, 1
+    ; m3 = f
+    psrlw        m3, 1
+    ; m4 = c
+    LOAD         m4, [tzeroq]
+    ; m5 = d
+    mova         m5, [rsp]
+    ; m0 = e (not m7: LOAD unpacks against m7, which must still be 0 here)
+    LOAD         m0, [bzeroq]
+    ; m2 = b - c
+    psubw        m2, m4
+    ; m3 = f - e
+    psubw        m3, m0
+    ; m7 = d (the zero in m7 is no longer needed in this iteration)
+    mova         m7, m5
+    ; m5 = d - c
+    psubw        m5, m4
+    ; m7 = d - e
+    psubw        m7, m0
+    ; m4 = b - c
+    mova         m4, m2
+    ; m2 = FFMIN(b-c, f-e)
+    pminsw       m2, m3
+    ; m3 = FFMAX(f-e, b-c)
+    pmaxsw       m3, m4
+    ; m2 = FFMAX(d-c, FFMIN(b-c, f-e))
+    pmaxsw       m2, m5
+    ; m3 = FFMIN(d-c, FFMAX(f-e, b-c))
+    pminsw       m3, m5
+    ; m2 = max
+    pmaxsw       m2, m7
+    ; m3 = min
+    pminsw       m3, m7
+    ; m4 = 0
+    pxor         m4, m4
+    ; m6 = MAX(diff, min)
+    pmaxsw       m6, m3
+    ; m4 = -max
+    psubw        m4, m2
+    ; m6 = diff = MAX(diff, min, -max)
+    pmaxsw       m6, m4
+
+    FILTER_TAIL
+    ; r13m = w (14th argument); mmsize/2 pixels are produced per iteration
+    sub   DWORD r13m, mmsize/2
+    jg .loop0
+%endmacro
+
+%macro FILTER_MODE2 0
+.loop2:
+    FILTER_HEAD
+    FILTER_TAIL
+    ; r13m = w
+    sub   DWORD r13m, mmsize/2
+    jg .loop2
+%endmacro
+
+%macro YADIF_ADD3 0
+    ; start 3 pixels later
+    add        dstq, 3
+    add        tzeroq, 3
+    add        bzeroq, 3
+    add        moneq, 3
+    add        mpq, 3
+    add        ttwoq, 3
+    add        btwoq, 3
+    add        tptwoq, 3
+    add        bptwoq, 3
+    add        ttoneq, 3
+    add        ttpq, 3
+    add        bboneq, 3
+    add        bbpq, 3
+%endmacro
+
+; cglobal foo, 2,3,7,0x40, dst, src, tmp
+; declares a function (foo) that automatically loads two arguments (dst and
+; src) into registers, uses one additional register (tmp) plus 7 vector
+; registers (m0-m6) and allocates 0x40 bytes of stack space.
+%macro YADIF_MODE0 0
+cglobal yadif_filter_line_mode0, 13, 14, 8, 80, dst, tzero, bzero, mone, mp, \
+                                        ttwo, btwo, tptwo, bptwo, ttone, \
+                                        ttp, bbone, bbp, w
+
+    YADIF_ADD3
+    FILTER_MODE0
+    RET
+%endmacro
+
+%macro YADIF_MODE2 0
+cglobal yadif_filter_line_mode2, 13, 14, 8, 80, dst, tzero, bzero, mone, mp, \
+                                        ttwo, btwo, tptwo, bptwo, ttone, \
+                                        ttp, bbone, bbp, w
+
+    YADIF_ADD3
+    FILTER_MODE2
+    RET
+%endmacro
+
+; declares two functions for ssse3, and two for sse2
+INIT_XMM ssse3
+YADIF_MODE0
+YADIF_MODE2
+INIT_XMM sse2
+YADIF_MODE0
+YADIF_MODE2
--- a/gst/deinterlace/yadif.c
+++ b/gst/deinterlace/yadif.c
@ -31,6 +31,7 @@
 #include <gst/gst.h>
 #ifdef HAVE_ORC
 #include <orc/orc.h>
+#include <orc/orcsse.h>
 #endif
 #include "gstdeinterlacemethod.h"
 #include "yadif.h"
@ -86,6 +87,41 @@ static void
 filter_scanline_yadif_packed_3 (GstDeinterlaceSimpleMethod * self,
    guint8 * out, const GstDeinterlaceScanlineData * scanlines, guint size);

+static void
+filter_line_c_planar_mode0 (void *ORC_RESTRICT dst,
+    const void *ORC_RESTRICT tzero, const void *ORC_RESTRICT bzero,
+    const void *ORC_RESTRICT mone, const void *ORC_RESTRICT mp,
+    const void *ORC_RESTRICT ttwo, const void *ORC_RESTRICT btwo,
+    const void *ORC_RESTRICT tptwo, const void *ORC_RESTRICT bptwo,
+    const void *ORC_RESTRICT ttone, const void *ORC_RESTRICT ttp,
+    const void *ORC_RESTRICT bbone, const void *ORC_RESTRICT bbp, int w);
+
+static void
+filter_line_c_planar_mode2 (void *ORC_RESTRICT dst,
+    const void *ORC_RESTRICT tzero, const void *ORC_RESTRICT bzero,
+    const void *ORC_RESTRICT mone, const void *ORC_RESTRICT mp,
+    const void *ORC_RESTRICT ttwo, const void *ORC_RESTRICT btwo,
+    const void *ORC_RESTRICT tptwo, const void *ORC_RESTRICT bptwo,
+    const void *ORC_RESTRICT ttone, const void *ORC_RESTRICT ttp,
+    const void *ORC_RESTRICT bbone, const void *ORC_RESTRICT bbp, int w);
+
+static void (*filter_mode2) (void *ORC_RESTRICT dst,
+    const void *ORC_RESTRICT tzero, const void *ORC_RESTRICT bzero,
+    const void *ORC_RESTRICT mone, const void *ORC_RESTRICT mp,
+    const void *ORC_RESTRICT ttwo, const void *ORC_RESTRICT btwo,
+    const void *ORC_RESTRICT tptwo, const void *ORC_RESTRICT bptwo,
+    const void *ORC_RESTRICT ttone, const void *ORC_RESTRICT ttp,
+    const void *ORC_RESTRICT bbone, const void *ORC_RESTRICT bbp, int w);
+
+static void (*filter_mode0) (void *ORC_RESTRICT dst,
+    const void *ORC_RESTRICT tzero, const void *ORC_RESTRICT bzero,
+    const void *ORC_RESTRICT mone, const void *ORC_RESTRICT mp,
+    const void *ORC_RESTRICT ttwo, const void *ORC_RESTRICT btwo,
+    const void *ORC_RESTRICT tptwo, const void *ORC_RESTRICT bptwo,
+    const void *ORC_RESTRICT ttone, const void *ORC_RESTRICT ttp,
+    const void *ORC_RESTRICT bbone, const void *ORC_RESTRICT bbp, int w);
+
+
 static void
 copy_scanline (GstDeinterlaceSimpleMethod * self, guint8 * out,
    const GstDeinterlaceScanlineData * scanlines, guint size)
@ -139,36 +175,31 @@ static void
  dism_class->interpolate_scanline_nv21 = filter_scanline_yadif_semiplanar;
 }

-static void
-gst_deinterlace_method_yadif_init (GstDeinterlaceMethodYadif * self)
-{
-}
-
 #define FFABS(a) ABS(a)
 #define FFMIN(a,b) MIN(a,b)
 #define FFMAX(a,b) MAX(a,b)
 #define FFMAX3(a,b,c) FFMAX(FFMAX(a,b),c)
 #define FFMIN3(a,b,c) FFMIN(FFMIN(a,b),c)

-#define CHECK(j)\
-    {   int score = FFABS(s->t0[x - colors2 * (1 + (j))] - s->b0[x - colors2 * (1 - (j))])\
-                  + FFABS(s->t0[x  + colors2 * (j)] - s->b0[x  -colors2 * (j)])\
-                  + FFABS(s->t0[x + colors2 * (1 + (j))] - s->b0[x + colors2 * (1 - (j))]);\
+#define CHECK(j1, j2, j3)\
+    {   int score = FFABS(stzero[x - j1] - sbzero[x - j2])\
+                  + FFABS(stzero[x + j3] - sbzero[x - j3])\
+                  + FFABS(stzero[x + j1] - sbzero[x + j2]);\
        if (score < spatial_score) {\
            spatial_score= score;\
-            spatial_pred= (s->t0[x  +colors2 * ((j))] + s->b0[x - colors2 * (j)])>>1;\
+            spatial_pred= (stzero[x + j3] + sbzero[x - j3])>>1;\

 /* The is_not_edge argument here controls when the code will enter a branch
 * which reads up to and including x-3 and x+3. */

 #define FILTER(start, end, is_not_edge) \
    for (x = start;  x < end; x++) { \
-        int c = s->t0[x]; \
-        int d = (s->m1[x] + s->mp[x])>>1; \
-        int e = s->b0[x]; \
-        int temporal_diff0 = FFABS(s->m1[x] - s->mp[x]); \
-        int temporal_diff1 =(FFABS(s->t2[x] - c) + FFABS(s->b2[x] - e) )>>1; \
-        int temporal_diff2 =(FFABS(s->tp2[x] - c) + FFABS(s->bp2[x] - e) )>>1; \
+        int c = stzero[x]; \
+        int d = (smone[x] + smp[x])>>1; \
+        int e = sbzero[x]; \
+        int temporal_diff0 = FFABS(smone[x] - smp[x]); \
+        int temporal_diff1 =(FFABS(sttwo[x] - c) + FFABS(sbtwo[x] - e) )>>1; \
+        int temporal_diff2 =(FFABS(stptwo[x] - c) + FFABS(sbptwo[x] - e) )>>1; \
        int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); \
        int spatial_pred = (c+e) >> 1; \
        int colors2 = colors; \
@ -177,15 +208,21 @@ gst_deinterlace_method_yadif_init (GstDeinterlaceMethodYadif * self)
          colors2 = 2; \
 \
        if (is_not_edge) {\
-            int spatial_score = FFABS(s->t0[x-colors2] - s->b0[x-colors2]) + FFABS(c-e) \
-                              + FFABS(s->t0[x+colors2] - s->b0[x+colors2]); \
-            CHECK(-1) CHECK(-2) }} }} \
-            CHECK( 1) CHECK( 2) }} }} \
+            int spatial_score = FFABS(stzero[x-colors2] - sbzero[x-colors2]) + FFABS(c-e) \
+                              + FFABS(stzero[x+colors2] - sbzero[x+colors2]); \
+            int twice_colors2 = colors2 << 1; \
+            int minus_colors2 = -colors2; \
+            int thrice_colors2 = colors2 * 3; \
+            int minus2_colors2 = colors2 * -2; \
+            CHECK(0, twice_colors2, minus_colors2) \
+              CHECK(-colors2, thrice_colors2, minus2_colors2) }} }} \
+            CHECK(twice_colors2, 0, colors2) \
+              CHECK(thrice_colors2, minus_colors2, twice_colors2) }} }} \
        }\
 \
        if (!(mode&2)) { \
-            int b = (s->tt1[x] + s->ttp[x])>>1; \
-            int f = (s->bb1[x] + s->bbp[x])>>1; \
+            int b = (sttone[x] + sttp[x])>>1; \
+            int f = (sbbone[x] + sbbp[x])>>1; \
            int max = FFMAX3(d - e, d - c, FFMIN(b - c, f - e)); \
            int min = FFMIN3(d - e, d - c, FFMAX(b - c, f - e)); \
 \
@ -197,16 +234,20 @@ gst_deinterlace_method_yadif_init (GstDeinterlaceMethodYadif * self)
        else if (spatial_pred < d - diff) \
           spatial_pred = d - diff; \
 \
-        dst[x] = spatial_pred; \
+        sdst[x] = spatial_pred; \
 \
    }

 ALWAYS_INLINE static void
-filter_line_c (guint8 * dst,
-    const GstDeinterlaceScanlineData * s, int start, int end, int mode,
-    int colors, int y_alternates_every)
+filter_line_c (guint8 * sdst, const guint8 * stzero, const guint8 * sbzero,
+    const guint8 * smone, const guint8 * smp, const guint8 * sttwo,
+    const guint8 * sbtwo, const guint8 * stptwo, const guint8 * sbptwo,
+    const guint8 * sttone, const guint8 * sttp, const guint8 * sbbone,
+    const guint8 * sbbp, int w, int colors, int y_alternates_every, int start,
+    int end, int mode)
 {
  int x;
+
  /* The function is called for processing the middle
   * pixels of each line, excluding 3 at each end.
   * This allows the FILTER macro to be
@ -218,9 +259,74 @@ filter_line_c (guint8 * dst,
 #define MAX_ALIGN 8

 ALWAYS_INLINE static void
-filter_edges (guint8 * dst,
-    const GstDeinterlaceScanlineData * s, int w, int mode, const int bpp,
-    const int colors, int y_alternates_every)
+filter_line_c_planar (void *ORC_RESTRICT dst, const void *ORC_RESTRICT tzero,
+    const void *ORC_RESTRICT bzero, const void *ORC_RESTRICT mone,
+    const void *ORC_RESTRICT mp, const void *ORC_RESTRICT ttwo,
+    const void *ORC_RESTRICT btwo, const void *ORC_RESTRICT tptwo,
+    const void *ORC_RESTRICT bptwo, const void *ORC_RESTRICT ttone,
+    const void *ORC_RESTRICT ttp, const void *ORC_RESTRICT bbone,
+    const void *ORC_RESTRICT bbp, int w, int mode)
+{
+  int x;
+  const int start = 0;
+  const int colors = 1;
+  const int y_alternates_every = 0;
+  /* hardcode colors = 1, bpp = 1 */
+  const int end = w;
+  guint8 *sdst = (guint8 *) dst + 3;
+  guint8 *stzero = (guint8 *) tzero + 3;
+  guint8 *sbzero = (guint8 *) bzero + 3;
+  guint8 *smone = (guint8 *) mone + 3;
+  guint8 *smp = (guint8 *) mp + 3;
+  guint8 *sttwo = (guint8 *) ttwo + 3;
+  guint8 *sbtwo = (guint8 *) btwo + 3;
+  guint8 *stptwo = (guint8 *) tptwo + 3;
+  guint8 *sbptwo = (guint8 *) bptwo + 3;
+  guint8 *sttone = (guint8 *) ttone + 3;
+  guint8 *sttp = (guint8 *) ttp + 3;
+  guint8 *sbbone = (guint8 *) bbone + 3;
+  guint8 *sbbp = (guint8 *) bbp + 3;
+  /* The function is called for processing the middle
+   * pixels of each line, excluding 3 at each end.
+   * This allows the FILTER macro to be
+   * called so that it processes all the pixels normally.  A constant value of
+   * true for is_not_edge lets the compiler ignore the if statement. */
+  FILTER (start, end, 1)
+}
+
+ALWAYS_INLINE G_GNUC_UNUSED static void
+filter_line_c_planar_mode0 (void *ORC_RESTRICT dst,
+    const void *ORC_RESTRICT tzero, const void *ORC_RESTRICT bzero,
+    const void *ORC_RESTRICT mone, const void *ORC_RESTRICT mp,
+    const void *ORC_RESTRICT ttwo, const void *ORC_RESTRICT btwo,
+    const void *ORC_RESTRICT tptwo, const void *ORC_RESTRICT bptwo,
+    const void *ORC_RESTRICT ttone, const void *ORC_RESTRICT ttp,
+    const void *ORC_RESTRICT bbone, const void *ORC_RESTRICT bbp, int w)
+{
+  filter_line_c_planar (dst, tzero, bzero, mone, mp, ttwo, btwo, tptwo, bptwo,
+      ttone, ttp, bbone, bbp, w, 0);
+}
+
+ALWAYS_INLINE G_GNUC_UNUSED static void
+filter_line_c_planar_mode2 (void *ORC_RESTRICT dst,
+    const void *ORC_RESTRICT tzero, const void *ORC_RESTRICT bzero,
+    const void *ORC_RESTRICT mone, const void *ORC_RESTRICT mp,
+    const void *ORC_RESTRICT ttwo, const void *ORC_RESTRICT btwo,
+    const void *ORC_RESTRICT tptwo, const void *ORC_RESTRICT bptwo,
+    const void *ORC_RESTRICT ttone, const void *ORC_RESTRICT ttp,
+    const void *ORC_RESTRICT bbone, const void *ORC_RESTRICT bbp, int w)
+{
+  filter_line_c_planar (dst, tzero, bzero, mone, mp, ttwo, btwo, tptwo, bptwo,
+      ttone, ttp, bbone, bbp, w, 2);
+}
+
+ALWAYS_INLINE static void
+filter_edges (guint8 * sdst, const guint8 * stzero, const guint8 * sbzero,
+    const guint8 * smone, const guint8 * smp, const guint8 * sttwo,
+    const guint8 * sbtwo, const guint8 * stptwo, const guint8 * sbptwo,
+    const guint8 * sttone, const guint8 * sttp, const guint8 * sbbone,
+    const guint8 * sbbp, int w, int colors, int y_alternates_every,
+    int mode, const int bpp)
 {
  int x;
  const int edge = colors * (MAX_ALIGN / bpp);
@ -233,13 +339,6 @@ filter_edges (guint8 * dst,
      FILTER (w - border, w, 0)
 }

-static void
-filter_scanline_yadif_planar (GstDeinterlaceSimpleMethod * self,
-    guint8 * out, const GstDeinterlaceScanlineData * s_orig, guint size)
-{
-  filter_scanline_yadif (self, out, s_orig, size, 1, 0);
-}
-
 static void
 filter_scanline_yadif_semiplanar (GstDeinterlaceSimpleMethod * self,
    guint8 * out, const GstDeinterlaceScanlineData * s_orig, guint size)
@ -301,7 +400,78 @@ filter_scanline_yadif (GstDeinterlaceSimpleMethod * self,
  if (s.b2 == NULL)
    s.b2 = s.bp2;

-  filter_edges (dst, &s, w, mode, bpp, colors, y_alternates_every);
-  filter_line_c (dst, &s, colors * 3, w - edge, mode, colors,
-      y_alternates_every);
+  filter_edges (dst, s.t0, s.b0, s.m1, s.mp, s.t2, s.b2, s.tp2, s.bp2, s.tt1,
+      s.ttp, s.bb1, s.bbp, w, colors, y_alternates_every, mode, bpp);
+  filter_line_c (dst, s.t0, s.b0, s.m1, s.mp, s.t2, s.b2, s.tp2, s.bp2, s.tt1,
+      s.ttp, s.bb1, s.bbp, w, colors, y_alternates_every, colors * 3, w - edge,
+      mode);
+}
+
+ALWAYS_INLINE static void
+filter_scanline_yadif_planar (GstDeinterlaceSimpleMethod * self,
+    guint8 * out, const GstDeinterlaceScanlineData * s_orig, guint size)
+{
+  guint8 *dst = out;
+  const int bpp = 1;            // Hard code 8-bit atm
+  int w = size / bpp;
+  int edge = MAX_ALIGN / bpp;
+  GstDeinterlaceScanlineData s = *s_orig;
+
+  int mode = (s.tt1 == NULL || s.bb1 == NULL || s.ttp == NULL
+      || s.bbp == NULL) ? 2 : 0;
+
+  /* When starting up, some data might not yet be available, so use the current frame */
+  if (s.m1 == NULL)
+    s.m1 = s.mp;
+  if (s.tt1 == NULL)
+    s.tt1 = s.ttp;
+  if (s.bb1 == NULL)
+    s.bb1 = s.bbp;
+  if (s.t2 == NULL)
+    s.t2 = s.tp2;
+  if (s.b2 == NULL)
+    s.b2 = s.bp2;
+
+  filter_edges (dst, s.t0, s.b0, s.m1, s.mp, s.t2, s.b2, s.tp2, s.bp2, s.tt1,
+      s.ttp, s.bb1, s.bbp, w, 1, 0, mode, bpp);
+  if (mode == 0)
+    filter_mode0 (dst, (void *) s.t0, (void *) s.b0, (void *) s.m1,
+        (void *) s.mp, (void *) s.t2, (void *) s.b2, (void *) s.tp2,
+        (void *) s.bp2, (void *) s.tt1, (void *) s.ttp, (void *) s.bb1,
+        (void *) s.bbp, w - edge);
+  else
+    filter_mode2 (dst, (void *) s.t0, (void *) s.b0, (void *) s.m1,
+        (void *) s.mp, (void *) s.t2, (void *) s.b2, (void *) s.tp2,
+        (void *) s.bp2, (void *) s.tt1, (void *) s.ttp, (void *) s.bb1,
+        (void *) s.bbp, w - edge);
+}
+
+static void
+gst_deinterlace_method_yadif_init (GstDeinterlaceMethodYadif * self)
+{
+#if (defined __x86_64__ || defined _M_X64) && defined HAVE_NASM
+  if (
+#  if defined HAVE_ORC
+      orc_sse_get_cpu_flags () & ORC_TARGET_SSE_SSSE3
+#  elif defined __SSSE3__
+      TRUE
+#  else
+      FALSE
+#  endif
+      ) {
+    GST_DEBUG ("SSSE3 optimization enabled");
+    filter_mode0 = gst_yadif_filter_line_mode0_ssse3;
+    filter_mode2 = gst_yadif_filter_line_mode2_ssse3;
+  } else {
+    GST_DEBUG ("SSE2 optimization enabled");
+    filter_mode0 = gst_yadif_filter_line_mode0_sse2;
+    filter_mode2 = gst_yadif_filter_line_mode2_sse2;
+  }
+#else
+  {
+    GST_DEBUG ("SSE optimization disabled");
+    filter_mode0 = filter_line_c_planar_mode0;
+    filter_mode2 = filter_line_c_planar_mode2;
+  }
+#endif
 }
--- a/gst/deinterlace/yadif.h
+++ b/gst/deinterlace/yadif.h
@ -25,4 +25,24 @@

 GType gst_deinterlace_method_yadif_get_type (void);

+void
+gst_yadif_filter_line_mode0_sse2 (void *dst, const void *tzero, const void *bzero,
+    const void *mone, const void *mp, const void *ttwo, const void *btwo, const void *tptwo, const void *bptwo,
+    const void *ttone, const void *ttp, const void *bbone, const void *bbp, int w);
+
+void
+gst_yadif_filter_line_mode2_sse2 (void *dst, const void *tzero, const void *bzero,
+    const void *mone, const void *mp, const void *ttwo, const void *btwo, const void *tptwo, const void *bptwo,
+    const void *ttone, const void *ttp, const void *bbone, const void *bbp, int w);
+
+void
+gst_yadif_filter_line_mode0_ssse3 (void *dst, const void *tzero, const void *bzero,
+    const void *mone, const void *mp, const void *ttwo, const void *btwo, const void *tptwo, const void *bptwo,
+    const void *ttone, const void *ttp, const void *bbone, const void *bbp, int w);
+
+void
+gst_yadif_filter_line_mode2_ssse3 (void *dst, const void *tzero, const void *bzero,
+    const void *mone, const void *mp, const void *ttwo, const void *btwo, const void *tptwo, const void *bptwo,
+    const void *ttone, const void *ttp, const void *bbone, const void *bbp, int w);
+
 #endif
--- a/meson.build
+++ b/meson.build
@ -335,6 +335,22 @@ else
  cdata.set('DISABLE_ORC', 1)
 endif

+have_nasm=false
+# FIXME: nasm path needs testing on non-Linux, esp. Windows
+host_cpu = host_machine.cpu_family()
+if host_cpu == 'x86_64'
+  if cc.get_id() == 'msvc'
+    message('Nasm disabled on MSVC')
+  else
+    nasm = find_program('nasm', native: true, version : '>= 2.13', required: get_option('asm'))
+    if nasm.found()
+      message('Nasm found on x86-64')
+      cdata.set('HAVE_NASM', 1)
+      have_nasm = true
+    endif
+  endif
+endif
+
 # Disable compiler warnings for unused variables and args if gst debug system is disabled
 if gst_dep.type_name() == 'internal'
  gst_debug_disabled = not subproject('gstreamer').get_variable('gst_debug')
@ -378,6 +394,7 @@ if find_program('xgettext', required : get_option('nls')).found()
  subdir('po')
 endif

+cdata.set10('ARCH_X86_64', host_cpu == 'x86_64')
 configure_file(output : 'config.h', configuration : cdata)

 run_command(python3, '-c', 'import shutil; shutil.copy("hooks/pre-commit.hook", ".git/hooks/pre-commit")')
--- a/meson_options.txt
+++ b/meson_options.txt
@ -97,6 +97,7 @@ option('glib-asserts', type : 'feature', value : 'enabled', yield : true,
       description: 'Enable GLib assertion (auto = enabled for development, disabled for stable releases)')
 option('glib-checks', type : 'feature', value : 'enabled', yield : true,
       description: 'Enable GLib checks such as API guards (auto = enabled for development, disabled for stable releases)')
+option('asm', type : 'feature', value : 'auto', yield : true)

 # Common options
 option('package-name', type : 'string', yield : true,