diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/cuda-converter.c b/subprojects/gst-plugins-bad/sys/nvcodec/cuda-converter.c
deleted file mode 100644
index f2930019a6..0000000000
--- a/subprojects/gst-plugins-bad/sys/nvcodec/cuda-converter.c
+++ /dev/null
@@ -1,2090 +0,0 @@
-/* GStreamer
- * Copyright (C) 2010 David Schleef <ds@schleef.org>
- * Copyright (C) 2010 Sebastian Dröge <sebastian.droege@collabora.co.uk>
- * Copyright (C) 2019 Seungha Yang <seungha.yang@navercorp.com>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Library General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Library General Public License for more details.
- *
- * You should have received a copy of the GNU Library General Public
- * License along with this library; if not, write to the
- * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
- * Boston, MA 02110-1301, USA.
- */
-
-/**
- * SECTION:cudaconverter
- * @title: GstCudaConverter
- * @short_description: Generic video conversion using CUDA
- *
- * This object is used to convert video frames from one format to another.
- * The object can perform conversion of:
- *
- *  * video format
- *  * video colorspace
- *  * video size
- */
-
-/**
- * TODO:
- *  * Add more interpolation method and make it selectable,
- *    currently default bi-linear interpolation only
- *  * Add fast-path for conversion like videoconvert
- *  * Full colorimetry and chroma-siting support
- *  * cropping, and x, y position support
- */
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include "cuda-converter.h"
-#include <gst/cuda/gstcudautils.h>
-#include <gst/cuda/gstcudaloader.h>
-#include <gst/cuda/gstcudanvrtc.h>
-#include <string.h>
-
-#define CUDA_BLOCK_X 16
-#define CUDA_BLOCK_Y 16
-#define DIV_UP(size,block) (((size) + ((block) - 1)) / (block))
-
-static gboolean cuda_converter_lookup_path (GstCudaConverter * convert);
-
-#ifndef GST_DISABLE_GST_DEBUG
-#define GST_CAT_DEFAULT ensure_debug_category()
-static GstDebugCategory *
-ensure_debug_category (void)
-{
-  static gsize cat_gonce = 0;
-
-  if (g_once_init_enter (&cat_gonce)) {
-    gsize cat_done;
-
-    cat_done = (gsize) _gst_debug_category_new ("cuda-converter", 0,
-        "cuda-converter object");
-
-    g_once_init_leave (&cat_gonce, cat_done);
-  }
-
-  return (GstDebugCategory *) cat_gonce;
-}
-#else
-#define ensure_debug_category()
-#endif
-
-#define GST_CUDA_KERNEL_FUNC "gst_cuda_kernel_func"
-
-#define GST_CUDA_KERNEL_FUNC_TO_Y444 "gst_cuda_kernel_func_to_y444"
-
-#define GST_CUDA_KERNEL_FUNC_Y444_TO_YUV "gst_cuda_kernel_func_y444_to_yuv"
-
-#define GST_CUDA_KERNEL_FUNC_TO_ARGB "gst_cuda_kernel_func_to_argb"
-
-#define GST_CUDA_KERNEL_FUNC_SCALE_RGB "gst_cuda_kernel_func_scale_rgb"
-
-/* *INDENT-OFF* */
-/**
- * read_chroma:
- * @tex1: a CUDA texture object representing a semi-planar chroma plane
- * @tex2: dummy object
- * @x: the x coordinate to read data from @tex1
- * @y: the y coordinate to read data from @tex1
- *
- * Returns: a #ushort2 vector representing both chroma pixel values
- */
-static const gchar READ_CHROMA_FROM_SEMI_PLANAR[] =
-"__device__ ushort2\n"
-"read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, \n"
-"    float x, float y)\n"
-"{\n"
-"  return tex2D<ushort2>(tex1, x, y);\n"
-"}";
-
-/**
- * read_chroma:
- * @tex1: a CUDA texture object representing a chroma planar plane
- * @tex2: a CUDA texture object representing the other planar plane
- * @x: the x coordinate to read data from @tex1 and @tex2
- * @y: the y coordinate to read data from @tex1 and @tex2
- *
- * Returns: a #ushort2 vector representing both chroma pixel values
- */
-static const gchar READ_CHROMA_FROM_PLANAR[] =
-"__device__ ushort2\n"
-"read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, \n"
-"    float x, float y)\n"
-"{\n"
-"  unsigned short u, v;\n"
-"  u = tex2D<unsigned short>(tex1, x, y);\n"
-"  v = tex2D<unsigned short>(tex2, x, y);\n"
-"  return make_ushort2(u, v);\n"
-"}";
-
-/**
- * write_chroma:
- * @dst1: a CUDA global memory pointing to a semi-planar chroma plane
- * @dst2: dummy
- * @u: a pixel value to write @dst1
- * @v: a pixel value to write @dst1
- * @x: the x coordinate to write data into @tex1
- * @x: the y coordinate to write data into @tex1
- * @pstride: the pixel stride of @dst1
- * @mask: bitmask to be applied to high bitdepth plane
- *
- * Write @u and @v pixel value to @dst1 semi-planar plane
- */
-static const gchar WRITE_CHROMA_TO_SEMI_PLANAR[] =
-"__device__ void\n"
-"write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u,\n"
-"    unsigned short v, int x, int y, int pstride, int stride, int mask)\n"
-"{\n"
-"  if (OUT_DEPTH > 8) {\n"
-"    *(unsigned short *)&dst1[x * pstride + y * stride] = (u & mask);\n"
-"    *(unsigned short *)&dst1[x * pstride + 2 + y * stride] = (v & mask);\n"
-"  } else {\n"
-"    dst1[x * pstride + y * stride] = u;\n"
-"    dst1[x * pstride + 1 + y * stride] = v;\n"
-"  }\n"
-"}";
-
-/**
- * write_chroma:
- * @dst1: a CUDA global memory pointing to a planar chroma plane
- * @dst2: a CUDA global memory pointing to a the other planar chroma plane
- * @u: a pixel value to write @dst1
- * @v: a pixel value to write @dst1
- * @x: the x coordinate to write data into @tex1
- * @x: the y coordinate to write data into @tex1
- * @pstride: the pixel stride of @dst1
- * @mask: bitmask to be applied to high bitdepth plane
- *
- * Write @u and @v pixel value into @dst1 and @dst2 planar planes
- */
-static const gchar WRITE_CHROMA_TO_PLANAR[] =
-"__device__ void\n"
-"write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u,\n"
-"    unsigned short v, int x, int y, int pstride, int stride, int mask)\n"
-"{\n"
-"  if (OUT_DEPTH > 8) {\n"
-"    *(unsigned short *)&dst1[x * pstride + y * stride] = (u & mask);\n"
-"    *(unsigned short *)&dst2[x * pstride + y * stride] = (v & mask);\n"
-"  } else {\n"
-"    dst1[x * pstride + y * stride] = u;\n"
-"    dst2[x * pstride + y * stride] = v;\n"
-"  }\n"
-"}";
-
-/* CUDA kernel source for from YUV to YUV conversion and scale */
-static const gchar templ_YUV_TO_YUV[] =
-"extern \"C\"{\n"
-"__constant__ float SCALE_H = %s;\n"
-"__constant__ float SCALE_V = %s;\n"
-"__constant__ float CHROMA_SCALE_H = %s;\n"
-"__constant__ float CHROMA_SCALE_V = %s;\n"
-"__constant__ int WIDTH = %d;\n"
-"__constant__ int HEIGHT = %d;\n"
-"__constant__ int CHROMA_WIDTH = %d;\n"
-"__constant__ int CHROMA_HEIGHT = %d;\n"
-"__constant__ int IN_DEPTH = %d;\n"
-"__constant__ int OUT_DEPTH = %d;\n"
-"__constant__ int PSTRIDE = %d;\n"
-"__constant__ int CHROMA_PSTRIDE = %d;\n"
-"__constant__ int IN_SHIFT = %d;\n"
-"__constant__ int OUT_SHIFT = %d;\n"
-"__constant__ int MASK = %d;\n"
-"__constant__ int SWAP_UV = %d;\n"
-"\n"
-"__device__ unsigned short\n"
-"do_scale_pixel (unsigned short val) \n"
-"{\n"
-"  unsigned int diff;\n"
-"  if (OUT_DEPTH > IN_DEPTH) {\n"
-"    diff = OUT_DEPTH - IN_DEPTH;\n"
-"    return (val << diff) | (val >> (IN_DEPTH - diff));\n"
-"  } else if (IN_DEPTH > OUT_DEPTH) {\n"
-"    return val >> (IN_DEPTH - OUT_DEPTH);\n"
-"  }\n"
-"  return val;\n"
-"}\n"
-"\n"
-/* __device__ ushort2
- * read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, float x, float y);
- */
-"%s\n"
-"\n"
-/* __device__ void
- * write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u,
- *     unsigned short v, int x, int y, int pstride, int stride, int mask);
- */
-"%s\n"
-"\n"
-"__global__ void\n"
-GST_CUDA_KERNEL_FUNC
-"(cudaTextureObject_t tex0, cudaTextureObject_t tex1, cudaTextureObject_t tex2,\n"
-"    unsigned char *dst0, unsigned char *dst1, unsigned char *dst2,\n"
-"    int stride, int uv_stride)\n"
-"{\n"
-"  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
-"  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
-"  if (x_pos < WIDTH && y_pos < HEIGHT) {\n"
-"    float src_xpos = SCALE_H * x_pos;\n"
-"    float src_ypos = SCALE_V * y_pos;\n"
-"    unsigned short y = tex2D<unsigned short>(tex0, src_xpos, src_ypos);\n"
-"    y = y >> IN_SHIFT;\n"
-"    y = do_scale_pixel (y);\n"
-"    y = y << OUT_SHIFT;\n"
-"    if (OUT_DEPTH > 8) {\n"
-"      *(unsigned short *)&dst0[x_pos * PSTRIDE + y_pos * stride] = (y & MASK);\n"
-"    } else {\n"
-"      dst0[x_pos * PSTRIDE + y_pos * stride] = y;\n"
-"    }\n"
-"  }\n"
-"  if (x_pos < CHROMA_WIDTH && y_pos < CHROMA_HEIGHT) {\n"
-"    float src_xpos = CHROMA_SCALE_H * x_pos;\n"
-"    float src_ypos = CHROMA_SCALE_V * y_pos;\n"
-"    unsigned short u, v;\n"
-"    ushort2 uv = read_chroma (tex1, tex2, src_xpos, src_ypos);\n"
-"    u = uv.x;\n"
-"    v = uv.y;\n"
-"    u = u >> IN_SHIFT;\n"
-"    v = v >> IN_SHIFT;\n"
-"    u = do_scale_pixel (u);\n"
-"    v = do_scale_pixel (v);\n"
-"    u = u << OUT_SHIFT;\n"
-"    v = v << OUT_SHIFT;\n"
-"    if (SWAP_UV) {\n"
-"      unsigned short tmp = u;\n"
-"      u = v;\n"
-"      v = tmp;\n"
-"    }\n"
-"    write_chroma (dst1,\n"
-"      dst2, u, v, x_pos, y_pos, CHROMA_PSTRIDE, uv_stride, MASK);\n"
-"  }\n"
-"}\n"
-"\n"
-"}";
-
-/* CUDA kernel source for from YUV to RGB conversion and scale */
-static const gchar templ_YUV_TO_RGB[] =
-"extern \"C\"{\n"
-"__constant__ float offset[3] = {%s, %s, %s};\n"
-"__constant__ float rcoeff[3] = {%s, %s, %s};\n"
-"__constant__ float gcoeff[3] = {%s, %s, %s};\n"
-"__constant__ float bcoeff[3] = {%s, %s, %s};\n"
-"\n"
-"__constant__ float SCALE_H = %s;\n"
-"__constant__ float SCALE_V = %s;\n"
-"__constant__ float CHROMA_SCALE_H = %s;\n"
-"__constant__ float CHROMA_SCALE_V = %s;\n"
-"__constant__ int WIDTH = %d;\n"
-"__constant__ int HEIGHT = %d;\n"
-"__constant__ int CHROMA_WIDTH = %d;\n"
-"__constant__ int CHROMA_HEIGHT = %d;\n"
-"__constant__ int IN_DEPTH = %d;\n"
-"__constant__ int OUT_DEPTH = %d;\n"
-"__constant__ int PSTRIDE = %d;\n"
-"__constant__ int CHROMA_PSTRIDE = %d;\n"
-"__constant__ int IN_SHIFT = %d;\n"
-"__constant__ int OUT_SHIFT = %d;\n"
-"__constant__ int MASK = %d;\n"
-"__constant__ int SWAP_UV = %d;\n"
-"__constant__ int MAX_IN_VAL = %d;\n"
-"__constant__ int R_IDX = %d;\n"
-"__constant__ int G_IDX = %d;\n"
-"__constant__ int B_IDX = %d;\n"
-"__constant__ int A_IDX = %d;\n"
-"__constant__ int X_IDX = %d;\n"
-"\n"
-"__device__ unsigned short\n"
-"do_scale_pixel (unsigned short val) \n"
-"{\n"
-"  unsigned int diff;\n"
-"  if (OUT_DEPTH > IN_DEPTH) {\n"
-"    diff = OUT_DEPTH - IN_DEPTH;\n"
-"    return (val << diff) | (val >> (IN_DEPTH - diff));\n"
-"  } else if (IN_DEPTH > OUT_DEPTH) {\n"
-"    return val >> (IN_DEPTH - OUT_DEPTH);\n"
-"  }\n"
-"  return val;\n"
-"}\n"
-"\n"
-"__device__ float\n"
-"dot(float3 val, float *coeff)\n"
-"{\n"
-"  return val.x * coeff[0] + val.y * coeff[1] + val.z * coeff[2];\n"
-"}\n"
-"\n"
-"__device__ uint3\n"
-"yuv_to_rgb (unsigned short y, unsigned short u, unsigned short v, unsigned int max_val)\n"
-"{\n"
-"  float3 yuv = make_float3 (y, u, v);\n"
-"  uint3 rgb;\n"
-"  rgb.x = max ((unsigned int)(dot (yuv, rcoeff) + offset[0]), 0);\n"
-"  rgb.y = max ((unsigned int)(dot (yuv, gcoeff) + offset[1]), 0);\n"
-"  rgb.z = max ((unsigned int)(dot (yuv, bcoeff) + offset[2]), 0);\n"
-"  rgb.x = min (rgb.x, max_val);\n"
-"  rgb.y = min (rgb.y, max_val);\n"
-"  rgb.z = min (rgb.z, max_val);\n"
-"  return rgb;\n"
-"}\n"
-"\n"
-/* __device__ ushort2
- * read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, float x, float y);
- */
-"%s\n"
-"\n"
-"__global__ void\n"
-GST_CUDA_KERNEL_FUNC
-"(cudaTextureObject_t tex0, cudaTextureObject_t tex1, cudaTextureObject_t tex2,\n"
-"    unsigned char *dstRGB, int stride)\n"
-"{\n"
-"  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
-"  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
-"  if (x_pos < WIDTH && y_pos < HEIGHT) {\n"
-"    float src_xpos = SCALE_H * x_pos;\n"
-"    float src_ypos = SCALE_V * y_pos;\n"
-"    unsigned short y = tex2D<unsigned short>(tex0, src_xpos, src_ypos);\n"
-"    ushort2 uv;\n"
-"    unsigned short u, v;\n"
-"    uint3 rgb;\n"
-"    unsigned int clip_max = MAX_IN_VAL;\n"
-"    src_xpos = CHROMA_SCALE_H * x_pos;\n"
-"    src_ypos = CHROMA_SCALE_V * y_pos;\n"
-"    uv = read_chroma (tex1, tex2, src_xpos, src_ypos);\n"
-"    u = uv.x;\n"
-"    v = uv.y;\n"
-"    y = y >> IN_SHIFT;\n"
-"    u = u >> IN_SHIFT;\n"
-"    v = v >> IN_SHIFT;\n"
-"    if (SWAP_UV) {\n"
-"      unsigned short tmp = u;\n"
-"      u = v;\n"
-"      v = tmp;\n"
-"    }\n"
-     /* conversion matrix is scaled to higher bitdepth between in/out formats */
-"    if (OUT_DEPTH > IN_DEPTH) {\n"
-"      y = do_scale_pixel (y);\n"
-"      u = do_scale_pixel (u);\n"
-"      v = do_scale_pixel (v);\n"
-"      clip_max = MASK;\n"
-"    }"
-"    rgb = yuv_to_rgb (y, u, v, clip_max);\n"
-"    if (OUT_DEPTH < IN_DEPTH) {\n"
-"      rgb.x = do_scale_pixel (rgb.x);\n"
-"      rgb.y = do_scale_pixel (rgb.y);\n"
-"      rgb.z = do_scale_pixel (rgb.z);\n"
-"    }"
-"    if (OUT_DEPTH > 8) {\n"
-"      unsigned int packed_rgb = 0;\n"
-       /* A is always MSB, we support only little endian system */
-"      packed_rgb = 0xc000 << 16;\n"
-"      packed_rgb |= (rgb.x << (30 - (R_IDX * 10)));\n"
-"      packed_rgb |= (rgb.y << (30 - (G_IDX * 10)));\n"
-"      packed_rgb |= (rgb.z << (30 - (B_IDX * 10)));\n"
-"      *(unsigned int *)&dstRGB[x_pos * PSTRIDE + y_pos * stride] = packed_rgb;\n"
-"    } else {\n"
-"      dstRGB[x_pos * PSTRIDE + R_IDX + y_pos * stride] = (unsigned char) rgb.x;\n"
-"      dstRGB[x_pos * PSTRIDE + G_IDX + y_pos * stride] = (unsigned char) rgb.y;\n"
-"      dstRGB[x_pos * PSTRIDE + B_IDX + y_pos * stride] = (unsigned char) rgb.z;\n"
-"      if (A_IDX >= 0 || X_IDX >= 0)\n"
-"        dstRGB[x_pos * PSTRIDE + A_IDX + y_pos * stride] = 0xff;\n"
-"    }\n"
-"  }\n"
-"}\n"
-"\n"
-"}";
-
-/**
- * GST_CUDA_KERNEL_FUNC_TO_ARGB:
- * @srcRGB: a CUDA global memory containing a RGB image
- * @dstRGB: a CUDA global memory to store unpacked ARGB image
- * @width: the width of @srcRGB and @dstRGB
- * @height: the height of @srcRGB and @dstRGB
- * @src_stride: the stride of @srcRGB
- * @src_pstride: the pixel stride of @srcRGB
- * @dst_stride: the stride of @dstRGB
- * @r_idx: the index of red component of @srcRGB
- * @g_idx: the index of green component of @srcRGB
- * @b_idx: the index of blue component of @srcRGB
- * @a_idx: the index of alpha component of @srcRGB
- *
- * Unpack a RGB image from @srcRGB and write the unpacked data into @dstRGB
- */
-static const gchar unpack_to_ARGB[] =
-"__global__ void\n"
-GST_CUDA_KERNEL_FUNC_TO_ARGB
-"(unsigned char *srcRGB, unsigned char *dstRGB, int width, int height,\n"
-"    int src_stride, int src_pstride, int dst_stride,\n"
-"    int r_idx, int g_idx, int b_idx, int a_idx)\n"
-"{\n"
-"  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
-"  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
-"  if (x_pos < width && y_pos < height) {\n"
-"    if (a_idx >= 0) {\n"
-"      dstRGB[x_pos * 4 + y_pos * dst_stride] =\n"
-"          srcRGB[x_pos * src_pstride + a_idx + y_pos * src_stride];\n"
-"    } else {\n"
-"      dstRGB[x_pos * 4 + y_pos * dst_stride] = 0xff;\n"
-"    }\n"
-"    dstRGB[x_pos * 4 + 1 + y_pos * dst_stride] =\n"
-"        srcRGB[x_pos * src_pstride + r_idx + y_pos * src_stride];\n"
-"    dstRGB[x_pos * 4 + 2 + y_pos * dst_stride] =\n"
-"        srcRGB[x_pos * src_pstride + g_idx + y_pos * src_stride];\n"
-"    dstRGB[x_pos * 4 + 3 + y_pos * dst_stride] =\n"
-"        srcRGB[x_pos * src_pstride + b_idx + y_pos * src_stride];\n"
-"  }\n"
-"}\n";
-
-/**
- * GST_CUDA_KERNEL_FUNC_TO_ARGB:
- * @srcRGB: a CUDA global memory containing a RGB image
- * @dstRGB: a CUDA global memory to store unpacked ARGB64 image
- * @width: the width of @srcRGB and @dstRGB
- * @height: the height of @srcRGB and @dstRGB
- * @src_stride: the stride of @srcRGB
- * @src_pstride: the pixel stride of @srcRGB
- * @dst_stride: the stride of @dstRGB
- * @r_idx: the index of red component of @srcRGB
- * @g_idx: the index of green component of @srcRGB
- * @b_idx: the index of blue component of @srcRGB
- * @a_idx: the index of alpha component of @srcRGB
- *
- * Unpack a RGB image from @srcRGB and write the unpacked data into @dstRGB
- */
-static const gchar unpack_to_ARGB64[] =
-"__global__ void\n"
-GST_CUDA_KERNEL_FUNC_TO_ARGB
-"(unsigned char *srcRGB, unsigned char *dstRGB, int width, int height,\n"
-"    int src_stride, int src_pstride, int dst_stride,\n"
-"    int r_idx, int g_idx, int b_idx, int a_idx)\n"
-"{\n"
-"  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
-"  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
-"  if (x_pos < width && y_pos < height) {\n"
-"    unsigned short a, r, g, b;\n"
-"    unsigned int read_val;\n"
-"    read_val = *(unsigned int *)&srcRGB[x_pos * src_pstride + y_pos * src_stride];\n"
-"    a = (read_val >> 30) & 0x03;\n"
-"    a = (a << 14) | (a << 12) | (a << 10) | (a << 8) | (a << 6) | (a << 4) | (a << 2) | (a << 0);\n"
-"    r = ((read_val >> (30 - (r_idx * 10))) & 0x3ff);\n"
-"    r = (r << 6) | (r >> 4);\n"
-"    g = ((read_val >> (30 - (g_idx * 10))) & 0x3ff);\n"
-"    g = (g << 6) | (g >> 4);\n"
-"    b = ((read_val >> (30 - (b_idx * 10))) & 0x3ff);\n"
-"    b = (b << 6) | (b >> 4);\n"
-"    *(unsigned short *)&dstRGB[x_pos * 8 + y_pos * dst_stride] = 0xffff;\n"
-"    *(unsigned short *)&dstRGB[x_pos * 8 + 2 + y_pos * dst_stride] = r;\n"
-"    *(unsigned short *)&dstRGB[x_pos * 8 + 4 + y_pos * dst_stride] = g;\n"
-"    *(unsigned short *)&dstRGB[x_pos * 8 + 6 + y_pos * dst_stride] = b;\n"
-"  }\n"
-"}\n";
-
-/* CUDA kernel source for from RGB to YUV conversion and scale */
-static const gchar templ_RGB_TO_YUV[] =
-"extern \"C\"{\n"
-"__constant__ float offset[3] = {%s, %s, %s};\n"
-"__constant__ float ycoeff[3] = {%s, %s, %s};\n"
-"__constant__ float ucoeff[3] = {%s, %s, %s};\n"
-"__constant__ float vcoeff[3] = {%s, %s, %s};\n"
-"\n"
-"__constant__ float SCALE_H = %s;\n"
-"__constant__ float SCALE_V = %s;\n"
-"__constant__ float CHROMA_SCALE_H = %s;\n"
-"__constant__ float CHROMA_SCALE_V = %s;\n"
-"__constant__ int WIDTH = %d;\n"
-"__constant__ int HEIGHT = %d;\n"
-"__constant__ int CHROMA_WIDTH = %d;\n"
-"__constant__ int CHROMA_HEIGHT = %d;\n"
-"__constant__ int IN_DEPTH = %d;\n"
-"__constant__ int OUT_DEPTH = %d;\n"
-"__constant__ int PSTRIDE = %d;\n"
-"__constant__ int CHROMA_PSTRIDE = %d;\n"
-"__constant__ int IN_SHIFT = %d;\n"
-"__constant__ int OUT_SHIFT = %d;\n"
-"__constant__ int MASK = %d;\n"
-"__constant__ int SWAP_UV = %d;\n"
-"\n"
-"__device__ unsigned short\n"
-"do_scale_pixel (unsigned short val) \n"
-"{\n"
-"  unsigned int diff;\n"
-"  if (OUT_DEPTH > IN_DEPTH) {\n"
-"    diff = OUT_DEPTH - IN_DEPTH;\n"
-"    return (val << diff) | (val >> (IN_DEPTH - diff));\n"
-"  } else if (IN_DEPTH > OUT_DEPTH) {\n"
-"    return val >> (IN_DEPTH - OUT_DEPTH);\n"
-"  }\n"
-"  return val;\n"
-"}\n"
-"\n"
-"__device__ float\n"
-"dot(float3 val, float *coeff)\n"
-"{\n"
-"  return val.x * coeff[0] + val.y * coeff[1] + val.z * coeff[2];\n"
-"}\n"
-"\n"
-"__device__ uint3\n"
-"rgb_to_yuv (unsigned short r, unsigned short g, unsigned short b,\n"
-"    unsigned int max_val)\n"
-"{\n"
-"  float3 rgb = make_float3 (r, g, b);\n"
-"  uint3 yuv;\n"
-"  yuv.x = max ((unsigned int)(dot (rgb, ycoeff) + offset[0]), 0);\n"
-"  yuv.y = max ((unsigned int)(dot (rgb, ucoeff) + offset[1]), 0);\n"
-"  yuv.z = max ((unsigned int)(dot (rgb, vcoeff) + offset[2]), 0);\n"
-"  yuv.x = min (yuv.x, max_val);\n"
-"  yuv.y = min (yuv.y, max_val);\n"
-"  yuv.z = min (yuv.z, max_val);\n"
-"  return yuv;\n"
-"}\n"
-"\n"
-/* __global__ void
- * GST_CUDA_KERNEL_FUNC_TO_ARGB
- */
-"%s\n"
-"\n"
-/* __device__ ushort2
- * read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, float x, float y);
- */
-"%s\n"
-"\n"
-/* __device__ void
- * write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u,
- *     unsigned short v, int x, int y, int pstride, int stride, int mask);
- */
-"%s\n"
-"\n"
-"__global__ void\n"
-GST_CUDA_KERNEL_FUNC_TO_Y444
-"(cudaTextureObject_t srcRGB, unsigned char *dstY, int y_stride,\n"
-"    unsigned char *dstU, int u_stride, unsigned char *dstV, int v_stride,\n"
-"    int width, int height, int dst_pstride, int in_depth)\n"
-"{\n"
-"  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
-"  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
-"  if (x_pos < width && y_pos < height) {\n"
-"    ushort4 argb = tex2D<ushort4>(srcRGB, x_pos, y_pos);\n"
-"    uint3 yuv;\n"
-"    yuv = rgb_to_yuv (argb.y, argb.z, argb.w, (1 << in_depth) - 1);\n"
-"    if (in_depth > 8) {\n"
-"      *(unsigned short *)&dstY[x_pos * dst_pstride + y_pos * y_stride] = yuv.x;\n"
-"      *(unsigned short *)&dstU[x_pos * dst_pstride + y_pos * u_stride] = yuv.y;\n"
-"      *(unsigned short *)&dstV[x_pos * dst_pstride + y_pos * v_stride] = yuv.z;\n"
-"    } else {\n"
-"      dstY[x_pos * dst_pstride + y_pos * y_stride] = yuv.x;\n"
-"      dstU[x_pos * dst_pstride + y_pos * u_stride] = yuv.y;\n"
-"      dstV[x_pos * dst_pstride + y_pos * v_stride] = yuv.z;\n"
-"    }\n"
-"  }\n"
-"}\n"
-"\n"
-"__global__ void\n"
-GST_CUDA_KERNEL_FUNC_Y444_TO_YUV
-"(cudaTextureObject_t tex0, cudaTextureObject_t tex1, cudaTextureObject_t tex2,\n"
-"    unsigned char *dst0, unsigned char *dst1, unsigned char *dst2,\n"
-"    int stride, int uv_stride)\n"
-"{\n"
-"  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
-"  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
-"  if (x_pos < WIDTH && y_pos < HEIGHT) {\n"
-"    float src_xpos = SCALE_H * x_pos;\n"
-"    float src_ypos = SCALE_V * y_pos;\n"
-"    unsigned short y = tex2D<unsigned short>(tex0, src_xpos, src_ypos);\n"
-"    y = y >> IN_SHIFT;\n"
-"    y = do_scale_pixel (y);\n"
-"    y = y << OUT_SHIFT;\n"
-"    if (OUT_DEPTH > 8) {\n"
-"      *(unsigned short *)&dst0[x_pos * PSTRIDE + y_pos * stride] = (y & MASK);\n"
-"    } else {\n"
-"      dst0[x_pos * PSTRIDE + y_pos * stride] = y;\n"
-"    }\n"
-"  }\n"
-"  if (x_pos < CHROMA_WIDTH && y_pos < CHROMA_HEIGHT) {\n"
-"    float src_xpos = CHROMA_SCALE_H * x_pos;\n"
-"    float src_ypos = CHROMA_SCALE_V * y_pos;\n"
-"    unsigned short u, v;\n"
-"    ushort2 uv;\n"
-"    uv = read_chroma (tex1, tex2, src_xpos, src_ypos);\n"
-"    u = uv.x;\n"
-"    v = uv.y;\n"
-"    u = u >> IN_SHIFT;\n"
-"    v = v >> IN_SHIFT;\n"
-"    u = do_scale_pixel (u);\n"
-"    v = do_scale_pixel (v);\n"
-"    u = u << OUT_SHIFT;\n"
-"    v = v << OUT_SHIFT;\n"
-"    if (SWAP_UV) {\n"
-"      unsigned short tmp = u;\n"
-"      u = v;\n"
-"      v = tmp;\n"
-"    }\n"
-"    write_chroma (dst1,\n"
-"      dst2, u, v, x_pos, y_pos, CHROMA_PSTRIDE, uv_stride, MASK);\n"
-"  }\n"
-"}\n"
-"\n"
-"}";
-
-/* CUDA kernel source for from RGB to RGB conversion and scale */
-static const gchar templ_RGB_to_RGB[] =
-"extern \"C\"{\n"
-"__constant__ float SCALE_H = %s;\n"
-"__constant__ float SCALE_V = %s;\n"
-"__constant__ int WIDTH = %d;\n"
-"__constant__ int HEIGHT = %d;\n"
-"__constant__ int IN_DEPTH = %d;\n"
-"__constant__ int OUT_DEPTH = %d;\n"
-"__constant__ int PSTRIDE = %d;\n"
-"__constant__ int R_IDX = %d;\n"
-"__constant__ int G_IDX = %d;\n"
-"__constant__ int B_IDX = %d;\n"
-"__constant__ int A_IDX = %d;\n"
-"__constant__ int X_IDX = %d;\n"
-"\n"
-"__device__ unsigned short\n"
-"do_scale_pixel (unsigned short val) \n"
-"{\n"
-"  unsigned int diff;\n"
-"  if (OUT_DEPTH > IN_DEPTH) {\n"
-"    diff = OUT_DEPTH - IN_DEPTH;\n"
-"    return (val << diff) | (val >> (IN_DEPTH - diff));\n"
-"  } else if (IN_DEPTH > OUT_DEPTH) {\n"
-"    return val >> (IN_DEPTH - OUT_DEPTH);\n"
-"  }\n"
-"  return val;\n"
-"}\n"
-"\n"
-/* __global__ void
- * GST_CUDA_KERNEL_FUNC_TO_ARGB
- */
-"%s\n"
-"\n"
-/* convert ARGB or ARGB64 to other RGB formats with scale */
-"__global__ void\n"
-GST_CUDA_KERNEL_FUNC_SCALE_RGB
-"(cudaTextureObject_t srcRGB, unsigned char *dstRGB, int dst_stride)\n"
-"{\n"
-"  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
-"  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
-"  if (x_pos < WIDTH && y_pos < HEIGHT) {\n"
-"    float src_xpos = SCALE_H * x_pos;\n"
-"    float src_ypos = SCALE_V * y_pos;\n"
-"    ushort4 argb = tex2D<ushort4>(srcRGB, src_xpos, src_ypos);\n"
-"    argb.x = do_scale_pixel(argb.x);\n"
-"    argb.y = do_scale_pixel(argb.y);\n"
-"    argb.z = do_scale_pixel(argb.z);\n"
-"    argb.w = do_scale_pixel(argb.w);\n"
-     /* FIXME: RGB10A2_LE or BGR10A2_LE only */
-"    if (OUT_DEPTH > 8) {\n"
-"      unsigned int packed_rgb = 0;\n"
-"      unsigned int a, r, g, b;"
-"      a = (argb.x >> 8) & 0x3;\n"
-"      r = argb.y & 0x3ff;\n"
-"      g = argb.z & 0x3ff;\n"
-"      b = argb.w & 0x3ff;\n"
-       /* A is always MSB, we support only little endian system */
-"      packed_rgb = a << 30;\n"
-"      packed_rgb |= (r << (30 - (R_IDX * 10)));\n"
-"      packed_rgb |= (g << (30 - (G_IDX * 10)));\n"
-"      packed_rgb |= (b << (30 - (B_IDX * 10)));\n"
-"      *(unsigned int *)&dstRGB[x_pos * 4 + y_pos * dst_stride] = packed_rgb;\n"
-"    } else {\n"
-"      if (A_IDX >= 0) {\n"
-"        argb.x = do_scale_pixel(argb.x);\n"
-"        dstRGB[x_pos * PSTRIDE + A_IDX + y_pos * dst_stride] = argb.x;\n"
-"      } else if (X_IDX >= 0) {\n"
-"        dstRGB[x_pos * PSTRIDE + X_IDX + y_pos * dst_stride] = 0xff;\n"
-"      }\n"
-"      dstRGB[x_pos * PSTRIDE + R_IDX + y_pos * dst_stride] = argb.y;\n"
-"      dstRGB[x_pos * PSTRIDE + G_IDX + y_pos * dst_stride] = argb.z;\n"
-"      dstRGB[x_pos * PSTRIDE + B_IDX + y_pos * dst_stride] = argb.w;\n"
-"    }\n"
-"  }\n"
-"}\n"
-"\n"
-"}";
-/* *INDENT-ON* */
-
-typedef struct
-{
-  gint R;
-  gint G;
-  gint B;
-  gint A;
-  gint X;
-} GstCudaRGBOrder;
-
-typedef struct
-{
-  CUdeviceptr device_ptr;
-  gsize cuda_stride;
-} GstCudaStageBuffer;
-
-#define CONVERTER_MAX_NUM_FUNC 4
-
-struct _GstCudaConverter
-{
-  GstVideoInfo in_info;
-  GstVideoInfo out_info;
-  gboolean keep_size;
-
-  gint texture_alignment;
-
-  GstCudaContext *cuda_ctx;
-  CUmodule cuda_module;
-  CUfunction kernel_func[CONVERTER_MAX_NUM_FUNC];
-  const gchar *func_names[CONVERTER_MAX_NUM_FUNC];
-  gchar *kernel_source;
-  gchar *ptx;
-  GstCudaStageBuffer fallback_buffer[GST_VIDEO_MAX_PLANES];
-
-  /* *INDENT-OFF* */
-  gboolean (*convert) (GstCudaConverter * convert, GstVideoFrame * src_frame,
-      GstVideoFrame * dst_frame, CUstream cuda_stream);
-  /* *INDENT-ON* */
-
-  const CUdeviceptr src;
-  GstVideoInfo *cur_in_info;
-
-  CUdeviceptr dest;
-  GstVideoInfo *cur_out_info;
-
-  /* rgb to {rgb, yuv} only */
-  GstCudaRGBOrder in_rgb_order;
-  GstCudaStageBuffer unpack_surface;
-  GstCudaStageBuffer y444_surface[GST_VIDEO_MAX_PLANES];
-};
-
-#define LOAD_CUDA_FUNC(module,func,name) G_STMT_START { \
-  if (!gst_cuda_result (CuModuleGetFunction (&(func), (module), name))) { \
-    GST_ERROR ("failed to get %s function", (name)); \
-    goto error; \
-  } \
-} G_STMT_END
-
-/**
- * gst_cuda_converter_new:
- * @in_info: a #GstVideoInfo
- * @out_info: a #GstVideoInfo
- * @cuda_ctx: (transfer none): a #GstCudaContext
- *
- * Create a new converter object to convert between @in_info and @out_info
- * with @config.
- *
- * Returns: a #GstCudaConverter or %NULL if conversion is not possible.
- */
-GstCudaConverter *
-gst_cuda_converter_new (GstVideoInfo * in_info, GstVideoInfo * out_info,
-    GstCudaContext * cuda_ctx)
-{
-  GstCudaConverter *convert;
-  gint i;
-
-  g_return_val_if_fail (in_info != NULL, NULL);
-  g_return_val_if_fail (out_info != NULL, NULL);
-  g_return_val_if_fail (cuda_ctx != NULL, NULL);
-  /* we won't ever do framerate conversion */
-  g_return_val_if_fail (in_info->fps_n == out_info->fps_n, NULL);
-  g_return_val_if_fail (in_info->fps_d == out_info->fps_d, NULL);
-  /* we won't ever do deinterlace */
-  g_return_val_if_fail (in_info->interlace_mode == out_info->interlace_mode,
-      NULL);
-
-  convert = g_new0 (GstCudaConverter, 1);
-
-  convert->in_info = *in_info;
-  convert->out_info = *out_info;
-
-  /* FIXME: should return kernel source */
-  if (!gst_cuda_context_push (cuda_ctx)) {
-    GST_ERROR ("cannot push context");
-    goto error;
-  }
-
-  if (!cuda_converter_lookup_path (convert))
-    goto error;
-
-  convert->ptx = gst_cuda_nvrtc_compile (convert->kernel_source);
-  if (!convert->ptx) {
-    GST_ERROR ("no PTX data to load");
-    goto error;
-  }
-
-  GST_TRACE ("compiled convert ptx \n%s", convert->ptx);
-
-  if (!gst_cuda_result (CuModuleLoadData (&convert->cuda_module, convert->ptx))) {
-    gst_cuda_context_pop (NULL);
-    GST_ERROR ("failed to load cuda module data");
-
-    goto error;
-  }
-
-  for (i = 0; i < CONVERTER_MAX_NUM_FUNC; i++) {
-    if (!convert->func_names[i])
-      break;
-
-    LOAD_CUDA_FUNC (convert->cuda_module, convert->kernel_func[i],
-        convert->func_names[i]);
-    GST_DEBUG ("kernel function \"%s\" loaded", convert->func_names[i]);
-  }
-
-  gst_cuda_context_pop (NULL);
-  convert->cuda_ctx = gst_object_ref (cuda_ctx);
-  convert->texture_alignment =
-      gst_cuda_context_get_texture_alignment (cuda_ctx);
-
-  g_free (convert->kernel_source);
-  g_free (convert->ptx);
-  convert->kernel_source = NULL;
-  convert->ptx = NULL;
-
-  return convert;
-
-error:
-  gst_cuda_context_pop (NULL);
-  gst_cuda_converter_free (convert);
-
-  return NULL;
-}
-
-/**
- * gst_video_converter_free:
- * @convert: a #GstCudaConverter
- *
- * Free @convert
- */
-void
-gst_cuda_converter_free (GstCudaConverter * convert)
-{
-  g_return_if_fail (convert != NULL);
-
-  if (convert->cuda_ctx) {
-    if (gst_cuda_context_push (convert->cuda_ctx)) {
-      gint i;
-
-      if (convert->cuda_module) {
-        gst_cuda_result (CuModuleUnload (convert->cuda_module));
-      }
-
-      for (i = 0; i < GST_VIDEO_MAX_PLANES; i++) {
-        if (convert->fallback_buffer[i].device_ptr)
-          gst_cuda_result (CuMemFree (convert->fallback_buffer[i].device_ptr));
-        if (convert->y444_surface[i].device_ptr)
-          gst_cuda_result (CuMemFree (convert->y444_surface[i].device_ptr));
-      }
-
-      if (convert->unpack_surface.device_ptr)
-        gst_cuda_result (CuMemFree (convert->unpack_surface.device_ptr));
-
-      gst_cuda_context_pop (NULL);
-    }
-
-    gst_object_unref (convert->cuda_ctx);
-  }
-
-  g_free (convert->kernel_source);
-  g_free (convert->ptx);
-  g_free (convert);
-}
-
-gboolean
-gst_cuda_converter_convert_frame (GstCudaConverter * convert,
-    GstVideoFrame * src_frame, GstVideoFrame * dst_frame, CUstream cuda_stream)
-{
-  gboolean ret;
-
-  g_return_val_if_fail (convert, FALSE);
-  g_return_val_if_fail (src_frame, FALSE);
-  g_return_val_if_fail (dst_frame, FALSE);
-
-  gst_cuda_context_push (convert->cuda_ctx);
-
-  ret = convert->convert (convert, src_frame, dst_frame, cuda_stream);
-
-  gst_cuda_context_pop (NULL);
-
-  return ret;
-}
-
-/* allocate fallback memory for texture alignment requirement */
-static gboolean
-convert_ensure_fallback_memory (GstCudaConverter * convert,
-    GstVideoInfo * info, guint plane)
-{
-  CUresult ret;
-  guint element_size = 8;
-
-  if (convert->fallback_buffer[plane].device_ptr)
-    return TRUE;
-
-  if (GST_VIDEO_INFO_COMP_DEPTH (info, 0) > 8)
-    element_size = 16;
-
-  ret = CuMemAllocPitch (&convert->fallback_buffer[plane].device_ptr,
-      &convert->fallback_buffer[plane].cuda_stride,
-      GST_VIDEO_INFO_COMP_WIDTH (info, plane) *
-      GST_VIDEO_INFO_COMP_PSTRIDE (info, plane),
-      GST_VIDEO_INFO_COMP_HEIGHT (info, plane), element_size);
-
-  if (!gst_cuda_result (ret)) {
-    GST_ERROR ("failed to allocated fallback memory");
-    return FALSE;
-  }
-
-  return TRUE;
-}
-
-/* create a 2D CUDA texture without alignment check */
-static CUtexObject
-convert_create_texture_unchecked (const CUdeviceptr src, gint width,
-    gint height, gint channels, gint stride, CUarray_format format,
-    CUfilter_mode mode, CUstream cuda_stream)
-{
-  CUDA_TEXTURE_DESC texture_desc;
-  CUDA_RESOURCE_DESC resource_desc;
-  CUtexObject texture = 0;
-  CUresult cuda_ret;
-
-  memset (&texture_desc, 0, sizeof (CUDA_TEXTURE_DESC));
-  memset (&resource_desc, 0, sizeof (CUDA_RESOURCE_DESC));
-
-  resource_desc.resType = CU_RESOURCE_TYPE_PITCH2D;
-  resource_desc.res.pitch2D.format = format;
-  resource_desc.res.pitch2D.numChannels = channels;
-  resource_desc.res.pitch2D.width = width;
-  resource_desc.res.pitch2D.height = height;
-  resource_desc.res.pitch2D.pitchInBytes = stride;
-  resource_desc.res.pitch2D.devPtr = src;
-
-  texture_desc.filterMode = mode;
-  texture_desc.flags = CU_TRSF_READ_AS_INTEGER;
-
-  gst_cuda_result (CuStreamSynchronize (cuda_stream));
-  cuda_ret = CuTexObjectCreate (&texture, &resource_desc, &texture_desc, NULL);
-
-  if (!gst_cuda_result (cuda_ret)) {
-    GST_ERROR ("couldn't create texture");
-
-    return 0;
-  }
-
-  return texture;
-}
-
-static CUtexObject
-convert_create_texture (GstCudaConverter * convert, GstVideoFrame * src_frame,
-    guint plane, CUstream cuda_stream)
-{
-  CUarray_format format = CU_AD_FORMAT_UNSIGNED_INT8;
-  guint channels = 1;
-  CUdeviceptr src_ptr;
-  gsize stride;
-  CUresult cuda_ret;
-  CUfilter_mode mode;
-
-  if (GST_VIDEO_FRAME_COMP_DEPTH (src_frame, plane) > 8)
-    format = CU_AD_FORMAT_UNSIGNED_INT16;
-
-  /* FIXME: more graceful method ? */
-  if (plane != 0 &&
-      GST_VIDEO_FRAME_N_PLANES (src_frame) !=
-      GST_VIDEO_FRAME_N_COMPONENTS (src_frame)) {
-    channels = 2;
-  }
-
-  src_ptr = (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (src_frame, plane);
-  stride = GST_VIDEO_FRAME_PLANE_STRIDE (src_frame, plane);
-
-  if (convert->texture_alignment && (src_ptr % convert->texture_alignment)) {
-    CUDA_MEMCPY2D copy_params = { 0, };
-
-    if (!convert_ensure_fallback_memory (convert, &src_frame->info, plane))
-      return 0;
-
-    GST_LOG ("device memory was not aligned, copy to fallback memory");
-
-    copy_params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
-    copy_params.srcPitch = stride;
-    copy_params.srcDevice = (CUdeviceptr) src_ptr;
-
-    copy_params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
-    copy_params.dstPitch = convert->fallback_buffer[plane].cuda_stride;
-    copy_params.dstDevice = convert->fallback_buffer[plane].device_ptr;
-    copy_params.WidthInBytes = GST_VIDEO_FRAME_COMP_WIDTH (src_frame, plane)
-        * GST_VIDEO_FRAME_COMP_PSTRIDE (src_frame, plane);
-    copy_params.Height = GST_VIDEO_FRAME_COMP_HEIGHT (src_frame, plane);
-
-    cuda_ret = CuMemcpy2DAsync (&copy_params, cuda_stream);
-    if (!gst_cuda_result (cuda_ret)) {
-      GST_ERROR ("failed to copy to fallback buffer");
-      return 0;
-    }
-
-    src_ptr = convert->fallback_buffer[plane].device_ptr;
-    stride = convert->fallback_buffer[plane].cuda_stride;
-  }
-
-  /* Use h/w linear interpolation only when resize is required.
-   * Otherwise the image might be blurred */
-  if (convert->keep_size)
-    mode = CU_TR_FILTER_MODE_POINT;
-  else
-    mode = CU_TR_FILTER_MODE_LINEAR;
-
-  return convert_create_texture_unchecked (src_ptr,
-      GST_VIDEO_FRAME_COMP_WIDTH (src_frame, plane),
-      GST_VIDEO_FRAME_COMP_HEIGHT (src_frame, plane), channels, stride, format,
-      mode, cuda_stream);
-}
-
-/* main conversion function for YUV to YUV conversion */
-static gboolean
-convert_YUV_TO_YUV (GstCudaConverter * convert, GstVideoFrame * src_frame,
-    GstVideoFrame * dst_frame, CUstream cuda_stream)
-{
-  CUtexObject texture[GST_VIDEO_MAX_PLANES] = { 0, };
-  CUresult cuda_ret;
-  gboolean ret = FALSE;
-  CUdeviceptr dst_ptr[GST_VIDEO_MAX_PLANES] = { 0, };
-  gint dst_stride, dst_uv_stride;
-  gint width, height;
-  gint i;
-
-  gpointer kernel_args[] = { &texture[0], &texture[1], &texture[2],
-    &dst_ptr[0], &dst_ptr[1], &dst_ptr[2], &dst_stride, &dst_uv_stride
-  };
-
-  /* conversion step
-   * STEP 1: create CUtexObject per plane
-   * STEP 2: call YUV to YUV conversion kernel function.
-   *         resize, uv reordering and bitdepth conversion will be performed in
-   *         the CUDA kernel function
-   */
-
-  /* map CUDA device memory to CUDA texture object */
-  for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (src_frame); i++) {
-    texture[i] = convert_create_texture (convert, src_frame, i, cuda_stream);
-    if (!texture[i]) {
-      GST_ERROR ("couldn't create texture for %d th plane", i);
-      goto done;
-    }
-  }
-
-  for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (dst_frame); i++) {
-    dst_ptr[i] = (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (dst_frame, i);
-  }
-
-  dst_stride = GST_VIDEO_FRAME_PLANE_STRIDE (dst_frame, 0);
-  dst_uv_stride = GST_VIDEO_FRAME_PLANE_STRIDE (dst_frame, 1);
-
-  width = GST_VIDEO_FRAME_WIDTH (dst_frame);
-  height = GST_VIDEO_FRAME_HEIGHT (dst_frame);
-
-  cuda_ret =
-      CuLaunchKernel (convert->kernel_func[0], DIV_UP (width, CUDA_BLOCK_X),
-      DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
-      cuda_stream, kernel_args, NULL);
-
-  if (!gst_cuda_result (cuda_ret)) {
-    GST_ERROR ("could not rescale plane");
-    goto done;
-  }
-
-  ret = TRUE;
-  gst_cuda_result (CuStreamSynchronize (cuda_stream));
-
-done:
-  for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (src_frame); i++) {
-    if (texture[i])
-      gst_cuda_result (CuTexObjectDestroy (texture[i]));
-  }
-
-  return ret;
-}
-
-/* main conversion function for YUV to RGB conversion */
-static gboolean
-convert_YUV_TO_RGB (GstCudaConverter * convert, GstVideoFrame * src_frame,
-    GstVideoFrame * dst_frame, CUstream cuda_stream)
-{
-  CUtexObject texture[GST_VIDEO_MAX_PLANES] = { 0, };
-  CUresult cuda_ret;
-  gboolean ret = FALSE;
-  CUdeviceptr dstRGB = 0;
-  gint dst_stride;
-  gint width, height;
-  gint i;
-
-  gpointer kernel_args[] = { &texture[0], &texture[1], &texture[2],
-    &dstRGB, &dst_stride
-  };
-
-  /* conversion step
-   * STEP 1: create CUtexObject per plane
-   * STEP 2: call YUV to RGB conversion kernel function.
-   *         resizing, argb ordering and bitdepth conversion will be performed in
-   *         the CUDA kernel function
-   */
-
-  /* map CUDA device memory to CUDA texture object */
-  for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (src_frame); i++) {
-    texture[i] = convert_create_texture (convert, src_frame, i, cuda_stream);
-    if (!texture[i]) {
-      GST_ERROR ("couldn't create texture for %d th plane", i);
-      goto done;
-    }
-  }
-
-  dstRGB = (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (dst_frame, 0);
-  dst_stride = GST_VIDEO_FRAME_PLANE_STRIDE (dst_frame, 0);
-
-  width = GST_VIDEO_FRAME_WIDTH (dst_frame);
-  height = GST_VIDEO_FRAME_HEIGHT (dst_frame);
-
-  cuda_ret =
-      CuLaunchKernel (convert->kernel_func[0], DIV_UP (width, CUDA_BLOCK_X),
-      DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
-      cuda_stream, kernel_args, NULL);
-
-  if (!gst_cuda_result (cuda_ret)) {
-    GST_ERROR ("could not rescale plane");
-    goto done;
-  }
-
-  ret = TRUE;
-  gst_cuda_result (CuStreamSynchronize (cuda_stream));
-
-done:
-  for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (src_frame); i++) {
-    if (texture[i])
-      gst_cuda_result (CuTexObjectDestroy (texture[i]));
-  }
-
-  return ret;
-}
-
-static gboolean
-convert_UNPACK_RGB (GstCudaConverter * convert, CUfunction kernel_func,
-    CUstream cuda_stream, GstVideoFrame * src_frame,
-    CUdeviceptr dst, gint dst_stride, GstCudaRGBOrder * rgb_order)
-{
-  CUdeviceptr srcRGB = 0;
-  gint width, height;
-  gint src_stride, src_pstride;
-  CUresult cuda_ret;
-
-  gpointer unpack_kernel_args[] = { &srcRGB, &dst,
-    &width, &height,
-    &src_stride, &src_pstride, &dst_stride,
-    &convert->in_rgb_order.R, &convert->in_rgb_order.G,
-    &convert->in_rgb_order.B, &convert->in_rgb_order.A,
-  };
-
-  srcRGB = (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (src_frame, 0);
-  src_stride = GST_VIDEO_FRAME_PLANE_STRIDE (src_frame, 0);
-
-  width = GST_VIDEO_FRAME_WIDTH (src_frame);
-  height = GST_VIDEO_FRAME_HEIGHT (src_frame);
-  src_pstride = GST_VIDEO_FRAME_COMP_PSTRIDE (src_frame, 0);
-
-  cuda_ret =
-      CuLaunchKernel (kernel_func, DIV_UP (width, CUDA_BLOCK_X),
-      DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
-      cuda_stream, unpack_kernel_args, NULL);
-
-  if (!gst_cuda_result (cuda_ret)) {
-    GST_ERROR ("could not unpack rgb");
-    return FALSE;
-  }
-
-  return TRUE;
-}
-
-static gboolean
-convert_TO_Y444 (GstCudaConverter * convert, CUfunction kernel_func,
-    CUstream cuda_stream, CUtexObject srcRGB, CUdeviceptr dstY, gint y_stride,
-    CUdeviceptr dstU, gint u_stride, CUdeviceptr dstV, gint v_stride,
-    gint width, gint height, gint pstride, gint bitdepth)
-{
-  CUresult cuda_ret;
-
-  gpointer kernel_args[] = { &srcRGB, &dstY, &y_stride, &dstU, &u_stride, &dstV,
-    &v_stride, &width, &height, &pstride, &bitdepth,
-  };
-
-  cuda_ret =
-      CuLaunchKernel (kernel_func, DIV_UP (width, CUDA_BLOCK_X),
-      DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
-      cuda_stream, kernel_args, NULL);
-
-  if (!gst_cuda_result (cuda_ret)) {
-    GST_ERROR ("could not unpack rgb");
-    return FALSE;
-  }
-
-  return TRUE;
-}
-
-/* main conversion function for RGB to YUV conversion */
-static gboolean
-convert_RGB_TO_YUV (GstCudaConverter * convert, GstVideoFrame * src_frame,
-    GstVideoFrame * dst_frame, CUstream cuda_stream)
-{
-  CUtexObject texture = 0;
-  CUtexObject yuv_texture[3] = { 0, };
-  CUdeviceptr dst_ptr[GST_VIDEO_MAX_PLANES] = { 0, };
-  CUresult cuda_ret;
-  gboolean ret = FALSE;
-  gint in_width, in_height;
-  gint out_width, out_height;
-  gint dst_stride, dst_uv_stride;
-  CUarray_format format = CU_AD_FORMAT_UNSIGNED_INT8;
-  CUfilter_mode mode = CU_TR_FILTER_MODE_POINT;
-  gint pstride = 1;
-  gint bitdepth = 8;
-  gint i;
-
-  gpointer kernel_args[] = { &yuv_texture[0], &yuv_texture[1], &yuv_texture[2],
-    &dst_ptr[0], &dst_ptr[1], &dst_ptr[2], &dst_stride, &dst_uv_stride
-  };
-
-  /* conversion step
-   * STEP 1: unpack src RGB into ARGB or ARGB64 format
-   * STEP 2: convert unpacked ARGB (or ARGB64) to Y444 (or Y444_16LE)
-   * STEP 3: convert Y444 (or Y444_16LE) to final YUV format.
-   *         resizing, bitdepth conversion, uv reordering will be performed in
-   *         the CUDA kernel function
-   */
-  if (!convert_UNPACK_RGB (convert, convert->kernel_func[0], cuda_stream,
-          src_frame, convert->unpack_surface.device_ptr,
-          convert->unpack_surface.cuda_stride, &convert->in_rgb_order)) {
-    GST_ERROR ("could not unpack input rgb");
-
-    goto done;
-  }
-
-  in_width = GST_VIDEO_FRAME_WIDTH (src_frame);
-  in_height = GST_VIDEO_FRAME_HEIGHT (src_frame);
-
-  out_width = GST_VIDEO_FRAME_WIDTH (dst_frame);
-  out_height = GST_VIDEO_FRAME_HEIGHT (dst_frame);
-  dst_stride = GST_VIDEO_FRAME_PLANE_STRIDE (dst_frame, 0);
-  dst_uv_stride = GST_VIDEO_FRAME_PLANE_STRIDE (dst_frame, 1);
-
-  if (GST_VIDEO_FRAME_COMP_DEPTH (src_frame, 0) > 8) {
-    pstride = 2;
-    bitdepth = 16;
-    format = CU_AD_FORMAT_UNSIGNED_INT16;
-  }
-
-  texture =
-      convert_create_texture_unchecked (convert->unpack_surface.device_ptr,
-      in_width, in_height, 4, convert->unpack_surface.cuda_stride, format,
-      mode, cuda_stream);
-
-  if (!texture) {
-    GST_ERROR ("could not create texture");
-    goto done;
-  }
-
-  if (!convert_TO_Y444 (convert, convert->kernel_func[1], cuda_stream, texture,
-          convert->y444_surface[0].device_ptr,
-          convert->y444_surface[0].cuda_stride,
-          convert->y444_surface[1].device_ptr,
-          convert->y444_surface[1].cuda_stride,
-          convert->y444_surface[2].device_ptr,
-          convert->y444_surface[2].cuda_stride, in_width, in_height, pstride,
-          bitdepth)) {
-    GST_ERROR ("could not convert to Y444 or Y444_16LE");
-    goto done;
-  }
-
-  /* Use h/w linear interpolation only when resize is required.
-   * Otherwise the image might be blurred */
-  if (convert->keep_size)
-    mode = CU_TR_FILTER_MODE_POINT;
-  else
-    mode = CU_TR_FILTER_MODE_LINEAR;
-
-  for (i = 0; i < 3; i++) {
-    yuv_texture[i] =
-        convert_create_texture_unchecked (convert->y444_surface[i].device_ptr,
-        in_width, in_height, 1, convert->y444_surface[i].cuda_stride, format,
-        mode, cuda_stream);
-
-    if (!yuv_texture[i]) {
-      GST_ERROR ("could not create %dth yuv texture", i);
-      goto done;
-    }
-  }
-
-  for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (dst_frame); i++)
-    dst_ptr[i] = (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (dst_frame, i);
-
-  cuda_ret =
-      CuLaunchKernel (convert->kernel_func[2], DIV_UP (out_width, CUDA_BLOCK_X),
-      DIV_UP (out_height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
-      cuda_stream, kernel_args, NULL);
-
-  if (!gst_cuda_result (cuda_ret)) {
-    GST_ERROR ("could not rescale plane");
-    goto done;
-  }
-
-  ret = TRUE;
-  gst_cuda_result (CuStreamSynchronize (cuda_stream));
-
-done:
-  if (texture)
-    gst_cuda_result (CuTexObjectDestroy (texture));
-  for (i = 0; i < 3; i++) {
-    if (yuv_texture[i])
-      gst_cuda_result (CuTexObjectDestroy (yuv_texture[i]));
-  }
-
-  return ret;
-}
-
-/* main conversion function for RGB to RGB conversion */
-static gboolean
-convert_RGB_TO_RGB (GstCudaConverter * convert, GstVideoFrame * src_frame,
-    GstVideoFrame * dst_frame, CUstream cuda_stream)
-{
-  CUtexObject texture = 0;
-  CUresult cuda_ret;
-  gboolean ret = FALSE;
-  CUdeviceptr dstRGB = 0;
-  gint in_width, in_height;
-  gint out_width, out_height;
-  gint dst_stride;
-  CUfilter_mode mode;
-  CUarray_format format = CU_AD_FORMAT_UNSIGNED_INT8;
-
-  gpointer rescale_kernel_args[] = { &texture, &dstRGB, &dst_stride };
-
-  /* conversion step
-   * STEP 1: unpack src RGB into ARGB or ARGB64 format
-   * STEP 2: convert ARGB (or ARGB64) to final RGB format.
-   *         resizing, bitdepth conversion, argb reordering will be performed in
-   *         the CUDA kernel function
-   */
-
-  if (!convert_UNPACK_RGB (convert, convert->kernel_func[0], cuda_stream,
-          src_frame, convert->unpack_surface.device_ptr,
-          convert->unpack_surface.cuda_stride, &convert->in_rgb_order)) {
-    GST_ERROR ("could not unpack input rgb");
-
-    goto done;
-  }
-
-  in_width = GST_VIDEO_FRAME_WIDTH (src_frame);
-  in_height = GST_VIDEO_FRAME_HEIGHT (src_frame);
-
-  out_width = GST_VIDEO_FRAME_WIDTH (dst_frame);
-  out_height = GST_VIDEO_FRAME_HEIGHT (dst_frame);
-
-  dstRGB = (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (dst_frame, 0);
-  dst_stride = GST_VIDEO_FRAME_PLANE_STRIDE (dst_frame, 0);
-
-  if (GST_VIDEO_FRAME_COMP_DEPTH (src_frame, 0) > 8)
-    format = CU_AD_FORMAT_UNSIGNED_INT16;
-
-  /* Use h/w linear interpolation only when resize is required.
-   * Otherwise the image might be blurred */
-  if (convert->keep_size)
-    mode = CU_TR_FILTER_MODE_POINT;
-  else
-    mode = CU_TR_FILTER_MODE_LINEAR;
-
-  texture =
-      convert_create_texture_unchecked (convert->unpack_surface.device_ptr,
-      in_width, in_height, 4, convert->unpack_surface.cuda_stride, format,
-      mode, cuda_stream);
-
-  if (!texture) {
-    GST_ERROR ("could not create texture");
-    goto done;
-  }
-
-  cuda_ret =
-      CuLaunchKernel (convert->kernel_func[1], DIV_UP (out_width, CUDA_BLOCK_X),
-      DIV_UP (out_height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
-      cuda_stream, rescale_kernel_args, NULL);
-
-  if (!gst_cuda_result (cuda_ret)) {
-    GST_ERROR ("could not rescale plane");
-    goto done;
-  }
-
-  ret = TRUE;
-  gst_cuda_result (CuStreamSynchronize (cuda_stream));
-
-done:
-  if (texture)
-    gst_cuda_result (CuTexObjectDestroy (texture));
-
-  return ret;
-}
-
-/* from video-converter.c */
-typedef struct
-{
-  gdouble dm[4][4];
-} MatrixData;
-
-static void
-color_matrix_set_identity (MatrixData * m)
-{
-  gint i, j;
-
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      m->dm[i][j] = (i == j);
-    }
-  }
-}
-
-static void
-color_matrix_copy (MatrixData * d, const MatrixData * s)
-{
-  gint i, j;
-
-  for (i = 0; i < 4; i++)
-    for (j = 0; j < 4; j++)
-      d->dm[i][j] = s->dm[i][j];
-}
-
-/* Perform 4x4 matrix multiplication:
- *  - @dst@ = @a@ * @b@
- *  - @dst@ may be a pointer to @a@ andor @b@
- */
-static void
-color_matrix_multiply (MatrixData * dst, MatrixData * a, MatrixData * b)
-{
-  MatrixData tmp;
-  gint i, j, k;
-
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      gdouble x = 0;
-      for (k = 0; k < 4; k++) {
-        x += a->dm[i][k] * b->dm[k][j];
-      }
-      tmp.dm[i][j] = x;
-    }
-  }
-  color_matrix_copy (dst, &tmp);
-}
-
-static void
-color_matrix_offset_components (MatrixData * m, gdouble a1, gdouble a2,
-    gdouble a3)
-{
-  MatrixData a;
-
-  color_matrix_set_identity (&a);
-  a.dm[0][3] = a1;
-  a.dm[1][3] = a2;
-  a.dm[2][3] = a3;
-  color_matrix_multiply (m, &a, m);
-}
-
-static void
-color_matrix_scale_components (MatrixData * m, gdouble a1, gdouble a2,
-    gdouble a3)
-{
-  MatrixData a;
-
-  color_matrix_set_identity (&a);
-  a.dm[0][0] = a1;
-  a.dm[1][1] = a2;
-  a.dm[2][2] = a3;
-  color_matrix_multiply (m, &a, m);
-}
-
-static void
-color_matrix_debug (const MatrixData * s)
-{
-  GST_DEBUG ("[%f %f %f %f]", s->dm[0][0], s->dm[0][1], s->dm[0][2],
-      s->dm[0][3]);
-  GST_DEBUG ("[%f %f %f %f]", s->dm[1][0], s->dm[1][1], s->dm[1][2],
-      s->dm[1][3]);
-  GST_DEBUG ("[%f %f %f %f]", s->dm[2][0], s->dm[2][1], s->dm[2][2],
-      s->dm[2][3]);
-  GST_DEBUG ("[%f %f %f %f]", s->dm[3][0], s->dm[3][1], s->dm[3][2],
-      s->dm[3][3]);
-}
-
-static void
-color_matrix_YCbCr_to_RGB (MatrixData * m, gdouble Kr, gdouble Kb)
-{
-  gdouble Kg = 1.0 - Kr - Kb;
-  MatrixData k = {
-    {
-          {1., 0., 2 * (1 - Kr), 0.},
-          {1., -2 * Kb * (1 - Kb) / Kg, -2 * Kr * (1 - Kr) / Kg, 0.},
-          {1., 2 * (1 - Kb), 0., 0.},
-          {0., 0., 0., 1.},
-        }
-  };
-
-  color_matrix_multiply (m, &k, m);
-}
-
-static void
-color_matrix_RGB_to_YCbCr (MatrixData * m, gdouble Kr, gdouble Kb)
-{
-  gdouble Kg = 1.0 - Kr - Kb;
-  MatrixData k;
-  gdouble x;
-
-  k.dm[0][0] = Kr;
-  k.dm[0][1] = Kg;
-  k.dm[0][2] = Kb;
-  k.dm[0][3] = 0;
-
-  x = 1 / (2 * (1 - Kb));
-  k.dm[1][0] = -x * Kr;
-  k.dm[1][1] = -x * Kg;
-  k.dm[1][2] = x * (1 - Kb);
-  k.dm[1][3] = 0;
-
-  x = 1 / (2 * (1 - Kr));
-  k.dm[2][0] = x * (1 - Kr);
-  k.dm[2][1] = -x * Kg;
-  k.dm[2][2] = -x * Kb;
-  k.dm[2][3] = 0;
-
-  k.dm[3][0] = 0;
-  k.dm[3][1] = 0;
-  k.dm[3][2] = 0;
-  k.dm[3][3] = 1;
-
-  color_matrix_multiply (m, &k, m);
-}
-
-static void
-compute_matrix_to_RGB (GstCudaConverter * convert, MatrixData * data,
-    GstVideoInfo * info)
-{
-  gdouble Kr = 0, Kb = 0;
-  gint offset[4], scale[4];
-
-  /* bring color components to [0..1.0] range */
-  gst_video_color_range_offsets (info->colorimetry.range, info->finfo, offset,
-      scale);
-
-  color_matrix_offset_components (data, -offset[0], -offset[1], -offset[2]);
-  color_matrix_scale_components (data, 1 / ((float) scale[0]),
-      1 / ((float) scale[1]), 1 / ((float) scale[2]));
-
-  if (!GST_VIDEO_INFO_IS_RGB (info)) {
-    /* bring components to R'G'B' space */
-    if (gst_video_color_matrix_get_Kr_Kb (info->colorimetry.matrix, &Kr, &Kb))
-      color_matrix_YCbCr_to_RGB (data, Kr, Kb);
-  }
-  color_matrix_debug (data);
-}
-
-static void
-compute_matrix_to_YUV (GstCudaConverter * convert, MatrixData * data,
-    GstVideoInfo * info)
-{
-  gdouble Kr = 0, Kb = 0;
-  gint offset[4], scale[4];
-
-  if (!GST_VIDEO_INFO_IS_RGB (info)) {
-    /* bring components to YCbCr space */
-    if (gst_video_color_matrix_get_Kr_Kb (info->colorimetry.matrix, &Kr, &Kb))
-      color_matrix_RGB_to_YCbCr (data, Kr, Kb);
-  }
-
-  /* bring color components to nominal range */
-  gst_video_color_range_offsets (info->colorimetry.range, info->finfo, offset,
-      scale);
-
-  color_matrix_scale_components (data, (float) scale[0], (float) scale[1],
-      (float) scale[2]);
-  color_matrix_offset_components (data, offset[0], offset[1], offset[2]);
-
-  color_matrix_debug (data);
-}
-
-static gboolean
-cuda_converter_get_matrix (GstCudaConverter * convert, MatrixData * matrix,
-    GstVideoInfo * in_info, GstVideoInfo * out_info)
-{
-  gboolean same_matrix, same_bits;
-  guint in_bits, out_bits;
-
-  in_bits = GST_VIDEO_INFO_COMP_DEPTH (in_info, 0);
-  out_bits = GST_VIDEO_INFO_COMP_DEPTH (out_info, 0);
-
-  same_bits = in_bits == out_bits;
-  same_matrix = in_info->colorimetry.matrix == out_info->colorimetry.matrix;
-
-  GST_DEBUG ("matrix %d -> %d (%d)", in_info->colorimetry.matrix,
-      out_info->colorimetry.matrix, same_matrix);
-  GST_DEBUG ("bits %d -> %d (%d)", in_bits, out_bits, same_bits);
-
-  color_matrix_set_identity (matrix);
-
-  if (same_bits && same_matrix) {
-    GST_DEBUG ("conversion matrix is not required");
-
-    return FALSE;
-  }
-
-  if (in_bits < out_bits) {
-    gint scale = 1 << (out_bits - in_bits);
-    color_matrix_scale_components (matrix,
-        1 / (float) scale, 1 / (float) scale, 1 / (float) scale);
-  }
-
-  GST_DEBUG ("to RGB matrix");
-  compute_matrix_to_RGB (convert, matrix, in_info);
-  GST_DEBUG ("current matrix");
-  color_matrix_debug (matrix);
-
-  GST_DEBUG ("to YUV matrix");
-  compute_matrix_to_YUV (convert, matrix, out_info);
-  GST_DEBUG ("current matrix");
-  color_matrix_debug (matrix);
-
-  if (in_bits > out_bits) {
-    gint scale = 1 << (in_bits - out_bits);
-    color_matrix_scale_components (matrix,
-        (float) scale, (float) scale, (float) scale);
-  }
-
-  GST_DEBUG ("final matrix");
-  color_matrix_debug (matrix);
-
-  return TRUE;
-}
-
-static gboolean
-is_uv_swapped (GstVideoFormat format)
-{
-  static GstVideoFormat swapped_formats[] = {
-    GST_VIDEO_FORMAT_YV12,
-    GST_VIDEO_FORMAT_NV21,
-  };
-  gint i;
-
-  for (i = 0; i < G_N_ELEMENTS (swapped_formats); i++) {
-    if (format == swapped_formats[i])
-      return TRUE;
-  }
-
-  return FALSE;
-}
-
-typedef struct
-{
-  const gchar *read_chroma;
-  const gchar *write_chroma;
-  const gchar *unpack_function;
-  gfloat scale_h, scale_v;
-  gfloat chroma_scale_h, chroma_scale_v;
-  gint width, height;
-  gint chroma_width, chroma_height;
-  gint in_depth;
-  gint out_depth;
-  gint pstride, chroma_pstride;
-  gint in_shift, out_shift;
-  gint mask;
-  gint swap_uv;
-  /* RGBA specific variables */
-  gint max_in_val;
-  GstCudaRGBOrder rgb_order;
-} GstCudaKernelTempl;
-
-static gchar *
-cuda_converter_generate_yuv_to_yuv_kernel_code (GstCudaConverter * convert,
-    GstCudaKernelTempl * templ)
-{
-  gchar scale_h_str[G_ASCII_DTOSTR_BUF_SIZE];
-  gchar scale_v_str[G_ASCII_DTOSTR_BUF_SIZE];
-  gchar chroma_scale_h_str[G_ASCII_DTOSTR_BUF_SIZE];
-  gchar chroma_scale_v_str[G_ASCII_DTOSTR_BUF_SIZE];
-  g_ascii_formatd (scale_h_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_h);
-  g_ascii_formatd (scale_v_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_v);
-  g_ascii_formatd (chroma_scale_h_str, G_ASCII_DTOSTR_BUF_SIZE, "%f",
-      templ->chroma_scale_h);
-  g_ascii_formatd (chroma_scale_v_str, G_ASCII_DTOSTR_BUF_SIZE, "%f",
-      templ->chroma_scale_v);
-  return g_strdup_printf (templ_YUV_TO_YUV, scale_h_str, scale_v_str,
-      chroma_scale_h_str, chroma_scale_v_str, templ->width, templ->height,
-      templ->chroma_width, templ->chroma_height, templ->in_depth,
-      templ->out_depth, templ->pstride, templ->chroma_pstride, templ->in_shift,
-      templ->out_shift, templ->mask, templ->swap_uv, templ->read_chroma,
-      templ->write_chroma);
-}
-
-static gchar *
-cuda_converter_generate_yuv_to_rgb_kernel_code (GstCudaConverter * convert,
-    GstCudaKernelTempl * templ, MatrixData * matrix)
-{
-  gchar matrix_dm[4][4][G_ASCII_DTOSTR_BUF_SIZE];
-  gchar scale_h_str[G_ASCII_DTOSTR_BUF_SIZE];
-  gchar scale_v_str[G_ASCII_DTOSTR_BUF_SIZE];
-  gchar chroma_scale_h_str[G_ASCII_DTOSTR_BUF_SIZE];
-  gchar chroma_scale_v_str[G_ASCII_DTOSTR_BUF_SIZE];
-  gint i, j;
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      g_ascii_formatd (matrix_dm[i][j], G_ASCII_DTOSTR_BUF_SIZE, "%f",
-          matrix->dm[i][j]);
-    }
-  }
-  g_ascii_formatd (scale_h_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_h);
-  g_ascii_formatd (scale_v_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_v);
-  g_ascii_formatd (chroma_scale_h_str, G_ASCII_DTOSTR_BUF_SIZE, "%f",
-      templ->chroma_scale_h);
-  g_ascii_formatd (chroma_scale_v_str, G_ASCII_DTOSTR_BUF_SIZE, "%f",
-      templ->chroma_scale_v);
-  return g_strdup_printf (templ_YUV_TO_RGB, matrix_dm[0][3], matrix_dm[1][3],
-      matrix_dm[2][3], matrix_dm[0][0], matrix_dm[0][1], matrix_dm[0][2],
-      matrix_dm[1][0], matrix_dm[1][1], matrix_dm[1][2], matrix_dm[2][0],
-      matrix_dm[2][1], matrix_dm[2][2], scale_h_str, scale_v_str,
-      chroma_scale_h_str, chroma_scale_v_str, templ->width, templ->height,
-      templ->chroma_width, templ->chroma_height, templ->in_depth,
-      templ->out_depth, templ->pstride, templ->chroma_pstride, templ->in_shift,
-      templ->out_shift, templ->mask, templ->swap_uv, templ->max_in_val,
-      templ->rgb_order.R, templ->rgb_order.G, templ->rgb_order.B,
-      templ->rgb_order.A, templ->rgb_order.X, templ->read_chroma);
-}
-
-static gchar *
-cuda_converter_generate_rgb_to_yuv_kernel_code (GstCudaConverter * convert,
-    GstCudaKernelTempl * templ, MatrixData * matrix)
-{
-  gchar matrix_dm[4][4][G_ASCII_DTOSTR_BUF_SIZE];
-  gchar scale_h_str[G_ASCII_DTOSTR_BUF_SIZE];
-  gchar scale_v_str[G_ASCII_DTOSTR_BUF_SIZE];
-  gchar chroma_scale_h_str[G_ASCII_DTOSTR_BUF_SIZE];
-  gchar chroma_scale_v_str[G_ASCII_DTOSTR_BUF_SIZE];
-  gint i, j;
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      g_ascii_formatd (matrix_dm[i][j], G_ASCII_DTOSTR_BUF_SIZE, "%f",
-          matrix->dm[i][j]);
-    }
-  }
-  g_ascii_formatd (scale_h_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_h);
-  g_ascii_formatd (scale_v_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_v);
-  g_ascii_formatd (chroma_scale_h_str, G_ASCII_DTOSTR_BUF_SIZE, "%f",
-      templ->chroma_scale_h);
-  g_ascii_formatd (chroma_scale_v_str, G_ASCII_DTOSTR_BUF_SIZE, "%f",
-      templ->chroma_scale_v);
-  return g_strdup_printf (templ_RGB_TO_YUV, matrix_dm[0][3], matrix_dm[1][3],
-      matrix_dm[2][3], matrix_dm[0][0], matrix_dm[0][1], matrix_dm[0][2],
-      matrix_dm[1][0], matrix_dm[1][1], matrix_dm[1][2], matrix_dm[2][0],
-      matrix_dm[2][1], matrix_dm[2][2], scale_h_str, scale_v_str,
-      chroma_scale_h_str, chroma_scale_v_str, templ->width, templ->height,
-      templ->chroma_width, templ->chroma_height, templ->in_depth,
-      templ->out_depth, templ->pstride, templ->chroma_pstride, templ->in_shift,
-      templ->out_shift, templ->mask, templ->swap_uv, templ->unpack_function,
-      templ->read_chroma, templ->write_chroma);
-}
-
-static gchar *
-cuda_converter_generate_rgb_to_rgb_kernel_code (GstCudaConverter * convert,
-    GstCudaKernelTempl * templ)
-{
-  gchar scale_h_str[G_ASCII_DTOSTR_BUF_SIZE];
-  gchar scale_v_str[G_ASCII_DTOSTR_BUF_SIZE];
-  g_ascii_formatd (scale_h_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_h);
-  g_ascii_formatd (scale_v_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_v);
-  return g_strdup_printf (templ_RGB_to_RGB,
-      scale_h_str, scale_v_str,
-      templ->width, templ->height,
-      templ->in_depth, templ->out_depth, templ->pstride,
-      templ->rgb_order.R, templ->rgb_order.G,
-      templ->rgb_order.B, templ->rgb_order.A, templ->rgb_order.X,
-      templ->unpack_function);
-}
-
-#define SET_ORDER(o,r,g,b,a,x) G_STMT_START { \
-  (o)->R = (r); \
-  (o)->G = (g); \
-  (o)->B = (b); \
-  (o)->A = (a); \
-  (o)->X = (x); \
-} G_STMT_END
-
-static void
-cuda_converter_get_rgb_order (GstVideoFormat format, GstCudaRGBOrder * order)
-{
-  switch (format) {
-    case GST_VIDEO_FORMAT_RGBA:
-      SET_ORDER (order, 0, 1, 2, 3, -1);
-      break;
-    case GST_VIDEO_FORMAT_RGBx:
-      SET_ORDER (order, 0, 1, 2, -1, 3);
-      break;
-    case GST_VIDEO_FORMAT_BGRA:
-      SET_ORDER (order, 2, 1, 0, 3, -1);
-      break;
-    case GST_VIDEO_FORMAT_BGRx:
-      SET_ORDER (order, 2, 1, 0, -1, 3);
-      break;
-    case GST_VIDEO_FORMAT_ARGB:
-      SET_ORDER (order, 1, 2, 3, 0, -1);
-      break;
-    case GST_VIDEO_FORMAT_ABGR:
-      SET_ORDER (order, 3, 2, 1, 0, -1);
-      break;
-    case GST_VIDEO_FORMAT_RGB:
-      SET_ORDER (order, 0, 1, 2, -1, -1);
-      break;
-    case GST_VIDEO_FORMAT_BGR:
-      SET_ORDER (order, 2, 1, 0, -1, -1);
-      break;
-    case GST_VIDEO_FORMAT_BGR10A2_LE:
-      SET_ORDER (order, 1, 2, 3, 0, -1);
-      break;
-    case GST_VIDEO_FORMAT_RGB10A2_LE:
-      SET_ORDER (order, 3, 2, 1, 0, -1);
-      break;
-    default:
-      g_assert_not_reached ();
-      break;
-  }
-}
-
-static gboolean
-cuda_converter_lookup_path (GstCudaConverter * convert)
-{
-  GstVideoFormat in_format, out_format;
-  gboolean src_yuv, dst_yuv;
-  gboolean src_planar, dst_planar;
-  GstCudaKernelTempl templ = { 0, };
-  GstVideoInfo *in_info, *out_info;
-  gboolean ret = FALSE;
-  CUresult cuda_ret;
-
-  in_info = &convert->in_info;
-  out_info = &convert->out_info;
-
-  in_format = GST_VIDEO_INFO_FORMAT (in_info);
-  out_format = GST_VIDEO_INFO_FORMAT (out_info);
-
-  src_yuv = GST_VIDEO_INFO_IS_YUV (in_info);
-  dst_yuv = GST_VIDEO_INFO_IS_YUV (out_info);
-
-  src_planar = GST_VIDEO_INFO_N_PLANES (in_info) ==
-      GST_VIDEO_INFO_N_COMPONENTS (in_info);
-  dst_planar = GST_VIDEO_INFO_N_PLANES (out_info) ==
-      GST_VIDEO_INFO_N_COMPONENTS (out_info);
-
-  convert->keep_size = (GST_VIDEO_INFO_WIDTH (&convert->in_info) ==
-      GST_VIDEO_INFO_WIDTH (&convert->out_info) &&
-      GST_VIDEO_INFO_HEIGHT (&convert->in_info) ==
-      GST_VIDEO_INFO_HEIGHT (&convert->out_info));
-
-  templ.scale_h = (gfloat) GST_VIDEO_INFO_COMP_WIDTH (in_info, 0) /
-      (gfloat) GST_VIDEO_INFO_COMP_WIDTH (out_info, 0);
-  templ.scale_v = (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (in_info, 0) /
-      (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (out_info, 0);
-  templ.chroma_scale_h = (gfloat) GST_VIDEO_INFO_COMP_WIDTH (in_info, 1) /
-      (gfloat) GST_VIDEO_INFO_COMP_WIDTH (out_info, 1);
-  templ.chroma_scale_v = (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (in_info, 1) /
-      (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (out_info, 1);
-  templ.width = GST_VIDEO_INFO_COMP_WIDTH (out_info, 0);
-  templ.height = GST_VIDEO_INFO_COMP_HEIGHT (out_info, 0);
-  templ.chroma_width = GST_VIDEO_INFO_COMP_WIDTH (out_info, 1);
-  templ.chroma_height = GST_VIDEO_INFO_COMP_HEIGHT (out_info, 1);
-
-  templ.in_depth = GST_VIDEO_INFO_COMP_DEPTH (in_info, 0);
-  templ.out_depth = GST_VIDEO_INFO_COMP_DEPTH (out_info, 0);
-  templ.pstride = GST_VIDEO_INFO_COMP_PSTRIDE (out_info, 0);
-  templ.chroma_pstride = GST_VIDEO_INFO_COMP_PSTRIDE (out_info, 1);
-  templ.in_shift = in_info->finfo->shift[0];
-  templ.out_shift = out_info->finfo->shift[0];
-  templ.mask = ((1 << templ.out_depth) - 1) << templ.out_shift;
-  templ.swap_uv = (is_uv_swapped (in_format) != is_uv_swapped (out_format));
-
-  if (src_yuv && dst_yuv) {
-    convert->convert = convert_YUV_TO_YUV;
-
-    if (src_planar && dst_planar) {
-      templ.read_chroma = READ_CHROMA_FROM_PLANAR;
-      templ.write_chroma = WRITE_CHROMA_TO_PLANAR;
-    } else if (!src_planar && dst_planar) {
-      templ.read_chroma = READ_CHROMA_FROM_SEMI_PLANAR;
-      templ.write_chroma = WRITE_CHROMA_TO_PLANAR;
-    } else if (src_planar && !dst_planar) {
-      templ.read_chroma = READ_CHROMA_FROM_PLANAR;
-      templ.write_chroma = WRITE_CHROMA_TO_SEMI_PLANAR;
-    } else {
-      templ.read_chroma = READ_CHROMA_FROM_SEMI_PLANAR;
-      templ.write_chroma = WRITE_CHROMA_TO_SEMI_PLANAR;
-    }
-
-    convert->kernel_source =
-        cuda_converter_generate_yuv_to_yuv_kernel_code (convert, &templ);
-    convert->func_names[0] = GST_CUDA_KERNEL_FUNC;
-
-    ret = TRUE;
-  } else if (src_yuv && !dst_yuv) {
-    MatrixData matrix;
-
-    if (src_planar) {
-      templ.read_chroma = READ_CHROMA_FROM_PLANAR;
-    } else {
-      templ.read_chroma = READ_CHROMA_FROM_SEMI_PLANAR;
-    }
-
-    templ.max_in_val = (1 << templ.in_depth) - 1;
-    cuda_converter_get_rgb_order (out_format, &templ.rgb_order);
-
-    cuda_converter_get_matrix (convert, &matrix, in_info, out_info);
-    convert->kernel_source =
-        cuda_converter_generate_yuv_to_rgb_kernel_code (convert,
-        &templ, &matrix);
-    convert->func_names[0] = GST_CUDA_KERNEL_FUNC;
-
-    convert->convert = convert_YUV_TO_RGB;
-
-    ret = TRUE;
-  } else if (!src_yuv && dst_yuv) {
-    MatrixData matrix;
-    gsize element_size = 8;
-    GstVideoFormat unpack_format;
-    GstVideoFormat y444_format;
-    GstVideoInfo unpack_info;
-    GstVideoInfo y444_info;
-    gint i;
-
-    if (dst_planar) {
-      templ.write_chroma = WRITE_CHROMA_TO_PLANAR;
-    } else {
-      templ.write_chroma = WRITE_CHROMA_TO_SEMI_PLANAR;
-    }
-    templ.read_chroma = READ_CHROMA_FROM_PLANAR;
-
-    cuda_converter_get_rgb_order (in_format, &convert->in_rgb_order);
-
-    if (templ.in_depth > 8) {
-      /* FIXME: RGB10A2_LE and BGR10A2_LE only */
-      element_size = 16;
-      unpack_format = GST_VIDEO_FORMAT_ARGB64;
-      y444_format = GST_VIDEO_FORMAT_Y444_16LE;
-      templ.unpack_function = unpack_to_ARGB64;
-    } else {
-      unpack_format = GST_VIDEO_FORMAT_ARGB;
-      y444_format = GST_VIDEO_FORMAT_Y444;
-      templ.unpack_function = unpack_to_ARGB;
-    }
-
-    gst_video_info_set_format (&unpack_info,
-        unpack_format, GST_VIDEO_INFO_WIDTH (in_info),
-        GST_VIDEO_INFO_HEIGHT (in_info));
-    gst_video_info_set_format (&y444_info,
-        y444_format, GST_VIDEO_INFO_WIDTH (in_info),
-        GST_VIDEO_INFO_HEIGHT (in_info));
-
-    templ.in_depth = GST_VIDEO_INFO_COMP_DEPTH (&unpack_info, 0);
-
-    cuda_ret = CuMemAllocPitch (&convert->unpack_surface.device_ptr,
-        &convert->unpack_surface.cuda_stride,
-        GST_VIDEO_INFO_COMP_WIDTH (&unpack_info, 0) *
-        GST_VIDEO_INFO_COMP_PSTRIDE (&unpack_info, 0),
-        GST_VIDEO_INFO_HEIGHT (&unpack_info), element_size);
-
-    if (!gst_cuda_result (cuda_ret)) {
-      GST_ERROR ("couldn't alloc unpack surface");
-      return FALSE;
-    }
-
-    for (i = 0; i < 3; i++) {
-      cuda_ret = CuMemAllocPitch (&convert->y444_surface[i].device_ptr,
-          &convert->y444_surface[i].cuda_stride,
-          GST_VIDEO_INFO_COMP_WIDTH (&y444_info, i) *
-          GST_VIDEO_INFO_COMP_PSTRIDE (&y444_info, i),
-          GST_VIDEO_INFO_COMP_HEIGHT (&y444_info, i), element_size);
-
-      if (!gst_cuda_result (cuda_ret)) {
-        GST_ERROR ("couldn't alloc %dth y444 surface", i);
-        return FALSE;
-      }
-    }
-
-    cuda_converter_get_matrix (convert, &matrix, &unpack_info, &y444_info);
-
-    convert->kernel_source =
-        cuda_converter_generate_rgb_to_yuv_kernel_code (convert,
-        &templ, &matrix);
-
-    convert->func_names[0] = GST_CUDA_KERNEL_FUNC_TO_ARGB;
-    convert->func_names[1] = GST_CUDA_KERNEL_FUNC_TO_Y444;
-    convert->func_names[2] = GST_CUDA_KERNEL_FUNC_Y444_TO_YUV;
-
-    convert->convert = convert_RGB_TO_YUV;
-
-    ret = TRUE;
-  } else {
-    gsize element_size = 8;
-    GstVideoFormat unpack_format;
-    GstVideoInfo unpack_info;
-
-    cuda_converter_get_rgb_order (in_format, &convert->in_rgb_order);
-    cuda_converter_get_rgb_order (out_format, &templ.rgb_order);
-
-    if (templ.in_depth > 8) {
-      /* FIXME: RGB10A2_LE and BGR10A2_LE only */
-      element_size = 16;
-      unpack_format = GST_VIDEO_FORMAT_ARGB64;
-      templ.unpack_function = unpack_to_ARGB64;
-    } else {
-      unpack_format = GST_VIDEO_FORMAT_ARGB;
-      templ.unpack_function = unpack_to_ARGB;
-    }
-
-    gst_video_info_set_format (&unpack_info,
-        unpack_format, GST_VIDEO_INFO_WIDTH (in_info),
-        GST_VIDEO_INFO_HEIGHT (in_info));
-
-    templ.in_depth = GST_VIDEO_INFO_COMP_DEPTH (&unpack_info, 0);
-
-    cuda_ret = CuMemAllocPitch (&convert->unpack_surface.device_ptr,
-        &convert->unpack_surface.cuda_stride,
-        GST_VIDEO_INFO_COMP_WIDTH (&unpack_info, 0) *
-        GST_VIDEO_INFO_COMP_PSTRIDE (&unpack_info, 0),
-        GST_VIDEO_INFO_HEIGHT (&unpack_info), element_size);
-
-    if (!gst_cuda_result (cuda_ret)) {
-      GST_ERROR ("couldn't alloc unpack surface");
-      return FALSE;
-    }
-
-    convert->kernel_source =
-        cuda_converter_generate_rgb_to_rgb_kernel_code (convert, &templ);
-
-    convert->func_names[0] = GST_CUDA_KERNEL_FUNC_TO_ARGB;
-    convert->func_names[1] = GST_CUDA_KERNEL_FUNC_SCALE_RGB;
-
-    convert->convert = convert_RGB_TO_RGB;
-
-    ret = TRUE;
-  }
-
-  if (!ret) {
-    GST_DEBUG ("no path found");
-
-    return FALSE;
-  }
-
-  GST_TRACE ("configured CUDA kernel source\n%s", convert->kernel_source);
-
-  return TRUE;
-}
diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/cuda-converter.h b/subprojects/gst-plugins-bad/sys/nvcodec/cuda-converter.h
deleted file mode 100644
index 82c5f169eb..0000000000
--- a/subprojects/gst-plugins-bad/sys/nvcodec/cuda-converter.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* GStreamer
- * Copyright (C) 2019 Seungha Yang <seungha.yang@navercorp.com>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Library General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Library General Public License for more details.
- *
- * You should have received a copy of the GNU Library General Public
- * License along with this library; if not, write to the
- * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
- * Boston, MA 02110-1301, USA.
- */
-
-#ifndef __GST_CUDA_CONVERTER_H__
-#define __GST_CUDA_CONVERTER_H__
-
-#include <gst/video/video.h>
-#include <gst/cuda/gstcudacontext.h>
-#include <gst/cuda/gstcudamemory.h>
-
-G_BEGIN_DECLS
-
-typedef struct _GstCudaConverter GstCudaConverter;
-
-GstCudaConverter *    gst_cuda_converter_new           (GstVideoInfo * in_info,
-                                                        GstVideoInfo * out_info,
-                                                        GstCudaContext * cuda_ctx);
-
-void                 gst_cuda_converter_free           (GstCudaConverter * convert);
-
-gboolean             gst_cuda_converter_convert_frame  (GstCudaConverter * convert,
-                                                        GstVideoFrame * src_frame,
-                                                        GstVideoFrame * dst_frame,
-                                                        CUstream cuda_stream);
-
-G_END_DECLS
-
-#endif /* __GST_CUDA_CONVERTER_H__ */
diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconverter.c b/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconverter.c
new file mode 100644
index 0000000000..a7ea73d141
--- /dev/null
+++ b/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconverter.c
@@ -0,0 +1,2243 @@
+/* GStreamer
+ * Copyright (C) 2022 Seungha Yang <seungha@centricular.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "gstcudaconverter.h"
+#include <gst/cuda/gstcudautils.h>
+#include <gst/cuda/gstcudaloader.h>
+#include <gst/cuda/gstcudanvrtc.h>
+#include <string.h>
+
+GST_DEBUG_CATEGORY_STATIC (gst_cuda_converter_debug);
+#define GST_CAT_DEFAULT gst_cuda_converter_debug
+
+#define CUDA_BLOCK_X 16
+#define CUDA_BLOCK_Y 16
+#define DIV_UP(size,block) (((size) + ((block) - 1)) / (block))
+
+/* from GstD3D11 */
+typedef struct _GstCudaColorMatrix
+{
+  gdouble matrix[3][3];
+  gdouble offset[3];
+  gdouble min[3];
+  gdouble max[3];
+} GstCudaColorMatrix;
+
+static gchar *
+gst_cuda_dump_color_matrix (GstCudaColorMatrix * matrix)
+{
+  /* *INDENT-OFF* */
+  static const gchar format[] =
+      "[MATRIX]\n"
+      "|% .6f, % .6f, % .6f|\n"
+      "|% .6f, % .6f, % .6f|\n"
+      "|% .6f, % .6f, % .6f|\n"
+      "[OFFSET]\n"
+      "|% .6f, % .6f, % .6f|\n"
+      "[MIN]\n"
+      "|% .6f, % .6f, % .6f|\n"
+      "[MAX]\n"
+      "|% .6f, % .6f, % .6f|";
+  /* *INDENT-ON* */
+
+  return g_strdup_printf (format,
+      matrix->matrix[0][0], matrix->matrix[0][1], matrix->matrix[0][2],
+      matrix->matrix[1][0], matrix->matrix[1][1], matrix->matrix[1][2],
+      matrix->matrix[2][0], matrix->matrix[2][1], matrix->matrix[2][2],
+      matrix->offset[0], matrix->offset[1], matrix->offset[2],
+      matrix->min[0], matrix->min[1], matrix->min[2],
+      matrix->max[0], matrix->max[1], matrix->max[2]);
+}
+
+static void
+color_matrix_copy (GstCudaColorMatrix * dst, const GstCudaColorMatrix * src)
+{
+  for (guint i = 0; i < 3; i++) {
+    for (guint j = 0; j < 3; j++) {
+      dst->matrix[i][j] = src->matrix[i][j];
+    }
+  }
+}
+
+static void
+color_matrix_multiply (GstCudaColorMatrix * dst, GstCudaColorMatrix * a,
+    GstCudaColorMatrix * b)
+{
+  GstCudaColorMatrix tmp;
+
+  for (guint i = 0; i < 3; i++) {
+    for (guint j = 0; j < 3; j++) {
+      gdouble val = 0;
+      for (guint k = 0; k < 3; k++) {
+        val += a->matrix[i][k] * b->matrix[k][j];
+      }
+
+      tmp.matrix[i][j] = val;
+    }
+  }
+
+  color_matrix_copy (dst, &tmp);
+}
+
+static void
+color_matrix_identity (GstCudaColorMatrix * m)
+{
+  for (guint i = 0; i < 3; i++) {
+    for (guint j = 0; j < 3; j++) {
+      if (i == j)
+        m->matrix[i][j] = 1.0;
+      else
+        m->matrix[i][j] = 0;
+    }
+  }
+}
+
+/**
+ * gst_cuda_color_range_adjust_matrix_unorm:
+ * @in_info: a #GstVideoInfo
+ * @out_info: a #GstVideoInfo
+ * @matrix: a #GstCudaColorMatrix
+ *
+ * Calculates matrix for color range adjustment. Both input and output
+ * signals are in normalized [0.0..1.0] space.
+ *
+ * Resulting values can be calculated by
+ * | Yout |                           | Yin |   | matrix.offset[0] |
+ * | Uout | = clamp ( matrix.matrix * | Uin | + | matrix.offset[1] |, matrix.min, matrix.max )
+ * | Vout |                           | Vin |   | matrix.offset[2] |
+ *
+ * Returns: %TRUE if successful
+ */
+static gboolean
+gst_cuda_color_range_adjust_matrix_unorm (const GstVideoInfo * in_info,
+    const GstVideoInfo * out_info, GstCudaColorMatrix * matrix)
+{
+  gboolean in_rgb, out_rgb;
+  gint in_offset[GST_VIDEO_MAX_COMPONENTS];
+  gint in_scale[GST_VIDEO_MAX_COMPONENTS];
+  gint out_offset[GST_VIDEO_MAX_COMPONENTS];
+  gint out_scale[GST_VIDEO_MAX_COMPONENTS];
+  GstVideoColorRange in_range;
+  GstVideoColorRange out_range;
+  gdouble src_fullscale, dst_fullscale;
+
+  memset (matrix, 0, sizeof (GstCudaColorMatrix));
+  for (guint i = 0; i < 3; i++) {
+    matrix->matrix[i][i] = 1.0;
+    matrix->matrix[i][i] = 1.0;
+    matrix->matrix[i][i] = 1.0;
+    matrix->max[i] = 1.0;
+  }
+
+  in_rgb = GST_VIDEO_INFO_IS_RGB (in_info);
+  out_rgb = GST_VIDEO_INFO_IS_RGB (out_info);
+
+  if (in_rgb != out_rgb) {
+    GST_WARNING ("Invalid format conversion");
+    return FALSE;
+  }
+
+  in_range = in_info->colorimetry.range;
+  out_range = out_info->colorimetry.range;
+
+  if (in_range == GST_VIDEO_COLOR_RANGE_UNKNOWN) {
+    GST_WARNING ("Unknown input color range");
+    if (in_rgb || GST_VIDEO_INFO_IS_GRAY (in_info))
+      in_range = GST_VIDEO_COLOR_RANGE_0_255;
+    else
+      in_range = GST_VIDEO_COLOR_RANGE_16_235;
+  }
+
+  if (out_range == GST_VIDEO_COLOR_RANGE_UNKNOWN) {
+    GST_WARNING ("Unknown output color range");
+    if (out_rgb || GST_VIDEO_INFO_IS_GRAY (out_info))
+      out_range = GST_VIDEO_COLOR_RANGE_0_255;
+    else
+      out_range = GST_VIDEO_COLOR_RANGE_16_235;
+  }
+
+  src_fullscale = (gdouble) ((1 << in_info->finfo->depth[0]) - 1);
+  dst_fullscale = (gdouble) ((1 << out_info->finfo->depth[0]) - 1);
+
+  gst_video_color_range_offsets (in_range, in_info->finfo, in_offset, in_scale);
+  gst_video_color_range_offsets (out_range,
+      out_info->finfo, out_offset, out_scale);
+
+  matrix->min[0] = matrix->min[1] = matrix->min[2] =
+      (gdouble) out_offset[0] / dst_fullscale;
+
+  matrix->max[0] = (out_scale[0] + out_offset[0]) / dst_fullscale;
+  matrix->max[1] = matrix->max[2] =
+      (out_scale[1] + out_offset[0]) / dst_fullscale;
+
+  if (in_info->colorimetry.range == out_info->colorimetry.range) {
+    GST_DEBUG ("Same color range");
+    return TRUE;
+  }
+
+  /* Formula
+   *
+   * 1) Scales and offset compensates input to [0..1] range
+   * SRC_NORM[i] = (src[i] * src_fullscale - in_offset[i]) / in_scale[i]
+   *             = (src[i] * src_fullscale / in_scale[i]) - in_offset[i] / in_scale[i]
+   *
+   * 2) Reverse to output UNIT scale
+   * DST_UINT[i] = SRC_NORM[i] * out_scale[i] + out_offset[i]
+   *             = src[i] * src_fullscale * out_scale[i] / in_scale[i]
+   *               - in_offset[i] * out_scale[i] / in_scale[i]
+   *               + out_offset[i]
+   *
+   * 3) Back to [0..1] scale
+   * dst[i] = DST_UINT[i] / dst_fullscale
+   *        = COEFF[i] * src[i] + OFF[i]
+   * where
+   *             src_fullscale * out_scale[i]
+   * COEFF[i] = ------------------------------
+   *             dst_fullscale * in_scale[i]
+   *
+   *            out_offset[i]     in_offset[i] * out_scale[i]
+   * OFF[i] =  -------------- -  ------------------------------
+   *            dst_fullscale     dst_fullscale * in_scale[i]
+   */
+  for (guint i = 0; i < 3; i++) {
+    matrix->matrix[i][i] = (src_fullscale * out_scale[i]) /
+        (dst_fullscale * in_scale[i]);
+    matrix->offset[i] = (out_offset[i] / dst_fullscale) -
+        ((gdouble) in_offset[i] * out_scale[i] / (dst_fullscale * in_scale[i]));
+  }
+
+  return TRUE;
+}
+
+/**
+ * gst_cuda_yuv_to_rgb_matrix_unorm:
+ * @in_yuv_info: a #GstVideoInfo of input YUV signal
+ * @out_rgb_info: a #GstVideoInfo of output RGB signal
+ * @matrix: a #GstCudaColorMatrix
+ *
+ * Calculates transform matrix from YUV to RGB conversion. Both input and output
+ * signals are in normalized [0.0..1.0] space and additional gamma decoding
+ * or primary/transfer function transform is not performed by this matrix.
+ *
+ * Resulting non-linear RGB values can be calculated by
+ * | R' |                           | Y' |   | matrix.offset[0] |
+ * | G' | = clamp ( matrix.matrix * | Cb | + | matrix.offset[1] | matrix.min, matrix.max )
+ * | B' |                           | Cr |   | matrix.offset[2] |
+ *
+ * Returns: %TRUE if successful
+ */
+static gboolean
+gst_cuda_yuv_to_rgb_matrix_unorm (const GstVideoInfo * in_yuv_info,
+    const GstVideoInfo * out_rgb_info, GstCudaColorMatrix * matrix)
+{
+  gint offset[4], scale[4];
+  gdouble Kr, Kb, Kg;
+
+  /*
+   * <Formula>
+   *
+   * Input: Unsigned normalized Y'CbCr(unorm), [0.0..1.0] range
+   * Output: Unsigned normalized non-linear R'G'B'(unorm), [0.0..1.0] range
+   *
+   * 1) Y'CbCr(unorm) to scaled Y'CbCr
+   * | Y' |     | Y'(unorm) |
+   * | Cb | = S | Cb(unorm) |
+   * | Cb |     | Cr(unorm) |
+   * where S = (2 ^ bitdepth) - 1
+   *
+   * 2) Y'CbCr to YPbPr
+   * Y  = (Y' - offsetY )    / scaleY
+   * Pb = [(Cb - offsetCbCr) / scaleCbCr]
+   * Pr = [(Cr - offsetCrCr) / scaleCrCr]
+   * =>
+   * Y  = Y'(unorm) * Sy  + Oy
+   * Pb = Cb(unorm) * Suv + Ouv
+   * Pb = Cr(unorm) * Suv + Ouv
+   * where
+   * Sy  = S / scaleY
+   * Suv = S / scaleCbCr
+   * Oy  = -(offsetY / scaleY)
+   * Ouv = -(offsetCbCr / scaleCbCr)
+   *
+   * 3) YPbPr to R'G'B'
+   * | R' |      | Y  |
+   * | G' | = M *| Pb |
+   * | B' |      | Pr |
+   * where
+   *     | vecR |
+   * M = | vecG |
+   *     | vecB |
+   * vecR = | 1,         0           ,       2(1 - Kr)      |
+   * vecG = | 1, -(Kb/Kg) * 2(1 - Kb), -(Kr/Kg) * 2(1 - Kr) |
+   * vecB = | 1,       2(1 - Kb)     ,          0           |
+   * =>
+   * R' = dot(vecR, (Syuv * Y'CbCr(unorm))) + dot(vecR, Offset)
+   * G' = dot(vecG, (Svuy * Y'CbCr(unorm))) + dot(vecG, Offset)
+   * B' = dot(vecB, (Syuv * Y'CbCr(unorm)) + dot(vecB, Offset)
+   * where
+   *        | Sy,   0,   0 |
+   * Syuv = |  0, Suv,   0 |
+   *        |  0    0, Suv |
+   *
+   *          | Oy  |
+   * Offset = | Ouv |
+   *          | Ouv |
+   *
+   * 4) YUV -> RGB matrix
+   * | R' |            | Y'(unorm) |   | offsetA |
+   * | G' | = Matrix * | Cb(unorm) | + | offsetB |
+   * | B' |            | Cr(unorm) |   | offsetC |
+   *
+   * where
+   *          | vecR |
+   * Matrix = | vecG | * Syuv
+   *          | vecB |
+   *
+   * offsetA = dot(vecR, Offset)
+   * offsetB = dot(vecG, Offset)
+   * offsetC = dot(vecB, Offset)
+   *
+   * 4) Consider 16-235 scale RGB
+   * RGBfull(0..255) -> RGBfull(16..235) matrix is represented by
+   * | Rs |      | Rf |   | Or |
+   * | Gs | = Ms | Gf | + | Og |
+   * | Bs |      | Bf |   | Ob |
+   *
+   * Combining all matrix into
+   * | Rs |                   | Y'(unorm) |   | offsetA |     | Or |
+   * | Gs | = Ms * ( Matrix * | Cb(unorm) | + | offsetB | ) + | Og |
+   * | Bs |                   | Cr(unorm) |   | offsetC |     | Ob |
+   *
+   *                        | Y'(unorm) |      | offsetA |   | Or |
+   *        = Ms * Matrix * | Cb(unorm) | + Ms | offsetB | + | Og |
+   *                        | Cr(unorm) |      | offsetC |   | Ob |
+   */
+
+  memset (matrix, 0, sizeof (GstCudaColorMatrix));
+  for (guint i = 0; i < 3; i++)
+    matrix->max[i] = 1.0;
+
+  gst_video_color_range_offsets (in_yuv_info->colorimetry.range,
+      in_yuv_info->finfo, offset, scale);
+
+  if (gst_video_color_matrix_get_Kr_Kb (in_yuv_info->colorimetry.matrix,
+          &Kr, &Kb)) {
+    guint S;
+    gdouble Sy, Suv;
+    gdouble Oy, Ouv;
+    gdouble vecR[3], vecG[3], vecB[3];
+
+    Kg = 1.0 - Kr - Kb;
+
+    vecR[0] = 1.0;
+    vecR[1] = 0;
+    vecR[2] = 2 * (1 - Kr);
+
+    vecG[0] = 1.0;
+    vecG[1] = -(Kb / Kg) * 2 * (1 - Kb);
+    vecG[2] = -(Kr / Kg) * 2 * (1 - Kr);
+
+    vecB[0] = 1.0;
+    vecB[1] = 2 * (1 - Kb);
+    vecB[2] = 0;
+
+    /* Assume all components has the same bitdepth */
+    S = (1 << in_yuv_info->finfo->depth[0]) - 1;
+    Sy = (gdouble) S / scale[0];
+    Suv = (gdouble) S / scale[1];
+    Oy = -((gdouble) offset[0] / scale[0]);
+    Ouv = -((gdouble) offset[1] / scale[1]);
+
+    matrix->matrix[0][0] = Sy * vecR[0];
+    matrix->matrix[1][0] = Sy * vecG[0];
+    matrix->matrix[2][0] = Sy * vecB[0];
+
+    matrix->matrix[0][1] = Suv * vecR[1];
+    matrix->matrix[1][1] = Suv * vecG[1];
+    matrix->matrix[2][1] = Suv * vecB[1];
+
+    matrix->matrix[0][2] = Suv * vecR[2];
+    matrix->matrix[1][2] = Suv * vecG[2];
+    matrix->matrix[2][2] = Suv * vecB[2];
+
+    matrix->offset[0] = vecR[0] * Oy + vecR[1] * Ouv + vecR[2] * Ouv;
+    matrix->offset[1] = vecG[0] * Oy + vecG[1] * Ouv + vecG[2] * Ouv;
+    matrix->offset[2] = vecB[0] * Oy + vecB[1] * Ouv + vecB[2] * Ouv;
+
+    /* Apply RGB range scale matrix */
+    if (out_rgb_info->colorimetry.range == GST_VIDEO_COLOR_RANGE_16_235) {
+      GstCudaColorMatrix scale_matrix, rst;
+      GstVideoInfo full_rgb = *out_rgb_info;
+
+      full_rgb.colorimetry.range = GST_VIDEO_COLOR_RANGE_0_255;
+
+      if (gst_cuda_color_range_adjust_matrix_unorm (&full_rgb,
+              out_rgb_info, &scale_matrix)) {
+        /* Ms * Matrix */
+        color_matrix_multiply (&rst, &scale_matrix, matrix);
+
+        /* Ms * transform offsets */
+        for (guint i = 0; i < 3; i++) {
+          gdouble val = 0;
+          for (guint j = 0; j < 3; j++) {
+            val += scale_matrix.matrix[i][j] * matrix->offset[j];
+          }
+          rst.offset[i] = val + scale_matrix.offset[i];
+        }
+
+        /* copy back to output matrix */
+        for (guint i = 0; i < 3; i++) {
+          for (guint j = 0; j < 3; j++) {
+            matrix->matrix[i][j] = rst.matrix[i][j];
+          }
+          matrix->offset[i] = rst.offset[i];
+          matrix->min[i] = scale_matrix.min[i];
+          matrix->max[i] = scale_matrix.max[i];
+        }
+      }
+    }
+  } else {
+    /* Unknown matrix */
+    matrix->matrix[0][0] = 1.0;
+    matrix->matrix[1][1] = 1.0;
+    matrix->matrix[2][2] = 1.0;
+  }
+
+  return TRUE;
+}
+
+/**
+ * gst_cuda_rgb_to_yuv_matrix_unorm:
+ * @in_rgb_info: a #GstVideoInfo of input RGB signal
+ * @out_yuv_info: a #GstVideoInfo of output YUV signal
+ * @matrix: a #GstCudaColorMatrix
+ *
+ * Calculates transform matrix from RGB to YUV conversion. Both input and output
+ * signals are in normalized [0.0..1.0] space and additional gamma decoding
+ * or primary/transfer function transform is not performed by this matrix.
+ *
+ * Resulting RGB values can be calculated by
+ * | Y' |                           | R' |   | matrix.offset[0] |
+ * | Cb | = clamp ( matrix.matrix * | G' | + | matrix.offset[1] |, matrix.min, matrix.max )
+ * | Cr |                           | B' |   | matrix.offset[2] |
+ *
+ * Returns: %TRUE if successful
+ */
+static gboolean
+gst_cuda_rgb_to_yuv_matrix_unorm (const GstVideoInfo * in_rgb_info,
+    const GstVideoInfo * out_yuv_info, GstCudaColorMatrix * matrix)
+{
+  gint offset[4], scale[4];
+  gdouble Kr, Kb, Kg;
+
+  /*
+   * <Formula>
+   *
+   * Input: Unsigned normalized non-linear R'G'B'(unorm), [0.0..1.0] range
+   * Output: Unsigned normalized Y'CbCr(unorm), [0.0..1.0] range
+   *
+   * 1) R'G'B' to YPbPr
+   * | Y  |      | R' |
+   * | Pb | = M *| G' |
+   * | Pr |      | B' |
+   * where
+   *     | vecY |
+   * M = | vecU |
+   *     | vecV |
+   * vecY = |       Kr      ,       Kg      ,      Kb       |
+   * vecU = | -0.5*Kr/(1-Kb), -0.5*Kg/(1-Kb),     0.5       |
+   * vecV = |      0.5      , -0.5*Kg/(1-Kr), -0.5*Kb(1-Kr) |
+   *
+   * 2) YPbPr to Y'CbCr(unorm)
+   * Y'(unorm) = (Y  * scaleY + offsetY)       / S
+   * Cb(unorm) = (Pb * scaleCbCr + offsetCbCr) / S
+   * Cr(unorm) = (Pr * scaleCbCr + offsetCbCr) / S
+   * =>
+   * Y'(unorm) = (Y  * scaleY    / S) + (offsetY    / S)
+   * Cb(unorm) = (Pb * scaleCbCr / S) + (offsetCbCr / S)
+   * Cr(unorm) = (Pb * scaleCbCr / S) + (offsetCbCr / S)
+   * where S = (2 ^ bitdepth) - 1
+   *
+   * 3) RGB -> YUV matrix
+   * | Y'(unorm) |            | R' |   | offsetA |
+   * | Cb(unorm) | = Matrix * | G' | + | offsetB |
+   * | Cr(unorm) |            | B' |   | offsetC |
+   *
+   * where
+   *          | (scaleY/S)    * vecY |
+   * Matrix = | (scaleCbCr/S) * vecU |
+   *          | (scaleCbCr/S) * vecV |
+   *
+   * offsetA = offsetY    / S
+   * offsetB = offsetCbCr / S
+   * offsetC = offsetCbCr / S
+   *
+   * 4) Consider 16-235 scale RGB
+   * RGBstudio(16..235) -> RGBfull(0..255) matrix is represented by
+   * | Rf |      | Rs |   | Or |
+   * | Gf | = Ms | Gs | + | Og |
+   * | Bf |      | Bs |   | Ob |
+   *
+   * Combining all matrix into
+   * | Y'(unorm) |                 | Rs |   | Or |     | offsetA |
+   * | Cb(unorm) | = Matrix * ( Ms | Gs | + | Og | ) + | offsetB |
+   * | Cr(unorm) |                 | Bs |   | Ob |     | offsetC |
+   *
+   *                             | Rs |          | Or |   | offsetA |
+   *               = Matrix * Ms | Gs | + Matrix | Og | + | offsetB |
+   *                             | Bs |          | Ob |   | offsetB |
+   */
+
+  memset (matrix, 0, sizeof (GstCudaColorMatrix));
+  for (guint i = 0; i < 3; i++)
+    matrix->max[i] = 1.0;
+
+  gst_video_color_range_offsets (out_yuv_info->colorimetry.range,
+      out_yuv_info->finfo, offset, scale);
+
+  if (gst_video_color_matrix_get_Kr_Kb (out_yuv_info->colorimetry.matrix,
+          &Kr, &Kb)) {
+    guint S;
+    gdouble Sy, Suv;
+    gdouble Oy, Ouv;
+    gdouble vecY[3], vecU[3], vecV[3];
+
+    Kg = 1.0 - Kr - Kb;
+
+    vecY[0] = Kr;
+    vecY[1] = Kg;
+    vecY[2] = Kb;
+
+    vecU[0] = -0.5 * Kr / (1 - Kb);
+    vecU[1] = -0.5 * Kg / (1 - Kb);
+    vecU[2] = 0.5;
+
+    vecV[0] = 0.5;
+    vecV[1] = -0.5 * Kg / (1 - Kr);
+    vecV[2] = -0.5 * Kb / (1 - Kr);
+
+    /* Assume all components has the same bitdepth */
+    S = (1 << out_yuv_info->finfo->depth[0]) - 1;
+    Sy = (gdouble) scale[0] / S;
+    Suv = (gdouble) scale[1] / S;
+    Oy = (gdouble) offset[0] / S;
+    Ouv = (gdouble) offset[1] / S;
+
+    for (guint i = 0; i < 3; i++) {
+      matrix->matrix[0][i] = Sy * vecY[i];
+      matrix->matrix[1][i] = Suv * vecU[i];
+      matrix->matrix[2][i] = Suv * vecV[i];
+    }
+
+    matrix->offset[0] = Oy;
+    matrix->offset[1] = Ouv;
+    matrix->offset[2] = Ouv;
+
+    matrix->min[0] = Oy;
+    matrix->min[1] = Oy;
+    matrix->min[2] = Oy;
+
+    matrix->max[0] = ((gdouble) scale[0] + offset[0]) / S;
+    matrix->max[1] = ((gdouble) scale[1] + offset[0]) / S;
+    matrix->max[2] = ((gdouble) scale[1] + offset[0]) / S;
+
+    /* Apply RGB range scale matrix */
+    if (in_rgb_info->colorimetry.range == GST_VIDEO_COLOR_RANGE_16_235) {
+      GstCudaColorMatrix scale_matrix, rst;
+      GstVideoInfo full_rgb = *in_rgb_info;
+
+      full_rgb.colorimetry.range = GST_VIDEO_COLOR_RANGE_0_255;
+
+      if (gst_cuda_color_range_adjust_matrix_unorm (in_rgb_info,
+              &full_rgb, &scale_matrix)) {
+        /* Matrix * Ms */
+        color_matrix_multiply (&rst, matrix, &scale_matrix);
+
+        /* Matrix * scale offsets */
+        for (guint i = 0; i < 3; i++) {
+          gdouble val = 0;
+          for (guint j = 0; j < 3; j++) {
+            val += matrix->matrix[i][j] * scale_matrix.offset[j];
+          }
+          rst.offset[i] = val + matrix->offset[i];
+        }
+
+        /* copy back to output matrix */
+        for (guint i = 0; i < 3; i++) {
+          for (guint j = 0; j < 3; j++) {
+            matrix->matrix[i][j] = rst.matrix[i][j];
+          }
+          matrix->offset[i] = rst.offset[i];
+        }
+      }
+    }
+  } else {
+    /* Unknown matrix */
+    matrix->matrix[0][0] = 1.0;
+    matrix->matrix[1][1] = 1.0;
+    matrix->matrix[2][2] = 1.0;
+  }
+
+  return TRUE;
+}
+
+typedef struct
+{
+  float coeffX[3];
+  float coeffY[3];
+  float coeffZ[3];
+  float offset[3];
+  float min[3];
+  float max[3];
+} ColorMatrix;
+
+typedef struct
+{
+  ColorMatrix toRGBCoeff;
+  ColorMatrix toYuvCoeff;
+  ColorMatrix primariesCoeff;
+} ConstBuffer;
+
+#define COLOR_SPACE_IDENTITY "color_space_identity"
+#define COLOR_SPACE_CONVERT "color_space_convert"
+
+#define SAMPLE_YUV_PLANAR "sample_yuv_planar"
+#define SAMPLE_YV12 "sample_yv12"
+#define SAMPLE_YUV_PLANAR_10BIS "sample_yuv_planar_10bits"
+#define SAMPLE_YUV_PLANAR_12BIS "sample_yuv_planar_12bits"
+#define SAMPLE_SEMI_PLANAR "sample_semi_planar"
+#define SAMPLE_SEMI_PLANAR_SWAP "sample_semi_planar_swap"
+#define SAMPLE_RGBA "sample_rgba"
+#define SAMPLE_BGRA "sample_bgra"
+#define SAMPLE_RGBx "sample_rgbx"
+#define SAMPLE_BGRx "sample_bgrx"
+#define SAMPLE_ARGB "sample_argb"
+/* same as ARGB */
+#define SAMPLE_ARGB64 "sample_argb"
+#define SAMPLE_AGBR "sample_abgr"
+#define SAMPLE_RGBP "sample_rgbp"
+#define SAMPLE_BGRP "sample_bgrp"
+#define SAMPLE_GBR "sample_gbr"
+#define SAMPLE_GBRA "sample_gbra"
+
+#define WRITE_I420 "write_i420"
+#define WRITE_YV12 "write_yv12"
+#define WRITE_NV12 "write_nv12"
+#define WRITE_NV21 "write_nv21"
+#define WRITE_P010 "write_p010"
+/* same as P010 */
+#define WRITE_P016 "write_p010"
+#define WRITE_I420_10 "write_i420_10"
+#define WRITE_Y444 "write_y444"
+#define WRITE_Y444_16 "write_y444_16"
+#define WRITE_RGBA "write_rgba"
+#define WRITE_RGBx "write_rgbx"
+#define WRITE_BGRA "write_bgra"
+#define WRITE_BGRx "write_bgrx"
+#define WRITE_ARGB "write_argb"
+#define WRITE_ABGR "write_abgr"
+#define WRITE_RGB "write_rgb"
+#define WRITE_BGR "write_bgr"
+#define WRITE_RGB10A2 "write_rgb10a2"
+#define WRITE_BGR10A2 "write_bgr10a2"
+#define WRITE_Y42B "write_y42b"
+#define WRITE_I422_10 "write_i422_10"
+#define WRITE_I422_12 "write_i422_12"
+#define WRITE_RGBP "write_rgbp"
+#define WRITE_BGRP "write_bgrp"
+#define WRITE_GBR "write_gbr"
+#define WRITE_GBRA "write_gbra"
+
+/* *INDENT-OFF* */
+const static gchar KERNEL_COMMON[] =
+"struct ColorMatrix\n"
+"{\n"
+"  float CoeffX[3];\n"
+"  float CoeffY[3];\n"
+"  float CoeffZ[3];\n"
+"  float Offset[3];\n"
+"  float Min[3];\n"
+"  float Max[3];\n"
+"};\n"
+"\n"
+"__device__ inline float\n"
+"dot (const float coeff[3], float3 val)\n"
+"{\n"
+"  return coeff[0] * val.x + coeff[1] * val.y + coeff[2] * val.z;\n"
+"}\n"
+"\n"
+"__device__ inline float\n"
+"clamp (float val, float min_val, float max_val)\n"
+"{\n"
+"  return max (min_val, min (val, max_val));\n"
+"}\n"
+"\n"
+"__device__ inline float3\n"
+"clamp3 (float3 val, const float min_val[3], const float max_val[3])\n"
+"{\n"
+"  return make_float3 (clamp (val.x, min_val[0], max_val[0]),\n"
+"      clamp (val.y, min_val[1], max_val[2]),\n"
+"      clamp (val.z, min_val[1], max_val[2]));\n"
+"}\n"
+"\n"
+"__device__ inline unsigned char\n"
+"scale_to_2bits (float val)\n"
+"{\n"
+"  return (unsigned short) __float2int_rz (val * 3.0);\n"
+"}\n"
+"\n"
+"__device__ inline unsigned char\n"
+"scale_to_uchar (float val)\n"
+"{\n"
+"  return (unsigned char) __float2int_rz (val * 255.0);\n"
+"}\n"
+"\n"
+"__device__ inline unsigned short\n"
+"scale_to_ushort (float val)\n"
+"{\n"
+"  return (unsigned short) __float2int_rz (val * 65535.0);\n"
+"}\n"
+"\n"
+"__device__ inline unsigned short\n"
+"scale_to_10bits (float val)\n"
+"{\n"
+"  return (unsigned short) __float2int_rz (val * 1023.0);\n"
+"}\n"
+"\n"
+"__device__ inline unsigned short\n"
+"scale_to_12bits (float val)\n"
+"{\n"
+"  return (unsigned short) __float2int_rz (val * 4095.0);\n"
+"}\n"
+"\n"
+"__device__ inline float3\n"
+COLOR_SPACE_IDENTITY "(float3 sample, const ColorMatrix * matrix)\n"
+"{\n"
+"  return sample;\n"
+"}\n"
+"\n"
+"__device__ inline float3\n"
+COLOR_SPACE_CONVERT "(float3 sample, const ColorMatrix * matrix)\n"
+"{\n"
+"  float3 out;\n"
+"  out.x = dot (matrix->CoeffX, sample);\n"
+"  out.y = dot (matrix->CoeffY, sample);\n"
+"  out.z = dot (matrix->CoeffZ, sample);\n"
+"  out.x += matrix->Offset[0];\n"
+"  out.y += matrix->Offset[1];\n"
+"  out.z += matrix->Offset[2];\n"
+"  return clamp3 (out, matrix->Min, matrix->Max);\n"
+"}\n"
+"/* All 8bits yuv planar except for yv12 */\n"
+"__device__ inline float4\n"
+SAMPLE_YUV_PLANAR "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
+"    cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
+"{\n"
+"  float luma = tex2D<float>(tex0, x, y);\n"
+"  float u = tex2D<float>(tex1, x, y);\n"
+"  float v = tex2D<float>(tex2, x, y);\n"
+"  return make_float4 (luma, u, v, 1);\n"
+"}\n"
+"\n"
+"__device__ inline float4\n"
+SAMPLE_YV12 "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
+"    cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
+"{\n"
+"  float luma = tex2D<float>(tex0, x, y);\n"
+"  float u = tex2D<float>(tex2, x, y);\n"
+"  float v = tex2D<float>(tex1, x, y);\n"
+"  return make_float4 (luma, u, v, 1);\n"
+"}\n"
+"\n"
+"__device__ inline float4\n"
+SAMPLE_YUV_PLANAR_10BIS "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
+"    cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
+"{\n"
+"  float luma = tex2D<float>(tex0, x, y);\n"
+"  float u = tex2D<float>(tex1, x, y);\n"
+"  float v = tex2D<float>(tex2, x, y);\n"
+"  /* (1 << 6) to scale [0, 1.0) range */\n"
+"  return make_float4 (luma * 64, u * 64, v * 64, 1);\n"
+"}\n"
+"\n"
+"__device__ inline float4\n"
+SAMPLE_YUV_PLANAR_12BIS "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
+"    cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
+"{\n"
+"  float luma = tex2D<float>(tex0, x, y);\n"
+"  float u = tex2D<float>(tex1, x, y);\n"
+"  float v = tex2D<float>(tex2, x, y);\n"
+"  /* (1 << 6) to scale [0, 1.0) range */\n"
+"  return make_float4 (luma * 16, u * 16, v * 16, 1);\n"
+"}\n"
+"\n"
+"/* NV12, P010, and P016 */\n"
+"__device__ inline float4\n"
+SAMPLE_SEMI_PLANAR "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
+"    cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
+"{\n"
+"  float luma = tex2D<float>(tex0, x, y);\n"
+"  float2 uv = tex2D<float2>(tex1, x, y);\n"
+"  return make_float4 (luma, uv.x, uv.y, 1);\n"
+"}\n"
+"\n"
+"__device__ inline float4\n"
+SAMPLE_SEMI_PLANAR_SWAP "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
+"    cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
+"{\n"
+"  float luma = tex2D<float>(tex0, x, y);\n"
+"  float2 vu = tex2D<float2>(tex1, x, y);\n"
+"  return make_float4 (luma, vu.y, vu.x, 1);\n"
+"}\n"
+"\n"
+"__device__ inline float4\n"
+SAMPLE_RGBA "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
+"    cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
+"{\n"
+"  return tex2D<float4>(tex0, x, y);\n"
+"}\n"
+"\n"
+"__device__ inline float4\n"
+SAMPLE_BGRA "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
+"    cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
+"{\n"
+"  float4 bgra = tex2D<float4>(tex0, x, y);\n"
+"  return make_float4 (bgra.z, bgra.y, bgra.x, bgra.w);\n"
+"}\n"
+"\n"
+"__device__ inline float4\n"
+SAMPLE_RGBx "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
+"    cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
+"{\n"
+"  float4 rgbx = tex2D<float4>(tex0, x, y);\n"
+"  rgbx.w = 1;\n"
+"  return rgbx;\n"
+"}\n"
+"\n"
+"__device__ inline float4\n"
+SAMPLE_BGRx "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
+"    cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
+"{\n"
+"  float4 bgrx = tex2D<float4>(tex0, x, y);\n"
+"  return make_float4 (bgrx.z, bgrx.y, bgrx.x, 1);\n"
+"}\n"
+"\n"
+"__device__ inline float4\n"
+SAMPLE_ARGB "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
+"    cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
+"{\n"
+"  float4 argb = tex2D<float4>(tex0, x, y);\n"
+"  return make_float4 (argb.y, argb.z, argb.w, argb.x);\n"
+"}\n"
+"\n"
+"__device__ inline float4\n"
+SAMPLE_AGBR "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
+"    cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
+"{\n"
+"  float4 abgr = tex2D<float4>(tex0, x, y);\n"
+"  return make_float4 (abgr.w, abgr.z, abgr.y, abgr.x);\n"
+"}\n"
+"\n"
+"__device__ inline float4\n"
+SAMPLE_RGBP "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
+"    cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
+"{\n"
+"  float r = tex2D<float>(tex0, x, y);\n"
+"  float g = tex2D<float>(tex1, x, y);\n"
+"  float b = tex2D<float>(tex2, x, y);\n"
+"  return make_float4 (r, g, b, 1);\n"
+"}\n"
+"\n"
+"__device__ inline float4\n"
+SAMPLE_BGRP "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
+"    cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
+"{\n"
+"  float b = tex2D<float>(tex0, x, y);\n"
+"  float g = tex2D<float>(tex1, x, y);\n"
+"  float r = tex2D<float>(tex2, x, y);\n"
+"  return make_float4 (r, g, b, 1);\n"
+"}\n"
+"\n"
+"__device__ inline float4\n"
+SAMPLE_GBR "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
+"    cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
+"{\n"
+"  float g = tex2D<float>(tex0, x, y);\n"
+"  float b = tex2D<float>(tex1, x, y);\n"
+"  float r = tex2D<float>(tex2, x, y);\n"
+"  return make_float4 (r, g, b, 1);\n"
+"}\n"
+"\n"
+"__device__ inline float4\n"
+SAMPLE_GBRA "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
+"    cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
+"{\n"
+"  float g = tex2D<float>(tex0, x, y);\n"
+"  float b = tex2D<float>(tex1, x, y);\n"
+"  float r = tex2D<float>(tex2, x, y);\n"
+"  float a = tex2D<float>(tex3, x, y);\n"
+"  return make_float4 (r, g, b, a);\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_I420 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  dst0[x + y * stride0] = scale_to_uchar (sample.x);\n"
+"  if (x % 2 == 0 && y % 2 == 0) {\n"
+"    unsigned int pos = x / 2 + (y / 2) * stride1;\n"
+"    dst1[pos] = scale_to_uchar (sample.y);\n"
+"    dst2[pos] = scale_to_uchar (sample.z);\n"
+"  }\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_YV12 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  dst0[x + y * stride0] = scale_to_uchar (sample.x);\n"
+"  if (x % 2 == 0 && y % 2 == 0) {\n"
+"    unsigned int pos = x / 2 + (y / 2) * stride1;\n"
+"    dst1[pos] = scale_to_uchar (sample.z);\n"
+"    dst2[pos] = scale_to_uchar (sample.y);\n"
+"  }\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_NV12 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  dst0[x + y * stride0] = scale_to_uchar (sample.x);\n"
+"  if (x % 2 == 0 && y % 2 == 0) {\n"
+"    unsigned int pos = x + (y / 2) * stride1;\n"
+"    dst1[pos] = scale_to_uchar (sample.y);\n"
+"    dst1[pos + 1] = scale_to_uchar (sample.z);\n"
+"  }\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_NV21 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  dst0[x + y * stride0] = scale_to_uchar (sample.x);\n"
+"  if (x % 2 == 0 && y % 2 == 0) {\n"
+"    unsigned int pos = x + (y / 2) * stride1;\n"
+"    dst1[pos] = scale_to_uchar (sample.z);\n"
+"    dst1[pos + 1] = scale_to_uchar (sample.y);\n"
+"  }\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_P010 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_ushort (sample.x);\n"
+"  if (x % 2 == 0 && y % 2 == 0) {\n"
+"    unsigned int pos = x * 2 + (y / 2) * stride1;\n"
+"    *(unsigned short *) &dst1[pos] = scale_to_ushort (sample.y);\n"
+"    *(unsigned short *) &dst1[pos + 2] = scale_to_ushort (sample.z);\n"
+"  }\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_I420_10 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_10bits (sample.x);\n"
+"  if (x % 2 == 0 && y % 2 == 0) {\n"
+"    unsigned int pos = x + (y / 2) * stride1;\n"
+"    *(unsigned short *) &dst1[pos] = scale_to_10bits (sample.y);\n"
+"    *(unsigned short *) &dst2[pos] = scale_to_10bits (sample.z);\n"
+"  }\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_Y444 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  int pos = x + y * stride0;\n"
+"  dst0[pos] = scale_to_uchar (sample.x);\n"
+"  dst1[pos] = scale_to_uchar (sample.y);\n"
+"  dst2[pos] = scale_to_uchar (sample.z);\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_Y444_16 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  int pos = x * 2 + y * stride0;\n"
+"  *(unsigned short *) &dst0[pos] = scale_to_ushort (sample.x);\n"
+"  *(unsigned short *) &dst1[pos] = scale_to_ushort (sample.y);\n"
+"  *(unsigned short *) &dst2[pos] = scale_to_ushort (sample.z);\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_RGBA "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  int pos = x * 4 + y * stride0;\n"
+"  dst0[pos] = scale_to_uchar (sample.x);\n"
+"  dst0[pos + 1] = scale_to_uchar (sample.y);\n"
+"  dst0[pos + 2] = scale_to_uchar (sample.z);\n"
+"  dst0[pos + 3] = scale_to_uchar (sample.w);\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_RGBx "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  int pos = x * 4 + y * stride0;\n"
+"  dst0[pos] = scale_to_uchar (sample.x);\n"
+"  dst0[pos + 1] = scale_to_uchar (sample.y);\n"
+"  dst0[pos + 2] = scale_to_uchar (sample.z);\n"
+"  dst0[pos + 3] = 255;\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_BGRA "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  int pos = x * 4 + y * stride0;\n"
+"  dst0[pos] = scale_to_uchar (sample.z);\n"
+"  dst0[pos + 1] = scale_to_uchar (sample.y);\n"
+"  dst0[pos + 2] = scale_to_uchar (sample.x);\n"
+"  dst0[pos + 3] = scale_to_uchar (sample.w);\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_BGRx "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  int pos = x * 4 + y * stride0;\n"
+"  dst0[pos] = scale_to_uchar (sample.z);\n"
+"  dst0[pos + 1] = scale_to_uchar (sample.y);\n"
+"  dst0[pos + 2] = scale_to_uchar (sample.x);\n"
+"  dst0[pos + 3] = 255;\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_ARGB "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  int pos = x * 4 + y * stride0;\n"
+"  dst0[pos] = scale_to_uchar (sample.w);\n"
+"  dst0[pos + 1] = scale_to_uchar (sample.x);\n"
+"  dst0[pos + 2] = scale_to_uchar (sample.y);\n"
+"  dst0[pos + 3] = scale_to_uchar (sample.z);\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_ABGR "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  int pos = x * 4 + y * stride0;\n"
+"  dst0[pos] = scale_to_uchar (sample.w);\n"
+"  dst0[pos + 1] = scale_to_uchar (sample.z);\n"
+"  dst0[pos + 2] = scale_to_uchar (sample.y);\n"
+"  dst0[pos + 3] = scale_to_uchar (sample.x);\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_RGB "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  int pos = x * 3 + y * stride0;\n"
+"  dst0[pos] = scale_to_uchar (sample.x);\n"
+"  dst0[pos + 1] = scale_to_uchar (sample.y);\n"
+"  dst0[pos + 2] = scale_to_uchar (sample.z);\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_BGR "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  int pos = x * 3 + y * stride0;\n"
+"  dst0[pos] = scale_to_uchar (sample.z);\n"
+"  dst0[pos + 1] = scale_to_uchar (sample.y);\n"
+"  dst0[pos + 2] = scale_to_uchar (sample.x);\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_RGB10A2 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  unsigned int alpha = (unsigned int) scale_to_2bits (sample.x);\n"
+"  unsigned int packed_rgb = alpha << 30;\n"
+"  packed_rgb |= ((unsigned int) scale_to_10bits (sample.x));\n"
+"  packed_rgb |= ((unsigned int) scale_to_10bits (sample.y)) << 10;\n"
+"  packed_rgb |= ((unsigned int) scale_to_10bits (sample.z)) << 20;\n"
+"  *(unsigned int *) &dst0[x * 4 + y * stride0] = packed_rgb;\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_BGR10A2 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  unsigned int alpha = (unsigned int) scale_to_2bits (sample.x);\n"
+"  unsigned int packed_rgb = alpha << 30;\n"
+"  packed_rgb |= ((unsigned int) scale_to_10bits (sample.x)) << 20;\n"
+"  packed_rgb |= ((unsigned int) scale_to_10bits (sample.y)) << 10;\n"
+"  packed_rgb |= ((unsigned int) scale_to_10bits (sample.z));\n"
+"  *(unsigned int *) &dst0[x * 4 + y * stride0] = packed_rgb;\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_Y42B "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  dst0[x + y * stride0] = scale_to_uchar (sample.x);\n"
+"  if (x % 2 == 0) {\n"
+"    unsigned int pos = x / 2 + y * stride1;\n"
+"    dst1[pos] = scale_to_uchar (sample.y);\n"
+"    dst2[pos] = scale_to_uchar (sample.z);\n"
+"  }\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_I422_10 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_10bits (sample.x);\n"
+"  if (x % 2 == 0) {\n"
+"    unsigned int pos = x + y * stride1;\n"
+"    *(unsigned short *) &dst1[pos] = scale_to_10bits (sample.y);\n"
+"    *(unsigned short *) &dst2[pos] = scale_to_10bits (sample.z);\n"
+"  }\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_I422_12 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_12bits (sample.x);\n"
+"  if (x % 2 == 0) {\n"
+"    unsigned int pos = x + y * stride1;\n"
+"    *(unsigned short *) &dst1[pos] = scale_to_12bits (sample.y);\n"
+"    *(unsigned short *) &dst2[pos] = scale_to_12bits (sample.z);\n"
+"  }\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_RGBP "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  int pos = x + y * stride0;\n"
+"  dst0[pos] = scale_to_uchar (sample.x);\n"
+"  dst1[pos] = scale_to_uchar (sample.y);\n"
+"  dst2[pos] = scale_to_uchar (sample.z);\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_BGRP "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  int pos = x + y * stride0;\n"
+"  dst0[pos] = scale_to_uchar (sample.z);\n"
+"  dst1[pos] = scale_to_uchar (sample.y);\n"
+"  dst2[pos] = scale_to_uchar (sample.x);\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_GBR "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  int pos = x + y * stride0;\n"
+"  dst0[pos] = scale_to_uchar (sample.y);\n"
+"  dst1[pos] = scale_to_uchar (sample.z);\n"
+"  dst2[pos] = scale_to_uchar (sample.x);\n"
+"}\n"
+"\n"
+"__device__ inline void\n"
+WRITE_GBRA "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
+"    unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n"
+"{\n"
+"  int pos = x + y * stride0;\n"
+"  dst0[pos] = scale_to_uchar (sample.y);\n"
+"  dst1[pos] = scale_to_uchar (sample.z);\n"
+"  dst2[pos] = scale_to_uchar (sample.x);\n"
+"  dst3[pos] = scale_to_uchar (sample.w);\n"
+"}\n";
+
+#define GST_CUDA_KERNEL_UNPACK_FUNC "gst_cuda_kernel_unpack_func"
+static const gchar RGB_TO_RGBx[] =
+"extern \"C\" {\n"
+"__global__ void\n"
+GST_CUDA_KERNEL_UNPACK_FUNC
+"(unsigned char *src, unsigned char *dst, int width, int height,\n"
+"    int src_stride, int dst_stride)\n"
+"{\n"
+"  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
+"  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
+"  if (x_pos < width && y_pos < height) {\n"
+"    int dst_pos = x_pos * 4 + y_pos * dst_stride;\n"
+"    int src_pos = x_pos * 3 + y_pos * src_stride;\n"
+"    dst[dst_pos] = src[src_pos];\n"
+"    dst[dst_pos + 1] = src[src_pos + 1];\n"
+"    dst[dst_pos + 2] = src[src_pos + 2];\n"
+"    dst[dst_pos + 3] = 0xff;\n"
+"  }\n"
+"}\n"
+"}\n";
+
+static const gchar RGB10A2_TO_ARGB64[] =
+"extern \"C\" {\n"
+"__global__ void\n"
+GST_CUDA_KERNEL_UNPACK_FUNC
+"(unsigned char *src, unsigned char *dst, int width, int height,\n"
+"    int src_stride, int dst_stride)\n"
+"{\n"
+"  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
+"  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
+"  if (x_pos < width && y_pos < height) {\n"
+"    unsigned short a, r, g, b;\n"
+"    unsigned int val;\n"
+"    int dst_pos = x_pos * 8 + y_pos * dst_stride;\n"
+"    val = *(unsigned int *)&src[x_pos * 4 + y_pos * src_stride];\n"
+"    a = (val >> 30) & 0x03;\n"
+"    a = (a << 14) | (a << 12) | (a << 10) | (a << 8) | (a << 6) | (a << 4) | (a << 2) | (a << 0);\n"
+"    r = (val & 0x3ff);\n"
+"    r = (r << 6) | (r >> 4);\n"
+"    g = ((val >> 10) & 0x3ff);\n"
+"    g = (g << 6) | (g >> 4);\n"
+"    b = ((val >> 20) & 0x3ff);\n"
+"    b = (b << 6) | (b >> 4);\n"
+"    *(unsigned short *) &dst[dst_pos] = a;\n"
+"    *(unsigned short *) &dst[dst_pos + 2] = r;\n"
+"    *(unsigned short *) &dst[dst_pos + 4] = g;\n"
+"    *(unsigned short *) &dst[dst_pos + 6] = b;\n"
+"  }\n"
+"}\n"
+"}\n";
+
+static const gchar BGR10A2_TO_ARGB64[] =
+"extern \"C\" {\n"
+"__global__ void\n"
+GST_CUDA_KERNEL_UNPACK_FUNC
+"(unsigned char *src, unsigned char *dst, int width, int height,\n"
+"    int src_stride, int dst_stride)\n"
+"{\n"
+"  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
+"  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
+"  if (x_pos < width && y_pos < height) {\n"
+"    unsigned short a, r, g, b;\n"
+"    unsigned int val;\n"
+"    int dst_pos = x_pos * 8 + y_pos * dst_stride;\n"
+"    val = *(unsigned int *)&src[x_pos * 4 + y_pos * src_stride];\n"
+"    a = (val >> 30) & 0x03;\n"
+"    a = (a << 14) | (a << 12) | (a << 10) | (a << 8) | (a << 6) | (a << 4) | (a << 2) | (a << 0);\n"
+"    b = (val & 0x3ff);\n"
+"    b = (b << 6) | (b >> 4);\n"
+"    g = ((val >> 10) & 0x3ff);\n"
+"    g = (g << 6) | (g >> 4);\n"
+"    r = ((val >> 20) & 0x3ff);\n"
+"    r = (r << 6) | (r >> 4);\n"
+"    *(unsigned short *) &dst[dst_pos] = a;\n"
+"    *(unsigned short *) &dst[dst_pos + 2] = r;\n"
+"    *(unsigned short *) &dst[dst_pos + 4] = g;\n"
+"    *(unsigned short *) &dst[dst_pos + 6] = b;\n"
+"  }\n"
+"}\n"
+"}\n";
+
+#define GST_CUDA_KERNEL_MAIN_FUNC "KernelMain"
+
+static const gchar TEMPLETA_KERNEL[] =
+/* KERNEL_COMMON */
+"%s\n"
+/* UNPACK FUNCTION */
+"%s\n"
+"__constant__ ColorMatrix TO_RGB_MATRIX = { { %s, %s, %s },\n"
+"                                           { %s, %s, %s },\n"
+"                                           { %s, %s, %s },\n"
+"                                           { %s, %s, %s },\n"
+"                                           { %s, %s, %s },\n"
+"                                           { %s, %s, %s } };\n"
+"__constant__ ColorMatrix TO_YUV_MATRIX = { { %s, %s, %s },\n"
+"                                           { %s, %s, %s },\n"
+"                                           { %s, %s, %s },\n"
+"                                           { %s, %s, %s },\n"
+"                                           { %s, %s, %s },\n"
+"                                           { %s, %s, %s } };\n"
+"__constant__ int WIDTH = %d;\n"
+"__constant__ int HEIGHT = %d;\n"
+"__constant__ int LEFT = %d;\n"
+"__constant__ int TOP = %d;\n"
+"__constant__ int RIGHT = %d;\n"
+"__constant__ int BOTTOM = %d;\n"
+"__constant__ int VIEW_WIDTH = %d;\n"
+"__constant__ int VIEW_HEIGHT = %d;\n"
+"__constant__ float OFFSET_X = %s;\n"
+"__constant__ float OFFSET_Y = %s;\n"
+"__constant__ float BORDER_X = %s;\n"
+"__constant__ float BORDER_Y = %s;\n"
+"__constant__ float BORDER_Z = %s;\n"
+"__constant__ float BORDER_W = %s;\n"
+"\n"
+"extern \"C\" {\n"
+"__global__ void\n"
+GST_CUDA_KERNEL_MAIN_FUNC "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
+"    cudaTextureObject_t tex2, cudaTextureObject_t tex3, unsigned char * dst0,\n"
+"    unsigned char * dst1, unsigned char * dst2, unsigned char * dst3,\n"
+"    int stride0, int stride1)\n"
+"{\n"
+"  int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
+"  int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
+"  float4 sample;\n"
+"  if (x_pos >= WIDTH || y_pos >= HEIGHT)\n"
+"    return;\n"
+"  if (x_pos < LEFT || x_pos >= RIGHT || y_pos < TOP || y_pos >= BOTTOM) {\n"
+"    sample = make_float4 (BORDER_X, BORDER_Y, BORDER_Z, BORDER_W);\n"
+"  } else {\n"
+"    float x = OFFSET_X + (float) (x_pos - LEFT) / VIEW_WIDTH;\n"
+"    float y = OFFSET_Y + (float) (y_pos - TOP) / VIEW_HEIGHT;\n"
+"    float4 s = %s (tex0, tex1, tex2, tex3, x, y);\n"
+"    float3 xyz = make_float3 (s.x, s.y, s.z);\n"
+"    float3 rgb = %s (xyz, &TO_RGB_MATRIX);\n"
+"    float3 yuv = %s (rgb, &TO_YUV_MATRIX);\n"
+"    sample = make_float4 (yuv.x, yuv.y, yuv.z, s.w);\n"
+"  }\n"
+"  %s (dst0, dst1, dst2, dst3, sample, x_pos, y_pos, stride0, stride1);\n"
+"}\n"
+"}\n";
+/* *INDENT-ON* */
+
+typedef struct _TextureFormat
+{
+  GstVideoFormat format;
+  CUarray_format array_format[GST_VIDEO_MAX_COMPONENTS];
+  guint channels[GST_VIDEO_MAX_COMPONENTS];
+  const gchar *sample_func;
+} TextureFormat;
+
+#define CU_AD_FORMAT_NONE 0
+#define MAKE_FORMAT_YUV_PLANAR(f,cf,sample_func) \
+  { GST_VIDEO_FORMAT_ ##f,  { CU_AD_FORMAT_ ##cf, CU_AD_FORMAT_ ##cf, \
+      CU_AD_FORMAT_ ##cf, CU_AD_FORMAT_NONE },  {1, 1, 1, 0}, sample_func }
+#define MAKE_FORMAT_YUV_SEMI_PLANAR(f,cf,sample_func) \
+  { GST_VIDEO_FORMAT_ ##f,  { CU_AD_FORMAT_ ##cf, CU_AD_FORMAT_ ##cf, \
+      CU_AD_FORMAT_NONE, CU_AD_FORMAT_NONE }, {1, 2, 0, 0}, sample_func }
+#define MAKE_FORMAT_RGB(f,cf,sample_func) \
+  { GST_VIDEO_FORMAT_ ##f,  { CU_AD_FORMAT_ ##cf, CU_AD_FORMAT_NONE, \
+      CU_AD_FORMAT_NONE, CU_AD_FORMAT_NONE }, {4, 0, 0, 0}, sample_func }
+#define MAKE_FORMAT_RGBP(f,cf,sample_func) \
+  { GST_VIDEO_FORMAT_ ##f,  { CU_AD_FORMAT_ ##cf, CU_AD_FORMAT_ ##cf, \
+      CU_AD_FORMAT_ ##cf, CU_AD_FORMAT_NONE }, {1, 1, 1, 0}, sample_func }
+#define MAKE_FORMAT_RGBAP(f,cf,sample_func) \
+  { GST_VIDEO_FORMAT_ ##f,  { CU_AD_FORMAT_ ##cf, CU_AD_FORMAT_ ##cf, \
+      CU_AD_FORMAT_ ##cf, CU_AD_FORMAT_ ##cf }, {1, 1, 1, 1}, sample_func }
+
+static const TextureFormat format_map[] = {
+  MAKE_FORMAT_YUV_PLANAR (I420, UNSIGNED_INT8, SAMPLE_YUV_PLANAR),
+  MAKE_FORMAT_YUV_PLANAR (YV12, UNSIGNED_INT8, SAMPLE_YV12),
+  MAKE_FORMAT_YUV_SEMI_PLANAR (NV12, UNSIGNED_INT8, SAMPLE_SEMI_PLANAR),
+  MAKE_FORMAT_YUV_SEMI_PLANAR (NV21, UNSIGNED_INT8, SAMPLE_SEMI_PLANAR_SWAP),
+  MAKE_FORMAT_YUV_SEMI_PLANAR (P010_10LE, UNSIGNED_INT16, SAMPLE_SEMI_PLANAR),
+  MAKE_FORMAT_YUV_SEMI_PLANAR (P016_LE, UNSIGNED_INT16, SAMPLE_SEMI_PLANAR),
+  MAKE_FORMAT_YUV_PLANAR (I420_10LE, UNSIGNED_INT16, SAMPLE_YUV_PLANAR_10BIS),
+  MAKE_FORMAT_YUV_PLANAR (Y444, UNSIGNED_INT8, SAMPLE_YUV_PLANAR),
+  MAKE_FORMAT_YUV_PLANAR (Y444_16LE, UNSIGNED_INT16, SAMPLE_YUV_PLANAR),
+  MAKE_FORMAT_RGB (RGBA, UNSIGNED_INT8, SAMPLE_RGBA),
+  MAKE_FORMAT_RGB (BGRA, UNSIGNED_INT8, SAMPLE_BGRA),
+  MAKE_FORMAT_RGB (RGBx, UNSIGNED_INT8, SAMPLE_RGBx),
+  MAKE_FORMAT_RGB (BGRx, UNSIGNED_INT8, SAMPLE_BGRx),
+  MAKE_FORMAT_RGB (ARGB, UNSIGNED_INT8, SAMPLE_ARGB),
+  MAKE_FORMAT_RGB (ARGB64, UNSIGNED_INT16, SAMPLE_ARGB64),
+  MAKE_FORMAT_RGB (ABGR, UNSIGNED_INT8, SAMPLE_AGBR),
+  MAKE_FORMAT_YUV_PLANAR (Y42B, UNSIGNED_INT8, SAMPLE_YUV_PLANAR),
+  MAKE_FORMAT_YUV_PLANAR (I422_10LE, UNSIGNED_INT16, SAMPLE_YUV_PLANAR_10BIS),
+  MAKE_FORMAT_YUV_PLANAR (I422_12LE, UNSIGNED_INT16, SAMPLE_YUV_PLANAR_12BIS),
+  MAKE_FORMAT_RGBP (RGBP, UNSIGNED_INT8, SAMPLE_RGBP),
+  MAKE_FORMAT_RGBP (BGRP, UNSIGNED_INT8, SAMPLE_BGRP),
+  MAKE_FORMAT_RGBP (GBR, UNSIGNED_INT8, SAMPLE_GBR),
+  MAKE_FORMAT_RGBAP (GBRA, UNSIGNED_INT8, SAMPLE_GBRA),
+};
+
+typedef struct _TextureBuffer
+{
+  CUdeviceptr ptr;
+  gsize stride;
+} TextureBuffer;
+
+typedef struct
+{
+  gint x;
+  gint y;
+  gint width;
+  gint height;
+} ConverterRect;
+
+struct _GstCudaConverterPrivate
+{
+  GstVideoInfo in_info;
+  GstVideoInfo out_info;
+
+  GstStructure *config;
+
+  GstVideoInfo texture_info;
+  const TextureFormat *texture_fmt;
+  gint texture_align;
+  ConverterRect dest_rect;
+
+  TextureBuffer fallback_buffer[GST_VIDEO_MAX_COMPONENTS];
+  CUfilter_mode filter_mode[GST_VIDEO_MAX_COMPONENTS];
+  TextureBuffer unpack_buffer;
+
+  CUmodule module;
+  CUfunction main_func;
+  CUfunction unpack_func;
+};
+
+static void gst_cuda_converter_dispose (GObject * object);
+static void gst_cuda_converter_finalize (GObject * object);
+
+#define gst_cuda_converter_parent_class parent_class
+G_DEFINE_TYPE_WITH_PRIVATE (GstCudaConverter, gst_cuda_converter,
+    GST_TYPE_OBJECT);
+
+static void
+gst_cuda_converter_class_init (GstCudaConverterClass * klass)
+{
+  GObjectClass *object_class = G_OBJECT_CLASS (klass);
+
+  object_class->dispose = gst_cuda_converter_dispose;
+  object_class->finalize = gst_cuda_converter_finalize;
+
+  GST_DEBUG_CATEGORY_INIT (gst_cuda_converter_debug,
+      "cudaconverter", 0, "cudaconverter");
+}
+
+static void
+gst_cuda_converter_init (GstCudaConverter * self)
+{
+  GstCudaConverterPrivate *priv;
+
+  self->priv = priv = gst_cuda_converter_get_instance_private (self);
+  priv->config = gst_structure_new_empty ("GstCudaConverter");
+}
+
+static void
+gst_cuda_converter_dispose (GObject * object)
+{
+  GstCudaConverter *self = GST_CUDA_CONVERTER (object);
+  GstCudaConverterPrivate *priv = self->priv;
+  guint i;
+
+  if (self->context && gst_cuda_context_push (self->context)) {
+    if (priv->module) {
+      CuModuleUnload (priv->module);
+      priv->module = NULL;
+    }
+
+    for (i = 0; i < G_N_ELEMENTS (priv->fallback_buffer); i++) {
+      if (priv->fallback_buffer[i].ptr) {
+        CuMemFree (priv->fallback_buffer[i].ptr);
+        priv->fallback_buffer[i].ptr = 0;
+      }
+    }
+
+    if (priv->unpack_buffer.ptr) {
+      CuMemFree (priv->unpack_buffer.ptr);
+      priv->unpack_buffer.ptr = 0;
+    }
+
+    gst_cuda_context_pop (NULL);
+  }
+
+  gst_clear_object (&self->context);
+
+  G_OBJECT_CLASS (parent_class)->dispose (object);
+}
+
+static void
+gst_cuda_converter_finalize (GObject * object)
+{
+  GstCudaConverter *self = GST_CUDA_CONVERTER (object);
+  GstCudaConverterPrivate *priv = self->priv;
+
+  gst_structure_free (priv->config);
+
+  G_OBJECT_CLASS (parent_class)->finalize (object);
+}
+
+static const gchar *
+get_color_range_name (GstVideoColorRange range)
+{
+  switch (range) {
+    case GST_VIDEO_COLOR_RANGE_0_255:
+      return "FULL";
+    case GST_VIDEO_COLOR_RANGE_16_235:
+      return "STUDIO";
+    default:
+      break;
+  }
+
+  return "UNKNOWN";
+}
+
+typedef struct _GstCudaColorMatrixString
+{
+  gchar matrix[3][3][G_ASCII_DTOSTR_BUF_SIZE];
+  gchar offset[3][G_ASCII_DTOSTR_BUF_SIZE];
+  gchar min[3][G_ASCII_DTOSTR_BUF_SIZE];
+  gchar max[3][G_ASCII_DTOSTR_BUF_SIZE];
+} GstCudaColorMatrixString;
+
+static void
+color_matrix_to_string (const GstCudaColorMatrix * m,
+    GstCudaColorMatrixString * str)
+{
+  guint i, j;
+  for (i = 0; i < 3; i++) {
+    for (j = 0; j < 3; j++) {
+      g_ascii_formatd (str->matrix[i][j], G_ASCII_DTOSTR_BUF_SIZE, "%f",
+          m->matrix[i][j]);
+    }
+
+    g_ascii_formatd (str->offset[i],
+        G_ASCII_DTOSTR_BUF_SIZE, "%f", m->offset[i]);
+    g_ascii_formatd (str->min[i], G_ASCII_DTOSTR_BUF_SIZE, "%f", m->min[i]);
+    g_ascii_formatd (str->max[i], G_ASCII_DTOSTR_BUF_SIZE, "%f", m->max[i]);
+  }
+}
+
+static gboolean
+gst_cuda_converter_setup (GstCudaConverter * self)
+{
+  GstCudaConverterPrivate *priv = self->priv;
+  const GstVideoInfo *in_info;
+  const GstVideoInfo *out_info;
+  const GstVideoInfo *texture_info;
+  GstCudaColorMatrix to_rgb_matrix;
+  GstCudaColorMatrix to_yuv_matrix;
+  GstCudaColorMatrix border_color_matrix;
+  GstCudaColorMatrixString to_rgb_matrix_str;
+  GstCudaColorMatrixString to_yuv_matrix_str;
+  gchar border_color_str[4][G_ASCII_DTOSTR_BUF_SIZE];
+  gdouble border_color[4];
+  gchar offset_x[G_ASCII_DTOSTR_BUF_SIZE];
+  gchar offset_y[G_ASCII_DTOSTR_BUF_SIZE];
+  gint i, j;
+  const gchar *unpack_function = NULL;
+  const gchar *write_func = NULL;
+  const gchar *to_rgb_func = COLOR_SPACE_IDENTITY;
+  const gchar *to_yuv_func = COLOR_SPACE_IDENTITY;
+  const GstVideoColorimetry *in_color;
+  const GstVideoColorimetry *out_color;
+  gchar *str;
+  gchar *ptx;
+  CUresult ret;
+
+  in_info = &priv->in_info;
+  out_info = &priv->out_info;
+  texture_info = &priv->texture_info;
+  in_color = &in_info->colorimetry;
+  out_color = &out_info->colorimetry;
+
+  memset (&to_rgb_matrix, 0, sizeof (GstCudaColorMatrix));
+  color_matrix_identity (&to_rgb_matrix);
+
+  memset (&to_yuv_matrix, 0, sizeof (GstCudaColorMatrix));
+  color_matrix_identity (&to_yuv_matrix);
+
+  switch (GST_VIDEO_INFO_FORMAT (out_info)) {
+    case GST_VIDEO_FORMAT_I420:
+      write_func = WRITE_I420;
+      break;
+    case GST_VIDEO_FORMAT_YV12:
+      write_func = WRITE_YV12;
+      break;
+    case GST_VIDEO_FORMAT_NV12:
+      write_func = WRITE_NV12;
+      break;
+    case GST_VIDEO_FORMAT_NV21:
+      write_func = WRITE_NV21;
+      break;
+    case GST_VIDEO_FORMAT_P010_10LE:
+      write_func = WRITE_P010;
+      break;
+    case GST_VIDEO_FORMAT_P016_LE:
+      write_func = WRITE_P016;
+      break;
+    case GST_VIDEO_FORMAT_I420_10LE:
+      write_func = WRITE_I420_10;
+      break;
+    case GST_VIDEO_FORMAT_Y444:
+      write_func = WRITE_Y444;
+      break;
+    case GST_VIDEO_FORMAT_Y444_16LE:
+      write_func = WRITE_Y444_16;
+      break;
+    case GST_VIDEO_FORMAT_RGBA:
+      write_func = WRITE_RGBA;
+      break;
+    case GST_VIDEO_FORMAT_RGBx:
+      write_func = WRITE_RGBx;
+      break;
+    case GST_VIDEO_FORMAT_BGRA:
+      write_func = WRITE_BGRA;
+      break;
+    case GST_VIDEO_FORMAT_BGRx:
+      write_func = WRITE_BGRx;
+      break;
+    case GST_VIDEO_FORMAT_ARGB:
+      write_func = WRITE_ARGB;
+      break;
+    case GST_VIDEO_FORMAT_ABGR:
+      write_func = WRITE_ABGR;
+      break;
+    case GST_VIDEO_FORMAT_RGB:
+      write_func = WRITE_RGB;
+      break;
+    case GST_VIDEO_FORMAT_BGR:
+      write_func = WRITE_BGR;
+      break;
+    case GST_VIDEO_FORMAT_RGB10A2_LE:
+      write_func = WRITE_RGB10A2;
+      break;
+    case GST_VIDEO_FORMAT_BGR10A2_LE:
+      write_func = WRITE_BGR10A2;
+      break;
+    case GST_VIDEO_FORMAT_Y42B:
+      write_func = WRITE_Y42B;
+      break;
+    case GST_VIDEO_FORMAT_I422_10LE:
+      write_func = WRITE_I422_10;
+      break;
+    case GST_VIDEO_FORMAT_I422_12LE:
+      write_func = WRITE_I422_12;
+      break;
+    case GST_VIDEO_FORMAT_RGBP:
+      write_func = WRITE_RGBP;
+      break;
+    case GST_VIDEO_FORMAT_BGRP:
+      write_func = WRITE_BGRP;
+      break;
+    case GST_VIDEO_FORMAT_GBR:
+      write_func = WRITE_GBR;
+      break;
+    case GST_VIDEO_FORMAT_GBRA:
+      write_func = WRITE_GBRA;
+      break;
+    default:
+      break;
+  }
+
+  if (!write_func) {
+    GST_ERROR_OBJECT (self, "Unknown write function for format %s",
+        gst_video_format_to_string (GST_VIDEO_INFO_FORMAT (out_info)));
+    return FALSE;
+  }
+
+  /* Decide texture info to use, 3 channel RGB or 10bits packed RGB
+   * need be converted to other format */
+  priv->texture_info = priv->in_info;
+  switch (GST_VIDEO_INFO_FORMAT (in_info)) {
+    case GST_VIDEO_FORMAT_RGB:
+      gst_video_info_set_format (&priv->texture_info,
+          GST_VIDEO_FORMAT_RGBx, GST_VIDEO_INFO_WIDTH (in_info),
+          GST_VIDEO_INFO_HEIGHT (in_info));
+      unpack_function = RGB_TO_RGBx;
+      break;
+    case GST_VIDEO_FORMAT_BGR:
+      gst_video_info_set_format (&priv->texture_info,
+          GST_VIDEO_FORMAT_BGRx, GST_VIDEO_INFO_WIDTH (in_info),
+          GST_VIDEO_INFO_HEIGHT (in_info));
+      unpack_function = RGB_TO_RGBx;
+      break;
+    case GST_VIDEO_FORMAT_RGB10A2_LE:
+      gst_video_info_set_format (&priv->texture_info,
+          GST_VIDEO_FORMAT_ARGB64, GST_VIDEO_INFO_WIDTH (in_info),
+          GST_VIDEO_INFO_HEIGHT (in_info));
+      unpack_function = RGB10A2_TO_ARGB64;
+      break;
+    case GST_VIDEO_FORMAT_BGR10A2_LE:
+      gst_video_info_set_format (&priv->texture_info,
+          GST_VIDEO_FORMAT_ARGB64, GST_VIDEO_INFO_WIDTH (in_info),
+          GST_VIDEO_INFO_HEIGHT (in_info));
+      unpack_function = BGR10A2_TO_ARGB64;
+      break;
+    default:
+      break;
+  }
+
+  for (i = 0; i < G_N_ELEMENTS (format_map); i++) {
+    if (format_map[i].format == GST_VIDEO_INFO_FORMAT (texture_info)) {
+      priv->texture_fmt = &format_map[i];
+      break;
+    }
+  }
+
+  if (!priv->texture_fmt) {
+    GST_ERROR_OBJECT (self, "Couldn't find texture format for %s (%s)",
+        gst_video_format_to_string (GST_VIDEO_INFO_FORMAT (in_info)),
+        gst_video_format_to_string (GST_VIDEO_INFO_FORMAT (texture_info)));
+    return FALSE;
+  }
+
+  /* calculate black color
+   * TODO: add support border color */
+  if (GST_VIDEO_INFO_IS_RGB (out_info)) {
+    GstVideoInfo rgb_info = *out_info;
+    rgb_info.colorimetry.range = GST_VIDEO_COLOR_RANGE_0_255;
+    gst_cuda_color_range_adjust_matrix_unorm (&rgb_info, out_info,
+        &border_color_matrix);
+  } else {
+    GstVideoInfo rgb_info;
+
+    gst_video_info_set_format (&rgb_info, GST_VIDEO_FORMAT_RGBA64_LE,
+        out_info->width, out_info->height);
+
+    gst_cuda_rgb_to_yuv_matrix_unorm (&rgb_info,
+        out_info, &border_color_matrix);
+  }
+
+  for (i = 0; i < 3; i++) {
+    /* TODO: property */
+    gdouble border_rgba[4] = { 0, 0, 0 };
+    border_color[i] = 0;
+    for (j = 0; j < 3; j++)
+      border_color[i] += border_color_matrix.matrix[i][j] * border_rgba[i];
+    border_color[i] = border_color_matrix.offset[i];
+    border_color[i] = CLAMP (border_color[i],
+        border_color_matrix.min[i], border_color_matrix.max[i]);
+
+    g_ascii_formatd (border_color_str[i],
+        G_ASCII_DTOSTR_BUF_SIZE, "%f", border_color[i]);
+  }
+  g_ascii_formatd (border_color_str[3], G_ASCII_DTOSTR_BUF_SIZE, "%f", 1);
+
+  /* FIXME: handle primaries and transfer functions */
+  if (GST_VIDEO_INFO_IS_RGB (texture_info)) {
+    if (GST_VIDEO_INFO_IS_RGB (out_info)) {
+      /* RGB -> RGB */
+      if (in_color->range == out_color->range) {
+        GST_DEBUG_OBJECT (self, "RGB -> RGB conversion without matrix");
+      } else {
+        if (!gst_cuda_color_range_adjust_matrix_unorm (in_info, out_info,
+                &to_rgb_matrix)) {
+          GST_ERROR_OBJECT (self, "Failed to get RGB range adjust matrix");
+          return FALSE;
+        }
+
+        str = gst_cuda_dump_color_matrix (&to_rgb_matrix);
+        GST_DEBUG_OBJECT (self, "RGB range adjust %s -> %s\n%s",
+            get_color_range_name (in_color->range),
+            get_color_range_name (out_color->range), str);
+        g_free (str);
+
+        to_rgb_func = COLOR_SPACE_CONVERT;
+      }
+    } else {
+      /* RGB -> YUV */
+      if (!gst_cuda_rgb_to_yuv_matrix_unorm (in_info, out_info, &to_yuv_matrix)) {
+        GST_ERROR_OBJECT (self, "Failed to get RGB -> YUV transform matrix");
+        return FALSE;
+      }
+
+      str = gst_cuda_dump_color_matrix (&to_yuv_matrix);
+      GST_DEBUG_OBJECT (self, "RGB -> YUV matrix:\n%s", str);
+      g_free (str);
+
+      to_yuv_func = COLOR_SPACE_CONVERT;
+    }
+  } else {
+    if (GST_VIDEO_INFO_IS_RGB (out_info)) {
+      /* YUV -> RGB */
+      if (!gst_cuda_yuv_to_rgb_matrix_unorm (in_info, out_info, &to_rgb_matrix)) {
+        GST_ERROR_OBJECT (self, "Failed to get YUV -> RGB transform matrix");
+        return FALSE;
+      }
+
+      str = gst_cuda_dump_color_matrix (&to_rgb_matrix);
+      GST_DEBUG_OBJECT (self, "YUV -> RGB matrix:\n%s", str);
+      g_free (str);
+
+      to_rgb_func = COLOR_SPACE_CONVERT;
+    } else {
+      /* YUV -> YUV */
+      if (in_color->range == out_color->range) {
+        GST_DEBUG_OBJECT (self, "YUV -> YU conversion without matrix");
+      } else {
+        if (!gst_cuda_color_range_adjust_matrix_unorm (in_info, out_info,
+                &to_yuv_matrix)) {
+          GST_ERROR_OBJECT (self, "Failed to get GRAY range adjust matrix");
+          return FALSE;
+        }
+
+        str = gst_cuda_dump_color_matrix (&to_yuv_matrix);
+        GST_DEBUG_OBJECT (self, "YUV range adjust matrix:\n%s", str);
+        g_free (str);
+
+        to_yuv_func = COLOR_SPACE_CONVERT;
+      }
+    }
+  }
+
+  color_matrix_to_string (&to_rgb_matrix, &to_rgb_matrix_str);
+  color_matrix_to_string (&to_yuv_matrix, &to_yuv_matrix_str);
+
+  /* half pixel offset, to sample texture at center of the pixel position */
+  g_ascii_formatd (offset_x, G_ASCII_DTOSTR_BUF_SIZE, "%f",
+      (gdouble) 0.5 / priv->dest_rect.width);
+  g_ascii_formatd (offset_y, G_ASCII_DTOSTR_BUF_SIZE, "%f",
+      (gdouble) 0.5 / priv->dest_rect.height);
+
+  str = g_strdup_printf (TEMPLETA_KERNEL, KERNEL_COMMON,
+      unpack_function ? unpack_function : "",
+      /* TO RGB matrix */
+      to_rgb_matrix_str.matrix[0][0],
+      to_rgb_matrix_str.matrix[0][1],
+      to_rgb_matrix_str.matrix[0][2],
+      to_rgb_matrix_str.matrix[1][0],
+      to_rgb_matrix_str.matrix[1][1],
+      to_rgb_matrix_str.matrix[1][2],
+      to_rgb_matrix_str.matrix[2][0],
+      to_rgb_matrix_str.matrix[2][1],
+      to_rgb_matrix_str.matrix[2][2],
+      to_rgb_matrix_str.offset[0],
+      to_rgb_matrix_str.offset[1],
+      to_rgb_matrix_str.offset[2],
+      to_rgb_matrix_str.min[0],
+      to_rgb_matrix_str.min[1],
+      to_rgb_matrix_str.min[2],
+      to_rgb_matrix_str.max[0],
+      to_rgb_matrix_str.max[1], to_rgb_matrix_str.max[2],
+      /* TO YUV matrix */
+      to_yuv_matrix_str.matrix[0][0],
+      to_yuv_matrix_str.matrix[0][1],
+      to_yuv_matrix_str.matrix[0][2],
+      to_yuv_matrix_str.matrix[1][0],
+      to_yuv_matrix_str.matrix[1][1],
+      to_yuv_matrix_str.matrix[1][2],
+      to_yuv_matrix_str.matrix[2][0],
+      to_yuv_matrix_str.matrix[2][1],
+      to_yuv_matrix_str.matrix[2][2],
+      to_yuv_matrix_str.offset[0],
+      to_yuv_matrix_str.offset[1],
+      to_yuv_matrix_str.offset[2],
+      to_yuv_matrix_str.min[0],
+      to_yuv_matrix_str.min[1],
+      to_yuv_matrix_str.min[2],
+      to_yuv_matrix_str.max[0],
+      to_yuv_matrix_str.max[1], to_yuv_matrix_str.max[2],
+      /* width/height */
+      GST_VIDEO_INFO_WIDTH (out_info), GST_VIDEO_INFO_HEIGHT (out_info),
+      /* viewport */
+      priv->dest_rect.x, priv->dest_rect.y,
+      priv->dest_rect.x + priv->dest_rect.width,
+      priv->dest_rect.y + priv->dest_rect.height,
+      priv->dest_rect.width, priv->dest_rect.height,
+      /* half pixel offsets */
+      offset_x, offset_y,
+      /* border colors */
+      border_color_str[0], border_color_str[1],
+      border_color_str[2], border_color_str[3],
+      /* sampler function name */
+      priv->texture_fmt->sample_func,
+      /* TO RGB conversion function name */
+      to_rgb_func,
+      /* TO YUV conversion function name */
+      to_yuv_func,
+      /* write function name */
+      write_func);
+
+  GST_LOG_OBJECT (self, "kernel code:\n%s\n", str);
+  ptx = gst_cuda_nvrtc_compile (str);
+  g_free (str);
+
+  if (!ptx) {
+    GST_ERROR_OBJECT (self, "Could not compile code");
+    return FALSE;
+  }
+
+  if (!gst_cuda_context_push (self->context)) {
+    GST_ERROR_OBJECT (self, "Couldn't push context");
+    return FALSE;
+  }
+
+  /* Allocates intermediate memory for texture */
+  if (unpack_function) {
+    ret = CuMemAllocPitch (&priv->unpack_buffer.ptr,
+        &priv->unpack_buffer.stride,
+        GST_VIDEO_INFO_COMP_WIDTH (texture_info, 0) *
+        GST_VIDEO_INFO_COMP_PSTRIDE (texture_info, 0),
+        GST_VIDEO_INFO_HEIGHT (texture_info), 16);
+    if (!gst_cuda_result (ret)) {
+      GST_ERROR_OBJECT (self, "Couldn't allocate unpack buffer");
+      goto error;
+    }
+  }
+
+  ret = CuModuleLoadData (&priv->module, ptx);
+  g_free (ptx);
+  if (!gst_cuda_result (ret)) {
+    GST_ERROR_OBJECT (self, "Could not load module");
+    priv->module = NULL;
+    goto error;
+  }
+
+  ret = CuModuleGetFunction (&priv->main_func,
+      priv->module, GST_CUDA_KERNEL_MAIN_FUNC);
+  if (!gst_cuda_result (ret)) {
+    GST_ERROR_OBJECT (self, "Could not get main function");
+    goto error;
+  }
+
+  if (unpack_function) {
+    ret = CuModuleGetFunction (&priv->unpack_func,
+        priv->module, GST_CUDA_KERNEL_UNPACK_FUNC);
+    if (!gst_cuda_result (ret)) {
+      GST_ERROR_OBJECT (self, "Could not get unpack function");
+      goto error;
+    }
+  }
+
+  gst_cuda_context_pop (NULL);
+
+  if (priv->dest_rect.x != 0 || priv->dest_rect.y != 0 ||
+      priv->dest_rect.width != out_info->width ||
+      priv->dest_rect.height != out_info->height ||
+      in_info->width != out_info->width
+      || in_info->height != out_info->height) {
+    for (i = 0; i < G_N_ELEMENTS (priv->filter_mode); i++)
+      priv->filter_mode[i] = CU_TR_FILTER_MODE_LINEAR;
+  } else {
+    for (i = 0; i < G_N_ELEMENTS (priv->filter_mode); i++)
+      priv->filter_mode[i] = CU_TR_FILTER_MODE_POINT;
+  }
+
+  return TRUE;
+
+error:
+  gst_cuda_context_pop (NULL);
+  return FALSE;
+}
+
+static gboolean
+copy_config (GQuark field_id, const GValue * value, gpointer user_data)
+{
+  GstCudaConverter *self = (GstCudaConverter *) user_data;
+
+  gst_structure_id_set_value (self->priv->config, field_id, value);
+
+  return TRUE;
+}
+
+static void
+gst_cuda_converter_set_config (GstCudaConverter * self, GstStructure * config)
+{
+  gst_structure_foreach (config, copy_config, self);
+  gst_structure_free (config);
+}
+
+static gint
+get_opt_int (GstCudaConverter * self, const gchar * opt, gint def)
+{
+  gint res;
+  if (!gst_structure_get_int (self->priv->config, opt, &res))
+    res = def;
+  return res;
+}
+
+GstCudaConverter *
+gst_cuda_converter_new (const GstVideoInfo * in_info,
+    const GstVideoInfo * out_info, GstCudaContext * context,
+    GstStructure * config)
+{
+  GstCudaConverter *self;
+  GstCudaConverterPrivate *priv;
+
+  g_return_val_if_fail (in_info != NULL, NULL);
+  g_return_val_if_fail (out_info != NULL, NULL);
+  g_return_val_if_fail (GST_IS_CUDA_CONTEXT (context), NULL);
+
+  self = g_object_new (GST_TYPE_CUDA_CONVERTER, NULL);
+
+  if (!GST_IS_CUDA_CONTEXT (context)) {
+    GST_WARNING_OBJECT (self, "Not a valid cuda context object");
+    goto error;
+  }
+
+  self->context = gst_object_ref (context);
+  priv = self->priv;
+  priv->in_info = *in_info;
+  priv->out_info = *out_info;
+
+  if (config)
+    gst_cuda_converter_set_config (self, config);
+
+  priv->dest_rect.x = get_opt_int (self, GST_CUDA_CONVERTER_OPT_DEST_X, 0);
+  priv->dest_rect.y = get_opt_int (self, GST_CUDA_CONVERTER_OPT_DEST_Y, 0);
+  priv->dest_rect.width = get_opt_int (self,
+      GST_CUDA_CONVERTER_OPT_DEST_WIDTH, out_info->width);
+  priv->dest_rect.height = get_opt_int (self,
+      GST_CUDA_CONVERTER_OPT_DEST_HEIGHT, out_info->height);
+
+  if (!gst_cuda_converter_setup (self))
+    goto error;
+
+  priv->texture_align = gst_cuda_context_get_texture_alignment (context);
+
+  gst_object_ref_sink (self);
+  return self;
+
+error:
+  gst_object_unref (self);
+  return NULL;
+}
+
+
+static CUtexObject
+gst_cuda_converter_create_texture_unchecked (GstCudaConverter * self,
+    CUdeviceptr src, gint width, gint height, CUarray_format format,
+    guint channels, gint stride, gint plane, CUfilter_mode mode)
+{
+  CUDA_TEXTURE_DESC texture_desc;
+  CUDA_RESOURCE_DESC resource_desc;
+  CUtexObject texture = 0;
+  CUresult cuda_ret;
+
+  memset (&texture_desc, 0, sizeof (CUDA_TEXTURE_DESC));
+  memset (&resource_desc, 0, sizeof (CUDA_RESOURCE_DESC));
+
+  resource_desc.resType = CU_RESOURCE_TYPE_PITCH2D;
+  resource_desc.res.pitch2D.format = format;
+  resource_desc.res.pitch2D.numChannels = channels;
+  resource_desc.res.pitch2D.width = width;
+  resource_desc.res.pitch2D.height = height;
+  resource_desc.res.pitch2D.pitchInBytes = stride;
+  resource_desc.res.pitch2D.devPtr = src;
+
+  texture_desc.filterMode = mode;
+  /* Will read texture value as a normalized [0, 1] float value
+   * with [0, 1) coordinates */
+  /* CU_TRSF_NORMALIZED_COORDINATES */
+  texture_desc.flags = 0x2;
+  /* CU_TR_ADDRESS_MODE_CLAMP */
+  texture_desc.addressMode[0] = 1;
+  texture_desc.addressMode[1] = 1;
+  texture_desc.addressMode[2] = 1;
+
+  cuda_ret = CuTexObjectCreate (&texture, &resource_desc, &texture_desc, NULL);
+
+  if (!gst_cuda_result (cuda_ret)) {
+    GST_ERROR_OBJECT (self, "Could not create texture");
+    return 0;
+  }
+
+  return texture;
+}
+
+static gboolean
+ensure_fallback_buffer (GstCudaConverter * self, gint width_in_bytes,
+    gint height, guint plane)
+{
+  GstCudaConverterPrivate *priv = self->priv;
+  CUresult ret;
+
+  if (priv->fallback_buffer[plane].ptr)
+    return TRUE;
+
+  ret = CuMemAllocPitch (&priv->fallback_buffer[plane].ptr,
+      &priv->fallback_buffer[plane].stride, width_in_bytes, height, 16);
+
+  if (!gst_cuda_result (ret)) {
+    GST_ERROR_OBJECT (self, "Couldn't allocate fallback buffer");
+    return FALSE;
+  }
+
+  return TRUE;
+}
+
+static CUtexObject
+gst_cuda_converter_create_texture (GstCudaConverter * self,
+    CUdeviceptr src, gint width, gint height, gint stride, CUfilter_mode mode,
+    CUarray_format format, guint channles, gint plane, CUstream stream)
+{
+  GstCudaConverterPrivate *priv = self->priv;
+  CUresult ret;
+  CUdeviceptr src_ptr;
+
+  src_ptr = src;
+
+  if (priv->texture_align > 0 && (src_ptr % priv->texture_align) != 0) {
+    CUDA_MEMCPY2D params = { 0, };
+
+    GST_DEBUG_OBJECT (self, "Plane %d is not aligned, copying", plane);
+
+    if (!ensure_fallback_buffer (self, stride, height, plane))
+      return 0;
+
+    params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+    params.srcPitch = stride;
+    params.srcDevice = (CUdeviceptr) src_ptr;
+
+    params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+    params.dstPitch = priv->fallback_buffer[plane].stride;
+    params.dstDevice = priv->fallback_buffer[plane].ptr;
+    params.WidthInBytes = GST_VIDEO_INFO_COMP_WIDTH (&priv->in_info, plane)
+        * GST_VIDEO_INFO_COMP_PSTRIDE (&priv->in_info, plane),
+        params.Height = GST_VIDEO_INFO_COMP_HEIGHT (&priv->in_info, plane);
+
+    ret = CuMemcpy2D (&params);
+    if (!gst_cuda_result (ret)) {
+      GST_ERROR_OBJECT (self, "Couldn't copy to fallback buffer");
+      return 0;
+    }
+
+    src_ptr = priv->fallback_buffer[plane].ptr;
+    stride = priv->fallback_buffer[plane].stride;
+  }
+
+  return gst_cuda_converter_create_texture_unchecked (self,
+      src_ptr, width, height, format, channles, stride, plane, mode);
+}
+
+static gboolean
+gst_cuda_converter_unpack_rgb (GstCudaConverter * self,
+    GstVideoFrame * src_frame, CUstream stream)
+{
+  GstCudaConverterPrivate *priv = self->priv;
+  CUdeviceptr src;
+  gint width, height, src_stride, dst_stride;
+  CUresult ret;
+  gpointer args[] = { &src, &priv->unpack_buffer.ptr,
+    &width, &height, &src_stride, &dst_stride
+  };
+
+  g_assert (priv->unpack_buffer.ptr);
+  g_assert (priv->unpack_buffer.stride > 0);
+
+  src = (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (src_frame, 0);
+  width = GST_VIDEO_FRAME_WIDTH (src_frame);
+  height = GST_VIDEO_FRAME_HEIGHT (src_frame);
+  src_stride = GST_VIDEO_FRAME_PLANE_STRIDE (src_frame, 0);
+  dst_stride = (gint) priv->unpack_buffer.stride;
+
+  ret = CuLaunchKernel (priv->unpack_func, DIV_UP (width, CUDA_BLOCK_X),
+      DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
+      stream, args, NULL);
+
+  if (!gst_cuda_result (ret)) {
+    GST_ERROR_OBJECT (self, "Couldn't unpack source RGB");
+    return FALSE;
+  }
+
+  return TRUE;
+}
+
+gboolean
+gst_cuda_converter_convert_frame (GstCudaConverter * converter,
+    GstVideoFrame * src_frame, GstVideoFrame * dst_frame, CUstream stream)
+{
+  GstCudaConverterPrivate *priv;
+  const TextureFormat *format;
+  CUtexObject texture[GST_VIDEO_MAX_COMPONENTS] = { 0, };
+  guint8 *dst[GST_VIDEO_MAX_COMPONENTS] = { NULL, };
+  gint stride[2] = { 0, };
+  gint i;
+  gboolean ret = FALSE;
+  CUresult cuda_ret;
+  gint width, height;
+  gpointer args[] = { &texture[0], &texture[1], &texture[2], &texture[3],
+    &dst[0], &dst[1], &dst[2], &dst[3], &stride[0], &stride[1]
+  };
+
+  g_return_val_if_fail (GST_IS_CUDA_CONVERTER (converter), FALSE);
+  g_return_val_if_fail (src_frame != NULL, FALSE);
+  g_return_val_if_fail (dst_frame != NULL, FALSE);
+
+  priv = converter->priv;
+  format = priv->texture_fmt;
+
+  g_assert (format);
+
+  if (!gst_cuda_context_push (converter->context)) {
+    GST_ERROR_OBJECT (converter, "Couldn't push context");
+    return FALSE;
+  }
+
+  if (priv->unpack_func) {
+    if (!gst_cuda_converter_unpack_rgb (converter, src_frame, stream))
+      goto out;
+
+    texture[0] = gst_cuda_converter_create_texture_unchecked (converter,
+        priv->unpack_buffer.ptr, priv->in_info.width, priv->in_info.height,
+        format->array_format[0], 4, priv->unpack_buffer.stride, 0,
+        priv->filter_mode[0]);
+    if (!texture[0]) {
+      GST_ERROR_OBJECT (converter, "Couldn't create texture");
+      goto out;
+    }
+  } else {
+    for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (src_frame); i++) {
+      CUdeviceptr src;
+
+      src = (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (src_frame, i);
+      texture[i] = gst_cuda_converter_create_texture (converter,
+          src, GST_VIDEO_FRAME_COMP_WIDTH (src_frame, i),
+          GST_VIDEO_FRAME_COMP_HEIGHT (src_frame, i),
+          GST_VIDEO_FRAME_PLANE_STRIDE (src_frame, i),
+          priv->filter_mode[i], format->array_format[i], format->channels[i], i,
+          stream);
+      if (!texture[i]) {
+        GST_ERROR_OBJECT (converter, "Couldn't create texture %d", i);
+        goto out;
+      }
+    }
+  }
+
+  width = GST_VIDEO_FRAME_WIDTH (dst_frame);
+  height = GST_VIDEO_FRAME_HEIGHT (dst_frame);
+
+  for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (dst_frame); i++)
+    dst[i] = GST_VIDEO_FRAME_PLANE_DATA (dst_frame, i);
+
+  stride[0] = stride[1] = GST_VIDEO_FRAME_PLANE_STRIDE (dst_frame, 0);
+  if (GST_VIDEO_FRAME_N_PLANES (dst_frame) > 1)
+    stride[1] = GST_VIDEO_FRAME_PLANE_STRIDE (dst_frame, 1);
+
+  cuda_ret = CuLaunchKernel (priv->main_func, DIV_UP (width, CUDA_BLOCK_X),
+      DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0,
+      stream, args, NULL);
+
+  if (!gst_cuda_result (cuda_ret)) {
+    GST_ERROR_OBJECT (converter, "Couldn't convert frame");
+    goto out;
+  }
+
+  CuStreamSynchronize (stream);
+
+  ret = TRUE;
+
+out:
+  for (i = 0; i < G_N_ELEMENTS (texture); i++) {
+    if (texture[i])
+      CuTexObjectDestroy (texture[i]);
+    else
+      break;
+  }
+
+  gst_cuda_context_pop (NULL);
+  return ret;
+}
diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconverter.h b/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconverter.h
new file mode 100644
index 0000000000..d7b009a545
--- /dev/null
+++ b/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconverter.h
@@ -0,0 +1,99 @@
+/* GStreamer
+ * Copyright (C) 2019 Seungha Yang <seungha.yang@navercorp.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#pragma once
+
+#include <gst/video/video.h>
+#include <gst/cuda/gstcudacontext.h>
+
+G_BEGIN_DECLS
+
+#define GST_TYPE_CUDA_CONVERTER             (gst_cuda_converter_get_type())
+#define GST_CUDA_CONVERTER(obj)             (G_TYPE_CHECK_INSTANCE_CAST((obj),GST_TYPE_CUDA_CONVERTER,GstCudaConverter))
+#define GST_CUDA_CONVERTER_CLASS(klass)     (G_TYPE_CHECK_CLASS_CAST((klass),GST_TYPE_CUDA_CONVERTER,GstCudaConverterClass))
+#define GST_CUDA_CONVERTER_GET_CLASS(obj)   (GST_CUDA_CONVERTER_CLASS(G_OBJECT_GET_CLASS(obj)))
+#define GST_IS_CUDA_CONVERTER(obj)          (G_TYPE_CHECK_INSTANCE_TYPE((obj),GST_TYPE_CUDA_CONVERTER))
+#define GST_IS_CUDA_CONVERTER_CLASS(klass)  (G_TYPE_CHECK_CLASS_TYPE((klass),GST_TYPE_CUDA_CONVERTER))
+#define GST_CUDA_CONVERTER_CAST(obj)        ((GstCudaConverter*)(obj))
+
+typedef struct _GstCudaConverter GstCudaConverter;
+typedef struct _GstCudaConverterClass GstCudaConverterClass;
+typedef struct _GstCudaConverterPrivate GstCudaConverterPrivate;
+
+/**
+ * GST_CUDA_CONVERTER_OPT_DEST_X:
+ *
+ * #G_TYPE_INT, x position in the destination frame, default 0
+ */
+#define GST_CUDA_CONVERTER_OPT_DEST_X   "GstCudaConverter.dest-x"
+
+/**
+ * GST_CUDA_CONVERTER_OPT_DEST_Y:
+ *
+ * #G_TYPE_INT, y position in the destination frame, default 0
+ */
+#define GST_CUDA_CONVERTER_OPT_DEST_Y   "GstCudaConverter.dest-y"
+
+/**
+ * GST_CUDA_CONVERTER_OPT_DEST_WIDTH:
+ *
+ * #G_TYPE_INT, width in the destination frame, default destination width
+ */
+#define GST_CUDA_CONVERTER_OPT_DEST_WIDTH   "GstCudaConverter.dest-width"
+
+/**
+ * GST_CUDA_CONVERTER_OPT_DEST_HEIGHT:
+ *
+ * #G_TYPE_INT, height in the destination frame, default destination height
+ */
+#define GST_CUDA_CONVERTER_OPT_DEST_HEIGHT   "GstCudaConverter.dest-height"
+
+struct _GstCudaConverter
+{
+  GstObject parent;
+
+  GstCudaContext *context;
+
+  /*< private >*/
+  GstCudaConverterPrivate *priv;
+  gpointer _gst_reserved[GST_PADDING];
+};
+
+struct _GstCudaConverterClass
+{
+  GstObjectClass parent_class;
+
+  /*< private >*/
+  gpointer _gst_reserved[GST_PADDING];
+};
+
+GType gst_cuda_converter_get_type (void);
+
+GstCudaConverter *  gst_cuda_converter_new (const GstVideoInfo * in_info,
+                                            const GstVideoInfo * out_info,
+                                            GstCudaContext * context,
+                                            GstStructure * config);
+
+gboolean            gst_cuda_converter_convert_frame (GstCudaConverter * converter,
+                                                      GstVideoFrame * src_frame,
+                                                      GstVideoFrame * dst_frame,
+                                                      CUstream cuda_stream);
+
+G_END_DECLS
+
diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconvertscale.c b/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconvertscale.c
index 3fda6d7a6d..ad696ebd96 100644
--- a/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconvertscale.c
+++ b/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconvertscale.c
@@ -24,7 +24,7 @@
 
 #include <gst/cuda/gstcudautils.h>
 #include "gstcudaconvertscale.h"
-#include "cuda-converter.h"
+#include "gstcudaconverter.h"
 
 GST_DEBUG_CATEGORY_STATIC (gst_cuda_base_convert_debug);
 #define GST_CAT_DEFAULT gst_cuda_base_convert_debug
@@ -133,10 +133,7 @@ gst_cuda_base_convert_dispose (GObject * object)
 {
   GstCudaBaseConvert *self = GST_CUDA_BASE_CONVERT (object);
 
-  if (self->converter) {
-    gst_cuda_converter_free (self->converter);
-    self->converter = NULL;
-  }
+  gst_clear_object (&self->converter);
 
   G_OBJECT_CLASS (parent_class)->dispose (object);
 }
@@ -1229,7 +1226,7 @@ gst_cuda_base_convert_set_info (GstCudaBaseTransform * btrans,
   gint from_dar_n, from_dar_d, to_dar_n, to_dar_d;
   GstVideoInfo tmp_info;
 
-  g_clear_pointer (&self->converter, gst_cuda_converter_free);
+  gst_clear_object (&self->converter);
 
   if (!gst_util_fraction_multiply (in_info->width,
           in_info->height, in_info->par_n, in_info->par_d, &from_dar_n,
@@ -1288,7 +1285,7 @@ gst_cuda_base_convert_set_info (GstCudaBaseTransform * btrans,
     gst_base_transform_set_passthrough (GST_BASE_TRANSFORM (self), FALSE);
 
     self->converter = gst_cuda_converter_new (in_info,
-        out_info, btrans->context);
+        out_info, btrans->context, NULL);
     if (!self->converter) {
       GST_ERROR_OBJECT (self, "Couldn't create converter");
       return FALSE;
diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/meson.build b/subprojects/gst-plugins-bad/sys/nvcodec/meson.build
index ec871216fb..9010b7c7a1 100644
--- a/subprojects/gst-plugins-bad/sys/nvcodec/meson.build
+++ b/subprojects/gst-plugins-bad/sys/nvcodec/meson.build
@@ -1,6 +1,6 @@
 nvcodec_sources = [
-  'cuda-converter.c',
   'gstcudabasetransform.c',
+  'gstcudaconverter.c',
   'gstcudaconvertscale.c',
   'gstcudafilter.c',
   'gstcudamemorycopy.c',