gstreamer/subprojects/gst-plugins-bad/sys/nvcodec/kernel/gstcudaconverter.cu
Seungha Yang 9a8f3a65a3 nvcodec: Add support for CUDA kernel precompile
Enable build time CUDA kernel compile if nvcc is detected.
Precompile is disabled by default and controlled by
"nvcodec-cuda-precompile" build option.

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/8536>
2025-02-22 07:09:03 +00:00

2924 lines
No EOL
96 KiB
Text

/* GStreamer
* Copyright (C) 2025 Seungha Yang <seungha@centricular.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#ifdef __NVCC__
struct ColorMatrix
{
float CoeffX[3];
float CoeffY[3];
float CoeffZ[3];
float Offset[3];
float Min[3];
float Max[3];
};
struct ConstBuffer
{
ColorMatrix matrix;
int width;
int height;
int left;
int top;
int right;
int bottom;
int view_width;
int view_height;
float border_x;
float border_y;
float border_z;
float border_w;
int fill_border;
int video_direction;
float alpha;
int do_blend;
int do_convert;
};
__device__ inline float
dot (const float coeff[3], float3 val)
{
return coeff[0] * val.x + coeff[1] * val.y + coeff[2] * val.z;
}
__device__ inline float
clamp (float val, float min_val, float max_val)
{
return max (min_val, min (val, max_val));
}
__device__ inline float3
clamp3 (float3 val, const float min_val[3], const float max_val[3])
{
return make_float3 (clamp (val.x, min_val[0], max_val[0]),
clamp (val.y, min_val[1], max_val[2]),
clamp (val.z, min_val[1], max_val[2]));
}
__device__ inline unsigned char
scale_to_2bits (float val)
{
return (unsigned short) __float2int_rz (val * 3.0);
}
__device__ inline unsigned char
scale_to_uchar (float val)
{
return (unsigned char) __float2int_rz (val * 255.0);
}
__device__ inline unsigned short
scale_to_ushort (float val)
{
return (unsigned short) __float2int_rz (val * 65535.0);
}
__device__ inline unsigned short
scale_to_10bits (float val)
{
return (unsigned short) __float2int_rz (val * 1023.0);
}
__device__ inline unsigned short
scale_to_12bits (float val)
{
return (unsigned short) __float2int_rz (val * 4095.0);
}
__device__ inline unsigned char
blend_uchar (unsigned char dst, float src, float src_alpha)
{
// DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor
float src_val = src * src_alpha;
float dst_val = __int2float_rz (dst) / 255.0 * (1.0 - src_alpha);
return scale_to_uchar(clamp(src_val + dst_val, 0, 1.0));
}
__device__ inline unsigned short
blend_ushort (unsigned short dst, float src, float src_alpha)
{
// DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor
float src_val = src * src_alpha;
float dst_val = __int2float_rz (dst) / 65535.0 * (1.0 - src_alpha);
return scale_to_ushort(clamp(src_val + dst_val, 0, 1.0));
}
__device__ inline unsigned short
blend_10bits (unsigned short dst, float src, float src_alpha)
{
// DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor
float src_val = src * src_alpha;
float dst_val = __int2float_rz (dst) / 1023.0 * (1.0 - src_alpha);
return scale_to_10bits(clamp(src_val + dst_val, 0, 1.0));
}
__device__ inline unsigned short
blend_12bits (unsigned short dst, float src, float src_alpha)
{
// DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor
float src_val = src * src_alpha;
float dst_val = __int2float_rz (dst) / 4095.0 * (1.0 - src_alpha);
return scale_to_12bits(clamp(src_val + dst_val, 0, 1.0));
}
struct IConverter
{
__device__ virtual float3
Execute (float3 sample, const ColorMatrix * matrix) = 0;
};
struct ConvertSimple : public IConverter
{
__device__ float3
Execute (float3 sample, const ColorMatrix * matrix)
{
float3 out;
out.x = dot (matrix->CoeffX, sample);
out.y = dot (matrix->CoeffY, sample);
out.z = dot (matrix->CoeffZ, sample);
out.x += matrix->Offset[0];
out.y += matrix->Offset[1];
out.z += matrix->Offset[2];
return clamp3 (out, matrix->Min, matrix->Max);
}
};
struct ISampler
{
__device__ virtual float4
Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y) = 0;
};
struct SampleI420 : public ISampler
{
__device__ float4
Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
{
float luma = tex2D<float>(tex0, x, y);
float u = tex2D<float>(tex1, x, y);
float v = tex2D<float>(tex2, x, y);
return make_float4 (luma, u, v, 1);
}
};
struct SampleYV12 : public ISampler
{
__device__ float4
Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
{
float luma = tex2D<float>(tex0, x, y);
float u = tex2D<float>(tex2, x, y);
float v = tex2D<float>(tex1, x, y);
return make_float4 (luma, u, v, 1);
}
};
struct SampleI420_10 : public ISampler
{
__device__ float4
Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
{
float luma = tex2D<float>(tex0, x, y);
float u = tex2D<float>(tex1, x, y);
float v = tex2D<float>(tex2, x, y);
/* (1 << 6) to scale [0, 1.0) range */
return make_float4 (luma * 64, u * 64, v * 64, 1);
}
};
struct SampleI420_12 : public ISampler
{
__device__ float4
Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
{
float luma = tex2D<float>(tex0, x, y);
float u = tex2D<float>(tex1, x, y);
float v = tex2D<float>(tex2, x, y);
/* (1 << 4) to scale [0, 1.0) range */
return make_float4 (luma * 16, u * 16, v * 16, 1);
}
};
struct SampleNV12 : public ISampler
{
__device__ float4
Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
{
float luma = tex2D<float>(tex0, x, y);
float2 uv = tex2D<float2>(tex1, x, y);
return make_float4 (luma, uv.x, uv.y, 1);
}
};
struct SampleNV21 : public ISampler
{
__device__ float4
Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
{
float luma = tex2D<float>(tex0, x, y);
float2 vu = tex2D<float2>(tex1, x, y);
return make_float4 (luma, vu.y, vu.x, 1);
}
};
struct SampleRGBA : public ISampler
{
__device__ float4
Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
{
return tex2D<float4>(tex0, x, y);
}
};
struct SampleBGRA : public ISampler
{
__device__ float4
Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
{
float4 bgra = tex2D<float4>(tex0, x, y);
return make_float4 (bgra.z, bgra.y, bgra.x, bgra.w);
}
};
struct SampleRGBx : public ISampler
{
__device__ float4
Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
{
float4 rgbx = tex2D<float4>(tex0, x, y);
rgbx.w = 1;
return rgbx;
}
};
struct SampleBGRx : public ISampler
{
__device__ float4
Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
{
float4 bgrx = tex2D<float4>(tex0, x, y);
return make_float4 (bgrx.z, bgrx.y, bgrx.x, 1);
}
};
struct SampleARGB : public ISampler
{
__device__ float4
Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
{
float4 argb = tex2D<float4>(tex0, x, y);
return make_float4 (argb.y, argb.z, argb.w, argb.x);
}
};
struct SampleABGR : public ISampler
{
__device__ float4
Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
{
float4 abgr = tex2D<float4>(tex0, x, y);
return make_float4 (abgr.w, abgr.z, abgr.y, abgr.x);
}
};
struct SampleRGBP : public ISampler
{
__device__ float4
Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
{
float r = tex2D<float>(tex0, x, y);
float g = tex2D<float>(tex1, x, y);
float b = tex2D<float>(tex2, x, y);
return make_float4 (r, g, b, 1);
}
};
struct SampleBGRP : public ISampler
{
__device__ float4
Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
{
float b = tex2D<float>(tex0, x, y);
float g = tex2D<float>(tex1, x, y);
float r = tex2D<float>(tex2, x, y);
return make_float4 (r, g, b, 1);
}
};
struct SampleGBR : public ISampler
{
__device__ float4
Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
{
float g = tex2D<float>(tex0, x, y);
float b = tex2D<float>(tex1, x, y);
float r = tex2D<float>(tex2, x, y);
return make_float4 (r, g, b, 1);
}
};
struct SampleGBR_10 : public ISampler
{
__device__ float4
Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
{
float g = tex2D<float>(tex0, x, y);
float b = tex2D<float>(tex1, x, y);
float r = tex2D<float>(tex2, x, y);
/* (1 << 6) to scale [0, 1.0) range */
return make_float4 (r * 64, g * 64, b * 64, 1);
}
};
struct SampleGBR_12 : public ISampler
{
__device__ float4
Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
{
float g = tex2D<float>(tex0, x, y);
float b = tex2D<float>(tex1, x, y);
float r = tex2D<float>(tex2, x, y);
/* (1 << 4) to scale [0, 1.0) range */
return make_float4 (r * 16, g * 16, b * 16, 1);
}
};
struct SampleGBRA : public ISampler
{
__device__ float4
Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
{
float g = tex2D<float>(tex0, x, y);
float b = tex2D<float>(tex1, x, y);
float r = tex2D<float>(tex2, x, y);
float a = tex2D<float>(tex3, x, y);
return make_float4 (r, g, b, a);
}
};
struct SampleVUYA : public ISampler
{
__device__ float4
Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
{
float4 vuya = tex2D<float4>(tex0, x, y);
return make_float4 (vuya.z, vuya.y, vuya.x, vuya.w);
}
};
struct IOutput
{
__device__ virtual void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1) = 0;
__device__ virtual void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1) = 0;
};
struct OutputI420 : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
dst0[x + y * stride0] = scale_to_uchar (sample.x);
if (x % 2 == 0 && y % 2 == 0) {
unsigned int pos = x / 2 + (y / 2) * stride1;
dst1[pos] = scale_to_uchar (sample.y);
dst2[pos] = scale_to_uchar (sample.z);
}
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
unsigned int pos = x + y * stride0;
dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);
if (x % 2 == 0 && y % 2 == 0) {
pos = x / 2 + (y / 2) * stride1;
dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);
dst2[pos] = blend_uchar (dst2[pos], sample.z, sample.w);
}
}
};
struct OutputYV12 : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
dst0[x + y * stride0] = scale_to_uchar (sample.x);
if (x % 2 == 0 && y % 2 == 0) {
unsigned int pos = x / 2 + (y / 2) * stride1;
dst1[pos] = scale_to_uchar (sample.z);
dst2[pos] = scale_to_uchar (sample.y);
}
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
unsigned int pos = x + y * stride0;
dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);
if (x % 2 == 0 && y % 2 == 0) {
pos = x / 2 + (y / 2) * stride1;
dst1[pos] = blend_uchar (dst1[pos], sample.z, sample.w);
dst2[pos] = blend_uchar (dst2[pos], sample.y, sample.w);
}
}
};
struct OutputNV12 : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
dst0[x + y * stride0] = scale_to_uchar (sample.x);
if (x % 2 == 0 && y % 2 == 0) {
unsigned int pos = x + (y / 2) * stride1;
dst1[pos] = scale_to_uchar (sample.y);
dst1[pos + 1] = scale_to_uchar (sample.z);
}
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
unsigned int pos = x + y * stride0;
dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);
if (x % 2 == 0 && y % 2 == 0) {
pos = x + (y / 2) * stride1;
dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);
dst1[pos + 1] = blend_uchar (dst1[pos + 1], sample.z, sample.w);
}
}
};
struct OutputNV21 : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
dst0[x + y * stride0] = scale_to_uchar (sample.x);
if (x % 2 == 0 && y % 2 == 0) {
unsigned int pos = x + (y / 2) * stride1;
dst1[pos] = scale_to_uchar (sample.z);
dst1[pos + 1] = scale_to_uchar (sample.y);
}
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
unsigned int pos = x + y * stride0;
dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);
if (x % 2 == 0 && y % 2 == 0) {
pos = x + (y / 2) * stride1;
dst1[pos] = blend_uchar (dst1[pos], sample.z, sample.w);
dst1[pos + 1] = blend_uchar (dst1[pos + 1], sample.y, sample.w);
}
}
};
struct OutputP010 : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
*(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_ushort (sample.x);
if (x % 2 == 0 && y % 2 == 0) {
unsigned int pos = x * 2 + (y / 2) * stride1;
*(unsigned short *) &dst1[pos] = scale_to_ushort (sample.y);
*(unsigned short *) &dst1[pos + 2] = scale_to_ushort (sample.z);
}
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
unsigned int pos = x * 2 + y * stride0;
unsigned short * target = (unsigned short *) &dst0[pos];
*target = blend_ushort (*target, sample.x, sample.w);
if (x % 2 == 0 && y % 2 == 0) {
pos = x * 2 + (y / 2) * stride1;
target = (unsigned short *) &dst1[pos];
*target = blend_ushort (*target, sample.y, sample.w);
target = (unsigned short *) &dst1[pos + 2];
*target = blend_ushort (*target, sample.z, sample.w);
}
}
};
struct OutputI420_10 : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
*(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_10bits (sample.x);
if (x % 2 == 0 && y % 2 == 0) {
unsigned int pos = x + (y / 2) * stride1;
*(unsigned short *) &dst1[pos] = scale_to_10bits (sample.y);
*(unsigned short *) &dst2[pos] = scale_to_10bits (sample.z);
}
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
unsigned int pos = x * 2 + y * stride0;
unsigned short * target = (unsigned short *) &dst0[pos];
*target = blend_10bits (*target, sample.x, sample.w);
if (x % 2 == 0 && y % 2 == 0) {
pos = x * 2 + (y / 2) * stride1;
target = (unsigned short *) &dst1[pos];
*target = blend_10bits (*target, sample.y, sample.w);
target = (unsigned short *) &dst2[pos];
*target = blend_10bits (*target, sample.z, sample.w);
}
}
};
struct OutputI420_12 : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
*(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_12bits (sample.x);
if (x % 2 == 0 && y % 2 == 0) {
unsigned int pos = x + (y / 2) * stride1;
*(unsigned short *) &dst1[pos] = scale_to_12bits (sample.y);
*(unsigned short *) &dst2[pos] = scale_to_12bits (sample.z);
}
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
unsigned int pos = x * 2 + y * stride0;
unsigned short * target = (unsigned short *) &dst0[pos];
*target = blend_12bits (*target, sample.x, sample.w);
if (x % 2 == 0 && y % 2 == 0) {
pos = x * 2 + (y / 2) * stride1;
target = (unsigned short *) &dst1[pos];
*target = blend_12bits (*target, sample.y, sample.w);
target = (unsigned short *) &dst2[pos];
*target = blend_12bits (*target, sample.z, sample.w);
}
}
};
struct OutputY444 : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x + y * stride0;
dst0[pos] = scale_to_uchar (sample.x);
dst1[pos] = scale_to_uchar (sample.y);
dst2[pos] = scale_to_uchar (sample.z);
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x + y * stride0;
dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);
dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);
dst2[pos] = blend_uchar (dst2[pos], sample.z, sample.w);
}
};
struct OutputY444_10 : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 2 + y * stride0;
*(unsigned short *) &dst0[pos] = scale_to_10bits (sample.x);
*(unsigned short *) &dst1[pos] = scale_to_10bits (sample.y);
*(unsigned short *) &dst2[pos] = scale_to_10bits (sample.z);
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 2 + y * stride0;
unsigned short * target = (unsigned short *) &dst0[pos];
*target = blend_10bits (*target, sample.x, sample.w);
target = (unsigned short *) &dst1[pos];
*target = blend_10bits (*target, sample.y, sample.w);
target = (unsigned short *) &dst2[pos];
*target = blend_10bits (*target, sample.z, sample.w);
}
};
struct OutputY444_12 : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 2 + y * stride0;
*(unsigned short *) &dst0[pos] = scale_to_12bits (sample.x);
*(unsigned short *) &dst1[pos] = scale_to_12bits (sample.y);
*(unsigned short *) &dst2[pos] = scale_to_12bits (sample.z);
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 2 + y * stride0;
unsigned short * target = (unsigned short *) &dst0[pos];
*target = blend_12bits (*target, sample.x, sample.w);
target = (unsigned short *) &dst1[pos];
*target = blend_12bits (*target, sample.y, sample.w);
target = (unsigned short *) &dst2[pos];
*target = blend_12bits (*target, sample.z, sample.w);
}
};
struct OutputY444_16 : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 2 + y * stride0;
*(unsigned short *) &dst0[pos] = scale_to_ushort (sample.x);
*(unsigned short *) &dst1[pos] = scale_to_ushort (sample.y);
*(unsigned short *) &dst2[pos] = scale_to_ushort (sample.z);
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 2 + y * stride0;
unsigned short * target = (unsigned short *) &dst0[pos];
*target = blend_ushort (*target, sample.x, sample.w);
target = (unsigned short *) &dst1[pos];
*target = blend_ushort (*target, sample.y, sample.w);
target = (unsigned short *) &dst2[pos];
*target = blend_ushort (*target, sample.z, sample.w);
}
};
struct OutputRGBA : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 4 + y * stride0;
dst0[pos] = scale_to_uchar (sample.x);
dst0[pos + 1] = scale_to_uchar (sample.y);
dst0[pos + 2] = scale_to_uchar (sample.z);
dst0[pos + 3] = scale_to_uchar (sample.w);
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 4 + y * stride0;
dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);
dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);
dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.z, sample.w);
dst0[pos + 3] = blend_uchar (dst0[pos + 3], 1.0, sample.w);
}
};
struct OutputRGBx : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 4 + y * stride0;
dst0[pos] = scale_to_uchar (sample.x);
dst0[pos + 1] = scale_to_uchar (sample.y);
dst0[pos + 2] = scale_to_uchar (sample.z);
dst0[pos + 3] = 255;
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 4 + y * stride0;
dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);
dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);
dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.z, sample.w);
dst0[pos + 3] = 255;
}
};
struct OutputBGRA : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 4 + y * stride0;
dst0[pos] = scale_to_uchar (sample.z);
dst0[pos + 1] = scale_to_uchar (sample.y);
dst0[pos + 2] = scale_to_uchar (sample.x);
dst0[pos + 3] = scale_to_uchar (sample.w);
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 4 + y * stride0;
dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);
dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);
dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.x, sample.w);
dst0[pos + 3] = blend_uchar (dst0[pos + 3], 1.0, sample.w);
}
};
struct OutputBGRx : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 4 + y * stride0;
dst0[pos] = scale_to_uchar (sample.z);
dst0[pos + 1] = scale_to_uchar (sample.y);
dst0[pos + 2] = scale_to_uchar (sample.x);
dst0[pos + 3] = 255;
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 4 + y * stride0;
dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);
dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);
dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.x, sample.w);
dst0[pos + 3] = 255;
}
};
struct OutputARGB : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 4 + y * stride0;
dst0[pos] = scale_to_uchar (sample.w);
dst0[pos + 1] = scale_to_uchar (sample.x);
dst0[pos + 2] = scale_to_uchar (sample.y);
dst0[pos + 3] = scale_to_uchar (sample.z);
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 4 + y * stride0;
dst0[pos] = blend_uchar (dst0[pos], 1.0, sample.w);
dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.x, sample.w);
dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.y, sample.w);
dst0[pos + 3] = blend_uchar (dst0[pos + 3], sample.z, sample.w);
}
};
struct OutputABGR : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 4 + y * stride0;
dst0[pos] = scale_to_uchar (sample.w);
dst0[pos + 1] = scale_to_uchar (sample.z);
dst0[pos + 2] = scale_to_uchar (sample.y);
dst0[pos + 3] = scale_to_uchar (sample.x);
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 4 + y * stride0;
dst0[pos] = blend_uchar (dst0[pos], 1.0, sample.w);
dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.z, sample.w);
dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.y, sample.w);
dst0[pos + 3] = blend_uchar (dst0[pos + 3], sample.x, sample.w);
}
};
struct OutputRGB : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 3 + y * stride0;
dst0[pos] = scale_to_uchar (sample.x);
dst0[pos + 1] = scale_to_uchar (sample.y);
dst0[pos + 2] = scale_to_uchar (sample.z);
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 3 + y * stride0;
dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);
dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);
dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.z, sample.w);
}
};
struct OutputBGR : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 3 + y * stride0;
dst0[pos] = scale_to_uchar (sample.z);
dst0[pos + 1] = scale_to_uchar (sample.y);
dst0[pos + 2] = scale_to_uchar (sample.x);
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 3 + y * stride0;
dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);
dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);
dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.x, sample.w);
}
};
__device__ inline ushort3
unpack_rgb10a2 (unsigned int val)
{
unsigned short r, g, b;
r = (val & 0x3ff);
r = (r << 6) | (r >> 4);
g = ((val >> 10) & 0x3ff);
g = (g << 6) | (g >> 4);
b = ((val >> 20) & 0x3ff);
b = (b << 6) | (b >> 4);
return make_ushort3 (r, g, b);
}
struct OutputRGB10A2 : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
unsigned int alpha = (unsigned int) scale_to_2bits (sample.w);
unsigned int packed_rgb = alpha << 30;
packed_rgb |= ((unsigned int) scale_to_10bits (sample.x));
packed_rgb |= ((unsigned int) scale_to_10bits (sample.y)) << 10;
packed_rgb |= ((unsigned int) scale_to_10bits (sample.z)) << 20;
*(unsigned int *) &dst0[x * 4 + y * stride0] = packed_rgb;
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
unsigned int * target = (unsigned int *) &dst0[x * 4 + y * stride0];
ushort3 val = unpack_rgb10a2 (*target);
unsigned int alpha = (unsigned int) scale_to_2bits (sample.w);
unsigned int packed_rgb = alpha << 30;
packed_rgb |= ((unsigned int) blend_10bits (val.x, sample.x, sample.w));
packed_rgb |= ((unsigned int) blend_10bits (val.y, sample.y, sample.w)) << 10;
packed_rgb |= ((unsigned int) blend_10bits (val.z, sample.z, sample.w)) << 20;
*target = packed_rgb;
}
};
__device__ inline ushort3
unpack_bgr10a2 (unsigned int val)
{
unsigned short r, g, b;
b = (val & 0x3ff);
b = (b << 6) | (b >> 4);
g = ((val >> 10) & 0x3ff);
g = (g << 6) | (g >> 4);
r = ((val >> 20) & 0x3ff);
r = (r << 6) | (r >> 4);
return make_ushort3 (r, g, b);
}
struct OutputBGR10A2 : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
unsigned int alpha = (unsigned int) scale_to_2bits (sample.x);
unsigned int packed_rgb = alpha << 30;
packed_rgb |= ((unsigned int) scale_to_10bits (sample.x)) << 20;
packed_rgb |= ((unsigned int) scale_to_10bits (sample.y)) << 10;
packed_rgb |= ((unsigned int) scale_to_10bits (sample.z));
*(unsigned int *) &dst0[x * 4 + y * stride0] = packed_rgb;
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
unsigned int * target = (unsigned int *) &dst0[x * 4 + y * stride0];
ushort3 val = unpack_bgr10a2 (*target);
unsigned int alpha = (unsigned int) scale_to_2bits (sample.w);
unsigned int packed_rgb = alpha << 30;
packed_rgb |= ((unsigned int) blend_10bits (val.x, sample.x, sample.w)) << 20;
packed_rgb |= ((unsigned int) blend_10bits (val.y, sample.y, sample.w)) << 10;
packed_rgb |= ((unsigned int) blend_10bits (val.z, sample.z, sample.w));
*target = packed_rgb;
}
};
struct OutputY42B : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
dst0[x + y * stride0] = scale_to_uchar (sample.x);
if (x % 2 == 0) {
unsigned int pos = x / 2 + y * stride1;
dst1[pos] = scale_to_uchar (sample.y);
dst2[pos] = scale_to_uchar (sample.z);
}
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
unsigned int pos = x + y * stride0;
dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);
if (x % 2 == 0) {
pos = x / 2 + y * stride1;
dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);
dst2[pos] = blend_uchar (dst2[pos], sample.z, sample.w);
}
}
};
struct OutputI422_10 : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
*(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_10bits (sample.x);
if (x % 2 == 0) {
unsigned int pos = x + y * stride1;
*(unsigned short *) &dst1[pos] = scale_to_10bits (sample.y);
*(unsigned short *) &dst2[pos] = scale_to_10bits (sample.z);
}
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
unsigned int pos = x * 2 + y * stride0;
unsigned short * target = (unsigned short *) &dst0[pos];
*target = blend_10bits (*target, sample.x, sample.w);
if (x % 2 == 0) {
pos = x / 2 + y * stride1;
target = (unsigned short *) &dst1[pos];
*target = blend_10bits (*target, sample.y, sample.w);
target = (unsigned short *) &dst2[pos];
*target = blend_10bits (*target, sample.z, sample.w);
}
}
};
struct OutputI422_12 : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
*(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_12bits (sample.x);
if (x % 2 == 0) {
unsigned int pos = x + y * stride1;
*(unsigned short *) &dst1[pos] = scale_to_12bits (sample.y);
*(unsigned short *) &dst2[pos] = scale_to_12bits (sample.z);
}
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
unsigned int pos = x * 2 + y * stride0;
unsigned short * target = (unsigned short *) &dst0[pos];
*target = blend_12bits (*target, sample.x, sample.w);
if (x % 2 == 0) {
pos = x / 2 + y * stride1;
target = (unsigned short *) &dst1[pos];
*target = blend_12bits (*target, sample.y, sample.w);
target = (unsigned short *) &dst2[pos];
*target = blend_12bits (*target, sample.z, sample.w);
}
}
};
struct OutputRGBP : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x + y * stride0;
dst0[pos] = scale_to_uchar (sample.x);
dst1[pos] = scale_to_uchar (sample.y);
dst2[pos] = scale_to_uchar (sample.z);
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x + y * stride0;
dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);
dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);
dst2[pos] = blend_uchar (dst2[pos], sample.z, sample.w);
}
};
struct OutputBGRP : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x + y * stride0;
dst0[pos] = scale_to_uchar (sample.z);
dst1[pos] = scale_to_uchar (sample.y);
dst2[pos] = scale_to_uchar (sample.x);
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x + y * stride0;
dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);
dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);
dst2[pos] = blend_uchar (dst2[pos], sample.x, sample.w);
}
};
struct OutputGBR : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x + y * stride0;
dst0[pos] = scale_to_uchar (sample.y);
dst1[pos] = scale_to_uchar (sample.z);
dst2[pos] = scale_to_uchar (sample.x);
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x + y * stride0;
dst0[pos] = blend_uchar (dst0[pos], sample.y, sample.w);
dst1[pos] = blend_uchar (dst1[pos], sample.z, sample.w);
dst2[pos] = blend_uchar (dst2[pos], sample.x, sample.w);
}
};
struct OutputGBR_10 : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 2 + y * stride0;
*(unsigned short *) &dst0[pos] = scale_to_10bits (sample.y);
*(unsigned short *) &dst1[pos] = scale_to_10bits (sample.z);
*(unsigned short *) &dst2[pos] = scale_to_10bits (sample.x);
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 2 + y * stride0;
unsigned short * target = (unsigned short *) &dst0[pos];
*target = blend_10bits (*target, sample.y, sample.w);
target = (unsigned short *) &dst1[pos];
*target = blend_10bits (*target, sample.z, sample.w);
target = (unsigned short *) &dst2[pos];
*target = blend_10bits (*target, sample.x, sample.w);
}
};
struct OutputGBR_12 : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 2 + y * stride0;
*(unsigned short *) &dst0[pos] = scale_to_12bits (sample.y);
*(unsigned short *) &dst1[pos] = scale_to_12bits (sample.z);
*(unsigned short *) &dst2[pos] = scale_to_12bits (sample.x);
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 2 + y * stride0;
unsigned short * target = (unsigned short *) &dst0[pos];
*target = blend_12bits (*target, sample.y, sample.w);
target = (unsigned short *) &dst1[pos];
*target = blend_12bits (*target, sample.z, sample.w);
target = (unsigned short *) &dst2[pos];
*target = blend_12bits (*target, sample.x, sample.w);
}
};
struct OutputGBR_16 : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 2 + y * stride0;
*(unsigned short *) &dst0[pos] = scale_to_ushort (sample.y);
*(unsigned short *) &dst1[pos] = scale_to_ushort (sample.z);
*(unsigned short *) &dst2[pos] = scale_to_ushort (sample.x);
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 2 + y * stride0;
unsigned short * target = (unsigned short *) &dst0[pos];
*target = blend_ushort (*target, sample.y, sample.w);
target = (unsigned short *) &dst1[pos];
*target = blend_ushort (*target, sample.z, sample.w);
target = (unsigned short *) &dst2[pos];
*target = blend_ushort (*target, sample.x, sample.w);
}
};
struct OutputGBRA : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x + y * stride0;
dst0[pos] = scale_to_uchar (sample.y);
dst1[pos] = scale_to_uchar (sample.z);
dst2[pos] = scale_to_uchar (sample.x);
dst3[pos] = scale_to_uchar (sample.w);
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x + y * stride0;
dst0[pos] = blend_uchar (dst0[pos], sample.y, sample.w);
dst1[pos] = blend_uchar (dst1[pos], sample.z, sample.w);
dst2[pos] = blend_uchar (dst2[pos], sample.x, sample.w);
dst3[pos] = blend_uchar (dst3[pos], 1.0, sample.w);
}
};
struct OutputVUYA : public IOutput
{
__device__ void
Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 4 + y * stride0;
dst0[pos] = scale_to_uchar (sample.z);
dst0[pos + 1] = scale_to_uchar (sample.y);
dst0[pos + 2] = scale_to_uchar (sample.x);
dst0[pos + 3] = scale_to_uchar (sample.w);
}
__device__ void
Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
unsigned char * dst3, float4 sample, int x, int y, int stride0,
int stride1)
{
int pos = x * 4 + y * stride0;
dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);
dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);
dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.x, sample.w);
dst0[pos + 3] = blend_uchar (dst0[pos + 3], 1.0, sample.w);
}
};
__device__ inline float2
rotate_identity (float x, float y)
{
return make_float2(x, y);
}
__device__ inline float2
rotate_90r (float x, float y)
{
return make_float2(y, 1.0 - x);
}
__device__ inline float2
rotate_180 (float x, float y)
{
return make_float2(1.0 - x, 1.0 - y);
}
__device__ inline float2
rotate_90l (float x, float y)
{
return make_float2(1.0 - y, x);
}
__device__ inline float2
rotate_horiz (float x, float y)
{
return make_float2(1.0 - x, y);
}
__device__ inline float2
rotate_vert (float x, float y)
{
return make_float2(x, 1.0 - y);
}
__device__ inline float2
rotate_ul_lr (float x, float y)
{
return make_float2(y, x);
}
__device__ inline float2
rotate_ur_ll (float x, float y)
{
return make_float2(1.0 - y, 1.0 - x);
}
__device__ inline float2
do_rotate (float x, float y, int direction)
{
switch (direction) {
case 1:
return rotate_90r (x, y);
case 2:
return rotate_180 (x, y);
case 3:
return rotate_90l (x, y);
case 4:
return rotate_horiz (x, y);
case 5:
return rotate_vert (x, y);
case 6:
return rotate_ul_lr (x, y);
case 7:
return rotate_ur_ll (x, y);
default:
return rotate_identity (x, y);
}
}
extern "C" {
__global__ void
GstCudaConverterMain (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
cudaTextureObject_t tex2, cudaTextureObject_t tex3, unsigned char * dst0,
unsigned char * dst1, unsigned char * dst2, unsigned char * dst3,
int stride0, int stride1, ConstBuffer const_buf, int off_x, int off_y)
{
ConvertSimple g_converter;
SAMPLER g_sampler;
OUTPUT g_output;
int x_pos = blockIdx.x * blockDim.x + threadIdx.x + off_x;
int y_pos = blockIdx.y * blockDim.y + threadIdx.y + off_y;
float4 sample;
if (x_pos >= const_buf.width || y_pos >= const_buf.height ||
const_buf.view_width <= 0 || const_buf.view_height <= 0)
return;
if (x_pos < const_buf.left || x_pos >= const_buf.right ||
y_pos < const_buf.top || y_pos >= const_buf.bottom) {
if (!const_buf.fill_border)
return;
sample = make_float4 (const_buf.border_x, const_buf.border_y,
const_buf.border_z, const_buf.border_w);
} else {
float x = (__int2float_rz (x_pos - const_buf.left) + 0.5) / const_buf.view_width;
if (x < 0.0 || x > 1.0)
return;
float y = (__int2float_rz (y_pos - const_buf.top) + 0.5) / const_buf.view_height;
if (y < 0.0 || y > 1.0)
return;
float2 rotated = do_rotate (x, y, const_buf.video_direction);
float4 s = g_sampler.Execute (tex0, tex1, tex2, tex3, rotated.x, rotated.y);
float3 rgb = make_float3 (s.x, s.y, s.z);
float3 yuv;
if (const_buf.do_convert)
yuv = g_converter.Execute (rgb, &const_buf.matrix);
else
yuv = rgb;
sample = make_float4 (yuv.x, yuv.y, yuv.z, s.w);
}
sample.w = sample.w * const_buf.alpha;
if (!const_buf.do_blend) {
g_output.Write (dst0, dst1, dst2, dst3, sample, x_pos, y_pos, stride0, stride1);
} else {
g_output.Blend (dst0, dst1, dst2, dst3, sample, x_pos, y_pos, stride0, stride1);
}
}
}
#else
static const char GstCudaConverterMain_str[] =
"struct ColorMatrix\n"
"{\n"
" float CoeffX[3];\n"
" float CoeffY[3];\n"
" float CoeffZ[3];\n"
" float Offset[3];\n"
" float Min[3];\n"
" float Max[3];\n"
"};\n"
"\n"
"struct ConstBuffer\n"
"{\n"
" ColorMatrix matrix;\n"
" int width;\n"
" int height;\n"
" int left;\n"
" int top;\n"
" int right;\n"
" int bottom;\n"
" int view_width;\n"
" int view_height;\n"
" float border_x;\n"
" float border_y;\n"
" float border_z;\n"
" float border_w;\n"
" int fill_border;\n"
" int video_direction;\n"
" float alpha;\n"
" int do_blend;\n"
" int do_convert;\n"
"};\n"
"\n"
"__device__ inline float\n"
"dot (const float coeff[3], float3 val)\n"
"{\n"
" return coeff[0] * val.x + coeff[1] * val.y + coeff[2] * val.z;\n"
"}\n"
"\n"
"__device__ inline float\n"
"clamp (float val, float min_val, float max_val)\n"
"{\n"
" return max (min_val, min (val, max_val));\n"
"}\n"
"\n"
"__device__ inline float3\n"
"clamp3 (float3 val, const float min_val[3], const float max_val[3])\n"
"{\n"
" return make_float3 (clamp (val.x, min_val[0], max_val[0]),\n"
" clamp (val.y, min_val[1], max_val[2]),\n"
" clamp (val.z, min_val[1], max_val[2]));\n"
"}\n"
"\n"
"__device__ inline unsigned char\n"
"scale_to_2bits (float val)\n"
"{\n"
" return (unsigned short) __float2int_rz (val * 3.0);\n"
"}\n"
"\n"
"__device__ inline unsigned char\n"
"scale_to_uchar (float val)\n"
"{\n"
" return (unsigned char) __float2int_rz (val * 255.0);\n"
"}\n"
"\n"
"__device__ inline unsigned short\n"
"scale_to_ushort (float val)\n"
"{\n"
" return (unsigned short) __float2int_rz (val * 65535.0);\n"
"}\n"
"\n"
"__device__ inline unsigned short\n"
"scale_to_10bits (float val)\n"
"{\n"
" return (unsigned short) __float2int_rz (val * 1023.0);\n"
"}\n"
"\n"
"__device__ inline unsigned short\n"
"scale_to_12bits (float val)\n"
"{\n"
" return (unsigned short) __float2int_rz (val * 4095.0);\n"
"}\n"
"\n"
"__device__ inline unsigned char\n"
"blend_uchar (unsigned char dst, float src, float src_alpha)\n"
"{\n"
" // DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor\n"
" float src_val = src * src_alpha;\n"
" float dst_val = __int2float_rz (dst) / 255.0 * (1.0 - src_alpha);\n"
" return scale_to_uchar(clamp(src_val + dst_val, 0, 1.0));\n"
"}\n"
"\n"
"__device__ inline unsigned short\n"
"blend_ushort (unsigned short dst, float src, float src_alpha)\n"
"{\n"
" // DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor\n"
" float src_val = src * src_alpha;\n"
" float dst_val = __int2float_rz (dst) / 65535.0 * (1.0 - src_alpha);\n"
" return scale_to_ushort(clamp(src_val + dst_val, 0, 1.0));\n"
"}\n"
"\n"
"__device__ inline unsigned short\n"
"blend_10bits (unsigned short dst, float src, float src_alpha)\n"
"{\n"
" // DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor\n"
" float src_val = src * src_alpha;\n"
" float dst_val = __int2float_rz (dst) / 1023.0 * (1.0 - src_alpha);\n"
" return scale_to_10bits(clamp(src_val + dst_val, 0, 1.0));\n"
"}\n"
"\n"
"__device__ inline unsigned short\n"
"blend_12bits (unsigned short dst, float src, float src_alpha)\n"
"{\n"
" // DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor\n"
" float src_val = src * src_alpha;\n"
" float dst_val = __int2float_rz (dst) / 4095.0 * (1.0 - src_alpha);\n"
" return scale_to_12bits(clamp(src_val + dst_val, 0, 1.0));\n"
"}\n"
"\n"
"struct IConverter\n"
"{\n"
" __device__ virtual float3\n"
" Execute (float3 sample, const ColorMatrix * matrix) = 0;\n"
"};\n"
"\n"
"struct ConvertSimple : public IConverter\n"
"{\n"
" __device__ float3\n"
" Execute (float3 sample, const ColorMatrix * matrix)\n"
" {\n"
" float3 out;\n"
" out.x = dot (matrix->CoeffX, sample);\n"
" out.y = dot (matrix->CoeffY, sample);\n"
" out.z = dot (matrix->CoeffZ, sample);\n"
" out.x += matrix->Offset[0];\n"
" out.y += matrix->Offset[1];\n"
" out.z += matrix->Offset[2];\n"
" return clamp3 (out, matrix->Min, matrix->Max);\n"
" }\n"
"};\n"
"\n"
"struct ISampler\n"
"{\n"
" __device__ virtual float4\n"
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y) = 0;\n"
"};\n"
"\n"
"struct SampleI420 : public ISampler\n"
"{\n"
" __device__ float4\n"
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
" {\n"
" float luma = tex2D<float>(tex0, x, y);\n"
" float u = tex2D<float>(tex1, x, y);\n"
" float v = tex2D<float>(tex2, x, y);\n"
" return make_float4 (luma, u, v, 1);\n"
" }\n"
"};\n"
"\n"
"struct SampleYV12 : public ISampler\n"
"{\n"
" __device__ float4\n"
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
" {\n"
" float luma = tex2D<float>(tex0, x, y);\n"
" float u = tex2D<float>(tex2, x, y);\n"
" float v = tex2D<float>(tex1, x, y);\n"
" return make_float4 (luma, u, v, 1);\n"
" }\n"
"};\n"
"\n"
"struct SampleI420_10 : public ISampler\n"
"{\n"
" __device__ float4\n"
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
" {\n"
" float luma = tex2D<float>(tex0, x, y);\n"
" float u = tex2D<float>(tex1, x, y);\n"
" float v = tex2D<float>(tex2, x, y);\n"
" /* (1 << 6) to scale [0, 1.0) range */\n"
" return make_float4 (luma * 64, u * 64, v * 64, 1);\n"
" }\n"
"};\n"
"\n"
"struct SampleI420_12 : public ISampler\n"
"{\n"
" __device__ float4\n"
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
" {\n"
" float luma = tex2D<float>(tex0, x, y);\n"
" float u = tex2D<float>(tex1, x, y);\n"
" float v = tex2D<float>(tex2, x, y);\n"
" /* (1 << 4) to scale [0, 1.0) range */\n"
" return make_float4 (luma * 16, u * 16, v * 16, 1);\n"
" }\n"
"};\n"
"\n"
"struct SampleNV12 : public ISampler\n"
"{\n"
" __device__ float4\n"
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
" {\n"
" float luma = tex2D<float>(tex0, x, y);\n"
" float2 uv = tex2D<float2>(tex1, x, y);\n"
" return make_float4 (luma, uv.x, uv.y, 1);\n"
" }\n"
"};\n"
"\n"
"struct SampleNV21 : public ISampler\n"
"{\n"
" __device__ float4\n"
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
" {\n"
" float luma = tex2D<float>(tex0, x, y);\n"
" float2 vu = tex2D<float2>(tex1, x, y);\n"
" return make_float4 (luma, vu.y, vu.x, 1);\n"
" }\n"
"};\n"
"\n"
"struct SampleRGBA : public ISampler\n"
"{\n"
" __device__ float4\n"
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
" {\n"
" return tex2D<float4>(tex0, x, y);\n"
" }\n"
"};\n"
"\n"
"struct SampleBGRA : public ISampler\n"
"{\n"
" __device__ float4\n"
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
" {\n"
" float4 bgra = tex2D<float4>(tex0, x, y);\n"
" return make_float4 (bgra.z, bgra.y, bgra.x, bgra.w);\n"
" }\n"
"};\n"
"\n"
"struct SampleRGBx : public ISampler\n"
"{\n"
" __device__ float4\n"
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
" {\n"
" float4 rgbx = tex2D<float4>(tex0, x, y);\n"
" rgbx.w = 1;\n"
" return rgbx;\n"
" }\n"
"};\n"
"\n"
"struct SampleBGRx : public ISampler\n"
"{\n"
" __device__ float4\n"
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
" {\n"
" float4 bgrx = tex2D<float4>(tex0, x, y);\n"
" return make_float4 (bgrx.z, bgrx.y, bgrx.x, 1);\n"
" }\n"
"};\n"
"\n"
"struct SampleARGB : public ISampler\n"
"{\n"
" __device__ float4\n"
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
" {\n"
" float4 argb = tex2D<float4>(tex0, x, y);\n"
" return make_float4 (argb.y, argb.z, argb.w, argb.x);\n"
" }\n"
"};\n"
"\n"
"struct SampleABGR : public ISampler\n"
"{\n"
" __device__ float4\n"
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
" {\n"
" float4 abgr = tex2D<float4>(tex0, x, y);\n"
" return make_float4 (abgr.w, abgr.z, abgr.y, abgr.x);\n"
" }\n"
"};\n"
"\n"
"struct SampleRGBP : public ISampler\n"
"{\n"
" __device__ float4\n"
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
" {\n"
" float r = tex2D<float>(tex0, x, y);\n"
" float g = tex2D<float>(tex1, x, y);\n"
" float b = tex2D<float>(tex2, x, y);\n"
" return make_float4 (r, g, b, 1);\n"
" }\n"
"};\n"
"\n"
"struct SampleBGRP : public ISampler\n"
"{\n"
" __device__ float4\n"
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
" {\n"
" float b = tex2D<float>(tex0, x, y);\n"
" float g = tex2D<float>(tex1, x, y);\n"
" float r = tex2D<float>(tex2, x, y);\n"
" return make_float4 (r, g, b, 1);\n"
" }\n"
"};\n"
"\n"
"struct SampleGBR : public ISampler\n"
"{\n"
" __device__ float4\n"
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
" {\n"
" float g = tex2D<float>(tex0, x, y);\n"
" float b = tex2D<float>(tex1, x, y);\n"
" float r = tex2D<float>(tex2, x, y);\n"
" return make_float4 (r, g, b, 1);\n"
" }\n"
"};\n"
"\n"
"struct SampleGBR_10 : public ISampler\n"
"{\n"
" __device__ float4\n"
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
" {\n"
" float g = tex2D<float>(tex0, x, y);\n"
" float b = tex2D<float>(tex1, x, y);\n"
" float r = tex2D<float>(tex2, x, y);\n"
" /* (1 << 6) to scale [0, 1.0) range */\n"
" return make_float4 (r * 64, g * 64, b * 64, 1);\n"
" }\n"
"};\n"
"\n"
"struct SampleGBR_12 : public ISampler\n"
"{\n"
" __device__ float4\n"
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
" {\n"
" float g = tex2D<float>(tex0, x, y);\n"
" float b = tex2D<float>(tex1, x, y);\n"
" float r = tex2D<float>(tex2, x, y);\n"
" /* (1 << 4) to scale [0, 1.0) range */\n"
" return make_float4 (r * 16, g * 16, b * 16, 1);\n"
" }\n"
"};\n"
"\n"
"struct SampleGBRA : public ISampler\n"
"{\n"
" __device__ float4\n"
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
" {\n"
" float g = tex2D<float>(tex0, x, y);\n"
" float b = tex2D<float>(tex1, x, y);\n"
" float r = tex2D<float>(tex2, x, y);\n"
" float a = tex2D<float>(tex3, x, y);\n"
" return make_float4 (r, g, b, a);\n"
" }\n"
"};\n"
"\n"
"struct SampleVUYA : public ISampler\n"
"{\n"
" __device__ float4\n"
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
" {\n"
" float4 vuya = tex2D<float4>(tex0, x, y);\n"
" return make_float4 (vuya.z, vuya.y, vuya.x, vuya.w);\n"
" }\n"
"};\n"
"\n"
"struct IOutput\n"
"{\n"
" __device__ virtual void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1) = 0;\n"
"\n"
" __device__ virtual void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1) = 0;\n"
"};\n"
"\n"
"struct OutputI420 : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" dst0[x + y * stride0] = scale_to_uchar (sample.x);\n"
" if (x % 2 == 0 && y % 2 == 0) {\n"
" unsigned int pos = x / 2 + (y / 2) * stride1;\n"
" dst1[pos] = scale_to_uchar (sample.y);\n"
" dst2[pos] = scale_to_uchar (sample.z);\n"
" }\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" unsigned int pos = x + y * stride0;\n"
" dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
" if (x % 2 == 0 && y % 2 == 0) {\n"
" pos = x / 2 + (y / 2) * stride1;\n"
" dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);\n"
" dst2[pos] = blend_uchar (dst2[pos], sample.z, sample.w);\n"
" }\n"
" }\n"
"};\n"
"\n"
"struct OutputYV12 : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" dst0[x + y * stride0] = scale_to_uchar (sample.x);\n"
" if (x % 2 == 0 && y % 2 == 0) {\n"
" unsigned int pos = x / 2 + (y / 2) * stride1;\n"
" dst1[pos] = scale_to_uchar (sample.z);\n"
" dst2[pos] = scale_to_uchar (sample.y);\n"
" }\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" unsigned int pos = x + y * stride0;\n"
" dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
" if (x % 2 == 0 && y % 2 == 0) {\n"
" pos = x / 2 + (y / 2) * stride1;\n"
" dst1[pos] = blend_uchar (dst1[pos], sample.z, sample.w);\n"
" dst2[pos] = blend_uchar (dst2[pos], sample.y, sample.w);\n"
" }\n"
" }\n"
"};\n"
"\n"
"struct OutputNV12 : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" dst0[x + y * stride0] = scale_to_uchar (sample.x);\n"
" if (x % 2 == 0 && y % 2 == 0) {\n"
" unsigned int pos = x + (y / 2) * stride1;\n"
" dst1[pos] = scale_to_uchar (sample.y);\n"
" dst1[pos + 1] = scale_to_uchar (sample.z);\n"
" }\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" unsigned int pos = x + y * stride0;\n"
" dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
" if (x % 2 == 0 && y % 2 == 0) {\n"
" pos = x + (y / 2) * stride1;\n"
" dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);\n"
" dst1[pos + 1] = blend_uchar (dst1[pos + 1], sample.z, sample.w);\n"
" }\n"
" }\n"
"};\n"
"\n"
"struct OutputNV21 : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" dst0[x + y * stride0] = scale_to_uchar (sample.x);\n"
" if (x % 2 == 0 && y % 2 == 0) {\n"
" unsigned int pos = x + (y / 2) * stride1;\n"
" dst1[pos] = scale_to_uchar (sample.z);\n"
" dst1[pos + 1] = scale_to_uchar (sample.y);\n"
" }\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" unsigned int pos = x + y * stride0;\n"
" dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
" if (x % 2 == 0 && y % 2 == 0) {\n"
" pos = x + (y / 2) * stride1;\n"
" dst1[pos] = blend_uchar (dst1[pos], sample.z, sample.w);\n"
" dst1[pos + 1] = blend_uchar (dst1[pos + 1], sample.y, sample.w);\n"
" }\n"
" }\n"
"};\n"
"\n"
"struct OutputP010 : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_ushort (sample.x);\n"
" if (x % 2 == 0 && y % 2 == 0) {\n"
" unsigned int pos = x * 2 + (y / 2) * stride1;\n"
" *(unsigned short *) &dst1[pos] = scale_to_ushort (sample.y);\n"
" *(unsigned short *) &dst1[pos + 2] = scale_to_ushort (sample.z);\n"
" }\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" unsigned int pos = x * 2 + y * stride0;\n"
" unsigned short * target = (unsigned short *) &dst0[pos];\n"
" *target = blend_ushort (*target, sample.x, sample.w);\n"
" if (x % 2 == 0 && y % 2 == 0) {\n"
" pos = x * 2 + (y / 2) * stride1;\n"
" target = (unsigned short *) &dst1[pos];\n"
" *target = blend_ushort (*target, sample.y, sample.w);\n"
" target = (unsigned short *) &dst1[pos + 2];\n"
" *target = blend_ushort (*target, sample.z, sample.w);\n"
" }\n"
" }\n"
"};\n"
"\n"
"struct OutputI420_10 : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_10bits (sample.x);\n"
" if (x % 2 == 0 && y % 2 == 0) {\n"
" unsigned int pos = x + (y / 2) * stride1;\n"
" *(unsigned short *) &dst1[pos] = scale_to_10bits (sample.y);\n"
" *(unsigned short *) &dst2[pos] = scale_to_10bits (sample.z);\n"
" }\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" unsigned int pos = x * 2 + y * stride0;\n"
" unsigned short * target = (unsigned short *) &dst0[pos];\n"
" *target = blend_10bits (*target, sample.x, sample.w);\n"
" if (x % 2 == 0 && y % 2 == 0) {\n"
" pos = x * 2 + (y / 2) * stride1;\n"
" target = (unsigned short *) &dst1[pos];\n"
" *target = blend_10bits (*target, sample.y, sample.w);\n"
" target = (unsigned short *) &dst2[pos];\n"
" *target = blend_10bits (*target, sample.z, sample.w);\n"
" }\n"
" }\n"
"};\n"
"\n"
"struct OutputI420_12 : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_12bits (sample.x);\n"
" if (x % 2 == 0 && y % 2 == 0) {\n"
" unsigned int pos = x + (y / 2) * stride1;\n"
" *(unsigned short *) &dst1[pos] = scale_to_12bits (sample.y);\n"
" *(unsigned short *) &dst2[pos] = scale_to_12bits (sample.z);\n"
" }\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" unsigned int pos = x * 2 + y * stride0;\n"
" unsigned short * target = (unsigned short *) &dst0[pos];\n"
" *target = blend_12bits (*target, sample.x, sample.w);\n"
" if (x % 2 == 0 && y % 2 == 0) {\n"
" pos = x * 2 + (y / 2) * stride1;\n"
" target = (unsigned short *) &dst1[pos];\n"
" *target = blend_12bits (*target, sample.y, sample.w);\n"
" target = (unsigned short *) &dst2[pos];\n"
" *target = blend_12bits (*target, sample.z, sample.w);\n"
" }\n"
" }\n"
"};\n"
"\n"
"struct OutputY444 : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x + y * stride0;\n"
" dst0[pos] = scale_to_uchar (sample.x);\n"
" dst1[pos] = scale_to_uchar (sample.y);\n"
" dst2[pos] = scale_to_uchar (sample.z);\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x + y * stride0;\n"
" dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
" dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);\n"
" dst2[pos] = blend_uchar (dst2[pos], sample.z, sample.w);\n"
" }\n"
"};\n"
"\n"
"struct OutputY444_10 : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 2 + y * stride0;\n"
" *(unsigned short *) &dst0[pos] = scale_to_10bits (sample.x);\n"
" *(unsigned short *) &dst1[pos] = scale_to_10bits (sample.y);\n"
" *(unsigned short *) &dst2[pos] = scale_to_10bits (sample.z);\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 2 + y * stride0;\n"
" unsigned short * target = (unsigned short *) &dst0[pos];\n"
" *target = blend_10bits (*target, sample.x, sample.w);\n"
" target = (unsigned short *) &dst1[pos];\n"
" *target = blend_10bits (*target, sample.y, sample.w);\n"
" target = (unsigned short *) &dst2[pos];\n"
" *target = blend_10bits (*target, sample.z, sample.w);\n"
" }\n"
"};\n"
"\n"
"struct OutputY444_12 : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 2 + y * stride0;\n"
" *(unsigned short *) &dst0[pos] = scale_to_12bits (sample.x);\n"
" *(unsigned short *) &dst1[pos] = scale_to_12bits (sample.y);\n"
" *(unsigned short *) &dst2[pos] = scale_to_12bits (sample.z);\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 2 + y * stride0;\n"
" unsigned short * target = (unsigned short *) &dst0[pos];\n"
" *target = blend_12bits (*target, sample.x, sample.w);\n"
" target = (unsigned short *) &dst1[pos];\n"
" *target = blend_12bits (*target, sample.y, sample.w);\n"
" target = (unsigned short *) &dst2[pos];\n"
" *target = blend_12bits (*target, sample.z, sample.w);\n"
" }\n"
"};\n"
"\n"
"struct OutputY444_16 : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 2 + y * stride0;\n"
" *(unsigned short *) &dst0[pos] = scale_to_ushort (sample.x);\n"
" *(unsigned short *) &dst1[pos] = scale_to_ushort (sample.y);\n"
" *(unsigned short *) &dst2[pos] = scale_to_ushort (sample.z);\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 2 + y * stride0;\n"
" unsigned short * target = (unsigned short *) &dst0[pos];\n"
" *target = blend_ushort (*target, sample.x, sample.w);\n"
" target = (unsigned short *) &dst1[pos];\n"
" *target = blend_ushort (*target, sample.y, sample.w);\n"
" target = (unsigned short *) &dst2[pos];\n"
" *target = blend_ushort (*target, sample.z, sample.w);\n"
" }\n"
"};\n"
"\n"
"struct OutputRGBA : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 4 + y * stride0;\n"
" dst0[pos] = scale_to_uchar (sample.x);\n"
" dst0[pos + 1] = scale_to_uchar (sample.y);\n"
" dst0[pos + 2] = scale_to_uchar (sample.z);\n"
" dst0[pos + 3] = scale_to_uchar (sample.w);\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 4 + y * stride0;\n"
" dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
" dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);\n"
" dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.z, sample.w);\n"
" dst0[pos + 3] = blend_uchar (dst0[pos + 3], 1.0, sample.w);\n"
" }\n"
"};\n"
"\n"
"struct OutputRGBx : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 4 + y * stride0;\n"
" dst0[pos] = scale_to_uchar (sample.x);\n"
" dst0[pos + 1] = scale_to_uchar (sample.y);\n"
" dst0[pos + 2] = scale_to_uchar (sample.z);\n"
" dst0[pos + 3] = 255;\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 4 + y * stride0;\n"
" dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
" dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);\n"
" dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.z, sample.w);\n"
" dst0[pos + 3] = 255;\n"
" }\n"
"};\n"
"\n"
"struct OutputBGRA : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 4 + y * stride0;\n"
" dst0[pos] = scale_to_uchar (sample.z);\n"
" dst0[pos + 1] = scale_to_uchar (sample.y);\n"
" dst0[pos + 2] = scale_to_uchar (sample.x);\n"
" dst0[pos + 3] = scale_to_uchar (sample.w);\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 4 + y * stride0;\n"
" dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);\n"
" dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);\n"
" dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.x, sample.w);\n"
" dst0[pos + 3] = blend_uchar (dst0[pos + 3], 1.0, sample.w);\n"
" }\n"
"};\n"
"\n"
"struct OutputBGRx : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 4 + y * stride0;\n"
" dst0[pos] = scale_to_uchar (sample.z);\n"
" dst0[pos + 1] = scale_to_uchar (sample.y);\n"
" dst0[pos + 2] = scale_to_uchar (sample.x);\n"
" dst0[pos + 3] = 255;\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 4 + y * stride0;\n"
" dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);\n"
" dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);\n"
" dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.x, sample.w);\n"
" dst0[pos + 3] = 255;\n"
" }\n"
"};\n"
"\n"
"struct OutputARGB : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 4 + y * stride0;\n"
" dst0[pos] = scale_to_uchar (sample.w);\n"
" dst0[pos + 1] = scale_to_uchar (sample.x);\n"
" dst0[pos + 2] = scale_to_uchar (sample.y);\n"
" dst0[pos + 3] = scale_to_uchar (sample.z);\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 4 + y * stride0;\n"
" dst0[pos] = blend_uchar (dst0[pos], 1.0, sample.w);\n"
" dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.x, sample.w);\n"
" dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.y, sample.w);\n"
" dst0[pos + 3] = blend_uchar (dst0[pos + 3], sample.z, sample.w);\n"
" }\n"
"};\n"
"\n"
"struct OutputABGR : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 4 + y * stride0;\n"
" dst0[pos] = scale_to_uchar (sample.w);\n"
" dst0[pos + 1] = scale_to_uchar (sample.z);\n"
" dst0[pos + 2] = scale_to_uchar (sample.y);\n"
" dst0[pos + 3] = scale_to_uchar (sample.x);\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 4 + y * stride0;\n"
" dst0[pos] = blend_uchar (dst0[pos], 1.0, sample.w);\n"
" dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.z, sample.w);\n"
" dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.y, sample.w);\n"
" dst0[pos + 3] = blend_uchar (dst0[pos + 3], sample.x, sample.w);\n"
" }\n"
"};\n"
"\n"
"struct OutputRGB : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 3 + y * stride0;\n"
" dst0[pos] = scale_to_uchar (sample.x);\n"
" dst0[pos + 1] = scale_to_uchar (sample.y);\n"
" dst0[pos + 2] = scale_to_uchar (sample.z);\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 3 + y * stride0;\n"
" dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
" dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);\n"
" dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.z, sample.w);\n"
" }\n"
"};\n"
"\n"
"struct OutputBGR : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 3 + y * stride0;\n"
" dst0[pos] = scale_to_uchar (sample.z);\n"
" dst0[pos + 1] = scale_to_uchar (sample.y);\n"
" dst0[pos + 2] = scale_to_uchar (sample.x);\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 3 + y * stride0;\n"
" dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);\n"
" dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);\n"
" dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.x, sample.w);\n"
" }\n"
"};\n"
"\n"
"__device__ inline ushort3\n"
"unpack_rgb10a2 (unsigned int val)\n"
"{\n"
" unsigned short r, g, b;\n"
" r = (val & 0x3ff);\n"
" r = (r << 6) | (r >> 4);\n"
" g = ((val >> 10) & 0x3ff);\n"
" g = (g << 6) | (g >> 4);\n"
" b = ((val >> 20) & 0x3ff);\n"
" b = (b << 6) | (b >> 4);\n"
" return make_ushort3 (r, g, b);\n"
"}\n"
"\n"
"struct OutputRGB10A2 : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" unsigned int alpha = (unsigned int) scale_to_2bits (sample.w);\n"
" unsigned int packed_rgb = alpha << 30;\n"
" packed_rgb |= ((unsigned int) scale_to_10bits (sample.x));\n"
" packed_rgb |= ((unsigned int) scale_to_10bits (sample.y)) << 10;\n"
" packed_rgb |= ((unsigned int) scale_to_10bits (sample.z)) << 20;\n"
" *(unsigned int *) &dst0[x * 4 + y * stride0] = packed_rgb;\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" unsigned int * target = (unsigned int *) &dst0[x * 4 + y * stride0];\n"
" ushort3 val = unpack_rgb10a2 (*target);\n"
" unsigned int alpha = (unsigned int) scale_to_2bits (sample.w);\n"
" unsigned int packed_rgb = alpha << 30;\n"
" packed_rgb |= ((unsigned int) blend_10bits (val.x, sample.x, sample.w));\n"
" packed_rgb |= ((unsigned int) blend_10bits (val.y, sample.y, sample.w)) << 10;\n"
" packed_rgb |= ((unsigned int) blend_10bits (val.z, sample.z, sample.w)) << 20;\n"
" *target = packed_rgb;\n"
" }\n"
"};\n"
"\n"
"__device__ inline ushort3\n"
"unpack_bgr10a2 (unsigned int val)\n"
"{\n"
" unsigned short r, g, b;\n"
" b = (val & 0x3ff);\n"
" b = (b << 6) | (b >> 4);\n"
" g = ((val >> 10) & 0x3ff);\n"
" g = (g << 6) | (g >> 4);\n"
" r = ((val >> 20) & 0x3ff);\n"
" r = (r << 6) | (r >> 4);\n"
" return make_ushort3 (r, g, b);\n"
"}\n"
"\n"
"struct OutputBGR10A2 : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" unsigned int alpha = (unsigned int) scale_to_2bits (sample.x);\n"
" unsigned int packed_rgb = alpha << 30;\n"
" packed_rgb |= ((unsigned int) scale_to_10bits (sample.x)) << 20;\n"
" packed_rgb |= ((unsigned int) scale_to_10bits (sample.y)) << 10;\n"
" packed_rgb |= ((unsigned int) scale_to_10bits (sample.z));\n"
" *(unsigned int *) &dst0[x * 4 + y * stride0] = packed_rgb;\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" unsigned int * target = (unsigned int *) &dst0[x * 4 + y * stride0];\n"
" ushort3 val = unpack_bgr10a2 (*target);\n"
" unsigned int alpha = (unsigned int) scale_to_2bits (sample.w);\n"
" unsigned int packed_rgb = alpha << 30;\n"
" packed_rgb |= ((unsigned int) blend_10bits (val.x, sample.x, sample.w)) << 20;\n"
" packed_rgb |= ((unsigned int) blend_10bits (val.y, sample.y, sample.w)) << 10;\n"
" packed_rgb |= ((unsigned int) blend_10bits (val.z, sample.z, sample.w));\n"
" *target = packed_rgb;\n"
" }\n"
"};\n"
"\n"
"struct OutputY42B : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" dst0[x + y * stride0] = scale_to_uchar (sample.x);\n"
" if (x % 2 == 0) {\n"
" unsigned int pos = x / 2 + y * stride1;\n"
" dst1[pos] = scale_to_uchar (sample.y);\n"
" dst2[pos] = scale_to_uchar (sample.z);\n"
" }\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" unsigned int pos = x + y * stride0;\n"
" dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
" if (x % 2 == 0) {\n"
" pos = x / 2 + y * stride1;\n"
" dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);\n"
" dst2[pos] = blend_uchar (dst2[pos], sample.z, sample.w);\n"
" }\n"
" }\n"
"};\n"
"\n"
"struct OutputI422_10 : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_10bits (sample.x);\n"
" if (x % 2 == 0) {\n"
" unsigned int pos = x + y * stride1;\n"
" *(unsigned short *) &dst1[pos] = scale_to_10bits (sample.y);\n"
" *(unsigned short *) &dst2[pos] = scale_to_10bits (sample.z);\n"
" }\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" unsigned int pos = x * 2 + y * stride0;\n"
" unsigned short * target = (unsigned short *) &dst0[pos];\n"
" *target = blend_10bits (*target, sample.x, sample.w);\n"
" if (x % 2 == 0) {\n"
" pos = x / 2 + y * stride1;\n"
" target = (unsigned short *) &dst1[pos];\n"
" *target = blend_10bits (*target, sample.y, sample.w);\n"
" target = (unsigned short *) &dst2[pos];\n"
" *target = blend_10bits (*target, sample.z, sample.w);\n"
" }\n"
" }\n"
"};\n"
"\n"
"struct OutputI422_12 : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_12bits (sample.x);\n"
" if (x % 2 == 0) {\n"
" unsigned int pos = x + y * stride1;\n"
" *(unsigned short *) &dst1[pos] = scale_to_12bits (sample.y);\n"
" *(unsigned short *) &dst2[pos] = scale_to_12bits (sample.z);\n"
" }\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" unsigned int pos = x * 2 + y * stride0;\n"
" unsigned short * target = (unsigned short *) &dst0[pos];\n"
" *target = blend_12bits (*target, sample.x, sample.w);\n"
" if (x % 2 == 0) {\n"
" pos = x / 2 + y * stride1;\n"
" target = (unsigned short *) &dst1[pos];\n"
" *target = blend_12bits (*target, sample.y, sample.w);\n"
" target = (unsigned short *) &dst2[pos];\n"
" *target = blend_12bits (*target, sample.z, sample.w);\n"
" }\n"
" }\n"
"};\n"
"\n"
"struct OutputRGBP : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x + y * stride0;\n"
" dst0[pos] = scale_to_uchar (sample.x);\n"
" dst1[pos] = scale_to_uchar (sample.y);\n"
" dst2[pos] = scale_to_uchar (sample.z);\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x + y * stride0;\n"
" dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
" dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);\n"
" dst2[pos] = blend_uchar (dst2[pos], sample.z, sample.w);\n"
" }\n"
"};\n"
"\n"
"struct OutputBGRP : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x + y * stride0;\n"
" dst0[pos] = scale_to_uchar (sample.z);\n"
" dst1[pos] = scale_to_uchar (sample.y);\n"
" dst2[pos] = scale_to_uchar (sample.x);\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x + y * stride0;\n"
" dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);\n"
" dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);\n"
" dst2[pos] = blend_uchar (dst2[pos], sample.x, sample.w);\n"
" }\n"
"};\n"
"\n"
"struct OutputGBR : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x + y * stride0;\n"
" dst0[pos] = scale_to_uchar (sample.y);\n"
" dst1[pos] = scale_to_uchar (sample.z);\n"
" dst2[pos] = scale_to_uchar (sample.x);\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x + y * stride0;\n"
" dst0[pos] = blend_uchar (dst0[pos], sample.y, sample.w);\n"
" dst1[pos] = blend_uchar (dst1[pos], sample.z, sample.w);\n"
" dst2[pos] = blend_uchar (dst2[pos], sample.x, sample.w);\n"
" }\n"
"};\n"
"\n"
"struct OutputGBR_10 : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 2 + y * stride0;\n"
" *(unsigned short *) &dst0[pos] = scale_to_10bits (sample.y);\n"
" *(unsigned short *) &dst1[pos] = scale_to_10bits (sample.z);\n"
" *(unsigned short *) &dst2[pos] = scale_to_10bits (sample.x);\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 2 + y * stride0;\n"
" unsigned short * target = (unsigned short *) &dst0[pos];\n"
" *target = blend_10bits (*target, sample.y, sample.w);\n"
" target = (unsigned short *) &dst1[pos];\n"
" *target = blend_10bits (*target, sample.z, sample.w);\n"
" target = (unsigned short *) &dst2[pos];\n"
" *target = blend_10bits (*target, sample.x, sample.w);\n"
" }\n"
"};\n"
"\n"
"struct OutputGBR_12 : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 2 + y * stride0;\n"
" *(unsigned short *) &dst0[pos] = scale_to_12bits (sample.y);\n"
" *(unsigned short *) &dst1[pos] = scale_to_12bits (sample.z);\n"
" *(unsigned short *) &dst2[pos] = scale_to_12bits (sample.x);\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 2 + y * stride0;\n"
" unsigned short * target = (unsigned short *) &dst0[pos];\n"
" *target = blend_12bits (*target, sample.y, sample.w);\n"
" target = (unsigned short *) &dst1[pos];\n"
" *target = blend_12bits (*target, sample.z, sample.w);\n"
" target = (unsigned short *) &dst2[pos];\n"
" *target = blend_12bits (*target, sample.x, sample.w);\n"
" }\n"
"};\n"
"\n"
"struct OutputGBR_16 : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 2 + y * stride0;\n"
" *(unsigned short *) &dst0[pos] = scale_to_ushort (sample.y);\n"
" *(unsigned short *) &dst1[pos] = scale_to_ushort (sample.z);\n"
" *(unsigned short *) &dst2[pos] = scale_to_ushort (sample.x);\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 2 + y * stride0;\n"
" unsigned short * target = (unsigned short *) &dst0[pos];\n"
" *target = blend_ushort (*target, sample.y, sample.w);\n"
" target = (unsigned short *) &dst1[pos];\n"
" *target = blend_ushort (*target, sample.z, sample.w);\n"
" target = (unsigned short *) &dst2[pos];\n"
" *target = blend_ushort (*target, sample.x, sample.w);\n"
" }\n"
"};\n"
"\n"
"struct OutputGBRA : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x + y * stride0;\n"
" dst0[pos] = scale_to_uchar (sample.y);\n"
" dst1[pos] = scale_to_uchar (sample.z);\n"
" dst2[pos] = scale_to_uchar (sample.x);\n"
" dst3[pos] = scale_to_uchar (sample.w);\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x + y * stride0;\n"
" dst0[pos] = blend_uchar (dst0[pos], sample.y, sample.w);\n"
" dst1[pos] = blend_uchar (dst1[pos], sample.z, sample.w);\n"
" dst2[pos] = blend_uchar (dst2[pos], sample.x, sample.w);\n"
" dst3[pos] = blend_uchar (dst3[pos], 1.0, sample.w);\n"
" }\n"
"};\n"
"\n"
"struct OutputVUYA : public IOutput\n"
"{\n"
" __device__ void\n"
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 4 + y * stride0;\n"
" dst0[pos] = scale_to_uchar (sample.z);\n"
" dst0[pos + 1] = scale_to_uchar (sample.y);\n"
" dst0[pos + 2] = scale_to_uchar (sample.x);\n"
" dst0[pos + 3] = scale_to_uchar (sample.w);\n"
" }\n"
"\n"
" __device__ void\n"
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
" int stride1)\n"
" {\n"
" int pos = x * 4 + y * stride0;\n"
" dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);\n"
" dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);\n"
" dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.x, sample.w);\n"
" dst0[pos + 3] = blend_uchar (dst0[pos + 3], 1.0, sample.w);\n"
" }\n"
"};\n"
"\n"
"__device__ inline float2\n"
"rotate_identity (float x, float y)\n"
"{\n"
" return make_float2(x, y);\n"
"}\n"
"\n"
"__device__ inline float2\n"
"rotate_90r (float x, float y)\n"
"{\n"
" return make_float2(y, 1.0 - x);\n"
"}\n"
"\n"
"__device__ inline float2\n"
"rotate_180 (float x, float y)\n"
"{\n"
" return make_float2(1.0 - x, 1.0 - y);\n"
"}\n"
"\n"
"__device__ inline float2\n"
"rotate_90l (float x, float y)\n"
"{\n"
" return make_float2(1.0 - y, x);\n"
"}\n"
"\n"
"__device__ inline float2\n"
"rotate_horiz (float x, float y)\n"
"{\n"
" return make_float2(1.0 - x, y);\n"
"}\n"
"\n"
"__device__ inline float2\n"
"rotate_vert (float x, float y)\n"
"{\n"
" return make_float2(x, 1.0 - y);\n"
"}\n"
"\n"
"__device__ inline float2\n"
"rotate_ul_lr (float x, float y)\n"
"{\n"
" return make_float2(y, x);\n"
"}\n"
"\n"
"__device__ inline float2\n"
"rotate_ur_ll (float x, float y)\n"
"{\n"
" return make_float2(1.0 - y, 1.0 - x);\n"
"}\n"
"__device__ inline float2\n"
"do_rotate (float x, float y, int direction)\n"
"{\n"
" switch (direction) {\n"
" case 1:\n"
" return rotate_90r (x, y);\n"
" case 2:\n"
" return rotate_180 (x, y);\n"
" case 3:\n"
" return rotate_90l (x, y);\n"
" case 4:\n"
" return rotate_horiz (x, y);\n"
" case 5:\n"
" return rotate_vert (x, y);\n"
" case 6:\n"
" return rotate_ul_lr (x, y);\n"
" case 7:\n"
" return rotate_ur_ll (x, y);\n"
" default:\n"
" return rotate_identity (x, y);\n"
" }\n"
"}\n"
"\n"
"extern \"C\" {\n"
"__global__ void\n"
"GstCudaConverterMain (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, unsigned char * dst0,\n"
" unsigned char * dst1, unsigned char * dst2, unsigned char * dst3,\n"
" int stride0, int stride1, ConstBuffer const_buf, int off_x, int off_y)\n"
"{\n"
" ConvertSimple g_converter;\n"
" SAMPLER g_sampler;\n"
" OUTPUT g_output;\n"
" int x_pos = blockIdx.x * blockDim.x + threadIdx.x + off_x;\n"
" int y_pos = blockIdx.y * blockDim.y + threadIdx.y + off_y;\n"
" float4 sample;\n"
" if (x_pos >= const_buf.width || y_pos >= const_buf.height ||\n"
" const_buf.view_width <= 0 || const_buf.view_height <= 0)\n"
" return;\n"
" if (x_pos < const_buf.left || x_pos >= const_buf.right ||\n"
" y_pos < const_buf.top || y_pos >= const_buf.bottom) {\n"
" if (!const_buf.fill_border)\n"
" return;\n"
" sample = make_float4 (const_buf.border_x, const_buf.border_y,\n"
" const_buf.border_z, const_buf.border_w);\n"
" } else {\n"
" float x = (__int2float_rz (x_pos - const_buf.left) + 0.5) / const_buf.view_width;\n"
" if (x < 0.0 || x > 1.0)\n"
" return;\n"
" float y = (__int2float_rz (y_pos - const_buf.top) + 0.5) / const_buf.view_height;\n"
" if (y < 0.0 || y > 1.0)\n"
" return;\n"
" float2 rotated = do_rotate (x, y, const_buf.video_direction);\n"
" float4 s = g_sampler.Execute (tex0, tex1, tex2, tex3, rotated.x, rotated.y);\n"
" float3 rgb = make_float3 (s.x, s.y, s.z);\n"
" float3 yuv;\n"
" if (const_buf.do_convert)\n"
" yuv = g_converter.Execute (rgb, &const_buf.matrix);\n"
" else\n"
" yuv = rgb;\n"
" sample = make_float4 (yuv.x, yuv.y, yuv.z, s.w);\n"
" }\n"
" sample.w = sample.w * const_buf.alpha;\n"
" if (!const_buf.do_blend) {\n"
" g_output.Write (dst0, dst1, dst2, dst3, sample, x_pos, y_pos, stride0, stride1);\n"
" } else {\n"
" g_output.Blend (dst0, dst1, dst2, dst3, sample, x_pos, y_pos, stride0, stride1);\n"
" }\n"
"}\n"
"}\n"
"\n";
#endif