mirror of
https://gitlab.freedesktop.org/gstreamer/gstreamer.git
synced 2025-02-27 16:36:39 +00:00
Enable build time CUDA kernel compile if nvcc is detected. Precompile is disabled by default and controlled by "nvcodec-cuda-precompile" build option. Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/8536>
2924 lines
No EOL
96 KiB
Text
2924 lines
No EOL
96 KiB
Text
/* GStreamer
 * Copyright (C) 2025 Seungha Yang <seungha@centricular.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 */
|
|
|
|
#ifdef __NVCC__
|
|
/* Parameters of a 3x3 colour transform: one coefficient row per output
 * channel, an additive per-channel offset, and the per-channel range the
 * result is clamped to. */
struct ColorMatrix
{
  float CoeffX[3];              /* row producing the first output channel */
  float CoeffY[3];              /* row producing the second output channel */
  float CoeffZ[3];              /* row producing the third output channel */
  float Offset[3];              /* added to each channel after the product */
  float Min[3];                 /* lower clamp bound per channel */
  float Max[3];                 /* upper clamp bound per channel */
};
|
|
|
|
/* Per-conversion parameter block.
 * NOTE(review): the field meanings below are inferred from the names only;
 * the code that consumes this struct is not visible here - confirm. */
struct ConstBuffer
{
  ColorMatrix matrix;           /* colour transform parameters */

  /* presumably the output frame size */
  int width;
  int height;

  /* presumably the destination rectangle inside the frame */
  int left;
  int top;
  int right;
  int bottom;

  /* presumably the size of the visible area */
  int view_width;
  int view_height;

  /* presumably the four components of the border colour */
  float border_x;
  float border_y;
  float border_z;
  float border_w;

  int fill_border;              /* presumably a boolean flag */
  int video_direction;          /* presumably a rotate/flip selector */
  float alpha;                  /* presumably a global alpha factor */
  int do_blend;                 /* presumably selects Blend over Write */
  int do_convert;               /* presumably enables the colour transform */
};
|
|
|
|
/* Returns the dot product of the three-element coefficient row @coeff with
 * the (x, y, z) components of @val. */
__device__ inline float
dot (const float coeff[3], float3 val)
{
  const float x_term = coeff[0] * val.x;
  const float y_term = coeff[1] * val.y;
  const float z_term = coeff[2] * val.z;

  return x_term + y_term + z_term;
}
|
|
|
|
/* Limits @val to the range [@min_val, @max_val]: the upper bound is applied
 * first, then the lower bound. */
__device__ inline float
clamp (float val, float min_val, float max_val)
{
  const float capped = min (val, max_val);

  return max (min_val, capped);
}
|
|
|
|
/* Clamps every component of @val to its own range:
 * x to [min_val[0], max_val[0]], y to [min_val[1], max_val[1]] and
 * z to [min_val[2], max_val[2]]. */
__device__ inline float3
clamp3 (float3 val, const float min_val[3], const float max_val[3])
{
  return make_float3 (clamp (val.x, min_val[0], max_val[0]),
      clamp (val.y, min_val[1], max_val[1]),
      clamp (val.z, min_val[2], max_val[2]));
}
|
|
|
|
/* Scales @val by 3 (the largest 2-bit value) and truncates towards zero.
 * The float literal keeps the multiply in single precision. */
__device__ inline unsigned char
scale_to_2bits (float val)
{
  return (unsigned char) __float2int_rz (val * 3.0f);
}
|
|
|
|
/* Scales @val by 255 and truncates towards zero to an 8-bit value.
 * The float literal keeps the multiply in single precision. */
__device__ inline unsigned char
scale_to_uchar (float val)
{
  return (unsigned char) __float2int_rz (val * 255.0f);
}
|
|
|
|
/* Scales @val by 65535 and truncates towards zero to a 16-bit value.
 * The float literal keeps the multiply in single precision. */
__device__ inline unsigned short
scale_to_ushort (float val)
{
  return (unsigned short) __float2int_rz (val * 65535.0f);
}
|
|
|
|
/* Scales @val by 1023 (the largest 10-bit value) and truncates towards zero.
 * The float literal keeps the multiply in single precision. */
__device__ inline unsigned short
scale_to_10bits (float val)
{
  return (unsigned short) __float2int_rz (val * 1023.0f);
}
|
|
|
|
/* Scales @val by 4095 (the largest 12-bit value) and truncates towards zero.
 * The float literal keeps the multiply in single precision. */
__device__ inline unsigned short
scale_to_12bits (float val)
{
  return (unsigned short) __float2int_rz (val * 4095.0f);
}
|
|
|
|
/* Blends the source value @src (weighted by @src_alpha) over the 8-bit
 * destination value @dst and returns the new 8-bit destination value.
 * The sum is clamped to [0, 1] before it is scaled back to 8 bits.
 * Float literals keep the whole computation in single precision. */
__device__ inline unsigned char
blend_uchar (unsigned char dst, float src, float src_alpha)
{
  // DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor
  const float src_val = src * src_alpha;
  const float dst_val = (__int2float_rz (dst) / 255.0f) * (1.0f - src_alpha);

  return scale_to_uchar (clamp (src_val + dst_val, 0.0f, 1.0f));
}
|
|
|
|
/* Blends the source value @src (weighted by @src_alpha) over the 16-bit
 * destination value @dst and returns the new 16-bit destination value.
 * The sum is clamped to [0, 1] before it is scaled back to 16 bits.
 * Float literals keep the whole computation in single precision. */
__device__ inline unsigned short
blend_ushort (unsigned short dst, float src, float src_alpha)
{
  // DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor
  const float src_val = src * src_alpha;
  const float dst_val = (__int2float_rz (dst) / 65535.0f) * (1.0f - src_alpha);

  return scale_to_ushort (clamp (src_val + dst_val, 0.0f, 1.0f));
}
|
|
|
|
/* Blends the source value @src (weighted by @src_alpha) over the
 * destination value @dst and returns the new destination value.
 * @dst is normalised by 1023, i.e. it is expected to be a 10-bit value.
 * The sum is clamped to [0, 1] before it is scaled back to 10 bits.
 * Float literals keep the whole computation in single precision. */
__device__ inline unsigned short
blend_10bits (unsigned short dst, float src, float src_alpha)
{
  // DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor
  const float src_val = src * src_alpha;
  const float dst_val = (__int2float_rz (dst) / 1023.0f) * (1.0f - src_alpha);

  return scale_to_10bits (clamp (src_val + dst_val, 0.0f, 1.0f));
}
|
|
|
|
/* Blends the source value @src (weighted by @src_alpha) over the
 * destination value @dst and returns the new destination value.
 * @dst is normalised by 4095, i.e. it is expected to be a 12-bit value.
 * The sum is clamped to [0, 1] before it is scaled back to 12 bits.
 * Float literals keep the whole computation in single precision. */
__device__ inline unsigned short
blend_12bits (unsigned short dst, float src, float src_alpha)
{
  // DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor
  const float src_val = src * src_alpha;
  const float dst_val = (__int2float_rz (dst) / 4095.0f) * (1.0f - src_alpha);

  return scale_to_12bits (clamp (src_val + dst_val, 0.0f, 1.0f));
}
|
|
|
|
/* Interface of a colour converter: maps one three-channel sample to another
 * using the parameters in @matrix. */
struct IConverter
{
  __device__ virtual float3
  Execute (float3 sample, const ColorMatrix * matrix) = 0;
};
|
|
|
|
/* Converter applying the matrix rows, then the offset, then the clamp range
 * stored in the ColorMatrix. */
struct ConvertSimple : public IConverter
{
  __device__ float3
  Execute (float3 sample, const ColorMatrix * matrix)
  {
    /* one dot product per output channel, followed by that channel's offset */
    float3 converted = make_float3 (dot (matrix->CoeffX, sample),
        dot (matrix->CoeffY, sample),
        dot (matrix->CoeffZ, sample));

    converted.x += matrix->Offset[0];
    converted.y += matrix->Offset[1];
    converted.z += matrix->Offset[2];

    return clamp3 (converted, matrix->Min, matrix->Max);
  }
};
|
|
|
|
/* Interface of an input sampler: fetches the texel(s) at (@x, @y) from up to
 * four texture objects and returns them as one four-channel sample. Which
 * textures are read, and how the channels are ordered, is up to the
 * implementation. */
struct ISampler
{
  __device__ virtual float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3,
      float x, float y) = 0;
};
|
|
|
|
/* Three-plane sampler: luma from tex0, U from tex1, V from tex2.
 * Returns (Y, U, V, 1); tex3 is not read. */
struct SampleI420 : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    float4 yuva;

    yuva.x = tex2D<float>(tex0, x, y);
    yuva.y = tex2D<float>(tex1, x, y);
    yuva.z = tex2D<float>(tex2, x, y);
    yuva.w = 1.0f;

    return yuva;
  }
};
|
|
|
|
/* Three-plane sampler with swapped chroma planes: luma from tex0, V from
 * tex1, U from tex2. Returns (Y, U, V, 1); tex3 is not read. */
struct SampleYV12 : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    float4 yuva;

    yuva.x = tex2D<float>(tex0, x, y);
    yuva.y = tex2D<float>(tex2, x, y);
    yuva.z = tex2D<float>(tex1, x, y);
    yuva.w = 1.0f;

    return yuva;
  }
};
|
|
|
|
/* Three-plane sampler (luma tex0, U tex1, V tex2) that multiplies every
 * fetched value by 64. Returns (Y, U, V, 1); tex3 is not read. */
struct SampleI420_10 : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    /* (1 << 6) to scale [0, 1.0) range */
    const float scale = 64.0f;
    const float luma = tex2D<float>(tex0, x, y);
    const float u = tex2D<float>(tex1, x, y);
    const float v = tex2D<float>(tex2, x, y);

    return make_float4 (luma * scale, u * scale, v * scale, 1.0f);
  }
};
|
|
|
|
/* Three-plane sampler (luma tex0, U tex1, V tex2) that multiplies every
 * fetched value by 16. Returns (Y, U, V, 1); tex3 is not read. */
struct SampleI420_12 : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    /* (1 << 4) to scale [0, 1.0) range */
    const float scale = 16.0f;
    const float luma = tex2D<float>(tex0, x, y);
    const float u = tex2D<float>(tex1, x, y);
    const float v = tex2D<float>(tex2, x, y);

    return make_float4 (luma * scale, u * scale, v * scale, 1.0f);
  }
};
|
|
|
|
/* Two-plane sampler: luma from tex0, a two-channel (U, V) texel from tex1.
 * Returns (Y, U, V, 1); tex2 and tex3 are not read. */
struct SampleNV12 : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    const float luma = tex2D<float>(tex0, x, y);
    const float2 chroma = tex2D<float2>(tex1, x, y);
    float4 yuva;

    yuva.x = luma;
    yuva.y = chroma.x;
    yuva.z = chroma.y;
    yuva.w = 1.0f;

    return yuva;
  }
};
|
|
|
|
/* Two-plane sampler: luma from tex0, a two-channel (V, U) texel from tex1.
 * Returns (Y, U, V, 1); tex2 and tex3 are not read. */
struct SampleNV21 : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    const float luma = tex2D<float>(tex0, x, y);
    const float2 chroma = tex2D<float2>(tex1, x, y);
    float4 yuva;

    yuva.x = luma;
    /* the first texel channel is V, the second is U */
    yuva.y = chroma.y;
    yuva.z = chroma.x;
    yuva.w = 1.0f;

    return yuva;
  }
};
|
|
|
|
/* Packed sampler: returns the four-channel texel of tex0 unchanged.
 * tex1..tex3 are not read. */
struct SampleRGBA : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    const float4 rgba = tex2D<float4>(tex0, x, y);

    return rgba;
  }
};
|
|
|
|
/* Packed sampler: reads a (B, G, R, A) texel from tex0 and returns it as
 * (R, G, B, A). tex1..tex3 are not read. */
struct SampleBGRA : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    const float4 texel = tex2D<float4>(tex0, x, y);
    float4 rgba;

    rgba.x = texel.z;
    rgba.y = texel.y;
    rgba.z = texel.x;
    rgba.w = texel.w;

    return rgba;
  }
};
|
|
|
|
/* Packed sampler: reads an (R, G, B, x) texel from tex0 and returns
 * (R, G, B, 1), discarding the fourth channel. tex1..tex3 are not read. */
struct SampleRGBx : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    const float4 texel = tex2D<float4>(tex0, x, y);

    return make_float4 (texel.x, texel.y, texel.z, 1.0f);
  }
};
|
|
|
|
/* Packed sampler: reads a (B, G, R, x) texel from tex0 and returns
 * (R, G, B, 1), discarding the fourth channel. tex1..tex3 are not read. */
struct SampleBGRx : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    const float4 texel = tex2D<float4>(tex0, x, y);
    float4 rgba;

    rgba.x = texel.z;
    rgba.y = texel.y;
    rgba.z = texel.x;
    rgba.w = 1.0f;

    return rgba;
  }
};
|
|
|
|
/* Packed sampler: reads an (A, R, G, B) texel from tex0 and returns it as
 * (R, G, B, A). tex1..tex3 are not read. */
struct SampleARGB : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    const float4 texel = tex2D<float4>(tex0, x, y);
    float4 rgba;

    rgba.x = texel.y;
    rgba.y = texel.z;
    rgba.z = texel.w;
    rgba.w = texel.x;

    return rgba;
  }
};
|
|
|
|
/* Packed sampler: reads an (A, B, G, R) texel from tex0 and returns it as
 * (R, G, B, A). tex1..tex3 are not read. */
struct SampleABGR : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    const float4 texel = tex2D<float4>(tex0, x, y);
    float4 rgba;

    rgba.x = texel.w;
    rgba.y = texel.z;
    rgba.z = texel.y;
    rgba.w = texel.x;

    return rgba;
  }
};
|
|
|
|
/* Three-plane sampler: R from tex0, G from tex1, B from tex2.
 * Returns (R, G, B, 1); tex3 is not read. */
struct SampleRGBP : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    float4 rgba;

    rgba.x = tex2D<float>(tex0, x, y);
    rgba.y = tex2D<float>(tex1, x, y);
    rgba.z = tex2D<float>(tex2, x, y);
    rgba.w = 1.0f;

    return rgba;
  }
};
|
|
|
|
/* Three-plane sampler: B from tex0, G from tex1, R from tex2.
 * Returns (R, G, B, 1); tex3 is not read. */
struct SampleBGRP : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    float4 rgba;

    rgba.z = tex2D<float>(tex0, x, y);
    rgba.y = tex2D<float>(tex1, x, y);
    rgba.x = tex2D<float>(tex2, x, y);
    rgba.w = 1.0f;

    return rgba;
  }
};
|
|
|
|
/* Three-plane sampler: G from tex0, B from tex1, R from tex2.
 * Returns (R, G, B, 1); tex3 is not read. */
struct SampleGBR : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    float4 rgba;

    rgba.y = tex2D<float>(tex0, x, y);
    rgba.z = tex2D<float>(tex1, x, y);
    rgba.x = tex2D<float>(tex2, x, y);
    rgba.w = 1.0f;

    return rgba;
  }
};
|
|
|
|
/* Three-plane sampler (G tex0, B tex1, R tex2) that multiplies every fetched
 * value by 64. Returns (R, G, B, 1); tex3 is not read. */
struct SampleGBR_10 : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    /* (1 << 6) to scale [0, 1.0) range */
    const float scale = 64.0f;
    const float green = tex2D<float>(tex0, x, y);
    const float blue = tex2D<float>(tex1, x, y);
    const float red = tex2D<float>(tex2, x, y);

    return make_float4 (red * scale, green * scale, blue * scale, 1.0f);
  }
};
|
|
|
|
/* Three-plane sampler (G tex0, B tex1, R tex2) that multiplies every fetched
 * value by 16. Returns (R, G, B, 1); tex3 is not read. */
struct SampleGBR_12 : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    /* (1 << 4) to scale [0, 1.0) range */
    const float scale = 16.0f;
    const float green = tex2D<float>(tex0, x, y);
    const float blue = tex2D<float>(tex1, x, y);
    const float red = tex2D<float>(tex2, x, y);

    return make_float4 (red * scale, green * scale, blue * scale, 1.0f);
  }
};
|
|
|
|
/* Four-plane sampler: G from tex0, B from tex1, R from tex2, A from tex3.
 * Returns (R, G, B, A). */
struct SampleGBRA : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    float4 rgba;

    rgba.y = tex2D<float>(tex0, x, y);
    rgba.z = tex2D<float>(tex1, x, y);
    rgba.x = tex2D<float>(tex2, x, y);
    rgba.w = tex2D<float>(tex3, x, y);

    return rgba;
  }
};
|
|
|
|
/* Packed sampler: reads a (V, U, Y, A) texel from tex0 and returns it as
 * (Y, U, V, A). tex1..tex3 are not read. */
struct SampleVUYA : public ISampler
{
  __device__ float4
  Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
      cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)
  {
    const float4 texel = tex2D<float4>(tex0, x, y);
    float4 yuva;

    yuva.x = texel.z;
    yuva.y = texel.y;
    yuva.z = texel.x;
    yuva.w = texel.w;

    return yuva;
  }
};
|
|
|
|
/* Interface of an output writer. Both methods receive up to four destination
 * plane pointers, the four-channel sample for pixel (@x, @y) and two strides;
 * which planes and strides are used depends on the implementation. */
struct IOutput
{
  /* Stores @sample at pixel (@x, @y), replacing what was there. */
  __device__ virtual void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1) = 0;

  /* Combines @sample with the value already stored at pixel (@x, @y). */
  __device__ virtual void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1) = 0;
};
|
|
|
|
/* 8-bit three-plane output: sample.x goes to dst0 at full resolution;
 * sample.y goes to dst1 and sample.z to dst2 at half resolution in both
 * directions. stride0 is the dst0 row pitch in bytes, stride1 the pitch of
 * dst1 and dst2. dst3 is unused. */
struct OutputI420 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    dst0[x + y * stride0] = scale_to_uchar (sample.x);

    /* only the pixel with even x and even y writes the shared chroma sample */
    if (x % 2 != 0 || y % 2 != 0)
      return;

    unsigned int chroma_pos = x / 2 + (y / 2) * stride1;
    dst1[chroma_pos] = scale_to_uchar (sample.y);
    dst2[chroma_pos] = scale_to_uchar (sample.z);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int luma_pos = x + y * stride0;
    dst0[luma_pos] = blend_uchar (dst0[luma_pos], sample.x, sample.w);

    /* only the pixel with even x and even y blends the shared chroma sample */
    if (x % 2 != 0 || y % 2 != 0)
      return;

    unsigned int chroma_pos = x / 2 + (y / 2) * stride1;
    dst1[chroma_pos] = blend_uchar (dst1[chroma_pos], sample.y, sample.w);
    dst2[chroma_pos] = blend_uchar (dst2[chroma_pos], sample.z, sample.w);
  }
};
|
|
|
|
/* 8-bit three-plane output with swapped chroma planes: sample.x goes to dst0
 * at full resolution; sample.z goes to dst1 and sample.y to dst2 at half
 * resolution in both directions. dst3 is unused. */
struct OutputYV12 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    dst0[x + y * stride0] = scale_to_uchar (sample.x);

    /* only the pixel with even x and even y writes the shared chroma sample */
    if (x % 2 != 0 || y % 2 != 0)
      return;

    unsigned int chroma_pos = x / 2 + (y / 2) * stride1;
    dst1[chroma_pos] = scale_to_uchar (sample.z);
    dst2[chroma_pos] = scale_to_uchar (sample.y);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int luma_pos = x + y * stride0;
    dst0[luma_pos] = blend_uchar (dst0[luma_pos], sample.x, sample.w);

    /* only the pixel with even x and even y blends the shared chroma sample */
    if (x % 2 != 0 || y % 2 != 0)
      return;

    unsigned int chroma_pos = x / 2 + (y / 2) * stride1;
    dst1[chroma_pos] = blend_uchar (dst1[chroma_pos], sample.z, sample.w);
    dst2[chroma_pos] = blend_uchar (dst2[chroma_pos], sample.y, sample.w);
  }
};
|
|
|
|
/* 8-bit two-plane output: sample.x goes to dst0 at full resolution; the
 * byte pair (sample.y, sample.z) goes to dst1 once per 2x2 pixel block.
 * dst2 and dst3 are unused. */
struct OutputNV12 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    dst0[x + y * stride0] = scale_to_uchar (sample.x);

    /* only the pixel with even x and even y writes the shared chroma pair */
    if (x % 2 != 0 || y % 2 != 0)
      return;

    /* x is even here, so it is also the byte offset of the pair */
    unsigned int uv_pos = x + (y / 2) * stride1;
    dst1[uv_pos] = scale_to_uchar (sample.y);
    dst1[uv_pos + 1] = scale_to_uchar (sample.z);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int luma_pos = x + y * stride0;
    dst0[luma_pos] = blend_uchar (dst0[luma_pos], sample.x, sample.w);

    /* only the pixel with even x and even y blends the shared chroma pair */
    if (x % 2 != 0 || y % 2 != 0)
      return;

    unsigned int uv_pos = x + (y / 2) * stride1;
    dst1[uv_pos] = blend_uchar (dst1[uv_pos], sample.y, sample.w);
    dst1[uv_pos + 1] = blend_uchar (dst1[uv_pos + 1], sample.z, sample.w);
  }
};
|
|
|
|
/* 8-bit two-plane output: sample.x goes to dst0 at full resolution; the
 * byte pair (sample.z, sample.y) goes to dst1 once per 2x2 pixel block.
 * dst2 and dst3 are unused. */
struct OutputNV21 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    dst0[x + y * stride0] = scale_to_uchar (sample.x);

    /* only the pixel with even x and even y writes the shared chroma pair */
    if (x % 2 != 0 || y % 2 != 0)
      return;

    /* x is even here, so it is also the byte offset of the pair */
    unsigned int vu_pos = x + (y / 2) * stride1;
    dst1[vu_pos] = scale_to_uchar (sample.z);
    dst1[vu_pos + 1] = scale_to_uchar (sample.y);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int luma_pos = x + y * stride0;
    dst0[luma_pos] = blend_uchar (dst0[luma_pos], sample.x, sample.w);

    /* only the pixel with even x and even y blends the shared chroma pair */
    if (x % 2 != 0 || y % 2 != 0)
      return;

    unsigned int vu_pos = x + (y / 2) * stride1;
    dst1[vu_pos] = blend_uchar (dst1[vu_pos], sample.z, sample.w);
    dst1[vu_pos + 1] = blend_uchar (dst1[vu_pos + 1], sample.y, sample.w);
  }
};
|
|
|
|
/* 16-bit two-plane output: sample.x goes to dst0 as one 16-bit value per
 * pixel; the pair of 16-bit values (sample.y, sample.z) goes to dst1 once per
 * 2x2 pixel block. Samples are scaled to the full 16-bit range. Strides are
 * byte pitches. dst2 and dst3 are unused. */
struct OutputP010 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned short * luma = (unsigned short *) &dst0[x * 2 + y * stride0];
    *luma = scale_to_ushort (sample.x);

    /* only the pixel with even x and even y writes the shared chroma pair */
    if (x % 2 != 0 || y % 2 != 0)
      return;

    /* byte offset of the pair: (x / 2) pairs of 4 bytes each */
    unsigned int uv_pos = x * 2 + (y / 2) * stride1;
    unsigned short * u_out = (unsigned short *) &dst1[uv_pos];
    unsigned short * v_out = (unsigned short *) &dst1[uv_pos + 2];
    *u_out = scale_to_ushort (sample.y);
    *v_out = scale_to_ushort (sample.z);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int luma_pos = x * 2 + y * stride0;
    unsigned short * luma = (unsigned short *) &dst0[luma_pos];
    *luma = blend_ushort (*luma, sample.x, sample.w);

    /* only the pixel with even x and even y blends the shared chroma pair */
    if (x % 2 != 0 || y % 2 != 0)
      return;

    unsigned int uv_pos = x * 2 + (y / 2) * stride1;
    unsigned short * u_out = (unsigned short *) &dst1[uv_pos];
    *u_out = blend_ushort (*u_out, sample.y, sample.w);

    unsigned short * v_out = (unsigned short *) &dst1[uv_pos + 2];
    *v_out = blend_ushort (*v_out, sample.z, sample.w);
  }
};
|
|
|
|
/* 10-bit three-plane output with one 16-bit value per sample: sample.x goes
 * to dst0 at full resolution; sample.y goes to dst1 and sample.z to dst2 at
 * half resolution in both directions. Strides are byte pitches. dst3 is
 * unused. */
struct OutputI420_10 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_10bits (sample.x);
    if (x % 2 == 0 && y % 2 == 0) {
      /* chroma column is x / 2 with 2 bytes per sample, so the byte offset
       * within the row is x */
      unsigned int pos = x + (y / 2) * stride1;
      *(unsigned short *) &dst1[pos] = scale_to_10bits (sample.y);
      *(unsigned short *) &dst2[pos] = scale_to_10bits (sample.z);
    }
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int pos = x * 2 + y * stride0;
    unsigned short * target = (unsigned short *) &dst0[pos];
    *target = blend_10bits (*target, sample.x, sample.w);
    if (x % 2 == 0 && y % 2 == 0) {
      /* same chroma byte offset as Write(): (x / 2) samples * 2 bytes == x.
       * Using x * 2 here would address chroma column x, not x / 2. */
      pos = x + (y / 2) * stride1;
      target = (unsigned short *) &dst1[pos];
      *target = blend_10bits (*target, sample.y, sample.w);
      target = (unsigned short *) &dst2[pos];
      *target = blend_10bits (*target, sample.z, sample.w);
    }
  }
};
|
|
|
|
/* 12-bit three-plane output with one 16-bit value per sample: sample.x goes
 * to dst0 at full resolution; sample.y goes to dst1 and sample.z to dst2 at
 * half resolution in both directions. Strides are byte pitches. dst3 is
 * unused. */
struct OutputI420_12 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_12bits (sample.x);
    if (x % 2 == 0 && y % 2 == 0) {
      /* chroma column is x / 2 with 2 bytes per sample, so the byte offset
       * within the row is x */
      unsigned int pos = x + (y / 2) * stride1;
      *(unsigned short *) &dst1[pos] = scale_to_12bits (sample.y);
      *(unsigned short *) &dst2[pos] = scale_to_12bits (sample.z);
    }
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int pos = x * 2 + y * stride0;
    unsigned short * target = (unsigned short *) &dst0[pos];
    *target = blend_12bits (*target, sample.x, sample.w);
    if (x % 2 == 0 && y % 2 == 0) {
      /* same chroma byte offset as Write(): (x / 2) samples * 2 bytes == x.
       * Using x * 2 here would address chroma column x, not x / 2. */
      pos = x + (y / 2) * stride1;
      target = (unsigned short *) &dst1[pos];
      *target = blend_12bits (*target, sample.y, sample.w);
      target = (unsigned short *) &dst2[pos];
      *target = blend_12bits (*target, sample.z, sample.w);
    }
  }
};
|
|
|
|
/* 8-bit three-plane output at full resolution: sample.x, sample.y and
 * sample.z go to dst0, dst1 and dst2 at the same byte offset, computed with
 * stride0. dst3 and stride1 are unused. */
struct OutputY444 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = x + y * stride0;

    dst0[offset] = scale_to_uchar (sample.x);
    dst1[offset] = scale_to_uchar (sample.y);
    dst2[offset] = scale_to_uchar (sample.z);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = x + y * stride0;
    unsigned char * first = dst0 + offset;
    unsigned char * second = dst1 + offset;
    unsigned char * third = dst2 + offset;

    *first = blend_uchar (*first, sample.x, sample.w);
    *second = blend_uchar (*second, sample.y, sample.w);
    *third = blend_uchar (*third, sample.z, sample.w);
  }
};
|
|
|
|
/* 10-bit three-plane output at full resolution, one 16-bit value per sample:
 * sample.x, sample.y and sample.z go to dst0, dst1 and dst2 at the same byte
 * offset, computed with stride0. dst3 and stride1 are unused. */
struct OutputY444_10 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = x * 2 + y * stride0;
    unsigned short * first = (unsigned short *) (dst0 + offset);
    unsigned short * second = (unsigned short *) (dst1 + offset);
    unsigned short * third = (unsigned short *) (dst2 + offset);

    *first = scale_to_10bits (sample.x);
    *second = scale_to_10bits (sample.y);
    *third = scale_to_10bits (sample.z);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = x * 2 + y * stride0;
    unsigned short * first = (unsigned short *) (dst0 + offset);
    unsigned short * second = (unsigned short *) (dst1 + offset);
    unsigned short * third = (unsigned short *) (dst2 + offset);

    *first = blend_10bits (*first, sample.x, sample.w);
    *second = blend_10bits (*second, sample.y, sample.w);
    *third = blend_10bits (*third, sample.z, sample.w);
  }
};
|
|
|
|
/* 12-bit three-plane output at full resolution, one 16-bit value per sample:
 * sample.x, sample.y and sample.z go to dst0, dst1 and dst2 at the same byte
 * offset, computed with stride0. dst3 and stride1 are unused. */
struct OutputY444_12 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = x * 2 + y * stride0;
    unsigned short * first = (unsigned short *) (dst0 + offset);
    unsigned short * second = (unsigned short *) (dst1 + offset);
    unsigned short * third = (unsigned short *) (dst2 + offset);

    *first = scale_to_12bits (sample.x);
    *second = scale_to_12bits (sample.y);
    *third = scale_to_12bits (sample.z);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = x * 2 + y * stride0;
    unsigned short * first = (unsigned short *) (dst0 + offset);
    unsigned short * second = (unsigned short *) (dst1 + offset);
    unsigned short * third = (unsigned short *) (dst2 + offset);

    *first = blend_12bits (*first, sample.x, sample.w);
    *second = blend_12bits (*second, sample.y, sample.w);
    *third = blend_12bits (*third, sample.z, sample.w);
  }
};
|
|
|
|
/* 16-bit three-plane output at full resolution: sample.x, sample.y and
 * sample.z go to dst0, dst1 and dst2 at the same byte offset, computed with
 * stride0. dst3 and stride1 are unused. */
struct OutputY444_16 : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = x * 2 + y * stride0;
    unsigned short * first = (unsigned short *) (dst0 + offset);
    unsigned short * second = (unsigned short *) (dst1 + offset);
    unsigned short * third = (unsigned short *) (dst2 + offset);

    *first = scale_to_ushort (sample.x);
    *second = scale_to_ushort (sample.y);
    *third = scale_to_ushort (sample.z);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = x * 2 + y * stride0;
    unsigned short * first = (unsigned short *) (dst0 + offset);
    unsigned short * second = (unsigned short *) (dst1 + offset);
    unsigned short * third = (unsigned short *) (dst2 + offset);

    *first = blend_ushort (*first, sample.x, sample.w);
    *second = blend_ushort (*second, sample.y, sample.w);
    *third = blend_ushort (*third, sample.z, sample.w);
  }
};
|
|
|
|
/* Packed 8-bit output, 4 bytes per pixel in dst0, byte order
 * sample.x, sample.y, sample.z, sample.w. Only dst0 and stride0 are used. */
struct OutputRGBA : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned char * px = dst0 + (x * 4 + y * stride0);

    px[0] = scale_to_uchar (sample.x);
    px[1] = scale_to_uchar (sample.y);
    px[2] = scale_to_uchar (sample.z);
    px[3] = scale_to_uchar (sample.w);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned char * px = dst0 + (x * 4 + y * stride0);

    px[0] = blend_uchar (px[0], sample.x, sample.w);
    px[1] = blend_uchar (px[1], sample.y, sample.w);
    px[2] = blend_uchar (px[2], sample.z, sample.w);
    /* the alpha byte is blended towards 1.0 using the source alpha */
    px[3] = blend_uchar (px[3], 1.0, sample.w);
  }
};
|
|
|
|
/* Packed 8-bit output, 4 bytes per pixel in dst0, byte order
 * sample.x, sample.y, sample.z followed by a constant 255.
 * Only dst0 and stride0 are used. */
struct OutputRGBx : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned char * px = dst0 + (x * 4 + y * stride0);

    px[0] = scale_to_uchar (sample.x);
    px[1] = scale_to_uchar (sample.y);
    px[2] = scale_to_uchar (sample.z);
    px[3] = 255;
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned char * px = dst0 + (x * 4 + y * stride0);

    px[0] = blend_uchar (px[0], sample.x, sample.w);
    px[1] = blend_uchar (px[1], sample.y, sample.w);
    px[2] = blend_uchar (px[2], sample.z, sample.w);
    px[3] = 255;
  }
};
|
|
|
|
/* Packed 8-bit output, 4 bytes per pixel in dst0, byte order
 * sample.z, sample.y, sample.x, sample.w. Only dst0 and stride0 are used. */
struct OutputBGRA : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned char * px = dst0 + (x * 4 + y * stride0);

    px[0] = scale_to_uchar (sample.z);
    px[1] = scale_to_uchar (sample.y);
    px[2] = scale_to_uchar (sample.x);
    px[3] = scale_to_uchar (sample.w);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned char * px = dst0 + (x * 4 + y * stride0);

    px[0] = blend_uchar (px[0], sample.z, sample.w);
    px[1] = blend_uchar (px[1], sample.y, sample.w);
    px[2] = blend_uchar (px[2], sample.x, sample.w);
    /* the alpha byte is blended towards 1.0 using the source alpha */
    px[3] = blend_uchar (px[3], 1.0, sample.w);
  }
};
|
|
|
|
/* Packed 8-bit output, 4 bytes per pixel in dst0, byte order
 * sample.z, sample.y, sample.x followed by a constant 255.
 * Only dst0 and stride0 are used. */
struct OutputBGRx : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned char * px = dst0 + (x * 4 + y * stride0);

    px[0] = scale_to_uchar (sample.z);
    px[1] = scale_to_uchar (sample.y);
    px[2] = scale_to_uchar (sample.x);
    px[3] = 255;
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned char * px = dst0 + (x * 4 + y * stride0);

    px[0] = blend_uchar (px[0], sample.z, sample.w);
    px[1] = blend_uchar (px[1], sample.y, sample.w);
    px[2] = blend_uchar (px[2], sample.x, sample.w);
    px[3] = 255;
  }
};
|
|
|
|
/* Packed 8-bit output, 4 bytes per pixel in dst0, byte order
 * sample.w, sample.x, sample.y, sample.z. Only dst0 and stride0 are used. */
struct OutputARGB : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned char * px = dst0 + (x * 4 + y * stride0);

    px[0] = scale_to_uchar (sample.w);
    px[1] = scale_to_uchar (sample.x);
    px[2] = scale_to_uchar (sample.y);
    px[3] = scale_to_uchar (sample.z);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned char * px = dst0 + (x * 4 + y * stride0);

    /* the alpha byte is blended towards 1.0 using the source alpha */
    px[0] = blend_uchar (px[0], 1.0, sample.w);
    px[1] = blend_uchar (px[1], sample.x, sample.w);
    px[2] = blend_uchar (px[2], sample.y, sample.w);
    px[3] = blend_uchar (px[3], sample.z, sample.w);
  }
};
|
|
|
|
/* Packed 8-bit output, 4 bytes per pixel in dst0, byte order
 * sample.w, sample.z, sample.y, sample.x. Only dst0 and stride0 are used. */
struct OutputABGR : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned char * px = dst0 + (x * 4 + y * stride0);

    px[0] = scale_to_uchar (sample.w);
    px[1] = scale_to_uchar (sample.z);
    px[2] = scale_to_uchar (sample.y);
    px[3] = scale_to_uchar (sample.x);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned char * px = dst0 + (x * 4 + y * stride0);

    /* the alpha byte is blended towards 1.0 using the source alpha */
    px[0] = blend_uchar (px[0], 1.0, sample.w);
    px[1] = blend_uchar (px[1], sample.z, sample.w);
    px[2] = blend_uchar (px[2], sample.y, sample.w);
    px[3] = blend_uchar (px[3], sample.x, sample.w);
  }
};
|
|
|
|
/* Packed 8-bit output, 3 bytes per pixel in dst0, byte order
 * sample.x, sample.y, sample.z. Only dst0 and stride0 are used. */
struct OutputRGB : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned char * px = dst0 + (x * 3 + y * stride0);

    px[0] = scale_to_uchar (sample.x);
    px[1] = scale_to_uchar (sample.y);
    px[2] = scale_to_uchar (sample.z);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned char * px = dst0 + (x * 3 + y * stride0);

    px[0] = blend_uchar (px[0], sample.x, sample.w);
    px[1] = blend_uchar (px[1], sample.y, sample.w);
    px[2] = blend_uchar (px[2], sample.z, sample.w);
  }
};
|
|
|
|
/* Packed 8-bit output, 3 bytes per pixel in dst0, byte order
 * sample.z, sample.y, sample.x. Only dst0 and stride0 are used. */
struct OutputBGR : public IOutput
{
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned char * px = dst0 + (x * 3 + y * stride0);

    px[0] = scale_to_uchar (sample.z);
    px[1] = scale_to_uchar (sample.y);
    px[2] = scale_to_uchar (sample.x);
  }

  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned char * px = dst0 + (x * 3 + y * stride0);

    px[0] = blend_uchar (px[0], sample.z, sample.w);
    px[1] = blend_uchar (px[1], sample.y, sample.w);
    px[2] = blend_uchar (px[2], sample.x, sample.w);
  }
};
|
|
|
|
/* Splits a packed 32-bit word into three 10-bit fields (bits 0-9 -> x,
 * bits 10-19 -> y, bits 20-29 -> z) and widens each to 16 bits by shifting
 * left by 6 and copying the top 6 bits into the low bits. The upper two bits
 * of @val are ignored. */
__device__ inline ushort3
unpack_rgb10a2 (unsigned int val)
{
  const unsigned short r10 = (val & 0x3ff);
  const unsigned short g10 = ((val >> 10) & 0x3ff);
  const unsigned short b10 = ((val >> 20) & 0x3ff);

  const unsigned short r16 = (r10 << 6) | (r10 >> 4);
  const unsigned short g16 = (g10 << 6) | (g10 >> 4);
  const unsigned short b16 = (b10 << 6) | (b10 >> 4);

  return make_ushort3 (r16, g16, b16);
}
|
|
|
|
/* Packed 32-bit writer: sample.x in bits 0-9, sample.y in bits 10-19,
 * sample.z in bits 20-29 and a 2-bit alpha in bits 30-31, stored as one
 * unsigned int per pixel in plane 0.
 * dst1, dst2, dst3 and stride1 are not touched. */
struct OutputRGB10A2 : public IOutput
{
  /* Overwrite the pixel at (x, y) */
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int alpha = (unsigned int) scale_to_2bits (sample.w);
    unsigned int packed_rgb = alpha << 30;
    packed_rgb |= ((unsigned int) scale_to_10bits (sample.x));
    packed_rgb |= ((unsigned int) scale_to_10bits (sample.y)) << 10;
    packed_rgb |= ((unsigned int) scale_to_10bits (sample.z)) << 20;
    *(unsigned int *) &dst0[x * 4 + y * stride0] = packed_rgb;
  }

  /* Blend the colour channels over the existing pixel using sample.w as the
   * weight. The stored alpha bits are replaced by the quantised sample.w. */
  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int * target = (unsigned int *) &dst0[x * 4 + y * stride0];
    ushort3 val = unpack_rgb10a2 (*target);

    /* unpack_rgb10a2() widens every channel to 16 bits, whereas
     * blend_10bits() normalises its destination argument by 1023.
     * Shifting right by 6 recovers the original 10-bit value exactly, so
     * the destination term is no longer ~64x too large (and clamped). */
    unsigned short dst_x = (unsigned short) (val.x >> 6);
    unsigned short dst_y = (unsigned short) (val.y >> 6);
    unsigned short dst_z = (unsigned short) (val.z >> 6);

    unsigned int alpha = (unsigned int) scale_to_2bits (sample.w);
    unsigned int packed_rgb = alpha << 30;
    packed_rgb |= ((unsigned int) blend_10bits (dst_x, sample.x, sample.w));
    packed_rgb |= ((unsigned int) blend_10bits (dst_y, sample.y, sample.w)) << 10;
    packed_rgb |= ((unsigned int) blend_10bits (dst_z, sample.z, sample.w)) << 20;
    *target = packed_rgb;
  }
};
|
|
|
|
/* Split a packed 32-bit word into three 10-bit fields and widen each one to
 * 16 bits by replicating its top bits into the low 6 bits.
 * The returned vector is ordered (bits 20-29, bits 10-19, bits 0-9); the top
 * two bits of the word are ignored. */
__device__ inline ushort3
unpack_bgr10a2 (unsigned int val)
{
  const unsigned int b10 = val & 0x3ff;
  const unsigned int g10 = (val >> 10) & 0x3ff;
  const unsigned int r10 = (val >> 20) & 0x3ff;

  return make_ushort3 ((unsigned short) ((r10 << 6) | (r10 >> 4)),
      (unsigned short) ((g10 << 6) | (g10 >> 4)),
      (unsigned short) ((b10 << 6) | (b10 >> 4)));
}
|
|
|
|
/* Packed 32-bit writer: sample.z in bits 0-9, sample.y in bits 10-19,
 * sample.x in bits 20-29 and a 2-bit alpha in bits 30-31, stored as one
 * unsigned int per pixel in plane 0.
 * dst1, dst2, dst3 and stride1 are not touched. */
struct OutputBGR10A2 : public IOutput
{
  /* Overwrite the pixel at (x, y) */
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    /* Alpha lives in sample.w; it was previously taken from sample.x,
     * which made the stored alpha follow the first colour channel */
    unsigned int alpha = (unsigned int) scale_to_2bits (sample.w);
    unsigned int packed_rgb = alpha << 30;
    packed_rgb |= ((unsigned int) scale_to_10bits (sample.x)) << 20;
    packed_rgb |= ((unsigned int) scale_to_10bits (sample.y)) << 10;
    packed_rgb |= ((unsigned int) scale_to_10bits (sample.z));
    *(unsigned int *) &dst0[x * 4 + y * stride0] = packed_rgb;
  }

  /* Blend the colour channels over the existing pixel using sample.w as the
   * weight. The stored alpha bits are replaced by the quantised sample.w. */
  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int * target = (unsigned int *) &dst0[x * 4 + y * stride0];
    ushort3 val = unpack_bgr10a2 (*target);

    /* unpack_bgr10a2() widens every channel to 16 bits, whereas
     * blend_10bits() normalises its destination argument by 1023.
     * Shifting right by 6 recovers the original 10-bit value exactly, so
     * the destination term is no longer ~64x too large (and clamped). */
    unsigned short dst_x = (unsigned short) (val.x >> 6);
    unsigned short dst_y = (unsigned short) (val.y >> 6);
    unsigned short dst_z = (unsigned short) (val.z >> 6);

    unsigned int alpha = (unsigned int) scale_to_2bits (sample.w);
    unsigned int packed_rgb = alpha << 30;
    packed_rgb |= ((unsigned int) blend_10bits (dst_x, sample.x, sample.w)) << 20;
    packed_rgb |= ((unsigned int) blend_10bits (dst_y, sample.y, sample.w)) << 10;
    packed_rgb |= ((unsigned int) blend_10bits (dst_z, sample.z, sample.w));
    *target = packed_rgb;
  }
};
|
|
|
|
/* Three-plane 8-bit writer with horizontally subsampled planes 1 and 2:
 * sample.x goes to plane 0 for every pixel, while sample.y / sample.z go to
 * planes 1 / 2 only for even x, at column x / 2 of row y.
 * Plane 0 uses stride0, planes 1 and 2 use stride1. dst3 is not touched. */
struct OutputY42B : public IOutput
{
  /* Overwrite the sample(s) belonging to pixel (x, y) */
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    dst0[y * stride0 + x] = scale_to_uchar (sample.x);

    /* Odd columns carry no sample in planes 1 and 2 */
    if (x % 2 != 0)
      return;

    const unsigned int sub_pos = y * stride1 + x / 2;
    dst1[sub_pos] = scale_to_uchar (sample.y);
    dst2[sub_pos] = scale_to_uchar (sample.z);
  }

  /* Blend the sample(s) over the existing values, weighted by sample.w */
  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const float weight = sample.w;
    const unsigned int full_pos = y * stride0 + x;

    dst0[full_pos] = blend_uchar (dst0[full_pos], sample.x, weight);

    /* Odd columns carry no sample in planes 1 and 2 */
    if (x % 2 != 0)
      return;

    const unsigned int sub_pos = y * stride1 + x / 2;
    dst1[sub_pos] = blend_uchar (dst1[sub_pos], sample.y, weight);
    dst2[sub_pos] = blend_uchar (dst2[sub_pos], sample.z, weight);
  }
};
|
|
|
|
/* Three-plane writer with 10-bit values stored in 16-bit words and
 * horizontally subsampled planes 1 and 2: sample.x goes to plane 0 for
 * every pixel, sample.y / sample.z go to planes 1 / 2 only for even x.
 * Plane 0 uses stride0, planes 1 and 2 use stride1 (both in bytes).
 * dst3 is not touched. */
struct OutputI422_10 : public IOutput
{
  /* Overwrite the sample(s) belonging to pixel (x, y) */
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_10bits (sample.x);
    if (x % 2 == 0) {
      /* Sample index x / 2 at two bytes per sample: byte offset is x */
      unsigned int pos = x + y * stride1;
      *(unsigned short *) &dst1[pos] = scale_to_10bits (sample.y);
      *(unsigned short *) &dst2[pos] = scale_to_10bits (sample.z);
    }
  }

  /* Blend the sample(s) over the existing values, weighted by sample.w */
  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int pos = x * 2 + y * stride0;
    unsigned short * target = (unsigned short *) &dst0[pos];
    *target = blend_10bits (*target, sample.x, sample.w);
    if (x % 2 == 0) {
      /* Byte offset of 16-bit sample x / 2 is x, matching Write(). The
       * former "x / 2" addressed the wrong sample and, whenever x / 2 was
       * odd, produced a misaligned 16-bit access. */
      pos = x + y * stride1;
      target = (unsigned short *) &dst1[pos];
      *target = blend_10bits (*target, sample.y, sample.w);
      target = (unsigned short *) &dst2[pos];
      *target = blend_10bits (*target, sample.z, sample.w);
    }
  }
};
|
|
|
|
/* Three-plane writer with 12-bit values stored in 16-bit words and
 * horizontally subsampled planes 1 and 2: sample.x goes to plane 0 for
 * every pixel, sample.y / sample.z go to planes 1 / 2 only for even x.
 * Plane 0 uses stride0, planes 1 and 2 use stride1 (both in bytes).
 * dst3 is not touched. */
struct OutputI422_12 : public IOutput
{
  /* Overwrite the sample(s) belonging to pixel (x, y) */
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_12bits (sample.x);
    if (x % 2 == 0) {
      /* Sample index x / 2 at two bytes per sample: byte offset is x */
      unsigned int pos = x + y * stride1;
      *(unsigned short *) &dst1[pos] = scale_to_12bits (sample.y);
      *(unsigned short *) &dst2[pos] = scale_to_12bits (sample.z);
    }
  }

  /* Blend the sample(s) over the existing values, weighted by sample.w */
  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    unsigned int pos = x * 2 + y * stride0;
    unsigned short * target = (unsigned short *) &dst0[pos];
    *target = blend_12bits (*target, sample.x, sample.w);
    if (x % 2 == 0) {
      /* Byte offset of 16-bit sample x / 2 is x, matching Write(). The
       * former "x / 2" addressed the wrong sample and, whenever x / 2 was
       * odd, produced a misaligned 16-bit access. */
      pos = x + y * stride1;
      target = (unsigned short *) &dst1[pos];
      *target = blend_12bits (*target, sample.y, sample.w);
      target = (unsigned short *) &dst2[pos];
      *target = blend_12bits (*target, sample.z, sample.w);
    }
  }
};
|
|
|
|
/* Three-plane 8-bit writer without subsampling: sample.x, sample.y and
 * sample.z go to planes 0, 1 and 2 at the same offset, all addressed with
 * stride0. dst3 and stride1 are not touched. */
struct OutputRGBP : public IOutput
{
  /* Overwrite the pixel at (x, y); sample.w is not stored */
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = y * stride0 + x;

    dst0[offset] = scale_to_uchar (sample.x);
    dst1[offset] = scale_to_uchar (sample.y);
    dst2[offset] = scale_to_uchar (sample.z);
  }

  /* Blend the sample over the existing pixel using sample.w as the weight */
  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = y * stride0 + x;
    const float weight = sample.w;

    dst0[offset] = blend_uchar (dst0[offset], sample.x, weight);
    dst1[offset] = blend_uchar (dst1[offset], sample.y, weight);
    dst2[offset] = blend_uchar (dst2[offset], sample.z, weight);
  }
};
|
|
|
|
/* Three-plane 8-bit writer without subsampling and with reversed plane
 * order: sample.z, sample.y and sample.x go to planes 0, 1 and 2 at the same
 * offset, all addressed with stride0. dst3 and stride1 are not touched. */
struct OutputBGRP : public IOutput
{
  /* Overwrite the pixel at (x, y); sample.w is not stored */
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = y * stride0 + x;

    dst0[offset] = scale_to_uchar (sample.z);
    dst1[offset] = scale_to_uchar (sample.y);
    dst2[offset] = scale_to_uchar (sample.x);
  }

  /* Blend the sample over the existing pixel using sample.w as the weight */
  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = y * stride0 + x;
    const float weight = sample.w;

    dst0[offset] = blend_uchar (dst0[offset], sample.z, weight);
    dst1[offset] = blend_uchar (dst1[offset], sample.y, weight);
    dst2[offset] = blend_uchar (dst2[offset], sample.x, weight);
  }
};
|
|
|
|
/* Three-plane 8-bit writer without subsampling: sample.y, sample.z and
 * sample.x go to planes 0, 1 and 2 at the same offset, all addressed with
 * stride0. dst3 and stride1 are not touched. */
struct OutputGBR : public IOutput
{
  /* Overwrite the pixel at (x, y); sample.w is not stored */
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = y * stride0 + x;

    dst0[offset] = scale_to_uchar (sample.y);
    dst1[offset] = scale_to_uchar (sample.z);
    dst2[offset] = scale_to_uchar (sample.x);
  }

  /* Blend the sample over the existing pixel using sample.w as the weight */
  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = y * stride0 + x;
    const float weight = sample.w;

    dst0[offset] = blend_uchar (dst0[offset], sample.y, weight);
    dst1[offset] = blend_uchar (dst1[offset], sample.z, weight);
    dst2[offset] = blend_uchar (dst2[offset], sample.x, weight);
  }
};
|
|
|
|
/* Three-plane writer with 10-bit values stored in 16-bit words and no
 * subsampling: sample.y, sample.z and sample.x go to planes 0, 1 and 2 at
 * the same byte offset, all addressed with stride0.
 * dst3 and stride1 are not touched. */
struct OutputGBR_10 : public IOutput
{
  /* Overwrite the pixel at (x, y); sample.w is not stored */
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = y * stride0 + x * 2;
    unsigned short * plane0 = (unsigned short *) &dst0[offset];
    unsigned short * plane1 = (unsigned short *) &dst1[offset];
    unsigned short * plane2 = (unsigned short *) &dst2[offset];

    *plane0 = scale_to_10bits (sample.y);
    *plane1 = scale_to_10bits (sample.z);
    *plane2 = scale_to_10bits (sample.x);
  }

  /* Blend the sample over the existing pixel using sample.w as the weight */
  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = y * stride0 + x * 2;
    const float weight = sample.w;
    unsigned short * plane0 = (unsigned short *) &dst0[offset];
    unsigned short * plane1 = (unsigned short *) &dst1[offset];
    unsigned short * plane2 = (unsigned short *) &dst2[offset];

    *plane0 = blend_10bits (*plane0, sample.y, weight);
    *plane1 = blend_10bits (*plane1, sample.z, weight);
    *plane2 = blend_10bits (*plane2, sample.x, weight);
  }
};
|
|
|
|
/* Three-plane writer with 12-bit values stored in 16-bit words and no
 * subsampling: sample.y, sample.z and sample.x go to planes 0, 1 and 2 at
 * the same byte offset, all addressed with stride0.
 * dst3 and stride1 are not touched. */
struct OutputGBR_12 : public IOutput
{
  /* Overwrite the pixel at (x, y); sample.w is not stored */
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = y * stride0 + x * 2;
    unsigned short * plane0 = (unsigned short *) &dst0[offset];
    unsigned short * plane1 = (unsigned short *) &dst1[offset];
    unsigned short * plane2 = (unsigned short *) &dst2[offset];

    *plane0 = scale_to_12bits (sample.y);
    *plane1 = scale_to_12bits (sample.z);
    *plane2 = scale_to_12bits (sample.x);
  }

  /* Blend the sample over the existing pixel using sample.w as the weight */
  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = y * stride0 + x * 2;
    const float weight = sample.w;
    unsigned short * plane0 = (unsigned short *) &dst0[offset];
    unsigned short * plane1 = (unsigned short *) &dst1[offset];
    unsigned short * plane2 = (unsigned short *) &dst2[offset];

    *plane0 = blend_12bits (*plane0, sample.y, weight);
    *plane1 = blend_12bits (*plane1, sample.z, weight);
    *plane2 = blend_12bits (*plane2, sample.x, weight);
  }
};
|
|
|
|
/* Three-plane 16-bit writer without subsampling: sample.y, sample.z and
 * sample.x go to planes 0, 1 and 2 at the same byte offset, all addressed
 * with stride0. dst3 and stride1 are not touched. */
struct OutputGBR_16 : public IOutput
{
  /* Overwrite the pixel at (x, y); sample.w is not stored */
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = y * stride0 + x * 2;
    unsigned short * plane0 = (unsigned short *) &dst0[offset];
    unsigned short * plane1 = (unsigned short *) &dst1[offset];
    unsigned short * plane2 = (unsigned short *) &dst2[offset];

    *plane0 = scale_to_ushort (sample.y);
    *plane1 = scale_to_ushort (sample.z);
    *plane2 = scale_to_ushort (sample.x);
  }

  /* Blend the sample over the existing pixel using sample.w as the weight */
  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = y * stride0 + x * 2;
    const float weight = sample.w;
    unsigned short * plane0 = (unsigned short *) &dst0[offset];
    unsigned short * plane1 = (unsigned short *) &dst1[offset];
    unsigned short * plane2 = (unsigned short *) &dst2[offset];

    *plane0 = blend_ushort (*plane0, sample.y, weight);
    *plane1 = blend_ushort (*plane1, sample.z, weight);
    *plane2 = blend_ushort (*plane2, sample.x, weight);
  }
};
|
|
|
|
/* Four-plane 8-bit writer without subsampling: sample.y, sample.z, sample.x
 * and sample.w go to planes 0, 1, 2 and 3 at the same offset, all addressed
 * with stride0. stride1 is not used. */
struct OutputGBRA : public IOutput
{
  /* Overwrite the pixel at (x, y), including its alpha plane */
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = y * stride0 + x;

    dst0[offset] = scale_to_uchar (sample.y);
    dst1[offset] = scale_to_uchar (sample.z);
    dst2[offset] = scale_to_uchar (sample.x);
    dst3[offset] = scale_to_uchar (sample.w);
  }

  /* Blend the sample over the existing pixel using sample.w as the weight.
   * The alpha plane is blended towards 1.0 with the same weight. */
  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = y * stride0 + x;
    const float weight = sample.w;

    dst0[offset] = blend_uchar (dst0[offset], sample.y, weight);
    dst1[offset] = blend_uchar (dst1[offset], sample.z, weight);
    dst2[offset] = blend_uchar (dst2[offset], sample.x, weight);
    dst3[offset] = blend_uchar (dst3[offset], 1.0f, weight);
  }
};
|
|
|
|
/* Packed 4-bytes-per-pixel writer: sample.z, sample.y, sample.x and
 * sample.w go to bytes 0, 1, 2 and 3 of the pixel inside plane 0.
 * dst1, dst2, dst3 and stride1 are not touched. */
struct OutputVUYA : public IOutput
{
  /* Overwrite the pixel at (x, y), including its alpha byte */
  __device__ void
  Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = y * stride0 + x * 4;
    unsigned char * pixel = &dst0[offset];

    pixel[0] = scale_to_uchar (sample.z);
    pixel[1] = scale_to_uchar (sample.y);
    pixel[2] = scale_to_uchar (sample.x);
    pixel[3] = scale_to_uchar (sample.w);
  }

  /* Blend the sample over the existing pixel using sample.w as the weight.
   * The alpha byte is blended towards 1.0 with the same weight. */
  __device__ void
  Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,
      unsigned char * dst3, float4 sample, int x, int y, int stride0,
      int stride1)
  {
    const int offset = y * stride0 + x * 4;
    unsigned char * pixel = &dst0[offset];
    const float weight = sample.w;

    pixel[0] = blend_uchar (pixel[0], sample.z, weight);
    pixel[1] = blend_uchar (pixel[1], sample.y, weight);
    pixel[2] = blend_uchar (pixel[2], sample.x, weight);
    pixel[3] = blend_uchar (pixel[3], 1.0f, weight);
  }
};
|
|
|
|
/* Pass-through mapping: returns (x, y) unchanged */
__device__ inline float2
rotate_identity (float x, float y)
{
  float2 coord;

  coord.x = x;
  coord.y = y;

  return coord;
}
|
|
|
|
/* Coordinate mapping (x, y) -> (y, 1 - x).
 * Uses a float literal so the subtraction is not promoted to double. */
__device__ inline float2
rotate_90r (float x, float y)
{
  return make_float2 (y, 1.0f - x);
}
|
|
|
|
/* Coordinate mapping (x, y) -> (1 - x, 1 - y).
 * Uses float literals so the subtractions are not promoted to double. */
__device__ inline float2
rotate_180 (float x, float y)
{
  return make_float2 (1.0f - x, 1.0f - y);
}
|
|
|
|
/* Coordinate mapping (x, y) -> (1 - y, x).
 * Uses a float literal so the subtraction is not promoted to double. */
__device__ inline float2
rotate_90l (float x, float y)
{
  return make_float2 (1.0f - y, x);
}
|
|
|
|
/* Coordinate mapping (x, y) -> (1 - x, y).
 * Uses a float literal so the subtraction is not promoted to double. */
__device__ inline float2
rotate_horiz (float x, float y)
{
  return make_float2 (1.0f - x, y);
}
|
|
|
|
/* Coordinate mapping (x, y) -> (x, 1 - y).
 * Uses a float literal so the subtraction is not promoted to double. */
__device__ inline float2
rotate_vert (float x, float y)
{
  return make_float2 (x, 1.0f - y);
}
|
|
|
|
/* Coordinate mapping (x, y) -> (y, x): swaps the two coordinates */
__device__ inline float2
rotate_ul_lr (float x, float y)
{
  float2 coord;

  coord.x = y;
  coord.y = x;

  return coord;
}
|
|
|
|
/* Coordinate mapping (x, y) -> (1 - y, 1 - x).
 * Uses float literals so the subtractions are not promoted to double. */
__device__ inline float2
rotate_ur_ll (float x, float y)
{
  return make_float2 (1.0f - y, 1.0f - x);
}
|
|
/* Select the coordinate mapping for the given direction code:
 *   1 -> rotate_90r    2 -> rotate_180    3 -> rotate_90l
 *   4 -> rotate_horiz  5 -> rotate_vert   6 -> rotate_ul_lr
 *   7 -> rotate_ur_ll
 * Any other value leaves (x, y) unchanged. */
__device__ inline float2
do_rotate (float x, float y, int direction)
{
  if (direction == 1)
    return rotate_90r (x, y);
  if (direction == 2)
    return rotate_180 (x, y);
  if (direction == 3)
    return rotate_90l (x, y);
  if (direction == 4)
    return rotate_horiz (x, y);
  if (direction == 5)
    return rotate_vert (x, y);
  if (direction == 6)
    return rotate_ul_lr (x, y);
  if (direction == 7)
    return rotate_ur_ll (x, y);

  return rotate_identity (x, y);
}
|
|
|
|
extern "C" {
/* Conversion kernel entry point; each thread handles one output pixel.
 *
 * The pixel position is the thread's 2D global index plus (off_x, off_y).
 * Threads outside [0, width) x [0, height) exit immediately, as does every
 * thread when the view size is not positive.
 *
 * Pixels outside the [left, right) x [top, bottom) rectangle receive the
 * border colour when fill_border is set and are otherwise left untouched.
 * Pixels inside it are sampled from tex0..tex3 at coordinates normalised by
 * view_width / view_height, remapped through do_rotate(), and optionally
 * run through the colour matrix (do_convert).
 *
 * The resulting alpha is scaled by const_buf.alpha, then the sample is
 * either written or blended into dst0..dst3 depending on do_blend.
 *
 * SAMPLER and OUTPUT are not declared in this part of the file; presumably
 * they are supplied as preprocessor definitions at compile time to select
 * the input sampler and output writer types - TODO confirm. */
__global__ void
GstCudaConverterMain (cudaTextureObject_t tex0, cudaTextureObject_t tex1,
    cudaTextureObject_t tex2, cudaTextureObject_t tex3, unsigned char * dst0,
    unsigned char * dst1, unsigned char * dst2, unsigned char * dst3,
    int stride0, int stride1, ConstBuffer const_buf, int off_x, int off_y)
{
  ConvertSimple g_converter;
  SAMPLER g_sampler;
  OUTPUT g_output;
  int x_pos = blockIdx.x * blockDim.x + threadIdx.x + off_x;
  int y_pos = blockIdx.y * blockDim.y + threadIdx.y + off_y;
  float4 sample;

  if (x_pos >= const_buf.width || y_pos >= const_buf.height ||
      const_buf.view_width <= 0 || const_buf.view_height <= 0)
    return;

  if (x_pos < const_buf.left || x_pos >= const_buf.right ||
      y_pos < const_buf.top || y_pos >= const_buf.bottom) {
    /* Border area: fill with the border colour or leave as is */
    if (!const_buf.fill_border)
      return;

    sample = make_float4 (const_buf.border_x, const_buf.border_y,
        const_buf.border_z, const_buf.border_w);
  } else {
    /* Pixel centre (+0.5) normalised by the view size. Float literals keep
     * the whole computation in single precision instead of promoting the
     * per-pixel math to double. */
    float x = (__int2float_rz (x_pos - const_buf.left) + 0.5f) / const_buf.view_width;
    if (x < 0.0f || x > 1.0f)
      return;

    float y = (__int2float_rz (y_pos - const_buf.top) + 0.5f) / const_buf.view_height;
    if (y < 0.0f || y > 1.0f)
      return;

    float2 rotated = do_rotate (x, y, const_buf.video_direction);
    float4 s = g_sampler.Execute (tex0, tex1, tex2, tex3, rotated.x, rotated.y);
    float3 rgb = make_float3 (s.x, s.y, s.z);
    float3 yuv;
    if (const_buf.do_convert)
      yuv = g_converter.Execute (rgb, &const_buf.matrix);
    else
      yuv = rgb;

    /* Colour comes from the (optionally converted) sample, alpha from the
     * sampler output */
    sample = make_float4 (yuv.x, yuv.y, yuv.z, s.w);
  }

  /* Apply the global alpha to both border and sampled pixels */
  sample.w = sample.w * const_buf.alpha;
  if (!const_buf.do_blend) {
    g_output.Write (dst0, dst1, dst2, dst3, sample, x_pos, y_pos, stride0, stride1);
  } else {
    g_output.Blend (dst0, dst1, dst2, dst3, sample, x_pos, y_pos, stride0, stride1);
  }
}
}
|
|
#else
|
|
static const char GstCudaConverterMain_str[] =
|
|
"struct ColorMatrix\n"
|
|
"{\n"
|
|
" float CoeffX[3];\n"
|
|
" float CoeffY[3];\n"
|
|
" float CoeffZ[3];\n"
|
|
" float Offset[3];\n"
|
|
" float Min[3];\n"
|
|
" float Max[3];\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct ConstBuffer\n"
|
|
"{\n"
|
|
" ColorMatrix matrix;\n"
|
|
" int width;\n"
|
|
" int height;\n"
|
|
" int left;\n"
|
|
" int top;\n"
|
|
" int right;\n"
|
|
" int bottom;\n"
|
|
" int view_width;\n"
|
|
" int view_height;\n"
|
|
" float border_x;\n"
|
|
" float border_y;\n"
|
|
" float border_z;\n"
|
|
" float border_w;\n"
|
|
" int fill_border;\n"
|
|
" int video_direction;\n"
|
|
" float alpha;\n"
|
|
" int do_blend;\n"
|
|
" int do_convert;\n"
|
|
"};\n"
|
|
"\n"
|
|
"__device__ inline float\n"
|
|
"dot (const float coeff[3], float3 val)\n"
|
|
"{\n"
|
|
" return coeff[0] * val.x + coeff[1] * val.y + coeff[2] * val.z;\n"
|
|
"}\n"
|
|
"\n"
|
|
"__device__ inline float\n"
|
|
"clamp (float val, float min_val, float max_val)\n"
|
|
"{\n"
|
|
" return max (min_val, min (val, max_val));\n"
|
|
"}\n"
|
|
"\n"
|
|
"__device__ inline float3\n"
|
|
"clamp3 (float3 val, const float min_val[3], const float max_val[3])\n"
|
|
"{\n"
|
|
" return make_float3 (clamp (val.x, min_val[0], max_val[0]),\n"
|
|
" clamp (val.y, min_val[1], max_val[2]),\n"
|
|
" clamp (val.z, min_val[1], max_val[2]));\n"
|
|
"}\n"
|
|
"\n"
|
|
"__device__ inline unsigned char\n"
|
|
"scale_to_2bits (float val)\n"
|
|
"{\n"
|
|
" return (unsigned short) __float2int_rz (val * 3.0);\n"
|
|
"}\n"
|
|
"\n"
|
|
"__device__ inline unsigned char\n"
|
|
"scale_to_uchar (float val)\n"
|
|
"{\n"
|
|
" return (unsigned char) __float2int_rz (val * 255.0);\n"
|
|
"}\n"
|
|
"\n"
|
|
"__device__ inline unsigned short\n"
|
|
"scale_to_ushort (float val)\n"
|
|
"{\n"
|
|
" return (unsigned short) __float2int_rz (val * 65535.0);\n"
|
|
"}\n"
|
|
"\n"
|
|
"__device__ inline unsigned short\n"
|
|
"scale_to_10bits (float val)\n"
|
|
"{\n"
|
|
" return (unsigned short) __float2int_rz (val * 1023.0);\n"
|
|
"}\n"
|
|
"\n"
|
|
"__device__ inline unsigned short\n"
|
|
"scale_to_12bits (float val)\n"
|
|
"{\n"
|
|
" return (unsigned short) __float2int_rz (val * 4095.0);\n"
|
|
"}\n"
|
|
"\n"
|
|
"__device__ inline unsigned char\n"
|
|
"blend_uchar (unsigned char dst, float src, float src_alpha)\n"
|
|
"{\n"
|
|
" // DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor\n"
|
|
" float src_val = src * src_alpha;\n"
|
|
" float dst_val = __int2float_rz (dst) / 255.0 * (1.0 - src_alpha);\n"
|
|
" return scale_to_uchar(clamp(src_val + dst_val, 0, 1.0));\n"
|
|
"}\n"
|
|
"\n"
|
|
"__device__ inline unsigned short\n"
|
|
"blend_ushort (unsigned short dst, float src, float src_alpha)\n"
|
|
"{\n"
|
|
" // DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor\n"
|
|
" float src_val = src * src_alpha;\n"
|
|
" float dst_val = __int2float_rz (dst) / 65535.0 * (1.0 - src_alpha);\n"
|
|
" return scale_to_ushort(clamp(src_val + dst_val, 0, 1.0));\n"
|
|
"}\n"
|
|
"\n"
|
|
"__device__ inline unsigned short\n"
|
|
"blend_10bits (unsigned short dst, float src, float src_alpha)\n"
|
|
"{\n"
|
|
" // DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor\n"
|
|
" float src_val = src * src_alpha;\n"
|
|
" float dst_val = __int2float_rz (dst) / 1023.0 * (1.0 - src_alpha);\n"
|
|
" return scale_to_10bits(clamp(src_val + dst_val, 0, 1.0));\n"
|
|
"}\n"
|
|
"\n"
|
|
"__device__ inline unsigned short\n"
|
|
"blend_12bits (unsigned short dst, float src, float src_alpha)\n"
|
|
"{\n"
|
|
" // DstColor' = SrcA * SrcColor + (1 - SrcA) DstColor\n"
|
|
" float src_val = src * src_alpha;\n"
|
|
" float dst_val = __int2float_rz (dst) / 4095.0 * (1.0 - src_alpha);\n"
|
|
" return scale_to_12bits(clamp(src_val + dst_val, 0, 1.0));\n"
|
|
"}\n"
|
|
"\n"
|
|
"struct IConverter\n"
|
|
"{\n"
|
|
" __device__ virtual float3\n"
|
|
" Execute (float3 sample, const ColorMatrix * matrix) = 0;\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct ConvertSimple : public IConverter\n"
|
|
"{\n"
|
|
" __device__ float3\n"
|
|
" Execute (float3 sample, const ColorMatrix * matrix)\n"
|
|
" {\n"
|
|
" float3 out;\n"
|
|
" out.x = dot (matrix->CoeffX, sample);\n"
|
|
" out.y = dot (matrix->CoeffY, sample);\n"
|
|
" out.z = dot (matrix->CoeffZ, sample);\n"
|
|
" out.x += matrix->Offset[0];\n"
|
|
" out.y += matrix->Offset[1];\n"
|
|
" out.z += matrix->Offset[2];\n"
|
|
" return clamp3 (out, matrix->Min, matrix->Max);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct ISampler\n"
|
|
"{\n"
|
|
" __device__ virtual float4\n"
|
|
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
|
|
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y) = 0;\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct SampleI420 : public ISampler\n"
|
|
"{\n"
|
|
" __device__ float4\n"
|
|
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
|
|
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
|
|
" {\n"
|
|
" float luma = tex2D<float>(tex0, x, y);\n"
|
|
" float u = tex2D<float>(tex1, x, y);\n"
|
|
" float v = tex2D<float>(tex2, x, y);\n"
|
|
" return make_float4 (luma, u, v, 1);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct SampleYV12 : public ISampler\n"
|
|
"{\n"
|
|
" __device__ float4\n"
|
|
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
|
|
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
|
|
" {\n"
|
|
" float luma = tex2D<float>(tex0, x, y);\n"
|
|
" float u = tex2D<float>(tex2, x, y);\n"
|
|
" float v = tex2D<float>(tex1, x, y);\n"
|
|
" return make_float4 (luma, u, v, 1);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct SampleI420_10 : public ISampler\n"
|
|
"{\n"
|
|
" __device__ float4\n"
|
|
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
|
|
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
|
|
" {\n"
|
|
" float luma = tex2D<float>(tex0, x, y);\n"
|
|
" float u = tex2D<float>(tex1, x, y);\n"
|
|
" float v = tex2D<float>(tex2, x, y);\n"
|
|
" /* (1 << 6) to scale [0, 1.0) range */\n"
|
|
" return make_float4 (luma * 64, u * 64, v * 64, 1);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct SampleI420_12 : public ISampler\n"
|
|
"{\n"
|
|
" __device__ float4\n"
|
|
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
|
|
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
|
|
" {\n"
|
|
" float luma = tex2D<float>(tex0, x, y);\n"
|
|
" float u = tex2D<float>(tex1, x, y);\n"
|
|
" float v = tex2D<float>(tex2, x, y);\n"
|
|
" /* (1 << 4) to scale [0, 1.0) range */\n"
|
|
" return make_float4 (luma * 16, u * 16, v * 16, 1);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct SampleNV12 : public ISampler\n"
|
|
"{\n"
|
|
" __device__ float4\n"
|
|
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
|
|
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
|
|
" {\n"
|
|
" float luma = tex2D<float>(tex0, x, y);\n"
|
|
" float2 uv = tex2D<float2>(tex1, x, y);\n"
|
|
" return make_float4 (luma, uv.x, uv.y, 1);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct SampleNV21 : public ISampler\n"
|
|
"{\n"
|
|
" __device__ float4\n"
|
|
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
|
|
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
|
|
" {\n"
|
|
" float luma = tex2D<float>(tex0, x, y);\n"
|
|
" float2 vu = tex2D<float2>(tex1, x, y);\n"
|
|
" return make_float4 (luma, vu.y, vu.x, 1);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct SampleRGBA : public ISampler\n"
|
|
"{\n"
|
|
" __device__ float4\n"
|
|
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
|
|
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
|
|
" {\n"
|
|
" return tex2D<float4>(tex0, x, y);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct SampleBGRA : public ISampler\n"
|
|
"{\n"
|
|
" __device__ float4\n"
|
|
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
|
|
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
|
|
" {\n"
|
|
" float4 bgra = tex2D<float4>(tex0, x, y);\n"
|
|
" return make_float4 (bgra.z, bgra.y, bgra.x, bgra.w);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct SampleRGBx : public ISampler\n"
|
|
"{\n"
|
|
" __device__ float4\n"
|
|
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
|
|
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
|
|
" {\n"
|
|
" float4 rgbx = tex2D<float4>(tex0, x, y);\n"
|
|
" rgbx.w = 1;\n"
|
|
" return rgbx;\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct SampleBGRx : public ISampler\n"
|
|
"{\n"
|
|
" __device__ float4\n"
|
|
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
|
|
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
|
|
" {\n"
|
|
" float4 bgrx = tex2D<float4>(tex0, x, y);\n"
|
|
" return make_float4 (bgrx.z, bgrx.y, bgrx.x, 1);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct SampleARGB : public ISampler\n"
|
|
"{\n"
|
|
" __device__ float4\n"
|
|
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
|
|
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
|
|
" {\n"
|
|
" float4 argb = tex2D<float4>(tex0, x, y);\n"
|
|
" return make_float4 (argb.y, argb.z, argb.w, argb.x);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct SampleABGR : public ISampler\n"
|
|
"{\n"
|
|
" __device__ float4\n"
|
|
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
|
|
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
|
|
" {\n"
|
|
" float4 abgr = tex2D<float4>(tex0, x, y);\n"
|
|
" return make_float4 (abgr.w, abgr.z, abgr.y, abgr.x);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct SampleRGBP : public ISampler\n"
|
|
"{\n"
|
|
" __device__ float4\n"
|
|
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
|
|
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
|
|
" {\n"
|
|
" float r = tex2D<float>(tex0, x, y);\n"
|
|
" float g = tex2D<float>(tex1, x, y);\n"
|
|
" float b = tex2D<float>(tex2, x, y);\n"
|
|
" return make_float4 (r, g, b, 1);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct SampleBGRP : public ISampler\n"
|
|
"{\n"
|
|
" __device__ float4\n"
|
|
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
|
|
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
|
|
" {\n"
|
|
" float b = tex2D<float>(tex0, x, y);\n"
|
|
" float g = tex2D<float>(tex1, x, y);\n"
|
|
" float r = tex2D<float>(tex2, x, y);\n"
|
|
" return make_float4 (r, g, b, 1);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct SampleGBR : public ISampler\n"
|
|
"{\n"
|
|
" __device__ float4\n"
|
|
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
|
|
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
|
|
" {\n"
|
|
" float g = tex2D<float>(tex0, x, y);\n"
|
|
" float b = tex2D<float>(tex1, x, y);\n"
|
|
" float r = tex2D<float>(tex2, x, y);\n"
|
|
" return make_float4 (r, g, b, 1);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct SampleGBR_10 : public ISampler\n"
|
|
"{\n"
|
|
" __device__ float4\n"
|
|
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
|
|
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
|
|
" {\n"
|
|
" float g = tex2D<float>(tex0, x, y);\n"
|
|
" float b = tex2D<float>(tex1, x, y);\n"
|
|
" float r = tex2D<float>(tex2, x, y);\n"
|
|
" /* (1 << 6) to scale [0, 1.0) range */\n"
|
|
" return make_float4 (r * 64, g * 64, b * 64, 1);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct SampleGBR_12 : public ISampler\n"
|
|
"{\n"
|
|
" __device__ float4\n"
|
|
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
|
|
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
|
|
" {\n"
|
|
" float g = tex2D<float>(tex0, x, y);\n"
|
|
" float b = tex2D<float>(tex1, x, y);\n"
|
|
" float r = tex2D<float>(tex2, x, y);\n"
|
|
" /* (1 << 4) to scale [0, 1.0) range */\n"
|
|
" return make_float4 (r * 16, g * 16, b * 16, 1);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct SampleGBRA : public ISampler\n"
|
|
"{\n"
|
|
" __device__ float4\n"
|
|
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
|
|
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
|
|
" {\n"
|
|
" float g = tex2D<float>(tex0, x, y);\n"
|
|
" float b = tex2D<float>(tex1, x, y);\n"
|
|
" float r = tex2D<float>(tex2, x, y);\n"
|
|
" float a = tex2D<float>(tex3, x, y);\n"
|
|
" return make_float4 (r, g, b, a);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct SampleVUYA : public ISampler\n"
|
|
"{\n"
|
|
" __device__ float4\n"
|
|
" Execute (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
|
|
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n"
|
|
" {\n"
|
|
" float4 vuya = tex2D<float4>(tex0, x, y);\n"
|
|
" return make_float4 (vuya.z, vuya.y, vuya.x, vuya.w);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct IOutput\n"
|
|
"{\n"
|
|
" __device__ virtual void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1) = 0;\n"
|
|
"\n"
|
|
" __device__ virtual void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1) = 0;\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct OutputI420 : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" dst0[x + y * stride0] = scale_to_uchar (sample.x);\n"
|
|
" if (x % 2 == 0 && y % 2 == 0) {\n"
|
|
" unsigned int pos = x / 2 + (y / 2) * stride1;\n"
|
|
" dst1[pos] = scale_to_uchar (sample.y);\n"
|
|
" dst2[pos] = scale_to_uchar (sample.z);\n"
|
|
" }\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" unsigned int pos = x + y * stride0;\n"
|
|
" dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
|
|
" if (x % 2 == 0 && y % 2 == 0) {\n"
|
|
" pos = x / 2 + (y / 2) * stride1;\n"
|
|
" dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);\n"
|
|
" dst2[pos] = blend_uchar (dst2[pos], sample.z, sample.w);\n"
|
|
" }\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct OutputYV12 : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" dst0[x + y * stride0] = scale_to_uchar (sample.x);\n"
|
|
" if (x % 2 == 0 && y % 2 == 0) {\n"
|
|
" unsigned int pos = x / 2 + (y / 2) * stride1;\n"
|
|
" dst1[pos] = scale_to_uchar (sample.z);\n"
|
|
" dst2[pos] = scale_to_uchar (sample.y);\n"
|
|
" }\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" unsigned int pos = x + y * stride0;\n"
|
|
" dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
|
|
" if (x % 2 == 0 && y % 2 == 0) {\n"
|
|
" pos = x / 2 + (y / 2) * stride1;\n"
|
|
" dst1[pos] = blend_uchar (dst1[pos], sample.z, sample.w);\n"
|
|
" dst2[pos] = blend_uchar (dst2[pos], sample.y, sample.w);\n"
|
|
" }\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct OutputNV12 : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" dst0[x + y * stride0] = scale_to_uchar (sample.x);\n"
|
|
" if (x % 2 == 0 && y % 2 == 0) {\n"
|
|
" unsigned int pos = x + (y / 2) * stride1;\n"
|
|
" dst1[pos] = scale_to_uchar (sample.y);\n"
|
|
" dst1[pos + 1] = scale_to_uchar (sample.z);\n"
|
|
" }\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" unsigned int pos = x + y * stride0;\n"
|
|
" dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
|
|
" if (x % 2 == 0 && y % 2 == 0) {\n"
|
|
" pos = x + (y / 2) * stride1;\n"
|
|
" dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);\n"
|
|
" dst1[pos + 1] = blend_uchar (dst1[pos + 1], sample.z, sample.w);\n"
|
|
" }\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct OutputNV21 : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" dst0[x + y * stride0] = scale_to_uchar (sample.x);\n"
|
|
" if (x % 2 == 0 && y % 2 == 0) {\n"
|
|
" unsigned int pos = x + (y / 2) * stride1;\n"
|
|
" dst1[pos] = scale_to_uchar (sample.z);\n"
|
|
" dst1[pos + 1] = scale_to_uchar (sample.y);\n"
|
|
" }\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" unsigned int pos = x + y * stride0;\n"
|
|
" dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
|
|
" if (x % 2 == 0 && y % 2 == 0) {\n"
|
|
" pos = x + (y / 2) * stride1;\n"
|
|
" dst1[pos] = blend_uchar (dst1[pos], sample.z, sample.w);\n"
|
|
" dst1[pos + 1] = blend_uchar (dst1[pos + 1], sample.y, sample.w);\n"
|
|
" }\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct OutputP010 : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_ushort (sample.x);\n"
|
|
" if (x % 2 == 0 && y % 2 == 0) {\n"
|
|
" unsigned int pos = x * 2 + (y / 2) * stride1;\n"
|
|
" *(unsigned short *) &dst1[pos] = scale_to_ushort (sample.y);\n"
|
|
" *(unsigned short *) &dst1[pos + 2] = scale_to_ushort (sample.z);\n"
|
|
" }\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" unsigned int pos = x * 2 + y * stride0;\n"
|
|
" unsigned short * target = (unsigned short *) &dst0[pos];\n"
|
|
" *target = blend_ushort (*target, sample.x, sample.w);\n"
|
|
" if (x % 2 == 0 && y % 2 == 0) {\n"
|
|
" pos = x * 2 + (y / 2) * stride1;\n"
|
|
" target = (unsigned short *) &dst1[pos];\n"
|
|
" *target = blend_ushort (*target, sample.y, sample.w);\n"
|
|
" target = (unsigned short *) &dst1[pos + 2];\n"
|
|
" *target = blend_ushort (*target, sample.z, sample.w);\n"
|
|
" }\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
/* OutputI420_10 (kernel source text): output writer that stores each sample
 * as an unsigned short through byte-addressed plane pointers.
 *   dst0 - receives sample.x for every (x, y); byte offset x * 2 + y * stride0
 *   dst1 - receives sample.y only when both x and y are even
 *   dst2 - receives sample.z only when both x and y are even
 *   dst3 - unused by this writer
 * Write() stores scale_to_10bits() of the sample; Blend() reads the stored
 * value back and replaces it with blend_10bits (stored, sample, sample.w).
 * NOTE(review): HEAD shows an #ifdef __NVCC__ branch that presumably holds the
 * same struct as compiled code; confirm the Blend() offset fix is mirrored. */
"struct OutputI420_10 : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_10bits (sample.x);\n"
|
|
" if (x % 2 == 0 && y % 2 == 0) {\n"
|
|
/* One 2-byte dst1/dst2 sample per two columns: (x / 2) * 2 == x for even x. */
" unsigned int pos = x + (y / 2) * stride1;\n"
|
|
" *(unsigned short *) &dst1[pos] = scale_to_10bits (sample.y);\n"
|
|
" *(unsigned short *) &dst2[pos] = scale_to_10bits (sample.z);\n"
|
|
" }\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" unsigned int pos = x * 2 + y * stride0;\n"
|
|
" unsigned short * target = (unsigned short *) &dst0[pos];\n"
|
|
" *target = blend_10bits (*target, sample.x, sample.w);\n"
|
|
" if (x % 2 == 0 && y % 2 == 0) {\n"
|
|
/* Fix: this offset used to be x * 2 + (y / 2) * stride1, which addresses
 * twice the column that Write() uses for the same (x, y). It must match
 * Write(): x + (y / 2) * stride1. */
" pos = x + (y / 2) * stride1;\n"
|
|
" target = (unsigned short *) &dst1[pos];\n"
|
|
" *target = blend_10bits (*target, sample.y, sample.w);\n"
|
|
" target = (unsigned short *) &dst2[pos];\n"
|
|
" *target = blend_10bits (*target, sample.z, sample.w);\n"
|
|
" }\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
/* OutputI420_12 (kernel source text): output writer that stores each sample
 * as an unsigned short through byte-addressed plane pointers.
 *   dst0 - receives sample.x for every (x, y); byte offset x * 2 + y * stride0
 *   dst1 - receives sample.y only when both x and y are even
 *   dst2 - receives sample.z only when both x and y are even
 *   dst3 - unused by this writer
 * Write() stores scale_to_12bits() of the sample; Blend() reads the stored
 * value back and replaces it with blend_12bits (stored, sample, sample.w).
 * NOTE(review): HEAD shows an #ifdef __NVCC__ branch that presumably holds the
 * same struct as compiled code; confirm the Blend() offset fix is mirrored. */
"struct OutputI420_12 : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_12bits (sample.x);\n"
|
|
" if (x % 2 == 0 && y % 2 == 0) {\n"
|
|
/* One 2-byte dst1/dst2 sample per two columns: (x / 2) * 2 == x for even x. */
" unsigned int pos = x + (y / 2) * stride1;\n"
|
|
" *(unsigned short *) &dst1[pos] = scale_to_12bits (sample.y);\n"
|
|
" *(unsigned short *) &dst2[pos] = scale_to_12bits (sample.z);\n"
|
|
" }\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" unsigned int pos = x * 2 + y * stride0;\n"
|
|
" unsigned short * target = (unsigned short *) &dst0[pos];\n"
|
|
" *target = blend_12bits (*target, sample.x, sample.w);\n"
|
|
" if (x % 2 == 0 && y % 2 == 0) {\n"
|
|
/* Fix: this offset used to be x * 2 + (y / 2) * stride1, which addresses
 * twice the column that Write() uses for the same (x, y). It must match
 * Write(): x + (y / 2) * stride1. */
" pos = x + (y / 2) * stride1;\n"
|
|
" target = (unsigned short *) &dst1[pos];\n"
|
|
" *target = blend_12bits (*target, sample.y, sample.w);\n"
|
|
" target = (unsigned short *) &dst2[pos];\n"
|
|
" *target = blend_12bits (*target, sample.z, sample.w);\n"
|
|
" }\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct OutputY444 : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x + y * stride0;\n"
|
|
" dst0[pos] = scale_to_uchar (sample.x);\n"
|
|
" dst1[pos] = scale_to_uchar (sample.y);\n"
|
|
" dst2[pos] = scale_to_uchar (sample.z);\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x + y * stride0;\n"
|
|
" dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
|
|
" dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);\n"
|
|
" dst2[pos] = blend_uchar (dst2[pos], sample.z, sample.w);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct OutputY444_10 : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 2 + y * stride0;\n"
|
|
" *(unsigned short *) &dst0[pos] = scale_to_10bits (sample.x);\n"
|
|
" *(unsigned short *) &dst1[pos] = scale_to_10bits (sample.y);\n"
|
|
" *(unsigned short *) &dst2[pos] = scale_to_10bits (sample.z);\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 2 + y * stride0;\n"
|
|
" unsigned short * target = (unsigned short *) &dst0[pos];\n"
|
|
" *target = blend_10bits (*target, sample.x, sample.w);\n"
|
|
" target = (unsigned short *) &dst1[pos];\n"
|
|
" *target = blend_10bits (*target, sample.y, sample.w);\n"
|
|
" target = (unsigned short *) &dst2[pos];\n"
|
|
" *target = blend_10bits (*target, sample.z, sample.w);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct OutputY444_12 : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 2 + y * stride0;\n"
|
|
" *(unsigned short *) &dst0[pos] = scale_to_12bits (sample.x);\n"
|
|
" *(unsigned short *) &dst1[pos] = scale_to_12bits (sample.y);\n"
|
|
" *(unsigned short *) &dst2[pos] = scale_to_12bits (sample.z);\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 2 + y * stride0;\n"
|
|
" unsigned short * target = (unsigned short *) &dst0[pos];\n"
|
|
" *target = blend_12bits (*target, sample.x, sample.w);\n"
|
|
" target = (unsigned short *) &dst1[pos];\n"
|
|
" *target = blend_12bits (*target, sample.y, sample.w);\n"
|
|
" target = (unsigned short *) &dst2[pos];\n"
|
|
" *target = blend_12bits (*target, sample.z, sample.w);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct OutputY444_16 : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 2 + y * stride0;\n"
|
|
" *(unsigned short *) &dst0[pos] = scale_to_ushort (sample.x);\n"
|
|
" *(unsigned short *) &dst1[pos] = scale_to_ushort (sample.y);\n"
|
|
" *(unsigned short *) &dst2[pos] = scale_to_ushort (sample.z);\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 2 + y * stride0;\n"
|
|
" unsigned short * target = (unsigned short *) &dst0[pos];\n"
|
|
" *target = blend_ushort (*target, sample.x, sample.w);\n"
|
|
" target = (unsigned short *) &dst1[pos];\n"
|
|
" *target = blend_ushort (*target, sample.y, sample.w);\n"
|
|
" target = (unsigned short *) &dst2[pos];\n"
|
|
" *target = blend_ushort (*target, sample.z, sample.w);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct OutputRGBA : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 4 + y * stride0;\n"
|
|
" dst0[pos] = scale_to_uchar (sample.x);\n"
|
|
" dst0[pos + 1] = scale_to_uchar (sample.y);\n"
|
|
" dst0[pos + 2] = scale_to_uchar (sample.z);\n"
|
|
" dst0[pos + 3] = scale_to_uchar (sample.w);\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 4 + y * stride0;\n"
|
|
" dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
|
|
" dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);\n"
|
|
" dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.z, sample.w);\n"
|
|
" dst0[pos + 3] = blend_uchar (dst0[pos + 3], 1.0, sample.w);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct OutputRGBx : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 4 + y * stride0;\n"
|
|
" dst0[pos] = scale_to_uchar (sample.x);\n"
|
|
" dst0[pos + 1] = scale_to_uchar (sample.y);\n"
|
|
" dst0[pos + 2] = scale_to_uchar (sample.z);\n"
|
|
" dst0[pos + 3] = 255;\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 4 + y * stride0;\n"
|
|
" dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
|
|
" dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);\n"
|
|
" dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.z, sample.w);\n"
|
|
" dst0[pos + 3] = 255;\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct OutputBGRA : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 4 + y * stride0;\n"
|
|
" dst0[pos] = scale_to_uchar (sample.z);\n"
|
|
" dst0[pos + 1] = scale_to_uchar (sample.y);\n"
|
|
" dst0[pos + 2] = scale_to_uchar (sample.x);\n"
|
|
" dst0[pos + 3] = scale_to_uchar (sample.w);\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 4 + y * stride0;\n"
|
|
" dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);\n"
|
|
" dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);\n"
|
|
" dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.x, sample.w);\n"
|
|
" dst0[pos + 3] = blend_uchar (dst0[pos + 3], 1.0, sample.w);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct OutputBGRx : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 4 + y * stride0;\n"
|
|
" dst0[pos] = scale_to_uchar (sample.z);\n"
|
|
" dst0[pos + 1] = scale_to_uchar (sample.y);\n"
|
|
" dst0[pos + 2] = scale_to_uchar (sample.x);\n"
|
|
" dst0[pos + 3] = 255;\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 4 + y * stride0;\n"
|
|
" dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);\n"
|
|
" dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);\n"
|
|
" dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.x, sample.w);\n"
|
|
" dst0[pos + 3] = 255;\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct OutputARGB : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 4 + y * stride0;\n"
|
|
" dst0[pos] = scale_to_uchar (sample.w);\n"
|
|
" dst0[pos + 1] = scale_to_uchar (sample.x);\n"
|
|
" dst0[pos + 2] = scale_to_uchar (sample.y);\n"
|
|
" dst0[pos + 3] = scale_to_uchar (sample.z);\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 4 + y * stride0;\n"
|
|
" dst0[pos] = blend_uchar (dst0[pos], 1.0, sample.w);\n"
|
|
" dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.x, sample.w);\n"
|
|
" dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.y, sample.w);\n"
|
|
" dst0[pos + 3] = blend_uchar (dst0[pos + 3], sample.z, sample.w);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct OutputABGR : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 4 + y * stride0;\n"
|
|
" dst0[pos] = scale_to_uchar (sample.w);\n"
|
|
" dst0[pos + 1] = scale_to_uchar (sample.z);\n"
|
|
" dst0[pos + 2] = scale_to_uchar (sample.y);\n"
|
|
" dst0[pos + 3] = scale_to_uchar (sample.x);\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 4 + y * stride0;\n"
|
|
" dst0[pos] = blend_uchar (dst0[pos], 1.0, sample.w);\n"
|
|
" dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.z, sample.w);\n"
|
|
" dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.y, sample.w);\n"
|
|
" dst0[pos + 3] = blend_uchar (dst0[pos + 3], sample.x, sample.w);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct OutputRGB : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 3 + y * stride0;\n"
|
|
" dst0[pos] = scale_to_uchar (sample.x);\n"
|
|
" dst0[pos + 1] = scale_to_uchar (sample.y);\n"
|
|
" dst0[pos + 2] = scale_to_uchar (sample.z);\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 3 + y * stride0;\n"
|
|
" dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
|
|
" dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);\n"
|
|
" dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.z, sample.w);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct OutputBGR : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 3 + y * stride0;\n"
|
|
" dst0[pos] = scale_to_uchar (sample.z);\n"
|
|
" dst0[pos + 1] = scale_to_uchar (sample.y);\n"
|
|
" dst0[pos + 2] = scale_to_uchar (sample.x);\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 3 + y * stride0;\n"
|
|
" dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);\n"
|
|
" dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);\n"
|
|
" dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.x, sample.w);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"__device__ inline ushort3\n"
|
|
"unpack_rgb10a2 (unsigned int val)\n"
|
|
"{\n"
|
|
" unsigned short r, g, b;\n"
|
|
" r = (val & 0x3ff);\n"
|
|
" r = (r << 6) | (r >> 4);\n"
|
|
" g = ((val >> 10) & 0x3ff);\n"
|
|
" g = (g << 6) | (g >> 4);\n"
|
|
" b = ((val >> 20) & 0x3ff);\n"
|
|
" b = (b << 6) | (b >> 4);\n"
|
|
" return make_ushort3 (r, g, b);\n"
|
|
"}\n"
|
|
"\n"
|
|
"struct OutputRGB10A2 : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" unsigned int alpha = (unsigned int) scale_to_2bits (sample.w);\n"
|
|
" unsigned int packed_rgb = alpha << 30;\n"
|
|
" packed_rgb |= ((unsigned int) scale_to_10bits (sample.x));\n"
|
|
" packed_rgb |= ((unsigned int) scale_to_10bits (sample.y)) << 10;\n"
|
|
" packed_rgb |= ((unsigned int) scale_to_10bits (sample.z)) << 20;\n"
|
|
" *(unsigned int *) &dst0[x * 4 + y * stride0] = packed_rgb;\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" unsigned int * target = (unsigned int *) &dst0[x * 4 + y * stride0];\n"
|
|
" ushort3 val = unpack_rgb10a2 (*target);\n"
|
|
" unsigned int alpha = (unsigned int) scale_to_2bits (sample.w);\n"
|
|
" unsigned int packed_rgb = alpha << 30;\n"
|
|
" packed_rgb |= ((unsigned int) blend_10bits (val.x, sample.x, sample.w));\n"
|
|
" packed_rgb |= ((unsigned int) blend_10bits (val.y, sample.y, sample.w)) << 10;\n"
|
|
" packed_rgb |= ((unsigned int) blend_10bits (val.z, sample.z, sample.w)) << 20;\n"
|
|
" *target = packed_rgb;\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"__device__ inline ushort3\n"
|
|
"unpack_bgr10a2 (unsigned int val)\n"
|
|
"{\n"
|
|
" unsigned short r, g, b;\n"
|
|
" b = (val & 0x3ff);\n"
|
|
" b = (b << 6) | (b >> 4);\n"
|
|
" g = ((val >> 10) & 0x3ff);\n"
|
|
" g = (g << 6) | (g >> 4);\n"
|
|
" r = ((val >> 20) & 0x3ff);\n"
|
|
" r = (r << 6) | (r >> 4);\n"
|
|
" return make_ushort3 (r, g, b);\n"
|
|
"}\n"
|
|
"\n"
|
|
/* OutputBGR10A2 (kernel source text): output writer that packs one pixel into
 * a single unsigned int stored at dst0[x * 4 + y * stride0].
 * Bit layout produced by both methods:
 *   bits 30-31: scale_to_2bits (sample.w)
 *   bits 20-29: sample.x channel
 *   bits 10-19: sample.y channel
 *   bits  0-9 : sample.z channel
 * dst1, dst2, dst3 and stride1 are unused by this writer.
 * NOTE(review): HEAD shows an #ifdef __NVCC__ branch that presumably holds the
 * same struct as compiled code; confirm the Write() alpha fix is mirrored. */
"struct OutputBGR10A2 : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
/* Fix: the 2-bit alpha field used to be computed from sample.x, so it tracked
 * a colour channel. Blend() below takes it from sample.w; do the same here. */
" unsigned int alpha = (unsigned int) scale_to_2bits (sample.w);\n"
|
|
" unsigned int packed_rgb = alpha << 30;\n"
|
|
" packed_rgb |= ((unsigned int) scale_to_10bits (sample.x)) << 20;\n"
|
|
" packed_rgb |= ((unsigned int) scale_to_10bits (sample.y)) << 10;\n"
|
|
" packed_rgb |= ((unsigned int) scale_to_10bits (sample.z));\n"
|
|
" *(unsigned int *) &dst0[x * 4 + y * stride0] = packed_rgb;\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" unsigned int * target = (unsigned int *) &dst0[x * 4 + y * stride0];\n"
|
|
/* NOTE(review): unpack_bgr10a2 () widens every channel with
 * (c << 6) | (c >> 4) before the value reaches blend_10bits (), whose
 * definition is not visible here; confirm it expects that range. */
" ushort3 val = unpack_bgr10a2 (*target);\n"
|
|
" unsigned int alpha = (unsigned int) scale_to_2bits (sample.w);\n"
|
|
" unsigned int packed_rgb = alpha << 30;\n"
|
|
" packed_rgb |= ((unsigned int) blend_10bits (val.x, sample.x, sample.w)) << 20;\n"
|
|
" packed_rgb |= ((unsigned int) blend_10bits (val.y, sample.y, sample.w)) << 10;\n"
|
|
" packed_rgb |= ((unsigned int) blend_10bits (val.z, sample.z, sample.w));\n"
|
|
" *target = packed_rgb;\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct OutputY42B : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" dst0[x + y * stride0] = scale_to_uchar (sample.x);\n"
|
|
" if (x % 2 == 0) {\n"
|
|
" unsigned int pos = x / 2 + y * stride1;\n"
|
|
" dst1[pos] = scale_to_uchar (sample.y);\n"
|
|
" dst2[pos] = scale_to_uchar (sample.z);\n"
|
|
" }\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" unsigned int pos = x + y * stride0;\n"
|
|
" dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
|
|
" if (x % 2 == 0) {\n"
|
|
" pos = x / 2 + y * stride1;\n"
|
|
" dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);\n"
|
|
" dst2[pos] = blend_uchar (dst2[pos], sample.z, sample.w);\n"
|
|
" }\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
/* OutputI422_10 (kernel source text): output writer that stores each sample
 * as an unsigned short through byte-addressed plane pointers.
 *   dst0 - receives sample.x for every (x, y); byte offset x * 2 + y * stride0
 *   dst1 - receives sample.y only when x is even (every row)
 *   dst2 - receives sample.z only when x is even (every row)
 *   dst3 - unused by this writer
 * Write() stores scale_to_10bits() of the sample; Blend() reads the stored
 * value back and replaces it with blend_10bits (stored, sample, sample.w).
 * NOTE(review): HEAD shows an #ifdef __NVCC__ branch that presumably holds the
 * same struct as compiled code; confirm the Blend() offset fix is mirrored. */
"struct OutputI422_10 : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_10bits (sample.x);\n"
|
|
" if (x % 2 == 0) {\n"
|
|
/* One 2-byte dst1/dst2 sample per two columns: (x / 2) * 2 == x for even x. */
" unsigned int pos = x + y * stride1;\n"
|
|
" *(unsigned short *) &dst1[pos] = scale_to_10bits (sample.y);\n"
|
|
" *(unsigned short *) &dst2[pos] = scale_to_10bits (sample.z);\n"
|
|
" }\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" unsigned int pos = x * 2 + y * stride0;\n"
|
|
" unsigned short * target = (unsigned short *) &dst0[pos];\n"
|
|
" *target = blend_10bits (*target, sample.x, sample.w);\n"
|
|
" if (x % 2 == 0) {\n"
|
|
/* Fix: this offset used to be x / 2 + y * stride1, i.e. a one-byte-per-sample
 * index. That addresses a different location than Write() and is odd for
 * x = 2, 6, ..., which puts the unsigned short access on an odd byte offset
 * whenever y * stride1 is even. It must match Write(): x + y * stride1. */
" pos = x + y * stride1;\n"
|
|
" target = (unsigned short *) &dst1[pos];\n"
|
|
" *target = blend_10bits (*target, sample.y, sample.w);\n"
|
|
" target = (unsigned short *) &dst2[pos];\n"
|
|
" *target = blend_10bits (*target, sample.z, sample.w);\n"
|
|
" }\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
/* OutputI422_12 (kernel source text): output writer that stores each sample
 * as an unsigned short through byte-addressed plane pointers.
 *   dst0 - receives sample.x for every (x, y); byte offset x * 2 + y * stride0
 *   dst1 - receives sample.y only when x is even (every row)
 *   dst2 - receives sample.z only when x is even (every row)
 *   dst3 - unused by this writer
 * Write() stores scale_to_12bits() of the sample; Blend() reads the stored
 * value back and replaces it with blend_12bits (stored, sample, sample.w).
 * NOTE(review): HEAD shows an #ifdef __NVCC__ branch that presumably holds the
 * same struct as compiled code; confirm the Blend() offset fix is mirrored. */
"struct OutputI422_12 : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_12bits (sample.x);\n"
|
|
" if (x % 2 == 0) {\n"
|
|
/* One 2-byte dst1/dst2 sample per two columns: (x / 2) * 2 == x for even x. */
" unsigned int pos = x + y * stride1;\n"
|
|
" *(unsigned short *) &dst1[pos] = scale_to_12bits (sample.y);\n"
|
|
" *(unsigned short *) &dst2[pos] = scale_to_12bits (sample.z);\n"
|
|
" }\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" unsigned int pos = x * 2 + y * stride0;\n"
|
|
" unsigned short * target = (unsigned short *) &dst0[pos];\n"
|
|
" *target = blend_12bits (*target, sample.x, sample.w);\n"
|
|
" if (x % 2 == 0) {\n"
|
|
/* Fix: this offset used to be x / 2 + y * stride1, i.e. a one-byte-per-sample
 * index. That addresses a different location than Write() and is odd for
 * x = 2, 6, ..., which puts the unsigned short access on an odd byte offset
 * whenever y * stride1 is even. It must match Write(): x + y * stride1. */
" pos = x + y * stride1;\n"
|
|
" target = (unsigned short *) &dst1[pos];\n"
|
|
" *target = blend_12bits (*target, sample.y, sample.w);\n"
|
|
" target = (unsigned short *) &dst2[pos];\n"
|
|
" *target = blend_12bits (*target, sample.z, sample.w);\n"
|
|
" }\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct OutputRGBP : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x + y * stride0;\n"
|
|
" dst0[pos] = scale_to_uchar (sample.x);\n"
|
|
" dst1[pos] = scale_to_uchar (sample.y);\n"
|
|
" dst2[pos] = scale_to_uchar (sample.z);\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x + y * stride0;\n"
|
|
" dst0[pos] = blend_uchar (dst0[pos], sample.x, sample.w);\n"
|
|
" dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);\n"
|
|
" dst2[pos] = blend_uchar (dst2[pos], sample.z, sample.w);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct OutputBGRP : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x + y * stride0;\n"
|
|
" dst0[pos] = scale_to_uchar (sample.z);\n"
|
|
" dst1[pos] = scale_to_uchar (sample.y);\n"
|
|
" dst2[pos] = scale_to_uchar (sample.x);\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x + y * stride0;\n"
|
|
" dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);\n"
|
|
" dst1[pos] = blend_uchar (dst1[pos], sample.y, sample.w);\n"
|
|
" dst2[pos] = blend_uchar (dst2[pos], sample.x, sample.w);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct OutputGBR : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x + y * stride0;\n"
|
|
" dst0[pos] = scale_to_uchar (sample.y);\n"
|
|
" dst1[pos] = scale_to_uchar (sample.z);\n"
|
|
" dst2[pos] = scale_to_uchar (sample.x);\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x + y * stride0;\n"
|
|
" dst0[pos] = blend_uchar (dst0[pos], sample.y, sample.w);\n"
|
|
" dst1[pos] = blend_uchar (dst1[pos], sample.z, sample.w);\n"
|
|
" dst2[pos] = blend_uchar (dst2[pos], sample.x, sample.w);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
"struct OutputGBR_10 : public IOutput\n"
|
|
"{\n"
|
|
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 2 + y * stride0;\n"
|
|
" *(unsigned short *) &dst0[pos] = scale_to_10bits (sample.y);\n"
|
|
" *(unsigned short *) &dst1[pos] = scale_to_10bits (sample.z);\n"
|
|
" *(unsigned short *) &dst2[pos] = scale_to_10bits (sample.x);\n"
|
|
" }\n"
|
|
"\n"
|
|
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 2 + y * stride0;\n"
|
|
" unsigned short * target = (unsigned short *) &dst0[pos];\n"
|
|
" *target = blend_10bits (*target, sample.y, sample.w);\n"
|
|
" target = (unsigned short *) &dst1[pos];\n"
|
|
" *target = blend_10bits (*target, sample.z, sample.w);\n"
|
|
" target = (unsigned short *) &dst2[pos];\n"
|
|
" *target = blend_10bits (*target, sample.x, sample.w);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
/* OutputGBR_12 (kernel source text): output writer derived from IOutput that
 * stores three planes with two bytes per sample.
 *
 * Both methods address a sample at byte offset x * 2 + y * stride0 and use
 * that same offset for dst0, dst1 and dst2. sample.y goes to dst0, sample.z
 * to dst1 and sample.x to dst2. dst3 and stride1 are accepted but never used
 * by this struct.
 *
 * NOTE(review): scale_to_12bits / blend_12bits are defined outside this part
 * of the file; presumably they map a normalized float to a 12-bit code value
 * - confirm. The unsigned short stores assume the plane pointers and stride0
 * are 2-byte aligned - TODO confirm with the caller. */
"struct OutputGBR_12 : public IOutput\n"
|
|
"{\n"
|
|
/* Write: overwrite the destination sample in each of the three planes. */
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 2 + y * stride0;\n"
|
|
" *(unsigned short *) &dst0[pos] = scale_to_12bits (sample.y);\n"
|
|
" *(unsigned short *) &dst1[pos] = scale_to_12bits (sample.z);\n"
|
|
" *(unsigned short *) &dst2[pos] = scale_to_12bits (sample.x);\n"
|
|
" }\n"
|
|
"\n"
|
|
/* Blend: read the current value of each plane, combine it with the matching
 * sample channel through blend_12bits (sample.w is passed as the last
 * argument), and store the result back to the same location. */
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 2 + y * stride0;\n"
|
|
" unsigned short * target = (unsigned short *) &dst0[pos];\n"
|
|
" *target = blend_12bits (*target, sample.y, sample.w);\n"
|
|
" target = (unsigned short *) &dst1[pos];\n"
|
|
" *target = blend_12bits (*target, sample.z, sample.w);\n"
|
|
" target = (unsigned short *) &dst2[pos];\n"
|
|
" *target = blend_12bits (*target, sample.x, sample.w);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
/* OutputGBR_16 (kernel source text): output writer derived from IOutput that
 * stores three planes with two bytes per sample.
 *
 * Both methods address a sample at byte offset x * 2 + y * stride0 and use
 * that same offset for dst0, dst1 and dst2. sample.y goes to dst0, sample.z
 * to dst1 and sample.x to dst2. dst3 and stride1 are accepted but never used
 * by this struct.
 *
 * NOTE(review): scale_to_ushort / blend_ushort are defined outside this part
 * of the file; presumably they map a normalized float to the full unsigned
 * short range - confirm. The unsigned short stores assume the plane pointers
 * and stride0 are 2-byte aligned - TODO confirm with the caller. */
"struct OutputGBR_16 : public IOutput\n"
|
|
"{\n"
|
|
/* Write: overwrite the destination sample in each of the three planes. */
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 2 + y * stride0;\n"
|
|
" *(unsigned short *) &dst0[pos] = scale_to_ushort (sample.y);\n"
|
|
" *(unsigned short *) &dst1[pos] = scale_to_ushort (sample.z);\n"
|
|
" *(unsigned short *) &dst2[pos] = scale_to_ushort (sample.x);\n"
|
|
" }\n"
|
|
"\n"
|
|
/* Blend: read the current value of each plane, combine it with the matching
 * sample channel through blend_ushort (sample.w is passed as the last
 * argument), and store the result back to the same location. */
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 2 + y * stride0;\n"
|
|
" unsigned short * target = (unsigned short *) &dst0[pos];\n"
|
|
" *target = blend_ushort (*target, sample.y, sample.w);\n"
|
|
" target = (unsigned short *) &dst1[pos];\n"
|
|
" *target = blend_ushort (*target, sample.z, sample.w);\n"
|
|
" target = (unsigned short *) &dst2[pos];\n"
|
|
" *target = blend_ushort (*target, sample.x, sample.w);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
/* OutputGBRA (kernel source text): output writer derived from IOutput that
 * stores four planes with one byte per sample.
 *
 * Both methods address a sample at byte offset x + y * stride0 and use that
 * same offset for all four planes. sample.y goes to dst0, sample.z to dst1,
 * sample.x to dst2 and sample.w to dst3. stride1 is accepted but never used.
 *
 * NOTE(review): scale_to_uchar / blend_uchar are defined outside this part of
 * the file; their exact rounding and clamping are not visible here. */
"struct OutputGBRA : public IOutput\n"
|
|
"{\n"
|
|
/* Write: overwrite the destination byte in each of the four planes. */
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x + y * stride0;\n"
|
|
" dst0[pos] = scale_to_uchar (sample.y);\n"
|
|
" dst1[pos] = scale_to_uchar (sample.z);\n"
|
|
" dst2[pos] = scale_to_uchar (sample.x);\n"
|
|
" dst3[pos] = scale_to_uchar (sample.w);\n"
|
|
" }\n"
|
|
"\n"
|
|
/* Blend: combine each existing plane value with the matching sample channel
 * through blend_uchar, passing sample.w as the last argument. The fourth
 * plane is blended against the constant 1.0 instead of a sample channel. */
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x + y * stride0;\n"
|
|
" dst0[pos] = blend_uchar (dst0[pos], sample.y, sample.w);\n"
|
|
" dst1[pos] = blend_uchar (dst1[pos], sample.z, sample.w);\n"
|
|
" dst2[pos] = blend_uchar (dst2[pos], sample.x, sample.w);\n"
|
|
" dst3[pos] = blend_uchar (dst3[pos], 1.0, sample.w);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
/* OutputVUYA (kernel source text): output writer derived from IOutput that
 * stores one packed pixel of four bytes into dst0.
 *
 * Both methods address a pixel at byte offset x * 4 + y * stride0. The four
 * consecutive bytes receive sample.z, sample.y, sample.x and sample.w in that
 * order. dst1, dst2, dst3 and stride1 are accepted but never used.
 *
 * NOTE(review): scale_to_uchar / blend_uchar are defined outside this part of
 * the file; their exact rounding and clamping are not visible here. */
"struct OutputVUYA : public IOutput\n"
|
|
"{\n"
|
|
/* Write: overwrite the four bytes of the destination pixel. */
" __device__ void\n"
|
|
" Write (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 4 + y * stride0;\n"
|
|
" dst0[pos] = scale_to_uchar (sample.z);\n"
|
|
" dst0[pos + 1] = scale_to_uchar (sample.y);\n"
|
|
" dst0[pos + 2] = scale_to_uchar (sample.x);\n"
|
|
" dst0[pos + 3] = scale_to_uchar (sample.w);\n"
|
|
" }\n"
|
|
"\n"
|
|
/* Blend: combine each existing byte with the matching sample channel through
 * blend_uchar, passing sample.w as the last argument. The fourth byte is
 * blended against the constant 1.0 instead of a sample channel. */
" __device__ void\n"
|
|
" Blend (unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n"
|
|
" unsigned char * dst3, float4 sample, int x, int y, int stride0,\n"
|
|
" int stride1)\n"
|
|
" {\n"
|
|
" int pos = x * 4 + y * stride0;\n"
|
|
" dst0[pos] = blend_uchar (dst0[pos], sample.z, sample.w);\n"
|
|
" dst0[pos + 1] = blend_uchar (dst0[pos + 1], sample.y, sample.w);\n"
|
|
" dst0[pos + 2] = blend_uchar (dst0[pos + 2], sample.x, sample.w);\n"
|
|
" dst0[pos + 3] = blend_uchar (dst0[pos + 3], 1.0, sample.w);\n"
|
|
" }\n"
|
|
"};\n"
|
|
"\n"
|
|
/* rotate_identity (kernel source text): returns the coordinate pair (x, y)
 * unchanged, packed into a float2. do_rotate uses it as the default case. */
"__device__ inline float2\n"
|
|
"rotate_identity (float x, float y)\n"
|
|
"{\n"
|
|
" return make_float2(x, y);\n"
|
|
"}\n"
|
|
"\n"
|
|
/* rotate_90r (kernel source text): maps the coordinate pair (x, y) to
 * (y, 1 - x) and returns it as a float2. do_rotate selects it for
 * direction 1.
 *
 * The literal is written as 1.0f so the subtraction is carried out in single
 * precision; a bare 1.0 is a double and promotes the whole expression.
 * The only visible caller chain (GstCudaConverterMain -> do_rotate) passes
 * values it has already limited to the range 0..1.
 * NOTE(review): if the `#ifdef __NVCC__` part of this file carries a compiled
 * copy of this helper, apply the same literal change there. */
"__device__ inline float2\n"
|
|
"rotate_90r (float x, float y)\n"
|
|
"{\n"
|
|
" return make_float2(y, 1.0f - x);\n"
|
|
"}\n"
|
|
"\n"
|
|
/* rotate_180 (kernel source text): maps the coordinate pair (x, y) to
 * (1 - x, 1 - y) and returns it as a float2. do_rotate selects it for
 * direction 2.
 *
 * The literals are written as 1.0f so both subtractions are carried out in
 * single precision; a bare 1.0 is a double and promotes the expression.
 * The only visible caller chain (GstCudaConverterMain -> do_rotate) passes
 * values it has already limited to the range 0..1.
 * NOTE(review): if the `#ifdef __NVCC__` part of this file carries a compiled
 * copy of this helper, apply the same literal change there. */
"__device__ inline float2\n"
|
|
"rotate_180 (float x, float y)\n"
|
|
"{\n"
|
|
" return make_float2(1.0f - x, 1.0f - y);\n"
|
|
"}\n"
|
|
"\n"
|
|
/* rotate_90l (kernel source text): maps the coordinate pair (x, y) to
 * (1 - y, x) and returns it as a float2. do_rotate selects it for
 * direction 3.
 *
 * The literal is written as 1.0f so the subtraction is carried out in single
 * precision; a bare 1.0 is a double and promotes the whole expression.
 * The only visible caller chain (GstCudaConverterMain -> do_rotate) passes
 * values it has already limited to the range 0..1.
 * NOTE(review): if the `#ifdef __NVCC__` part of this file carries a compiled
 * copy of this helper, apply the same literal change there. */
"__device__ inline float2\n"
|
|
"rotate_90l (float x, float y)\n"
|
|
"{\n"
|
|
" return make_float2(1.0f - y, x);\n"
|
|
"}\n"
|
|
"\n"
|
|
/* rotate_horiz (kernel source text): maps the coordinate pair (x, y) to
 * (1 - x, y) and returns it as a float2. do_rotate selects it for
 * direction 4.
 *
 * The literal is written as 1.0f so the subtraction is carried out in single
 * precision; a bare 1.0 is a double and promotes the whole expression.
 * The only visible caller chain (GstCudaConverterMain -> do_rotate) passes
 * values it has already limited to the range 0..1.
 * NOTE(review): if the `#ifdef __NVCC__` part of this file carries a compiled
 * copy of this helper, apply the same literal change there. */
"__device__ inline float2\n"
|
|
"rotate_horiz (float x, float y)\n"
|
|
"{\n"
|
|
" return make_float2(1.0f - x, y);\n"
|
|
"}\n"
|
|
"\n"
|
|
/* rotate_vert (kernel source text): maps the coordinate pair (x, y) to
 * (x, 1 - y) and returns it as a float2. do_rotate selects it for
 * direction 5.
 *
 * The literal is written as 1.0f so the subtraction is carried out in single
 * precision; a bare 1.0 is a double and promotes the whole expression.
 * The only visible caller chain (GstCudaConverterMain -> do_rotate) passes
 * values it has already limited to the range 0..1.
 * NOTE(review): if the `#ifdef __NVCC__` part of this file carries a compiled
 * copy of this helper, apply the same literal change there. */
"__device__ inline float2\n"
|
|
"rotate_vert (float x, float y)\n"
|
|
"{\n"
|
|
" return make_float2(x, 1.0f - y);\n"
|
|
"}\n"
|
|
"\n"
|
|
/* rotate_ul_lr (kernel source text): swaps the two coordinates, returning
 * (y, x) as a float2. do_rotate selects it for direction 6. */
"__device__ inline float2\n"
|
|
"rotate_ul_lr (float x, float y)\n"
|
|
"{\n"
|
|
" return make_float2(y, x);\n"
|
|
"}\n"
|
|
"\n"
|
|
/* rotate_ur_ll (kernel source text): maps the coordinate pair (x, y) to
 * (1 - y, 1 - x) and returns it as a float2. do_rotate selects it for
 * direction 7.
 *
 * The literals are written as 1.0f so both subtractions are carried out in
 * single precision; a bare 1.0 is a double and promotes the expression.
 * The only visible caller chain (GstCudaConverterMain -> do_rotate) passes
 * values it has already limited to the range 0..1.
 * NOTE(review): if the `#ifdef __NVCC__` part of this file carries a compiled
 * copy of this helper, apply the same literal change there. */
"__device__ inline float2\n"
|
|
"rotate_ur_ll (float x, float y)\n"
|
|
"{\n"
|
|
" return make_float2(1.0f - y, 1.0f - x);\n"
|
|
"}\n"
|
|
/* do_rotate (kernel source text): dispatches a coordinate pair to one of the
 * rotate_* helpers according to `direction` and returns the helper's float2.
 *
 *   1 -> rotate_90r     2 -> rotate_180     3 -> rotate_90l
 *   4 -> rotate_horiz   5 -> rotate_vert    6 -> rotate_ul_lr
 *   7 -> rotate_ur_ll   any other value -> rotate_identity
 *
 * NOTE(review): the numbering presumably mirrors the host-side video
 * orientation enum that fills ConstBuffer.video_direction - confirm. */
"__device__ inline float2\n"
|
|
"do_rotate (float x, float y, int direction)\n"
|
|
"{\n"
|
|
" switch (direction) {\n"
|
|
" case 1:\n"
|
|
" return rotate_90r (x, y);\n"
|
|
" case 2:\n"
|
|
" return rotate_180 (x, y);\n"
|
|
" case 3:\n"
|
|
" return rotate_90l (x, y);\n"
|
|
" case 4:\n"
|
|
" return rotate_horiz (x, y);\n"
|
|
" case 5:\n"
|
|
" return rotate_vert (x, y);\n"
|
|
" case 6:\n"
|
|
" return rotate_ul_lr (x, y);\n"
|
|
" case 7:\n"
|
|
" return rotate_ur_ll (x, y);\n"
|
|
/* Unlisted values, including 0, leave the coordinates untouched. */
" default:\n"
|
|
" return rotate_identity (x, y);\n"
|
|
" }\n"
|
|
"}\n"
|
|
"\n"
|
|
/* GstCudaConverterMain (kernel source text): kernel entry point, declared
 * with C linkage. Each thread handles the single destination pixel at
 *   x_pos = blockIdx.x * blockDim.x + threadIdx.x + off_x
 *   y_pos = blockIdx.y * blockDim.y + threadIdx.y + off_y
 *
 * Parameters:
 *   tex0..tex3        texture objects handed to the sampler's Execute().
 *   dst0..dst3        destination pointers handed to the output writer.
 *   stride0, stride1  strides handed to the output writer.
 *   const_buf         ConstBuffer passed by value (sizes, view rectangle,
 *                     border color, direction, alpha, mode flags, matrix).
 *   off_x, off_y      offsets added to the thread's pixel position.
 *
 * Flow:
 *   1. Return if the pixel lies at or beyond width/height, or if the view
 *      width/height is not positive (this also guards the divisions below).
 *   2. Outside the [left, right) x [top, bottom) rectangle: return unless
 *      fill_border is set, otherwise use the border color as the sample.
 *   3. Inside the rectangle: compute the normalized pixel-center coordinate,
 *      remap it with do_rotate, fetch through the sampler, and optionally
 *      run the color conversion (do_convert).
 *   4. Scale the sample's w component by const_buf.alpha and hand the sample
 *      to the writer's Write() or Blend() depending on do_blend.
 *
 * SAMPLER and OUTPUT are not defined in the visible part of this string;
 * presumably they are supplied as preprocessor definitions when the kernel
 * source is compiled - TODO confirm.
 *
 * The floating-point literals carry an `f` suffix so the per-pixel add,
 * divide and range checks stay in single precision; unsuffixed literals are
 * doubles and promote those operations.
 * NOTE(review): if the `#ifdef __NVCC__` part of this file carries a compiled
 * copy of this kernel, apply the same literal change there. */
"extern \"C\" {\n"
|
|
"__global__ void\n"
|
|
"GstCudaConverterMain (cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n"
|
|
" cudaTextureObject_t tex2, cudaTextureObject_t tex3, unsigned char * dst0,\n"
|
|
" unsigned char * dst1, unsigned char * dst2, unsigned char * dst3,\n"
|
|
" int stride0, int stride1, ConstBuffer const_buf, int off_x, int off_y)\n"
|
|
"{\n"
|
|
" ConvertSimple g_converter;\n"
|
|
" SAMPLER g_sampler;\n"
|
|
" OUTPUT g_output;\n"
|
|
" int x_pos = blockIdx.x * blockDim.x + threadIdx.x + off_x;\n"
|
|
" int y_pos = blockIdx.y * blockDim.y + threadIdx.y + off_y;\n"
|
|
" float4 sample;\n"
|
|
/* Step 1: threads past the frame edge, or a degenerate view, do nothing. */
" if (x_pos >= const_buf.width || y_pos >= const_buf.height ||\n"
|
|
" const_buf.view_width <= 0 || const_buf.view_height <= 0)\n"
|
|
" return;\n"
|
|
/* Step 2: pixels outside the view rectangle get the border color, if any. */
" if (x_pos < const_buf.left || x_pos >= const_buf.right ||\n"
|
|
" y_pos < const_buf.top || y_pos >= const_buf.bottom) {\n"
|
|
" if (!const_buf.fill_border)\n"
|
|
" return;\n"
|
|
" sample = make_float4 (const_buf.border_x, const_buf.border_y,\n"
|
|
" const_buf.border_z, const_buf.border_w);\n"
|
|
" } else {\n"
|
|
/* Step 3: the 0.5f offset addresses the pixel center before normalizing. */
" float x = (__int2float_rz (x_pos - const_buf.left) + 0.5f) / const_buf.view_width;\n"
|
|
" if (x < 0.0f || x > 1.0f)\n"
|
|
" return;\n"
|
|
" float y = (__int2float_rz (y_pos - const_buf.top) + 0.5f) / const_buf.view_height;\n"
|
|
" if (y < 0.0f || y > 1.0f)\n"
|
|
" return;\n"
|
|
" float2 rotated = do_rotate (x, y, const_buf.video_direction);\n"
|
|
" float4 s = g_sampler.Execute (tex0, tex1, tex2, tex3, rotated.x, rotated.y);\n"
|
|
" float3 rgb = make_float3 (s.x, s.y, s.z);\n"
|
|
" float3 yuv;\n"
|
|
/* The first three components are converted only on request; w is carried
 * over from the fetched sample unchanged. */
" if (const_buf.do_convert)\n"
|
|
" yuv = g_converter.Execute (rgb, &const_buf.matrix);\n"
|
|
" else\n"
|
|
" yuv = rgb;\n"
|
|
" sample = make_float4 (yuv.x, yuv.y, yuv.z, s.w);\n"
|
|
" }\n"
|
|
/* Step 4: apply the global alpha (border samples included), then store. */
" sample.w = sample.w * const_buf.alpha;\n"
|
|
" if (!const_buf.do_blend) {\n"
|
|
" g_output.Write (dst0, dst1, dst2, dst3, sample, x_pos, y_pos, stride0, stride1);\n"
|
|
" } else {\n"
|
|
" g_output.Blend (dst0, dst1, dst2, dst3, sample, x_pos, y_pos, stride0, stride1);\n"
|
|
" }\n"
|
|
"}\n"
|
|
"}\n"
|
|
"\n";
|
|
#endif |