/**
 * MIT License
 *
 * Copyright (c) 2018 Jeremiah van Oosten
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/* Source: https://github.com/jpvanoosten/LearningDirectX12 */

#ifdef BUILDING_HLSL

#define BLOCK_SIZE 8

// When reducing the size of a texture, it could be that downscaling the texture
// will result in less than exactly 50% (1/2) of the original texture size.
// This happens if either the width, or the height (or both) dimensions of the texture
// are odd. For example, downscaling a 5x3 texture will result in a 2x1 texture which
// has a 60% reduction in the texture width and 66% reduction in the height.
// When this happens, we need to take more samples from the source texture to
// determine the pixel value in the destination texture.

#define WIDTH_HEIGHT_EVEN 0 // Both the width and the height of the texture are even.
#define WIDTH_ODD_HEIGHT_EVEN 1 // The texture width is odd and the height is even.
#define WIDTH_EVEN_HEIGHT_ODD 2 // The texture width is even and the height is odd.
#define WIDTH_HEIGHT_ODD 3 // Both the width and height of the texture are odd.
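
// Given this encoding, SrcDimension for a w x h source mip is
// ((h & 1) << 1) | (w & 1): bit 0 holds the width parity, bit 1 the height
// parity. (Illustrative note; w and h stand for the source mip dimensions.)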

struct ComputeShaderInput
{
  uint3 GroupID : SV_GroupID; // 3D index of the thread group in the dispatch.
  uint3 GroupThreadID : SV_GroupThreadID; // 3D index of local thread ID in a thread group.
  uint3 DispatchThreadID : SV_DispatchThreadID; // 3D index of global thread ID in the dispatch.
  uint GroupIndex : SV_GroupIndex; // Flattened local index of the thread within a thread group.
};

cbuffer GenerateMipsCB : register( b0 )
{
  uint SrcMipLevel; // Texture level of source mip
  uint NumMipLevels; // Number of OutMips to write: [1-4]
  uint SrcDimension; // Width and height of the source texture are even or odd.
  uint padding;
  float2 TexelSize; // 1.0 / OutMip1.Dimensions
}
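
// For example, if OutMip1 is 512 x 512 texels, TexelSize is (1/512, 1/512),
// so TexelSize * (DispatchThreadID.xy + 0.5) lands on destination texel
// centers in normalized UV space.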

// Source mip map.
Texture2D<float4> SrcMip : register( t0 );

// Write up to 4 mip map levels.
RWTexture2D<float4> OutMip1 : register( u0 );
RWTexture2D<float4> OutMip2 : register( u1 );
RWTexture2D<float4> OutMip3 : register( u2 );
RWTexture2D<float4> OutMip4 : register( u3 );

// Linear clamp sampler.
SamplerState LinearClampSampler : register( s0 );

// The reason for separating channels is to reduce bank conflicts in the
// local data memory controller. A large stride will cause more threads
// to collide on the same memory bank.
groupshared float gs_R[64];
groupshared float gs_G[64];
groupshared float gs_B[64];
groupshared float gs_A[64];
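
// Illustration: with one float per channel, the LDS stride between consecutive
// GroupIndex values is 4 bytes, so on typical hardware with 32 four-byte banks
// the lanes of a wave hit distinct banks. A single float4[64] array would use a
// 16-byte stride and map several lanes to the same bank.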

void StoreColor( uint Index, float4 Color )
{
  gs_R[Index] = Color.r;
  gs_G[Index] = Color.g;
  gs_B[Index] = Color.b;
  gs_A[Index] = Color.a;
}

float4 LoadColor( uint Index )
{
  return float4( gs_R[Index], gs_G[Index], gs_B[Index], gs_A[Index] );
}

[numthreads( BLOCK_SIZE, BLOCK_SIZE, 1 )]
void ENTRY_POINT( ComputeShaderInput IN )
{
  float4 Src1 = (float4)0;

  // One bilinear sample is insufficient when scaling down by more than 2x.
  // You will slightly undersample in the case where the source dimension
  // is odd. This is why it's a really good idea to only generate mips on
  // power-of-two sized textures. Trying to handle the undersampling case
  // will force this shader to be slower and more complicated as it will
  // have to take more source texture samples.
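  // As a worked example: a 5-texel-wide source mip reduces to a 2-texel-wide
  // destination, so each destination texel covers 2.5 source texels; a single
  // bilinear tap only blends 2 of them, leaving part of the footprint unsampled.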

  // Determine the path to use based on the dimension of the
  // source texture.
  // 0b00(0): Both width and height are even.
  // 0b01(1): Width is odd, height is even.
  // 0b10(2): Width is even, height is odd.
  // 0b11(3): Both width and height are odd.
  switch ( SrcDimension )
  {
    case WIDTH_HEIGHT_EVEN:
    {
      float2 UV = TexelSize * ( IN.DispatchThreadID.xy + 0.5 );

      Src1 = SrcMip.SampleLevel( LinearClampSampler, UV, SrcMipLevel );
    }
    break;
    case WIDTH_ODD_HEIGHT_EVEN:
    {
      // > 2:1 in X dimension
      // Use 2 bilinear samples to guarantee we don't undersample when downsizing by more than 2x
      // horizontally.
      float2 UV1 = TexelSize * ( IN.DispatchThreadID.xy + float2( 0.25, 0.5 ) );
      float2 Off = TexelSize * float2( 0.5, 0.0 );

      Src1 = 0.5 * ( SrcMip.SampleLevel( LinearClampSampler, UV1, SrcMipLevel ) +
                     SrcMip.SampleLevel( LinearClampSampler, UV1 + Off, SrcMipLevel ) );
    }
    break;
    case WIDTH_EVEN_HEIGHT_ODD:
    {
      // > 2:1 in Y dimension
      // Use 2 bilinear samples to guarantee we don't undersample when downsizing by more than 2x
      // vertically.
      float2 UV1 = TexelSize * ( IN.DispatchThreadID.xy + float2( 0.5, 0.25 ) );
      float2 Off = TexelSize * float2( 0.0, 0.5 );

      Src1 = 0.5 * ( SrcMip.SampleLevel( LinearClampSampler, UV1, SrcMipLevel ) +
                     SrcMip.SampleLevel( LinearClampSampler, UV1 + Off, SrcMipLevel ) );
    }
    break;
    case WIDTH_HEIGHT_ODD:
    {
      // > 2:1 in both dimensions
      // Use 4 bilinear samples to guarantee we don't undersample when downsizing by more than 2x
      // in both directions.
      float2 UV1 = TexelSize * ( IN.DispatchThreadID.xy + float2( 0.25, 0.25 ) );
      float2 Off = TexelSize * 0.5;

      Src1 = SrcMip.SampleLevel( LinearClampSampler, UV1, SrcMipLevel );
      Src1 += SrcMip.SampleLevel( LinearClampSampler, UV1 + float2( Off.x, 0.0 ), SrcMipLevel );
      Src1 += SrcMip.SampleLevel( LinearClampSampler, UV1 + float2( 0.0, Off.y ), SrcMipLevel );
      Src1 += SrcMip.SampleLevel( LinearClampSampler, UV1 + float2( Off.x, Off.y ), SrcMipLevel );
      Src1 *= 0.25;
    }
    break;
  }

  OutMip1[IN.DispatchThreadID.xy] = Src1;

  // A scalar (constant) branch can exit all threads coherently.
  if ( NumMipLevels == 1 )
    return;

  // Without lane swizzle operations, the only way to share data with other
  // threads is through LDS.
  StoreColor( IN.GroupIndex, Src1 );

  // This guarantees all LDS writes are complete and that all threads have
  // executed all instructions so far (and therefore have issued their LDS
  // write instructions.)
  GroupMemoryBarrierWithGroupSync();

  // With low three bits for X and high three bits for Y, this bit mask
  // (binary: 001001) checks that X and Y are even.
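  // For the 8x8 thread group, SV_GroupIndex = GroupThreadID.y * 8 + GroupThreadID.x,
  // so bit 0 is the X parity and bit 3 is the Y parity; 0x9 == 0b001001 tests both.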
  if ( ( IN.GroupIndex & 0x9 ) == 0 )
  {
    float4 Src2 = LoadColor( IN.GroupIndex + 0x01 );
    float4 Src3 = LoadColor( IN.GroupIndex + 0x08 );
    float4 Src4 = LoadColor( IN.GroupIndex + 0x09 );
    Src1 = 0.25 * ( Src1 + Src2 + Src3 + Src4 );

    OutMip2[IN.DispatchThreadID.xy / 2] = Src1;
    StoreColor( IN.GroupIndex, Src1 );
  }

  if ( NumMipLevels == 2 )
    return;

  GroupMemoryBarrierWithGroupSync();

  // This bit mask (binary: 011011) checks that X and Y are multiples of four.
  if ( ( IN.GroupIndex & 0x1B ) == 0 )
  {
    float4 Src2 = LoadColor( IN.GroupIndex + 0x02 );
    float4 Src3 = LoadColor( IN.GroupIndex + 0x10 );
    float4 Src4 = LoadColor( IN.GroupIndex + 0x12 );
    Src1 = 0.25 * ( Src1 + Src2 + Src3 + Src4 );

    OutMip3[IN.DispatchThreadID.xy / 4] = Src1;
    StoreColor( IN.GroupIndex, Src1 );
  }

  if ( NumMipLevels == 3 )
    return;

  GroupMemoryBarrierWithGroupSync();

  // This bit mask would be 111111 (X & Y multiples of 8), but only one
  // thread fits that criterion.
  if ( IN.GroupIndex == 0 )
  {
    float4 Src2 = LoadColor( IN.GroupIndex + 0x04 );
    float4 Src3 = LoadColor( IN.GroupIndex + 0x20 );
    float4 Src4 = LoadColor( IN.GroupIndex + 0x24 );
    Src1 = 0.25 * ( Src1 + Src2 + Src3 + Src4 );

    OutMip4[IN.DispatchThreadID.xy / 8] = Src1;
  }
}
#else
static const char str_CSMain_mipgen[] =
"#define BLOCK_SIZE 8\n"
"\n"
"// When reducing the size of a texture, it could be that downscaling the texture\n"
"// will result in less than exactly 50% (1/2) of the original texture size.\n"
"// This happens if either the width, or the height (or both) dimensions of the texture\n"
"// are odd. For example, downscaling a 5x3 texture will result in a 2x1 texture which\n"
"// has a 60% reduction in the texture width and 66% reduction in the height.\n"
"// When this happens, we need to take more samples from the source texture to\n"
"// determine the pixel value in the destination texture.\n"
"\n"
"#define WIDTH_HEIGHT_EVEN 0 // Both the width and the height of the texture are even.\n"
"#define WIDTH_ODD_HEIGHT_EVEN 1 // The texture width is odd and the height is even.\n"
"#define WIDTH_EVEN_HEIGHT_ODD 2 // The texture width is even and the height is odd.\n"
"#define WIDTH_HEIGHT_ODD 3 // Both the width and height of the texture are odd.\n"
"\n"
"struct ComputeShaderInput\n"
"{\n"
"  uint3 GroupID : SV_GroupID; // 3D index of the thread group in the dispatch.\n"
"  uint3 GroupThreadID : SV_GroupThreadID; // 3D index of local thread ID in a thread group.\n"
"  uint3 DispatchThreadID : SV_DispatchThreadID; // 3D index of global thread ID in the dispatch.\n"
"  uint GroupIndex : SV_GroupIndex; // Flattened local index of the thread within a thread group.\n"
"};\n"
"\n"
"cbuffer GenerateMipsCB : register( b0 )\n"
"{\n"
"  uint SrcMipLevel; // Texture level of source mip\n"
"  uint NumMipLevels; // Number of OutMips to write: [1-4]\n"
"  uint SrcDimension; // Width and height of the source texture are even or odd.\n"
"  uint padding;\n"
"  float2 TexelSize; // 1.0 / OutMip1.Dimensions\n"
"}\n"
"\n"
"// Source mip map.\n"
"Texture2D<float4> SrcMip : register( t0 );\n"
"\n"
"// Write up to 4 mip map levels.\n"
"RWTexture2D<float4> OutMip1 : register( u0 );\n"
"RWTexture2D<float4> OutMip2 : register( u1 );\n"
"RWTexture2D<float4> OutMip3 : register( u2 );\n"
"RWTexture2D<float4> OutMip4 : register( u3 );\n"
"\n"
"// Linear clamp sampler.\n"
"SamplerState LinearClampSampler : register( s0 );\n"
"\n"
"// The reason for separating channels is to reduce bank conflicts in the\n"
"// local data memory controller. A large stride will cause more threads\n"
"// to collide on the same memory bank.\n"
"groupshared float gs_R[64];\n"
"groupshared float gs_G[64];\n"
"groupshared float gs_B[64];\n"
"groupshared float gs_A[64];\n"
"\n"
"void StoreColor( uint Index, float4 Color )\n"
"{\n"
"  gs_R[Index] = Color.r;\n"
"  gs_G[Index] = Color.g;\n"
"  gs_B[Index] = Color.b;\n"
"  gs_A[Index] = Color.a;\n"
"}\n"
"\n"
"float4 LoadColor( uint Index )\n"
"{\n"
"  return float4( gs_R[Index], gs_G[Index], gs_B[Index], gs_A[Index] );\n"
"}\n"
"\n"
"[numthreads( BLOCK_SIZE, BLOCK_SIZE, 1 )]\n"
"void ENTRY_POINT( ComputeShaderInput IN )\n"
"{\n"
"  float4 Src1 = (float4)0;\n"
"\n"
"  // One bilinear sample is insufficient when scaling down by more than 2x.\n"
"  // You will slightly undersample in the case where the source dimension\n"
"  // is odd. This is why it's a really good idea to only generate mips on\n"
"  // power-of-two sized textures. Trying to handle the undersampling case\n"
"  // will force this shader to be slower and more complicated as it will\n"
"  // have to take more source texture samples.\n"
"\n"
"  // Determine the path to use based on the dimension of the\n"
"  // source texture.\n"
"  // 0b00(0): Both width and height are even.\n"
"  // 0b01(1): Width is odd, height is even.\n"
"  // 0b10(2): Width is even, height is odd.\n"
"  // 0b11(3): Both width and height are odd.\n"
"  switch ( SrcDimension )\n"
"  {\n"
"    case WIDTH_HEIGHT_EVEN:\n"
"    {\n"
"      float2 UV = TexelSize * ( IN.DispatchThreadID.xy + 0.5 );\n"
"\n"
"      Src1 = SrcMip.SampleLevel( LinearClampSampler, UV, SrcMipLevel );\n"
"    }\n"
"    break;\n"
"    case WIDTH_ODD_HEIGHT_EVEN:\n"
"    {\n"
"      // > 2:1 in X dimension\n"
"      // Use 2 bilinear samples to guarantee we don't undersample when downsizing by more than 2x\n"
"      // horizontally.\n"
"      float2 UV1 = TexelSize * ( IN.DispatchThreadID.xy + float2( 0.25, 0.5 ) );\n"
"      float2 Off = TexelSize * float2( 0.5, 0.0 );\n"
"\n"
"      Src1 = 0.5 * ( SrcMip.SampleLevel( LinearClampSampler, UV1, SrcMipLevel ) +\n"
"                     SrcMip.SampleLevel( LinearClampSampler, UV1 + Off, SrcMipLevel ) );\n"
"    }\n"
"    break;\n"
"    case WIDTH_EVEN_HEIGHT_ODD:\n"
"    {\n"
"      // > 2:1 in Y dimension\n"
"      // Use 2 bilinear samples to guarantee we don't undersample when downsizing by more than 2x\n"
"      // vertically.\n"
"      float2 UV1 = TexelSize * ( IN.DispatchThreadID.xy + float2( 0.5, 0.25 ) );\n"
"      float2 Off = TexelSize * float2( 0.0, 0.5 );\n"
"\n"
"      Src1 = 0.5 * ( SrcMip.SampleLevel( LinearClampSampler, UV1, SrcMipLevel ) +\n"
"                     SrcMip.SampleLevel( LinearClampSampler, UV1 + Off, SrcMipLevel ) );\n"
"    }\n"
"    break;\n"
"    case WIDTH_HEIGHT_ODD:\n"
"    {\n"
"      // > 2:1 in both dimensions\n"
"      // Use 4 bilinear samples to guarantee we don't undersample when downsizing by more than 2x\n"
"      // in both directions.\n"
"      float2 UV1 = TexelSize * ( IN.DispatchThreadID.xy + float2( 0.25, 0.25 ) );\n"
"      float2 Off = TexelSize * 0.5;\n"
"\n"
"      Src1 = SrcMip.SampleLevel( LinearClampSampler, UV1, SrcMipLevel );\n"
"      Src1 += SrcMip.SampleLevel( LinearClampSampler, UV1 + float2( Off.x, 0.0 ), SrcMipLevel );\n"
"      Src1 += SrcMip.SampleLevel( LinearClampSampler, UV1 + float2( 0.0, Off.y ), SrcMipLevel );\n"
"      Src1 += SrcMip.SampleLevel( LinearClampSampler, UV1 + float2( Off.x, Off.y ), SrcMipLevel );\n"
"      Src1 *= 0.25;\n"
"    }\n"
"    break;\n"
"  }\n"
"\n"
"  OutMip1[IN.DispatchThreadID.xy] = Src1;\n"
"\n"
"  // A scalar (constant) branch can exit all threads coherently.\n"
"  if ( NumMipLevels == 1 )\n"
"    return;\n"
"\n"
"  // Without lane swizzle operations, the only way to share data with other\n"
"  // threads is through LDS.\n"
"  StoreColor( IN.GroupIndex, Src1 );\n"
"\n"
"  // This guarantees all LDS writes are complete and that all threads have\n"
"  // executed all instructions so far (and therefore have issued their LDS\n"
"  // write instructions.)\n"
"  GroupMemoryBarrierWithGroupSync();\n"
"\n"
"  // With low three bits for X and high three bits for Y, this bit mask\n"
"  // (binary: 001001) checks that X and Y are even.\n"
"  if ( ( IN.GroupIndex & 0x9 ) == 0 )\n"
"  {\n"
"    float4 Src2 = LoadColor( IN.GroupIndex + 0x01 );\n"
"    float4 Src3 = LoadColor( IN.GroupIndex + 0x08 );\n"
"    float4 Src4 = LoadColor( IN.GroupIndex + 0x09 );\n"
"    Src1 = 0.25 * ( Src1 + Src2 + Src3 + Src4 );\n"
"\n"
"    OutMip2[IN.DispatchThreadID.xy / 2] = Src1;\n"
"    StoreColor( IN.GroupIndex, Src1 );\n"
"  }\n"
"\n"
"  if ( NumMipLevels == 2 )\n"
"    return;\n"
"\n"
"  GroupMemoryBarrierWithGroupSync();\n"
"\n"
"  // This bit mask (binary: 011011) checks that X and Y are multiples of four.\n"
"  if ( ( IN.GroupIndex & 0x1B ) == 0 )\n"
"  {\n"
"    float4 Src2 = LoadColor( IN.GroupIndex + 0x02 );\n"
"    float4 Src3 = LoadColor( IN.GroupIndex + 0x10 );\n"
"    float4 Src4 = LoadColor( IN.GroupIndex + 0x12 );\n"
"    Src1 = 0.25 * ( Src1 + Src2 + Src3 + Src4 );\n"
"\n"
"    OutMip3[IN.DispatchThreadID.xy / 4] = Src1;\n"
"    StoreColor( IN.GroupIndex, Src1 );\n"
"  }\n"
"\n"
"  if ( NumMipLevels == 3 )\n"
"    return;\n"
"\n"
"  GroupMemoryBarrierWithGroupSync();\n"
"\n"
"  // This bit mask would be 111111 (X & Y multiples of 8), but only one\n"
"  // thread fits that criterion.\n"
"  if ( IN.GroupIndex == 0 )\n"
"  {\n"
"    float4 Src2 = LoadColor( IN.GroupIndex + 0x04 );\n"
"    float4 Src3 = LoadColor( IN.GroupIndex + 0x20 );\n"
"    float4 Src4 = LoadColor( IN.GroupIndex + 0x24 );\n"
"    Src1 = 0.25 * ( Src1 + Src2 + Src3 + Src4 );\n"
"\n"
"    OutMip4[IN.DispatchThreadID.xy / 8] = Src1;\n"
"  }\n"
"}\n";
#endif