From 65c23885d448b5cd275917a668e21c6f398725a2 Mon Sep 17 00:00:00 2001
From: Seungha Yang
Date: Fri, 10 Jan 2025 00:38:39 +0900
Subject: [PATCH] d3d12mipmapping: Skip alpha sampling if possible

If neither the input format nor the output format has alpha, skip
alpha sampling, which can reduce the number of instruction slots.

Part-of:
---
 .../gst/d3d12/gstd3d12mipgen-private.h        |   4 +-
 .../gst-libs/gst/d3d12/gstd3d12mipgen.cpp     |   5 +-
 .../gst/d3dshader/gstd3dshadercache.cpp       |   2 +
 .../gst/d3dshader/gstd3dshadercache.h         |   2 +
 .../plugin-hlsl/CSMain_mipgen_ayuv.hlsl       | 417 ++++++++++++++++++
 .../plugin-hlsl/CSMain_mipgen_vuya.hlsl       | 417 ++++++++++++++++++
 .../gst-libs/gst/d3dshader/plugin-hlsl/hlsl.h |   2 +
 .../gst/d3dshader/plugin-hlsl/meson.build     |   2 +
 .../sys/d3d12/gstd3d12mipmapping.cpp          |  11 +-
 9 files changed, 857 insertions(+), 5 deletions(-)
 create mode 100644 subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/plugin-hlsl/CSMain_mipgen_ayuv.hlsl
 create mode 100644 subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/plugin-hlsl/CSMain_mipgen_vuya.hlsl

diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12mipgen-private.h b/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12mipgen-private.h
index ceed7308cd..9266f85d1c 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12mipgen-private.h
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12mipgen-private.h
@@ -21,6 +21,7 @@
 
 #include 
 #include 
+#include 
 
 G_BEGIN_DECLS
 
@@ -30,7 +31,8 @@ GST_D3D12_API
 G_DECLARE_FINAL_TYPE (GstD3D12MipGen, gst_d3d12_mip_gen,
     GST, D3D12_MIP_GEN, GstObject);
 
 GST_D3D12_API
-GstD3D12MipGen * gst_d3d12_mip_gen_new (GstD3D12Device * device);
+GstD3D12MipGen * gst_d3d12_mip_gen_new (GstD3D12Device * device,
+    GstD3DPluginCS cs_type);
 
 GST_D3D12_API
 gboolean gst_d3d12_mip_gen_execute (GstD3D12MipGen * gen,
diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12mipgen.cpp b/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12mipgen.cpp
index 853805cf33..61308e0fb5 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12mipgen.cpp
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/d3d12/gstd3d12mipgen.cpp
@@ -129,7 +129,7 @@ gst_d3d12_mip_gen_finalize (GObject * object)
 }
 
 GstD3D12MipGen *
-gst_d3d12_mip_gen_new (GstD3D12Device * device)
+gst_d3d12_mip_gen_new (GstD3D12Device * device, GstD3DPluginCS cs_type)
 {
   g_return_val_if_fail (GST_IS_D3D12_DEVICE (device), nullptr);
 
@@ -197,8 +197,7 @@ gst_d3d12_mip_gen_new (GstD3D12Device * device)
   }
 
   GstD3DShaderByteCode byte_code;
-  if (!gst_d3d_plugin_shader_get_cs_blob (GST_D3D_PLUGIN_CS_MIP_GEN,
-          GST_D3D_SM_5_0, &byte_code)) {
+  if (!gst_d3d_plugin_shader_get_cs_blob (cs_type, GST_D3D_SM_5_0, &byte_code)) {
     GST_ERROR_OBJECT (self, "Couldn't get shader byte code");
     gst_object_unref (self);
     return nullptr;
diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/gstd3dshadercache.cpp b/subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/gstd3dshadercache.cpp
index ef9eeefc20..f19ba362a7 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/gstd3dshadercache.cpp
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/gstd3dshadercache.cpp
@@ -88,6 +88,8 @@ static const ShaderItem g_vs_map[] = {
 
 static const ShaderItem g_cs_map[] = {
   {GST_D3D_PLUGIN_CS_MIP_GEN, BUILD_SOURCE (CSMain_mipgen)},
+  {GST_D3D_PLUGIN_CS_MIP_GEN_VUYA, BUILD_SOURCE (CSMain_mipgen_vuya)},
+  {GST_D3D_PLUGIN_CS_MIP_GEN_AYUV, BUILD_SOURCE (CSMain_mipgen_ayuv)},
   {GST_D3D_PLUGIN_CS_YADIF_1, BUILD_SOURCE (CSMain_yadif_1)},
   {GST_D3D_PLUGIN_CS_YADIF_1_10, BUILD_SOURCE (CSMain_yadif_1_10)},
   {GST_D3D_PLUGIN_CS_YADIF_1_12, BUILD_SOURCE (CSMain_yadif_1_12)},
diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/gstd3dshadercache.h b/subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/gstd3dshadercache.h
index 8601183974..aecb316531 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/gstd3dshadercache.h
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/gstd3dshadercache.h
@@ -54,6 +54,8 @@ typedef enum
 
 typedef enum
 {
   GST_D3D_PLUGIN_CS_MIP_GEN,
+  GST_D3D_PLUGIN_CS_MIP_GEN_VUYA,
+  GST_D3D_PLUGIN_CS_MIP_GEN_AYUV,
   GST_D3D_PLUGIN_CS_YADIF_1,
   GST_D3D_PLUGIN_CS_YADIF_1_10,
   GST_D3D_PLUGIN_CS_YADIF_1_12,
diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/plugin-hlsl/CSMain_mipgen_ayuv.hlsl b/subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/plugin-hlsl/CSMain_mipgen_ayuv.hlsl
new file mode 100644
index 0000000000..29a2183466
--- /dev/null
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/plugin-hlsl/CSMain_mipgen_ayuv.hlsl
@@ -0,0 +1,417 @@
+/**
+ * MIT License
+ *
+ * Copyright (c) 2018 Jeremiah van Oosten
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Source: https://github.com/jpvanoosten/LearningDirectX12 */
+
+#ifdef BUILDING_HLSL
+
+#define BLOCK_SIZE 8
+
+// When reducing the size of a texture, it could be that downscaling the texture
+// will result in less than exactly 50% (1/2) of the original texture size.
+// This happens if either the width, or the height (or both) dimensions of the texture
+// are odd. For example, downscaling a 5x3 texture will result in a 2x1 texture which
+// has a 60% reduction in the texture width and 66% reduction in the height.
+// When this happens, we need to take more samples from the source texture to
+// determine the pixel value in the destination texture.
+
+#define WIDTH_HEIGHT_EVEN 0     // Both the width and the height of the texture are even.
+#define WIDTH_ODD_HEIGHT_EVEN 1 // The texture width is odd and the height is even.
+#define WIDTH_EVEN_HEIGHT_ODD 2 // The texture width is even and the height is odd.
+#define WIDTH_HEIGHT_ODD 3      // Both the width and height of the texture are odd.
+
+struct ComputeShaderInput
+{
+    uint3 GroupID : SV_GroupID;                     // 3D index of the thread group in the dispatch.
+    uint3 GroupThreadID : SV_GroupThreadID;         // 3D index of local thread ID in a thread group.
+    uint3 DispatchThreadID : SV_DispatchThreadID;   // 3D index of global thread ID in the dispatch.
+    uint GroupIndex : SV_GroupIndex;                // Flattened local index of the thread within a thread group.
+};
+
+cbuffer GenerateMipsCB : register( b0 )
+{
+    uint SrcMipLevel;   // Texture level of source mip
+    uint NumMipLevels;  // Number of OutMips to write: [1-4]
+    uint SrcDimension;  // Width and height of the source texture are even or odd.
+    uint padding;
+    float2 TexelSize;   // 1.0 / OutMip1.Dimensions
+}
+
+// Source mip map.
+Texture2D<float4> SrcMip : register( t0 );
+
+// Write up to 4 mip map levels.
+RWTexture2D<float4> OutMip1 : register( u0 );
+RWTexture2D<float4> OutMip2 : register( u1 );
+RWTexture2D<float4> OutMip3 : register( u2 );
+RWTexture2D<float4> OutMip4 : register( u3 );
+
+// Linear clamp sampler.
+SamplerState LinearClampSampler : register( s0 );
+
+// The reason for separating channels is to reduce bank conflicts in the
+// local data memory controller. A large stride will cause more threads
+// to collide on the same memory bank.
+groupshared float gs_Y[64];
+groupshared float gs_U[64];
+groupshared float gs_V[64];
+
+void StoreColor( uint Index, float3 Color )
+{
+    gs_Y[Index] = Color.x;
+    gs_U[Index] = Color.y;
+    gs_V[Index] = Color.z;
+}
+
+float3 LoadColor( uint Index )
+{
+    return float3( gs_Y[Index], gs_U[Index], gs_V[Index] );
+}
+
+[numthreads( BLOCK_SIZE, BLOCK_SIZE, 1 )]
+void ENTRY_POINT( ComputeShaderInput IN )
+{
+    float3 Src1 = (float3)0;
+
+    // One bilinear sample is insufficient when scaling down by more than 2x.
+    // You will slightly undersample in the case where the source dimension
+    // is odd. This is why it's a really good idea to only generate mips on
+    // power-of-two sized textures. Trying to handle the undersampling case
+    // will force this shader to be slower and more complicated as it will
+    // have to take more source texture samples.
+
+    // Determine the path to use based on the dimension of the
+    // source texture.
+    // 0b00(0): Both width and height are even.
+    // 0b01(1): Width is odd, height is even.
+    // 0b10(2): Width is even, height is odd.
+    // 0b11(3): Both width and height are odd.
+    switch ( SrcDimension )
+    {
+        case WIDTH_HEIGHT_EVEN:
+        {
+            float2 UV = TexelSize * ( IN.DispatchThreadID.xy + 0.5 );
+
+            Src1 = SrcMip.SampleLevel( LinearClampSampler, UV, SrcMipLevel ).yzw;
+        }
+        break;
+        case WIDTH_ODD_HEIGHT_EVEN:
+        {
+            // > 2:1 in X dimension
+            // Use 2 bilinear samples to guarantee we don't undersample when downsizing by more than 2x
+            // horizontally.
+            float2 UV1 = TexelSize * ( IN.DispatchThreadID.xy + float2( 0.25, 0.5 ) );
+            float2 Off = TexelSize * float2( 0.5, 0.0 );
+
+            Src1 = 0.5 * ( SrcMip.SampleLevel( LinearClampSampler, UV1, SrcMipLevel ).yzw +
+                           SrcMip.SampleLevel( LinearClampSampler, UV1 + Off, SrcMipLevel ).yzw );
+        }
+        break;
+        case WIDTH_EVEN_HEIGHT_ODD:
+        {
+            // > 2:1 in Y dimension
+            // Use 2 bilinear samples to guarantee we don't undersample when downsizing by more than 2x
+            // vertically.
+            float2 UV1 = TexelSize * ( IN.DispatchThreadID.xy + float2( 0.5, 0.25 ) );
+            float2 Off = TexelSize * float2( 0.0, 0.5 );
+
+            Src1 = 0.5 * ( SrcMip.SampleLevel( LinearClampSampler, UV1, SrcMipLevel ).yzw +
+                           SrcMip.SampleLevel( LinearClampSampler, UV1 + Off, SrcMipLevel ).yzw );
+        }
+        break;
+        case WIDTH_HEIGHT_ODD:
+        {
+            // > 2:1 in both dimensions
+            // Use 4 bilinear samples to guarantee we don't undersample when downsizing by more than 2x
+            // in both directions.
+            float2 UV1 = TexelSize * ( IN.DispatchThreadID.xy + float2( 0.25, 0.25 ) );
+            float2 Off = TexelSize * 0.5;
+
+            Src1 = SrcMip.SampleLevel( LinearClampSampler, UV1, SrcMipLevel ).yzw;
+            Src1 += SrcMip.SampleLevel( LinearClampSampler, UV1 + float2( Off.x, 0.0 ), SrcMipLevel ).yzw;
+            Src1 += SrcMip.SampleLevel( LinearClampSampler, UV1 + float2( 0.0, Off.y ), SrcMipLevel ).yzw;
+            Src1 += SrcMip.SampleLevel( LinearClampSampler, UV1 + float2( Off.x, Off.y ), SrcMipLevel ).yzw;
+            Src1 *= 0.25;
+        }
+        break;
+    }
+
+    OutMip1[IN.DispatchThreadID.xy] = float4(1.0f, Src1);
+
+    // A scalar (constant) branch can exit all threads coherently.
+    if ( NumMipLevels == 1 )
+        return;
+
+    // Without lane swizzle operations, the only way to share data with other
+    // threads is through LDS.
+    StoreColor( IN.GroupIndex, Src1 );
+
+    // This guarantees all LDS writes are complete and that all threads have
+    // executed all instructions so far (and therefore have issued their LDS
+    // write instructions.)
+    GroupMemoryBarrierWithGroupSync();
+
+    // With low three bits for X and high three bits for Y, this bit mask
+    // (binary: 001001) checks that X and Y are even.
+    if ( ( IN.GroupIndex & 0x9 ) == 0 )
+    {
+        float3 Src2 = LoadColor( IN.GroupIndex + 0x01 );
+        float3 Src3 = LoadColor( IN.GroupIndex + 0x08 );
+        float3 Src4 = LoadColor( IN.GroupIndex + 0x09 );
+        Src1 = 0.25 * ( Src1 + Src2 + Src3 + Src4 );
+
+        OutMip2[IN.DispatchThreadID.xy / 2] = float4(1.0f, Src1);
+        StoreColor( IN.GroupIndex, Src1 );
+    }
+
+    if ( NumMipLevels == 2 )
+        return;
+
+    GroupMemoryBarrierWithGroupSync();
+
+    // This bit mask (binary: 011011) checks that X and Y are multiples of four.
+    if ( ( IN.GroupIndex & 0x1B ) == 0 )
+    {
+        float3 Src2 = LoadColor( IN.GroupIndex + 0x02 );
+        float3 Src3 = LoadColor( IN.GroupIndex + 0x10 );
+        float3 Src4 = LoadColor( IN.GroupIndex + 0x12 );
+        Src1 = 0.25 * ( Src1 + Src2 + Src3 + Src4 );
+
+        OutMip3[IN.DispatchThreadID.xy / 4] = float4(1.0f, Src1);
+        StoreColor( IN.GroupIndex, Src1 );
+    }
+
+    if ( NumMipLevels == 3 )
+        return;
+
+    GroupMemoryBarrierWithGroupSync();
+
+    // This bit mask would be 111111 (X & Y multiples of 8), but only one
+    // thread fits that criterion.
+    if ( IN.GroupIndex == 0 )
+    {
+        float3 Src2 = LoadColor( IN.GroupIndex + 0x04 );
+        float3 Src3 = LoadColor( IN.GroupIndex + 0x20 );
+        float3 Src4 = LoadColor( IN.GroupIndex + 0x24 );
+        Src1 = 0.25 * ( Src1 + Src2 + Src3 + Src4 );
+
+        OutMip4[IN.DispatchThreadID.xy / 8] = float4(1.0f, Src1);
+    }
+}
+#else
+static const char str_CSMain_mipgen_ayuv[] =
+"#define BLOCK_SIZE 8\n"
+"\n"
+"// When reducing the size of a texture, it could be that downscaling the texture\n"
+"// will result in less than exactly 50% (1/2) of the original texture size.\n"
+"// This happens if either the width, or the height (or both) dimensions of the texture\n"
+"// are odd. For example, downscaling a 5x3 texture will result in a 2x1 texture which\n"
+"// has a 60% reduction in the texture width and 66% reduction in the height.\n"
+"// When this happens, we need to take more samples from the source texture to\n"
+"// determine the pixel value in the destination texture.\n"
+"\n"
+"#define WIDTH_HEIGHT_EVEN 0     // Both the width and the height of the texture are even.\n"
+"#define WIDTH_ODD_HEIGHT_EVEN 1 // The texture width is odd and the height is even.\n"
+"#define WIDTH_EVEN_HEIGHT_ODD 2 // The texture width is even and the height is odd.\n"
+"#define WIDTH_HEIGHT_ODD 3      // Both the width and height of the texture are odd.\n"
+"\n"
+"struct ComputeShaderInput\n"
+"{\n"
+"    uint3 GroupID : SV_GroupID;                     // 3D index of the thread group in the dispatch.\n"
+"    uint3 GroupThreadID : SV_GroupThreadID;         // 3D index of local thread ID in a thread group.\n"
+"    uint3 DispatchThreadID : SV_DispatchThreadID;   // 3D index of global thread ID in the dispatch.\n"
+"    uint GroupIndex : SV_GroupIndex;                // Flattened local index of the thread within a thread group.\n"
+"};\n"
+"\n"
+"cbuffer GenerateMipsCB : register( b0 )\n"
+"{\n"
+"    uint SrcMipLevel;   // Texture level of source mip\n"
+"    uint NumMipLevels;  // Number of OutMips to write: [1-4]\n"
+"    uint SrcDimension;  // Width and height of the source texture are even or odd.\n"
+"    uint padding;\n"
+"    float2 TexelSize;   // 1.0 / OutMip1.Dimensions\n"
+"}\n"
+"\n"
+"// Source mip map.\n"
+"Texture2D<float4> SrcMip : register( t0 );\n"
+"\n"
+"// Write up to 4 mip map levels.\n"
+"RWTexture2D<float4> OutMip1 : register( u0 );\n"
+"RWTexture2D<float4> OutMip2 : register( u1 );\n"
+"RWTexture2D<float4> OutMip3 : register( u2 );\n"
+"RWTexture2D<float4> OutMip4 : register( u3 );\n"
+"\n"
+"// Linear clamp sampler.\n"
+"SamplerState LinearClampSampler : register( s0 );\n"
+"\n"
+"// The reason for separating channels is to reduce bank conflicts in the\n"
+"// local data memory controller. A large stride will cause more threads\n"
+"// to collide on the same memory bank.\n"
+"groupshared float gs_Y[64];\n"
+"groupshared float gs_U[64];\n"
+"groupshared float gs_V[64];\n"
+"\n"
+"void StoreColor( uint Index, float3 Color )\n"
+"{\n"
+"    gs_Y[Index] = Color.x;\n"
+"    gs_U[Index] = Color.y;\n"
+"    gs_V[Index] = Color.z;\n"
+"}\n"
+"\n"
+"float3 LoadColor( uint Index )\n"
+"{\n"
+"    return float3( gs_Y[Index], gs_U[Index], gs_V[Index] );\n"
+"}\n"
+"\n"
+"[numthreads( BLOCK_SIZE, BLOCK_SIZE, 1 )]\n"
+"void ENTRY_POINT( ComputeShaderInput IN )\n"
+"{\n"
+"    float3 Src1 = (float3)0;\n"
+"\n"
+"    // One bilinear sample is insufficient when scaling down by more than 2x.\n"
+"    // You will slightly undersample in the case where the source dimension\n"
+"    // is odd. This is why it's a really good idea to only generate mips on\n"
+"    // power-of-two sized textures. Trying to handle the undersampling case\n"
+"    // will force this shader to be slower and more complicated as it will\n"
+"    // have to take more source texture samples.\n"
+"\n"
+"    // Determine the path to use based on the dimension of the\n"
+"    // source texture.\n"
+"    // 0b00(0): Both width and height are even.\n"
+"    // 0b01(1): Width is odd, height is even.\n"
+"    // 0b10(2): Width is even, height is odd.\n"
+"    // 0b11(3): Both width and height are odd.\n"
+"    switch ( SrcDimension )\n"
+"    {\n"
+"        case WIDTH_HEIGHT_EVEN:\n"
+"        {\n"
+"            float2 UV = TexelSize * ( IN.DispatchThreadID.xy + 0.5 );\n"
+"\n"
+"            Src1 = SrcMip.SampleLevel( LinearClampSampler, UV, SrcMipLevel ).yzw;\n"
+"        }\n"
+"        break;\n"
+"        case WIDTH_ODD_HEIGHT_EVEN:\n"
+"        {\n"
+"            // > 2:1 in X dimension\n"
+"            // Use 2 bilinear samples to guarantee we don't undersample when downsizing by more than 2x\n"
+"            // horizontally.\n"
+"            float2 UV1 = TexelSize * ( IN.DispatchThreadID.xy + float2( 0.25, 0.5 ) );\n"
+"            float2 Off = TexelSize * float2( 0.5, 0.0 );\n"
+"\n"
+"            Src1 = 0.5 * ( SrcMip.SampleLevel( LinearClampSampler, UV1, SrcMipLevel ).yzw +\n"
+"                           SrcMip.SampleLevel( LinearClampSampler, UV1 + Off, SrcMipLevel ).yzw );\n"
+"        }\n"
+"        break;\n"
+"        case WIDTH_EVEN_HEIGHT_ODD:\n"
+"        {\n"
+"            // > 2:1 in Y dimension\n"
+"            // Use 2 bilinear samples to guarantee we don't undersample when downsizing by more than 2x\n"
+"            // vertically.\n"
+"            float2 UV1 = TexelSize * ( IN.DispatchThreadID.xy + float2( 0.5, 0.25 ) );\n"
+"            float2 Off = TexelSize * float2( 0.0, 0.5 );\n"
+"\n"
+"            Src1 = 0.5 * ( SrcMip.SampleLevel( LinearClampSampler, UV1, SrcMipLevel ).yzw +\n"
+"                           SrcMip.SampleLevel( LinearClampSampler, UV1 + Off, SrcMipLevel ).yzw );\n"
+"        }\n"
+"        break;\n"
+"        case WIDTH_HEIGHT_ODD:\n"
+"        {\n"
+"            // > 2:1 in both dimensions\n"
+"            // Use 4 bilinear samples to guarantee we don't undersample when downsizing by more than 2x\n"
+"            // in both directions.\n"
+"            float2 UV1 = TexelSize * ( IN.DispatchThreadID.xy + float2( 0.25, 0.25 ) );\n"
+"            float2 Off = TexelSize * 0.5;\n"
+"\n"
+"            Src1 = SrcMip.SampleLevel( LinearClampSampler, UV1, SrcMipLevel ).yzw;\n"
+"            Src1 += SrcMip.SampleLevel( LinearClampSampler, UV1 + float2( Off.x, 0.0 ), SrcMipLevel ).yzw;\n"
+"            Src1 += SrcMip.SampleLevel( LinearClampSampler, UV1 + float2( 0.0, Off.y ), SrcMipLevel ).yzw;\n"
+"            Src1 += SrcMip.SampleLevel( LinearClampSampler, UV1 + float2( Off.x, Off.y ), SrcMipLevel ).yzw;\n"
+"            Src1 *= 0.25;\n"
+"        }\n"
+"        break;\n"
+"    }\n"
+"\n"
+"    OutMip1[IN.DispatchThreadID.xy] = float4(1.0f, Src1);\n"
+"\n"
+"    // A scalar (constant) branch can exit all threads coherently.\n"
+"    if ( NumMipLevels == 1 )\n"
+"        return;\n"
+"\n"
+"    // Without lane swizzle operations, the only way to share data with other\n"
+"    // threads is through LDS.\n"
+"    StoreColor( IN.GroupIndex, Src1 );\n"
+"\n"
+"    // This guarantees all LDS writes are complete and that all threads have\n"
+"    // executed all instructions so far (and therefore have issued their LDS\n"
+"    // write instructions.)\n"
+"    GroupMemoryBarrierWithGroupSync();\n"
+"\n"
+"    // With low three bits for X and high three bits for Y, this bit mask\n"
+"    // (binary: 001001) checks that X and Y are even.\n"
+"    if ( ( IN.GroupIndex & 0x9 ) == 0 )\n"
+"    {\n"
+"        float3 Src2 = LoadColor( IN.GroupIndex + 0x01 );\n"
+"        float3 Src3 = LoadColor( IN.GroupIndex + 0x08 );\n"
+"        float3 Src4 = LoadColor( IN.GroupIndex + 0x09 );\n"
+"        Src1 = 0.25 * ( Src1 + Src2 + Src3 + Src4 );\n"
+"\n"
+"        OutMip2[IN.DispatchThreadID.xy / 2] = float4(1.0f, Src1);\n"
+"        StoreColor( IN.GroupIndex, Src1 );\n"
+"    }\n"
+"\n"
+"    if ( NumMipLevels == 2 )\n"
+"        return;\n"
+"\n"
+"    GroupMemoryBarrierWithGroupSync();\n"
+"\n"
+"    // This bit mask (binary: 011011) checks that X and Y are multiples of four.\n"
+"    if ( ( IN.GroupIndex & 0x1B ) == 0 )\n"
+"    {\n"
+"        float3 Src2 = LoadColor( IN.GroupIndex + 0x02 );\n"
+"        float3 Src3 = LoadColor( IN.GroupIndex + 0x10 );\n"
+"        float3 Src4 = LoadColor( IN.GroupIndex + 0x12 );\n"
+"        Src1 = 0.25 * ( Src1 + Src2 + Src3 + Src4 );\n"
+"\n"
+"        OutMip3[IN.DispatchThreadID.xy / 4] = float4(1.0f, Src1);\n"
+"        StoreColor( IN.GroupIndex, Src1 );\n"
+"    }\n"
+"\n"
+"    if ( NumMipLevels == 3 )\n"
+"        return;\n"
+"\n"
+"    GroupMemoryBarrierWithGroupSync();\n"
+"\n"
+"    // This bit mask would be 111111 (X & Y multiples of 8), but only one\n"
+"    // thread fits that criterion.\n"
+"    if ( IN.GroupIndex == 0 )\n"
+"    {\n"
+"        float3 Src2 = LoadColor( IN.GroupIndex + 0x04 );\n"
+"        float3 Src3 = LoadColor( IN.GroupIndex + 0x20 );\n"
+"        float3 Src4 = LoadColor( IN.GroupIndex + 0x24 );\n"
+"        Src1 = 0.25 * ( Src1 + Src2 + Src3 + Src4 );\n"
+"\n"
+"        OutMip4[IN.DispatchThreadID.xy / 8] = float4(1.0f, Src1);\n"
+"    }\n"
+"}\n";
+#endif
diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/plugin-hlsl/CSMain_mipgen_vuya.hlsl b/subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/plugin-hlsl/CSMain_mipgen_vuya.hlsl
new file mode 100644
index 0000000000..b53e67b2cf
--- /dev/null
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/plugin-hlsl/CSMain_mipgen_vuya.hlsl
@@ -0,0 +1,417 @@
+/**
+ * MIT License
+ *
+ * Copyright (c) 2018 Jeremiah van Oosten
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Source: https://github.com/jpvanoosten/LearningDirectX12 */
+
+#ifdef BUILDING_HLSL
+
+#define BLOCK_SIZE 8
+
+// When reducing the size of a texture, it could be that downscaling the texture
+// will result in less than exactly 50% (1/2) of the original texture size.
+// This happens if either the width, or the height (or both) dimensions of the texture
+// are odd. For example, downscaling a 5x3 texture will result in a 2x1 texture which
+// has a 60% reduction in the texture width and 66% reduction in the height.
+// When this happens, we need to take more samples from the source texture to
+// determine the pixel value in the destination texture.
+
+#define WIDTH_HEIGHT_EVEN 0     // Both the width and the height of the texture are even.
+#define WIDTH_ODD_HEIGHT_EVEN 1 // The texture width is odd and the height is even.
+#define WIDTH_EVEN_HEIGHT_ODD 2 // The texture width is even and the height is odd.
+#define WIDTH_HEIGHT_ODD 3      // Both the width and height of the texture are odd.
+
+struct ComputeShaderInput
+{
+    uint3 GroupID : SV_GroupID;                     // 3D index of the thread group in the dispatch.
+    uint3 GroupThreadID : SV_GroupThreadID;         // 3D index of local thread ID in a thread group.
+    uint3 DispatchThreadID : SV_DispatchThreadID;   // 3D index of global thread ID in the dispatch.
+    uint GroupIndex : SV_GroupIndex;                // Flattened local index of the thread within a thread group.
+};
+
+cbuffer GenerateMipsCB : register( b0 )
+{
+    uint SrcMipLevel;   // Texture level of source mip
+    uint NumMipLevels;  // Number of OutMips to write: [1-4]
+    uint SrcDimension;  // Width and height of the source texture are even or odd.
+    uint padding;
+    float2 TexelSize;   // 1.0 / OutMip1.Dimensions
+}
+
+// Source mip map.
+Texture2D<float4> SrcMip : register( t0 );
+
+// Write up to 4 mip map levels.
+RWTexture2D<float4> OutMip1 : register( u0 );
+RWTexture2D<float4> OutMip2 : register( u1 );
+RWTexture2D<float4> OutMip3 : register( u2 );
+RWTexture2D<float4> OutMip4 : register( u3 );
+
+// Linear clamp sampler.
+SamplerState LinearClampSampler : register( s0 );
+
+// The reason for separating channels is to reduce bank conflicts in the
+// local data memory controller. A large stride will cause more threads
+// to collide on the same memory bank.
+groupshared float gs_Y[64];
+groupshared float gs_U[64];
+groupshared float gs_V[64];
+
+void StoreColor( uint Index, float3 Color )
+{
+    gs_Y[Index] = Color.x;
+    gs_U[Index] = Color.y;
+    gs_V[Index] = Color.z;
+}
+
+float3 LoadColor( uint Index )
+{
+    return float3( gs_Y[Index], gs_U[Index], gs_V[Index] );
+}
+
+[numthreads( BLOCK_SIZE, BLOCK_SIZE, 1 )]
+void ENTRY_POINT( ComputeShaderInput IN )
+{
+    float3 Src1 = (float3)0;
+
+    // One bilinear sample is insufficient when scaling down by more than 2x.
+    // You will slightly undersample in the case where the source dimension
+    // is odd. This is why it's a really good idea to only generate mips on
+    // power-of-two sized textures. Trying to handle the undersampling case
+    // will force this shader to be slower and more complicated as it will
+    // have to take more source texture samples.
+
+    // Determine the path to use based on the dimension of the
+    // source texture.
+    // 0b00(0): Both width and height are even.
+    // 0b01(1): Width is odd, height is even.
+    // 0b10(2): Width is even, height is odd.
+    // 0b11(3): Both width and height are odd.
+    switch ( SrcDimension )
+    {
+        case WIDTH_HEIGHT_EVEN:
+        {
+            float2 UV = TexelSize * ( IN.DispatchThreadID.xy + 0.5 );
+
+            Src1 = SrcMip.SampleLevel( LinearClampSampler, UV, SrcMipLevel ).xyz;
+        }
+        break;
+        case WIDTH_ODD_HEIGHT_EVEN:
+        {
+            // > 2:1 in X dimension
+            // Use 2 bilinear samples to guarantee we don't undersample when downsizing by more than 2x
+            // horizontally.
+            float2 UV1 = TexelSize * ( IN.DispatchThreadID.xy + float2( 0.25, 0.5 ) );
+            float2 Off = TexelSize * float2( 0.5, 0.0 );
+
+            Src1 = 0.5 * ( SrcMip.SampleLevel( LinearClampSampler, UV1, SrcMipLevel ).xyz +
+                           SrcMip.SampleLevel( LinearClampSampler, UV1 + Off, SrcMipLevel ).xyz );
+        }
+        break;
+        case WIDTH_EVEN_HEIGHT_ODD:
+        {
+            // > 2:1 in Y dimension
+            // Use 2 bilinear samples to guarantee we don't undersample when downsizing by more than 2x
+            // vertically.
+            float2 UV1 = TexelSize * ( IN.DispatchThreadID.xy + float2( 0.5, 0.25 ) );
+            float2 Off = TexelSize * float2( 0.0, 0.5 );
+
+            Src1 = 0.5 * ( SrcMip.SampleLevel( LinearClampSampler, UV1, SrcMipLevel ).xyz +
+                           SrcMip.SampleLevel( LinearClampSampler, UV1 + Off, SrcMipLevel ).xyz );
+        }
+        break;
+        case WIDTH_HEIGHT_ODD:
+        {
+            // > 2:1 in both dimensions
+            // Use 4 bilinear samples to guarantee we don't undersample when downsizing by more than 2x
+            // in both directions.
+            float2 UV1 = TexelSize * ( IN.DispatchThreadID.xy + float2( 0.25, 0.25 ) );
+            float2 Off = TexelSize * 0.5;
+
+            Src1 = SrcMip.SampleLevel( LinearClampSampler, UV1, SrcMipLevel ).xyz;
+            Src1 += SrcMip.SampleLevel( LinearClampSampler, UV1 + float2( Off.x, 0.0 ), SrcMipLevel ).xyz;
+            Src1 += SrcMip.SampleLevel( LinearClampSampler, UV1 + float2( 0.0, Off.y ), SrcMipLevel ).xyz;
+            Src1 += SrcMip.SampleLevel( LinearClampSampler, UV1 + float2( Off.x, Off.y ), SrcMipLevel ).xyz;
+            Src1 *= 0.25;
+        }
+        break;
+    }
+
+    OutMip1[IN.DispatchThreadID.xy] = float4(Src1, 1.0f);
+
+    // A scalar (constant) branch can exit all threads coherently.
+    if ( NumMipLevels == 1 )
+        return;
+
+    // Without lane swizzle operations, the only way to share data with other
+    // threads is through LDS.
+    StoreColor( IN.GroupIndex, Src1 );
+
+    // This guarantees all LDS writes are complete and that all threads have
+    // executed all instructions so far (and therefore have issued their LDS
+    // write instructions.)
+    GroupMemoryBarrierWithGroupSync();
+
+    // With low three bits for X and high three bits for Y, this bit mask
+    // (binary: 001001) checks that X and Y are even.
+    if ( ( IN.GroupIndex & 0x9 ) == 0 )
+    {
+        float3 Src2 = LoadColor( IN.GroupIndex + 0x01 );
+        float3 Src3 = LoadColor( IN.GroupIndex + 0x08 );
+        float3 Src4 = LoadColor( IN.GroupIndex + 0x09 );
+        Src1 = 0.25 * ( Src1 + Src2 + Src3 + Src4 );
+
+        OutMip2[IN.DispatchThreadID.xy / 2] = float4(Src1, 1.0f);
+        StoreColor( IN.GroupIndex, Src1 );
+    }
+
+    if ( NumMipLevels == 2 )
+        return;
+
+    GroupMemoryBarrierWithGroupSync();
+
+    // This bit mask (binary: 011011) checks that X and Y are multiples of four.
+    if ( ( IN.GroupIndex & 0x1B ) == 0 )
+    {
+        float3 Src2 = LoadColor( IN.GroupIndex + 0x02 );
+        float3 Src3 = LoadColor( IN.GroupIndex + 0x10 );
+        float3 Src4 = LoadColor( IN.GroupIndex + 0x12 );
+        Src1 = 0.25 * ( Src1 + Src2 + Src3 + Src4 );
+
+        OutMip3[IN.DispatchThreadID.xy / 4] = float4(Src1, 1.0f);
+        StoreColor( IN.GroupIndex, Src1 );
+    }
+
+    if ( NumMipLevels == 3 )
+        return;
+
+    GroupMemoryBarrierWithGroupSync();
+
+    // This bit mask would be 111111 (X & Y multiples of 8), but only one
+    // thread fits that criterion.
+    if ( IN.GroupIndex == 0 )
+    {
+        float3 Src2 = LoadColor( IN.GroupIndex + 0x04 );
+        float3 Src3 = LoadColor( IN.GroupIndex + 0x20 );
+        float3 Src4 = LoadColor( IN.GroupIndex + 0x24 );
+        Src1 = 0.25 * ( Src1 + Src2 + Src3 + Src4 );
+
+        OutMip4[IN.DispatchThreadID.xy / 8] = float4(Src1, 1.0f);
+    }
+}
+#else
+static const char str_CSMain_mipgen_vuya[] =
+"#define BLOCK_SIZE 8\n"
+"\n"
+"// When reducing the size of a texture, it could be that downscaling the texture\n"
+"// will result in less than exactly 50% (1/2) of the original texture size.\n"
+"// This happens if either the width, or the height (or both) dimensions of the texture\n"
+"// are odd. For example, downscaling a 5x3 texture will result in a 2x1 texture which\n"
+"// has a 60% reduction in the texture width and 66% reduction in the height.\n"
+"// When this happens, we need to take more samples from the source texture to\n"
+"// determine the pixel value in the destination texture.\n"
+"\n"
+"#define WIDTH_HEIGHT_EVEN 0     // Both the width and the height of the texture are even.\n"
+"#define WIDTH_ODD_HEIGHT_EVEN 1 // The texture width is odd and the height is even.\n"
+"#define WIDTH_EVEN_HEIGHT_ODD 2 // The texture width is even and the height is odd.\n"
+"#define WIDTH_HEIGHT_ODD 3      // Both the width and height of the texture are odd.\n"
+"\n"
+"struct ComputeShaderInput\n"
+"{\n"
+"    uint3 GroupID : SV_GroupID;                     // 3D index of the thread group in the dispatch.\n"
+"    uint3 GroupThreadID : SV_GroupThreadID;         // 3D index of local thread ID in a thread group.\n"
+"    uint3 DispatchThreadID : SV_DispatchThreadID;   // 3D index of global thread ID in the dispatch.\n"
+"    uint GroupIndex : SV_GroupIndex;                // Flattened local index of the thread within a thread group.\n"
+"};\n"
+"\n"
+"cbuffer GenerateMipsCB : register( b0 )\n"
+"{\n"
+"    uint SrcMipLevel;   // Texture level of source mip\n"
+"    uint NumMipLevels;  // Number of OutMips to write: [1-4]\n"
+"    uint SrcDimension;  // Width and height of the source texture are even or odd.\n"
+"    uint padding;\n"
+"    float2 TexelSize;   // 1.0 / OutMip1.Dimensions\n"
+"}\n"
+"\n"
+"// Source mip map.\n"
+"Texture2D<float4> SrcMip : register( t0 );\n"
+"\n"
+"// Write up to 4 mip map levels.\n"
+"RWTexture2D<float4> OutMip1 : register( u0 );\n"
+"RWTexture2D<float4> OutMip2 : register( u1 );\n"
+"RWTexture2D<float4> OutMip3 : register( u2 );\n"
+"RWTexture2D<float4> OutMip4 : register( u3 );\n"
+"\n"
+"// Linear clamp sampler.\n"
+"SamplerState LinearClampSampler : register( s0 );\n"
+"\n"
+"// The reason for separating channels is to reduce bank conflicts in the\n"
+"// local data memory controller. A large stride will cause more threads\n"
+"// to collide on the same memory bank.\n"
+"groupshared float gs_Y[64];\n"
+"groupshared float gs_U[64];\n"
+"groupshared float gs_V[64];\n"
+"\n"
+"void StoreColor( uint Index, float3 Color )\n"
+"{\n"
+"    gs_Y[Index] = Color.x;\n"
+"    gs_U[Index] = Color.y;\n"
+"    gs_V[Index] = Color.z;\n"
+"}\n"
+"\n"
+"float3 LoadColor( uint Index )\n"
+"{\n"
+"    return float3( gs_Y[Index], gs_U[Index], gs_V[Index] );\n"
+"}\n"
+"\n"
+"[numthreads( BLOCK_SIZE, BLOCK_SIZE, 1 )]\n"
+"void ENTRY_POINT( ComputeShaderInput IN )\n"
+"{\n"
+"    float3 Src1 = (float3)0;\n"
+"\n"
+"    // One bilinear sample is insufficient when scaling down by more than 2x.\n"
+"    // You will slightly undersample in the case where the source dimension\n"
+"    // is odd. This is why it's a really good idea to only generate mips on\n"
+"    // power-of-two sized textures. Trying to handle the undersampling case\n"
+"    // will force this shader to be slower and more complicated as it will\n"
+"    // have to take more source texture samples.\n"
+"\n"
+"    // Determine the path to use based on the dimension of the\n"
+"    // source texture.\n"
+"    // 0b00(0): Both width and height are even.\n"
+"    // 0b01(1): Width is odd, height is even.\n"
+"    // 0b10(2): Width is even, height is odd.\n"
+"    // 0b11(3): Both width and height are odd.\n"
+"    switch ( SrcDimension )\n"
+"    {\n"
+"        case WIDTH_HEIGHT_EVEN:\n"
+"        {\n"
+"            float2 UV = TexelSize * ( IN.DispatchThreadID.xy + 0.5 );\n"
+"\n"
+"            Src1 = SrcMip.SampleLevel( LinearClampSampler, UV, SrcMipLevel ).xyz;\n"
+"        }\n"
+"        break;\n"
+"        case WIDTH_ODD_HEIGHT_EVEN:\n"
+"        {\n"
+"            // > 2:1 in X dimension\n"
+"            // Use 2 bilinear samples to guarantee we don't undersample when downsizing by more than 2x\n"
+"            // horizontally.\n"
+"            float2 UV1 = TexelSize * ( IN.DispatchThreadID.xy + float2( 0.25, 0.5 ) );\n"
+"            float2 Off = TexelSize * float2( 0.5, 0.0 );\n"
+"\n"
+"            Src1 = 0.5 * ( SrcMip.SampleLevel( LinearClampSampler, UV1, SrcMipLevel ).xyz +\n"
+"                           SrcMip.SampleLevel( LinearClampSampler, UV1 + Off, SrcMipLevel ).xyz );\n"
+"        }\n"
+"        break;\n"
+"        case WIDTH_EVEN_HEIGHT_ODD:\n"
+"        {\n"
+"            // > 2:1 in Y dimension\n"
+"            // Use 2 bilinear samples to guarantee we don't undersample when downsizing by more than 2x\n"
+"            // vertically.\n"
+"            float2 UV1 = TexelSize * ( IN.DispatchThreadID.xy + float2( 0.5, 0.25 ) );\n"
+"            float2 Off = TexelSize * float2( 0.0, 0.5 );\n"
+"\n"
+"            Src1 = 0.5 * ( SrcMip.SampleLevel( LinearClampSampler, UV1, SrcMipLevel ).xyz +\n"
+"                           SrcMip.SampleLevel( LinearClampSampler, UV1 + Off, SrcMipLevel ).xyz );\n"
+"        }\n"
+"        break;\n"
+"        case WIDTH_HEIGHT_ODD:\n"
+"        {\n"
+"            // > 2:1 in both dimensions\n"
+"            // Use 4 bilinear samples to guarantee we don't undersample when downsizing by more than 2x\n"
+"            // in both directions.\n"
+"            float2 UV1 = TexelSize * ( IN.DispatchThreadID.xy + float2( 0.25, 0.25 ) );\n"
+"            float2 Off = TexelSize * 0.5;\n"
+"\n"
+"            Src1 = SrcMip.SampleLevel( LinearClampSampler, UV1, SrcMipLevel ).xyz;\n"
+"            Src1 += SrcMip.SampleLevel( LinearClampSampler, UV1 + float2( Off.x, 0.0 ), SrcMipLevel ).xyz;\n"
+"            Src1 += SrcMip.SampleLevel( LinearClampSampler, UV1 + float2( 0.0, Off.y ), SrcMipLevel ).xyz;\n"
+"            Src1 += SrcMip.SampleLevel( LinearClampSampler, UV1 + float2( Off.x, Off.y ), SrcMipLevel ).xyz;\n"
+"            Src1 *= 0.25;\n"
+"        }\n"
+"        break;\n"
+"    }\n"
+"\n"
+"    OutMip1[IN.DispatchThreadID.xy] = float4(Src1, 1.0f);\n"
+"\n"
+"    // A scalar (constant) branch can exit all threads coherently.\n"
+"    if ( NumMipLevels == 1 )\n"
+"        return;\n"
+"\n"
+"    // Without lane swizzle operations, the only way to share data with other\n"
+"    // threads is through LDS.\n"
+"    StoreColor( IN.GroupIndex, Src1 );\n"
+"\n"
+"    // This guarantees all LDS writes are complete and that all threads have\n"
+"    // executed all instructions so far (and therefore have issued their LDS\n"
+"    // write instructions.)\n"
+"    GroupMemoryBarrierWithGroupSync();\n"
+"\n"
+"    // With low three bits for X and high three bits for Y, this bit mask\n"
+"    // (binary: 001001) checks that X and Y are even.\n"
+"    if ( ( IN.GroupIndex & 0x9 ) == 0 )\n"
+"    {\n"
+"        float3 Src2 = LoadColor( IN.GroupIndex + 0x01 );\n"
+"        float3 Src3 = LoadColor( IN.GroupIndex + 0x08 );\n"
+"        float3 Src4 = LoadColor( IN.GroupIndex + 0x09 );\n"
+"        Src1 = 0.25 * ( Src1 + Src2 + Src3 + Src4 );\n"
+"\n"
+"        OutMip2[IN.DispatchThreadID.xy / 2] = float4(Src1, 1.0f);\n"
+"        StoreColor( IN.GroupIndex, Src1 );\n"
+"    }\n"
+"\n"
+"    if ( NumMipLevels == 2 )\n"
+"        return;\n"
+"\n"
+"    GroupMemoryBarrierWithGroupSync();\n"
+"\n"
+"    // This bit mask (binary: 011011) checks that X and Y are multiples of four.\n"
+"    if ( ( IN.GroupIndex & 0x1B ) == 0 )\n"
+"    {\n"
+"        float3 Src2 = LoadColor( IN.GroupIndex + 0x02 );\n"
+"        float3 Src3 = LoadColor( IN.GroupIndex + 0x10 );\n"
+"        float3 Src4 = LoadColor( IN.GroupIndex + 0x12 );\n"
+"        Src1 = 0.25 * ( Src1 + Src2 + Src3 + Src4 );\n"
+"\n"
+"        OutMip3[IN.DispatchThreadID.xy / 4] = float4(Src1, 1.0f);\n"
+"        StoreColor( IN.GroupIndex, Src1 );\n"
+"    }\n"
+"\n"
+"    if ( NumMipLevels == 3 )\n"
+"        return;\n"
+"\n"
+"    GroupMemoryBarrierWithGroupSync();\n"
+"\n"
+"    // This bit mask would be 111111 (X & Y multiples of 8), but only one\n"
+"    // thread fits that criterion.\n"
+"    if ( IN.GroupIndex == 0 )\n"
+"    {\n"
+"        float3 Src2 = LoadColor( IN.GroupIndex + 0x04 );\n"
+"        float3 Src3 = LoadColor( IN.GroupIndex + 0x20 );\n"
+"        float3 Src4 = LoadColor( IN.GroupIndex + 0x24 );\n"
+"        Src1 = 0.25 * ( Src1 + Src2 + Src3 + Src4 );\n"
+"\n"
+"        OutMip4[IN.DispatchThreadID.xy / 8] = float4(Src1, 1.0f);\n"
+"    }\n"
+"}\n";
+#endif
diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/plugin-hlsl/hlsl.h b/subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/plugin-hlsl/hlsl.h
index acb9c7fce0..59aed914d2 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/plugin-hlsl/hlsl.h
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/plugin-hlsl/hlsl.h
@@ -33,6 +33,8 @@
 #include "VSMain_coord.hlsl"
 #include "VSMain_pos.hlsl"
 #include "CSMain_mipgen.hlsl"
+#include "CSMain_mipgen_vuya.hlsl"
+#include "CSMain_mipgen_ayuv.hlsl"
 #include "CSMain_yadif_1.hlsl"
 #include "CSMain_yadif_1_10.hlsl"
 #include "CSMain_yadif_1_12.hlsl"
diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/plugin-hlsl/meson.build b/subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/plugin-hlsl/meson.build
index 33f69b3015..38550d370f 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/plugin-hlsl/meson.build
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/d3dshader/plugin-hlsl/meson.build
@@ -13,6 +13,8 @@ hlsl_sources = [
   ['VSMain_coord', 'vs'],
   ['VSMain_pos', 'vs'],
   ['CSMain_mipgen', 'cs'],
+  ['CSMain_mipgen_vuya', 'cs'],
+  ['CSMain_mipgen_ayuv', 'cs'],
   ['CSMain_yadif_1_10', 'cs'],
   ['CSMain_yadif_1_12', 'cs'],
   ['CSMain_yadif_1', 'cs'],
diff --git a/subprojects/gst-plugins-bad/sys/d3d12/gstd3d12mipmapping.cpp b/subprojects/gst-plugins-bad/sys/d3d12/gstd3d12mipmapping.cpp
index 2bcfce50d7..a79aaf6a05 100644
--- a/subprojects/gst-plugins-bad/sys/d3d12/gstd3d12mipmapping.cpp
+++ b/subprojects/gst-plugins-bad/sys/d3d12/gstd3d12mipmapping.cpp
@@ -901,7 +901,16 @@ gst_d3d12_mip_mapping_set_info (GstD3D12BaseFilter * filter,
     return FALSE;
   }
 
-  ctx->gen = gst_d3d12_mip_gen_new (filter->device);
+  GstD3DPluginCS cs_type = GST_D3D_PLUGIN_CS_MIP_GEN;
+  if (!GST_VIDEO_INFO_HAS_ALPHA (in_info)) {
+    GST_DEBUG_OBJECT (self, "Input has no alpha, use alpha-skipping shader");
+    if (GST_VIDEO_INFO_FORMAT (out_info) == GST_VIDEO_FORMAT_AYUV64)
+      cs_type = GST_D3D_PLUGIN_CS_MIP_GEN_AYUV;
+    else
+      cs_type = GST_D3D_PLUGIN_CS_MIP_GEN_VUYA;
+  }
+
+  ctx->gen = gst_d3d12_mip_gen_new (filter->device, cs_type);
   if (!ctx->gen) {
     GST_ERROR_OBJECT (self, "Couldn't create mip generator");
     return FALSE;
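
A short standalone sketch of the flat-index arithmetic the shaders above rely on (an illustration only, not part of the patch): with BLOCK_SIZE == 8, SV_GroupIndex packs the local X coordinate into its low three bits and the local Y coordinate into the next three bits, so masking with 0x9 (binary 001001) keeps exactly the threads whose X and Y are both even.

  /* Illustration only; mirrors the SV_GroupIndex bit layout used by the
   * mipgen shaders (8x8 thread group, index == y * 8 + x). */
  #include <stdio.h>

  int
  main (void)
  {
    for (unsigned int index = 0; index < 64; index++) {
      unsigned int x = index & 0x7;        /* low three bits  */
      unsigned int y = (index >> 3) & 0x7; /* next three bits */

      /* 0x9 == 0b001001: bit 0 of X and bit 0 of Y must both be clear,
       * i.e. both coordinates are even; these are the 16 threads that
       * get to write the second mip level */
      if ((index & 0x9) == 0)
        printf ("index %2u -> (%u, %u)\n", index, x, y);
    }

    return 0;
  }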