TODO: fix the compilation bug in the gemm_bilinear_wmma instances

18d5297b · aska-0096 · 4fe49693 · 18d5297b · 18d5297b · 18d5297b
Commit 18d5297b authored Feb 26, 2024 by aska-0096
20 changed files
--- a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt
+++ b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt
-add_example_executable(example_batched_gemm_scale_softmax_gemm_xdl_fp16 batched_gemm_scale_softmax_gemm_xdl_fp16.cpp)
-add_example_executable(example_batched_gemm_scale_softmax_gemm_xdl_bf16 batched_gemm_scale_softmax_gemm_xdl_bf16.cpp)
-add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp)
-add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_bf16 batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp)
-add_example_executable(example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp)
-add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp)
-add_example_executable(example_grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp)
 if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102")
    add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp)
    add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp)

--- a/example/49_fpAintB_gemm/CMakeLists.txt
+++ b/example/49_fpAintB_gemm/CMakeLists.txt
--- a/example/49_fpAintB_gemm/common.hpp
+++ b/example/49_fpAintB_gemm/common.hpp
--- a/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp
+++ b/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp
--- a/example/49_fpAintB_gemm/run_gemm_example.inc
+++ b/example/49_fpAintB_gemm/run_gemm_example.inc
@@ -34,30 +34,15 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
    {
    case 0: break;
    case 1:
-        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
+        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-1.f, 1.f}(a_m_k);
-        ck::utils::FillUniformDistributionIntegerValue<QuantDataType>{-5.f, 5.f}(quant_b_k_n);
+        ck::utils::FillUniformDistributionIntegerValue<QuantDataType>{-1.f, 1.f}(quant_b_k_n);
-        ck::utils::FillUniformDistributionIntegerValue<ScaleDataType>{-5.f, 5.f}(scale_k_n);
+        ck::utils::FillUniformDistributionIntegerValue<ScaleDataType>{-1.f, 1.f}(scale_k_n);
        break;
    case 2:
        ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
        ck::utils::FillUniformDistribution<QuantDataType>{-1.f, 1.f}(quant_b_k_n);
        ck::utils::FillUniformDistribution<ScaleDataType>{-1.f, 1.f}(scale_k_n);
        break;
-    case 3:
-        ck::utils::FillUniformDistributionIntegerValue<ADataType>{1.f, 1.f}(a_m_k);
-        ck::utils::FillUniformDistributionIntegerValue<QuantDataType>{-5.f, 5.f}(quant_b_k_n);
-        ck::utils::FillUniformDistributionIntegerValue<ScaleDataType>{-5.f, 5.f}(scale_k_n);
-        break;
-    case 4:
-        ck::utils::FillUniformDistributionIntegerValue<ADataType>{1.f, 1.f}(a_m_k);
-        ck::utils::FillUniformDistributionIntegerValue<QuantDataType>{1.f, 1.f}(quant_b_k_n);
-        ck::utils::FillUniformDistributionIntegerValue<ScaleDataType>{2.f, 2.f}(scale_k_n);
-        break;
-    case 5:
-        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-2.f, 2.f}(a_m_k);
-        ck::utils::FillUniformDistributionIntegerValue<QuantDataType>{-2.f, 2.f}(quant_b_k_n);
-        ck::utils::FillUniformDistributionIntegerValue<ScaleDataType>{-2.f, 2.f}(scale_k_n);
-        break;
    default:
        ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
        ck::utils::FillUniformDistribution<QuantDataType>{-1.f, 1.f}(quant_b_k_n);

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
@@ -217,7 +217,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
        // Tiling Family
        MPerBlock,
        NPerBlock,
-        K0PerBlock,
+        KPerBlock,
        MPerWMMA,
        NPerWMMA,
        K1,
@@ -231,8 +231,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
        ABlockTransferSrcVectorDim,
        ABlockTransferSrcScalarPerVector,
        ABlockTransferDstScalarPerVector_AK1,
-        true,
        false,
+        true,
        ABlockLdsExtraM,
        BBlockTransferThreadClusterLengths_BK0_N_BK1,
        BBlockTransferThreadClusterArrangeOrder,
@@ -240,8 +240,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
        BBlockTransferSrcVectorDim,
        BBlockTransferSrcScalarPerVector,
        BBlockTransferDstScalarPerVector_BK1,
-        true,
        false,
+        true,
        BBlockLdsExtraN,
        CShuffleMRepeatPerShuffle,
        CShuffleNRepeatPerShuffle,

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp
@@ -416,7 +416,7 @@ struct DeviceGroupedConvBwdWeight_Wmma_CShuffle
        // Tiling Family
        MPerBlock,
        NPerBlock,
-        K0PerBlock,
+        KPerBlock,
        MPerWMMA,
        NPerWMMA,
        K1,
@@ -430,8 +430,8 @@ struct DeviceGroupedConvBwdWeight_Wmma_CShuffle
        ABlockTransferSrcVectorDim,
        ABlockTransferSrcScalarPerVector,
        ABlockTransferDstScalarPerVector_K1,
-        true,
        false,
+        true,
        ABlockLdsAddExtraM,
        BBlockTransferThreadClusterLengths_K0_N_K1,
        BBlockTransferThreadClusterArrangeOrder,
@@ -439,8 +439,8 @@ struct DeviceGroupedConvBwdWeight_Wmma_CShuffle
        BBlockTransferSrcVectorDim,
        BBlockTransferSrcScalarPerVector,
        BBlockTransferDstScalarPerVector_K1,
-        true,
        false,
+        true,
        BBlockLdsAddExtraN,
        CShuffleMRepeatPerShuffle,
        CShuffleNRepeatPerShuffle,

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
@@ -279,15 +279,11 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
    }
    // desc for problem definition
-    using AGridDesc_M_K  = remove_cvref_t<decltype(MakeAGridDescriptor_M_K<ALayout>(
-        {}, {}, {}, {}, {}, {}, {}, {}, {}, {}))>;
-    using BGridDesc_N_K  = remove_cvref_t<decltype(MakeBGridDescriptor_N_K<BLayout>({}, {}))>;
-    using DsGridDesc_M_N = remove_cvref_t<decltype(MakeDsGridDescriptor_M_N({}, {}))>;
-    using EGridDesc_M_N  = remove_cvref_t<decltype(MakeEGridDescriptor_M_N<ELayout>({}, {}))>;
    using AGridDesc =
        decltype(DeviceOp::MakeAGridDescriptor<ALayout>({}, {}, {}, {}, {}, {}, {}, {}, {}, {}));
-    using BGridDesc = decltype(DeviceOp::MakeBGridDescriptor<BLayout>({}, {}));
+    using BGridDesc      = decltype(DeviceOp::MakeBGridDescriptor<BLayout>({}, {}));
+    using DsGridDesc_M_N = remove_cvref_t<decltype(MakeDsGridDescriptor_M_N({}, {}))>;
+    using EGridDesc_M_N  = remove_cvref_t<decltype(MakeEGridDescriptor_M_N<ELayout>({}, {}))>;
    // GridwiseOp
    using GridwiseOp = GridwiseGemmMultipleD_Wmma<

--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -121,6 +121,9 @@ struct PassThrough
    __host__ __device__ void operator()<bhalf_t, int8_t>(bhalf_t& y, const int8_t& x) const
    {
        y = type_convert<bhalf_t>(x);
+    }
+    template <>
    __host__ __device__ void operator()<uint8_t, uint8_t>(uint8_t& y, const uint8_t& x) const
    {
        y = x;
@@ -738,5 +741,4 @@ struct FastNumericArrayConverter<uint8_t, ck::half_t, N>
 } // namespace element_wise
 } // namespace tensor_operation
-} 
+} // namespace ck
-}// namespace ck
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp
@@ -651,8 +651,9 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma
            c_grid_desc_m_n);
    }
-    using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
+    using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock =
-        MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}))>;
+        remove_cvref_t<decltype(MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+            CGridDesc_M_N{}))>;
    using DefaultBlock2CTileMap =
        remove_cvref_t<decltype(MakeDefaultBlock2CTileMap(CGridDesc_M_N{}, 1, 1))>;

--- a/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp
@@ -535,8 +535,9 @@ struct GridwiseFpAintBGemm_Wmma
            c_grid_desc_m_n);
    }
-    using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
+    using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock =
-        MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}))>;
+        remove_cvref_t<decltype(MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+            CGridDesc_M_N{}))>;
    using DefaultBlock2CTileMap =
        remove_cvref_t<decltype(MakeDefaultBlock2CTileMap(CGridDesc_M_N{}, 1, 1))>;

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
@@ -596,6 +596,93 @@ struct GridwiseGemmMultipleD_Wmma
            Number<NumDTensor>{});
    }
+    // CheckValidity for kernels without multi D
+    template <typename Block2CTileMap>
+    __host__ __device__ static constexpr bool CheckValidity(const AGridDesc& a_grid_desc,
+                                                            const BGridDesc& b_grid_desc,
+                                                            const EGridDesc_M_N& e_grid_desc_m_n,
+                                                            const Block2CTileMap& block_2_ctile_map)
+    {
+        static_assert(is_known_at_compile_time<remove_cv_t<decltype(K1)>>::value,
+                      "wrong! K1 need to be known at compile-time");
+        static_assert((MPerBlock % (MPerWmma * MRepeat) == 0) &&
+                          (NPerBlock % (NRepeat * NPerWmma)) == 0,
+                      "Invalid tuning param!");
+        const auto GetAProblemsizeMK = [&]() {
+            if constexpr(AEnableLds)
+            {
+                return make_tuple(a_grid_desc.GetLength(I1),
+                                  a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I2));
+            }
+            else
+            {
+                return make_tuple(a_grid_desc.GetLength(I1) * a_grid_desc.GetLength(I2) *
+                                      a_grid_desc.GetLength(I5),
+                                  a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I3) *
+                                      a_grid_desc.GetLength(I4) * a_grid_desc.GetLength(I6));
+            }
+        };
+        const auto GetBProblemsizeNK = [&]() {
+            if constexpr(BEnableLds)
+            {
+                return make_tuple(b_grid_desc.GetLength(I1),
+                                  b_grid_desc.GetLength(I0) * b_grid_desc.GetLength(I2));
+            }
+            else
+            {
+                return make_tuple(b_grid_desc.GetLength(I1) * b_grid_desc.GetLength(I2) *
+                                      b_grid_desc.GetLength(I5),
+                                  b_grid_desc.GetLength(I0) * b_grid_desc.GetLength(I3) *
+                                      b_grid_desc.GetLength(I4) * b_grid_desc.GetLength(I6));
+            }
+        };
+        const auto M = GetAProblemsizeMK()[I0];
+        const auto N = GetBProblemsizeNK()[I0];
+        const auto K = GetAProblemsizeMK()[I1];
+        if(!(M == e_grid_desc_m_n.GetLength(I0) && N == e_grid_desc_m_n.GetLength(I1) &&
+             K == GetBProblemsizeNK()[I1]))
+        {
+            printf("GridwiseOp: ABE descriptor dimension cross check failure\n");
+            return false;
+        }
+        if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0))
+        {
+            printf("GridwiseOp: Problemsize descriptor dimension check failure\n");
+            return false;
+        }
+        // check gridwise gemm pipeline
+        const auto num_k_loop = K / KPerBlock;
+        if(!GridwiseGemmPipe::IsSupported(num_k_loop))
+        {
+            return false;
+        }
+        if(!block_2_ctile_map.CheckValidity(e_grid_desc_m_n))
+        {
+            return false;
+        }
+        // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
+        constexpr long_index_t TwoGB = (long_index_t{1} << 31);
+        if(!(a_grid_desc.GetElementSpaceSize() * sizeof(ADataType) <= TwoGB &&
+             b_grid_desc.GetElementSpaceSize() * sizeof(BDataType) <= TwoGB &&
+             e_grid_desc_m_n.GetElementSpaceSize() * sizeof(EDataType) <= TwoGB))
+        {
+            return false;
+        }
+        return true;
+    }
    // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
    template <typename Block2CTileMap>
    __host__ __device__ static constexpr bool CheckValidity(const AGridDesc& a_grid_desc,

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
@@ -45,6 +45,7 @@ constexpr auto GridwiseGemmPipeline_Selector()
    else if constexpr(PipelineVer == PipelineVersion::v4)
    {
        return GridwiseGemmPipeline_v4<NumPrefetch>{};
+    }
    else if constexpr(PipelineVer == PipelineVersion::weight_only)
    {
        return GridwiseGemmPipeline_v1_WeightOnly<NumPrefetch, AEnableLds, BEnableLds>{};

--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
@@ -417,7 +417,8 @@ __device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_w
            (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, f8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
+            (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (is_same<T, uint8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
        "wrong! not implemented");
    using r_t     = typename vector_type<T, N>::type;
@@ -521,114 +522,6 @@ amd_buffer_store_impl_raw(const typename vector_type<int8_t, N>::type src_thread
                                           dst_wave_addr_offset + sizeof(int32_t) * 12,
                                           static_cast<index_t>(coherence));
    }
-    else if constexpr(is_same<T, uint8_t>::value)
-    {
-        if constexpr(N == 1)
-        {
-            return llvm_amdgcn_raw_buffer_load_i8(src_wave_buffer_resource,
-                                                  src_thread_addr_offset,
-                                                  src_wave_addr_offset,
-                                                  static_cast<index_t>(coherence));
-        }
-        else if constexpr(N == 2)
-        {
-#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
-            return llvm_amdgcn_raw_buffer_load_i8x2(src_wave_buffer_resource,
-                                                    src_thread_addr_offset,
-                                                    src_wave_addr_offset,
-                                                    static_cast<index_t>(coherence));
-#else
-            int16_t tmp = llvm_amdgcn_raw_buffer_load_i16(src_wave_buffer_resource,
-                                                          src_thread_addr_offset,
-                                                          src_wave_addr_offset,
-                                                          static_cast<index_t>(coherence));
-            return bit_cast<uint8x2_t>(tmp);
-#endif
-        }
-        else if constexpr(N == 4)
-        {
-#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
-            return llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
-                                                    src_thread_addr_offset,
-                                                    src_wave_addr_offset,
-                                                    static_cast<index_t>(coherence));
-#else
-            int32_t tmp = llvm_amdgcn_raw_buffer_load_i32(src_wave_buffer_resource,
-                                                          src_thread_addr_offset,
-                                                          src_wave_addr_offset,
-                                                          static_cast<index_t>(coherence));
-            return bit_cast<uint8x4_t>(tmp);
-#endif
-        }
-        else if constexpr(N == 8)
-        {
-#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
-            vector_type<uint8_t, 8> tmp;
-            tmp.AsType<uint8x4_t>()(Number<0>{}) =
-                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
-                                                 src_thread_addr_offset,
-                                                 src_wave_addr_offset,
-                                                 static_cast<index_t>(coherence));
-            tmp.AsType<uint8x4_t>()(Number<1>{}) =
-                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
-                                                 src_thread_addr_offset,
-                                                 src_wave_addr_offset + 4 * sizeof(int8_t),
-                                                 static_cast<index_t>(coherence));
-            return tmp.AsType<uint8x8_t>()(Number<0>{});
-#else
-            int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2(src_wave_buffer_resource,
-                                                              src_thread_addr_offset,
-                                                              src_wave_addr_offset,
-                                                              static_cast<index_t>(coherence));
-            return bit_cast<uint8x8_t>(tmp);
-#endif
-        }
-        else if constexpr(N == 16)
-        {
-#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
-            vector_type<uint8_t, 16> tmp;
-            tmp.AsType<uint8x4_t>()(Number<0>{}) =
-                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
-                                                 src_thread_addr_offset,
-                                                 src_wave_addr_offset,
-                                                 static_cast<index_t>(coherence));
-            tmp.AsType<uint8x4_t>()(Number<1>{}) =
-                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
-                                                 src_thread_addr_offset,
-                                                 src_wave_addr_offset + 4 * sizeof(int8_t),
-                                                 static_cast<index_t>(coherence));
-            tmp.AsType<uint8x4_t>()(Number<2>{}) =
-                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
-                                                 src_thread_addr_offset,
-                                                 src_wave_addr_offset + 8 * sizeof(int8_t),
-                                                 static_cast<index_t>(coherence));
-            tmp.AsType<uint8x4_t>()(Number<3>{}) =
-                llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource,
-                                                 src_thread_addr_offset,
-                                                 src_wave_addr_offset + 12 * sizeof(int8_t),
-                                                 static_cast<index_t>(coherence));
-            return tmp.AsType<uint8x16_t>()(Number<0>{});
-#else
-            int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
-                                                              src_thread_addr_offset,
-                                                              src_wave_addr_offset,
-                                                              static_cast<index_t>(coherence));
-            return bit_cast<uint8x16_t>(tmp);
-#endif
-        }
-    }
 }
 template <typename T,

--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
@@ -1053,219 +1053,6 @@ using uint8x16_t = typename vector_type<uint8_t, 16>::type;
 using uint8x32_t = typename vector_type<uint8_t, 32>::type;
 using uint8x64_t = typename vector_type<uint8_t, 64>::type;
-// Convert X to Y
-template <typename Y, typename X>
-__host__ __device__ constexpr Y type_convert(X x)
-{
-    static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
-    return static_cast<Y>(x);
-}
-// convert bfp16 to fp32
-template <>
-inline __host__ __device__ constexpr float type_convert<float, bhalf_t>(bhalf_t x)
-{
-    union
-    {
-        uint32_t int32;
-        float fp32;
-    } u = {uint32_t(x) << 16};
-    return u.fp32;
-}
-// Convert X to Y
-template <typename Y, typename X>
-__host__ __device__ constexpr Y type_convert_sp(X x)
-{
-    static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
-    return static_cast<Y>(x);
-}
-template <>
-inline __host__ __device__ constexpr int type_convert_sp<int, float>(float x)
-{
-    union
-    {
-        float fp32;
-        int int32;
-    } u = {x};
-    return u.int32;
-}
-template <>
-inline __host__ __device__ constexpr float type_convert_sp<float, int>(int x)
-{
-    union
-    {
-        int int32;
-        float fp32;
-    } u = {x};
-    return u.fp32;
-}
-template <>
-inline __host__ __device__ constexpr int type_convert_sp<int, half_t>(half_t x)
-{
-    union
-    {
-        half_t fp16;
-        int int32;
-    } u = {x};
-    return u.int32;
-}
-template <>
-inline __host__ __device__ constexpr half_t type_convert_sp<half_t, int>(int x)
-{
-    union
-    {
-        int int32;
-        half_t fp16;
-    } u = {x};
-    return u.fp16;
-}
-// convert fp32 to bfp16
-template <>
-inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, float>(float x)
-{
-    union
-    {
-        float fp32;
-        uint32_t int32;
-    } u = {x};
-    return uint16_t(u.int32 >> 16);
-}
-// convert bfp16 to fp16 via fp32
-template <>
-inline __host__ __device__ constexpr half_t type_convert<half_t, bhalf_t>(bhalf_t x)
-{
-    float x_fp32 = type_convert<float>(x);
-    return static_cast<half_t>(x_fp32);
-}
-// convert fp16 to bfp16 via fp32
-template <>
-inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, half_t>(half_t x)
-{
-    float x_fp32 = static_cast<float>(x);
-    return type_convert<bhalf_t>(x_fp32);
-}
-// convert bfp16 to int32 via fp32
-template <>
-inline __host__ __device__ constexpr int32_t type_convert<int32_t, bhalf_t>(bhalf_t x)
-{
-    float x_fp32 = type_convert<float>(x);
-    return static_cast<int32_t>(x_fp32);
-}
-// convert int32 to bfp16 via fp32
-template <>
-inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, int32_t>(int32_t x)
-{
-    float x_fp32 = static_cast<float>(x);
-    return type_convert<bhalf_t>(x_fp32);
-}
-// convert bfp16 to int8 via fp32
-template <>
-inline __host__ __device__ constexpr int8_t type_convert<int8_t, bhalf_t>(bhalf_t x)
-{
-    float x_fp32 = type_convert<float>(x);
-    return static_cast<int8_t>(x_fp32);
-}
-// convert int8 to bfp16 via fp32
-template <>
-inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, int8_t>(int8_t x)
-{
-    float x_fp32 = static_cast<float>(x);
-    return type_convert<bhalf_t>(x_fp32);
-}
-// convert int8 to fp16 via fp32
-template <>
-inline __host__ __device__ constexpr half_t type_convert<half_t, int8_t>(int8_t x)
-{
-    // TODO: replace it with fast_converter
-    float x_fp32 = static_cast<float>(x);
-    return type_convert<half_t>(x_fp32);
-}
-// Declare a template function for bf16 conversion using RTN
-template <typename Y, typename X>
-__host__ __device__ constexpr Y bf16_convert_rtn(X x);
-// Convert fp32 to bf16 with RTN if higher precision is needed
-template <>
-inline __host__ __device__ constexpr bhalf_t bf16_convert_rtn<bhalf_t, float>(float x)
-{
-    union
-    {
-        float fp32;
-        uint32_t int32;
-    } u = {x};
-    // When the exponent bits are not all 1s, then the value is zero, normal,
-    // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
-    // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
-    // This causes the bfloat16's mantissa to be incremented by 1 if the 16
-    // least significant bits of the float mantissa are greater than 0x8000,
-    // or if they are equal to 0x8000 and the least significant bit of the
-    // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
-    // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
-    // has the value 0x7f, then incrementing it causes it to become 0x00 and
-    // the exponent is incremented by one, which is the next higher FP value
-    // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
-    // with an exponent of 0x00 and a mantissa of 0x7f, it may be rounded up
-    // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
-    // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
-    // incrementing it causes it to become an exponent of 0xFF and a mantissa
-    // of 0x00, which is Inf, the next higher value to the unrounded value.
-    bool flag0 = ~u.int32 & 0x7f800000;
-    // When all of the exponent bits are 1, the value is Inf or NaN.
-    // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
-    // mantissa bit. Quiet NaN is indicated by the most significant mantissa
-    // bit being 1. Signaling NaN is indicated by the most significant
-    // mantissa bit being 0 but some other bit(s) being 1. If any of the
-    // lower 16 bits of the mantissa are 1, we set the least significant bit
-    // of the bfloat16 mantissa, in order to preserve signaling NaN in case
-    // the bfloat16's mantissa bits are all 0.
-    bool flag1 = !flag0 && (u.int32 & 0xffff);
-    u.int32 += flag0 ? 0x7fff + ((u.int32 >> 16) & 1) : 0; // Round to nearest, round to even
-    u.int32 |= flag1 ? 0x10000 : 0x0;                      // Preserve signaling NaN
-    return uint16_t(u.int32 >> 16);
-}
-// convert fp16 to bfp16 via fp32 with RTN if higher precision is needed
-template <>
-inline __host__ __device__ constexpr bhalf_t bf16_convert_rtn<bhalf_t, half_t>(half_t x)
-{
-    float x_fp32 = static_cast<float>(x);
-    return bf16_convert_rtn<bhalf_t>(x_fp32);
-}
 template <typename T>
 struct NumericLimits
 {

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp
@@ -54,36 +54,36 @@ template <index_t NDSpatial,
          ConvolutionForwardSpecialization ConvSpec>
 using device_grouped_conv_fwd_wmma_f16_instances = std::tuple<
    // clang-format off
-        //########################################|    NumDim|       A|       B|       Ds|       E| AData| BData|         Ds|  EData| AccData| CShuffle|            A|           B|          CDE|    ConvForward|           GEMM| Block|  MPer|  NPer|  KPer| K1|  MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    NumDim|       A|       B|       Ds|       E| AData| BData| AccData| CShuffle|         Ds|  EData|            A|           B|          CDE|    ConvForward|           GEMM| Prefetch| Block|  MPer|  NPer|  KPer| K1|  MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################|   Spatial|  Layout|  Layout|   Layout|  Layout|  Type|  Type|   DataType|   Type|    Type| DataType|  Elementwise| Elementwise|  Elementwise| Specialization| Specialization|  Size| Block| Block| Block|   |  WMMA| WMMA|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|   Spatial|  Layout|  Layout|   Layout|  Layout|  Type|  Type|    Type| DataType|   DataType|   Type|  Elementwise| Elementwise|  Elementwise| Specialization| Specialization|    Stage|  Size| Block| Block| Block|   |  WMMA| WMMA|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|          |        |        |         |        |      |      |           |       |        |         |    Operation|   Operation|    Operation|               |               |      |      |      |      |   |      |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|          |        |        |         |        |      |      |        |         |           |       |    Operation|   Operation|    Operation|               |               |         |      |      |      |      |   |      |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|          |        |        |         |        |      |      |           |       |        |         |             |            |             |               |               |      |      |      |      |   |      |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        //########################################|          |        |        |         |        |      |      |        |         |           |       |             |            |             |               |               |         |      |      |      |      |   |      |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
        // generic instance
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16, DsDatatype,    F16,     F32,      F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   128,    64,    64,     4,  8,    16,   16,       2,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,               1>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16,     F32,      F16, DsDatatype,    F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,         1,    128,    64,    64,    32,  8,    16,   16,       2,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 32, 1, 4>,               1>,
        // blocksize=256
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16, DsDatatype,    F16,     F32,      F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   256,   128,   128,     4,  8,    16,   16,       4,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16,     F32,      F16, DsDatatype,    F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,         1,    256,   128,   128,    32,  8,    16,   16,       4,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16, DsDatatype,    F16,     F32,      F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   256,    64,   256,     4,  8,    16,   16,       2,       4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16,     F32,      F16, DsDatatype,    F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,         1,    256,    64,   256,    32,  8,    16,   16,       2,       4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16, DsDatatype,    F16,     F32,      F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   256,   256,    64,     4,  8,    16,   16,       8,       1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16,     F32,      F16, DsDatatype,    F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,         1,    256,   256,    64,    32,  8,    16,   16,       8,       1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16, DsDatatype,    F16,     F32,      F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   256,   128,   128,     8,  8,    16,   16,       4,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16,     F32,      F16, DsDatatype,    F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,         1,    256,   128,   128,    64,  8,    16,   16,       4,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
        // blocksize=128
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16, DsDatatype,    F16,     F32,      F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   128,    64,    64,     4,  8,    16,   16,       2,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16,     F32,      F16, DsDatatype,    F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,         1,    128,    64,    64,    32,  8,    16,   16,       2,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16, DsDatatype,    F16,     F32,      F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   128,    64,    64,     8,  8,    16,   16,       2,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16,     F32,      F16, DsDatatype,    F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,         1,    128,    64,    64,    64,  8,    16,   16,       2,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16, DsDatatype,    F16,     F32,      F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   128,    64,   128,     4,  8,    16,   16,       2,       4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16,     F32,      F16, DsDatatype,    F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,         1,    128,    64,   128,    32,  8,    16,   16,       2,       4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16, DsDatatype,    F16,     F32,      F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   128,    64,   128,     8,  8,    16,   16,       2,       4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16,     F32,      F16, DsDatatype,    F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,         1,    128,    64,   128,    64,  8,    16,   16,       2,       4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16, DsDatatype,    F16,     F32,      F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   128,   128,    64,     4,  8,    16,   16,       4,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16,     F32,      F16, DsDatatype,    F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,         1,    128,   128,    64,    32,  8,    16,   16,       4,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16, DsDatatype,    F16,     F32,      F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   128,   128,    64,     8,  8,    16,   16,       4,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16,     F32,      F16, DsDatatype,    F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,         1,    128,   128,    64,    64,  8,    16,   16,       4,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16, DsDatatype,    F16,     F32,      F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   128,    32,   256,     4,  8,    16,   16,       1,       8,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16,     F32,      F16, DsDatatype,    F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,         1,    128,    32,   256,    32,  8,    16,   16,       1,       8,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16, DsDatatype,    F16,     F32,      F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   128,   256,    32,     4,  8,    16,   16,       8,       1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,      
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16,     F32,      F16, DsDatatype,    F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,         1,    128,   256,    32,    32,  8,    16,   16,       8,       1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 4>,               8>,      
        // blocksize=64
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16, DsDatatype,    F16,     F32,      F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,    64,    32,    64,     4,  8,    16,   16,       1,       4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 2>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16,     F32,      F16, DsDatatype,    F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,         1,     64,    32,    64,    32,  8,    16,   16,       1,       4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 2>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16, DsDatatype,    F16,     F32,      F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,    64,    64,    32,     4,  8,    16,   16,       2,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 2>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16,     F32,      F16, DsDatatype,    F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,         1,     64,    64,    32,    32,  8,    16,   16,       2,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 2>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16, DsDatatype,    F16,     F32,      F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,    64,    32,    32,     8,  8,    16,   16,       1,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 2>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16,     F32,      F16, DsDatatype,    F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,         1,     64,    32,    32,    64,  8,    16,   16,       1,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 2>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16, DsDatatype,    F16,     F32,      F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,    64,    32,   128,     4,  8,    16,   16,       1,       8,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 2>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16,     F32,      F16, DsDatatype,    F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,         1,     64,    32,   128,    32,  8,    16,   16,       1,       8,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 2>,               8>,
        // blocksize=32
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16, DsDatatype,    F16,     F32,      F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,    32,    16,    64,     4,  8,    16,   16,       1,       4,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 2>,               8>,  
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16,     F32,      F16, DsDatatype,    F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,         1,     32,    16,    64,    32,  8,    16,   16,       1,       4,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 2>,               8>,  
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16, DsDatatype,    F16,     F32,      F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,    32,    64,    16,     4,  8,    16,   16,       4,       1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 2>,               8>,  
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16,     F32,      F16, DsDatatype,    F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,         1,     32,    64,    16,    32,  8,    16,   16,       4,       1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 2>,               8>,  
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16, DsDatatype,    F16,     F32,      F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,    32,    32,    32,     4,  8,    16,   16,       2,       2,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 2>,               8>,  
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16,     F32,      F16, DsDatatype,    F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,         1,     32,    32,    32,    32,  8,    16,   16,       2,       2,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 2>,               8>,  
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16, DsDatatype,    F16,     F32,      F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,    32,    16,    16,     4,  8,    16,   16,       1,       1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 2>,               8>
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  F16,   F16,     F32,      F16, DsDatatype,    F16,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,         1,     32,    16,    16,    32,  8,    16,   16,       1,       1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 2>,               8>
    // clang-format on
    >;
@@ -97,36 +97,36 @@ template <index_t NDSpatial,
          ConvolutionForwardSpecialization ConvSpec>
 using device_grouped_conv_fwd_wmma_i8_instances = std::tuple<
    // clang-format off
-        //########################################|    NumDim|       A|       B|       Ds|       E| AData| BData|         Ds|  EData| AccData| CShuffle|            A|           B|          CDE|    ConvForward|           GEMM| Block|  MPer|  NPer|  KPer| K1|  MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    NumDim|       A|       B|       Ds|       E| AData| BData| AccData| CShuffle|         Ds|  EData|            A|           B|          CDE|    ConvForward|           GEMM| Prefetch| Block|  MPer|  NPer|  KPer| K1|  MPer| NPer| MRepeat| NRepeat|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer|  ABlockTransfer|  ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer|  BBlockTransfer|  BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //########################################|   Spatial|  Layout|  Layout|   Layout|  Layout|  Type|  Type|   DataType|   Type|    Type| DataType|  Elementwise| Elementwise|  Elementwise| Specialization| Specialization|  Size| Block| Block| Block|   |  WMMA| WMMA|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|   Spatial|  Layout|  Layout|   Layout|  Layout|  Type|  Type|    Type| DataType|   DataType|   Type|  Elementwise| Elementwise|  Elementwise| Specialization| Specialization|    Stage|  Size| Block| Block| Block|   |  WMMA| WMMA|        |        |    ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|       SrcScalar|       DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|       SrcScalar|       DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //########################################|          |        |        |         |        |      |      |           |       |        |         |    Operation|   Operation|    Operation|               |               |      |      |      |      |   |      |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|          |        |        |         |        |      |      |        |         |           |       |    Operation|   Operation|    Operation|               |               |         |      |      |      |      |   |      |     |        |        |  Lengths_K0_M_K1|   ArrangeOrder|               |               |       PerVector|    PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |       PerVector|    PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //########################################|          |        |        |         |        |      |      |           |       |        |         |             |            |             |               |               |      |      |      |      |   |      |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        //########################################|          |        |        |         |        |      |      |        |         |           |       |             |            |             |               |               |         |      |      |      |      |   |      |     |        |        |                 |               |               |               |                |                |          |                |               |               |              |                |                |          |            |            |                             |                |
        //generic instance
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  I8,   I8, DsDatatype,    I8,     I32,      I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   128,    64,    64,     4,  16,    16,   16,       2,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,               1,              16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,               1,              16,         1,           1,           1,               S<1, 32, 1, 4>,               1>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,    I8,    I8,     I32,       I8, DsDatatype,     I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   128,    64,    64,    64,  16,    16,   16,       2,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,               1,              16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,               1,              16,         1,           1,           1,               S<1, 32, 1, 4>,               1>,
        // blocksize=256
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  I8,   I8, DsDatatype,    I8,     I32,      I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   256,   128,   128,     4,  16,    16,   16,       4,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,    I8,    I8,     I32,       I8, DsDatatype,     I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,    64,  16,    16,   16,       4,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  I8,   I8, DsDatatype,    I8,     I32,      I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   256,    64,   256,     4,  16,    16,   16,       2,       4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,    I8,    I8,     I32,       I8, DsDatatype,     I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,    64,   256,    64,  16,    16,   16,       2,       4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  I8,   I8, DsDatatype,    I8,     I32,      I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   256,   256,    64,     4,  16,    16,   16,       8,       1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,    I8,    I8,     I32,       I8, DsDatatype,     I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   256,    64,    64,  16,    16,   16,       8,       1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  I8,   I8, DsDatatype,    I8,     I32,      I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   256,   128,   128,     8,  16,    16,   16,       4,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,    I8,    I8,     I32,       I8, DsDatatype,     I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,   256,   128,   128,   128,  16,    16,   16,       4,       2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 8>,               8>,
        // blocksize=128
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  I8,   I8, DsDatatype,    I8,     I32,      I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   128,    64,    64,     4,  16,    16,   16,       2,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,    I8,    I8,     I32,       I8, DsDatatype,     I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    128,    64,    64,   64,  16,    16,   16,       2,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  I8,   I8, DsDatatype,    I8,     I32,      I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   128,    64,    64,     8,  16,    16,   16,       2,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,    I8,    I8,     I32,       I8, DsDatatype,     I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    128,    64,    64,  128,  16,    16,   16,       2,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  I8,   I8, DsDatatype,    I8,     I32,      I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   128,    64,   128,     4,  16,    16,   16,       2,       4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,    I8,    I8,     I32,       I8, DsDatatype,     I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    128,    64,   128,   64,  16,    16,   16,       2,       4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  I8,   I8, DsDatatype,    I8,     I32,      I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   128,    64,   128,     8,  16,    16,   16,       2,       4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,    I8,    I8,     I32,       I8, DsDatatype,     I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    128,    64,   128,  128,  16,    16,   16,       2,       4,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  I8,   I8, DsDatatype,    I8,     I32,      I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   128,   128,    64,     4,  16,    16,   16,       4,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,    I8,    I8,     I32,       I8, DsDatatype,     I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    128,   128,    64,   64,  16,    16,   16,       4,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  I8,   I8, DsDatatype,    I8,     I32,      I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   128,   128,    64,     8,  16,    16,   16,       4,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,    I8,    I8,     I32,       I8, DsDatatype,     I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    128,   128,    64,  128,  16,    16,   16,       4,       2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  I8,   I8, DsDatatype,    I8,     I32,      I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   128,    32,   256,     4,  16,    16,   16,       1,       8,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,    I8,    I8,     I32,       I8, DsDatatype,     I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    128,    32,   256,   64,  16,    16,   16,       1,       8,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 4>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  I8,   I8, DsDatatype,    I8,     I32,      I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,   128,   256,    32,     4,  16,    16,   16,       8,       1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 4>,               8>,      
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,    I8,    I8,     I32,       I8, DsDatatype,     I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,    128,   256,    32,   64,  16,    16,   16,       8,       1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 4>,               8>,      
        // blocksize=64
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  I8,   I8, DsDatatype,    I8,     I32,      I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,    64,    32,    64,     4,  16,    16,   16,       1,       4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 2>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,    I8,    I8,     I32,       I8, DsDatatype,     I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,     64,    32,    64,   64,  16,    16,   16,       1,       4,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 2>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  I8,   I8, DsDatatype,    I8,     I32,      I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,    64,    64,    32,     4,  16,    16,   16,       2,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 2>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,    I8,    I8,     I32,       I8, DsDatatype,     I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,     64,    64,    32,   64,  16,    16,   16,       2,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 2>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  I8,   I8, DsDatatype,    I8,     I32,      I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,    64,    32,    32,     8,  16,    16,   16,       1,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 2>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,    I8,    I8,     I32,       I8, DsDatatype,     I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,     64,    32,    32,  128,  16,    16,   16,       1,       2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 2>,               8>,
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  I8,   I8, DsDatatype,    I8,     I32,      I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,    64,    32,   128,     4,  16,    16,   16,       1,       8,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 2>,               8>,
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,    I8,    I8,     I32,       I8, DsDatatype,     I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,     64,    32,   128,   64,  16,    16,   16,       1,       8,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 32, 1, 2>,               8>,
        // blocksize=32
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  I8,   I8, DsDatatype,    I8,     I32,      I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,    32,    16,    64,     4,  16,    16,   16,       1,       4,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 16, 1, 2>,               8>,  
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,    I8,    I8,     I32,       I8, DsDatatype,     I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,     32,    16,    64,   64,  16,    16,   16,       1,       4,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 16, 1, 2>,               8>,  
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  I8,   I8, DsDatatype,    I8,     I32,      I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,    32,    64,    16,     4,  16,    16,   16,       4,       1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 16, 1, 2>,               8>,  
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,    I8,    I8,     I32,       I8, DsDatatype,     I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,     32,    64,    16,   64,  16,    16,   16,       4,       1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 16, 1, 2>,               8>,  
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  I8,   I8, DsDatatype,    I8,     I32,      I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,    32,    32,    32,     4,  16,    16,   16,       2,       2,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 16, 1, 2>,               8>,  
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,    I8,    I8,     I32,       I8, DsDatatype,     I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,     32,    32,    32,   64,  16,    16,   16,       2,       2,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 16, 1, 2>,               8>,  
-        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,  I8,   I8, DsDatatype,    I8,     I32,      I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,    32,    16,    16,     4,  16,    16,   16,       1,       1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 16, 1, 2>,               8>
+        DeviceGroupedConvFwdMultipleD_Wmma_CShuffle<NDSpatial, ALayout, BLayout, DsLayout, ELayout,    I8,    I8,     I32,       I8, DsDatatype,     I8,  PassThrough, PassThrough, CDEElementOp,       ConvSpec, GemmMNKPadding,        1,     32,    16,    16,   64,  16,    16,   16,       1,       1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              16,              16,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              16,              16,         1,           1,           1,               S<1, 16, 1, 2>,               8>
    // clang-format on
    >;

--- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
@@ -111,6 +111,12 @@ list(APPEND GEMM_INSTANCES
    device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_kn_mn_instance.cpp
    device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp)
+list(APPEND GEMM_INSTANCES
+    device_gemm_wmma_f16_f16_f16_mk_kn_mn_instance.cpp
+    device_gemm_wmma_f16_f16_f16_mk_nk_mn_instance.cpp
+    device_gemm_wmma_f16_f16_f16_km_kn_mn_instance.cpp
+    device_gemm_wmma_f16_f16_f16_km_nk_mn_instance.cpp)
 add_instance_library(device_gemm_instance ${GEMM_INSTANCES})
 set(ENABLE_PIPELINE_V2_OPT)

--- a/library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt
@@ -3,8 +3,8 @@ add_instance_library(device_gemm_bilinear_instance
   device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
   device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
   device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
-  #device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_kn_mn_mn_instance.cpp
+   device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_kn_mn_mn_instance.cpp
-  #device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instance.cpp
+   device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instance.cpp
-  #device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_kn_mn_mn_instance.cpp
+   device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_kn_mn_mn_instance.cpp
-  #device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instance.cpp
+   device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instance.cpp
 )
--- a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_kn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_kn_mn_mn_instance.cpp
@@ -36,32 +36,32 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // e[m, n] = bilinear(a[m, k] * b[k, n], d[m, n])
 using device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_kn_mn_mn_instances = std::tuple<
    // clang-format off
-        //################################|      A|      B|        Ds|      E| AData| BData|    DsData| EData| AccData| CShuffle|           A|           B|         CDE|           GEMM| Block|  MPer|  NPer| K0Per|  K1| MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //################################|      A|      B|        Ds|      E| AData| BData|  AccData| CShuffle|   DsData| EData|            A|           B|         CDE|           GEMM| Prefetch| Block|  MPer|  NPer|  KPer|  K1| MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //################################| Layout| Layout|    Layout| Layout|  Type|  Type|      Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    | WMMA| WMMA|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //################################| Layout| Layout|    Layout| Layout|  Type|  Type|     Type| DataType|     Type|  Type|  Elementwise| Elementwise| Elementwise| Specialization|    Stage|  Size| Block| Block| Block|    | WMMA| WMMA|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //################################|       |       |          |       |      |      |          |      |        |         |   Operation|   Operation|   Operation|               |      |      |      |      |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //################################|       |       |          |       |      |      |         |         |         |      |    Operation|   Operation|   Operation|               |         |      |      |      |      |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //################################|       |       |          |       |      |      |          |      |        |         |            |            |            |               |      |      |      |      |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        //################################|       |       |          |       |      |      |         |         |         |      |             |            |            |               |         |      |      |      |      |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear,    GemmDefault,   256,   128,   128,     4,  16,   16,   16,       4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           2,               S<1, 32, 1, 8>,              16>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear,    GemmDefault,        1,   256,   128,   128,    64,  16,   16,   16,       4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           2,               S<1, 32, 1, 8>,              16>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear,    GemmDefault,   128,    64,    64,     4,  16,   16,   16,       2,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           2,               S<1, 32, 1, 4>,              16>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear,    GemmDefault,        1,   128,    64,    64,    64,  16,   16,   16,       2,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           2,               S<1, 32, 1, 4>,              16>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear,    GemmDefault,    64,    32,    32,     4,  16,   16,   16,       1,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           2,               S<1, 32, 1, 2>,              16>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear,    GemmDefault,        1,    64,    32,    32,    64,  16,   16,   16,       1,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           2,               S<1, 32, 1, 2>,              16>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear,    GemmDefault,    32,    16,    16,     4,  16,   16,   16,       1,       1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           1,               S<1, 16, 1, 2>,               8>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear,    GemmDefault,        1,    32,    16,    16,    64,  16,   16,   16,       1,       1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           1,               S<1, 16, 1, 2>,               8>,
        // M/N/K padding
-        //################################|      A|      B|        Ds|      E| AData| BData|    DsData| EData| AccData| CShuffle|           A|           B|         CDE|           GEMM| Block|  MPer|  NPer| K0Per|  K1| MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //################################|      A|      B|        Ds|      E| AData| BData|  AccData| CShuffle|   DsData| EData|            A|           B|         CDE|           GEMM| Prefetch| Block|  MPer|  NPer|  KPer|  K1| MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //################################| Layout| Layout|    Layout| Layout|  Type|  Type|      Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    | WMMA| WMMA|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //################################| Layout| Layout|    Layout| Layout|  Type|  Type|     Type| DataType|     Type|  Type|  Elementwise| Elementwise| Elementwise| Specialization|    Stage|  Size| Block| Block| Block|    | WMMA| WMMA|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //################################|       |       |          |       |      |      |          |      |        |         |   Operation|   Operation|   Operation|               |      |      |      |      |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //################################|       |       |          |       |      |      |         |         |         |      |    Operation|   Operation|   Operation|               |         |      |      |      |      |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //################################|       |       |          |       |      |      |          |      |        |         |            |            |            |               |      |      |      |      |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        //################################|       |       |          |       |      |      |         |         |         |      |             |            |            |               |         |      |      |      |      |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,   256,   128,   128,     4,  16,   16,   16,       4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           2,               S<1, 32, 1, 8>,              16>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,   128,    64,  16,   16,   16,       4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           2,               S<1, 32, 1, 8>,              16>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,   128,    64,    64,     4,  16,   16,   16,       2,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           2,               S<1, 32, 1, 4>,              16>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,    64,    64,    64,  16,   16,   16,       2,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           2,               S<1, 32, 1, 4>,              16>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,    64,    32,    32,     4,  16,   16,   16,       1,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           2,               S<1, 32, 1, 2>,              16>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    64,    32,    32,    64,  16,   16,   16,       1,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           2,               S<1, 32, 1, 2>,              16>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,    32,    16,    16,     4,  16,   16,   16,       1,       1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           1,               S<1, 16, 1, 2>,               8>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    32,    16,    16,    64,  16,   16,   16,       1,       1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,           1,           1,               S<1, 16, 1, 2>,               8>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,   256,   128,   128,     8,   8,   16,   16,       4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           2,               S<1, 32, 1, 8>,               8>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,   128,    64,   8,   16,   16,       4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           2,               S<1, 32, 1, 8>,               8>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,   128,    64,    64,     8,   8,   16,   16,       2,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           2,               S<1, 32, 1, 4>,               8>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,    64,    64,    64,   8,   16,   16,       2,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           2,               S<1, 32, 1, 4>,               8>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,    64,    32,    32,     8,   8,   16,   16,       1,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           2,               S<1, 32, 1, 2>,               8>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    64,    32,    32,    64,   8,   16,   16,       1,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           2,               S<1, 32, 1, 2>,               8>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,    32,    16,    16,     8,   8,   16,   16,       1,       1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 16, 1, 2>,               8>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    32,    16,    16,    64,   8,   16,   16,       1,       1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 16, 1, 2>,               8>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,   256,   128,   128,     8,   4,   16,   16,       4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           2,               S<1, 32, 1, 8>,               4>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,   128,    32,   4,   16,   16,       4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           2,               S<1, 32, 1, 8>,               4>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,   128,    64,    64,     8,   4,   16,   16,       2,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           2,               S<1, 32, 1, 4>,               4>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,    64,    64,    32,   4,   16,   16,       2,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           2,               S<1, 32, 1, 4>,               4>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,    64,    32,    32,     8,   4,   16,   16,       1,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           2,               S<1, 32, 1, 2>,               4>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    64,    32,    32,    32,   4,   16,   16,       1,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           2,               S<1, 32, 1, 2>,               4>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,    32,    16,    16,     8,   4,   16,   16,       1,       1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,               S<1, 16, 1, 2>,               4>
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Row, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    32,    16,    16,    32,   4,   16,   16,       1,       1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,           1,           1,               S<1, 16, 1, 2>,               4>
    // clang-format on
    >;

--- a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instance.cpp
@@ -36,32 +36,32 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // e[m, n] = bilinear(a[m, k] * b[k, n], d[m, n])
 using device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instances = std::tuple<
    // clang-format off
-        //################################|      A|      B|        Ds|      E| AData| BData|    DsData| EData| AccData| CShuffle|           A|           B|         CDE|           GEMM| Block|  MPer|  NPer| K0Per|  K1| MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //################################|      A|      B|        Ds|      E| AData| BData|  AccData| CShuffle|   DsData| EData|           A|           B|         CDE|           GEMM| Prefetch| Block|  MPer|  NPer|  KPer|  K1| MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //################################| Layout| Layout|    Layout| Layout|  Type|  Type|      Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    | WMMA| WMMA|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //################################| Layout| Layout|    Layout| Layout|  Type|  Type|     Type| DataType|     Type|  Type| Elementwise| Elementwise| Elementwise| Specialization|    Stage|  Size| Block| Block| Block|    | WMMA| WMMA|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //################################|       |       |          |       |      |      |          |      |        |         |   Operation|   Operation|   Operation|               |      |      |      |      |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //################################|       |       |          |       |      |      |         |         |         |      |   Operation|   Operation|   Operation|               |         |      |      |      |      |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //################################|       |       |          |       |      |      |          |      |        |         |            |            |            |               |      |      |      |      |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        //################################|       |       |          |       |      |      |         |         |         |      |            |            |            |               |         |      |      |      |      |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear,    GemmDefault,   256,   128,   128,     4,  16,   16,   16,       4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,           1,           2,               S<1, 32, 1, 8>,              16>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8, PassThrough, PassThrough,    Bilinear,    GemmDefault,        1,   256,   128,   128,    64,  16,   16,   16,       4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,           1,           2,               S<1, 32, 1, 8>,              16>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear,    GemmDefault,   128,    64,    64,     4,  16,   16,   16,       2,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,           1,           2,               S<1, 32, 1, 4>,              16>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8, PassThrough, PassThrough,    Bilinear,    GemmDefault,        1,   128,    64,    64,    64,  16,   16,   16,       2,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,           1,           2,               S<1, 32, 1, 4>,              16>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear,    GemmDefault,    64,    32,    32,     4,  16,   16,   16,       1,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,           1,           2,               S<1, 32, 1, 2>,              16>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8, PassThrough, PassThrough,    Bilinear,    GemmDefault,        1,    64,    32,    32,    64,  16,   16,   16,       1,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,           1,           2,               S<1, 32, 1, 2>,              16>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear,    GemmDefault,    32,    16,    16,     4,  16,   16,   16,       1,       1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,           1,           1,               S<1, 16, 1, 2>,               8>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8, PassThrough, PassThrough,    Bilinear,    GemmDefault,        1,    32,    16,    16,    64,  16,   16,   16,       1,       1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,           1,           1,               S<1, 16, 1, 2>,               8>,
        // M/N/K padding
-        //################################|      A|      B|        Ds|      E| AData| BData|    DsData| EData| AccData| CShuffle|           A|           B|         CDE|           GEMM| Block|  MPer|  NPer| K0Per|  K1| MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //################################|      A|      B|        Ds|      E| AData| BData|  AccData| CShuffle|   DsData| EData|            A|           B|         CDE|           GEMM| Prefetch| Block|  MPer|  NPer|  KPer|  K1| MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //################################| Layout| Layout|    Layout| Layout|  Type|  Type|      Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|    | WMMA| WMMA|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //################################| Layout| Layout|    Layout| Layout|  Type|  Type|     Type| DataType|     Type|  Type|  Elementwise| Elementwise| Elementwise| Specialization|    Stage|  Size| Block| Block| Block|    | WMMA| WMMA|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-        //################################|       |       |          |       |      |      |          |      |        |         |   Operation|   Operation|   Operation|               |      |      |      |      |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //################################|       |       |          |       |      |      |         |         |         |      |    Operation|   Operation|   Operation|               |         |      |      |      |      |    |     |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-        //################################|       |       |          |       |      |      |          |      |        |         |            |            |            |               |      |      |      |      |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+        //################################|       |       |          |       |      |      |         |         |         |      |             |            |            |               |         |      |      |      |      |    |     |     |        |        |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,   256,   128,   128,     4,  16,   16,   16,       4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,           1,           2,               S<1, 32, 1, 8>,              16>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,   128,    64,  16,   16,   16,       4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,           1,           2,               S<1, 32, 1, 8>,              16>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,   128,    64,    64,     4,  16,   16,   16,       2,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,           1,           2,               S<1, 32, 1, 4>,              16>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,    64,    64,    64,  16,   16,   16,       2,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,           1,           2,               S<1, 32, 1, 4>,              16>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,    64,    32,    32,     4,  16,   16,   16,       1,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,           1,           2,               S<1, 32, 1, 2>,              16>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    64,    32,    32,    64,  16,   16,   16,       1,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,           1,           2,               S<1, 32, 1, 2>,              16>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,    32,    16,    16,     4,  16,   16,   16,       1,       1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,           1,           1,               S<1, 16, 1, 2>,               8>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    32,    16,    16,    64,  16,   16,   16,       1,       1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,             16,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,           1,           1,               S<1, 16, 1, 2>,               8>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,   256,   128,   128,     8,   8,   16,   16,       4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,           1,           2,               S<1, 32, 1, 8>,               8>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,   128,    64,   8,   16,   16,       4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,           1,           2,               S<1, 32, 1, 8>,               8>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,   128,    64,    64,     8,   8,   16,   16,       2,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,           1,           2,               S<1, 32, 1, 4>,               8>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,    64,    64,    64,   8,   16,   16,       2,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,           1,           2,               S<1, 32, 1, 4>,               8>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,    64,    32,    32,     8,   8,   16,   16,       1,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,           1,           2,               S<1, 32, 1, 2>,               8>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    64,    32,    32,    64,   8,   16,   16,       1,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,           1,           2,               S<1, 32, 1, 2>,               8>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,    32,    16,    16,     8,   8,   16,   16,       1,       1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,           1,           1,               S<1, 16, 1, 2>,               8>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    32,    16,    16,    64,   8,   16,   16,       1,       1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,           1,           1,               S<1, 16, 1, 2>,               8>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,   256,   128,   128,     8,   4,   16,   16,       4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,           1,           2,               S<1, 32, 1, 8>,               4>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   256,   128,   128,    32,   4,   16,   16,       4,       2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,           1,           2,               S<1, 32, 1, 8>,               4>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,   128,    64,    64,     8,   4,   16,   16,       2,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,           1,           2,               S<1, 32, 1, 4>,               4>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,   128,    64,    64,    32,   4,   16,   16,       2,       2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,           1,           2,               S<1, 32, 1, 4>,               4>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,    64,    32,    32,     8,   4,   16,   16,       1,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,           1,           2,               S<1, 32, 1, 2>,               4>,
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    64,    32,    32,    32,   4,   16,   16,       1,       2,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,           1,           2,               S<1, 32, 1, 2>,               4>,
-        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,  I8_Tuple,    I8,     I32,      I32, PassThrough, PassThrough,    Bilinear, GemmMNKPadding,    32,    16,    16,     8,   4,   16,   16,       1,       1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,           1,           1,               S<1, 16, 1, 2>,               4>
+        DeviceGemmMultipleD_Wmma_CShuffle<     Col,    Col, Row_Tuple,    Row,    I8,    I8,      I32,      I32, I8_Tuple,    I8,  PassThrough, PassThrough,    Bilinear, GemmMNKPadding,        1,    32,    16,    16,    32,   4,   16,   16,       1,       1,     S<4,  8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,         1,     S<2, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,         1,           1,           1,               S<1, 16, 1, 2>,               4>
    // clang-format on
    >;