Merge branch 'develop' into codegen-enable-hiprtc

2a30cfdd · arai713 · GitHub · 9533a172 · 78195ccc · 2a30cfdd
Unverified Commit 2a30cfdd authored Feb 12, 2025 by arai713 Committed by GitHub Feb 12, 2025
20 changed files
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_scale.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/utility/common_header.hpp"
+
+#include "ck/host_utility/flush_cache.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_v2.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename BScaleDataType,
+          typename CDataType,
+          typename GemmAccDataType,
+          typename CShuffleDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          GemmSpecialization GemmSpec,
+          index_t BlockSize,
+          index_t ScaleBlockN, // scale block for N
+          index_t ScaleBlockK, // scale block for K
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1,
+          index_t BK1,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MXdlPerWave,
+          index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          bool ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          bool BBlockLdsExtraN,
+          index_t CShuffleMXdlPerWavePerShuffle,
+          index_t CShuffleNXdlPerWavePerShuffle,
+          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
+          BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1,
+          typename ComputeTypeA                       = CDataType,
+          typename ComputeTypeB                       = ComputeTypeA,
+          bool PermuteA                               = false,
+          bool PermuteB                               = false>
+struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2BScale<ALayout,
+                                                             BLayout,
+                                                             CLayout,
+                                                             ADataType,
+                                                             BDataType,
+                                                             BScaleDataType,
+                                                             CDataType,
+                                                             ScaleBlockN,
+                                                             ScaleBlockK,
+                                                             AElementwiseOperation,
+                                                             BElementwiseOperation,
+                                                             CElementwiseOperation>
+{
+    // GridwiseGemm
+    using GridwiseGemm = GridwiseGemm_xdl_cshuffle_v3<
+        ALayout,
+        BLayout,
+        CLayout,
+        ADataType,
+        BDataType,
+        GemmAccDataType,
+        CShuffleDataType,
+        CDataType,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CElementwiseOperation,
+        GemmSpec,
+        BlockSize,
+        ScaleBlockN,
+        ScaleBlockK,
+        MPerBlock,
+        NPerBlock,
+        KPerBlock,
+        AK1,
+        BK1,
+        MPerXDL,
+        NPerXDL,
+        MXdlPerWave,
+        NXdlPerWave,
+        ABlockTransferThreadClusterLengths_AK0_M_AK1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_AK1,
+        false,
+        ABlockLdsExtraM,
+        BBlockTransferThreadClusterLengths_BK0_N_BK1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_BK1,
+        false,
+        BBlockLdsExtraN,
+        CShuffleMXdlPerWavePerShuffle,
+        CShuffleNXdlPerWavePerShuffle,
+        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        CShuffleBlockTransferScalarPerVector_NPerBlock,
+        BlkGemmPipeSched,
+        BlkGemmPipelineVer,
+        ComputeTypeA,
+        ComputeTypeB,
+        PermuteA,
+        PermuteB>;
+
+    using Argument = typename GridwiseGemm::Argument;
+
+    // Invoker
+    struct Invoker : public BaseInvoker
+    {
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(stream_config.log_level_ > 0)
+            {
+                arg.Print();
+            }
+
+            if(!GridwiseGemm::CheckValidity(arg))
+            {
+                throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
+            }
+
+            index_t gdx, gdy, gdz;
+            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(arg.M, arg.N, arg.KBatch);
+
+            float ave_time = 0;
+
+            index_t k_grain = arg.KBatch * KPerBlock;
+            index_t K_split = (arg.K + k_grain - 1) / k_grain * KPerBlock;
+
+            const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
+
+            const auto Run = [&](const auto& kernel) {
+                if(stream_config.flush_cache)
+                {
+                    Argument arg_ = arg;
+
+                    const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
+                        arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
+                    const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1(
+                        arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0);
+
+                    auto size_a_buffer =
+                        a_grid_desc_ak0_m_ak1.GetElementSpaceSize() * sizeof(ADataType);
+                    auto size_b_buffer =
+                        b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * sizeof(BDataType);
+
+                    ck::utility::RotatingMemWrapper<Argument> rotating_mem(
+                        arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer);
+                    rotating_mem.Print();
+
+                    auto run_flush_cache = [&]() {
+                        // flush icache
+                        ck::utility::flush_icache();
+                        // rotating mem
+                        rotating_mem.Next();
+                        // clear c mem
+                        if(arg_.KBatch > 1)
+                            hipGetErrorString(hipMemsetAsync(arg_.p_c_grid,
+                                                             0,
+                                                             arg_.M * arg_.N * sizeof(CDataType),
+                                                             stream_config.stream_id_));
+                    };
+
+                    ave_time = ck::utility::launch_and_time_kernel_with_preprocess<false>(
+                        stream_config,
+                        run_flush_cache,
+                        kernel,
+                        dim3(gdx, gdy, gdz),
+                        dim3(BlockSize),
+                        0,
+                        arg_);
+                }
+                else
+                {
+                    if(arg.KBatch > 1)
+                        hipGetErrorString(hipMemsetAsync(arg.p_c_grid,
+                                                         0,
+                                                         arg.M * arg.N * sizeof(CDataType),
+                                                         stream_config.stream_id_));
+
+                    ave_time = launch_and_time_kernel(
+                        stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg);
+                }
+            };
+
+            constexpr index_t minimum_occupancy =
+                BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave
+                    ? (BlkGemmPipelineVer == BlockGemmPipelineVersion::v3 &&
+                       MPerBlock * NPerBlock * KPerBlock * sizeof(ADataType) <= 128 * 128 * 64 * 2)
+                          ? 2
+                          : 1
+                    : 2;
+
+            if(has_main_k_block_loop)
+            {
+                // Tail number always full
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1 ||
+                             BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        true,
+                                                        InMemoryDataOperationEnum::AtomicAdd,
+                                                        minimum_occupancy>;
+                        Run(kernel);
+                    }
+                    else
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        true,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        minimum_occupancy>;
+                        Run(kernel);
+                    }
+                }
+                // Tail number could be One to Seven
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::AtomicAdd,
+                                                            minimum_occupancy,
+                                                            TailNumber::One>;
+                            Run(kernel);
+                        }
+                        else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                TailNumber::Full)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::AtomicAdd,
+                                                            minimum_occupancy,
+                                                            TailNumber::Full>;
+                            Run(kernel);
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Two>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Three)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Three>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Four)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Four>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Five)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Five>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Six>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Seven)
+                            {
+                                const auto kernel = kernel_gemm_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Seven>;
+                                Run(kernel);
+                            }
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::One>;
+                            Run(kernel);
+                        }
+                        else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                TailNumber::Full)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::Full>;
+                            Run(kernel);
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Two>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Three)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Three>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Four)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Four>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Five)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Five>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Six>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Seven)
+                            {
+                                const auto kernel =
+                                    kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                                true,
+                                                                InMemoryDataOperationEnum::Set,
+                                                                minimum_occupancy,
+                                                                TailNumber::Seven>;
+                                Run(kernel);
+                            }
+                        }
+                    }
+                }
+                // Tail number could be Odd or Even
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_2lds<
+                                GridwiseGemm,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_2lds<
+                                GridwiseGemm,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3_2lds<GridwiseGemm,
+                                                                 true,
+                                                                 InMemoryDataOperationEnum::Set,
+                                                                 minimum_occupancy,
+                                                                 TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3_2lds<GridwiseGemm,
+                                                                 true,
+                                                                 InMemoryDataOperationEnum::Set,
+                                                                 minimum_occupancy,
+                                                                 TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                }
+                else
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::AtomicAdd,
+                                                            minimum_occupancy,
+                                                            TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::AtomicAdd,
+                                                            minimum_occupancy,
+                                                            TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                            true,
+                                                            InMemoryDataOperationEnum::Set,
+                                                            minimum_occupancy,
+                                                            TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                }
+            }
+            else
+            {
+                // Tail number always 1
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        false,
+                                                        InMemoryDataOperationEnum::AtomicAdd,
+                                                        minimum_occupancy>;
+                        Run(kernel);
+                    }
+                    else
+                    {
+                        const auto kernel =
+                            kernel_gemm_xdl_cshuffle_v3<GridwiseGemm,
+                                                        false,
+                                                        InMemoryDataOperationEnum::Set,
+                                                        minimum_occupancy>;
+                        Run(kernel);
+                    }
+                }
+            }
+
+            return ave_time;
+        }
+
+        // polymorphic
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    static bool IsSupportedArgument(const Argument& arg)
+    {
+        if(!ck::is_xdl_supported())
+        {
+            return false;
+        }
+
+        if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> && arg.KBatch > 1)
+        {
+            return false;
+        }
+
+        if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding ||
+                                                       GemmSpec == GemmSpecialization::NKPadding ||
+                                                       GemmSpec == GemmSpecialization::MNKPadding ||
+                                                       GemmSpec == GemmSpecialization::KPadding))
+        {
+            return false;
+        }
+
+        return GridwiseGemm::CheckValidity(arg);
+    }
+
+    // polymorphic
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+
+    index_t GetKPerBlock() override { return KPerBlock; }
+
+    bool GetPermuteB() override { return PermuteB; }
+
+    static auto MakeArgument(const ADataType* p_a,
+                             const BDataType* p_b,
+                             CDataType* p_c,
+                             index_t M,
+                             index_t N,
+                             index_t K,
+                             index_t StrideA,
+                             index_t StrideB,
+                             index_t StrideC,
+                             index_t StrideScaleB,
+                             const BScaleDataType* p_b_scale,
+                             index_t KBatch,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CElementwiseOperation c_element_op)
+    {
+        return Argument{p_a,
+                        p_b,
+                        p_c,
+                        M,
+                        N,
+                        K,
+                        StrideA,
+                        StrideB,
+                        StrideC,
+                        StrideScaleB,
+                        p_b_scale,
+                        KBatch,
+                        a_element_op,
+                        b_element_op,
+                        c_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    // polymorphic
+    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
+                                                      const void* p_b,
+                                                      void* p_c,
+                                                      index_t M,
+                                                      index_t N,
+                                                      index_t K,
+                                                      index_t StrideA,
+                                                      index_t StrideB,
+                                                      index_t StrideC,
+                                                      index_t StrideScaleB,
+                                                      const void* p_b_scale,
+                                                      index_t KBatch,
+                                                      AElementwiseOperation a_element_op,
+                                                      BElementwiseOperation b_element_op,
+                                                      CElementwiseOperation c_element_op) override
+    {
+        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
+                                          static_cast<const BDataType*>(p_b),
+                                          static_cast<CDataType*>(p_c),
+                                          M,
+                                          N,
+                                          K,
+                                          StrideA,
+                                          StrideB,
+                                          StrideC,
+                                          StrideScaleB,
+                                          static_cast<const BScaleDataType*>(p_b_scale),
+                                          KBatch,
+                                          a_element_op,
+                                          b_element_op,
+                                          c_element_op);
+    }
+
+    // polymorphic
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    // polymorphic
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        std::map<BlockGemmPipelineScheduler, std::string> BlkGemmPipelineSchedulerToString{
+            {BlockGemmPipelineScheduler::Intrawave, "Intrawave"},
+            {BlockGemmPipelineScheduler::Interwave, "Interwave"}};
+
+        std::map<BlockGemmPipelineVersion, std::string> BlkGemmPipelineVersionToString{
+            {BlockGemmPipelineVersion::v1, "v1"},
+            {BlockGemmPipelineVersion::v2, "v2"},
+            {BlockGemmPipelineVersion::v3, "v3"},
+            {BlockGemmPipelineVersion::v4, "v4"},
+            {BlockGemmPipelineVersion::v5, "v5"}};
+
+        // clang-format off
+        str << "DeviceGemmXdlUniversal"
+            << "<"
+            << getGemmSpecializationString(GemmSpec) << ", "
+            << std::string(ALayout::name)[0]
+            << std::string(BLayout::name)[0]
+            << std::string(CLayout::name)[0]
+            << ">"
+            << " BlkSize: "
+            << BlockSize << ", "
+            << "BlkTile: "
+            << MPerBlock<<"x"<<NPerBlock<<"x"<<KPerBlock << ", "
+            << "WaveTile: "
+            << MPerXDL<<"x"<<NPerXDL << ", "
+            << "WaveMap: "
+            << MXdlPerWave<<"x" << NXdlPerWave<<", "
+            << "VmemReadVec: "
+            << ABlockTransferSrcScalarPerVector<<"x"<<BBlockTransferSrcScalarPerVector<<", "
+            << "BlkGemmPipelineScheduler: "
+            << BlkGemmPipelineSchedulerToString[BlkGemmPipeSched] << ", "
+            << "BlkGemmPipelineVersion: "
+            << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
+            << "BlkGemmPipelinePrefetchStages: "
+            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
@@ -205,8 +205,8 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,

            const auto b2c_map = DefaultBlock2CTileMap{};
            index_t gdx, gdy, gdz;
-            std::tie(gdx, gdy, gdz) = b2c_map.CalculateGridSize(karg.M, karg.N, karg.k_batch);
-            const auto K0Padded     = karg.K0Padded;
+            ck::tie(gdx, gdy, gdz) = b2c_map.CalculateGridSize(karg.M, karg.N, karg.k_batch);
+            const auto K0Padded    = karg.K0Padded;

            const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0Padded);


--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle_lds_direct_load.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle_lds_direct_load.hpp
@@ -183,8 +183,8 @@ struct DeviceGemmXdlSplitKCShuffle_LdsDirectLoad : public DeviceGemmSplitK<ALayo

            const auto b2c_map = DefaultBlock2CTileMap{};
            index_t gdx, gdy, gdz;
-            std::tie(gdx, gdy, gdz) = b2c_map.CalculateGridSize(karg.M, karg.N, karg.k_batch);
-            const auto K0Padded     = karg.K0Padded;
+            ck::tie(gdx, gdy, gdz) = b2c_map.CalculateGridSize(karg.M, karg.N, karg.k_batch);
+            const auto K0Padded    = karg.K0Padded;

            const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0Padded);


--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_waveletmodel_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_waveletmodel_cshuffle.hpp
@@ -47,8 +47,7 @@ __global__ void
                e_grid_desc_mblock_mperblock_nblock_nperblock,
            const Block2ETileMap block_2_etile_map)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid,

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp
@@ -37,8 +37,7 @@ __global__ void
            const BElementwiseOperation b_element_op,
            const CDEElementwiseOperation cde_element_op)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    const index_t block_id = get_block_1d_id();

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -106,89 +106,35 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
    static constexpr auto I3           = Number<3>{};
    static constexpr index_t KPerBlock = K0PerBlock * K1;

-    static constexpr auto transform_conv_to_gemm =
-        TransformConvBwdDataToGemm_v1<NDimSpatial,
-                                      ConvBackwardDataSpecialization,
-                                      K1,
-                                      K1,
-                                      MPerBlock,
-                                      NPerBlock,
-                                      KPerBlock,
-                                      true /* DoPadGemmM */,
-                                      true /* DoPadGemmN */>{};
-
-    static auto GetDummyABDsEGridDescriptor()
-    {
-        const std::array<index_t, NDimSpatial + 3> dummy_tensor_lengths = {1};
-        const std::array<index_t, NDimSpatial + 3> dummy_tensor_strides = {1};
-        const std::array<index_t, NDimSpatial> dummy_spatial_lengths    = {1};
-
-        const auto a_grid_desc_ak0_m_ak1 =
-            transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1<ALayout>(
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths);
-
-        const auto b_grid_desc_bk0_n_bk1 =
-            transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1<BLayout>(
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths);
-
-        const auto ds_grid_desc_m_n = generate_tuple(
-            [&](auto i) {
-                using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
-
-                return transform_conv_to_gemm.template MakeCDescriptor_M_N<DLayout>(
-                    dummy_tensor_lengths,
-                    dummy_tensor_strides,
-                    dummy_tensor_lengths,
-                    dummy_tensor_strides,
-                    dummy_tensor_lengths,
-                    dummy_tensor_strides,
-                    dummy_spatial_lengths,
-                    dummy_spatial_lengths,
-                    dummy_spatial_lengths,
-                    dummy_spatial_lengths,
-                    dummy_spatial_lengths);
-            },
-            Number<NumDTensor>{});
-
-        const auto e_grid_desc_m_n =
-            transform_conv_to_gemm.template MakeCDescriptor_M_N<ELayout>(dummy_tensor_lengths,
-                                                                         dummy_tensor_strides,
-                                                                         dummy_tensor_lengths,
-                                                                         dummy_tensor_strides,
-                                                                         dummy_tensor_lengths,
-                                                                         dummy_tensor_strides,
-                                                                         dummy_spatial_lengths,
-                                                                         dummy_spatial_lengths,
-                                                                         dummy_spatial_lengths,
-                                                                         dummy_spatial_lengths,
-                                                                         dummy_spatial_lengths);
+    using ConvToGemmBwdDataTransform = TransformConvBwdDataToGemm_v1<NDimSpatial,
+                                                                     ConvBackwardDataSpecialization,
+                                                                     K1,
+                                                                     K1,
+                                                                     MPerBlock,
+                                                                     NPerBlock,
+                                                                     KPerBlock,
+                                                                     true /* DoPadGemmM */,
+                                                                     true /* DoPadGemmN */,
+                                                                     ALayout,
+                                                                     BLayout,
+                                                                     ELayout>;

+    static auto
+    GetDummyABDsEGridDescriptor(const ConvToGemmBwdDataTransform& conv_to_gemm_transform)
+    {
+        const auto a_grid_desc_ak0_m_ak1 = conv_to_gemm_transform.MakeADescriptor_AK0_M_AK1();
+        const auto b_grid_desc_bk0_n_bk1 = conv_to_gemm_transform.MakeBDescriptor_BK0_N_BK1();
+        const auto ds_grid_desc_m_n =
+            generate_tuple([&](auto) { return conv_to_gemm_transform.MakeCDescriptor_M_N(); },
+                           Number<NumDTensor>{});
+        const auto e_grid_desc_m_n = conv_to_gemm_transform.MakeCDescriptor_M_N();
        return make_tuple(
            a_grid_desc_ak0_m_ak1, b_grid_desc_bk0_n_bk1, ds_grid_desc_m_n, e_grid_desc_m_n);
    }

    // desc
-    using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor());
+    constexpr static ConvToGemmBwdDataTransform dummy_conv_to_gemm_transform;
+    using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor(dummy_conv_to_gemm_transform));

    using AGridDesc_AK0_M_AK1 = remove_cvref_t<tuple_element_t<0, ABDsEGridDesc>>;
    using BGridDesc_BK0_N_BK1 = remove_cvref_t<tuple_element_t<1, ABDsEGridDesc>>;
@@ -270,7 +216,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
                 const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
                 const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
                 const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
-                     ds_g_n_c_wis_lengths,
+                 /*ds_g_n_c_wis_lengths*/,
                 const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
                     ds_g_n_c_wis_strides,
                 const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_lengths,
@@ -291,15 +237,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
              b_element_op_{b_element_op},
              cde_element_op_{cde_element_op},
              a_g_n_k_wos_lengths_{a_g_n_k_wos_lengths},
-              a_g_n_k_wos_strides_{a_g_n_k_wos_strides},
              b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths},
-              b_g_k_c_xs_strides_{b_g_k_c_xs_strides},
-              ds_g_n_c_wis_lengths_{ds_g_n_c_wis_lengths},
-              ds_g_n_c_wis_strides_{ds_g_n_c_wis_strides},
-              e_g_n_c_wis_lengths_{e_g_n_c_wis_lengths},
-              e_g_n_c_wis_strides_{e_g_n_c_wis_strides},
              conv_filter_strides_{conv_filter_strides},
-              conv_filter_dilations_{conv_filter_dilations},
              input_left_pads_{input_left_pads},
              input_right_pads_{input_right_pads}
        {
@@ -382,68 +321,47 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
                            tildes = {i_ztilde, i_ytilde, i_xtilde};
                        }

+                        ConvToGemmBwdDataTransform conv_to_gemm_transform_{a_g_n_k_wos_lengths,
+                                                                           a_g_n_k_wos_strides,
+                                                                           b_g_k_c_xs_lengths,
+                                                                           b_g_k_c_xs_strides,
+                                                                           e_g_n_c_wis_lengths,
+                                                                           e_g_n_c_wis_strides,
+                                                                           conv_filter_strides,
+                                                                           conv_filter_dilations,
+                                                                           input_left_pads,
+                                                                           input_right_pads,
+                                                                           tildes};
+
                        const auto a_grid_desc_ak0_m_ak1 =
-                            transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1<ALayout>(
-                                a_g_n_k_wos_lengths,
-                                a_g_n_k_wos_strides,
-                                b_g_k_c_xs_lengths,
-                                b_g_k_c_xs_strides,
-                                e_g_n_c_wis_lengths,
-                                e_g_n_c_wis_strides,
-                                conv_filter_strides,
-                                conv_filter_dilations,
-                                input_left_pads,
-                                input_right_pads,
-                                tildes);
+                            conv_to_gemm_transform_.MakeADescriptor_AK0_M_AK1();

                        const auto b_grid_desc_bk0_n_bk1 =
-                            transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1<BLayout>(
-                                a_g_n_k_wos_lengths,
-                                a_g_n_k_wos_strides,
-                                b_g_k_c_xs_lengths,
-                                b_g_k_c_xs_strides,
-                                e_g_n_c_wis_lengths,
-                                e_g_n_c_wis_strides,
-                                conv_filter_strides,
-                                conv_filter_dilations,
-                                input_left_pads,
-                                input_right_pads,
-                                tildes);
+                            conv_to_gemm_transform_.MakeBDescriptor_BK0_N_BK1();

                        DsGridDesc_M_N ds_grid_desc_m_n;

                        // populate Ds desc
                        static_for<0, NumDTensor, 1>{}([&](auto i) {
                            using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
-
-                            ds_grid_desc_m_n(i) =
-                                transform_conv_to_gemm.template MakeCDescriptor_M_N<DLayout>(
-                                    a_g_n_k_wos_lengths,
-                                    a_g_n_k_wos_strides,
-                                    b_g_k_c_xs_lengths,
-                                    b_g_k_c_xs_strides,
-                                    ds_g_n_c_wis_lengths[i],
-                                    ds_g_n_c_wis_strides[i],
-                                    conv_filter_strides,
-                                    conv_filter_dilations,
-                                    input_left_pads,
-                                    input_right_pads,
-                                    tildes);
-                        });
-
-                        const auto e_grid_desc_m_n =
-                            transform_conv_to_gemm.template MakeCDescriptor_M_N<ELayout>(
+                            static_assert(is_same_v<DLayout, ELayout>);
+                            ConvToGemmBwdDataTransform conv_to_gemm_transform_d{
                                a_g_n_k_wos_lengths,
                                a_g_n_k_wos_strides,
                                b_g_k_c_xs_lengths,
                                b_g_k_c_xs_strides,
                                e_g_n_c_wis_lengths,
-                                e_g_n_c_wis_strides,
+                                ds_g_n_c_wis_strides[i],
                                conv_filter_strides,
                                conv_filter_dilations,
                                input_left_pads,
                                input_right_pads,
-                                tildes);
+                                tildes};
+
+                            ds_grid_desc_m_n(i) = conv_to_gemm_transform_d.MakeCDescriptor_M_N();
+                        });
+
+                        const auto e_grid_desc_m_n = conv_to_gemm_transform_.MakeCDescriptor_M_N();

                        // for check validity
                        ds_grid_desc_m_n_container_.push_back(ds_grid_desc_m_n);
@@ -522,17 +440,9 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle
        BElementwiseOp b_element_op_;
        CDEElementwiseOp cde_element_op_;

-        // for checking IsSupportedArgument()
        std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_lengths_;
-        std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_strides_;
        std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_lengths_;
-        std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_strides_;
-        std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_c_wis_lengths_;
-        std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_c_wis_strides_;
-        std::array<index_t, NDimSpatial + 3> e_g_n_c_wis_lengths_;
-        std::array<index_t, NDimSpatial + 3> e_g_n_c_wis_strides_;
        std::array<index_t, NDimSpatial> conv_filter_strides_;
-        std::array<index_t, NDimSpatial> conv_filter_dilations_;
        std::array<index_t, NDimSpatial> input_left_pads_;
        std::array<index_t, NDimSpatial> input_right_pads_;
    };

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
@@ -54,15 +54,16 @@ template <typename GridwiseGemm,
          typename ABDataType,
          typename DsPointer,
          typename EDataType,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CDEElementwiseOperation,
+          typename AElementwiseOp,
+          typename BElementwiseOp,
+          typename CDEElementwiseOp,
          typename AGridDesc_AK0_M_AK1,
          typename BGridDesc_BK0_N_BK1,
          typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
          typename EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
          typename Block2ETileMap,
          typename ComputePtrOffsetOfBatch,
+          typename ComputePtrOffsetOfN,
          bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
@@ -73,10 +74,9 @@ __global__ void
            const ABDataType* __restrict__ p_b_grid,
            DsPointer p_ds_grid,
            EDataType* __restrict__ p_e_grid,
-            const AElementwiseOperation a_element_op,
-            const BElementwiseOperation b_element_op,
-            const CDEElementwiseOperation cde_element_op,
-            const index_t batch_count,
+            const AElementwiseOp a_element_op,
+            const BElementwiseOp b_element_op,
+            const CDEElementwiseOp cde_element_op,
            const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
            const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
            const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
@@ -84,24 +84,28 @@ __global__ void
            const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
                e_grid_desc_mblock_mperblock_nblock_nperblock_,
            const Block2ETileMap block_2_ctile_map,
-            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
+            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
+            const ComputePtrOffsetOfN compute_ptr_offset_of_n)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    // offset base pointer for each work-group
-    const index_t num_blocks_per_batch =
-        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
-    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
+    const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
+    const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y);

-    const long_index_t a_batch_offset = amd_wave_read_first_lane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
-    const long_index_t b_batch_offset = amd_wave_read_first_lane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
-    const long_index_t e_batch_offset = amd_wave_read_first_lane(
-        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
+    const long_index_t a_batch_offset =
+        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+    const long_index_t b_batch_offset =
+        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
+    const long_index_t e_batch_offset =
+        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));

    const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);

+    const long_index_t a_n_offset =
+        amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx));
+    const long_index_t e_n_offset =
+        amd_wave_read_first_lane(compute_ptr_offset_of_n.GetEPtrOffset(n_idx));
+
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    DsPointer p_ds_grid_grp;
@@ -112,10 +116,10 @@ __global__ void
    static_for<0, NumDTensor, 1>{}(
        [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });

-    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
+    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset + a_n_offset,
                                                  p_b_grid + b_batch_offset,
                                                  p_ds_grid_grp,
-                                                  p_e_grid + e_batch_offset,
+                                                  p_e_grid + e_batch_offset + e_n_offset,
                                                  p_shared,
                                                  a_element_op,
                                                  b_element_op,
@@ -130,7 +134,6 @@ __global__ void
    ignore = p_b_grid;
    ignore = p_ds_grid;
    ignore = p_e_grid;
-    ignore = batch_count;
    ignore = a_grid_desc_ak0_m_ak1;
    ignore = b_grid_desc_bk0_n_bk1;
    ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock;
@@ -139,6 +142,7 @@ __global__ void
    ignore = b_element_op;
    ignore = cde_element_op;
    ignore = compute_ptr_offset_of_batch;
+    ignore = compute_ptr_offset_of_n;
    ignore = block_2_ctile_map;
 #endif
 }
@@ -233,82 +237,54 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
    static constexpr auto I2 = Number<2>{};
    static constexpr auto I3 = Number<3>{};

-    static constexpr auto transform_conv_to_gemm =
-        TransformConvBwdDataToGemm_v1<NDimSpatial,
-                                      ConvBackwardDataSpecialization,
-                                      AK1,
-                                      BK1,
-                                      MPerBlock,
-                                      NPerBlock,
-                                      KPerBlock,
-                                      DoPadGemmM,
-                                      DoPadGemmN>{};
-
-    static auto GetDummyABDsEGridDescriptor()
+    using ConvToGemmBwdDataTransform = TransformConvBwdDataToGemm_v1<NDimSpatial,
+                                                                     ConvBackwardDataSpecialization,
+                                                                     AK1,
+                                                                     BK1,
+                                                                     MPerBlock,
+                                                                     NPerBlock,
+                                                                     KPerBlock,
+                                                                     DoPadGemmM,
+                                                                     DoPadGemmN,
+                                                                     ALayout,
+                                                                     BLayout,
+                                                                     ELayout,
+                                                                     true, /*SplitConvN*/
+                                                                     ABDataType,
+                                                                     EDataType>;
+
+    static auto
+    GetDummyABDsEGridDescriptor(const ConvToGemmBwdDataTransform& conv_to_gemm_transform)
    {
-        const std::array<index_t, NDimSpatial + 3> dummy_tensor_lengths = {1};
-        const std::array<index_t, NDimSpatial + 3> dummy_tensor_strides = {1};
-        const std::array<index_t, NDimSpatial> dummy_spatial_lengths    = {1};
-
-        const auto a_grid_desc_ak0_m_ak1 =
-            transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1<ALayout>(
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths);
-
-        const auto b_grid_desc_bk0_n_bk1 =
-            transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1<BLayout>(
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_tensor_lengths,
-                dummy_tensor_strides,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths,
-                dummy_spatial_lengths);
+        const auto a_grid_desc_ak0_m_ak1 = conv_to_gemm_transform.MakeADescriptor_AK0_M_AK1();
+
+        const auto b_grid_desc_bk0_n_bk1 = conv_to_gemm_transform.MakeBDescriptor_BK0_N_BK1();

        const auto ds_grid_desc_m_n = generate_tuple(
            [&](auto i) {
-                using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
-
-                return transform_conv_to_gemm.template MakeCDescriptor_M_N<DLayout>(
-                    dummy_tensor_lengths,
-                    dummy_tensor_strides,
-                    dummy_tensor_lengths,
-                    dummy_tensor_strides,
-                    dummy_tensor_lengths,
-                    dummy_tensor_strides,
-                    dummy_spatial_lengths,
-                    dummy_spatial_lengths,
-                    dummy_spatial_lengths,
-                    dummy_spatial_lengths,
-                    dummy_spatial_lengths);
+                using DLayout   = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+                using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+                using ConvToGemmBwdDataTransformD =
+                    TransformConvBwdDataToGemm_v1<NDimSpatial,
+                                                  ConvBackwardDataSpecialization,
+                                                  AK1,
+                                                  BK1,
+                                                  MPerBlock,
+                                                  NPerBlock,
+                                                  KPerBlock,
+                                                  DoPadGemmM,
+                                                  DoPadGemmN,
+                                                  ALayout,
+                                                  BLayout,
+                                                  DLayout,
+                                                  true, /*SplitConvN*/
+                                                  ABDataType,
+                                                  DDataType>;
+                return ConvToGemmBwdDataTransformD{}.MakeCDescriptor_M_N();
            },
            Number<NumDTensor>{});

-        const auto e_grid_desc_m_n =
-            transform_conv_to_gemm.template MakeCDescriptor_M_N<ELayout>(dummy_tensor_lengths,
-                                                                         dummy_tensor_strides,
-                                                                         dummy_tensor_lengths,
-                                                                         dummy_tensor_strides,
-                                                                         dummy_tensor_lengths,
-                                                                         dummy_tensor_strides,
-                                                                         dummy_spatial_lengths,
-                                                                         dummy_spatial_lengths,
-                                                                         dummy_spatial_lengths,
-                                                                         dummy_spatial_lengths,
-                                                                         dummy_spatial_lengths);
+        const auto e_grid_desc_m_n = conv_to_gemm_transform.MakeCDescriptor_M_N();

        return make_tuple(
            a_grid_desc_ak0_m_ak1, b_grid_desc_bk0_n_bk1, ds_grid_desc_m_n, e_grid_desc_m_n);
@@ -377,7 +353,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
    }

    // desc
-    using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor());
+    constexpr static ConvToGemmBwdDataTransform dummy_conv_to_gemm_transform;
+    using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor(dummy_conv_to_gemm_transform));

    using AGridDesc_AK0_M_AK1 = remove_cvref_t<tuple_element_t<0, ABDsEGridDesc>>;
    using BGridDesc_BK0_N_BK1 = remove_cvref_t<tuple_element_t<1, ABDsEGridDesc>>;
@@ -431,15 +408,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
              b_element_op_{b_element_op},
              cde_element_op_{cde_element_op},
              a_g_n_k_wos_lengths_{a_g_n_k_wos_lengths},
-              a_g_n_k_wos_strides_{a_g_n_k_wos_strides},
              b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths},
-              b_g_k_c_xs_strides_{b_g_k_c_xs_strides},
-              ds_g_n_c_wis_lengths_{ds_g_n_c_wis_lengths},
-              ds_g_n_c_wis_strides_{ds_g_n_c_wis_strides},
-              e_g_n_c_wis_lengths_{e_g_n_c_wis_lengths},
-              e_g_n_c_wis_strides_{e_g_n_c_wis_strides},
              conv_filter_strides_{conv_filter_strides},
-              conv_filter_dilations_{conv_filter_dilations},
              input_left_pads_{input_left_pads},
              input_right_pads_{input_right_pads}
        {
@@ -450,11 +420,6 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                p_ds_grid_(i) = static_cast<const DDataType*>(p_ds[i]);
            });

-            // A/B/Ds/E Batch Stride
-            compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides[0];
-            compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0];
-            compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_c_wis_strides[0];
-
            static_for<0, NumDTensor, 1>{}([&](auto i) {
                compute_ptr_offset_of_batch_.BatchStrideDs_(i) = ds_g_n_c_wis_strides[i][0];
            });
@@ -526,68 +491,65 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                            throw std::runtime_error("wrong! only implemented for 2D and 3D now");
                        }

+                        ConvToGemmBwdDataTransform conv_to_gemm_transform_{a_g_n_k_wos_lengths,
+                                                                           a_g_n_k_wos_strides,
+                                                                           b_g_k_c_xs_lengths,
+                                                                           b_g_k_c_xs_strides,
+                                                                           e_g_n_c_wis_lengths,
+                                                                           e_g_n_c_wis_strides,
+                                                                           conv_filter_strides,
+                                                                           conv_filter_dilations,
+                                                                           input_left_pads,
+                                                                           input_right_pads,
+                                                                           tildes};
+
+                        conv_N_per_block_ = conv_to_gemm_transform_.N_;
+
                        const auto a_grid_desc_ak0_m_ak1 =
-                            transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1<ALayout>(
-                                a_g_n_k_wos_lengths,
-                                a_g_n_k_wos_strides,
-                                b_g_k_c_xs_lengths,
-                                b_g_k_c_xs_strides,
-                                e_g_n_c_wis_lengths,
-                                e_g_n_c_wis_strides,
-                                conv_filter_strides,
-                                conv_filter_dilations,
-                                input_left_pads,
-                                input_right_pads,
-                                tildes);
+                            conv_to_gemm_transform_.MakeADescriptor_AK0_M_AK1();

                        const auto b_grid_desc_bk0_n_bk1 =
-                            transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1<BLayout>(
-                                a_g_n_k_wos_lengths,
-                                a_g_n_k_wos_strides,
-                                b_g_k_c_xs_lengths,
-                                b_g_k_c_xs_strides,
-                                e_g_n_c_wis_lengths,
-                                e_g_n_c_wis_strides,
-                                conv_filter_strides,
-                                conv_filter_dilations,
-                                input_left_pads,
-                                input_right_pads,
-                                tildes);
+                            conv_to_gemm_transform_.MakeBDescriptor_BK0_N_BK1();

                        DsGridDesc_M_N ds_grid_desc_m_n;

                        // populate Ds desc
                        static_for<0, NumDTensor, 1>{}([&](auto i) {
-                            using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
-
-                            ds_grid_desc_m_n(i) =
-                                transform_conv_to_gemm.template MakeCDescriptor_M_N<DLayout>(
-                                    a_g_n_k_wos_lengths,
-                                    a_g_n_k_wos_strides,
-                                    b_g_k_c_xs_lengths,
-                                    b_g_k_c_xs_strides,
-                                    ds_g_n_c_wis_lengths[i],
-                                    ds_g_n_c_wis_strides[i],
-                                    conv_filter_strides,
-                                    conv_filter_dilations,
-                                    input_left_pads,
-                                    input_right_pads,
-                                    tildes);
-                        });
-
-                        const auto e_grid_desc_m_n =
-                            transform_conv_to_gemm.template MakeCDescriptor_M_N<ELayout>(
+                            using DLayout   = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;
+                            using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+                            using ConvToGemmBwdDataTransformD =
+                                TransformConvBwdDataToGemm_v1<NDimSpatial,
+                                                              ConvBackwardDataSpecialization,
+                                                              AK1,
+                                                              BK1,
+                                                              MPerBlock,
+                                                              NPerBlock,
+                                                              KPerBlock,
+                                                              DoPadGemmM,
+                                                              DoPadGemmN,
+                                                              ALayout,
+                                                              BLayout,
+                                                              DLayout,
+                                                              true, /*SplitConvN*/
+                                                              ABDataType,
+                                                              DDataType>;
+                            ConvToGemmBwdDataTransformD conv_to_gemm_transform_d{
                                a_g_n_k_wos_lengths,
                                a_g_n_k_wos_strides,
                                b_g_k_c_xs_lengths,
                                b_g_k_c_xs_strides,
-                                e_g_n_c_wis_lengths,
-                                e_g_n_c_wis_strides,
+                                ds_g_n_c_wis_lengths[i],
+                                ds_g_n_c_wis_strides[i],
                                conv_filter_strides,
                                conv_filter_dilations,
                                input_left_pads,
                                input_right_pads,
-                                tildes);
+                                tildes};
+
+                            ds_grid_desc_m_n(i) = conv_to_gemm_transform_d.MakeCDescriptor_M_N();
+                        });
+
+                        const auto e_grid_desc_m_n = conv_to_gemm_transform_.MakeCDescriptor_M_N();

                        // desc for problem definition
                        const auto a_grid_desc_m_k =
@@ -628,6 +590,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                    }
                }
            }
+            // A/B/Ds/E Batch Stride
+            compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides[0];
+            compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0];
+            compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_c_wis_strides[0];
+
+            compute_ptr_offset_of_n_.BatchStrideA_ = a_g_n_k_wos_strides[1] * conv_N_per_block_;
+            compute_ptr_offset_of_n_.BatchStrideE_ = e_g_n_c_wis_strides[1] * conv_N_per_block_;
        }

        void Print() const
@@ -660,6 +629,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1

        // tensor descriptor for problem definition
        index_t num_group_;
+        index_t conv_N_per_block_;
        std::vector<AGridDesc_M_K> a_grid_desc_m_k_container_;
        std::vector<BGridDesc_N_K> b_grid_desc_n_k_container_;
        std::vector<DsGridDesc_M_N> ds_grid_desc_m_n_container_;
@@ -678,23 +648,16 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1

        // for computing batch offset
        ComputePtrOffsetOfStridedBatch<I1, I1, NumDTensor> compute_ptr_offset_of_batch_;
+        ComputePtrOffsetOfStridedBatch<I1, I1, I0> compute_ptr_offset_of_n_;

        // element-wise op
        AElementwiseOp a_element_op_;
        BElementwiseOp b_element_op_;
        CDEElementwiseOp cde_element_op_;

-        // for checking IsSupportedArgument()
        std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_lengths_;
-        std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_strides_;
        std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_lengths_;
-        std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_strides_;
-        std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_c_wis_lengths_;
-        std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_c_wis_strides_;
-        std::array<index_t, NDimSpatial + 3> e_g_n_c_wis_lengths_;
-        std::array<index_t, NDimSpatial + 3> e_g_n_c_wis_strides_;
        std::array<index_t, NDimSpatial> conv_filter_strides_;
-        std::array<index_t, NDimSpatial> conv_filter_dilations_;
        std::array<index_t, NDimSpatial> input_left_pads_;
        std::array<index_t, NDimSpatial> input_right_pads_;
    };
@@ -711,8 +674,12 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                arg.Print();
            }

-            float ave_time = 0;
+            const index_t gdy = arg.num_group_;
+            const index_t num_workgroups_per_Conv_N =
+                arg.a_g_n_k_wos_lengths_[I1] / arg.conv_N_per_block_;
+            const index_t gdz = num_workgroups_per_Conv_N;

+            float ave_time = 0;
            for(std::size_t i = 0; i < arg.a_grid_desc_ak0_m_ak1_container_.size(); i++)
            {
                if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_container_[i],
@@ -724,9 +691,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                    throw std::runtime_error("wrong! device_op has invalid setting");
                }

-                const index_t grid_size = arg.block_2_etile_map_container_[i].CalculateGridSize(
-                                              arg.e_grid_desc_m_n_container_[i]) *
-                                          arg.num_group_;
+                const index_t gdx = arg.block_2_etile_map_container_[i].CalculateGridSize(
+                    arg.e_grid_desc_m_n_container_[i]);

                const auto GemmK = arg.a_grid_desc_m_k_container_[i].GetLength(I1);

@@ -747,12 +713,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                        DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
                        Block2ETileMap,
                        ComputePtrOffsetOfStridedBatch<I1, I1, NumDTensor>,
+                        ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
                        has_main_loop>;

                    return launch_and_time_kernel(
                        stream_config,
                        kernel,
-                        dim3(grid_size),
+                        dim3(gdx, gdy, gdz),
                        dim3(BlockSize),
                        0,
                        arg.p_a_grid_,
@@ -762,13 +729,13 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
                        arg.a_element_op_,
                        arg.b_element_op_,
                        arg.cde_element_op_,
-                        arg.a_g_n_k_wos_lengths_[0], // Group count
                        arg.a_grid_desc_ak0_m_ak1_container_[i],
                        arg.b_grid_desc_bk0_n_bk1_container_[i],
                        arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_container_[i],
                        arg.e_grid_desc_mblock_mperblock_nblock_nperblock_container_[i],
                        arg.block_2_etile_map_container_[i],
-                        arg.compute_ptr_offset_of_batch_);
+                        arg.compute_ptr_offset_of_batch_,
+                        arg.compute_ptr_offset_of_n_);
                };

                if(GridwiseGemm::CalculateHasMainKBlockLoop(GemmK))

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp
@@ -60,8 +60,7 @@ __global__ void
            const Block2CTileMap block_2_ctile_map,
            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
@@ -103,7 +102,7 @@ __global__ void
    compute_ptr_offset_of_batch.GetAPtrOffset(0);
    compute_ptr_offset_of_batch.GetBPtrOffset(0);
    compute_ptr_offset_of_batch.GetCPtrOffset(0);
-#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+#endif // end of if (defined(__gfx9__))
 }

 template <index_t NDimSpatial,

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -55,8 +55,7 @@ __global__ void
            [[maybe_unused]] const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
            [[maybe_unused]] const index_t num_k_per_block)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge);
    const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);

@@ -85,7 +84,7 @@ __global__ void
                                        k_idx);
 #else
    ignore = karg;
-#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+#endif // end of if (defined(__gfx9__))
 }

 template <typename GridwiseGemm,
@@ -111,8 +110,7 @@ __global__ void
            [[maybe_unused]] const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
            [[maybe_unused]] const index_t num_k_per_block)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    // offset base pointer for each work-group
    const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z * NumGroupsToMerge);
    const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
@@ -146,7 +144,7 @@ __global__ void
                                             k_idx);
 #else
    ignore = karg;
-#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+#endif // end of if (defined(__gfx9__))
 }

 template <ck::index_t NDimSpatial,
@@ -1559,14 +1557,23 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
            }
        }

-        if(!(arg.Conv_C_ % BBlockTransferSrcScalarPerVector == 0 &&
+        const bool is_w_pad_zero = arg.input_left_pads_[NDimSpatial - 1] == 0 &&
+                                   arg.input_right_pads_[NDimSpatial - 1] == 0;
+        const auto X                 = arg.filter_spatial_lengths_[NDimSpatial - 1];
+        const bool XC_access_allowed = arg.Conv_G_ == 1 &&
+                                       (arg.Conv_C_ * X) % BBlockTransferSrcScalarPerVector == 0 &&
+                                       is_w_pad_zero;
+
+        if(!((arg.Conv_C_ % BBlockTransferSrcScalarPerVector == 0 || XC_access_allowed) &&
             arg.Conv_K_ % ABlockTransferSrcScalarPerVector == 0))
        {
-            if(!(arg.Conv_K_ == 1 && arg.compute_ptr_offset_of_batch_.BatchStrideA_ == 1))
+            if(!(arg.Conv_K_ == 1 && arg.compute_ptr_offset_of_batch_.BatchStrideA_ == 1 &&
+                 NumGroupsToMerge > 1))
            {
                return false;
            }
-            if(!(arg.Conv_C_ == 1 && arg.compute_ptr_offset_of_batch_.BatchStrideB_ == 1))
+            if(!(arg.Conv_C_ == 1 && arg.compute_ptr_offset_of_batch_.BatchStrideB_ == 1 &&
+                 NumGroupsToMerge > 1))
            {
                return false;
            }

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -584,6 +584,10 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
        {
            return false;
        }
+        if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t>)
+        {
+            return false;
+        }
        if constexpr(NDimSpatial == 1)
        {
            if constexpr(!is_GNWC_GKXC_GNWK<InLayout, WeiLayout, OutLayout>())

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -9,6 +9,7 @@
 #include <numeric>
 #include <sstream>

+#include "ck/library/utility/numeric.hpp"
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
@@ -98,8 +99,7 @@ __global__ void
            const ComputePtrOffsetOfG compute_ptr_offset_of_groups,
            const ComputePtrOffsetOfN compute_ptr_offset_of_n)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))

    // offset base pointer for each work-group
    const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.y);
@@ -121,19 +121,6 @@ __global__ void
    static_for<0, NumDTensor, 1>{}(
        [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_group_offset[i]; });

-    if constexpr(is_same_v<AElementwiseOperation, element_wise::DynamicUnaryOp>)
-    {
-        a_element_op.InitUnaryOpPtrOnDevice();
-    }
-    if constexpr(is_same_v<BElementwiseOperation, element_wise::DynamicUnaryOp>)
-    {
-        b_element_op.InitUnaryOpPtrOnDevice();
-    }
-    if constexpr(is_same_v<CDEElementwiseOperation, element_wise::DynamicUnaryOp>)
-    {
-        cde_element_op.InitUnaryOpPtrOnDevice();
-    }
-
    if constexpr(isMultiA || isMultiB)
    {
        AsPointer p_as_grid_grp;
@@ -225,9 +212,13 @@ __global__ void
 }

 } // namespace
-
+#ifdef CK_CODE_GEN_RTC
+template <typename T>
+using is_tuple = decltype(ck::declval<T&>().IsTuple());
+#else
 template <typename T>
 using is_tuple = decltype(std::declval<T&>().IsTuple());
+#endif

 //
 // @brief      Device Convolution operation.

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
@@ -9,6 +9,7 @@
 #include <numeric>
 #include <sstream>

+#include "ck/library/utility/numeric.hpp"
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
@@ -117,7 +118,7 @@ __global__ void
                                        c_grid_desc_mblock_mperblock_nblock_nperblock);
 #else
    ignore = karg;
-#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+#endif // end of if (defined(__gfx9__))
 }

 template <typename GridwiseGemm,
@@ -183,7 +184,7 @@ __global__ void
                                             c_grid_desc_mblock_mperblock_nblock_nperblock);
 #else
    ignore = karg;
-#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+#endif // end of if (defined(__gfx9__))
 }

 } // namespace

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp
@@ -155,8 +155,7 @@ __global__ void
            const Block2ETileMap block_2_ctile_map,
            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp
@@ -52,8 +52,7 @@ __global__ void
            const ComputePtrOffset compute_ptr_offset_of_groups,
            const ComputePtrOffset compute_ptr_offset_of_n)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    const index_t block_id_x = __builtin_amdgcn_readfirstlane(blockIdx.x);

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_xdl_cshuffle_tile_loop.hpp
@@ -68,8 +68,7 @@ __global__ void
                                           const BElementwiseOperation b_element_op,
                                           const CDEElementwiseOperation cde_element_op)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))

    constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
    __shared__ uint8_t p_shared[shared_size];
@@ -404,7 +403,7 @@ __global__ void
    ignore = a_element_op;
    ignore = b_element_op;
    ignore = cde_element_op;
-#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+#endif // end of if (defined(__gfx9__))
 }

 template <typename ALayout,

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
@@ -43,8 +43,7 @@ __global__ void
            const B1ElementwiseOperation b1_element_op,
            const CElementwiseOperation c_element_op)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    const index_t block_id = get_block_1d_id();
@@ -109,7 +108,7 @@ __global__ void
    ignore = acc_element_op;
    ignore = b1_element_op;
    ignore = c_element_op;
-#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+#endif // end of if (defined(__gfx9__))
 }

 // Computes C = A * B0 * B1

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
@@ -38,8 +38,7 @@ __global__ void
                                const BElementwiseOperation b_element_op,
                                const CDEElementwiseOperation c_element_op)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    const index_t block_id = get_block_1d_id();

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_fixed_nk.hpp
@@ -50,8 +50,7 @@ __global__ void
                                         const BElementwiseOperation b_element_op,
                                         const CDEElementwiseOperation c_element_op)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    const index_t block_id = get_block_1d_id();

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
@@ -40,8 +40,7 @@ __global__ void
                                       const BElementwiseOperation b_element_op,
                                       const CElementwiseOperation c_element_op)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx94__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
    constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
    __shared__ uint8_t p_shared[shared_size];

@@ -80,7 +79,7 @@ __global__ void
    ignore = a_element_op;
    ignore = b_element_op;
    ignore = c_element_op;
-#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
+#endif // end of if (defined(__gfx9__))
 }

 template <typename ALayout,

--- a/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp
@@ -3,6 +3,7 @@

 #pragma once

+#include "ck/library/utility/numeric.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp"