Unverified commit 4b70d68e authored by Chao Liu, committed by GitHub

Merge branch 'develop' into add_fp16_wmma_conv_instance

parents 212b9299 f82bd593
@@ -27,17 +27,19 @@ struct DeviceGroupedConvBwdWeight : public BaseOperator
     MakeArgumentPointer(const void* p_in,
                         void* p_wei,
                         const void* p_out,
-                        ck::index_t G,
-                        ck::index_t N,
-                        ck::index_t K,
-                        ck::index_t C,
-                        std::array<ck::index_t, NDimSpatial> input_spatial_lengths,
-                        std::array<ck::index_t, NDimSpatial> filter_spatial_lengths,
-                        std::array<ck::index_t, NDimSpatial> output_spatial_lengths,
-                        std::array<ck::index_t, NDimSpatial> conv_filter_strides,
-                        std::array<ck::index_t, NDimSpatial> conv_filter_dilations,
-                        std::array<ck::index_t, NDimSpatial> input_left_pads,
-                        std::array<ck::index_t, NDimSpatial> input_right_pads,
+                        const ck::index_t G,
+                        const ck::index_t N,
+                        const ck::index_t K,
+                        const ck::index_t C,
+                        const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
+                        const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
+                        const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
+                        const std::array<ck::index_t, NDimSpatial + 3>& input_strides,
+                        const std::array<ck::index_t, NDimSpatial + 3>& output_strides,
+                        const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
+                        const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
+                        const std::array<ck::index_t, NDimSpatial>& input_left_pads,
+                        const std::array<ck::index_t, NDimSpatial>& input_right_pads,
                         InElementwiseOperation in_element_op,
                         WeiElementwiseOperation wei_element_op,
                         OutElementwiseOperation out_element_op,
...
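The interface change above adds two stride arrays of length NDimSpatial + 3 so that non-packed layouts such as NHWGC/NHWGK can be described. The NHWGC descriptor path later in this diff reads input_strides[1] as the N stride, [2] as C, and [3]/[4] as Hi/Wi, which suggests the element order is [G, N, C, spatial...]. A minimal caller-side sketch under that assumption (the helper name and the packed-NHWGC layout are illustrative, not part of this commit; assumes <array> and the CK index type are available):

#include <array>

// Packed NHWGC input strides for lengths G, N, Hi, Wi, C, returned in [G, N, C, Hi, Wi] order.
// Memory order is N, Hi, Wi, G, C with C innermost, so stride(C) = 1 and stride(G) = C.
std::array<ck::index_t, 5> packed_nhwgc_strides(ck::index_t G, ck::index_t Hi, ck::index_t Wi, ck::index_t C)
{
    return {C,               // G
            Hi * Wi * G * C, // N
            1,               // C (innermost)
            Wi * G * C,      // Hi
            G * C};          // Wi
}

The same shape of helper would produce output_strides for a packed NHWGK output with K innermost.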
@@ -10,12 +10,14 @@
 #include "ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp"
 #include "ck/tensor_operation/gpu/device/welford_helper.hpp"
+#include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_batchnorm_forward.hpp"
 #include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp"
-#include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp"
+#include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final_obsolete.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/hip_check_error.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -114,8 +116,8 @@ struct DeviceBatchNormFwdImpl : public DeviceBatchNormFwd<XDataType,
     static auto MakeMeanVarCountOutputMG2dDescriptor(int invariantLength, int blkGroupSize)
     {
-        const auto grid_desc_m_g =
-            make_naive_tensor_descriptor_packed(make_tuple(invariantLength, blkGroupSize));
+        const auto grid_desc_m_g = make_naive_tensor_descriptor(
+            make_tuple(invariantLength, blkGroupSize), make_tuple(1, invariantLength));

         const auto mPad =
             math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
@@ -132,9 +134,9 @@ struct DeviceBatchNormFwdImpl : public DeviceBatchNormFwd<XDataType,
     static auto MakeMeanVarCountInputMK2dDescriptor(int invariantLength, int blkGroupSize)
     {
         const auto reduceLength = blkGroupSize;
-        const auto grid_desc_m_k =
-            make_naive_tensor_descriptor_packed(make_tuple(invariantLength, reduceLength));
+        const auto grid_desc_m_k = make_naive_tensor_descriptor(
+            make_tuple(invariantLength, reduceLength), make_tuple(1, invariantLength));

         const auto mPad =
             math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
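Both workspace descriptors above switch from a packed (row-major) layout to explicit strides (1, invariantLength). A small sketch of the offset function those strides imply (plain illustration, not CK code):

// Element (m, g) of the invariantLength x blkGroupSize workspace lives at:
//     offset(m, g) = m * 1 + g * invariantLength
// i.e. the M dimension is contiguous and each of the blkGroupSize partial-result
// columns occupies its own contiguous block of invariantLength values.
inline long offset_m_g(long m, long g, long invariantLength) { return m + g * invariantLength; }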
@@ -244,8 +246,8 @@ struct DeviceBatchNormFwdImpl : public DeviceBatchNormFwd<XDataType,
                 int testBlkGroupSize = (reduce_length_ + (K_BlockTileSize * iterations) - 1) /
                                        (K_BlockTileSize * iterations);

-                // we want the blkGroupSize be not more than 128
-                if(testBlkGroupSize <= 128)
+                // we want blkGroupSize to be no more than 16
+                if(testBlkGroupSize <= 16)
                     break;

                 iterations++;
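The cap on the number of cooperating workgroups per reduction row drops from 128 to 16 here. A standalone sketch of the selection loop with hypothetical sizes (the reduce_length and K_BlockTileSize values below are made up, not taken from the diff):

#include <utility>

// Mirrors the loop above: grow the per-workgroup iteration count until at most
// 16 workgroups are needed to cover reduce_length.
std::pair<int, int> choose_blk_group_size(long reduce_length, int k_block_tile_size)
{
    int iterations = 1;
    auto groups = [&] {
        return (reduce_length + long(k_block_tile_size) * iterations - 1) /
               (long(k_block_tile_size) * iterations);
    };
    while(groups() > 16)
        ++iterations;
    return {static_cast<int>(groups()), iterations};
}

// e.g. choose_blk_group_size(4096, 64) == {16, 4}:
//   iterations 1..3 give 64, 32, 22 workgroups (> 16); iterations = 4 gives exactly 16.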
@@ -319,6 +321,8 @@ struct DeviceBatchNormFwdImpl : public DeviceBatchNormFwd<XDataType,
         void* workspace_mean_;
         void* workspace_variance_;
         void* workspace_count_;
+
+        void* control_;
     };

     size_t GetWorkSpaceSize(const BaseArgument* pArg) const override
@@ -340,6 +344,11 @@ struct DeviceBatchNormFwdImpl : public DeviceBatchNormFwd<XDataType,
             // workspace for welford intermediate count
             workspace_size +=
                 pArg_->invariant_length_ * pArg_->blkGroupSize_ * sizeof(int32_t) + 64;
+
+            // workspace for barrier objects, each barrier object consists of two integers
+            // TODO: allocate barrier object memory globally to reuse it by other operators
+            workspace_size += (pArg_->invariant_length_ + M_BlockTileSize - 1) / M_BlockTileSize *
+                              sizeof(int) * 2;
         }

         return (workspace_size);
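The extra term added above reserves two ints per M tile for the barrier objects. A small worked example (illustrative helper, hypothetical sizes):

// Extra bytes requested for the barrier objects (not CK code).
constexpr long barrier_workspace_bytes(long invariant_length, long m_block_tile_size)
{
    const long num_m_tiles = (invariant_length + m_block_tile_size - 1) / m_block_tile_size;
    return num_m_tiles * 2 * static_cast<long>(sizeof(int)); // two integers per barrier object
}
// e.g. invariant_length = 1024, M_BlockTileSize = 128  ->  8 tiles * 8 bytes = 64 bytes,
// appended after the mean/variance/count buffers accounted for above.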
@@ -353,7 +362,6 @@ struct DeviceBatchNormFwdImpl : public DeviceBatchNormFwd<XDataType,
         if(UseMultiblockInK && pArg_->blkGroupSize_ > 1)
         {
             // setup buffer used for intermediate welford mean
-
             pArg_->workspace_mean_ = static_cast<char*>(pArg_->p_workspace_);
...
@@ -374,6 +382,18 @@ struct DeviceBatchNormFwdImpl : public DeviceBatchNormFwd<XDataType,
             // setup buffer used for intermediate welford count
             pArg_->workspace_count_ =
                 reinterpret_cast<char*>(pArg_->workspace_variance_) + variance_space_sz;
+
+            index_t count_space_sz =
+                pArg_->invariant_length_ * pArg_->blkGroupSize_ * sizeof(int32_t);
+
+            count_space_sz = math::integer_least_multiple(count_space_sz, 64);
+
+            pArg_->control_ = reinterpret_cast<char*>(pArg_->workspace_count_) + count_space_sz;
+
+            index_t control_space_sz = (pArg_->invariant_length_ + M_BlockTileSize - 1) /
+                                       M_BlockTileSize * sizeof(int) * 2;
+
+            hip_check_error(hipMemset(pArg_->control_, 0, control_space_sz));
         };
     };
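For reference, the pointer arithmetic above partitions the single workspace allocation as follows (summarized as descriptive comments; the sizes are exactly the ones computed in the code):

// p_workspace_
// |- workspace_mean_      invariant_length_ * blkGroupSize_ * sizeof(MeanVarDataType), padded to 64 B
// |- workspace_variance_  invariant_length_ * blkGroupSize_ * sizeof(MeanVarDataType), padded to 64 B
// |- workspace_count_     invariant_length_ * blkGroupSize_ * sizeof(int32_t),         padded to 64 B
// '- control_             num_M_tiles * 2 * sizeof(int), zero-filled via hipMemset before launch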
@@ -402,6 +422,32 @@ struct DeviceBatchNormFwdImpl : public DeviceBatchNormFwd<XDataType,
                 using MeanVarCountGridDesc_M_G = decltype(mean_var_count_grid_desc_m_g);
                 using MeanVarCountGridDesc_M_K = decltype(mean_var_count_grid_desc_m_k);

+                using GridwiseMultiblockBatchNormForward_ =
+                    GridwiseMultiblockBatchNormForward<XDataType,
+                                                       YDataType,
+                                                       AccDataType,
+                                                       ScaleDataType,
+                                                       BiasDataType,
+                                                       MeanVarDataType,
+                                                       YElementwiseOp,
+                                                       XYGridDesc_M_K,
+                                                       MeanVarCountGridDesc_M_G,
+                                                       MeanVarCountGridDesc_M_K,
+                                                       ScaleBiasMeanVarGridDesc_M,
+                                                       ScaleBiasMeanVarGridDesc_M,
+                                                       GetReduceCountPerThreadFunctor,
+                                                       BlockSize,
+                                                       MThreadClusterSize,
+                                                       KThreadClusterSize,
+                                                       MThreadSliceSize,
+                                                       KThreadSliceSize,
+                                                       XSrcYDstVectorDim,
+                                                       XSrcVectorSize,
+                                                       YDstVectorSize,
+                                                       ScaleSrcVectorSize,
+                                                       BiasSrcVectorSize,
+                                                       MeanVarSrcDstVectorSize>;
+
                 using GridwiseMultiblockWelfordFirstHalf_ =
                     GridwiseMultiblockWelfordFirstHalf<XDataType,
                                                        AccDataType,
@@ -441,78 +487,136 @@ struct DeviceBatchNormFwdImpl : public DeviceBatchNormFwd<XDataType,
                                                        BiasSrcVectorSize,
                                                        MeanVarSrcDstVectorSize>;

-                index_t numMeanVarCountBlockTileIteration =
-                    (arg.blkGroupSize_ + KThreadClusterSize - 1) / KThreadClusterSize;
-
-                const auto kern_multiblock_welford_first_half =
-                    kernel_multiblock_welford_first_half<GridwiseMultiblockWelfordFirstHalf_,
-                                                         XDataType,
-                                                         MeanVarDataType,
-                                                         XYGridDesc_M_K,
-                                                         MeanVarCountGridDesc_M_G,
-                                                         GetReduceCountPerThreadFunctor>;
-
-                const auto kern_welford_second_half_batchnorm_forward_final =
-                    kernel_welford_second_half_batchnorm_forward_final<
-                        GridwiseWelfordSecondHalfBatchNormForwardFinal_,
-                        XDataType,
-                        YDataType,
-                        AccDataType,
-                        ScaleDataType,
-                        BiasDataType,
-                        MeanVarDataType,
-                        YElementwiseOp,
-                        XYGridDesc_M_K,
-                        MeanVarCountGridDesc_M_K,
-                        ScaleBiasMeanVarGridDesc_M,
-                        ScaleBiasMeanVarGridDesc_M>;
-
-                avg_time +=
-                    launch_and_time_kernel(stream_config,
-                                           kern_multiblock_welford_first_half,
-                                           dim3(arg.gridSize_),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.x_grid_desc_m_k_,
-                                           mean_var_count_grid_desc_m_g,
-                                           get_reduce_count_per_thread,
-                                           arg.numBlockTileIteration_,
-                                           arg.p_x_,
-                                           static_cast<MeanVarDataType*>(arg.workspace_mean_),
-                                           static_cast<MeanVarDataType*>(arg.workspace_variance_),
-                                           static_cast<int32_t*>(arg.workspace_count_));
-
-                avg_time +=
-                    launch_and_time_kernel(stream_config,
-                                           kern_welford_second_half_batchnorm_forward_final,
-                                           dim3(arg.gridSize_),
-                                           dim3(BlockSize),
-                                           0,
-                                           arg.x_grid_desc_m_k_,
-                                           arg.y_grid_desc_m_k_,
-                                           mean_var_count_grid_desc_m_k,
-                                           arg.scale_grid_desc_m_,
-                                           arg.bias_grid_desc_m_,
-                                           arg.mean_var_grid_desc_m_,
-                                           arg.blkGroupSize_,
-                                           arg.numBlockTileIteration_,
-                                           numMeanVarCountBlockTileIteration,
-                                           arg.epsilon_,
-                                           static_cast<MeanVarDataType*>(arg.workspace_mean_),
-                                           static_cast<MeanVarDataType*>(arg.workspace_variance_),
-                                           static_cast<int32_t*>(arg.workspace_count_),
-                                           arg.p_x_,
-                                           arg.p_scale_,
-                                           arg.p_bias_,
-                                           arg.y_elementwise_op_,
-                                           arg.p_y_,
-                                           arg.updateMovingAverage_,
-                                           arg.averageFactor_,
-                                           arg.resultRunningMean_,
-                                           arg.resultRunningVariance_,
-                                           arg.saveMeanInvVariance_,
-                                           arg.resultSaveMean_,
-                                           arg.resultSaveInvVariance_);
+                // It is found that:
+                // 1) gfx1030 does not support the GLC-enabled vector load/store, so the
+                //    two-kernel method is used for gfx1030
+                // 2) Profiler on gfx908 could hang even though it works when running examples
+                // 3) Single-kernel method works on gfx1100, but its performance is not better
+                //    than the two-kernel method (due to more warps participating in the barrier)
+                if(ck::get_device_name() == "gfx90a")
+                {
+                    const auto kern_multiblock_batchnorm_fwd_ =
+                        kernel_multiblock_batchnorm_forward<GridwiseMultiblockBatchNormForward_,
+                                                            XDataType,
+                                                            YDataType,
+                                                            AccDataType,
+                                                            ScaleDataType,
+                                                            BiasDataType,
+                                                            MeanVarDataType,
+                                                            YElementwiseOp,
+                                                            XYGridDesc_M_K,
+                                                            MeanVarCountGridDesc_M_G,
+                                                            MeanVarCountGridDesc_M_K,
+                                                            ScaleBiasMeanVarGridDesc_M,
+                                                            ScaleBiasMeanVarGridDesc_M,
+                                                            GetReduceCountPerThreadFunctor>;
+
+                    avg_time += launch_and_time_kernel(
+                        stream_config,
+                        kern_multiblock_batchnorm_fwd_,
+                        dim3(arg.gridSize_),
+                        dim3(BlockSize),
+                        0,
+                        arg.x_grid_desc_m_k_,
+                        arg.y_grid_desc_m_k_,
+                        mean_var_count_grid_desc_m_g, // for writing to mean/variance/count
+                                                      // workspace by multiple workgroups
+                        mean_var_count_grid_desc_m_k, // for reading from mean/variance/count
+                                                      // workspace by each workgroup
+                        arg.scale_grid_desc_m_,
+                        arg.bias_grid_desc_m_,
+                        arg.mean_var_grid_desc_m_,
+                        get_reduce_count_per_thread,
+                        arg.numBlockTileIteration_,
+                        arg.epsilon_,
+                        arg.p_x_,
+                        static_cast<MeanVarDataType*>(arg.workspace_mean_),
+                        static_cast<MeanVarDataType*>(arg.workspace_variance_),
+                        static_cast<int32_t*>(arg.workspace_count_),
+                        static_cast<int*>(arg.control_),
+                        arg.p_scale_,
+                        arg.p_bias_,
+                        arg.y_elementwise_op_,
+                        arg.p_y_,
+                        arg.updateMovingAverage_, // true or false
+                        arg.averageFactor_,
+                        arg.resultRunningMean_,
+                        arg.resultRunningVariance_,
+                        arg.saveMeanInvVariance_, // true or false
+                        arg.resultSaveMean_,
+                        arg.resultSaveInvVariance_);
+                }
+                else
+                {
+                    const auto kern_multiblock_welford_first_half =
+                        kernel_multiblock_welford_first_half<GridwiseMultiblockWelfordFirstHalf_,
+                                                             XDataType,
+                                                             MeanVarDataType,
+                                                             XYGridDesc_M_K,
+                                                             MeanVarCountGridDesc_M_G,
+                                                             GetReduceCountPerThreadFunctor>;
+
+                    const auto kern_welford_second_half_batchnorm_forward_final =
+                        kernel_welford_second_half_batchnorm_forward_final<
+                            GridwiseWelfordSecondHalfBatchNormForwardFinal_,
+                            XDataType,
+                            YDataType,
+                            AccDataType,
+                            ScaleDataType,
+                            BiasDataType,
+                            MeanVarDataType,
+                            YElementwiseOp,
+                            XYGridDesc_M_K,
+                            MeanVarCountGridDesc_M_K,
+                            ScaleBiasMeanVarGridDesc_M,
+                            ScaleBiasMeanVarGridDesc_M>;
+
+                    avg_time += launch_and_time_kernel(
+                        stream_config,
+                        kern_multiblock_welford_first_half,
+                        dim3(arg.gridSize_),
+                        dim3(BlockSize),
+                        0,
+                        arg.x_grid_desc_m_k_,
+                        mean_var_count_grid_desc_m_g,
+                        get_reduce_count_per_thread,
+                        arg.numBlockTileIteration_,
+                        arg.p_x_,
+                        static_cast<MeanVarDataType*>(arg.workspace_mean_),
+                        static_cast<MeanVarDataType*>(arg.workspace_variance_),
+                        static_cast<int32_t*>(arg.workspace_count_));
+
+                    avg_time += launch_and_time_kernel(
+                        stream_config,
+                        kern_welford_second_half_batchnorm_forward_final,
+                        dim3(arg.gridSize_),
+                        dim3(BlockSize),
+                        0,
+                        arg.x_grid_desc_m_k_,
+                        arg.y_grid_desc_m_k_,
+                        mean_var_count_grid_desc_m_k,
+                        arg.scale_grid_desc_m_,
+                        arg.bias_grid_desc_m_,
+                        arg.mean_var_grid_desc_m_,
+                        arg.blkGroupSize_,
+                        arg.numBlockTileIteration_,
+                        arg.epsilon_,
+                        static_cast<MeanVarDataType*>(arg.workspace_mean_),
+                        static_cast<MeanVarDataType*>(arg.workspace_variance_),
+                        static_cast<int32_t*>(arg.workspace_count_),
+                        arg.p_x_,
+                        arg.p_scale_,
+                        arg.p_bias_,
+                        arg.y_elementwise_op_,
+                        arg.p_y_,
+                        arg.updateMovingAverage_,
+                        arg.averageFactor_,
+                        arg.resultRunningMean_,
+                        arg.resultRunningVariance_,
+                        arg.saveMeanInvVariance_,
+                        arg.resultSaveMean_,
+                        arg.resultSaveInvVariance_);
+                };
             }
             else
             {
...
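The single-kernel path hands static_cast<int*>(arg.control_) to the gridwise kernel, and GetWorkSpaceSize() reserves one two-integer barrier object per M tile. The gridwise kernel itself is not part of this diff, so the following HIP-style sketch is only an assumption of how such a two-integer object can back a barrier across the workgroups of one M tile; names and logic are illustrative, not CK's implementation:

// Illustrative only.
//   barrier[0] : arrival count for the current round, barrier[1] : round number
// Both start at zero because SetWorkSpacePointer() above hipMemset()s the control buffer.
__device__ void workgroup_barrier(int* barrier, int num_workgroups)
{
    __syncthreads();  // finish this workgroup's work before signalling arrival
    __threadfence();  // make its global-memory results visible to other workgroups

    if(threadIdx.x == 0)
    {
        const int my_round = atomicAdd(&barrier[1], 0); // atomic read of the round number

        if(atomicAdd(&barrier[0], 1) == num_workgroups - 1)
        {
            atomicExch(&barrier[0], 0); // last arrival: reset the count for the next round ...
            __threadfence();
            atomicAdd(&barrier[1], 1);  // ... then release everyone by advancing the round
        }
        else
        {
            while(atomicAdd(&barrier[1], 0) == my_round) {} // spin until the round advances
        }
    }
    __syncthreads();  // remaining threads wait for thread 0
}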
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp"
#include "ck/tensor_operation/gpu/device/welford_helper.hpp"
#include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp"
#include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final_obsolete.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename XDataType,
typename YDataType,
typename AccDataType,
typename ScaleDataType,
typename BiasDataType,
typename MeanVarDataType,
typename YElementwiseOp,
index_t Rank,
index_t NumBatchNormReduceDim,
bool UseMultiblockInK,
index_t BlockSize,
index_t MThreadClusterSize,
index_t KThreadClusterSize,
index_t MThreadSliceSize,
index_t KThreadSliceSize,
index_t XSrcYDstVectorDim,
index_t XSrcVectorSize,
index_t YDstVectorSize,
index_t ScaleSrcVectorSize,
index_t BiasSrcVectorSize,
index_t MeanVarSrcDstVectorSize>
struct DeviceBatchNormFwdImpl : public DeviceBatchNormFwd<XDataType,
YDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
YElementwiseOp,
Rank,
NumBatchNormReduceDim>
{
static_assert(Rank <= 6, "Bigger Rank size is not supported!");
static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
"Invalid thread cluster size assignments!");
static_assert((XSrcYDstVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0) ||
(XSrcYDstVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0),
"Invalid thread slice sizes and/or vector sizes configuration, please check!");
static constexpr index_t NumInvariantDim = Rank - NumBatchNormReduceDim;
static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
static auto MakeXY2dDescriptor(const std::array<index_t, Rank>& xyLengths,
const std::array<index_t, Rank>& xyStrides,
int blkGroupSize,
int numBlockTileIteration)
{
const auto tupleXYLengths =
generate_tuple([&](auto I) { return xyLengths[I]; }, Number<Rank>{});
const auto tupleXYStrides =
generate_tuple([&](auto I) { return xyStrides[I]; }, Number<Rank>{});
const auto raw_grid_desc = make_naive_tensor_descriptor(tupleXYLengths, tupleXYStrides);
const auto grid_desc_m_k = [&]() {
using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;
const auto reduceDimLengths =
generate_tuple([&](auto I) { return xyLengths[NumInvariantDim + I]; },
Number<NumBatchNormReduceDim>{});
const auto invariantDimLengths =
generate_tuple([&](auto I) { return xyLengths[I]; }, Number<NumInvariantDim>{});
return transform_tensor_descriptor(raw_grid_desc,
make_tuple(make_merge_transform(invariantDimLengths),
make_merge_transform(reduceDimLengths)),
make_tuple(InvariantDims{}, ReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
}();
const auto invariantLength = grid_desc_m_k.GetLength(Number<0>{});
const auto reduceLength = grid_desc_m_k.GetLength(Number<1>{});
const int workSizePerBlock = K_BlockTileSize * numBlockTileIteration;
const auto mPad =
math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
const auto kPad = workSizePerBlock * blkGroupSize - reduceLength;
auto grid_desc_m_k_padded =
transform_tensor_descriptor(grid_desc_m_k,
make_tuple(make_right_pad_transform(invariantLength, mPad),
make_right_pad_transform(reduceLength, kPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return (grid_desc_m_k_padded);
};
static auto MakeMeanVarCountOutputMG2dDescriptor(int invariantLength, int blkGroupSize)
{
const auto grid_desc_m_g = make_naive_tensor_descriptor(
make_tuple(invariantLength, blkGroupSize), make_tuple(1, invariantLength));
const auto mPad =
math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
auto grid_desc_m_g_padded =
transform_tensor_descriptor(grid_desc_m_g,
make_tuple(make_right_pad_transform(invariantLength, mPad),
make_pass_through_transform(blkGroupSize)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return (grid_desc_m_g_padded);
};
static auto MakeMeanVarCountInputMK2dDescriptor(int invariantLength, int blkGroupSize)
{
const auto reduceLength = blkGroupSize;
const auto grid_desc_m_k = make_naive_tensor_descriptor(
make_tuple(invariantLength, reduceLength), make_tuple(1, invariantLength));
const auto mPad =
math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
const auto kPad =
math::integer_least_multiple(reduceLength, KThreadClusterSize) - reduceLength;
auto grid_desc_m_k_padded =
transform_tensor_descriptor(grid_desc_m_k,
make_tuple(make_right_pad_transform(invariantLength, mPad),
make_right_pad_transform(reduceLength, kPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return (grid_desc_m_k_padded);
};
static auto
MakeScaleBiasMeanVar1dDescriptor(const std::array<index_t, NumInvariantDim>& lengths,
const std::array<index_t, NumInvariantDim>& strides)
{
const auto tupleLengths =
generate_tuple([&](auto I) { return lengths[I]; }, Number<NumInvariantDim>{});
const auto tupleStrides =
generate_tuple([&](auto I) { return strides[I]; }, Number<NumInvariantDim>{});
auto raw_grid_desc = make_naive_tensor_descriptor(tupleLengths, tupleStrides);
auto grid_desc_m = transform_tensor_descriptor(
raw_grid_desc,
make_tuple(make_merge_transform(tupleLengths)),
make_tuple(typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type{}),
make_tuple(Sequence<0>{}));
const auto invariantLength = grid_desc_m.GetLength(Number<0>{});
const auto mPad =
math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
auto grid_desc_m_padded =
transform_tensor_descriptor(grid_desc_m,
make_tuple(make_right_pad_transform(invariantLength, mPad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
return (grid_desc_m_padded);
};
using XYGridDesc_M_K = decltype(MakeXY2dDescriptor({1}, {1}, 1, 1));
using ScaleBiasMeanVarGridDesc_M = decltype(MakeScaleBiasMeanVar1dDescriptor({1}, {1}));
struct Argument : public BaseArgument
{
Argument(const std::array<index_t, Rank> xyLengths,
const std::array<index_t, Rank> xStrides,
const std::array<index_t, Rank> yStrides,
const std::array<int, NumBatchNormReduceDim> reduceDims,
const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths,
const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleStrides,
const std::array<index_t, Rank - NumBatchNormReduceDim> bnBiasStrides,
const std::array<index_t, Rank - NumBatchNormReduceDim> bnMeanVarStrides,
const XDataType* p_x,
const ScaleDataType* p_scale,
const BiasDataType* p_bias,
const YElementwiseOp y_elementwise_op,
double epsilon,
YDataType* p_y,
MeanVarDataType* resultSaveMean,
MeanVarDataType* resultSaveInvVariance,
double averageFactor,
MeanVarDataType* resultRunningMean,
MeanVarDataType* resultRunningVariance)
: bnScaleBiasMeanVarLengths_(bnScaleBiasMeanVarLengths),
bnScaleStrides_(bnScaleStrides),
bnBiasStrides_(bnBiasStrides),
bnMeanVarStrides_(bnMeanVarStrides),
p_x_(p_x),
p_scale_(p_scale),
p_bias_(p_bias),
y_elementwise_op_(y_elementwise_op),
p_y_(p_y),
resultSaveMean_(resultSaveMean),
resultSaveInvVariance_(resultSaveInvVariance),
resultRunningMean_(resultRunningMean),
resultRunningVariance_(resultRunningVariance)
{
xyLengths_ =
shuffle_tensor_dimensions<Rank, NumBatchNormReduceDim>(xyLengths, reduceDims);
xStrides_ =
shuffle_tensor_dimensions<Rank, NumBatchNormReduceDim>(xStrides, reduceDims);
yStrides_ =
shuffle_tensor_dimensions<Rank, NumBatchNormReduceDim>(yStrides, reduceDims);
std::tie(invariant_length_, reduce_length_) =
get_2d_lengths<Rank, NumBatchNormReduceDim>(xyLengths_);
epsilon_ = type_convert<AccDataType>(epsilon);
averageFactor_ = type_convert<AccDataType>(averageFactor);
updateMovingAverage_ =
(resultRunningMean != nullptr && resultRunningVariance != nullptr);
saveMeanInvVariance_ = (resultSaveMean != nullptr && resultSaveInvVariance_ != nullptr);
if(UseMultiblockInK)
{
int iterations = 1;
while(true)
{
int testBlkGroupSize = (reduce_length_ + (K_BlockTileSize * iterations) - 1) /
(K_BlockTileSize * iterations);
// we want blkGroupSize to be no more than 16
if(testBlkGroupSize <= 16)
break;
iterations++;
};
blkGroupSize_ = (reduce_length_ + (K_BlockTileSize * iterations) - 1) /
(K_BlockTileSize * iterations);
numBlockTileIteration_ = iterations;
}
else
{
blkGroupSize_ = 1;
numBlockTileIteration_ = (reduce_length_ + K_BlockTileSize - 1) / K_BlockTileSize;
};
gridSize_ = (invariant_length_ + M_BlockTileSize - 1) / M_BlockTileSize * blkGroupSize_;
x_grid_desc_m_k_ =
MakeXY2dDescriptor(xyLengths_, xStrides_, blkGroupSize_, numBlockTileIteration_);
y_grid_desc_m_k_ =
MakeXY2dDescriptor(xyLengths_, yStrides_, blkGroupSize_, numBlockTileIteration_);
scale_grid_desc_m_ =
MakeScaleBiasMeanVar1dDescriptor(bnScaleBiasMeanVarLengths, bnScaleStrides_);
bias_grid_desc_m_ =
MakeScaleBiasMeanVar1dDescriptor(bnScaleBiasMeanVarLengths, bnBiasStrides_);
mean_var_grid_desc_m_ =
MakeScaleBiasMeanVar1dDescriptor(bnScaleBiasMeanVarLengths, bnMeanVarStrides_);
}
AccDataType epsilon_;
AccDataType averageFactor_;
bool updateMovingAverage_;
bool saveMeanInvVariance_;
std::array<index_t, Rank> xyLengths_;
std::array<index_t, Rank> xStrides_;
std::array<index_t, Rank> yStrides_;
std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths_;
std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleStrides_;
std::array<index_t, Rank - NumBatchNormReduceDim> bnBiasStrides_;
std::array<index_t, Rank - NumBatchNormReduceDim> bnMeanVarStrides_;
const XDataType* p_x_;
const ScaleDataType* p_scale_;
const BiasDataType* p_bias_;
const YElementwiseOp y_elementwise_op_;
YDataType* p_y_;
MeanVarDataType* resultSaveMean_;
MeanVarDataType* resultSaveInvVariance_;
MeanVarDataType* resultRunningMean_;
MeanVarDataType* resultRunningVariance_;
long_index_t invariant_length_;
long_index_t reduce_length_;
int blkGroupSize_;
int numBlockTileIteration_;
size_t gridSize_;
XYGridDesc_M_K x_grid_desc_m_k_;
XYGridDesc_M_K y_grid_desc_m_k_;
ScaleBiasMeanVarGridDesc_M scale_grid_desc_m_;
ScaleBiasMeanVarGridDesc_M bias_grid_desc_m_;
ScaleBiasMeanVarGridDesc_M mean_var_grid_desc_m_;
void* workspace_mean_;
void* workspace_variance_;
void* workspace_count_;
};
size_t GetWorkSpaceSize(const BaseArgument* pArg) const override
{
const Argument* pArg_ = dynamic_cast<const Argument*>(pArg);
size_t workspace_size = 0;
if(UseMultiblockInK && pArg_->blkGroupSize_ > 1)
{
// workspace for welford intermediate mean
workspace_size +=
pArg_->invariant_length_ * pArg_->blkGroupSize_ * sizeof(MeanVarDataType) + 64;
// workspace for welford intermediate variance
workspace_size +=
pArg_->invariant_length_ * pArg_->blkGroupSize_ * sizeof(MeanVarDataType) + 64;
// workspace for welford intermediate count
workspace_size +=
pArg_->invariant_length_ * pArg_->blkGroupSize_ * sizeof(int32_t) + 64;
}
return (workspace_size);
};
void SetWorkSpacePointer(BaseArgument* pArg, void* p_workspace) const override
{
Argument* pArg_ = dynamic_cast<Argument*>(pArg);
pArg_->p_workspace_ = p_workspace;
if(UseMultiblockInK && pArg_->blkGroupSize_ > 1)
{
// setup buffer used for intermediate welford mean
pArg_->workspace_mean_ = static_cast<char*>(pArg_->p_workspace_);
index_t mean_space_sz =
pArg_->invariant_length_ * pArg_->blkGroupSize_ * sizeof(MeanVarDataType);
mean_space_sz = math::integer_least_multiple(mean_space_sz, 64);
// setup buffer used for intermediate welford variance
pArg_->workspace_variance_ =
reinterpret_cast<char*>(pArg_->workspace_mean_) + mean_space_sz;
index_t variance_space_sz =
pArg_->invariant_length_ * pArg_->blkGroupSize_ * sizeof(MeanVarDataType);
variance_space_sz = math::integer_least_multiple(variance_space_sz, 64);
// setup buffer used for intermediate welford count
pArg_->workspace_count_ =
reinterpret_cast<char*>(pArg_->workspace_variance_) + variance_space_sz;
};
};
struct Invoker : public BaseInvoker
{
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
float avg_time = 0;
if(UseMultiblockInK && arg.blkGroupSize_ > 1)
{
using GetReduceCountPerThreadFunctor =
GetReduceCountPerThreadForMultiblockWelford<K_BlockTileSize, KThreadSliceSize>;
GetReduceCountPerThreadFunctor get_reduce_count_per_thread(
arg.blkGroupSize_, arg.numBlockTileIteration_, arg.reduce_length_);
const auto mean_var_count_grid_desc_m_g =
DeviceBatchNormFwdImpl::MakeMeanVarCountOutputMG2dDescriptor(
arg.invariant_length_, arg.blkGroupSize_);
const auto mean_var_count_grid_desc_m_k =
DeviceBatchNormFwdImpl::MakeMeanVarCountInputMK2dDescriptor(
arg.invariant_length_, arg.blkGroupSize_);
using MeanVarCountGridDesc_M_G = decltype(mean_var_count_grid_desc_m_g);
using MeanVarCountGridDesc_M_K = decltype(mean_var_count_grid_desc_m_k);
using GridwiseMultiblockWelfordFirstHalf_ =
GridwiseMultiblockWelfordFirstHalf<XDataType,
AccDataType,
MeanVarDataType,
XYGridDesc_M_K,
MeanVarCountGridDesc_M_G,
GetReduceCountPerThreadFunctor,
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
MThreadSliceSize,
KThreadSliceSize,
XSrcYDstVectorDim,
XSrcVectorSize>;
using GridwiseWelfordSecondHalfBatchNormForwardFinal_ =
GridwiseWelfordSecondHalfBatchNormForwardFinal<XDataType,
YDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
YElementwiseOp,
XYGridDesc_M_K,
MeanVarCountGridDesc_M_K,
ScaleBiasMeanVarGridDesc_M,
ScaleBiasMeanVarGridDesc_M,
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
MThreadSliceSize,
KThreadSliceSize,
XSrcYDstVectorDim,
XSrcVectorSize,
YDstVectorSize,
ScaleSrcVectorSize,
BiasSrcVectorSize,
MeanVarSrcDstVectorSize>;
const auto kern_multiblock_welford_first_half =
kernel_multiblock_welford_first_half<GridwiseMultiblockWelfordFirstHalf_,
XDataType,
MeanVarDataType,
XYGridDesc_M_K,
MeanVarCountGridDesc_M_G,
GetReduceCountPerThreadFunctor>;
const auto kern_welford_second_half_batchnorm_forward_final =
kernel_welford_second_half_batchnorm_forward_final<
GridwiseWelfordSecondHalfBatchNormForwardFinal_,
XDataType,
YDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
YElementwiseOp,
XYGridDesc_M_K,
MeanVarCountGridDesc_M_K,
ScaleBiasMeanVarGridDesc_M,
ScaleBiasMeanVarGridDesc_M>;
avg_time +=
launch_and_time_kernel(stream_config,
kern_multiblock_welford_first_half,
dim3(arg.gridSize_),
dim3(BlockSize),
0,
arg.x_grid_desc_m_k_,
mean_var_count_grid_desc_m_g,
get_reduce_count_per_thread,
arg.numBlockTileIteration_,
arg.p_x_,
static_cast<MeanVarDataType*>(arg.workspace_mean_),
static_cast<MeanVarDataType*>(arg.workspace_variance_),
static_cast<int32_t*>(arg.workspace_count_));
avg_time +=
launch_and_time_kernel(stream_config,
kern_welford_second_half_batchnorm_forward_final,
dim3(arg.gridSize_),
dim3(BlockSize),
0,
arg.x_grid_desc_m_k_,
arg.y_grid_desc_m_k_,
mean_var_count_grid_desc_m_k,
arg.scale_grid_desc_m_,
arg.bias_grid_desc_m_,
arg.mean_var_grid_desc_m_,
arg.blkGroupSize_,
arg.numBlockTileIteration_,
arg.epsilon_,
static_cast<MeanVarDataType*>(arg.workspace_mean_),
static_cast<MeanVarDataType*>(arg.workspace_variance_),
static_cast<int32_t*>(arg.workspace_count_),
arg.p_x_,
arg.p_scale_,
arg.p_bias_,
arg.y_elementwise_op_,
arg.p_y_,
arg.updateMovingAverage_,
arg.averageFactor_,
arg.resultRunningMean_,
arg.resultRunningVariance_,
arg.saveMeanInvVariance_,
arg.resultSaveMean_,
arg.resultSaveInvVariance_);
}
else
{
using GetReduceCountPerThreadFunctor =
GetReduceCountPerThreadForBlockwiseWelford<K_BlockTileSize, KThreadSliceSize>;
GetReduceCountPerThreadFunctor get_reduce_count_per_thread(
arg.numBlockTileIteration_, arg.reduce_length_);
using GridwiseBatchNormForwardWithBlockwiseWelford_ =
GridwiseBatchNormForwardWithBlockwiseWelford<XDataType,
YDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
YElementwiseOp,
XYGridDesc_M_K,
ScaleBiasMeanVarGridDesc_M,
ScaleBiasMeanVarGridDesc_M,
GetReduceCountPerThreadFunctor,
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
MThreadSliceSize,
KThreadSliceSize,
XSrcYDstVectorDim,
XSrcVectorSize,
YDstVectorSize,
ScaleSrcVectorSize,
BiasSrcVectorSize,
MeanVarSrcDstVectorSize>;
const auto kern_batchnorm_fwd = kernel_batchnorm_forward_with_blockwise_welford<
GridwiseBatchNormForwardWithBlockwiseWelford_,
XDataType,
YDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
YElementwiseOp,
XYGridDesc_M_K,
ScaleBiasMeanVarGridDesc_M,
ScaleBiasMeanVarGridDesc_M,
GetReduceCountPerThreadFunctor>;
avg_time += launch_and_time_kernel(stream_config,
kern_batchnorm_fwd,
dim3(arg.gridSize_),
dim3(BlockSize),
0,
arg.x_grid_desc_m_k_,
arg.y_grid_desc_m_k_,
arg.scale_grid_desc_m_,
arg.bias_grid_desc_m_,
arg.mean_var_grid_desc_m_,
get_reduce_count_per_thread,
arg.numBlockTileIteration_,
arg.epsilon_,
arg.p_x_,
arg.p_scale_,
arg.p_bias_,
arg.y_elementwise_op_,
arg.p_y_,
arg.updateMovingAverage_, // true or false
arg.averageFactor_,
arg.resultRunningMean_,
arg.resultRunningVariance_,
arg.saveMeanInvVariance_, // true or false
arg.resultSaveMean_,
arg.resultSaveInvVariance_);
};
return (avg_time);
};
float Run(const BaseArgument* pArg,
const StreamConfig& stream_config = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(pArg), stream_config);
};
};
bool IsSupportedArgument(const BaseArgument* pArg) override
{
const Argument* pArg_ = dynamic_cast<const Argument*>(pArg);
if constexpr(XSrcYDstVectorDim == 0)
{
if(pArg_->xStrides_[NumInvariantDim - 1] != 1 ||
pArg_->yStrides_[NumInvariantDim - 1] != 1)
return false;
if(pArg_->xyLengths_[NumInvariantDim - 1] % XSrcVectorSize != 0 ||
pArg_->xyLengths_[NumInvariantDim - 1] % YDstVectorSize != 0)
return false;
}
else
{
if(pArg_->xStrides_[Rank - 1] != 1 || pArg_->yStrides_[Rank - 1] != 1)
return false;
if(pArg_->xyLengths_[Rank - 1] % XSrcVectorSize != 0 ||
pArg_->xyLengths_[Rank - 1] % YDstVectorSize != 0)
return false;
};
if(pArg_->bnScaleStrides_[NumInvariantDim - 1] != 1 && ScaleSrcVectorSize != 1)
return false;
if(pArg_->bnBiasStrides_[NumInvariantDim - 1] != 1 && BiasSrcVectorSize != 1)
return false;
if(pArg_->bnScaleBiasMeanVarLengths_[NumInvariantDim - 1] % ScaleSrcVectorSize != 0)
return false;
if(pArg_->bnScaleBiasMeanVarLengths_[NumInvariantDim - 1] % BiasSrcVectorSize != 0)
return false;
if(pArg_->bnMeanVarStrides_[NumInvariantDim - 1] != 1 && MeanVarSrcDstVectorSize != 1)
return false;
if(pArg_->bnScaleBiasMeanVarLengths_[NumInvariantDim - 1] % MeanVarSrcDstVectorSize != 0)
return false;
bool is_valid = true;
static_for<0, NumInvariantDim, 1>{}([&](auto I) {
if(pArg_->xyLengths_[I] != pArg_->bnScaleBiasMeanVarLengths_[I])
is_valid = false;
});
if(!is_valid)
return false;
return true;
};
std::unique_ptr<BaseArgument> MakeArgumentPointer(
const std::array<index_t, Rank> xyLengths,
const std::array<index_t, Rank> xStrides,
const std::array<index_t, Rank> yStrides,
const std::array<int, NumBatchNormReduceDim> reduceDims,
const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths,
const std::array<index_t, Rank - NumBatchNormReduceDim> bnScaleStrides,
const std::array<index_t, Rank - NumBatchNormReduceDim> bnBiasStrides,
const std::array<index_t, Rank - NumBatchNormReduceDim> bnMeanVarStrides,
const void* p_x,
const void* p_scale,
const void* p_bias,
double epsilon,
const YElementwiseOp y_elementwise_op,
void* p_y,
void* resultSaveMean,
void* resultSaveInvVariance,
double averageFactor,
void* resultRunningMean,
void* resultRunningVariance) override
{
return std::make_unique<Argument>(xyLengths,
xStrides,
yStrides,
reduceDims,
bnScaleBiasMeanVarLengths,
bnScaleStrides,
bnBiasStrides,
bnMeanVarStrides,
static_cast<const XDataType*>(p_x),
static_cast<const ScaleDataType*>(p_scale),
static_cast<const BiasDataType*>(p_bias),
y_elementwise_op,
epsilon,
static_cast<YDataType*>(p_y),
static_cast<MeanVarDataType*>(resultSaveMean),
static_cast<MeanVarDataType*>(resultSaveInvVariance),
averageFactor,
static_cast<MeanVarDataType*>(resultRunningMean),
static_cast<MeanVarDataType*>(resultRunningVariance));
};
std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>();
};
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "DeviceBatchNormFwdImpl<" << BlockSize << ",";
str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
str << "XSrcYDstVectorDim_" << XSrcYDstVectorDim << ",";
str << "VectorSize_X" << XSrcVectorSize << "_scale_" << ScaleSrcVectorSize << "_bias_" << BiasSrcVectorSize << "_mean_var_" << MeanVarSrcDstVectorSize << "_Y" << YDstVectorSize << ">";
// clang-format on
return str.str();
}
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
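Since the file above is self-contained, a host-side usage sketch may help. Everything below (data types, shapes, tuning parameters, helper name) is a hypothetical example and not taken from a CK instance file; it assumes the implementation header shown above plus the usual HIP runtime header are included.

#include <array>
#include <hip/hip_runtime.h>
// ... plus the DeviceBatchNormFwdImpl header listed above.

using F16  = ck::half_t;
using F32  = float;
using Pass = ck::tensor_operation::element_wise::PassThrough;

// Rank-4 NHWC input, reducing N/H/W (dims 0, 1, 2), multi-block reduction enabled.
// Tuning parameters are illustrative: BlockSize 256 = 128 x 2 clusters, M/K slice 4/8,
// vectorization along the invariant (C) dimension.
using DeviceOp = ck::tensor_operation::device::DeviceBatchNormFwdImpl<
    F16, F16, F32, F16, F16, F32, Pass,
    4, 3, true,
    256, 128, 2, 4, 8,
    0, 4, 4, 4, 4, 4>;

void run_batchnorm_fwd(const F16* p_x, const F16* p_scale, const F16* p_bias, F16* p_y,
                       F32* p_save_mean, F32* p_save_inv_var)
{
    DeviceOp op;

    auto arg = op.MakeArgumentPointer({16, 28, 28, 64},               // xyLengths (NHWC)
                                      {28 * 28 * 64, 28 * 64, 64, 1}, // xStrides (packed)
                                      {28 * 28 * 64, 28 * 64, 64, 1}, // yStrides (packed)
                                      {0, 1, 2},                      // reduceDims
                                      {64}, {1}, {1}, {1},            // scale/bias/mean-var lengths and strides
                                      p_x, p_scale, p_bias,
                                      1e-5,                           // epsilon
                                      Pass{}, p_y,
                                      p_save_mean, p_save_inv_var,
                                      0.1,                            // averageFactor
                                      nullptr, nullptr);              // no running mean/variance

    void* p_workspace = nullptr;
    hip_check_error(hipMalloc(&p_workspace, op.GetWorkSpaceSize(arg.get())));
    op.SetWorkSpacePointer(arg.get(), p_workspace);

    op.MakeInvokerPointer()->Run(arg.get(), StreamConfig{});

    hip_check_error(hipFree(p_workspace));
}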
@@ -195,17 +195,17 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
     template <ck::index_t NDim, typename ck::enable_if<NDim == 1, bool>::type = false>
     static auto MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(
-        ck::index_t N,
-        ck::index_t K,
-        ck::index_t C,
-        std::array<ck::index_t, NDimSpatial> input_spatial_lengths,
-        std::array<ck::index_t, NDimSpatial> filter_spatial_lengths,
-        std::array<ck::index_t, NDimSpatial> output_spatial_lengths,
-        std::array<ck::index_t, NDimSpatial> conv_filter_strides,
-        std::array<ck::index_t, NDimSpatial> conv_filter_dilations,
-        std::array<ck::index_t, NDimSpatial> input_left_pads,
-        std::array<ck::index_t, NDimSpatial> input_right_pads,
-        ck::index_t batch_k)
+        const ck::index_t N,
+        const ck::index_t K,
+        const ck::index_t C,
+        const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
+        const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
+        const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
+        const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
+        const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
+        const std::array<ck::index_t, NDimSpatial>& input_left_pads,
+        const std::array<ck::index_t, NDimSpatial>& input_right_pads,
+        const ck::index_t batch_k)
     {
         using namespace ck;
@@ -347,17 +347,17 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
     } // function end

     template <ck::index_t NDim, typename ck::enable_if<NDim == 2, bool>::type = false>
     static auto MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(
-        ck::index_t N,
-        ck::index_t K,
-        ck::index_t C,
-        std::array<ck::index_t, NDimSpatial> input_spatial_lengths,
-        std::array<ck::index_t, NDimSpatial> filter_spatial_lengths,
-        std::array<ck::index_t, NDimSpatial> output_spatial_lengths,
-        std::array<ck::index_t, NDimSpatial> conv_filter_strides,
-        std::array<ck::index_t, NDimSpatial> conv_filter_dilations,
-        std::array<ck::index_t, NDimSpatial> input_left_pads,
-        std::array<ck::index_t, NDimSpatial> input_right_pads,
-        ck::index_t batch_k)
+        const ck::index_t N,
+        const ck::index_t K,
+        const ck::index_t C,
+        const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
+        const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
+        const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
+        const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
+        const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
+        const std::array<ck::index_t, NDimSpatial>& input_left_pads,
+        const std::array<ck::index_t, NDimSpatial>& input_right_pads,
+        const ck::index_t batch_k)
     {
         using namespace ck;
@@ -515,17 +515,17 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
     template <ck::index_t NDim, typename ck::enable_if<NDim == 3, bool>::type = false>
     static auto MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(
-        ck::index_t N,
-        ck::index_t K,
-        ck::index_t C,
-        std::array<ck::index_t, NDimSpatial> input_spatial_lengths,
-        std::array<ck::index_t, NDimSpatial> filter_spatial_lengths,
-        std::array<ck::index_t, NDimSpatial> output_spatial_lengths,
-        std::array<ck::index_t, NDimSpatial> conv_filter_strides,
-        std::array<ck::index_t, NDimSpatial> conv_filter_dilations,
-        std::array<ck::index_t, NDimSpatial> input_left_pads,
-        std::array<ck::index_t, NDimSpatial> input_right_pads,
-        ck::index_t batch_k)
+        const ck::index_t N,
+        const ck::index_t K,
+        const ck::index_t C,
+        const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
+        const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
+        const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
+        const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
+        const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
+        const std::array<ck::index_t, NDimSpatial>& input_left_pads,
+        const std::array<ck::index_t, NDimSpatial>& input_right_pads,
+        const ck::index_t batch_k)
     {
         using namespace ck;
@@ -784,17 +784,19 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
         Argument(const InDataType* p_in_grid,
                  WeiDataType* p_wei_grid,
                  const OutDataType* p_out_grid,
-                 ck::index_t G,
-                 ck::index_t N,
-                 ck::index_t K,
-                 ck::index_t C,
-                 std::array<ck::index_t, NDimSpatial> input_spatial_lengths,
-                 std::array<ck::index_t, NDimSpatial> filter_spatial_lengths,
-                 std::array<ck::index_t, NDimSpatial> output_spatial_lengths,
-                 std::array<ck::index_t, NDimSpatial> conv_filter_strides,
-                 std::array<ck::index_t, NDimSpatial> conv_filter_dilations,
-                 std::array<ck::index_t, NDimSpatial> input_left_pads,
-                 std::array<ck::index_t, NDimSpatial> input_right_pads,
+                 const ck::index_t G,
+                 const ck::index_t N,
+                 const ck::index_t K,
+                 const ck::index_t C,
+                 const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
+                 const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
+                 const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
+                 const std::array<ck::index_t, NDimSpatial + 3>& /*input_strides*/,
+                 const std::array<ck::index_t, NDimSpatial + 3>& /*output_strides*/,
+                 const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
+                 const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
+                 const std::array<ck::index_t, NDimSpatial>& input_left_pads,
+                 const std::array<ck::index_t, NDimSpatial>& input_right_pads,
                  InElementwiseOperation in_element_op,
                  WeiElementwiseOperation wei_element_op,
                  OutElementwiseOperation out_element_op,
@@ -897,18 +899,18 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
        InElementwiseOperation c_element_op_;

        // for checking IsSupportedArgument()
-        index_t Conv_G_;
-        index_t Conv_N_;
-        index_t Conv_K_;
-        index_t Conv_C_;
-        std::array<ck::index_t, NDimSpatial> input_spatial_lengths_;
-        std::array<ck::index_t, NDimSpatial> filter_spatial_lengths_;
-        std::array<ck::index_t, NDimSpatial> output_spatial_lengths_;
-        std::array<ck::index_t, NDimSpatial> conv_filter_strides_;
-        std::array<ck::index_t, NDimSpatial> conv_filter_dilations_;
-        std::array<ck::index_t, NDimSpatial> input_left_pads_;
-        std::array<ck::index_t, NDimSpatial> input_right_pads_;
+        const index_t Conv_G_;
+        const index_t Conv_N_;
+        const index_t Conv_K_;
+        const index_t Conv_C_;
+        const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths_;
+        const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths_;
+        const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths_;
+        const std::array<ck::index_t, NDimSpatial>& conv_filter_strides_;
+        const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations_;
+        const std::array<ck::index_t, NDimSpatial>& input_left_pads_;
+        const std::array<ck::index_t, NDimSpatial>& input_right_pads_;

        index_t k_batch_;
    };
@@ -1111,17 +1113,19 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
    static auto MakeArgument(const InDataType* p_in_grid,
                             WeiDataType* p_wei_grid,
                             const OutDataType* p_out_grid,
-                            ck::index_t G,
-                            ck::index_t N,
-                            ck::index_t K,
-                            ck::index_t C,
-                            std::array<ck::index_t, NDimSpatial> input_spatial_lengths,
-                            std::array<ck::index_t, NDimSpatial> filter_spatial_lengths,
-                            std::array<ck::index_t, NDimSpatial> output_spatial_lengths,
-                            std::array<ck::index_t, NDimSpatial> conv_filter_strides,
-                            std::array<ck::index_t, NDimSpatial> conv_filter_dilations,
-                            std::array<ck::index_t, NDimSpatial> input_left_pads,
-                            std::array<ck::index_t, NDimSpatial> input_right_pads,
+                            const ck::index_t G,
+                            const ck::index_t N,
+                            const ck::index_t K,
+                            const ck::index_t C,
+                            const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
+                            const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
+                            const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
+                            const std::array<ck::index_t, NDimSpatial + 3>& input_strides,
+                            const std::array<ck::index_t, NDimSpatial + 3>& output_strides,
+                            const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
+                            const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
+                            const std::array<ck::index_t, NDimSpatial>& input_left_pads,
+                            const std::array<ck::index_t, NDimSpatial>& input_right_pads,
                             InElementwiseOperation in_element_op,
                             WeiElementwiseOperation wei_element_op,
                             OutElementwiseOperation out_element_op,
@@ -1137,6 +1141,8 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
                        input_spatial_lengths,
                        filter_spatial_lengths,
                        output_spatial_lengths,
+                       input_strides,
+                       output_strides,
                        conv_filter_strides,
                        conv_filter_dilations,
                        input_left_pads,
@@ -1153,17 +1159,19 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
    MakeArgumentPointer(const void* p_in_grid,
                        void* p_wei_grid,
                        const void* p_out_grid,
-                       ck::index_t G,
-                       ck::index_t N,
-                       ck::index_t K,
-                       ck::index_t C,
-                       std::array<ck::index_t, NDimSpatial> input_spatial_lengths,
-                       std::array<ck::index_t, NDimSpatial> filter_spatial_lengths,
-                       std::array<ck::index_t, NDimSpatial> output_spatial_lengths,
-                       std::array<ck::index_t, NDimSpatial> conv_filter_strides,
-                       std::array<ck::index_t, NDimSpatial> conv_filter_dilations,
-                       std::array<ck::index_t, NDimSpatial> input_left_pads,
-                       std::array<ck::index_t, NDimSpatial> input_right_pads,
+                       const ck::index_t G,
+                       const ck::index_t N,
+                       const ck::index_t K,
+                       const ck::index_t C,
+                       const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
+                       const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
+                       const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
+                       const std::array<ck::index_t, NDimSpatial + 3>& input_strides,
+                       const std::array<ck::index_t, NDimSpatial + 3>& output_strides,
+                       const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
+                       const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
+                       const std::array<ck::index_t, NDimSpatial>& input_left_pads,
+                       const std::array<ck::index_t, NDimSpatial>& input_right_pads,
                        InElementwiseOperation in_element_op,
                        WeiElementwiseOperation wei_element_op,
                        OutElementwiseOperation out_element_op,
@@ -1179,6 +1187,8 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Dl
                        input_spatial_lengths,
                        filter_spatial_lengths,
                        output_spatial_lengths,
+                       input_strides,
+                       output_strides,
                        conv_filter_strides,
                        conv_filter_dilations,
                        input_left_pads,
......
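Note that this Dl variant accepts the two new stride arrays purely for interface compatibility; the Argument constructor above comments the parameter names out, so their values are ignored. Callers with packed GNWC-family tensors can therefore pass ordinary packed strides, for example (helper name and the [G, N, C, spatial...] ordering are assumptions carried over from the NHWGC path elsewhere in this diff):

#include <array>

// Packed GNHWC strides for lengths G, N, Hi, Wi, C, in [G, N, C, Hi, Wi] order (C innermost).
std::array<ck::index_t, 5> packed_gnhwc_strides(ck::index_t N, ck::index_t Hi, ck::index_t Wi, ck::index_t C)
{
    return {N * Hi * Wi * C, // G
            Hi * Wi * C,     // N
            1,               // C
            Wi * C,          // Hi
            C};              // Wi
}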
@@ -126,6 +126,9 @@ __global__ void
 // out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C]
 template <ck::index_t NDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
           typename InDataType,
           typename WeiDataType,
           typename OutDataType,
@@ -161,29 +164,19 @@ template <ck::index_t NDimSpatial,
           index_t CShuffleNXdlPerWavePerShuffle,
           typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           index_t CBlockTransferScalarPerVector_NWaveNPerXdl>
-struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle
-    : public DeviceGroupedConvBwdWeight<
-          NDimSpatial,
-          ck::tuple_element_t<NDimSpatial - 1,
-                              ck::Tuple<ck::tensor_layout::convolution::GNWC,
-                                        ck::tensor_layout::convolution::GNHWC,
-                                        ck::tensor_layout::convolution::GNDHWC>>,
-          ck::tuple_element_t<NDimSpatial - 1,
-                              ck::Tuple<ck::tensor_layout::convolution::GKXC,
-                                        ck::tensor_layout::convolution::GKYXC,
-                                        ck::tensor_layout::convolution::GKZYXC>>,
-          ck::tuple_element_t<NDimSpatial - 1,
-                              ck::Tuple<ck::tensor_layout::convolution::GNWK,
-                                        ck::tensor_layout::convolution::GNHWK,
-                                        ck::tensor_layout::convolution::GNDHWK>>,
-          InDataType,
-          WeiDataType,
-          OutDataType,
-          InElementwiseOperation,
-          WeiElementwiseOperation,
-          OutElementwiseOperation>
+struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
+    : public DeviceGroupedConvBwdWeight<NDimSpatial,
+                                        InLayout,
+                                        WeiLayout,
+                                        OutLayout,
+                                        InDataType,
+                                        WeiDataType,
+                                        OutDataType,
+                                        InElementwiseOperation,
+                                        WeiElementwiseOperation,
+                                        OutElementwiseOperation>
 {
-    using DeviceOp = DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle;
+    using DeviceOp = DeviceGroupedConvBwdWeight_Xdl_CShuffle;

     using ADataType = OutDataType;
     using BDataType = InDataType;
@@ -222,17 +215,19 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle
    template <ck::index_t NDim, typename ck::enable_if<NDim == 1, bool>::type = false>
    static auto MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(
-        ck::index_t N,
-        ck::index_t K,
-        ck::index_t C,
-        std::array<ck::index_t, NDimSpatial> input_spatial_lengths,
-        std::array<ck::index_t, NDimSpatial> filter_spatial_lengths,
-        std::array<ck::index_t, NDimSpatial> output_spatial_lengths,
-        std::array<ck::index_t, NDimSpatial> conv_filter_strides,
-        std::array<ck::index_t, NDimSpatial> conv_filter_dilations,
-        std::array<ck::index_t, NDimSpatial> input_left_pads,
-        std::array<ck::index_t, NDimSpatial> input_right_pads,
-        ck::index_t batch_k)
+        const ck::index_t N,
+        const ck::index_t K,
+        const ck::index_t C,
+        const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
+        const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
+        const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
+        const std::array<ck::index_t, NDimSpatial + 3>& /* input_strides */,
+        const std::array<ck::index_t, NDimSpatial + 3>& /* output_strides */,
+        const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
+        const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
+        const std::array<ck::index_t, NDimSpatial>& input_left_pads,
+        const std::array<ck::index_t, NDimSpatial>& input_right_pads,
+        const ck::index_t batch_k)
    {
        using namespace ck;
@@ -282,14 +277,14 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle
            const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor(
                in_gemmktotal_gemmn_grid_desc,
                make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
-                          make_pass_through_transform(GemmM)),
+                          make_pass_through_transform(GemmN)),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}));

            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
                in_gemmkpad_gemmn_grid_desc,
                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
-                          make_pass_through_transform(GemmM)),
+                          make_pass_through_transform(GemmN)),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
@@ -372,19 +367,25 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle
        }
    }

-    template <ck::index_t NDim, typename ck::enable_if<NDim == 2, bool>::type = false>
+    template <ck::index_t NDim,
+              typename ck::enable_if<NDim == 2 &&
+                                         is_same_v<InLayout, tensor_layout::convolution::GNHWC> &&
+                                         is_same_v<OutLayout, tensor_layout::convolution::GNHWK>,
+                                     bool>::type = false>
    static auto MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(
-        ck::index_t N,
-        ck::index_t K,
-        ck::index_t C,
-        std::array<ck::index_t, NDimSpatial> input_spatial_lengths,
-        std::array<ck::index_t, NDimSpatial> filter_spatial_lengths,
-        std::array<ck::index_t, NDimSpatial> output_spatial_lengths,
-        std::array<ck::index_t, NDimSpatial> conv_filter_strides,
-        std::array<ck::index_t, NDimSpatial> conv_filter_dilations,
-        std::array<ck::index_t, NDimSpatial> input_left_pads,
-        std::array<ck::index_t, NDimSpatial> input_right_pads,
-        ck::index_t batch_k)
+        const ck::index_t N,
+        const ck::index_t K,
+        const ck::index_t C,
+        const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
+        const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
+        const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
+        const std::array<ck::index_t, NDimSpatial + 3>& /* input_strides */,
+        const std::array<ck::index_t, NDimSpatial + 3>& /* output_strides */,
+        const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
+        const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
+        const std::array<ck::index_t, NDimSpatial>& input_left_pads,
+        const std::array<ck::index_t, NDimSpatial>& input_right_pads,
+        const ck::index_t batch_k)
    {
        using namespace ck;
@@ -447,14 +448,14 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle
            const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor(
                in_gemmktotal_gemmn_grid_desc,
                make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
-                          make_pass_through_transform(GemmM)),
+                          make_pass_through_transform(GemmN)),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}));

            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
                in_gemmkpad_gemmn_grid_desc,
                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
-                          make_pass_through_transform(GemmM)),
+                          make_pass_through_transform(GemmN)),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
...@@ -539,19 +540,202 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle ...@@ -539,19 +540,202 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle
} }
} }
template <ck::index_t NDim,
typename ck::enable_if<NDim == 2 &&
is_same_v<InLayout, tensor_layout::convolution::NHWGC> &&
is_same_v<OutLayout, tensor_layout::convolution::NHWGK>,
bool>::type = false>
static auto MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(
const ck::index_t N,
const ck::index_t K,
const ck::index_t C,
const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
const std::array<ck::index_t, NDimSpatial + 3>& input_strides,
const std::array<ck::index_t, NDimSpatial + 3>& output_strides,
const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
const std::array<ck::index_t, NDimSpatial>& input_left_pads,
const std::array<ck::index_t, NDimSpatial>& input_right_pads,
const ck::index_t batch_k)
{
using namespace ck;
const index_t Hi = input_spatial_lengths[0];
const index_t Wi = input_spatial_lengths[1];
const index_t Ho = output_spatial_lengths[0];
const index_t Wo = output_spatial_lengths[1];
const index_t Y = filter_spatial_lengths[0];
const index_t X = filter_spatial_lengths[1];
const index_t ConvStrideH = conv_filter_strides[0];
const index_t ConvStrideW = conv_filter_strides[1];
const index_t ConvDilationH = conv_filter_dilations[0];
const index_t ConvDilationW = conv_filter_dilations[1];
const index_t InLeftPadH = input_left_pads[0];
const index_t InLeftPadW = input_left_pads[1];
const index_t InRightPadH = input_right_pads[0];
const index_t InRightPadW = input_right_pads[1];
const index_t GemmKTotal = N * Ho * Wo;
const index_t GemmM = K;
const index_t GemmN = C * X * Y;
const index_t NStride = input_strides[1];
const index_t HiStride = input_strides[3];
const index_t WiStride = input_strides[4];
const auto CStride = input_strides[2];
const index_t WoStride = output_strides[4];
const auto KStride = Number<1>{};
const index_t GemmKBatch = batch_k;
const index_t GemmK0 =
math::integer_divide_ceil(GemmKTotal, GemmK1Number * K0PerBlock * GemmKBatch) *
K0PerBlock;
const index_t GemmKPad = GemmKBatch * GemmK0 * GemmK1Number;
if constexpr(ConvBackwardWeightSpecialization ==
ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0)
{
// A: output tensor
const auto out_gemmktotal_gemmm_grid_desc = make_naive_tensor_descriptor(
make_tuple(N * Ho * Wo, K), make_tuple(WoStride, KStride));
const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor(
out_gemmktotal_gemmm_grid_desc,
make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
make_pass_through_transform(GemmM)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
out_gemmkpad_gemmm_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
make_pass_through_transform(GemmM)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
// B: input tensor
const auto in_gemmktotal_gemmn_grid_desc = make_naive_tensor_descriptor(
make_tuple(N * Hi * Wi, C), make_tuple(WiStride, CStride));
const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor(
in_gemmktotal_gemmn_grid_desc,
make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
make_pass_through_transform(GemmN)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
in_gemmkpad_gemmn_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
make_pass_through_transform(GemmN)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
// C: weight tensor
const auto wei_gemmm_gemmn_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C));
return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
wei_gemmm_gemmn_grid_desc);
}
else
{
const auto out_gemmktotal_gemmm_grid_desc = make_naive_tensor_descriptor(
make_tuple(N * Ho * Wo, K), make_tuple(WoStride, KStride));
const auto in_n_hi_wi_c_grid_desc = make_naive_tensor_descriptor(
make_tuple(N, Hi, Wi, C), make_tuple(NStride, HiStride, WiStride, CStride));
// A: output tensor
const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor(
out_gemmktotal_gemmm_grid_desc,
make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
make_pass_through_transform(GemmM)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
out_gemmkpad_gemmm_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
make_pass_through_transform(GemmM)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
// B: input tensor
const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
in_n_hi_wi_c_grid_desc,
make_tuple(make_pass_through_transform(N),
make_pad_transform(Hi, InLeftPadH, InRightPadH),
make_pad_transform(Wi, InLeftPadW, InRightPadW),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
in_n_hip_wip_c_grid_desc,
make_tuple(
make_pass_through_transform(N),
make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)),
make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));
const auto in_gemmktotal_gemmn_grid_desc =
transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc,
make_tuple(make_merge_transform(make_tuple(Y, X, C)),
make_merge_transform(make_tuple(N, Ho, Wo))),
make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}),
make_tuple(Sequence<1>{}, Sequence<0>{}));
const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor(
in_gemmktotal_gemmn_grid_desc,
make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
make_pass_through_transform(GemmN)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
in_gemmkpad_gemmn_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
make_pass_through_transform(GemmN)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
// C: weight tensor
const auto wei_gemmm_gemmn_grid_desc =
make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C));
return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
wei_gemmm_gemmn_grid_desc);
}
}
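Editor's note: the GemmK0/GemmKPad sizing above is shared by every branch of this overload: GemmKTotal = N * Ho * Wo is rounded up until it splits evenly into GemmKBatch groups of GemmK0 * GemmK1Number elements, and the leftover is covered by the right-pad transform. Below is a standalone sketch of that arithmetic with example sizes; the constants are illustrative, not the instance's actual K0PerBlock/GemmK1Number template parameters.

#include <cassert>
#include <cstdio>

// Standalone sketch of the split-K padding used above.
int main()
{
    const int GemmKTotal = 1 * 28 * 28; // N * Ho * Wo = 784
    const int GemmK1     = 8;           // plays the role of GemmK1Number
    const int K0PerBlock = 4;
    const int GemmKBatch = 3;           // split-K factor

    // integer_divide_ceil(GemmKTotal, GemmK1 * K0PerBlock * GemmKBatch) * K0PerBlock
    const int GemmK0 =
        ((GemmKTotal + GemmK1 * K0PerBlock * GemmKBatch - 1) /
         (GemmK1 * K0PerBlock * GemmKBatch)) *
        K0PerBlock;
    const int GemmKPad = GemmKBatch * GemmK0 * GemmK1;

    // GemmKPad is the smallest GemmKBatch * (multiple of K0PerBlock) * GemmK1
    // covering GemmKTotal, so the right-pad length is GemmKPad - GemmKTotal.
    assert(GemmKPad >= GemmKTotal);
    assert(GemmK0 % K0PerBlock == 0);
    std::printf("GemmK0 = %d, GemmKPad = %d, pad = %d\n",
                GemmK0, GemmKPad, GemmKPad - GemmKTotal); // 36, 864, 80
}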
template <ck::index_t NDim, typename ck::enable_if<NDim == 3, bool>::type = false> template <ck::index_t NDim, typename ck::enable_if<NDim == 3, bool>::type = false>
static auto MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N( static auto MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(
ck::index_t N, const ck::index_t N,
ck::index_t K, const ck::index_t K,
ck::index_t C, const ck::index_t C,
std::array<ck::index_t, NDimSpatial> input_spatial_lengths, const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
std::array<ck::index_t, NDimSpatial> filter_spatial_lengths, const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
std::array<ck::index_t, NDimSpatial> output_spatial_lengths, const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
std::array<ck::index_t, NDimSpatial> conv_filter_strides, const std::array<ck::index_t, NDimSpatial + 3>& /* input_strides */,
std::array<ck::index_t, NDimSpatial> conv_filter_dilations, const std::array<ck::index_t, NDimSpatial + 3>& /* output_strides */,
std::array<ck::index_t, NDimSpatial> input_left_pads, const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
std::array<ck::index_t, NDimSpatial> input_right_pads, const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
ck::index_t batch_k) const std::array<ck::index_t, NDimSpatial>& input_left_pads,
const std::array<ck::index_t, NDimSpatial>& input_right_pads,
const ck::index_t batch_k)
{ {
using namespace ck; using namespace ck;
...@@ -621,14 +805,14 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle ...@@ -621,14 +805,14 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle
const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor( const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor(
in_gemmktotal_gemmn_grid_desc, in_gemmktotal_gemmn_grid_desc,
make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal), make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
make_pass_through_transform(GemmM)), make_pass_through_transform(GemmN)),
make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})); make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
in_gemmkpad_gemmn_grid_desc, in_gemmkpad_gemmn_grid_desc,
make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)),
make_pass_through_transform(GemmM)), make_pass_through_transform(GemmN)),
make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));
...@@ -725,31 +909,70 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle ...@@ -725,31 +909,70 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle
template <ck::index_t NDim, typename ck::enable_if<NDim == 1, bool>::type = false> template <ck::index_t NDim, typename ck::enable_if<NDim == 1, bool>::type = false>
static auto GetABCGridDesc() static auto GetABCGridDesc()
{ {
return MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<1>( const ck::index_t dim = 1;
1, 1, 1, {1}, {1}, {1}, {1}, {1}, {1}, {1}, 1); const ck::index_t batch = 1;
const std::array<ck::index_t, NDimSpatial> lengths{1};
const std::array<ck::index_t, NDimSpatial + 3> strides{1, 1, 1, 1};
const std::array<ck::index_t, NDimSpatial> params{1};
return MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<1>(dim,
dim,
dim,
lengths,
lengths,
lengths,
strides,
strides,
params,
params,
params,
params,
batch);
} }
template <ck::index_t NDim, typename ck::enable_if<NDim == 2, bool>::type = false> template <ck::index_t NDim, typename ck::enable_if<NDim == 2, bool>::type = false>
static auto GetABCGridDesc() static auto GetABCGridDesc()
{ {
return MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<2>( const ck::index_t dim = 1;
1, 1, 1, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, 1); const ck::index_t batch = 1;
const std::array<ck::index_t, NDimSpatial> lengths{1, 1};
const std::array<ck::index_t, NDimSpatial + 3> strides{1, 1, 1, 1, 1};
const std::array<ck::index_t, NDimSpatial> params{1, 1};
return MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<2>(dim,
dim,
dim,
lengths,
lengths,
lengths,
strides,
strides,
params,
params,
params,
params,
batch);
} }
template <ck::index_t NDim, typename ck::enable_if<NDim == 3, bool>::type = false> template <ck::index_t NDim, typename ck::enable_if<NDim == 3, bool>::type = false>
static auto GetABCGridDesc() static auto GetABCGridDesc()
{ {
return MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<3>(1, const ck::index_t dim = 1;
1, const ck::index_t batch = 1;
1, const std::array<ck::index_t, NDimSpatial> lengths{1, 1, 1};
{1, 1, 1}, const std::array<ck::index_t, NDimSpatial + 3> strides{1, 1, 1, 1, 1, 1};
{1, 1, 1}, const std::array<ck::index_t, NDimSpatial> params{1, 1, 1};
{1, 1, 1}, return MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<3>(dim,
{1, 1, 1}, dim,
{1, 1, 1}, dim,
{1, 1, 1}, lengths,
{1, 1, 1}, lengths,
1); lengths,
strides,
strides,
params,
params,
params,
params,
batch);
} }
// type convert descs // type convert descs
...@@ -863,19 +1086,21 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle ...@@ -863,19 +1086,21 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle
Argument(const InDataType* p_in_grid, Argument(const InDataType* p_in_grid,
WeiDataType* p_wei_grid, WeiDataType* p_wei_grid,
const OutDataType* p_out_grid, const OutDataType* p_out_grid,
ck::index_t G, const ck::index_t G,
ck::index_t N, const ck::index_t N,
ck::index_t K, const ck::index_t K,
ck::index_t C, const ck::index_t C,
std::array<ck::index_t, NDimSpatial> input_spatial_lengths, const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
std::array<ck::index_t, NDimSpatial> filter_spatial_lengths, const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
std::array<ck::index_t, NDimSpatial> output_spatial_lengths, const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
std::array<ck::index_t, NDimSpatial> conv_filter_strides, const std::array<ck::index_t, NDimSpatial + 3>& input_strides,
std::array<ck::index_t, NDimSpatial> conv_filter_dilations, const std::array<ck::index_t, NDimSpatial + 3>& output_strides,
std::array<ck::index_t, NDimSpatial> input_left_pads, const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
std::array<ck::index_t, NDimSpatial> input_right_pads, const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
ck::index_t M01, const std::array<ck::index_t, NDimSpatial>& input_left_pads,
ck::index_t N01, const std::array<ck::index_t, NDimSpatial>& input_right_pads,
const ck::index_t M01,
const ck::index_t N01,
InElementwiseOperation in_element_op, InElementwiseOperation in_element_op,
WeiElementwiseOperation wei_element_op, WeiElementwiseOperation wei_element_op,
OutElementwiseOperation out_element_op, OutElementwiseOperation out_element_op,
...@@ -913,6 +1138,8 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle ...@@ -913,6 +1138,8 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle
input_spatial_lengths, input_spatial_lengths,
filter_spatial_lengths, filter_spatial_lengths,
output_spatial_lengths, output_spatial_lengths,
input_strides,
output_strides,
conv_filter_strides, conv_filter_strides,
conv_filter_dilations, conv_filter_dilations,
input_left_pads, input_left_pads,
...@@ -927,18 +1154,8 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle ...@@ -927,18 +1154,8 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle
GridwiseGemm::MakeCBlockClusterAdaptor(c_grid_desc_m_n_, M01, N01, k_batch_); GridwiseGemm::MakeCBlockClusterAdaptor(c_grid_desc_m_n_, M01, N01, k_batch_);
// A/B/C Batch Stride // A/B/C Batch Stride
compute_ptr_offset_of_batch_.BatchStrideA_ = compute_ptr_offset_of_batch_.BatchStrideA_ = output_strides[0];
N * K * compute_ptr_offset_of_batch_.BatchStrideB_ = input_strides[0];
std::accumulate(begin(output_spatial_lengths),
end(output_spatial_lengths),
index_t{1},
std::multiplies<>{});
compute_ptr_offset_of_batch_.BatchStrideB_ =
N * C *
std::accumulate(begin(input_spatial_lengths),
end(input_spatial_lengths),
index_t{1},
std::multiplies<>{});
compute_ptr_offset_of_batch_.BatchStrideC_ = compute_ptr_offset_of_batch_.BatchStrideC_ =
K * C * K * C *
std::accumulate(begin(filter_spatial_lengths), std::accumulate(begin(filter_spatial_lengths),
...@@ -977,16 +1194,16 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle ...@@ -977,16 +1194,16 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle
WeiElementwiseOperation c_element_op_; WeiElementwiseOperation c_element_op_;
// for checking IsSupportedArgument() // for checking IsSupportedArgument()
index_t Conv_G_; const index_t Conv_G_;
index_t Conv_N_; const index_t Conv_N_;
index_t Conv_K_; const index_t Conv_K_;
index_t Conv_C_; const index_t Conv_C_;
std::array<ck::index_t, NDimSpatial> output_spatial_lengths_; const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths_;
std::array<ck::index_t, NDimSpatial> filter_spatial_lengths_; const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths_;
std::array<ck::index_t, NDimSpatial> conv_filter_strides_; const std::array<ck::index_t, NDimSpatial>& conv_filter_strides_;
std::array<ck::index_t, NDimSpatial> input_left_pads_; const std::array<ck::index_t, NDimSpatial>& input_left_pads_;
std::array<ck::index_t, NDimSpatial> input_right_pads_; const std::array<ck::index_t, NDimSpatial>& input_right_pads_;
index_t k_batch_; const index_t k_batch_;
}; };
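Editor's note: the batch-stride assignment above is the substantive change for the new layouts. Instead of hard-coding the G stride as N * K * prod(output spatial) and N * C * prod(input spatial), which only holds for G-major GNHWC/GNHWK tensors, the Argument now takes the caller-supplied strides[0]. A small sketch of what that G stride looks like for the two packed layouts, with the stride arrays ordered {G, N, C, Hi, Wi} as they are read above:

#include <array>
#include <cstdio>

// Packed strides for a 2D grouped-conv input of logical shape G, N, C, Hi, Wi.
int main()
{
    const int G = 4, N = 2, C = 16, Hi = 8, Wi = 8;

    // GNHWC: memory order G, N, Hi, Wi, C  ->  G stride = N * Hi * Wi * C
    const std::array<int, 5> gnhwc_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};

    // NHWGC: memory order N, Hi, Wi, G, C  ->  G stride = C
    const std::array<int, 5> nhwgc_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};

    std::printf("GNHWC G stride = %d (= N*Hi*Wi*C)\n", gnhwc_strides[0]); // 2048
    std::printf("NHWGC G stride = %d (= C)\n", nhwgc_strides[0]);         // 16
}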
// Invoker // Invoker
...@@ -1091,6 +1308,45 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle ...@@ -1091,6 +1308,45 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle
static bool IsSupportedArgument(const Argument& arg) static bool IsSupportedArgument(const Argument& arg)
{ {
if constexpr(NDimSpatial == 1)
{
if constexpr(!(is_same_v<InLayout, tensor_layout::convolution::GNWC> &&
is_same_v<WeiLayout, tensor_layout::convolution::GKXC> &&
is_same_v<OutLayout, tensor_layout::convolution::GNWK>))
{
return false;
}
}
else if constexpr(NDimSpatial == 2)
{
            if constexpr(!(is_same_v<InLayout, tensor_layout::convolution::GNHWC> &&
                           is_same_v<WeiLayout, tensor_layout::convolution::GKYXC> &&
                           is_same_v<OutLayout, tensor_layout::convolution::GNHWK>) &&
                         !(is_same_v<InLayout, tensor_layout::convolution::NHWGC> &&
                           is_same_v<WeiLayout, tensor_layout::convolution::GKYXC> &&
                           is_same_v<OutLayout, tensor_layout::convolution::NHWGK>))
{
return false;
}
}
else if constexpr(NDimSpatial == 3)
{
if constexpr(!(is_same_v<InLayout, tensor_layout::convolution::GNDHWC> &&
is_same_v<WeiLayout, tensor_layout::convolution::GKZYXC> &&
is_same_v<OutLayout, tensor_layout::convolution::GNDHWK>))
{
return false;
}
}
else
{
return false;
}
if constexpr(ConvBackwardWeightSpecialization == if constexpr(ConvBackwardWeightSpecialization ==
ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0) ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0)
{ {
...@@ -1134,21 +1390,23 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle ...@@ -1134,21 +1390,23 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle
static auto MakeArgument(const InDataType* p_in_grid, static auto MakeArgument(const InDataType* p_in_grid,
WeiDataType* p_wei_grid, WeiDataType* p_wei_grid,
const OutDataType* p_out_grid, const OutDataType* p_out_grid,
ck::index_t G, const ck::index_t G,
ck::index_t N, const ck::index_t N,
ck::index_t K, const ck::index_t K,
ck::index_t C, const ck::index_t C,
std::array<ck::index_t, NDimSpatial> input_spatial_lengths, const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
std::array<ck::index_t, NDimSpatial> filter_spatial_lengths, const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
std::array<ck::index_t, NDimSpatial> output_spatial_lengths, const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
std::array<ck::index_t, NDimSpatial> conv_filter_strides, const std::array<ck::index_t, NDimSpatial + 3>& input_strides,
std::array<ck::index_t, NDimSpatial> conv_filter_dilations, const std::array<ck::index_t, NDimSpatial + 3>& output_strides,
std::array<ck::index_t, NDimSpatial> input_left_pads, const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
std::array<ck::index_t, NDimSpatial> input_right_pads, const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
const std::array<ck::index_t, NDimSpatial>& input_left_pads,
const std::array<ck::index_t, NDimSpatial>& input_right_pads,
InElementwiseOperation in_element_op, InElementwiseOperation in_element_op,
WeiElementwiseOperation wei_element_op, WeiElementwiseOperation wei_element_op,
OutElementwiseOperation out_element_op, OutElementwiseOperation out_element_op,
ck::index_t split_k) const ck::index_t split_k)
{ {
return Argument{p_in_grid, return Argument{p_in_grid,
p_wei_grid, p_wei_grid,
...@@ -1160,6 +1418,8 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle ...@@ -1160,6 +1418,8 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle
input_spatial_lengths, input_spatial_lengths,
filter_spatial_lengths, filter_spatial_lengths,
output_spatial_lengths, output_spatial_lengths,
input_strides,
output_strides,
conv_filter_strides, conv_filter_strides,
conv_filter_dilations, conv_filter_dilations,
input_left_pads, input_left_pads,
...@@ -1178,21 +1438,23 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle ...@@ -1178,21 +1438,23 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle
MakeArgumentPointer(const void* p_in_grid, MakeArgumentPointer(const void* p_in_grid,
void* p_wei_grid, void* p_wei_grid,
const void* p_out_grid, const void* p_out_grid,
ck::index_t G, const ck::index_t G,
ck::index_t N, const ck::index_t N,
ck::index_t K, const ck::index_t K,
ck::index_t C, const ck::index_t C,
std::array<ck::index_t, NDimSpatial> input_spatial_lengths, const std::array<ck::index_t, NDimSpatial>& input_spatial_lengths,
std::array<ck::index_t, NDimSpatial> filter_spatial_lengths, const std::array<ck::index_t, NDimSpatial>& filter_spatial_lengths,
std::array<ck::index_t, NDimSpatial> output_spatial_lengths, const std::array<ck::index_t, NDimSpatial>& output_spatial_lengths,
std::array<ck::index_t, NDimSpatial> conv_filter_strides, const std::array<ck::index_t, NDimSpatial + 3>& input_strides,
std::array<ck::index_t, NDimSpatial> conv_filter_dilations, const std::array<ck::index_t, NDimSpatial + 3>& output_strides,
std::array<ck::index_t, NDimSpatial> input_left_pads, const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
std::array<ck::index_t, NDimSpatial> input_right_pads, const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
const std::array<ck::index_t, NDimSpatial>& input_left_pads,
const std::array<ck::index_t, NDimSpatial>& input_right_pads,
InElementwiseOperation in_element_op, InElementwiseOperation in_element_op,
WeiElementwiseOperation wei_element_op, WeiElementwiseOperation wei_element_op,
OutElementwiseOperation out_element_op, OutElementwiseOperation out_element_op,
ck::index_t split_k) override const ck::index_t split_k) override
{ {
return std::make_unique<Argument>(static_cast<const InDataType*>(p_in_grid), return std::make_unique<Argument>(static_cast<const InDataType*>(p_in_grid),
static_cast<WeiDataType*>(p_wei_grid), static_cast<WeiDataType*>(p_wei_grid),
...@@ -1204,6 +1466,8 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle ...@@ -1204,6 +1466,8 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle
input_spatial_lengths, input_spatial_lengths,
filter_spatial_lengths, filter_spatial_lengths,
output_spatial_lengths, output_spatial_lengths,
input_strides,
output_strides,
conv_filter_strides, conv_filter_strides,
conv_filter_dilations, conv_filter_dilations,
input_left_pads, input_left_pads,
...@@ -1226,7 +1490,7 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle ...@@ -1226,7 +1490,7 @@ struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle
auto str = std::stringstream(); auto str = std::stringstream();
// clang-format off // clang-format off
str << "DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle" str << "DeviceGroupedConvBwdWeight_Xdl_CShuffle"
<< "<" << "<"
<< BlockSize << ", " << BlockSize << ", "
<< MPerBlock << ", " << MPerBlock << ", "
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/math_v2.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/utility/workgroup_synchronization.hpp"
namespace ck {
template <typename GridwiseMultiblockBatchNormForward_,
typename XDataType,
typename YDataType,
typename AccDataType,
typename ScaleDataType,
typename BiasDataType,
typename MeanVarDataType,
typename YElementwiseOp,
typename XYGridDesc_M_K,
typename MeanVarCountGridDesc_M_G,
typename MeanVarCountGridDesc_M_K,
typename ScaleBiasGridDesc_M,
typename MeanVarGridDesc_M,
typename GetReduceCountPerThreadFunctor>
__global__ void kernel_multiblock_batchnorm_forward(
const XYGridDesc_M_K x_grid_desc_m_k,
const XYGridDesc_M_K y_grid_desc_m_k,
const MeanVarCountGridDesc_M_G mean_var_count_grid_desc_m_g,
const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k,
const ScaleBiasGridDesc_M scale_grid_desc_m,
const ScaleBiasGridDesc_M bias_grid_desc_m,
const MeanVarGridDesc_M mean_var_grid_desc_m,
const GetReduceCountPerThreadFunctor get_reduce_count_per_thread,
index_t num_k_block_tile_iteration,
AccDataType epsilon,
const XDataType* const __restrict__ p_x,
MeanVarDataType* const __restrict__ p_welford_mean,
MeanVarDataType* const __restrict__ p_welford_variance,
int32_t* const __restrict__ p_welford_count,
int32_t* const __restrict__ p_control,
const ScaleDataType* const __restrict__ p_scale,
const BiasDataType* const __restrict__ p_bias,
const YElementwiseOp y_elementwise_op,
YDataType* const __restrict__ p_y,
bool updateMovingAverage,
AccDataType averageFactor,
MeanVarDataType* const __restrict__ resultRunningMean,
MeanVarDataType* const __restrict__ resultRunningVariance,
bool saveMeanInvVariance,
MeanVarDataType* const __restrict__ resultSaveMean,
MeanVarDataType* const __restrict__ resultSaveInvVariance)
{
GridwiseMultiblockBatchNormForward_::Run(x_grid_desc_m_k,
y_grid_desc_m_k,
mean_var_count_grid_desc_m_g,
mean_var_count_grid_desc_m_k,
scale_grid_desc_m,
bias_grid_desc_m,
mean_var_grid_desc_m,
get_reduce_count_per_thread,
num_k_block_tile_iteration,
epsilon,
p_x,
p_welford_mean,
p_welford_variance,
p_welford_count,
p_control,
p_scale,
p_bias,
y_elementwise_op,
p_y,
updateMovingAverage,
averageFactor,
resultRunningMean,
resultRunningVariance,
saveMeanInvVariance,
resultSaveMean,
resultSaveInvVariance);
};
template <typename XDataType,
typename YDataType,
typename AccDataType,
typename ScaleDataType,
typename BiasDataType,
typename MeanVarDataType,
typename YElementwiseOp,
typename XYGridDesc_M_K,
typename MeanVarCountGridDesc_M_G,
typename MeanVarCountGridDesc_M_K,
typename ScaleBiasGridDesc_M,
typename MeanVarGridDesc_M,
typename GetReduceCountPerThreadFunctor,
index_t BlockSize,
index_t MThreadClusterSize,
index_t KThreadClusterSize,
index_t MThreadSliceSize,
index_t KThreadSliceSize,
index_t XSrcYDstVectorDim,
index_t XSrcVectorSize,
index_t YDstVectorSize,
index_t ScaleSrcVectorSize,
index_t BiasSrcVectorSize,
index_t MeanVarSrcDstVectorSize>
struct GridwiseMultiblockBatchNormForward
{
static_assert((XSrcYDstVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0) ||
(XSrcYDstVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0),
"Invalid thread slice sizes and/or vector sizes configuration, please check!");
static_assert((XSrcYDstVectorDim == 0 && MThreadSliceSize % YDstVectorSize == 0) ||
(XSrcYDstVectorDim == 1 && KThreadSliceSize % YDstVectorSize == 0),
"Invalid thread slice sizes and/or vector sizes configuration, please check!");
static constexpr bool reorder_thread_cluster = (XSrcYDstVectorDim == 0);
using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;
using ThreadBufferDimAccessOrder =
typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
using ThreadClusterArrangeOrder =
typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
static constexpr auto thread_cluster_desc =
make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{})));
using ThreadReduceDstDesc_M =
decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));
using ThreadReduceSrcDesc_M_1 = decltype(
make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}, Number<1>{})));
using ThreadwiseWelford1 =
ThreadwiseWelford<AccDataType, ThreadReduceSrcDesc_M_K, ThreadReduceDstDesc_M>;
using ThreadwiseWelford2 =
ThreadwiseWelfordMerge<AccDataType, ThreadReduceSrcDesc_M_1, ThreadReduceDstDesc_M>;
using BlockwiseWelford1 = BlockwiseWelford<AccDataType,
BlockSize,
ThreadClusterLengths_M_K,
ThreadClusterArrangeOrder,
false>;
using BlockwiseWelford2 = BlockwiseWelford<AccDataType,
BlockSize,
ThreadClusterLengths_M_K,
ThreadClusterArrangeOrder,
true>;
using PassThroughOp = tensor_operation::element_wise::PassThrough;
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
__device__ static void Run(const XYGridDesc_M_K& x_grid_desc_m_k,
const XYGridDesc_M_K& y_grid_desc_m_k,
const MeanVarCountGridDesc_M_G& mean_var_count_grid_desc_m_g,
const MeanVarCountGridDesc_M_K& mean_var_count_grid_desc_m_k,
const ScaleBiasGridDesc_M& scale_grid_desc_m,
const ScaleBiasGridDesc_M& bias_grid_desc_m,
const MeanVarGridDesc_M& mean_var_grid_desc_m,
const GetReduceCountPerThreadFunctor& get_reduce_count_per_thread,
index_t num_k_block_tile_iteration,
AccDataType epsilon,
const XDataType* const __restrict__ p_x,
MeanVarDataType* const __restrict__ p_welford_mean,
MeanVarDataType* const __restrict__ p_welford_variance,
int32_t* const __restrict__ p_welford_count,
int32_t* const __restrict__ p_control,
const ScaleDataType* const __restrict__ p_scale,
const BiasDataType* const __restrict__ p_bias,
const YElementwiseOp y_elementwise_op,
YDataType* const __restrict__ p_y,
bool updateMovingAverage,
AccDataType averageFactor,
MeanVarDataType* const __restrict__ resultRunningMean,
MeanVarDataType* const __restrict__ resultRunningVariance,
bool saveMeanInvVariance,
MeanVarDataType* const __restrict__ resultSaveMean,
MeanVarDataType* const __restrict__ resultSaveInvVariance)
{
using ck::math::sqrt;
const index_t blkgroup_size = mean_var_count_grid_desc_m_g.GetLength(I1);
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_id = get_block_1d_id();
const index_t blkgroup_id = block_global_id / blkgroup_size;
const index_t block_local_id = block_global_id % blkgroup_size;
if(block_local_id == 0)
gms_init(BlockSize / warpSize * blkgroup_size, &p_control[blkgroup_id * 2]);
const auto thread_cluster_idx =
thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));
const auto thread_m_cluster_id = thread_cluster_idx[I0];
const auto thread_k_cluster_id = thread_cluster_idx[I1];
using ThreadBufferLengths_M_K = Sequence<MThreadSliceSize, KThreadSliceSize>;
using ThreadBufferLengths_M = Sequence<MThreadSliceSize>;
using ThreadBufferLengths_M_1 = Sequence<MThreadSliceSize, 1>;
constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed(
make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));
constexpr auto thread_buffer_desc_m =
make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
constexpr auto thread_buffer_desc_m_1 = make_naive_tensor_descriptor_packed(
make_tuple(Number<MThreadSliceSize>{}, Number<1>{}));
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
x_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> mean_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> var_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, int32_t, MThreadSliceSize, true> count_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>
tmp_mean_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>
tmp_var_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, int32_t, MThreadSliceSize, true> tmp_count_thread_buf;
const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration;
auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2<XDataType,
AccDataType,
XYGridDesc_M_K,
decltype(thread_buffer_desc_m_k),
ThreadBufferLengths_M_K,
ThreadBufferDimAccessOrder,
XSrcYDstVectorDim,
XSrcVectorSize,
1,
true>(
x_grid_desc_m_k,
make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
block_local_id * reduceSizePerBlock +
thread_k_cluster_id * KThreadSliceSize));
constexpr auto xy_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileSize);
const auto x_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_x, x_grid_desc_m_k.GetElementSpaceSize());
// Step 1: each workgroup does local welford reduction
auto threadwise_welford_1 = ThreadwiseWelford1();
threadwise_welford_1.max_count_ =
get_reduce_count_per_thread(block_local_id, thread_k_cluster_id);
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
mean_thread_buf(I) = type_convert<AccDataType>(0.0f);
var_thread_buf(I) = type_convert<AccDataType>(0.0f);
});
for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles)
{
threadwise_x_load.Run(x_grid_desc_m_k,
x_global_val_buf,
thread_buffer_desc_m_k,
make_tuple(I0, I0),
x_thread_buf);
threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, xy_copy_fwd_step_m_k);
threadwise_welford_1.Run(x_thread_buf, mean_thread_buf, var_thread_buf);
}
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
if constexpr(I > 0)
block_sync_lds();
count_thread_buf(I) = threadwise_welford_1.cur_count_;
BlockwiseWelford1::Run(mean_thread_buf(I), var_thread_buf(I), count_thread_buf(I));
});
// Step 2: each workgroup writes its local welford result to workspace memory
auto mean_global_val_buf =
make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::GLC>(
p_welford_mean, mean_var_count_grid_desc_m_g.GetElementSpaceSize());
auto var_global_val_buf =
make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::GLC>(
p_welford_variance, mean_var_count_grid_desc_m_g.GetElementSpaceSize());
auto count_global_val_buf =
make_dynamic_buffer<AddressSpaceEnum::Global, AmdBufferCoherenceEnum::GLC>(
p_welford_count, mean_var_count_grid_desc_m_g.GetElementSpaceSize());
auto threadwise_mean_var_store_m_g =
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
MeanVarDataType,
decltype(thread_buffer_desc_m_1),
MeanVarCountGridDesc_M_G,
PassThroughOp,
ThreadBufferLengths_M_1,
Sequence<0, 1>,
0,
1,
InMemoryDataOperationEnum::Set,
1,
true>(
mean_var_count_grid_desc_m_g,
make_multi_index(blkgroup_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize,
block_local_id),
PassThroughOp{});
auto threadwise_count_store_m_g =
ThreadwiseTensorSliceTransfer_v1r3<int32_t,
int32_t,
decltype(thread_buffer_desc_m_1),
MeanVarCountGridDesc_M_G,
PassThroughOp,
ThreadBufferLengths_M_1,
Sequence<0, 1>,
0,
1,
InMemoryDataOperationEnum::Set,
1,
true>(
mean_var_count_grid_desc_m_g,
make_multi_index(blkgroup_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize,
block_local_id),
PassThroughOp{});
if(thread_k_cluster_id == 0)
{
threadwise_mean_var_store_m_g.Run(thread_buffer_desc_m_1,
make_tuple(I0, I0),
mean_thread_buf,
mean_var_count_grid_desc_m_g,
mean_global_val_buf);
threadwise_mean_var_store_m_g.Run(thread_buffer_desc_m_1,
make_tuple(I0, I0),
var_thread_buf,
mean_var_count_grid_desc_m_g,
var_global_val_buf);
threadwise_count_store_m_g.Run(thread_buffer_desc_m_1,
make_tuple(I0, I0),
count_thread_buf,
mean_var_count_grid_desc_m_g,
count_global_val_buf);
};
gms_barrier(&p_control[blkgroup_id * 2]);
if(block_local_id == 0)
gms_reset(&p_control[blkgroup_id * 2]);
// Step 3: each workgroup reads welford results from workspace memory and does final welford
// reduction
auto threadwise_mean_var_load_m_k =
ThreadwiseTensorSliceTransfer_v2<MeanVarDataType,
AccDataType,
MeanVarCountGridDesc_M_K,
decltype(thread_buffer_desc_m_1),
ThreadBufferLengths_M_1,
Sequence<0, 1>,
0,
1,
1,
true>(
mean_var_count_grid_desc_m_k,
make_multi_index(blkgroup_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize,
thread_k_cluster_id * 1));
auto threadwise_count_load_m_k =
ThreadwiseTensorSliceTransfer_v2<int32_t,
int32_t,
MeanVarCountGridDesc_M_K,
decltype(thread_buffer_desc_m_1),
ThreadBufferLengths_M_1,
Sequence<0, 1>,
0,
1,
1,
true>(
mean_var_count_grid_desc_m_k,
make_multi_index(blkgroup_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize,
thread_k_cluster_id * 1));
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
mean_thread_buf(I) = type_convert<AccDataType>(0.0f);
var_thread_buf(I) = type_convert<AccDataType>(0.0f);
count_thread_buf(I) = 0;
});
constexpr auto mean_var_count_read_fwd_step_m_k = make_multi_index(0, KThreadClusterSize);
int32_t reducedSize = 0;
while(reducedSize < blkgroup_size)
{
threadwise_mean_var_load_m_k.Run(mean_var_count_grid_desc_m_k,
mean_global_val_buf,
thread_buffer_desc_m_1,
make_tuple(I0, I0),
tmp_mean_thread_buf);
threadwise_mean_var_load_m_k.Run(mean_var_count_grid_desc_m_k,
var_global_val_buf,
thread_buffer_desc_m_1,
make_tuple(I0, I0),
tmp_var_thread_buf);
threadwise_count_load_m_k.Run(mean_var_count_grid_desc_m_k,
count_global_val_buf,
thread_buffer_desc_m_1,
make_tuple(I0, I0),
tmp_count_thread_buf);
ThreadwiseWelford2::Run(tmp_mean_thread_buf,
tmp_var_thread_buf,
tmp_count_thread_buf,
mean_thread_buf,
var_thread_buf,
count_thread_buf);
reducedSize += KThreadClusterSize;
threadwise_mean_var_load_m_k.MoveSrcSliceWindow(mean_var_count_grid_desc_m_k,
mean_var_count_read_fwd_step_m_k);
threadwise_count_load_m_k.MoveSrcSliceWindow(mean_var_count_grid_desc_m_k,
mean_var_count_read_fwd_step_m_k);
};
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
if constexpr(I > 0)
block_sync_lds();
BlockwiseWelford2::Run(mean_thread_buf(I), var_thread_buf(I), count_thread_buf(I));
});
// Step 4: do normalization using the mean/variance
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> scale_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> bias_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
y_thread_buf;
auto threadwise_y_store =
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
YDataType,
decltype(thread_buffer_desc_m_k),
XYGridDesc_M_K,
YElementwiseOp,
ThreadBufferLengths_M_K,
ThreadBufferDimAccessOrder,
XSrcYDstVectorDim,
YDstVectorSize,
InMemoryDataOperationEnum::Set,
1,
true>(
y_grid_desc_m_k,
make_multi_index(
blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
block_local_id * reduceSizePerBlock + thread_k_cluster_id * KThreadSliceSize),
y_elementwise_op);
auto threadwise_scale_load =
ThreadwiseTensorSliceTransfer_v2<ScaleDataType,
AccDataType,
ScaleBiasGridDesc_M,
decltype(thread_buffer_desc_m),
ThreadBufferLengths_M,
Sequence<0>,
0,
ScaleSrcVectorSize,
1,
true>(
scale_grid_desc_m,
make_multi_index(blkgroup_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize));
auto threadwise_bias_load = ThreadwiseTensorSliceTransfer_v2<BiasDataType,
AccDataType,
ScaleBiasGridDesc_M,
decltype(thread_buffer_desc_m),
ThreadBufferLengths_M,
Sequence<0>,
0,
BiasSrcVectorSize,
1,
true>(
bias_grid_desc_m,
make_multi_index(blkgroup_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize));
const auto scale_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_scale, scale_grid_desc_m.GetElementSpaceSize());
const auto bias_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_bias, bias_grid_desc_m.GetElementSpaceSize());
auto y_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_y, y_grid_desc_m_k.GetElementSpaceSize());
threadwise_scale_load.Run(scale_grid_desc_m,
scale_global_val_buf,
thread_buffer_desc_m,
make_tuple(I0),
scale_thread_buf);
threadwise_bias_load.Run(bias_grid_desc_m,
bias_global_val_buf,
thread_buffer_desc_m,
make_tuple(I0),
bias_thread_buf);
threadwise_x_load.SetSrcSliceOrigin(
x_grid_desc_m_k,
make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
block_local_id * reduceSizePerBlock +
thread_k_cluster_id * KThreadSliceSize));
for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles)
{
threadwise_x_load.Run(x_grid_desc_m_k,
x_global_val_buf,
thread_buffer_desc_m_k,
make_tuple(I0, I0),
x_thread_buf);
static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
AccDataType multiplier =
scale_thread_buf[Number<iM>{}] / sqrt(var_thread_buf[iM] + epsilon);
AccDataType fused_mean_bias =
bias_thread_buf[Number<iM>{}] - mean_thread_buf[iM] * multiplier;
static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
constexpr auto offset =
thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK));
// normalize
y_thread_buf(Number<offset>{}) =
x_thread_buf[Number<offset>{}] * multiplier + fused_mean_bias;
});
});
threadwise_y_store.Run(thread_buffer_desc_m_k,
make_tuple(I0, I0),
y_thread_buf,
y_grid_desc_m_k,
y_global_val_buf);
threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, xy_copy_fwd_step_m_k);
threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, xy_copy_fwd_step_m_k);
}
// Step 5: update the moving average of mean and variance (optional)
if(updateMovingAverage && block_local_id == 0 && thread_k_cluster_id == 0)
{
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>
running_mean_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>
running_var_thread_buf;
auto running_mean_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
resultRunningMean, mean_var_grid_desc_m.GetElementSpaceSize());
auto running_var_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
resultRunningVariance, mean_var_grid_desc_m.GetElementSpaceSize());
auto threadwise_mean_var_load =
ThreadwiseTensorSliceTransfer_v2<MeanVarDataType,
AccDataType,
MeanVarGridDesc_M,
decltype(thread_buffer_desc_m),
ThreadBufferLengths_M,
Sequence<0>,
0,
MeanVarSrcDstVectorSize,
1,
true>(
mean_var_grid_desc_m,
make_multi_index(blkgroup_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize));
threadwise_mean_var_load.Run(mean_var_grid_desc_m,
running_mean_global_buf,
thread_buffer_desc_m,
make_tuple(I0),
running_mean_thread_buf);
threadwise_mean_var_load.Run(mean_var_grid_desc_m,
running_var_global_buf,
thread_buffer_desc_m,
make_tuple(I0),
running_var_thread_buf);
AccDataType oneMinusAverageFactor = type_convert<AccDataType>(1.0) - averageFactor;
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
running_mean_thread_buf(I) = running_mean_thread_buf[I] * oneMinusAverageFactor +
mean_thread_buf[I] * averageFactor;
running_var_thread_buf(I) = running_var_thread_buf[I] * oneMinusAverageFactor +
var_thread_buf[I] * averageFactor;
});
auto threadwise_mean_var_store =
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
MeanVarDataType,
decltype(thread_buffer_desc_m),
MeanVarGridDesc_M,
PassThroughOp,
ThreadBufferLengths_M,
Sequence<0>,
0,
MeanVarSrcDstVectorSize,
InMemoryDataOperationEnum::Set,
1,
true>(
mean_var_grid_desc_m,
make_multi_index(blkgroup_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize),
PassThroughOp{});
threadwise_mean_var_store.Run(thread_buffer_desc_m,
make_tuple(I0),
running_mean_thread_buf,
mean_var_grid_desc_m,
running_mean_global_buf);
threadwise_mean_var_store.Run(thread_buffer_desc_m,
make_tuple(I0),
running_var_thread_buf,
mean_var_grid_desc_m,
running_var_global_buf);
};
// Step 6: save mean and inv-variance (optional)
if(saveMeanInvVariance && block_local_id == 0 && thread_k_cluster_id == 0)
{
auto result_mean_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
resultSaveMean, mean_var_grid_desc_m.GetElementSpaceSize());
auto result_inv_var_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
resultSaveInvVariance, mean_var_grid_desc_m.GetElementSpaceSize());
// calculate inv-variance as 1/sqrt(epsilon+variance), stored in place of variance
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
var_thread_buf(I) =
type_convert<AccDataType>(1.0f) / sqrt(epsilon + var_thread_buf[I]);
});
auto threadwise_mean_inv_var_store =
ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
MeanVarDataType,
decltype(thread_buffer_desc_m),
MeanVarGridDesc_M,
PassThroughOp,
ThreadBufferLengths_M,
Sequence<0>,
0,
MeanVarSrcDstVectorSize,
InMemoryDataOperationEnum::Set,
1,
true>(
mean_var_grid_desc_m,
make_multi_index(blkgroup_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize),
PassThroughOp{});
threadwise_mean_inv_var_store.Run(thread_buffer_desc_m,
make_tuple(I0),
mean_thread_buf,
mean_var_grid_desc_m,
result_mean_global_buf);
threadwise_mean_inv_var_store.Run(thread_buffer_desc_m,
make_tuple(I0),
var_thread_buf,
mean_var_grid_desc_m,
result_inv_var_global_buf);
};
}
}; // struct GridwiseMultiblockBatchNormForward
} // namespace ck
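Editor's note: Steps 1-3 above form a two-level parallel Welford reduction: each workgroup computes a partial (mean, variance accumulator, count) over its tiles, publishes it to workspace, synchronizes through the gms_* inter-block barrier, and then every workgroup merges all partials before normalizing. The merge follows the standard parallel Welford combination (Chan et al.); the sketch below shows that idea on the host and is not a copy of CK's ThreadwiseWelfordMerge.

#include <cstdio>

// One partial Welford result: mean, M2 (sum of squared deviations), count.
struct Partial { double mean; double m2; long count; };

// Standard parallel Welford merge of two partials (Chan et al.).
Partial merge(Partial a, Partial b)
{
    const long   n     = a.count + b.count;
    const double delta = b.mean - a.mean;
    Partial r;
    r.count = n;
    r.mean  = a.mean + delta * static_cast<double>(b.count) / n;
    r.m2    = a.m2 + b.m2 +
              delta * delta * static_cast<double>(a.count) * b.count / n;
    return r;
}

int main()
{
    // Two "workgroups" reduce the halves of {1, 2, 3, 4, 5, 6}.
    Partial a{2.0, 2.0, 3}; // data {1, 2, 3}: mean 2, M2 = 2
    Partial b{5.0, 2.0, 3}; // data {4, 5, 6}: mean 5, M2 = 2
    Partial r = merge(a, b);
    std::printf("mean = %f, variance = %f, count = %ld\n",
                r.mean, r.m2 / r.count, r.count); // mean 3.5, variance ~2.9167
}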
...@@ -161,7 +161,7 @@ struct GridwiseMultiblockWelfordFirstHalf ...@@ -161,7 +161,7 @@ struct GridwiseMultiblockWelfordFirstHalf
PassThroughOp, PassThroughOp,
ThreadBufferLengths_M_1, ThreadBufferLengths_M_1,
Sequence<0, 1>, Sequence<0, 1>,
1, 0,
1, 1,
InMemoryDataOperationEnum::Set, InMemoryDataOperationEnum::Set,
1, 1,
...@@ -180,7 +180,7 @@ struct GridwiseMultiblockWelfordFirstHalf ...@@ -180,7 +180,7 @@ struct GridwiseMultiblockWelfordFirstHalf
PassThroughOp, PassThroughOp,
ThreadBufferLengths_M_1, ThreadBufferLengths_M_1,
Sequence<0, 1>, Sequence<0, 1>,
1, 0,
1, 1,
InMemoryDataOperationEnum::Set, InMemoryDataOperationEnum::Set,
1, 1,
......
...@@ -33,7 +33,6 @@ __global__ void kernel_welford_second_half_batchnorm_forward_final( ...@@ -33,7 +33,6 @@ __global__ void kernel_welford_second_half_batchnorm_forward_final(
const MeanVarGridDesc_M mean_var_grid_desc_m, const MeanVarGridDesc_M mean_var_grid_desc_m,
index_t blkgroup_size, index_t blkgroup_size,
index_t num_xy_k_block_tile_iteration, index_t num_xy_k_block_tile_iteration,
index_t num_mean_var_count_k_block_tile_iteration,
AccDataType epsilon, AccDataType epsilon,
const MeanVarDataType* const __restrict__ p_in_welford_mean, const MeanVarDataType* const __restrict__ p_in_welford_mean,
const MeanVarDataType* const __restrict__ p_in_welford_variance, const MeanVarDataType* const __restrict__ p_in_welford_variance,
...@@ -59,7 +58,6 @@ __global__ void kernel_welford_second_half_batchnorm_forward_final( ...@@ -59,7 +58,6 @@ __global__ void kernel_welford_second_half_batchnorm_forward_final(
mean_var_grid_desc_m, mean_var_grid_desc_m,
blkgroup_size, blkgroup_size,
num_xy_k_block_tile_iteration, num_xy_k_block_tile_iteration,
num_mean_var_count_k_block_tile_iteration,
epsilon, epsilon,
p_in_welford_mean, p_in_welford_mean,
p_in_welford_variance, p_in_welford_variance,
...@@ -152,7 +150,6 @@ struct GridwiseWelfordSecondHalfBatchNormForwardFinal ...@@ -152,7 +150,6 @@ struct GridwiseWelfordSecondHalfBatchNormForwardFinal
const MeanVarGridDesc_M& mean_var_grid_desc_m, const MeanVarGridDesc_M& mean_var_grid_desc_m,
index_t blkgroup_size, index_t blkgroup_size,
index_t num_xy_k_block_tile_iteration, index_t num_xy_k_block_tile_iteration,
index_t num_mean_var_count_k_block_tile_iteration,
AccDataType epsilon, AccDataType epsilon,
const MeanVarDataType* const __restrict__ p_in_welford_mean, const MeanVarDataType* const __restrict__ p_in_welford_mean,
const MeanVarDataType* const __restrict__ p_in_welford_variance, const MeanVarDataType* const __restrict__ p_in_welford_variance,
...@@ -223,7 +220,7 @@ struct GridwiseWelfordSecondHalfBatchNormForwardFinal ...@@ -223,7 +220,7 @@ struct GridwiseWelfordSecondHalfBatchNormForwardFinal
decltype(thread_buffer_desc_m_1), decltype(thread_buffer_desc_m_1),
ThreadBufferLengths_M_1, ThreadBufferLengths_M_1,
Sequence<0, 1>, Sequence<0, 1>,
1, 0,
1, 1,
1, 1,
true>( true>(
...@@ -239,7 +236,7 @@ struct GridwiseWelfordSecondHalfBatchNormForwardFinal ...@@ -239,7 +236,7 @@ struct GridwiseWelfordSecondHalfBatchNormForwardFinal
decltype(thread_buffer_desc_m_1), decltype(thread_buffer_desc_m_1),
ThreadBufferLengths_M_1, ThreadBufferLengths_M_1,
Sequence<0, 1>, Sequence<0, 1>,
1, 0,
1, 1,
1, 1,
true>( true>(
...@@ -257,9 +254,6 @@ struct GridwiseWelfordSecondHalfBatchNormForwardFinal ...@@ -257,9 +254,6 @@ struct GridwiseWelfordSecondHalfBatchNormForwardFinal
const auto welford_count_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>( const auto welford_count_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_in_welford_count, mean_var_count_grid_desc_m_k.GetElementSpaceSize()); p_in_welford_count, mean_var_count_grid_desc_m_k.GetElementSpaceSize());
constexpr auto mean_var_count_thread_copy_step_m_k =
make_multi_index(0, KThreadClusterSize * 1);
// Step 1: do final welford reduction to get mean and variance // Step 1: do final welford reduction to get mean and variance
static_for<0, MThreadSliceSize, 1>{}([&](auto I) { static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
...@@ -268,8 +262,11 @@ struct GridwiseWelfordSecondHalfBatchNormForwardFinal ...@@ -268,8 +262,11 @@ struct GridwiseWelfordSecondHalfBatchNormForwardFinal
welford_count_thread_buf(I) = 0; welford_count_thread_buf(I) = 0;
}); });
for(index_t reducedTiles = 0; reducedTiles < num_mean_var_count_k_block_tile_iteration; constexpr auto mean_var_count_thread_copy_step_m_k =
++reducedTiles) make_multi_index(0, KThreadClusterSize);
int32_t reducedSize = 0;
while(reducedSize < blkgroup_size)
{ {
threadwise_mean_var_load_m_k.Run(mean_var_count_grid_desc_m_k, threadwise_mean_var_load_m_k.Run(mean_var_count_grid_desc_m_k,
welford_mean_global_val_buf, welford_mean_global_val_buf,
...@@ -296,6 +293,8 @@ struct GridwiseWelfordSecondHalfBatchNormForwardFinal ...@@ -296,6 +293,8 @@ struct GridwiseWelfordSecondHalfBatchNormForwardFinal
welford_var_thread_buf, welford_var_thread_buf,
welford_count_thread_buf); welford_count_thread_buf);
reducedSize += KThreadClusterSize;
threadwise_mean_var_load_m_k.MoveSrcSliceWindow(mean_var_count_grid_desc_m_k, threadwise_mean_var_load_m_k.MoveSrcSliceWindow(mean_var_count_grid_desc_m_k,
mean_var_count_thread_copy_step_m_k); mean_var_count_thread_copy_step_m_k);
threadwise_count_load_m_k.MoveSrcSliceWindow(mean_var_count_grid_desc_m_k, threadwise_count_load_m_k.MoveSrcSliceWindow(mean_var_count_grid_desc_m_k,
......
...@@ -3,6 +3,8 @@ ...@@ -3,6 +3,8 @@
#pragma once #pragma once
#include <iostream>
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp"
......
...@@ -79,6 +79,10 @@ struct GridwiseGemmPipeline_v2 ...@@ -79,6 +79,10 @@ struct GridwiseGemmPipeline_v2
do do
{ {
#if CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT
__builtin_amdgcn_iglp_opt(CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT);
#endif
block_sync_lds(); block_sync_lds();
// GEMM i // GEMM i
......
...@@ -27,6 +27,9 @@ template <typename GridwiseGemm, ...@@ -27,6 +27,9 @@ template <typename GridwiseGemm,
__global__ void __global__ void
#if CK_USE_LAUNCH_BOUNDS #if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
#if CK_USE_WAVES_PER_EU
__attribute__((amdgpu_waves_per_eu(CK_MIN_WAVES_PER_EU, CK_MAX_WAVES_PER_EU)))
#endif #endif
kernel_gemm_xdlops_v2r3(const FloatAB* __restrict__ p_a_grid, kernel_gemm_xdlops_v2r3(const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid, const FloatAB* __restrict__ p_b_grid,
...@@ -60,6 +63,9 @@ template <typename GridwiseGemm, bool HasMainKBlockLoop> ...@@ -60,6 +63,9 @@ template <typename GridwiseGemm, bool HasMainKBlockLoop>
__global__ void __global__ void
#if CK_USE_LAUNCH_BOUNDS #if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
#if CK_USE_WAVES_PER_EU
__attribute__((amdgpu_waves_per_eu(CK_MIN_WAVES_PER_EU, CK_MAX_WAVES_PER_EU)))
#endif #endif
kernel_gemm_xdlops_v2r3(const typename GridwiseGemm::Argument karg) kernel_gemm_xdlops_v2r3(const typename GridwiseGemm::Argument karg)
{ {
......
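Editor's note: both kernel_gemm_xdlops_v2r3 declarations now carry an amdgpu_waves_per_eu occupancy hint next to the existing __launch_bounds__ guard, each behind its own macro switch. A minimal HIP sketch of the two attributes combined on a trivial kernel; the numeric values are examples, not CK's CK_MAX_THREAD_PER_BLOCK / CK_MIN_WAVES_PER_EU / CK_MAX_WAVES_PER_EU settings.

#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void
__launch_bounds__(256, 2)                   // cap threads/block at 256, target >= 2 blocks per CU
__attribute__((amdgpu_waves_per_eu(2, 4)))  // keep 2-4 waves resident per execution unit
fill_ones(float* out)
{
    out[blockIdx.x * blockDim.x + threadIdx.x] = 1.0f;
}

int main()
{
    float* d_out = nullptr;
    (void)hipMalloc(&d_out, 4 * 256 * sizeof(float));
    fill_ones<<<4, 256>>>(d_out);
    (void)hipDeviceSynchronize();
    (void)hipFree(d_out);
    std::printf("done\n");
}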
...@@ -29,7 +29,9 @@ enum struct MfmaInstr ...@@ -29,7 +29,9 @@ enum struct MfmaInstr
mfma_i32_16x16x16i8, mfma_i32_16x16x16i8,
mfma_i32_32x32x16i8, mfma_i32_32x32x16i8,
mfma_i32_16x16x32i8, mfma_i32_16x16x32i8,
mfma_f64_16x16x4f64 mfma_f64_16x16x4f64,
mfma_f32_32x32x16f8f8,
mfma_f32_16x16x32f8f8
}; };
template <MfmaInstr instr> template <MfmaInstr instr>
...@@ -454,6 +456,50 @@ struct mfma_type<MfmaInstr::mfma_f64_16x16x4f64> ...@@ -454,6 +456,50 @@ struct mfma_type<MfmaInstr::mfma_f64_16x16x4f64>
} }
}; };
template <>
struct mfma_type<MfmaInstr::mfma_f32_32x32x16f8f8>
{
static constexpr index_t group_size = 4;
static constexpr index_t num_groups_per_blk = 4;
static constexpr index_t num_regs_per_blk = 16;
static constexpr index_t num_threads_per_blk = 32;
static constexpr index_t wave_size = 64;
static constexpr index_t num_input_blks = 2;
static constexpr index_t num_output_blks = 1;
static constexpr index_t m_per_blk = 32;
static constexpr index_t n_per_blk = 32;
static constexpr index_t k_per_blk = 8;
static constexpr bool is_k_reduction = true;
template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
__device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
{
intrin_mfma_f32_32x32x16f8f8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
}
};
template <>
struct mfma_type<MfmaInstr::mfma_f32_16x16x32f8f8>
{
static constexpr index_t group_size = 4;
static constexpr index_t num_groups_per_blk = 1;
static constexpr index_t num_regs_per_blk = 4;
static constexpr index_t num_threads_per_blk = 16;
static constexpr index_t wave_size = 64;
static constexpr index_t num_input_blks = 4;
static constexpr index_t num_output_blks = 1;
static constexpr index_t m_per_blk = 16;
static constexpr index_t n_per_blk = 16;
static constexpr index_t k_per_blk = 8;
static constexpr bool is_k_reduction = true;
template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
__device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
{
intrin_mfma_f32_16x16x32f8f8<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
}
};
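Editor's note: the two new mfma_type entries describe the gfx94x FP8 MFMA instructions, and their fields obey the same invariants as the existing entries: per-lane accumulator registers equal m_per_blk * n_per_blk / wave_size, and with is_k_reduction the K extent reduced by one instruction equals k_per_blk * num_input_blks. A standalone compile-time check of that arithmetic, with plain ints standing in for index_t:

// Self-consistency of the FP8 MFMA tile parameters added above.
struct mfma_shape
{
    int m, n, k_per_blk, num_input_blks, num_regs_per_blk, num_threads_per_blk, wave_size;
};

constexpr mfma_shape f32_32x32x16f8f8{32, 32, 8, 2, 16, 32, 64};
constexpr mfma_shape f32_16x16x32f8f8{16, 16, 8, 4, 4, 16, 64};

constexpr bool consistent(const mfma_shape& s, int k_total)
{
    return s.num_regs_per_blk == s.m * s.n / s.wave_size && // accumulator VGPRs per lane
           s.num_threads_per_blk == s.n &&                  // lanes covering one block
           s.k_per_blk * s.num_input_blks == k_total;       // K reduced by one instruction
}

static_assert(consistent(f32_32x32x16f8f8, 16), "mfma_f32_32x32x16_fp8_fp8 shape mismatch");
static_assert(consistent(f32_16x16x32f8f8, 32), "mfma_f32_16x16x32_fp8_fp8 shape mismatch");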
template <typename base_type, index_t MPerXdlops, index_t NPerXdlops> template <typename base_type, index_t MPerXdlops, index_t NPerXdlops>
struct MfmaSelector struct MfmaSelector
{ {
...@@ -594,6 +640,18 @@ struct MfmaSelector ...@@ -594,6 +640,18 @@ struct MfmaSelector
} }
#endif #endif
template <>
static constexpr auto GetMfma<f8_t, 32, 32>()
{
return MfmaInstr::mfma_f32_32x32x16f8f8;
}
template <>
static constexpr auto GetMfma<f8_t, 16, 16>()
{
return MfmaInstr::mfma_f32_16x16x32f8f8;
}
static constexpr auto selected_mfma = mfma_type<GetMfma<base_type, MPerXdlops, NPerXdlops>()>{}; static constexpr auto selected_mfma = mfma_type<GetMfma<base_type, MPerXdlops, NPerXdlops>()>{};
__host__ __device__ constexpr MfmaSelector() __host__ __device__ constexpr MfmaSelector()
...@@ -794,7 +852,7 @@ struct XdlopsGemm ...@@ -794,7 +852,7 @@ struct XdlopsGemm
{ {
static_assert(is_same<base_type, double>::value || is_same<base_type, float>::value || static_assert(is_same<base_type, double>::value || is_same<base_type, float>::value ||
is_same<base_type, half_t>::value || is_same<base_type, bhalf_t>::value || is_same<base_type, half_t>::value || is_same<base_type, bhalf_t>::value ||
is_same<base_type, int8_t>::value, is_same<base_type, int8_t>::value || is_same<base_type, f8_t>::value,
"base base_type must be double, float, half, bfloat16, and int8_t!"); "base base_type must be double, float, half, bfloat16, and int8_t!");
static_for<0, KPack / mfma_instr.k_per_blk, 1>{}([&](auto k) { static_for<0, KPack / mfma_instr.k_per_blk, 1>{}([&](auto k) {
......
...@@ -1114,13 +1114,30 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave, ...@@ -1114,13 +1114,30 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,
#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK #if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
uint32_t src_addr_shift = src_thread_element_valid ? 0 : 0x80000000; uint32_t src_addr_shift = src_thread_element_valid ? 0 : 0x80000000;
    // f8_t is a 1-byte storage type, so the load can be routed through the
    // int8 buffer intrinsic and the payload bit_cast back without any conversion.
    if constexpr(is_same<scalar_t, f8_t>::value)
    {
        auto tmp = amd_buffer_load_impl<int8_t, vector_size, coherence>(
            src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
        return bit_cast<vector_t>(tmp);
    }
    else
    {
        return amd_buffer_load_impl<scalar_t, vector_size, coherence>(
            src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
    }
#else
    if constexpr(is_same<scalar_t, f8_t>::value)
    {
        auto tmp = amd_buffer_load_impl<int8_t, vector_size, coherence>(
            src_wave_buffer_resource, src_thread_addr_offset, 0);
        return src_thread_element_valid ? bit_cast<vector_t>(tmp) : vector_t(0);
    }
    else
    {
        vector_t tmp = amd_buffer_load_impl<scalar_t, vector_size, coherence>(
            src_wave_buffer_resource, src_thread_addr_offset, 0);
        return src_thread_element_valid ? tmp : vector_t(0);
    }
#endif
}
...@@ -1179,13 +1196,33 @@ __device__ void amd_buffer_store(const typename vector_type_maker<T, N>::type::t
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
    uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;

    if constexpr(is_same<scalar_t, f8_t>::value)
    {
        auto tmp =
            bit_cast<typename vector_type_maker<int8_t, vector_size>::type::type>(src_thread_data);
        amd_buffer_store_impl<int8_t, vector_size, coherence>(
            tmp, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
    }
    else
    {
        amd_buffer_store_impl<scalar_t, vector_size, coherence>(
            src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
    }
#else
    if(dst_thread_element_valid)
    {
        if constexpr(is_same<scalar_t, f8_t>::value)
        {
            auto tmp = bit_cast<typename vector_type_maker<int8_t, vector_size>::type::type>(
                src_thread_data);
            amd_buffer_store_impl<int8_t, vector_size, coherence>(
                tmp, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
        }
        else
        {
            amd_buffer_store_impl<scalar_t, vector_size, coherence>(
                src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
        }
    }
#endif
}
......
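The buffer load/store changes above never convert fp8 data; they only re-route the transfer through the int8 intrinsics and bit_cast the payload back. A small standalone C++20 sketch (uint8_t standing in for f8_t, std::bit_cast standing in for ck::bit_cast) shows why that round trip is byte-exact:

#include <array>
#include <bit>
#include <cstdint>

int main()
{
    // Four packed "fp8" bytes, as they would sit in a register after a buffer load.
    std::array<uint8_t, 4> f8_payload{0x3c, 0xc2, 0x7f, 0x00};

    // Store path: reinterpret as int8 before handing the data to the int8 intrinsic.
    auto as_int8 = std::bit_cast<std::array<int8_t, 4>>(f8_payload);

    // Load path: reinterpret the int8 result back to the fp8 element type.
    auto round_trip = std::bit_cast<std::array<uint8_t, 4>>(as_int8);

    // bit_cast only changes the static type; every byte is preserved.
    return round_trip == f8_payload ? 0 : 1;
}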
...@@ -354,5 +354,68 @@ struct intrin_mfma_f64_16x16x4f64<16, 16>
#endif
    }
};
template <index_t MPerWave, index_t NPerWave>
struct intrin_mfma_f32_32x32x16f8f8;
template <>
struct intrin_mfma_f32_32x32x16f8f8<32, 32>
{
template <class FloatC>
__device__ static void Run(const f8x8_t& reg_a, const f8x8_t& reg_b, FloatC& reg_c)
{
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
reg_c.template AsType<float16_t>()(Number<0>{}) =
__builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(
bit_cast<long>(reg_a),
bit_cast<long>(reg_b),
reg_c.template AsType<float16_t>()[Number<0>{}],
0,
0,
0);
#else // no native fp8 MFMA on this target: emulate with one scalar fp32 MFMA per k element
vector_type<f8_t, 8> reg_a_v(reg_a);
vector_type<f8_t, 8> reg_b_v(reg_b);
static_for<0, 8, 1>{}([&](auto k) {
float reg_a_f32 = type_convert<float>(reg_a_v.template AsType<f8_t>()[Number<k>{}]);
float reg_b_f32 = type_convert<float>(reg_b_v.template AsType<f8_t>()[Number<k>{}]);
intrin_mfma_f32_32x32x2f32<32, 32>::Run(reg_a_f32, reg_b_f32, reg_c);
});
#endif
}
};
template <index_t MPerWave, index_t NPerWave>
struct intrin_mfma_f32_16x16x32f8f8;
template <>
struct intrin_mfma_f32_16x16x32f8f8<16, 16>
{
template <class FloatC>
__device__ static void Run(const f8x8_t& reg_a, const f8x8_t& reg_b, FloatC& reg_c)
{
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
reg_c.template AsType<float4_t>()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(
bit_cast<long>(reg_a),
bit_cast<long>(reg_b),
reg_c.template AsType<float4_t>()[Number<0>{}],
0,
0,
0);
#else // no native fp8 MFMA on this target: emulate with one scalar fp32 MFMA per k element
vector_type<f8_t, 8> reg_a_v(reg_a);
vector_type<f8_t, 8> reg_b_v(reg_b);
static_for<0, 8, 1>{}([&](auto k) {
float reg_a_f32 = type_convert<float>(reg_a_v.template AsType<f8_t>()[Number<k>{}]);
float reg_b_f32 = type_convert<float>(reg_b_v.template AsType<f8_t>()[Number<k>{}]);
intrin_mfma_f32_16x16x4f32<16, 16>::Run(reg_a_f32, reg_b_f32, reg_c);
});
#endif
}
};
} // namespace ck
#endif
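On targets without the gfx94x builtins, the #else branches above emulate the fp8 MFMA by issuing one scalar fp32 MFMA per k element, all accumulating into the same registers. Per accumulator element that reduces to an ordinary dot product over the eight converted operands; a host-side sketch of that reduction, with plain floats standing in for the type_convert<float>(f8_t) results:

#include <array>
#include <cassert>

// Mirrors the static_for<0, 8, 1> loops in the fallback paths: each k step
// multiplies one converted operand pair and accumulates into the same register.
float emulated_fp8_dot8(const std::array<float, 8>& a, const std::array<float, 8>& b)
{
    float reg_c = 0.0f;
    for(int k = 0; k < 8; ++k)
        reg_c += a[k] * b[k];
    return reg_c;
}

int main()
{
    std::array<float, 8> a{0.5f, 1.0f, 1.5f, 2.0f, 2.5f, 3.0f, 3.5f, 4.0f};
    std::array<float, 8> b{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
    assert(emulated_fp8_dot8(a, b) == 18.0f); // 0.5 + 1 + 1.5 + 2 + 2.5 + 3 + 3.5 + 4
    return 0;
}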
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
namespace ck {
template <index_t N>
static constexpr __device__ index_t get_shift()
{
return (get_shift<N / 2>() + 1);
};
template <>
constexpr __device__ index_t get_shift<1>()
{
return (0);
}
} // namespace ck
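The get_shift recursion above computes log2(N) for power-of-two N: get_shift<1>() is 0, otherwise the result is one plus the shift of N/2. A quick compile-time check, assuming this header is included; since the function is __device__-qualified, the check belongs in device code:

// Sketch: expected values of the recursion for a few powers of two.
__device__ void check_get_shift()
{
    static_assert(ck::get_shift<1>() == 0, "log2(1) == 0");
    static_assert(ck::get_shift<2>() == 1, "log2(2) == 1");
    static_assert(ck::get_shift<8>() == 3, "log2(8) == 3");
    static_assert(ck::get_shift<64>() == 6, "log2(64) == 6");
}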