Commit e287475b authored by Po-Yen, Chen

Merge branch 'feature/integrage-karg-simplification-pr' of github.com:ROCmSoftwarePlatform/composable_kernel into feature/integrage-karg-simplification-pr
parents 64b9b6a0 845dce3f
@@ -16,6 +16,7 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/numeric.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -74,141 +75,6 @@ using DeviceOpInstanceMNNN = ck::tensor_operation::device::
using DeviceOpInstance = DeviceOpInstanceKKNN;
// hardcoded for NumDimM == NumDimN == NumDimK == 2
template <ck::index_t NumDimM,
ck::index_t NumDimN,
ck::index_t NumDimK,
typename ADataType,
typename BDataType,
typename EDataType,
typename AccDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation,
ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::BaseOperator
{
// Argument
struct Argument : public ck::tensor_operation::device::BaseArgument
{
Argument(const Tensor<ADataType>& a_ms_ks,
const Tensor<BDataType>& b_ns_ks,
Tensor<EDataType>& e_ms_ns,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
: a_ms_ks_{a_ms_ks},
b_ns_ks_{b_ns_ks},
e_ms_ns_{e_ms_ns},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
cde_element_op_{cde_element_op}
{
}
const Tensor<ADataType>& a_ms_ks_;
const Tensor<BDataType>& b_ns_ks_;
Tensor<EDataType>& e_ms_ns_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CDEElementwiseOperation cde_element_op_;
};
// Invoker
struct Invoker : public ck::tensor_operation::device::BaseInvoker
{
using Argument = ReferenceContraction_M2_N2_K2::Argument;
float Run(const Argument& arg)
{
auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) {
const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2];
const int K1 = arg.a_ms_ks_.mDesc.GetLengths()[3];
AccDataType v_acc = 0;
for(int k0 = 0; k0 < K0; ++k0)
{
for(int k1 = 0; k1 < K1; ++k1)
{
AccDataType v_a;
AccDataType v_b;
arg.a_element_op_(
v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
arg.b_element_op_(
v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));
v_acc += v_a * v_b;
}
}
AccDataType v_c;
arg.cde_element_op_(v_c, v_acc);
arg.e_ms_ns_(m0, m1, n0, n1) = v_c;
};
make_ParallelTensorFunctor(f_ms_ns,
arg.e_ms_ns_.mDesc.GetLengths()[0],
arg.e_ms_ns_.mDesc.GetLengths()[1],
arg.e_ms_ns_.mDesc.GetLengths()[2],
arg.e_ms_ns_.mDesc.GetLengths()[3])(
std::thread::hardware_concurrency());
return 0;
}
float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
{
return true;
}
static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
const Tensor<BDataType>& b_ns_ks,
Tensor<EDataType>& e_ms_ns,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
{
return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
{
return std::make_unique<Invoker>(Invoker{});
}
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "ReferenceContraction_M2_N2_K2"
<< std::endl;
// clang-format on
return str.str();
}
};
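In formula form, the ReferenceContraction_M2_N2_K2::Invoker above computes, for every output index (with $a$, $b$, and $\mathrm{cde}$ denoting the A/B/CDE elementwise operations),

$$E_{m_0 m_1 n_0 n_1} = \mathrm{cde}\!\left(\sum_{k_0=0}^{K_0-1}\sum_{k_1=0}^{K_1-1} a\big(A_{m_0 m_1 k_0 k_1}\big)\, b\big(B_{n_0 n_1 k_0 k_1}\big)\right),$$

i.e. a contraction over the two K dimensions shared by A and B.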
int main(int argc, char* argv[])
{
bool do_verification = true;
@@ -385,22 +251,22 @@ int main(int argc, char* argv[])
{
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
CShuffleDataType,
AccDataType,
AElementOp,
BElementOp,
PassThrough>;
auto ref_gemm = ReferenceOpInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
using ReferenceOpInstance =
ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
CShuffleDataType,
AccDataType,
AElementOp,
BElementOp>;
auto ref_op = ReferenceOpInstance{};
auto ref_invoker = ref_op.MakeInvoker();
auto ref_argument =
ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op);
ref_invoker.Run(ref_argument);
@@ -16,6 +16,7 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/numeric.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -74,141 +75,6 @@ using DeviceOpInstanceMNNN = ck::tensor_operation::device::
using DeviceOpInstance = DeviceOpInstanceKKNN;
// hardcoded for NumDimM == NumDimN == NumDimK == 2
template <ck::index_t NumDimM,
ck::index_t NumDimN,
ck::index_t NumDimK,
typename ADataType,
typename BDataType,
typename EDataType,
typename AccDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation,
ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::BaseOperator
{
// Argument
struct Argument : public ck::tensor_operation::device::BaseArgument
{
Argument(const Tensor<ADataType>& a_ms_ks,
const Tensor<BDataType>& b_ns_ks,
Tensor<EDataType>& e_ms_ns,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
: a_ms_ks_{a_ms_ks},
b_ns_ks_{b_ns_ks},
e_ms_ns_{e_ms_ns},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
cde_element_op_{cde_element_op}
{
}
const Tensor<ADataType>& a_ms_ks_;
const Tensor<BDataType>& b_ns_ks_;
Tensor<EDataType>& e_ms_ns_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CDEElementwiseOperation cde_element_op_;
};
// Invoker
struct Invoker : public ck::tensor_operation::device::BaseInvoker
{
using Argument = ReferenceContraction_M2_N2_K2::Argument;
float Run(const Argument& arg)
{
auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) {
const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2];
const int K1 = arg.a_ms_ks_.mDesc.GetLengths()[3];
AccDataType v_acc = 0;
for(int k0 = 0; k0 < K0; ++k0)
{
for(int k1 = 0; k1 < K1; ++k1)
{
AccDataType v_a;
AccDataType v_b;
arg.a_element_op_(
v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
arg.b_element_op_(
v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));
v_acc += v_a * v_b;
}
}
AccDataType v_c;
arg.cde_element_op_(v_c, v_acc);
arg.e_ms_ns_(m0, m1, n0, n1) = v_c;
};
make_ParallelTensorFunctor(f_ms_ns,
arg.e_ms_ns_.mDesc.GetLengths()[0],
arg.e_ms_ns_.mDesc.GetLengths()[1],
arg.e_ms_ns_.mDesc.GetLengths()[2],
arg.e_ms_ns_.mDesc.GetLengths()[3])(
std::thread::hardware_concurrency());
return 0;
}
float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
{
return true;
}
static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
const Tensor<BDataType>& b_ns_ks,
Tensor<EDataType>& e_ms_ns,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
{
return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
{
return std::make_unique<Invoker>(Invoker{});
}
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "ReferenceContraction_M2_N2_K2"
<< std::endl;
// clang-format on
return str.str();
}
};
int main(int argc, char* argv[])
{
bool do_verification = true;
@@ -385,22 +251,22 @@ int main(int argc, char* argv[])
{
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
CShuffleDataType,
AccDataType,
AElementOp,
BElementOp,
PassThrough>;
auto ref_gemm = ReferenceOpInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
using ReferenceOpInstance =
ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
CShuffleDataType,
AccDataType,
AElementOp,
BElementOp>;
auto ref_op = ReferenceOpInstance{};
auto ref_invoker = ref_op.MakeInvoker();
auto ref_argument =
ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op);
ref_invoker.Run(ref_argument);
@@ -16,6 +16,7 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/numeric.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -73,141 +74,6 @@ using DeviceOpInstanceMNN = ck::tensor_operation::device::
using DeviceOpInstance = DeviceOpInstanceKKN;
// hardcoded for NumDimM == NumDimN == NumDimK == 2
template <ck::index_t NumDimM,
ck::index_t NumDimN,
ck::index_t NumDimK,
typename ADataType,
typename BDataType,
typename EDataType,
typename AccDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation,
ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::BaseOperator
{
// Argument
struct Argument : public ck::tensor_operation::device::BaseArgument
{
Argument(const Tensor<ADataType>& a_ms_ks,
const Tensor<BDataType>& b_ns_ks,
Tensor<EDataType>& e_ms_ns,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
: a_ms_ks_{a_ms_ks},
b_ns_ks_{b_ns_ks},
e_ms_ns_{e_ms_ns},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
cde_element_op_{cde_element_op}
{
}
const Tensor<ADataType>& a_ms_ks_;
const Tensor<BDataType>& b_ns_ks_;
Tensor<EDataType>& e_ms_ns_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CDEElementwiseOperation cde_element_op_;
};
// Invoker
struct Invoker : public ck::tensor_operation::device::BaseInvoker
{
using Argument = ReferenceContraction_M2_N2_K2::Argument;
float Run(const Argument& arg)
{
auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) {
const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2];
const int K1 = arg.a_ms_ks_.mDesc.GetLengths()[3];
AccDataType v_acc = 0;
for(int k0 = 0; k0 < K0; ++k0)
{
for(int k1 = 0; k1 < K1; ++k1)
{
AccDataType v_a;
AccDataType v_b;
arg.a_element_op_(
v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
arg.b_element_op_(
v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));
v_acc += v_a * v_b;
}
}
AccDataType v_c;
arg.cde_element_op_(v_c, v_acc);
arg.e_ms_ns_(m0, m1, n0, n1) = v_c;
};
make_ParallelTensorFunctor(f_ms_ns,
arg.e_ms_ns_.mDesc.GetLengths()[0],
arg.e_ms_ns_.mDesc.GetLengths()[1],
arg.e_ms_ns_.mDesc.GetLengths()[2],
arg.e_ms_ns_.mDesc.GetLengths()[3])(
std::thread::hardware_concurrency());
return 0;
}
float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
{
return true;
}
static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
const Tensor<BDataType>& b_ns_ks,
Tensor<EDataType>& e_ms_ns,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
{
return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
{
return std::make_unique<Invoker>(Invoker{});
}
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "ReferenceContraction_M2_N2_K2"
<< std::endl;
// clang-format on
return str.str();
}
};
int main(int argc, char* argv[])
{
bool do_verification = true;
@@ -368,22 +234,23 @@ int main(int argc, char* argv[])
{
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
CShuffleDataType,
AccDataType,
AElementOp,
BElementOp,
PassThrough>;
auto ref_gemm = ReferenceOpInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
using ReferenceOpInstance =
ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
CShuffleDataType,
AccDataType,
AElementOp,
BElementOp>;
auto ref_op = ReferenceOpInstance{};
auto ref_invoker = ref_op.MakeInvoker();
Tensor<float> empty_tensor(std::vector<ck::index_t>{}, std::vector<ck::index_t>{});
auto ref_argument =
ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op);
ref_invoker.Run(ref_argument);
@@ -16,6 +16,7 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/numeric.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -73,141 +74,6 @@ using DeviceOpInstanceMNN = ck::tensor_operation::device::
using DeviceOpInstance = DeviceOpInstanceKKN;
// hardcoded for NumDimM == NumDimN == NumDimK == 2
template <ck::index_t NumDimM,
ck::index_t NumDimN,
ck::index_t NumDimK,
typename ADataType,
typename BDataType,
typename EDataType,
typename AccDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation,
ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::BaseOperator
{
// Argument
struct Argument : public ck::tensor_operation::device::BaseArgument
{
Argument(const Tensor<ADataType>& a_ms_ks,
const Tensor<BDataType>& b_ns_ks,
Tensor<EDataType>& e_ms_ns,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
: a_ms_ks_{a_ms_ks},
b_ns_ks_{b_ns_ks},
e_ms_ns_{e_ms_ns},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
cde_element_op_{cde_element_op}
{
}
const Tensor<ADataType>& a_ms_ks_;
const Tensor<BDataType>& b_ns_ks_;
Tensor<EDataType>& e_ms_ns_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CDEElementwiseOperation cde_element_op_;
};
// Invoker
struct Invoker : public ck::tensor_operation::device::BaseInvoker
{
using Argument = ReferenceContraction_M2_N2_K2::Argument;
float Run(const Argument& arg)
{
auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) {
const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2];
const int K1 = arg.a_ms_ks_.mDesc.GetLengths()[3];
AccDataType v_acc = 0;
for(int k0 = 0; k0 < K0; ++k0)
{
for(int k1 = 0; k1 < K1; ++k1)
{
AccDataType v_a;
AccDataType v_b;
arg.a_element_op_(
v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
arg.b_element_op_(
v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));
v_acc += v_a * v_b;
}
}
AccDataType v_c;
arg.cde_element_op_(v_c, v_acc);
arg.e_ms_ns_(m0, m1, n0, n1) = v_c;
};
make_ParallelTensorFunctor(f_ms_ns,
arg.e_ms_ns_.mDesc.GetLengths()[0],
arg.e_ms_ns_.mDesc.GetLengths()[1],
arg.e_ms_ns_.mDesc.GetLengths()[2],
arg.e_ms_ns_.mDesc.GetLengths()[3])(
std::thread::hardware_concurrency());
return 0;
}
float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
{
return true;
}
static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
const Tensor<BDataType>& b_ns_ks,
Tensor<EDataType>& e_ms_ns,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
{
return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
{
return std::make_unique<Invoker>(Invoker{});
}
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "ReferenceContraction_M2_N2_K2"
<< std::endl;
// clang-format on
return str.str();
}
};
int main(int argc, char* argv[])
{
bool do_verification = true;
@@ -368,22 +234,23 @@ int main(int argc, char* argv[])
{
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
CShuffleDataType,
AccDataType,
AElementOp,
BElementOp,
PassThrough>;
auto ref_gemm = ReferenceOpInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
using ReferenceOpInstance =
ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
CShuffleDataType,
AccDataType,
AElementOp,
BElementOp>;
auto ref_op = ReferenceOpInstance{};
auto ref_invoker = ref_op.MakeInvoker();
Tensor<float> empty_tensor(std::vector<ck::index_t>{}, std::vector<ck::index_t>{});
auto ref_argument =
ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op);
ref_invoker.Run(ref_argument);
add_example_executable(example_layernorm_blockwise layernorm_blockwise.cpp)
add_example_executable(example_layernorm_fp16 layernorm_fp16.cpp)
add_example_executable(example_layernorm_splitk_fp16 layernorm_splitk_fp16.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
using XDataType = ck::half_t;
using GammaDataType = ck::half_t;
using BetaDataType = ck::half_t;
using YDataType = ck::half_t;
using ComputeDataType = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
constexpr int Rank = 2;
constexpr int NumReduceDim = 1;
using DeviceInstance =
ck::tensor_operation::device::DeviceNormalizationImpl<XDataType,
GammaDataType,
BetaDataType,
ComputeDataType,
YDataType,
PassThrough,
Rank,
NumReduceDim,
256, // BlockSize
8, // ClusterM
32, // ClusterK
1, // SliceM
8, // SliceK
1, // XYVectorDim (0=M, 1=K)
8, // SrcScalarPerVector
1, // GammaVecDim (0=M, 1=K)
8, // GammaScalarPerVector
1, // BetaVecDim (0=M, 1=K)
8, // BetaScalarPerVector
8>; // OutScalarPerVector
#include "run_layernorm_example.inc"
int main() { return run_groupnorm_example<DeviceInstance>(); }
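Assuming the tile-size convention used elsewhere in this diff (block tile = thread-cluster size x thread-slice size), the DeviceNormalizationImpl configuration above gives each workgroup an 8x256 (M x K) tile per iteration ((8*1) x (32*8)) with 8-element vector accesses along K, so the reduced (last) dimension is expected to be a multiple of 8.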
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
using XDataType = ck::half_t;
using GammaDataType = ck::half_t;
using BetaDataType = ck::half_t;
using YDataType = ck::half_t;
using ComputeDataType = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
constexpr int Rank = 2;
constexpr int NumReduceDim = 1;
using DeviceInstance =
ck::tensor_operation::device::DeviceNormalizationSplitKImpl<XDataType,
GammaDataType,
BetaDataType,
ComputeDataType,
YDataType,
PassThrough,
Rank,
NumReduceDim,
256, // BlockSize
8, // ClusterM
32, // ClusterK
1, // SliceM
8, // SliceK
1, // XYVectorDim (0=M, 1=K)
8, // XScalarPerVector
1, // GammaVecDim (0=M, 1=K)
8, // GammaScalarPerVector
1, // BetaVecDim (0=M, 1=K)
8, // BetaScalarPerVector
8>; // YScalarPerVector
#include "run_layernorm_example.inc"
int main() { return run_groupnorm_example<DeviceInstance>(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
using XDataType = ck::half_t;
using GammaDataType = ck::half_t;
using BetaDataType = ck::half_t;
using YDataType = ck::half_t;
using ComputeDataType = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
constexpr int Rank = 2;
constexpr int NumReduceDim = 1;
using DeviceInstance =
ck::tensor_operation::device::DeviceNormalizationImpl<XDataType,
GammaDataType,
BetaDataType,
ComputeDataType,
YDataType,
PassThrough,
Rank,
NumReduceDim,
256, // BlockSize
8, // ClusterM
32, // ClusterK
1, // SliceM
8, // SliceK
1, // SrcVecDim (0=M, 1=K)
8, // SrcScalarPerVector
1, // GammaVecDim (0=M, 1=K)
8, // GammaScalarPerVector
1, // BetaVecDim (0=M, 1=K)
8, // BetaScalarPerVector
8>; // OutScalarPerVector
int main()
#pragma once
template <typename DeviceInstance>
int run_groupnorm_example()
{
bool time_kernel = false;
@@ -111,6 +63,10 @@ int main()
return 1;
};
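// Descriptive note: the lines below follow the usual workspace pattern -- query the
// kernel's scratch-size requirement, allocate it on the device, and attach it to the
// argument before running (for split-K normalization this scratch presumably holds the
// partial mean/variance/count produced by the first pass).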
size_t workspace_sz = device_instance.GetWorkSpaceSize(argument_ptr.get());
DeviceMem workspace_dev(workspace_sz);
device_instance.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
auto invoker_ptr = device_instance.MakeInvokerPointer();
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
@@ -133,7 +89,8 @@ int main()
ref_invoker.Run(ref_argument);
y_dev.FromDevice(y.mData.data());
pass &= ck::utils::check_err(y, host_y, "Error: Incorrect results d1", 1e-3, 1e-3);
pass &= ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3);
}
return (pass ? 0 : 1);
}
add_example_executable(example_groupnorm_sigmoid_mul_fp16 groupnorm_sigmoid_mul_fp16.cpp)
add_example_executable(example_groupnorm_splitk_fp16 groupnorm_splitk_fp16.cpp)
add_example_executable(example_groupnorm_swish_fp16 groupnorm_swish_fp16.cpp)
@@ -12,6 +12,7 @@
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/library/utility/fill.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
constexpr int Rank = 5;
constexpr int NumReduceDim = 3;
using XDataType = ck::half_t;
using GammaDataType = ck::half_t;
using BetaDataType = ck::half_t;
using YDataType = ck::half_t;
using ComputeDataType = float;
using YElementOp = ck::tensor_operation::element_wise::Swish;
using DeviceInstance =
ck::tensor_operation::device::DeviceNormalizationSplitKImpl<XDataType,
GammaDataType,
BetaDataType,
ComputeDataType,
YDataType,
YElementOp,
Rank,
NumReduceDim,
256, // BlockSize
1, // ClusterM
256, // ClusterK
1, // SliceM
16, // SliceK
1, // SrcVecDim (0=M, 1=K)
2, // SrcScalarPerVector
1, // GammaVecDim (0=M, 1=K)
2, // GammaScalarPerVector
1, // BetaVecDim (0=M, 1=K)
2, // BetaScalarPerVector
2>; // OutScalarPerVector
#include "run_groupnorm_example.inc"
int main(int argc, char* argv[]) { return run_groupnorm_example(argc, argv); }
@@ -73,6 +73,10 @@ int run_groupnorm_example(int argc, char* argv[])
return 1;
};
size_t workspace_sz = device_instance.GetWorkSpaceSize(argument_ptr.get());
DeviceMem workspace_dev(workspace_sz);
device_instance.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
auto invoker_ptr = device_instance.MakeInvokerPointer();
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true, true});
@@ -807,7 +807,7 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
// workspace for welford intermediate mean
workspace_size += gemm_welford_size * sizeof(EMeanVarDataType) + 64;
// workspace for welford intermediate mean
// workspace for welford intermediate variance
workspace_size += gemm_welford_size * sizeof(EMeanVarDataType) + 64;
// workspace for welford intermediate count
@@ -10,8 +10,7 @@
#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_normalization_selector.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp"
#include "ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_selector.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
@@ -20,6 +19,10 @@ namespace tensor_operation {
namespace device {
// Y = Normalization(X, Beta, Gamma)
// M: Invariant length
// K: Reduce length (mean and variance are calculated along the K dimension)
// e.g. Length = [N, C, H, W], reduce dim = [C, H, W]
// Then, M = N, K = C * H * W
template <typename XDataType,
typename GammaDataType,
typename BetaDataType,
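The M/K split described in the comment above can be sketched on the host as follows (a simplified, hypothetical stand-in for the library's get_2d_lengths helper, not code from this commit):

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Multiply the reduced lengths into K and all remaining lengths into M.
inline std::pair<int64_t, int64_t> get_2d_lengths_sketch(const std::vector<int64_t>& lengths,
                                                         const std::vector<int>& reduce_dims)
{
    int64_t m = 1, k = 1;
    for(int d = 0; d < static_cast<int>(lengths.size()); ++d)
    {
        const bool reduced =
            std::find(reduce_dims.begin(), reduce_dims.end(), d) != reduce_dims.end();
        (reduced ? k : m) *= lengths[d];
    }
    // e.g. lengths = {N, C, H, W}, reduce_dims = {1, 2, 3}  ->  m = N, k = C * H * W
    return {m, k};
}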
@@ -68,7 +71,6 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
static auto MakeSrc2dDescriptor(const std::vector<index_t>& inLengths,
const std::vector<index_t>& inStrides,
int blkGroupSize,
int numBlockTileIteration)
{
constexpr index_t NumInvariantDim = Rank - NumReduceDim;
@@ -117,10 +119,9 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{});
const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration;
const auto inPad_M =
math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength;
const auto inPad_K = K_BlockTileSize * numBlockTileIteration - reduceLength;
auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
in_grid_desc_m_k,
@@ -132,7 +133,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
return (in_grid_desc_m_k_padded);
};
using GridDesc_M_K = decltype(MakeSrc2dDescriptor({1}, {1}, 1, 1));
using GridDesc_M_K = decltype(MakeSrc2dDescriptor({1}, {1}, 1));
struct Argument : public BaseArgument
{
@@ -162,26 +163,22 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
gammaStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(gammaStrides, reduceDims);
betaStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(betaStrides, reduceDims);
long_index_t invariant_total_length;
long_index_t reduce_total_length;
long_index_t invariant_length;
long_index_t reduce_length;
std::tie(invariant_total_length, reduce_total_length) =
std::tie(invariant_length, reduce_length) =
get_2d_lengths<Rank, NumReduceDim>(Lengths_);
blkGroupSize_ = 1;
numBlockTileIteration_ = (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize;
numBlockTileIteration_ = math::integer_divide_ceil(reduce_length, K_BlockTileSize);
gridSize_ = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
M_BlockTileSize * blkGroupSize_;
gridSize_ = math::integer_divide_ceil(invariant_length, M_BlockTileSize);
x_grid_desc_m_k_ =
MakeSrc2dDescriptor(Lengths_, xStrides_, blkGroupSize_, numBlockTileIteration_);
x_grid_desc_m_k_ = MakeSrc2dDescriptor(Lengths_, xStrides_, numBlockTileIteration_);
gamma_grid_desc_m_k_ =
MakeSrc2dDescriptor(Lengths_, gammaStrides_, blkGroupSize_, numBlockTileIteration_);
MakeSrc2dDescriptor(Lengths_, gammaStrides_, numBlockTileIteration_);
beta_grid_desc_m_k_ =
MakeSrc2dDescriptor(Lengths_, betaStrides_, blkGroupSize_, numBlockTileIteration_);
y_grid_desc_m_k_ =
MakeSrc2dDescriptor(Lengths_, yStrides_, blkGroupSize_, numBlockTileIteration_);
MakeSrc2dDescriptor(Lengths_, betaStrides_, numBlockTileIteration_);
y_grid_desc_m_k_ = MakeSrc2dDescriptor(Lengths_, yStrides_, numBlockTileIteration_);
isSweeponce_ =
x_grid_desc_m_k_.GetLength(Number<1>{}) <= KThreadClusterSize * KThreadSliceSize;
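As a worked example of the tiling arithmetic above (numbers assumed from the example configurations in this diff, e.g. KThreadClusterSize = 32 and KThreadSliceSize = 8, so K_BlockTileSize = 256, and M_BlockTileSize = 8): a reduce length of 1000 gives numBlockTileIteration_ = ceil(1000 / 256) = 4 and a padded K extent of 4 * 256 = 1024 (inPad_K = 24), while an invariant length of 1000 gives gridSize_ = ceil(1000 / 8) = 125 workgroups.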
@@ -202,7 +199,6 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
YElementwiseOperation y_elementwise_op_;
int blkGroupSize_;
int numBlockTileIteration_;
size_t gridSize_;
@@ -286,6 +282,9 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
if(p_arg_->invariant_lowest_length % XSrcVectorSize != 0)
return false;
if(p_arg_->invariant_lowest_length % YDstVectorSize != 0)
return false;
};
}
else
@@ -295,12 +294,12 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
if(p_arg_->Lengths_[Rank - 1] % XSrcVectorSize != 0)
return false;
};
if(p_arg_->Lengths_[Rank - 1] % YDstVectorSize != 0)
{
return false;
}
if(p_arg_->Lengths_[Rank - 1] % YDstVectorSize != 0)
{
return false;
}
};
// if fastest dim is not reduced
if constexpr(GammaSrcVectorDim == 0)
@@ -3,8 +3,8 @@
#pragma once
#include "ck/tensor_operation/gpu/grid/gridwise_normalization_naive_variance.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp"
#include "ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_naive_variance.hpp"
#include "ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_welford_variance.hpp"
namespace ck {
template <typename GridwiseReduction,
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/math.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace ck {
template <typename XDataType,
typename ComputeDataType,
typename MeanVarDataType,
typename XGridDesc_M_K,
typename MeanVarGridDesc_M_KBlock,
index_t BlockSize,
index_t MThreadClusterSize,
index_t KThreadClusterSize,
index_t MThreadSliceSize,
index_t KThreadSliceSize,
index_t XSrcVectorDim,
index_t XSrcVectorSize>
struct GridwiseNormalizationSplitK1st
{
static_assert((XSrcVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0) ||
(XSrcVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0),
"Invalid thread slice sizes and/or vector sizes configuration, please check!");
static constexpr bool reorder_thread_cluster = (XSrcVectorDim == 0);
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;
using ThreadBufferDimAccessOrder =
typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
using ThreadClusterArrangeOrder =
typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
static constexpr auto thread_cluster_desc =
make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
using ThreadBufferLengths_M_K = Sequence<MThreadSliceSize, XSrcVectorSize>;
static constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed(
make_tuple(Number<MThreadSliceSize>{}, Number<XSrcVectorSize>{}));
using ThreadBufferLengths_M_1 = Sequence<MThreadSliceSize, 1>;
static constexpr auto thread_buffer_desc_m_1 =
make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}, I1));
using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
make_tuple(Number<MThreadSliceSize>{}, Number<XSrcVectorSize>{})));
using ThreadReduceDstDesc_M =
decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));
using ThreadwiseWelford =
ThreadwiseWelford<ComputeDataType, ThreadReduceSrcDesc_M_K, ThreadReduceDstDesc_M>;
using BlockwiseWelford = BlockwiseWelford<ComputeDataType,
BlockSize,
ThreadClusterLengths_M_K,
ThreadClusterArrangeOrder,
false>;
using PassThroughOp = tensor_operation::element_wise::PassThrough;
static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
static constexpr index_t K_BlockTileStepSize = KThreadClusterSize * XSrcVectorSize;
static constexpr auto ThreadBufferNumber = Number<KThreadSliceSize / XSrcVectorSize>{};
__device__ static int
GetKPerThread(int kRaw, int kGridSize, int block_k_cluster_id, int thread_k_cluster_id)
{
bool is_rightmost_block = block_k_cluster_id == kGridSize - 1;
if(is_rightmost_block)
{
int left_kPerBlock = math::integer_divide_ceil(kRaw, kGridSize);
int kPerBlock = kRaw % kGridSize == 0 ? left_kPerBlock : kRaw % left_kPerBlock;
int kPerThread =
kPerBlock < K_BlockTileSize ? 0 : KThreadSliceSize * (kPerBlock / K_BlockTileSize);
int kPerBlockTail = kPerBlock - kPerThread * KThreadClusterSize;
if(kPerBlockTail > 0)
{
static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
int thread_max_len =
(thread_k_cluster_id + 1) * XSrcVectorSize + K_BlockTileStepSize * i;
int delta = thread_max_len - kPerBlockTail;
delta = math::clamp(thread_max_len - kPerBlockTail, 0, XSrcVectorSize);
kPerThread += XSrcVectorSize - delta;
});
}
return kPerThread;
}
else
{
int kPerBlock = math::integer_divide_ceil(kRaw, kGridSize);
return KThreadSliceSize * (kPerBlock / K_BlockTileSize);
}
}
// Calculate mean and variance by welford along k dimension
__device__ static void Run(const XGridDesc_M_K& x_grid_desc_m_k,
const MeanVarGridDesc_M_KBlock& mean_var_grid_desc_m_kblock,
index_t num_k_block_tile_iteration,
const XDataType* const __restrict__ p_x_global,
MeanVarDataType* const p_mean_global,
MeanVarDataType* const p_variance_global,
int32_t* const p_welford_count_global)
{
auto x_thread_buf = generate_tuple(
[&](auto) {
return StaticBuffer<AddressSpaceEnum::Vgpr,
ComputeDataType,
MThreadSliceSize * XSrcVectorSize,
true>{};
},
Number<ThreadBufferNumber>{});
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>
mean_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>
var_thread_buf;
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_id = get_block_1d_id();
const index_t k_grid_size = mean_var_grid_desc_m_kblock.GetLength(I1);
const index_t block_m_cluster_id = block_global_id / k_grid_size;
const index_t block_k_cluster_id = block_global_id % k_grid_size;
const auto thread_cluster_idx =
thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));
const auto thread_m_cluster_id = thread_cluster_idx[I0];
const auto thread_k_cluster_id = thread_cluster_idx[I1];
const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration;
auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2<XDataType,
ComputeDataType,
XGridDesc_M_K,
decltype(thread_buffer_desc_m_k),
ThreadBufferLengths_M_K,
ThreadBufferDimAccessOrder,
XSrcVectorDim,
XSrcVectorSize,
1,
true>(
x_grid_desc_m_k,
make_multi_index(
block_m_cluster_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
block_k_cluster_id * reduceSizePerBlock + thread_k_cluster_id * XSrcVectorSize));
auto mean_var_count_store_index = make_multi_index(
block_m_cluster_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize,
block_k_cluster_id);
auto threadwise_welford_mean_var_store =
ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
MeanVarDataType,
decltype(thread_buffer_desc_m_1),
MeanVarGridDesc_M_KBlock,
PassThroughOp,
ThreadBufferLengths_M_1,
Sequence<0, 1>,
1,
1,
InMemoryDataOperationEnum::Set,
1,
true>(
mean_var_grid_desc_m_kblock, mean_var_count_store_index, PassThroughOp{});
constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileStepSize);
const auto x_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_x_global, x_grid_desc_m_k.GetElementSpaceSize());
auto mean_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_mean_global, mean_var_grid_desc_m_kblock.GetElementSpaceSize());
auto var_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_variance_global, mean_var_grid_desc_m_kblock.GetElementSpaceSize());
auto threadwise_welford = ThreadwiseWelford();
int kRaw = x_grid_desc_m_k.GetTransforms()[I2].GetUpperLengths()[I0];
threadwise_welford.max_count_ =
GetKPerThread(kRaw, k_grid_size, block_k_cluster_id, thread_k_cluster_id);
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
mean_thread_buf(I) = type_convert<ComputeDataType>(0.0f);
var_thread_buf(I) = type_convert<ComputeDataType>(0.0f);
});
for(index_t k = 0; k < num_k_block_tile_iteration; ++k)
{
static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
threadwise_x_load.Run(x_grid_desc_m_k,
x_global_val_buf,
thread_buffer_desc_m_k,
make_tuple(I0, I0),
x_thread_buf(i));
threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k);
threadwise_welford.Run(x_thread_buf[i], mean_thread_buf, var_thread_buf);
});
}
int welford_count = 0;
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
if constexpr(I > 0)
block_sync_lds();
int count = threadwise_welford.cur_count_;
BlockwiseWelford::Run(mean_thread_buf(I), var_thread_buf(I), count);
// The value of count is the same for all I
if constexpr(I == MThreadSliceSize - 1)
welford_count = count;
});
if(thread_k_cluster_id == 0)
{
threadwise_welford_mean_var_store.Run(thread_buffer_desc_m_1,
make_tuple(I0, I0),
mean_thread_buf,
mean_var_grid_desc_m_kblock,
mean_global_val_buf);
threadwise_welford_mean_var_store.Run(thread_buffer_desc_m_1,
make_tuple(I0, I0),
var_thread_buf,
mean_var_grid_desc_m_kblock,
var_global_val_buf);
if(block_m_cluster_id == 0 && thread_m_cluster_id == 0)
p_welford_count_global[block_k_cluster_id] = welford_count;
}
}
};
} // namespace ck
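For reference, the ThreadwiseWelford/BlockwiseWelford helpers used above parallelize the standard single-pass Welford update for mean and variance. A minimal host-side sketch of that update (illustration only; the names below are hypothetical and not CK API):

struct WelfordState
{
    double mean = 0.0;
    double m2   = 0.0; // running sum of squared deviations from the current mean
    int count   = 0;
};

// Fold one sample into the running statistics (Welford's single-pass update).
inline void welford_update(WelfordState& s, double x)
{
    ++s.count;
    const double delta = x - s.mean;
    s.mean += delta / s.count;
    s.m2 += delta * (x - s.mean);
}

// Population variance of the samples seen so far, as used for normalization.
inline double welford_variance(const WelfordState& s)
{
    return s.count > 0 ? s.m2 / s.count : 0.0;
}

Two partial states can also be merged (which is what the blockwise and split-K stages do): the combined mean is the count-weighted average of the two means, and the combined m2 adds a correction term delta^2 * n_a * n_b / (n_a + n_b), where delta is the difference of the two means.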