"git@developer.sourcefind.cn:gaoqiong/composable_kernel.git" did not exist on "ad5db169b615b6c7015a670848de29c4a14a71d3"
Unverified commit 72b7ae25, authored by Illia Silin, committed by GitHub

Update staging branch. (#706)



* update daily build from rocm 5.4.3 to 5.5 (#693)

* Fix grouped_gemm_splitk kernels on MI300. (#694)

* replace amd_buffer_atomic_add with hip_atomic_add

* fix grouped_gemm_splitk kernels on mi300

* fix syntax

* revert experimental atomic_add changes

---------
Co-authored-by: Jing Zhang <jizhan@amd.com>

* Fix the group of quantization_int8 kernels on MI300. (#695)

* replace amd_buffer_atomic_add with hip_atomic_add

* fix grouped_gemm_splitk kernels on mi300

* fix syntax

* revert experimental atomic_add changes

* fix the group of kernels from ticket 723 on MI300

---------
Co-authored-by: Jing Zhang <jizhan@amd.com>

* Optimize bf16 conversion (#664)

* Add TypeConvert class and start refactoring

* Refactor TypeConvert as a struct

* Get back to template functions type_convert

* Add a type_convert_bf16_rtn, set rtz as default (see the bf16 rounding sketch after this commit list)

* Clean up

* Add UnaryConvertPrecision struct for high-precision workloads

* Format

* Update type_convert to UnaryConvert on threadwise level

* Update UnaryConvertPrecision

* Format

* Fix chmod

* Add a flag to pick conversion method

* Format

* Remove the added flag

* Merge elementwise op with type conversion

* Move type_convert to elemwise op, update the op

* Update type_convert_precision -> bf16_convert_rtn

* Clean up

* Update comments

* Update the CK_WORKAROUND_DENORM_FIX flag handling

* Update the unneeded op to work but warn user

* Remove the message

* Use a PassThrough instead of ConvertBF16RTN to calculate reference

* Format

* Add missing include

* Normalization/split k (#615)

* Add contraction profiler and tests (#701)

* Add contraction profiler and tests

* Build and style fixes

* Allow using any elementwise operator for ref_contraction

* Introduce profile_contraction_scale and profile_contraction_bilinear

* Make ref_contraction generic and extend interface tests

* Stylistic minor fixes

* Extend test_contraction_interface

---------
Co-authored-by: Jing Zhang <jizhan@amd.com>
Co-authored-by: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com>
Co-authored-by: rocking <ChunYu.Lai@amd.com>
Co-authored-by: Bartłomiej Kocot <bartlomiejkocot98@gmail.com>
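
The "Optimize bf16 conversion" items above add a round-to-nearest-even (RTN) fp32-to-bf16 path while keeping round-toward-zero (RTZ) as the default. The sketch below is a minimal, repository-independent illustration of the two roundings; the helper names are ours, not CK's type_convert/bf16_convert_rtn API, and NaN handling is omitted for brevity.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Truncate: drop the low 16 mantissa bits (round toward zero in magnitude).
static uint16_t f32_to_bf16_rtz(float x)
{
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    return static_cast<uint16_t>(bits >> 16);
}

// Round to nearest even: add a bias derived from the lowest kept bit, then truncate.
static uint16_t f32_to_bf16_rtn(float x)
{
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    const uint32_t lsb      = (bits >> 16) & 1u;
    const uint32_t rounding = 0x7FFFu + lsb;
    return static_cast<uint16_t>((bits + rounding) >> 16);
}

int main()
{
    const float x = 1.005f; // RTZ keeps 0x3F80, RTN rounds up to 0x3F81
    std::printf("rtz: 0x%04x  rtn: 0x%04x\n", f32_to_bf16_rtz(x), f32_to_bf16_rtn(x));
}
```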
parent bbe74503
@@ -19,7 +19,7 @@ def runShell(String command){
def getDockerImageName(){
def img
-if (params.ROCMVERSION != "5.5" && params.ROCMVERSION != "5.6"){
+if (params.ROCMVERSION != "5.6"){
if (params.COMPILER_VERSION == "") {
img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}"
}
@@ -597,7 +597,7 @@ def process_results(Map conf=[:]){
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true
-0 21 * * * % ROCMVERSION=5.4.3;COMPILER_VERSION=release;COMPILER_COMMIT=
+0 21 * * * % ROCMVERSION=5.5;COMPILER_VERSION=release;COMPILER_COMMIT=
0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=''' : ""
pipeline {
...
@@ -147,7 +147,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
#else
a_tensors_device[i]->ToDevice(a_tensors[i].mData.data());
b_tensors_device[i]->ToDevice(b_tensors[i].mData.data());
c_tensors_device[i]->SetZero();
#endif
p_a.push_back(a_tensors_device[i]->GetDeviceBuffer());
...
@@ -16,6 +16,7 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/numeric.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -74,141 +75,6 @@ using DeviceOpInstanceMNNN = ck::tensor_operation::device::
using DeviceOpInstance = DeviceOpInstanceKKNN;
// hardcoded for NumDimM == NumDimN == NumDimK == 2
template <ck::index_t NumDimM,
ck::index_t NumDimN,
ck::index_t NumDimK,
typename ADataType,
typename BDataType,
typename EDataType,
typename AccDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation,
ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::BaseOperator
{
// Argument
struct Argument : public ck::tensor_operation::device::BaseArgument
{
Argument(const Tensor<ADataType>& a_ms_ks,
const Tensor<BDataType>& b_ns_ks,
Tensor<EDataType>& e_ms_ns,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
: a_ms_ks_{a_ms_ks},
b_ns_ks_{b_ns_ks},
e_ms_ns_{e_ms_ns},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
cde_element_op_{cde_element_op}
{
}
const Tensor<ADataType>& a_ms_ks_;
const Tensor<BDataType>& b_ns_ks_;
Tensor<EDataType>& e_ms_ns_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CDEElementwiseOperation cde_element_op_;
};
// Invoker
struct Invoker : public ck::tensor_operation::device::BaseInvoker
{
using Argument = ReferenceContraction_M2_N2_K2::Argument;
float Run(const Argument& arg)
{
auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) {
const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2];
const int K1 = arg.a_ms_ks_.mDesc.GetLengths()[3];
AccDataType v_acc = 0;
for(int k0 = 0; k0 < K0; ++k0)
{
for(int k1 = 0; k1 < K1; ++k1)
{
AccDataType v_a;
AccDataType v_b;
arg.a_element_op_(
v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
arg.b_element_op_(
v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));
v_acc += v_a * v_b;
}
}
AccDataType v_c;
arg.cde_element_op_(v_c, v_acc);
arg.e_ms_ns_(m0, m1, n0, n1) = v_c;
};
make_ParallelTensorFunctor(f_ms_ns,
arg.e_ms_ns_.mDesc.GetLengths()[0],
arg.e_ms_ns_.mDesc.GetLengths()[1],
arg.e_ms_ns_.mDesc.GetLengths()[2],
arg.e_ms_ns_.mDesc.GetLengths()[3])(
std::thread::hardware_concurrency());
return 0;
}
float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
{
return true;
}
static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
const Tensor<BDataType>& b_ns_ks,
Tensor<EDataType>& e_ms_ns,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
{
return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
{
return std::make_unique<Invoker>(Invoker{});
}
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "ReferenceContraction_M2_N2_K2"
<< std::endl;
// clang-format on
return str.str();
}
};
int main(int argc, char* argv[])
{
bool do_verification = true;
@@ -385,22 +251,22 @@ int main(int argc, char* argv[])
{
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
-using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
-NumDimN,
-NumDimK,
-ADataType,
-BDataType,
-CShuffleDataType,
-AccDataType,
-AElementOp,
-BElementOp,
-PassThrough>;
+using ReferenceOpInstance =
+ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
+NumDimN,
+NumDimK,
+ADataType,
+BDataType,
+CShuffleDataType,
+AccDataType,
+AElementOp,
+BElementOp>;
-auto ref_gemm = ReferenceOpInstance{};
-auto ref_invoker = ref_gemm.MakeInvoker();
-auto ref_argument = ref_gemm.MakeArgument(
-a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
+auto ref_op = ReferenceOpInstance{};
+auto ref_invoker = ref_op.MakeInvoker();
+auto ref_argument =
+ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op);
ref_invoker.Run(ref_argument);
...
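
The contraction example files in this commit drop their private copy of the reference operator shown above and include the shared ck::tensor_operation::host::ReferenceContraction_M2_N2_K2 from reference_contraction.hpp instead. For readers who want the math without the CK scaffolding, here is a minimal standalone sketch of the same loop nest; dimension handling and float types are illustrative only, and the library version additionally applies the A/B/CDE elementwise operators and parallelizes the outer loops.

```cpp
#include <cstddef>
#include <vector>

// Naive 2-2-2 tensor contraction:
// E[m0][m1][n0][n1] = sum over k0,k1 of A[m0][m1][k0][k1] * B[n0][n1][k0][k1].
void reference_contraction_m2_n2_k2(const std::vector<float>& a, // [M0, M1, K0, K1]
                                    const std::vector<float>& b, // [N0, N1, K0, K1]
                                    std::vector<float>& e,       // [M0, M1, N0, N1]
                                    std::size_t M0, std::size_t M1,
                                    std::size_t N0, std::size_t N1,
                                    std::size_t K0, std::size_t K1)
{
    // Row-major linearization helpers for the three tensors.
    auto a_idx = [&](std::size_t m0, std::size_t m1, std::size_t k0, std::size_t k1) {
        return ((m0 * M1 + m1) * K0 + k0) * K1 + k1;
    };
    auto b_idx = [&](std::size_t n0, std::size_t n1, std::size_t k0, std::size_t k1) {
        return ((n0 * N1 + n1) * K0 + k0) * K1 + k1;
    };
    auto e_idx = [&](std::size_t m0, std::size_t m1, std::size_t n0, std::size_t n1) {
        return ((m0 * M1 + m1) * N0 + n0) * N1 + n1;
    };

    for(std::size_t m0 = 0; m0 < M0; ++m0)
        for(std::size_t m1 = 0; m1 < M1; ++m1)
            for(std::size_t n0 = 0; n0 < N0; ++n0)
                for(std::size_t n1 = 0; n1 < N1; ++n1)
                {
                    float acc = 0.f; // accumulate in float, like AccDataType above
                    for(std::size_t k0 = 0; k0 < K0; ++k0)
                        for(std::size_t k1 = 0; k1 < K1; ++k1)
                            acc += a[a_idx(m0, m1, k0, k1)] * b[b_idx(n0, n1, k0, k1)];
                    e[e_idx(m0, m1, n0, n1)] = acc;
                }
}
```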
@@ -16,6 +16,7 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/numeric.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -74,141 +75,6 @@ using DeviceOpInstanceMNNN = ck::tensor_operation::device::
using DeviceOpInstance = DeviceOpInstanceKKNN;
// hardcoded for NumDimM == NumDimN == NumDimK == 2
template <ck::index_t NumDimM,
ck::index_t NumDimN,
ck::index_t NumDimK,
typename ADataType,
typename BDataType,
typename EDataType,
typename AccDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation,
ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::BaseOperator
{
// Argument
struct Argument : public ck::tensor_operation::device::BaseArgument
{
Argument(const Tensor<ADataType>& a_ms_ks,
const Tensor<BDataType>& b_ns_ks,
Tensor<EDataType>& e_ms_ns,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
: a_ms_ks_{a_ms_ks},
b_ns_ks_{b_ns_ks},
e_ms_ns_{e_ms_ns},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
cde_element_op_{cde_element_op}
{
}
const Tensor<ADataType>& a_ms_ks_;
const Tensor<BDataType>& b_ns_ks_;
Tensor<EDataType>& e_ms_ns_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CDEElementwiseOperation cde_element_op_;
};
// Invoker
struct Invoker : public ck::tensor_operation::device::BaseInvoker
{
using Argument = ReferenceContraction_M2_N2_K2::Argument;
float Run(const Argument& arg)
{
auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) {
const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2];
const int K1 = arg.a_ms_ks_.mDesc.GetLengths()[3];
AccDataType v_acc = 0;
for(int k0 = 0; k0 < K0; ++k0)
{
for(int k1 = 0; k1 < K1; ++k1)
{
AccDataType v_a;
AccDataType v_b;
arg.a_element_op_(
v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
arg.b_element_op_(
v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));
v_acc += v_a * v_b;
}
}
AccDataType v_c;
arg.cde_element_op_(v_c, v_acc);
arg.e_ms_ns_(m0, m1, n0, n1) = v_c;
};
make_ParallelTensorFunctor(f_ms_ns,
arg.e_ms_ns_.mDesc.GetLengths()[0],
arg.e_ms_ns_.mDesc.GetLengths()[1],
arg.e_ms_ns_.mDesc.GetLengths()[2],
arg.e_ms_ns_.mDesc.GetLengths()[3])(
std::thread::hardware_concurrency());
return 0;
}
float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
{
return true;
}
static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
const Tensor<BDataType>& b_ns_ks,
Tensor<EDataType>& e_ms_ns,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
{
return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
{
return std::make_unique<Invoker>(Invoker{});
}
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "ReferenceContraction_M2_N2_K2"
<< std::endl;
// clang-format on
return str.str();
}
};
int main(int argc, char* argv[])
{
bool do_verification = true;
@@ -385,22 +251,22 @@ int main(int argc, char* argv[])
{
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
-using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
-NumDimN,
-NumDimK,
-ADataType,
-BDataType,
-CShuffleDataType,
-AccDataType,
-AElementOp,
-BElementOp,
-PassThrough>;
+using ReferenceOpInstance =
+ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
+NumDimN,
+NumDimK,
+ADataType,
+BDataType,
+CShuffleDataType,
+AccDataType,
+AElementOp,
+BElementOp>;
-auto ref_gemm = ReferenceOpInstance{};
-auto ref_invoker = ref_gemm.MakeInvoker();
-auto ref_argument = ref_gemm.MakeArgument(
-a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
+auto ref_op = ReferenceOpInstance{};
+auto ref_invoker = ref_op.MakeInvoker();
+auto ref_argument =
+ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op);
ref_invoker.Run(ref_argument);
...
@@ -16,6 +16,7 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/numeric.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -73,141 +74,6 @@ using DeviceOpInstanceMNN = ck::tensor_operation::device::
using DeviceOpInstance = DeviceOpInstanceKKN;
// hardcoded for NumDimM == NumDimN == NumDimK == 2
template <ck::index_t NumDimM,
ck::index_t NumDimN,
ck::index_t NumDimK,
typename ADataType,
typename BDataType,
typename EDataType,
typename AccDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation,
ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::BaseOperator
{
// Argument
struct Argument : public ck::tensor_operation::device::BaseArgument
{
Argument(const Tensor<ADataType>& a_ms_ks,
const Tensor<BDataType>& b_ns_ks,
Tensor<EDataType>& e_ms_ns,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
: a_ms_ks_{a_ms_ks},
b_ns_ks_{b_ns_ks},
e_ms_ns_{e_ms_ns},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
cde_element_op_{cde_element_op}
{
}
const Tensor<ADataType>& a_ms_ks_;
const Tensor<BDataType>& b_ns_ks_;
Tensor<EDataType>& e_ms_ns_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CDEElementwiseOperation cde_element_op_;
};
// Invoker
struct Invoker : public ck::tensor_operation::device::BaseInvoker
{
using Argument = ReferenceContraction_M2_N2_K2::Argument;
float Run(const Argument& arg)
{
auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) {
const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2];
const int K1 = arg.a_ms_ks_.mDesc.GetLengths()[3];
AccDataType v_acc = 0;
for(int k0 = 0; k0 < K0; ++k0)
{
for(int k1 = 0; k1 < K1; ++k1)
{
AccDataType v_a;
AccDataType v_b;
arg.a_element_op_(
v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
arg.b_element_op_(
v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));
v_acc += v_a * v_b;
}
}
AccDataType v_c;
arg.cde_element_op_(v_c, v_acc);
arg.e_ms_ns_(m0, m1, n0, n1) = v_c;
};
make_ParallelTensorFunctor(f_ms_ns,
arg.e_ms_ns_.mDesc.GetLengths()[0],
arg.e_ms_ns_.mDesc.GetLengths()[1],
arg.e_ms_ns_.mDesc.GetLengths()[2],
arg.e_ms_ns_.mDesc.GetLengths()[3])(
std::thread::hardware_concurrency());
return 0;
}
float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
{
return true;
}
static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
const Tensor<BDataType>& b_ns_ks,
Tensor<EDataType>& e_ms_ns,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
{
return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
{
return std::make_unique<Invoker>(Invoker{});
}
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "ReferenceContraction_M2_N2_K2"
<< std::endl;
// clang-format on
return str.str();
}
};
int main(int argc, char* argv[])
{
bool do_verification = true;
@@ -368,22 +234,23 @@ int main(int argc, char* argv[])
{
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
-using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
-NumDimN,
-NumDimK,
-ADataType,
-BDataType,
-CShuffleDataType,
-AccDataType,
-AElementOp,
-BElementOp,
-PassThrough>;
+using ReferenceOpInstance =
+ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
+NumDimN,
+NumDimK,
+ADataType,
+BDataType,
+CShuffleDataType,
+AccDataType,
+AElementOp,
+BElementOp>;
-auto ref_gemm = ReferenceOpInstance{};
-auto ref_invoker = ref_gemm.MakeInvoker();
-auto ref_argument = ref_gemm.MakeArgument(
-a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
+auto ref_op = ReferenceOpInstance{};
+auto ref_invoker = ref_op.MakeInvoker();
+Tensor<float> empty_tensor(std::vector<ck::index_t>{}, std::vector<ck::index_t>{});
+auto ref_argument =
+ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op);
ref_invoker.Run(ref_argument);
...
@@ -16,6 +16,7 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/numeric.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -73,141 +74,6 @@ using DeviceOpInstanceMNN = ck::tensor_operation::device::
using DeviceOpInstance = DeviceOpInstanceKKN;
// hardcoded for NumDimM == NumDimN == NumDimK == 2
template <ck::index_t NumDimM,
ck::index_t NumDimN,
ck::index_t NumDimK,
typename ADataType,
typename BDataType,
typename EDataType,
typename AccDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation,
ck::enable_if_t<NumDimM == 2 && NumDimN == 2 && NumDimK == 2, bool> = false>
struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::BaseOperator
{
// Argument
struct Argument : public ck::tensor_operation::device::BaseArgument
{
Argument(const Tensor<ADataType>& a_ms_ks,
const Tensor<BDataType>& b_ns_ks,
Tensor<EDataType>& e_ms_ns,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
: a_ms_ks_{a_ms_ks},
b_ns_ks_{b_ns_ks},
e_ms_ns_{e_ms_ns},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
cde_element_op_{cde_element_op}
{
}
const Tensor<ADataType>& a_ms_ks_;
const Tensor<BDataType>& b_ns_ks_;
Tensor<EDataType>& e_ms_ns_;
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CDEElementwiseOperation cde_element_op_;
};
// Invoker
struct Invoker : public ck::tensor_operation::device::BaseInvoker
{
using Argument = ReferenceContraction_M2_N2_K2::Argument;
float Run(const Argument& arg)
{
auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) {
const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2];
const int K1 = arg.a_ms_ks_.mDesc.GetLengths()[3];
AccDataType v_acc = 0;
for(int k0 = 0; k0 < K0; ++k0)
{
for(int k1 = 0; k1 < K1; ++k1)
{
AccDataType v_a;
AccDataType v_b;
arg.a_element_op_(
v_a, ck::type_convert<const AccDataType>(arg.a_ms_ks_(m0, m1, k0, k1)));
arg.b_element_op_(
v_b, ck::type_convert<const AccDataType>(arg.b_ns_ks_(n0, n1, k0, k1)));
v_acc += v_a * v_b;
}
}
AccDataType v_c;
arg.cde_element_op_(v_c, v_acc);
arg.e_ms_ns_(m0, m1, n0, n1) = v_c;
};
make_ParallelTensorFunctor(f_ms_ns,
arg.e_ms_ns_.mDesc.GetLengths()[0],
arg.e_ms_ns_.mDesc.GetLengths()[1],
arg.e_ms_ns_.mDesc.GetLengths()[2],
arg.e_ms_ns_.mDesc.GetLengths()[3])(
std::thread::hardware_concurrency());
return 0;
}
float Run(const ck::tensor_operation::device::BaseArgument* p_arg,
const StreamConfig& /* stream_config */ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
}
};
static constexpr bool IsValidCompilationParameter()
{
// TODO: properly implement this check
return true;
}
bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override
{
return true;
}
static auto MakeArgument(const Tensor<ADataType>& a_ms_ks,
const Tensor<BDataType>& b_ns_ks,
Tensor<EDataType>& e_ms_ns,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
{
return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op};
}
static auto MakeInvoker() { return Invoker{}; }
virtual std::unique_ptr<ck::tensor_operation::device::BaseInvoker> MakeInvokerPointer()
{
return std::make_unique<Invoker>(Invoker{});
}
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "ReferenceContraction_M2_N2_K2"
<< std::endl;
// clang-format on
return str.str();
}
};
int main(int argc, char* argv[])
{
bool do_verification = true;
@@ -368,22 +234,23 @@ int main(int argc, char* argv[])
{
Tensor<CShuffleDataType> c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides);
-using ReferenceOpInstance = ReferenceContraction_M2_N2_K2<NumDimM,
-NumDimN,
-NumDimK,
-ADataType,
-BDataType,
-CShuffleDataType,
-AccDataType,
-AElementOp,
-BElementOp,
-PassThrough>;
+using ReferenceOpInstance =
+ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimM,
+NumDimN,
+NumDimK,
+ADataType,
+BDataType,
+CShuffleDataType,
+AccDataType,
+AElementOp,
+BElementOp>;
-auto ref_gemm = ReferenceOpInstance{};
-auto ref_invoker = ref_gemm.MakeInvoker();
-auto ref_argument = ref_gemm.MakeArgument(
-a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{});
+auto ref_op = ReferenceOpInstance{};
+auto ref_invoker = ref_op.MakeInvoker();
+Tensor<float> empty_tensor(std::vector<ck::index_t>{}, std::vector<ck::index_t>{});
+auto ref_argument =
+ref_op.MakeArgument(a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op);
ref_invoker.Run(ref_argument);
...
-add_example_executable(example_layernorm_blockwise layernorm_blockwise.cpp)
+add_example_executable(example_layernorm_fp16 layernorm_fp16.cpp)
+add_example_executable(example_layernorm_splitk_fp16 layernorm_splitk_fp16.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
using XDataType = ck::half_t;
using GammaDataType = ck::half_t;
using BetaDataType = ck::half_t;
using YDataType = ck::half_t;
using ComputeDataType = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
constexpr int Rank = 2;
constexpr int NumReduceDim = 1;
using DeviceInstance =
ck::tensor_operation::device::DeviceNormalizationImpl<XDataType,
GammaDataType,
BetaDataType,
ComputeDataType,
YDataType,
PassThrough,
Rank,
NumReduceDim,
256, // BlockSize
8, // ClusterM
32, // ClusterK
1, // SliceM
8, // SliceK
1, // XYVectorDim (0=M, 1=K)
8, // SrcScalarPerVector
1, // GammaVecDim (0=M, 1=K)
8, // GammaScalarPerVector
1, // BetaVecDim (0=M, 1=K)
8, // BetaScalarPerVector
8>; // OutScalarPerVector
#include "run_layernorm_example.inc"
int main() { return run_groupnorm_example<DeviceInstance>(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
using XDataType = ck::half_t;
using GammaDataType = ck::half_t;
using BetaDataType = ck::half_t;
using YDataType = ck::half_t;
using ComputeDataType = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
constexpr int Rank = 2;
constexpr int NumReduceDim = 1;
using DeviceInstance =
ck::tensor_operation::device::DeviceNormalizationSplitKImpl<XDataType,
GammaDataType,
BetaDataType,
ComputeDataType,
YDataType,
PassThrough,
Rank,
NumReduceDim,
256, // BlockSize
8, // ClusterM
32, // ClusterK
1, // SliceM
8, // SliceK
1, // XYVectorDim (0=M, 1=K)
8, // XScalarPerVector
1, // GammaVecDim (0=M, 1=K)
8, // GammaScalarPerVector
1, // BetaVecDim (0=M, 1=K)
8, // BetaScalarPerVector
8>; // YScalarPerVector
#include "run_layernorm_example.inc"
int main() { return run_groupnorm_example<DeviceInstance>(); }
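
Both new layernorm examples verify the device result against the host reference from reference_layernorm.hpp. As a rough standalone illustration of what a rank-2 normalization over the K dimension computes (not the library's implementation; the eps value and row-major layout are assumptions), a naive per-row version looks like this. The split-K variant distributes the K loop across workgroups and later combines the partial mean/variance statistics, which is why it needs the workspace set up further down in the shared .inc file.

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// Naive layernorm over the last dimension of an [M, K] matrix:
// y[m][k] = (x[m][k] - mean_m) / sqrt(var_m + eps) * gamma[k] + beta[k]
void layernorm_2d(const std::vector<float>& x,
                  const std::vector<float>& gamma,
                  const std::vector<float>& beta,
                  std::vector<float>& y,
                  std::size_t M, std::size_t K, float eps = 1e-4f)
{
    for(std::size_t m = 0; m < M; ++m)
    {
        float mean = 0.f;
        for(std::size_t k = 0; k < K; ++k)
            mean += x[m * K + k];
        mean /= static_cast<float>(K);

        float var = 0.f;
        for(std::size_t k = 0; k < K; ++k)
        {
            const float d = x[m * K + k] - mean;
            var += d * d;
        }
        var /= static_cast<float>(K);

        const float inv_std = 1.f / std::sqrt(var + eps);
        for(std::size_t k = 0; k < K; ++k)
            y[m * K + k] = (x[m * K + k] - mean) * inv_std * gamma[k] + beta[k];
    }
}
```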
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
-#include <iostream>
-#include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include <getopt.h>
-#include "ck/ck.hpp"
-#include "ck/utility/reduction_enums.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
-#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
-#include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/device_memory.hpp"
-#include "ck/library/utility/host_common_util.hpp"
-#include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/utility/literals.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
-using XDataType = ck::half_t;
-using GammaDataType = ck::half_t;
-using BetaDataType = ck::half_t;
-using YDataType = ck::half_t;
-using ComputeDataType = float;
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-constexpr int Rank = 2;
-constexpr int NumReduceDim = 1;
-using DeviceInstance =
-ck::tensor_operation::device::DeviceNormalizationImpl<XDataType,
-GammaDataType,
-BetaDataType,
-ComputeDataType,
-YDataType,
-PassThrough,
-Rank,
-NumReduceDim,
-256, // BlockSize
-8, // ClusterM
-32, // ClusterK
-1, // SliceM
-8, // SliceK
-1, // SrcVecDim (0=M, 1=K)
-8, // SrcScalarPerVector
-1, // GammaVecDim (0=M, 1=K)
-8, // GammaScalarPerVector
-1, // BetaVecDim (0=M, 1=K)
-8, // BetaScalarPerVector
-8>; // OutScalarPerVector
-int main()
+#pragma once
+template <typename DeviceInstance>
+int run_groupnorm_example()
{
bool time_kernel = false;
@@ -111,6 +63,10 @@ int main()
return 1;
};
+size_t workspace_sz = device_instance.GetWorkSpaceSize(argument_ptr.get());
+DeviceMem workspace_dev(workspace_sz);
+device_instance.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
auto invoker_ptr = device_instance.MakeInvokerPointer();
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
@@ -133,7 +89,8 @@ int main()
ref_invoker.Run(ref_argument);
y_dev.FromDevice(y.mData.data());
-pass &= ck::utils::check_err(y, host_y, "Error: Incorrect results d1", 1e-3, 1e-3);
+pass &= ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3);
}
return (pass ? 0 : 1);
}
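
The three lines added above are the workspace handshake the split-K instance requires before Run: query the size, allocate device memory, attach it to the argument. The mock below only illustrates that calling sequence; the types are simplified stand-ins, not CK's BaseOperator or DeviceMem.

```cpp
#include <cstddef>
#include <memory>
#include <vector>

struct MockArgument { void* workspace = nullptr; };

// Stand-in for a device allocation wrapper.
struct MockDeviceMem
{
    explicit MockDeviceMem(std::size_t bytes) : storage(bytes) {}
    void* GetDeviceBuffer() { return storage.data(); }
    std::vector<unsigned char> storage;
};

// Stand-in for the operator interface exercised by the example.
struct MockOperator
{
    std::size_t GetWorkSpaceSize(const MockArgument*) const { return 1024; } // problem-dependent
    void SetWorkSpacePointer(MockArgument* arg, void* p) const { arg->workspace = p; }
    void Run(const MockArgument*) const { /* kernel launches would go here */ }
};

int main()
{
    MockOperator device_instance;
    auto argument_ptr = std::make_unique<MockArgument>();

    // 1) query scratch size, 2) allocate it, 3) attach it, 4) run.
    const std::size_t workspace_sz = device_instance.GetWorkSpaceSize(argument_ptr.get());
    MockDeviceMem workspace_dev(workspace_sz);
    device_instance.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
    device_instance.Run(argument_ptr.get());
}
```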
add_example_executable(example_groupnorm_sigmoid_mul_fp16 groupnorm_sigmoid_mul_fp16.cpp)
+add_example_executable(example_groupnorm_splitk_fp16 groupnorm_splitk_fp16.cpp)
add_example_executable(example_groupnorm_swish_fp16 groupnorm_swish_fp16.cpp)
@@ -12,6 +12,7 @@
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_normalization_splitk_impl.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/library/utility/fill.hpp"
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
constexpr int Rank = 5;
constexpr int NumReduceDim = 3;
using XDataType = ck::half_t;
using GammaDataType = ck::half_t;
using BetaDataType = ck::half_t;
using YDataType = ck::half_t;
using ComputeDataType = float;
using YElementOp = ck::tensor_operation::element_wise::Swish;
using DeviceInstance =
ck::tensor_operation::device::DeviceNormalizationSplitKImpl<XDataType,
GammaDataType,
BetaDataType,
ComputeDataType,
YDataType,
YElementOp,
Rank,
NumReduceDim,
256, // BlockSize
1, // ClusterM
256, // ClusterK
1, // SliceM
16, // SliceK
1, // SrcVecDim (0=M, 1=K)
2, // SrcScalarPerVector
1, // GammaVecDim (0=M, 1=K)
2, // GammaScalarPerVector
1, // BetaVecDim (0=M, 1=K)
2, // BetaScalarPerVector
2>; // OutScalarPerVector
#include "run_groupnorm_example.inc"
int main(int argc, char* argv[]) { run_groupnorm_example(argc, argv); }
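
The split-K groupnorm example above fuses a Swish elementwise op into the output. For reference, Swish (SiLU) with the beta parameter assumed to be 1 is simply:

```cpp
#include <cmath>

// swish(x) = x * sigmoid(x) = x / (1 + exp(-x)); beta = 1 assumed here.
inline float swish(float x) { return x / (1.f + std::exp(-x)); }
```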
@@ -73,6 +73,10 @@ int run_groupnorm_example(int argc, char* argv[])
return 1;
};
+size_t workspace_sz = device_instance.GetWorkSpaceSize(argument_ptr.get());
+DeviceMem workspace_dev(workspace_sz);
+device_instance.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
auto invoker_ptr = device_instance.MakeInvokerPointer();
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true, true});
...
@@ -175,7 +175,10 @@
// denorm test fix, required to work around dissue
#ifndef CK_WORKAROUND_DENORM_FIX
#define CK_WORKAROUND_DENORM_FIX 0
-#endif
+#elif
+// enable only on MI200
+#define CK_WORKAROUND_DENORM_FIX = CK_WORKAROUND_DENORM_FIX && defined(__gfx90a__)
+#endif // CK_WORKAROUND_DENORM_FIX
namespace ck {
...
@@ -135,7 +135,7 @@ __global__ void
const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__) || \
-defined(__gfx90a__) || defined(__gfx908__))
+defined(__gfx90a__) || defined(__gfx908__) || defined(__gfx940__))
// offset base pointer for each work-group
const index_t num_blocks_per_batch =
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -710,7 +710,8 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
// check device
if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030" ||
-ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx908"))
+ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx908" ||
+ck::get_device_name() == "gfx940"))
{
return false;
}
...
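
Both hunks above extend the same gfx940 enablement in two places: the compile-time `__gfx*__` guard around the kernel body and the runtime device-name check in IsSupportedArgument. A compact, hypothetical illustration of that pairing follows; get_device_name below is a stand-in written against the HIP runtime, not CK's helper.

```cpp
#include <hip/hip_runtime.h>
#include <string>

// Stand-in for ck::get_device_name(): returns e.g. "gfx90a" or "gfx940".
std::string get_device_name()
{
    hipDeviceProp_t prop{};
    (void)hipGetDeviceProperties(&prop, 0);
    std::string arch = prop.gcnArchName;   // e.g. "gfx90a:sramecc+:xnack-"
    return arch.substr(0, arch.find(':')); // strip feature suffixes
}

__global__ void my_kernel(float* out)
{
// Compile-time guard: the body is only emitted for architectures known to support it;
// the !__HIP_DEVICE_COMPILE__ clause keeps the host pass happy.
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
    defined(__gfx940__))
    out[threadIdx.x] = 1.0f;
#else
    (void)out; // unsupported target: compile an empty body
#endif
}

// Runtime guard: refuse to launch on devices the kernel was not built for.
bool is_supported_device()
{
    const std::string name = get_device_name();
    return name == "gfx908" || name == "gfx90a" || name == "gfx940";
}
```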
@@ -51,7 +51,7 @@ __global__ void
const Block2CTileMap block_2_ctile_map)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
-defined(__gfx90a__) || defined(__gfx1030__))
+defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__))
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(ABDataType);
@@ -552,7 +552,8 @@ struct DeviceGemmMultipleD_Dl : public DeviceGemmMultipleD<ALayout,
static bool IsSupportedArgument(const Argument& arg)
{
if(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx908" ||
-ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx1030")
+ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx1030" ||
+ck::get_device_name() == "gfx940")
{
return GridwiseGemm::CheckValidity(
arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.e_grid_desc_m_n_);
...
@@ -807,7 +807,7 @@ struct DeviceGemmMultipleDLayernorm_Xdl_CShuffle
// workspace for welford intermediate mean
workspace_size += gemm_welford_size * sizeof(EMeanVarDataType) + 64;
-// workspace for welford intermediate mean
+// workspace for welford intermediate variance
workspace_size += gemm_welford_size * sizeof(EMeanVarDataType) + 64;
// workspace for welford intermediate count
...
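
The corrected comment distinguishes the three Welford intermediates (mean, variance accumulator, count) that this kernel stages in its workspace. For reference, a minimal host-side Welford update plus the merge step that a split reduction needs when combining partial statistics; this is a standalone sketch, not CK code.

```cpp
#include <cstddef>

// Running statistics in Welford form: count, mean, and M2 (sum of squared deviations).
struct Welford
{
    std::size_t count = 0;
    double mean = 0.0;
    double m2 = 0.0;

    // Incorporate one new value.
    void update(double x)
    {
        ++count;
        const double delta = x - mean;
        mean += delta / static_cast<double>(count);
        m2 += delta * (x - mean);
    }

    // Combine two partial results (Chan et al.), as a split reduction must do.
    static Welford merge(const Welford& a, const Welford& b)
    {
        Welford r;
        r.count = a.count + b.count;
        if(r.count == 0)
            return r;
        const double delta = b.mean - a.mean;
        r.mean = a.mean + delta * static_cast<double>(b.count) / static_cast<double>(r.count);
        r.m2 = a.m2 + b.m2 +
               delta * delta * static_cast<double>(a.count) * static_cast<double>(b.count) /
                   static_cast<double>(r.count);
        return r;
    }

    double variance() const { return count > 0 ? m2 / static_cast<double>(count) : 0.0; }
};
```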
@@ -34,7 +34,8 @@ __global__ void
kernel_grouped_gemm_xdl_splitk(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const,
const index_t group_count)
{
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+defined(__gfx940__))
constexpr index_t shared_size = GridwiseGemm::GetSharedMemoryNumberOfByte();
__shared__ uint8_t p_shared[shared_size];
...
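
As in the other kernels touched here, the LDS buffer is sized by a constexpr query on the gridwise policy, which is what permits a static `__shared__` array rather than dynamic shared memory at launch time. A stripped-down, hypothetical version of that pattern:

```cpp
#include <hip/hip_runtime.h>
#include <cstdint>

// Stand-in policy type: the real GridwiseGemm reports how much LDS its tile needs.
struct GridwisePolicy
{
    __host__ __device__ static constexpr uint32_t GetSharedMemoryNumberOfByte() { return 8192; }
};

template <typename Gridwise>
__global__ void kernel_with_static_lds(float* out)
{
    // Because the size is a compile-time constant, a static __shared__ array can be used.
    constexpr uint32_t shared_size = Gridwise::GetSharedMemoryNumberOfByte();
    __shared__ uint8_t p_shared[shared_size];

    p_shared[threadIdx.x % shared_size] = static_cast<uint8_t>(threadIdx.x);
    __syncthreads();
    out[threadIdx.x] = static_cast<float>(p_shared[0]);
}
```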