Merge branch 'develop' into amd-develop

9b3c4ac4 · Jun Liu · 1d784873 · 7843a8a7 · 9b3c4ac4 · 9b3c4ac4
Commit 9b3c4ac4 authored May 14, 2024 by Jun Liu
20 changed files
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp
@@ -446,12 +446,12 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
        {
            if(!(karg.M % MPerBlock == 0))
            {
-#if DEBUG_LOG
+                if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
-                std::cout << "Arg M value is not a multiple of MPerBlock! M: " << karg.M << " "
+                {
-                          << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                    std::cout << "Arg M value is not a multiple of MPerBlock! M: " << karg.M << " "
-                          << std::endl;
+                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                              << std::endl;
-#endif // DEBUG_LOG
+                }
                return false;
            }
        }
@@ -463,12 +463,12 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
        {
            if(!(karg.N % NPerBlock == 0))
            {
-#if DEBUG_LOG
+                if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
-                std::cout << "Arg N value is not a multiple of NPerBlock! N: " << karg.N << " "
+                {
-                          << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                    std::cout << "Arg N value is not a multiple of NPerBlock! N: " << karg.N << " "
-                          << std::endl;
+                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                              << std::endl;
-#endif // DEBUG_LOG
+                }
                return false;
            }
        }
@@ -482,12 +482,12 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
            auto K_t = karg.k_batch * K0PerBlock * K1;
            if(!(karg.K % K_t == 0))
            {
-#if DEBUG_LOG
+                if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
-                std::cout << "Arg K value is not a multiple of K_Batch * K0PerBlock * K1! K: "
+                {
-                          << karg.K << " " << __FILE__ << ":" << __LINE__
+                    std::cout << "Arg K value is not a multiple of K_Batch * K0PerBlock * K1! K: "
-                          << ", in function: " << __func__ << std::endl;
+                              << karg.K << " " << __FILE__ << ":" << __LINE__
+                              << ", in function: " << __func__ << std::endl;
-#endif // DEBUG_LOG
+                }
                return false;
            }
        }
@@ -496,13 +496,13 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
        {
            if(karg.K % ABlockTransferSrcScalarPerVector != 0)
            {
-#if DEBUG_LOG
+                if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
-                std::cout << "Arg K (" << karg.K
+                {
-                          << ") value is not a multiple of ABlockTransferSrcScalarPerVector ("
+                    std::cout << "Arg K (" << karg.K
-                          << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << ") value is not a multiple of ABlockTransferSrcScalarPerVector ("
-                          << __LINE__ << ", in function: " << __func__ << std::endl;
+                              << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << __LINE__ << ", in function: " << __func__ << std::endl;
-#endif // DEBUG_LOG
+                }
                return false;
            }
        }
@@ -510,13 +510,13 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
        {
            if(karg.M % ABlockTransferSrcScalarPerVector != 0)
            {
-#if DEBUG_LOG
+                if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
-                std::cout << "Arg M (" << karg.M
+                {
-                          << ") value is not a multiple of ABlockTransferSrcScalarPerVector ("
+                    std::cout << "Arg M (" << karg.M
-                          << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << ") value is not a multiple of ABlockTransferSrcScalarPerVector ("
-                          << __LINE__ << ", in function: " << __func__ << std::endl;
+                              << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << __LINE__ << ", in function: " << __func__ << std::endl;
-#endif // DEBUG_LOG
+                }
                return false;
            }
        }
@@ -525,13 +525,13 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
        {
            if(karg.N % BBlockTransferSrcScalarPerVector != 0)
            {
-#if DEBUG_LOG
+                if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
-                std::cout << "Arg N (" << karg.N
+                {
-                          << ") value is not a multiple of BBlockTransferSrcScalarPerVector ("
+                    std::cout << "Arg N (" << karg.N
-                          << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << ") value is not a multiple of BBlockTransferSrcScalarPerVector ("
-                          << __LINE__ << ", in function: " << __func__ << std::endl;
+                              << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << __LINE__ << ", in function: " << __func__ << std::endl;
-#endif // DEBUG_LOG
+                }
                return false;
            }
        }
@@ -539,13 +539,13 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
        {
            if(karg.K % BBlockTransferSrcScalarPerVector != 0)
            {
-#if DEBUG_LOG
+                if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
-                std::cout << "Arg K (" << karg.K
+                {
-                          << ") value is not a multiple of BBlockTransferSrcScalarPerVector ("
+                    std::cout << "Arg K (" << karg.K
-                          << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << ") value is not a multiple of BBlockTransferSrcScalarPerVector ("
-                          << __LINE__ << ", in function: " << __func__ << std::endl;
+                              << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                              << __LINE__ << ", in function: " << __func__ << std::endl;
-#endif // DEBUG_LOG
+                }
                return false;
            }
        }
@@ -554,14 +554,14 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
        {
            if(karg.N % CBlockTransferScalarPerVector_NWaveNPerXDL != 0)
            {
-#if DEBUG_LOG
+                if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
-                std::cout << "Arg N (" << karg.N
+                {
-                          << ") value is not a multiple of "
+                    std::cout << "Arg N (" << karg.N
-                             "CBlockTransferScalarPerVector_NWaveNPerXDL ("
+                              << ") value is not a multiple of "
-                          << CBlockTransferScalarPerVector_NWaveNPerXDL << " )! " << __FILE__ << ":"
+                                 "CBlockTransferScalarPerVector_NWaveNPerXDL ("
-                          << __LINE__ << ", in function: " << __func__ << std::endl;
+                              << CBlockTransferScalarPerVector_NWaveNPerXDL << " )! " << __FILE__
+                              << ":" << __LINE__ << ", in function: " << __func__ << std::endl;
-#endif // DEBUG_LOG
+                }
                return false;
            }
        }
@@ -569,14 +569,14 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
        {
            if(karg.M % CBlockTransferScalarPerVector_NWaveNPerXDL != 0)
            {
-#if DEBUG_LOG
+                if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
-                std::cout << "Arg M (" << karg.M
+                {
-                          << ") value is not a multiple of "
+                    std::cout << "Arg M (" << karg.M
-                             "CBlockTransferScalarPerVector_NWaveNPerXDL ("
+                              << ") value is not a multiple of "
-                          << CBlockTransferScalarPerVector_NWaveNPerXDL << " )! " << __FILE__ << ":"
+                                 "CBlockTransferScalarPerVector_NWaveNPerXDL ("
-                          << __LINE__ << ", in function: " << __func__ << std::endl;
+                              << CBlockTransferScalarPerVector_NWaveNPerXDL << " )! " << __FILE__
+                              << ":" << __LINE__ << ", in function: " << __func__ << std::endl;
-#endif // DEBUG_LOG
+                }
                return false;
            }
        }
@@ -584,12 +584,14 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
        const auto num_k_loop = karg.K0Padded / K0PerBlock;
        if(!GridwiseGemmPipe::IsSupported(num_k_loop))
        {
-#if DEBUG_LOG
+            if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
-            std::cout << "The number of k loops (" << num_k_loop
+            {
-                      << ") value is not supported by GridwiseGemm Pipeline."
+                std::cout << "The number of k loops (" << num_k_loop
-                      << " K0Padded: " << karg.K0Padded << ", K0PerBlock: " << K0PerBlock << " "
+                          << ") value is not supported by GridwiseGemm Pipeline."
-                      << __FILE__ << ":" << __LINE__ << ", in function: " << __func__ << std::endl;
+                          << " K0Padded: " << karg.K0Padded << ", K0PerBlock: " << K0PerBlock << " "
-#endif // DEBUG_LOG
+                          << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
+                          << std::endl;
+            }
            return false;
        }

--- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
@@ -95,7 +95,7 @@ struct wmma_type<WmmaInstr::wmma_f32_16x16x16_f16,
    // Wave mode dependent property
    static constexpr index_t wave_size = Number<WaveSize>{};
-    // * Fixed in Navi3x, Will be wave mode dependent on Navi4x
+    // * Fixed on gfx11, Will be wave mode dependent for future architectures
    static constexpr index_t num_src_a_vgprs_per_wave = m_per_wmma * src_a_data_size / 4;
    static constexpr index_t num_src_b_vgprs_per_wave = n_per_wmma * src_b_data_size / 4;
    // * num_acc_vgprs_per_wave along M direction

--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 #include "data_type.hpp"
@@ -297,6 +297,17 @@ enum struct AmdBufferCoherenceEnum
    GLC              = 1,
    SLC              = 2,
    GLC_SLC          = 3,
+    // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
+    // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
+    // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
+    WAVE_NT0   = 0,
+    WAVE_NT1   = 2,
+    GROUP_NT0  = 1,
+    GROUP_NT1  = 3,
+    DEVICE_NT0 = 8,
+    DEVICE_NT1 = 10,
+    SYSTEM_NT0 = 9,
+    SYSTEM_NT1 = 11,
 };
 template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>

--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -4,7 +4,7 @@
 #pragma once
 namespace ck {
-// Define the common macro for MI300 models
+// Define the common macro for gfx94x models
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
 #define __gfx94__
 #endif

--- a/include/ck/utility/env.hpp
+++ b/include/ck/utility/env.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <cctype>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <type_traits>
+namespace ck {
+namespace internal {
+/// Primary template: intentionally empty. Each supported value type provides a
+/// specialization exposing a static parse_env_var_value(const char*).
+template <typename T>
+struct ParseEnvVal
+{
+};
+/// Boolean parser: accepts a fixed set of on/off spellings, compared case-insensitively.
+template <>
+struct ParseEnvVal<bool>
+{
+    /// @param vp NUL-terminated text of the variable; must not be null.
+    /// @return false for "disable", "disabled", "0", "no", "off", "false";
+    ///         true for "enable", "enabled", "1", "yes", "on", "true".
+    /// @throws std::runtime_error for any other text.
+    static bool parse_env_var_value(const char* vp)
+    {
+        std::string value_env_str{vp};
+        for(auto& c : value_env_str)
+        {
+            // The <cctype> functions require an argument representable as unsigned char;
+            // passing a negative plain char (any non-ASCII byte) is undefined behaviour.
+            // tolower leaves non-letters untouched, so no separate isalpha test is needed.
+            c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
+        }
+        if(value_env_str == "disable" || value_env_str == "disabled" || value_env_str == "0" ||
+           value_env_str == "no" || value_env_str == "off" || value_env_str == "false")
+        {
+            return false;
+        }
+        if(value_env_str == "enable" || value_env_str == "enabled" || value_env_str == "1" ||
+           value_env_str == "yes" || value_env_str == "on" || value_env_str == "true")
+        {
+            return true;
+        }
+        throw std::runtime_error("Invalid value for env variable");
+    }
+};
+// Unsigned-integer parser. Base 0 lets strtoull pick the radix from the prefix:
+// "0x" selects hexadecimal, a leading "0" selects octal, anything else is decimal.
+// When the text cannot be parsed at all, strtoull yields 0 and that is what is returned.
+template <>
+struct ParseEnvVal<uint64_t>
+{
+    static uint64_t parse_env_var_value(const char* vp)
+    {
+        const auto parsed = std::strtoull(vp, nullptr, 0);
+        return parsed;
+    }
+};
+// String parser: the variable's text is copied verbatim into a std::string.
+template <>
+struct ParseEnvVal<std::string>
+{
+    static std::string parse_env_var_value(const char* vp)
+    {
+        std::string copy{vp};
+        return copy;
+    }
+};
+/// Holds the parsed value of one environment variable together with a flag
+/// recording whether the variable was present when the object was constructed.
+template <typename T>
+struct EnvVar
+{
+    private:
+    T value{};
+    bool is_unset = true;
+    public:
+    /// Reads `name` from the process environment; falls back to `def_val`
+    /// (and stays marked as unset) when the variable is absent.
+    explicit EnvVar(const char* const name, const T& def_val)
+    {
+        // NOLINTNEXTLINE (concurrency-mt-unsafe)
+        const char* vp = std::getenv(name);
+        if(vp == nullptr) // nothing provided: keep the default
+        {
+            value = def_val;
+            return;
+        }
+        is_unset = false;
+        value    = ParseEnvVal<T>::parse_env_var_value(vp);
+    }
+    /// Current value (the default when the variable was never set).
+    const T& GetValue() const { return value; }
+    /// True while no value has been supplied by the environment or UpdateValue().
+    bool IsUnset() const { return is_unset; }
+    /// Marks the variable as unset; the stored value itself is left as it is.
+    void Unset() { is_unset = true; }
+    /// Stores `val` and marks the variable as set.
+    void UpdateValue(const T& val)
+    {
+        value    = val;
+        is_unset = false;
+    }
+};
+} // end namespace internal
+// CK_DECLARE_ENV_VAR(name, type, default_val) declares a tag type ck::env::<name>.
+// Its Ref() accessor owns a function-local static EnvVar<type>, constructed from the
+// stringized macro argument `name` and `default_val` the first time Ref() is called.
+// Keeping the static inside the function hides the variable and provides
+// thread-safe initialization.
+// Must be used in the global namespace (enforced by the static_assert below).
+#define CK_DECLARE_ENV_VAR(name, type, default_val)                            \
+    namespace ck::env {                                                        \
+    struct name                                                                \
+    {                                                                          \
+        static_assert(std::is_same_v<name, ::ck::env::name>,                   \
+                      "CK_DECLARE_ENV* must be used in the global namespace"); \
+        using value_type = type;                                               \
+        static ck::internal::EnvVar<type>& Ref()                               \
+        {                                                                      \
+            static ck::internal::EnvVar<type> var{#name, default_val};         \
+            return var;                                                        \
+        }                                                                      \
+    };                                                                         \
+    }
+// Convenience wrappers fixing the value type and its default:
+// bool -> false, uint64_t -> 0, std::string -> "".
+#define CK_DECLARE_ENV_VAR_BOOL(name) CK_DECLARE_ENV_VAR(name, bool, false)
+#define CK_DECLARE_ENV_VAR_UINT64(name) CK_DECLARE_ENV_VAR(name, uint64_t, 0)
+#define CK_DECLARE_ENV_VAR_STR(name) CK_DECLARE_ENV_VAR(name, std::string, "")
+// ENV(name) yields a value-initialized object of the tag type ck::env::<name>,
+// suitable as the argument of the Env* helper templates.
+#define ENV(name) \
+    ck::env::name {}
+/// Returns the cached text of a string-typed environment variable tag.
+template <class EnvVar>
+inline const std::string& EnvGetString(EnvVar)
+{
+    static_assert(std::is_same_v<typename EnvVar::value_type, std::string>);
+    const auto& var = EnvVar::Ref();
+    return var.GetValue();
+}
+/// True only when the boolean variable has a value and that value is true.
+template <class EnvVar>
+inline bool EnvIsEnabled(EnvVar)
+{
+    static_assert(std::is_same_v<typename EnvVar::value_type, bool>);
+    const auto& var = EnvVar::Ref();
+    if(var.IsUnset())
+    {
+        return false;
+    }
+    return var.GetValue();
+}
+/// True only when the boolean variable has a value and that value is false.
+template <class EnvVar>
+inline bool EnvIsDisabled(EnvVar)
+{
+    static_assert(std::is_same_v<typename EnvVar::value_type, bool>);
+    const auto& var = EnvVar::Ref();
+    if(var.IsUnset())
+    {
+        return false;
+    }
+    return !var.GetValue();
+}
+/// Returns the cached number of a uint64_t-typed environment variable tag.
+template <class EnvVar>
+inline uint64_t EnvValue(EnvVar)
+{
+    static_assert(std::is_same_v<typename EnvVar::value_type, uint64_t>);
+    const auto& var = EnvVar::Ref();
+    return var.GetValue();
+}
+/// Reports whether the variable behind the tag currently counts as unset.
+template <class EnvVar>
+inline bool EnvIsUnset(EnvVar)
+{
+    const auto& var = EnvVar::Ref();
+    return var.IsUnset();
+}
+/// Marks the cached variable behind the tag as unset.
+template <class EnvVar>
+void EnvUnset(EnvVar)
+{
+    auto& var = EnvVar::Ref();
+    var.Unset();
+}
+/// Overwrites the cached value of an environment variable with an already-typed value.
+/// `ValueType` must be exactly the tag's value_type.
+template <typename EnvVar, typename ValueType>
+void UpdateEnvVar(EnvVar, const ValueType& val)
+{
+    static_assert(std::is_same_v<typename EnvVar::value_type, ValueType>);
+    auto& var = EnvVar::Ref();
+    var.UpdateValue(val);
+}
+/// Overwrites the cached value of an environment variable by parsing `val` with the
+/// same parser that is applied to values read from the environment.
+template <typename EnvVar>
+void UpdateEnvVar(EnvVar, const std::string_view& val)
+{
+    // A string_view is not guaranteed to be NUL-terminated, while the parsers expect a
+    // C string; copy into a std::string so the parser cannot read past the view.
+    const std::string owned{val};
+    EnvVar::Ref().UpdateValue(
+        ck::internal::ParseEnvVal<typename EnvVar::value_type>::parse_env_var_value(
+            owned.c_str()));
+}
+} // namespace ck
--- a/include/ck/utility/type_convert.hpp
+++ b/include/ck/utility/type_convert.hpp
@@ -8,7 +8,7 @@
 #include "ck/utility/random_gen.hpp"
 namespace ck {
-// Define the common macro for MI300 models
+// Define the common macro for gfx94x models
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
 #define __gfx94__
 #endif

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+using namespace ck::tensor_layout::convolution;
+using F16 = ck::half_t;
+using F32 = float;
+using Empty_Tuple = ck::Tuple<>;
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+static constexpr auto ConvBwdWeightDefault =
+    ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default;
+static constexpr auto ConvBwdWeightFilter1x1Stride1Pad0 =
+    ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0;
+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionBackwardWeightSpecialization ConvSpec>
+using device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_f16_instances = std::tuple<
+    // clang-format off
+        //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
+        //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
+        //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
+        //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     4,  8,   16,   16,    1,    1,  S<1, 4, 8,  1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              1,              4,      true,  S<1, 4, 8,  1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              4,      true,           1,           1,   S<1, 8, 1, 8>,               1>
+    // clang-format on
+    >;
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_bilinear_instance.hpp
@@ -86,6 +86,7 @@ using device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_bilinear_instances = std:
        //#########################################| Spatial|         |          |          |       |       |        |        |        |       |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
        //#########################################|        |         |          |          |       |       |        |        |        |       |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
        // generic instance
+        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              1,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              4,      true,           1,           1,   S<1, 16, 1, 4>,               1>,   
        DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout, Tuple<BLayout>, F16,     F16,     F16,     F32, Tuple<F16>, PassThrough, Bilinear, PassThrough,  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              4,      true,           1,           1,   S<1, 16, 1, 4>,               2>,   
        // instance for small conv.K
        // for fp16 conv.K and conv.C must be divisible by 2

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
@@ -352,6 +352,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                {
                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
+                        op_ptrs);
                }
 #endif
 #ifdef CK_ENABLE_BF16
@@ -419,6 +421,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                {
                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(
                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+                        op_ptrs);
                }
 #endif
 #ifdef CK_ENABLE_BF16

--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc
@@ -113,6 +113,18 @@ void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
                                                           PassThrough,
                                                           PassThrough,
                                                           PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
@@ -192,6 +204,18 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances
                                                           PassThrough,
                                                           PassThrough,
                                                           PassThrough>>>& instances);
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(

--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt
@@ -5,7 +5,8 @@ set(GROUPED_CONV2D_BWD_WEIGHT
    xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
-    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp)
+    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp)
 if(DL_KERNELS)
    list(APPEND GROUPED_CONV2D_BWD_WEIGHT

--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+// Appends the two-stage XDL f16 instances for both convolution specializations to `instances`.
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    using DefaultInstances =
+        device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_f16_instances<
+            2,
+            NHWGC,
+            GKYXC,
+            NHWGK,
+            ConvBwdWeightDefault>;
+    // 2. Filter1x1Stride1Pad0
+    using Filter1x1Stride1Pad0Instances =
+        device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_f16_instances<
+            2,
+            NHWGC,
+            GKYXC,
+            NHWGK,
+            ConvBwdWeightFilter1x1Stride1Pad0>;
+    add_device_operation_instances(instances, DefaultInstances{});
+    add_device_operation_instances(instances, Filter1x1Stride1Pad0Instances{});
+}
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt
@@ -5,7 +5,8 @@ set(GROUPED_CONV3D_BWD_WEIGHT
    xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
    xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
    xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
-    xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp)
+    xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
+    xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp)
 if(DL_KERNELS)
    list(APPEND GROUPED_CONV3D_BWD_WEIGHT

--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+// Compilation parameters for in[n, di, hi, wi, g, c] * wei[g, k, z, y, x, c] = out[n, do, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances( // hands two f16 instance lists (Default, Filter1x1Stride1Pad0) to add_device_operation_instances
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3, // presumably the spatial dimension count (conv3d) - TODO confirm
+                                                           NDHWGC, // layout tags; order presumably input, weight, output as in the header comment - TODO confirm
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances) // taken by non-const reference and forwarded to both calls below
+{
+    // 1. Default: instance list instantiated with the ConvBwdWeightDefault tag
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_f16_instances<
+            3,
+            NDHWGC,
+            GKZYXC,
+            NDHWGK,
+            ConvBwdWeightDefault>{}); // a default-constructed temporary of the instance-list type is passed
+    // 2. Filter1x1Stride1Pad0: same instance-list template, instantiated with the ConvBwdWeightFilter1x1Stride1Pad0 tag
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_two_stage_xdl_c_shuffle_f16_instances<
+            3,
+            NDHWGC,
+            GKZYXC,
+            NDHWGK,
+            ConvBwdWeightFilter1x1Stride1Pad0>{});
+}
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/profiler/README.md
+++ b/profiler/README.md
@@ -13,15 +13,6 @@
 ./bin/ckProfiler      gemm         1       1       1     1    0       5  3840 4096 4096     4096    4096    4096
 ```
-Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
-```bash
-a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
-b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
-c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
-....
-Best Perf: 1.1933 ms, 107.977 TFlops, 79.0848 GB/s
-```
 ## Profile 2D forward convolution kernels
 ```bash
 #arg1: tensor operation (conv=Convolution)
@@ -37,15 +28,6 @@ Best Perf: 1.1933 ms, 107.977 TFlops, 79.0848 GB/s
 ################          op datatype  in_layout   wei_layout  out_layout  verify  init  log  repeat  N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
 ./bin/ckProfiler  conv2d_fwd        1          1            1           1       1     1    0       5  128  256  192 3 3   71   71     2 2       1 1      1 1       1 1
 ```
-Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
-```bash
-in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
-wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
-out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
-....
-Best Perf: 1.42509 ms, 102.988 TFlops, 234.086 GB/s
-```
 ## Profile contraction kernels
 ```bash
@@ -71,16 +53,6 @@ Best Perf: 1.42509 ms, 102.988 TFlops, 234.086 GB/s
 ./bin/ckProfiler contraction_bilinear         0                 0        2      1       0     0    0     1    1.0   1.0 128 128 128 128 128 128
 ```
-Result (MI100)
-```bash
-a_m_k: dim 4, lengths {128, 128, 128, 128}, strides {2097152, 16384, 128, 1}
-b_k_n: dim 4, lengths {128, 128, 128, 128}, strides {128, 1, 2097152, 16384}
-d_m_n: dim 4, lengths {128, 128, 128, 128}, strides {2097152, 16384, 128, 1}
-e_m_n: dim 4, lengths {128, 128, 128, 128}, strides {2097152, 16384, 128, 1}
-....
-Best Perf: 211.405 ms, 41.6077 TFlops, 15.2372 GB/s
-```
 ## Profile batched gemm multiple D kernels
 ```bash
 #arg1: tensor operation (batched_gemm_multi_d=Batched GEMM multi D);
@@ -99,14 +71,6 @@ Best Perf: 211.405 ms, 41.6077 TFlops, 15.2372 GB/s
 ./bin/ckProfiler batched_gemm_multi_d         0       1       0     0    0     1 4096 4096 4096    4096    4096    4096     16777216     16777216     16777216         16
 ```
-Result (Radeon RX 6800 XT)
-```bash
-arg.a_grid_desc_k0_m0_m1_k1_{2048, 4096, 2}
-arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2}
-arg.e_grid_desc_m_n_{ 4096, 4096}
-....
-Best Perf: 58.0306 ms, 37.8942 TFlops, 27.7545 GB/s
-```
 ## Profile grouped convolution backward data kernels
 ```bash
 # arg1: tensor operation (grouped_conv_bwd_data: Grouped Convolution Backward Data)
@@ -134,20 +98,6 @@ Best Perf: 58.0306 ms, 37.8942 TFlops, 27.7545 GB/s
 ```
-Result (MI100, FP16, GNHWC_GKYXC_GNHWK)
-```bash
-out: dim 5, lengths {32, 4, 192, 28, 28}, strides {602112, 150528, 1, 5376, 192}
-wei: dim 5, lengths {32, 192, 192, 3, 3}, strides {331776, 1728, 1, 576, 192}
-in: dim 5, lengths {32, 4, 192, 28, 28}, strides {602112, 150528, 1, 5376, 192}
-....
-Best configuration parameters:
-name: DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 2, Default, 32, 32, 2, 4, 8, 4, 1, 1>
-avg_time: 0.768321
-tflops: 86.6679
-GB/s: 127.947
-```
 ## Profile grouped convolution backward weight kernels
 ```bash
 # arg1: tensor operation (grouped_conv_bwd_weight: Grouped Convolution Backward Weight)
@@ -179,19 +129,6 @@ GB/s: 127.947
 ```
-Result (MI100, FP16, GNHWC_GKYXC_GNHWK)
-```bash
-input: dim 5, lengths {32, 512, 1024, 28, 28}, strides {411041792, 802816, 1, 28672, 1024}
-weight: dim 5, lengths {32, 512, 1024, 3, 3}, strides {4718592, 9216, 1, 3072, 1024}
-output: dim 5, lengths {32, 512, 512, 26, 26}, strides {177209344, 346112, 1, 13312, 512}
-....
-Best configuration parameters:
-name: DeviceGroupedConvBwdWeight_Xdl_CShuffle<256, 256, 128, 4, Default, 8, 4, 2, 8, 4, 8, 2, 1, 1, 8>
-avg_time: 68.5216
-tflops: 95.337
-GB/s: 69.2301
-```
 Note: This kernel uses atomic add, which causes the output buffer to be accumulated multiple times and makes verification fail. To work around this, do not use CK's own timer and verification at the same time.
 ## Profile image to column/column to image kernels
@@ -224,17 +161,6 @@ Note: This kernel use atomic add, this will cause output buffer to be accumulate
 ```
-Result (MI210, FP32, NHWC)
-```bash
-input: dim 5, lengths {1, 256, 512, 28, 28}, strides {102760448, 401408, 1, 14336, 512}
-output: dim 2, lengths {173056, 4608}, strides {4608, 1}
-....
-Best configuration parameters:
-name: DeviceImageToColumn<128, 32, 64, 4>
-avg_time: 3.12326
-GB/s: 2042.59
-```
 Note: The column to image kernel adds to the output memory, which causes the output buffer to be accumulated multiple times and makes verification fail. To work around this, do not use CK's own timer and verification at the same time.
 ## Profile Permute scale kernels
@@ -254,12 +180,3 @@ Note: Column to image kernel adds to the output memory, this will cause output b
 ################            op datatype  verify  init  log  time  dim0 dim1 dim2 in_stride0 in_stride1 in_stride2 out_stride0 out_stride1 out_stride2
 ./bin/ckProfiler permute_scale        0       1     1    0     1    64   64   64       4096         64          1           1          64        4096
 ```
-Result (MI100, FP32)
-```bash
-A: dim 3, lengths {64, 64, 64}, strides {4096, 64, 1}
-B: dim 3, lengths {64, 64, 64}, strides {1, 64, 4096}
-....
-Best perf = 0.0146878 ms, 142.782 GB/s, DeviceElementwiseNormalizationImpl<3, 2>
-```
--- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
@@ -188,6 +188,10 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
                                        out_element_op,
                                        split_k);
+        const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+        DeviceMem workspace_dev(workspace_sz);
+        op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            // using atomic add, so need to reset input

--- a/profiler/include/profiler/profile_grouped_gemm_fixed_nk_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_fixed_nk_impl.hpp
@@ -88,11 +88,12 @@ bool profile_grouped_gemm_fixed_nk_impl(int do_verification,
        c_m_n_host_results.push_back(
            Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));
-#if DEBUG_LOG
+        if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
-        std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n[" << i
+        {
-                  << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
+            std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n["
-                  << "]:" << c_m_n_device_results[i].mDesc << std::endl;
+                      << i << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
-#endif // DEBUG_LOG
+                      << "]:" << c_m_n_device_results[i].mDesc << std::endl;
+        }
        std::size_t num_thread = 1;
        switch(init_method)
        {

--- a/profiler/include/profiler/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_impl.hpp
@@ -87,11 +87,12 @@ bool profile_grouped_gemm_impl(int do_verification,
        c_m_n_host_results.push_back(
            Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));
-#if DEBUG_LOG
+        if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
-        std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n[" << i
+        {
-                  << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
+            std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n["
-                  << "]:" << c_m_n_device_results[i].mDesc << std::endl;
+                      << i << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
-#endif // DEBUG_LOG
+                      << "]:" << c_m_n_device_results[i].mDesc << std::endl;
+        }
        std::size_t num_thread = 1;
        switch(init_method)
        {

--- a/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_tile_loop_impl.hpp
@@ -82,11 +82,12 @@ bool profile_grouped_gemm_tile_loop_impl(int do_verification,
            Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));
        c_m_n_host_results.push_back(
            Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));
-#if DEBUG_LOG
+        if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
-        std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n[" << i
+        {
-                  << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
+            std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n["
-                  << "]:" << c_m_n_device_results[i].mDesc << std::endl;
+                      << i << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
-#endif // DEBUG_LOG
+                      << "]:" << c_m_n_device_results[i].mDesc << std::endl;
+        }
        switch(init_method)
        {
        case 0: break;

--- a/profiler/include/profiler/profile_grouped_gemm_two_stage_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_gemm_two_stage_impl.hpp
@@ -88,11 +88,12 @@ bool profile_grouped_gemm_two_stage_impl(int do_verification,
        c_m_n_host_results.push_back(
            Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));
-#if DEBUG_LOG
+        if(ck::EnvIsEnabled(ENV(CK_LOGGING)))
-        std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n[" << i
+        {
-                  << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
+            std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n["
-                  << "]:" << c_m_n_device_results[i].mDesc << std::endl;
+                      << i << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
-#endif // DEBUG_LOG
+                      << "]:" << c_m_n_device_results[i].mDesc << std::endl;
+        }
        std::size_t num_thread = 1;
        switch(init_method)
        {