Merge remote-tracking branch 'origin/develop' into wavelet_model

95a83c6e · Adam Osewski · 5b7c2432 · 892a8d76 · 95a83c6e · 95a83c6e
Commit 95a83c6e authored Nov 18, 2022 by Adam Osewski
20 changed files
--- a/profiler/include/profile_convnd_bwd_weight_impl.hpp
+++ b/profiler/include/profile_convnd_bwd_weight_impl.hpp
@@ -433,7 +433,7 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
        {
            wei_device_buf.FromDevice(weights_device_result.mData.data());
-            success = ck::utils::check_err(weights_host_result.mData, weights_device_result.mData);
+            success = ck::utils::check_err(weights_host_result, weights_device_result);
            if(success == false)
            {

--- a/profiler/include/profile_elementwise_layernorm_impl.hpp
+++ b/profiler/include/profile_elementwise_layernorm_impl.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <iomanip>
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
+namespace ck {
+namespace profiler {
+/// Applies a binary functor element-wise over a 2D region of host tensors.
+///
+/// For every (row, col) with row < shape[0] and col < shape[1] this calls
+/// functor(out, A(row, col), B(row, col)) and stores `out` into C(row, col).
+///
+/// @param C       Output tensor, written through operator()(row, col).
+/// @param A       First input tensor, read through operator()(row, col).
+/// @param B       Second input tensor, read through operator()(row, col).
+/// @param shape   Iteration extents; shape[0] is the row count, shape[1] the column count.
+/// @param functor Callable invoked as functor(out, a, b); `out` is passed as a local lvalue.
+template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
+void host_elementwise2D(HostTensorC& C,
+                        const HostTensorA& A,
+                        const HostTensorB& B,
+                        const std::vector<std::size_t>& shape,
+                        Functor functor)
+{
+    // Element type of C: whatever C's element accessor yields, with the reference stripped.
+    using OutElem = ck::remove_reference_t<decltype(C(0, 0))>;
+    for(std::size_t row = 0; row < shape[0]; ++row)
+    {
+        for(std::size_t col = 0; col < shape[1]; ++col)
+        {
+            auto lhs = A(row, col);
+            auto rhs = B(row, col);
+            // The output slot starts at zero before the functor fills it in.
+            OutElem result = 0;
+            functor(result, lhs, rhs);
+            C(row, col) = result;
+        }
+    }
+}
+/// Profiles (and optionally verifies) every registered DeviceElementwiseNormalization
+/// instance on a 2D problem: element-wise Add of `a` and `b`, followed by layer
+/// normalization reducing over dimension 1, using `gamma` and `beta`.
+///
+/// @param do_verification Non-zero: compute a host reference once (host Add followed by
+///                        ReferenceLayernorm) and compare each supported instance's output
+///                        against it with tolerances 1e-3 / 1e-3.
+/// @param init_method     Selects the input generators: 0 -> GeneratorTensor_1,
+///                        1 -> GeneratorTensor_2{-5, 5}, anything else -> GeneratorTensor_3.
+/// @param do_log          When verifying, dump a, b, host_y and y to std::cout.
+/// @param time_kernel     Forwarded to StreamConfig; also gates the per-instance and summary
+///                        performance output.
+/// @param length          Problem lengths {M, N}; must contain exactly two entries.
+/// @return false if `length` is not 2D, if any supported instance fails verification, or if
+///         no instance supports the argument; true otherwise.
+template <typename ADataType,
+          typename BDataType,
+          typename GammaDataType,
+          typename BetaDataType,
+          typename AccDataType,
+          typename YDataType>
+bool profile_elementwise_layernorm_impl(int do_verification,
+                                        int init_method,
+                                        bool do_log,
+                                        bool time_kernel,
+                                        std::vector<index_t> length)
+{
+    using Add         = ck::tensor_operation::element_wise::Add;
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+    // Only 2D problems are handled here.
+    if(length.size() != 2)
+        return false;
+    index_t M      = length[0];
+    index_t N      = length[1];
+    index_t Stride = N;
+    constexpr int Rank         = 2;
+    constexpr int NumReduceDim = 1;
+    std::vector<index_t> reduce_dim      = {1};
+    std::vector<index_t> gammaBetaLength = {N};
+    // Stride 0 along dimension 0: the same gamma/beta vector is reused for every row.
+    std::vector<index_t> gammaBetaStride = {0, 1};
+    auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) {
+        using namespace ck::literals;
+        return HostTensorDescriptor({row, col}, {stride, 1_uz});
+    };
+    Tensor<ADataType> a(length);
+    Tensor<BDataType> b(length);
+    Tensor<GammaDataType> gamma(gammaBetaLength);
+    Tensor<BetaDataType> beta(gammaBetaLength);
+    Tensor<YDataType> y(length);
+    Tensor<YDataType> host_y(length);
+    switch(init_method)
+    {
+    case 0:
+        a.GenerateTensorValue(GeneratorTensor_1<ADataType>{});
+        b.GenerateTensorValue(GeneratorTensor_1<BDataType>{});
+        gamma.GenerateTensorValue(GeneratorTensor_1<GammaDataType>{});
+        beta.GenerateTensorValue(GeneratorTensor_1<BetaDataType>{});
+        break;
+    case 1:
+        a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        gamma.GenerateTensorValue(GeneratorTensor_2<GammaDataType>{-5, 5});
+        beta.GenerateTensorValue(GeneratorTensor_2<BetaDataType>{-5, 5});
+        break;
+    default:
+        a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0, 1});
+        b.GenerateTensorValue(GeneratorTensor_3<BDataType>{0, 1});
+        gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{-0.5, 0.5});
+        beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{-0.5, 0.5});
+    }
+    // Each device buffer is sized with the element type of the tensor it mirrors.
+    DeviceMem a_dev(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
+    DeviceMem b_dev(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
+    DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
+    DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
+    DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
+    a_dev.ToDevice(a.mData.data());
+    b_dev.ToDevice(b.mData.data());
+    gamma_dev.ToDevice(gamma.mData.data());
+    beta_dev.ToDevice(beta.mData.data());
+    std::array<const void*, 2> input = {a_dev.GetDeviceBuffer(), b_dev.GetDeviceBuffer()};
+    // add device normalization instances
+    using DeviceOp = ck::tensor_operation::device::DeviceElementwiseNormalization<
+        ck::Tuple<ADataType, BDataType>,
+        GammaDataType,
+        BetaDataType,
+        AccDataType,
+        YDataType,
+        Add,
+        PassThrough,
+        Rank,
+        NumReduceDim>;
+    // get device op instances
+    const auto instance_ptrs =
+        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+            DeviceOp>::GetInstances();
+    std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
+    std::string best_instance_name;
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    if(do_verification)
+    {
+        // Host reference, computed once: x = a + b, then host_y = layernorm(x).
+        using XDataType             = ADataType;
+        std::vector<std::size_t> mn = {static_cast<std::size_t>(M),
+                                       static_cast<std::size_t>(N)};
+        Tensor<XDataType> x(f_host_tensor_descriptor2d(M, N, Stride));
+        host_elementwise2D<Tensor<ADataType>, Tensor<BDataType>, Tensor<XDataType>, Add>(
+            x, a, b, mn, Add{});
+        using ReferenceInstance = ck::tensor_operation::host::ReferenceLayernorm<XDataType,
+                                                                                 GammaDataType,
+                                                                                 BetaDataType,
+                                                                                 YDataType,
+                                                                                 AccDataType,
+                                                                                 PassThrough,
+                                                                                 Rank,
+                                                                                 NumReduceDim>;
+        ReferenceInstance ref;
+        auto ref_argument =
+            ref.MakeArgument(x, gamma, beta, host_y, PassThrough{}, {M, N}, {1}, 1e-4);
+        auto ref_invoker = ref.MakeInvoker();
+        ref_invoker.Run(ref_argument);
+    }
+    // Number of instances that accepted the argument and were actually run.
+    int num_kernel = 0;
+    for(auto& inst_ptr : instance_ptrs)
+    {
+        auto argument_ptr = inst_ptr->MakeArgumentPointer(
+            length,
+            {
+                std::vector<ck::index_t>{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()},
+                std::vector<ck::index_t>{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()},
+            },
+            gammaBetaStride,
+            gammaBetaStride,
+            std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
+            reduce_dim,
+            1e-4,
+            input,
+            gamma_dev.GetDeviceBuffer(),
+            beta_dev.GetDeviceBuffer(),
+            y_dev.GetDeviceBuffer(),
+            Add{},
+            PassThrough{});
+        // Skip instances that cannot handle this problem; they do not count as tested.
+        if(!inst_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            continue;
+        }
+        ++num_kernel;
+        auto invoker_ptr = inst_ptr->MakeInvokerPointer();
+        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+        // Bytes counted for the bandwidth figure: all four inputs plus the output.
+        std::size_t num_bytes = a.mDesc.GetElementSize() * sizeof(ADataType) +
+                                b.mDesc.GetElementSize() * sizeof(BDataType) +
+                                gamma.mDesc.GetElementSize() * sizeof(GammaDataType) +
+                                beta.mDesc.GetElementSize() * sizeof(BetaDataType) +
+                                y.mDesc.GetElementSize() * sizeof(YDataType);
+        // bytes / 1e6 / ms == GB/s
+        float gb_per_sec = num_bytes / 1.E6 / avg_time;
+        if(time_kernel)
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
+                      << inst_ptr->GetTypeString() << std::endl;
+        if(avg_time < best_avg_time)
+        {
+            best_instance_name = inst_ptr->GetTypeString();
+            best_avg_time      = avg_time;
+            best_gb_per_sec    = gb_per_sec;
+        }
+        if(do_verification)
+        {
+            y_dev.FromDevice(y.mData.data());
+            bool pass =
+                ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3);
+            if(do_log)
+            {
+                LogRangeAsType<float>(std::cout << "a  : ", a.mData, ",") << std::endl;
+                LogRangeAsType<float>(std::cout << "b  : ", b.mData, ",") << std::endl;
+                LogRangeAsType<float>(std::cout << "host_y  : ", host_y.mData, ",") << std::endl;
+                LogRangeAsType<float>(std::cout << "y  : ", y.mData, ",") << std::endl;
+            }
+            // The first failing instance aborts the whole profiling run.
+            if(!pass)
+            {
+                std::cout << inst_ptr->GetTypeString() << " failed verification: ";
+                LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl;
+                return false;
+            }
+            else
+            {
+                if(time_kernel)
+                    std::cout << "pass" << std::endl;
+            }
+        }
+    }
+    if(time_kernel)
+    {
+        LogRange(std::cout << "length = ", length, ",") << ", ";
+        std::cout << "num_kernel = " << num_kernel << ", best perf = " << best_avg_time << " ms, "
+                  << best_gb_per_sec << " GB/s, " << best_instance_name << std::endl;
+    }
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is tested" << std::endl;
+        return false;
+    }
+    return true;
+}
+} // namespace profiler
+} // namespace ck
--- a/profiler/include/profile_gemm_add_add_fastgelu_impl.hpp
+++ b/profiler/include/profile_gemm_add_add_fastgelu_impl.hpp
@@ -16,6 +16,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 namespace ck {
@@ -47,15 +48,15 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
 {
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-                                            std::vector<std::size_t>({stride, 1}));
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
-                                            std::vector<std::size_t>({1, stride}));
            }
        };
@@ -121,8 +122,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
    // run reference
    if(do_verification)
    {
-        Tensor<AccDataType> c_m_n(HostTensorDescriptor(
+        Tensor<AccDataType> c_m_n({M, N});
-            std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                BDataType,
@@ -223,8 +223,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
            {
                e_device_buf.FromDevice(e_m_n_device_result.mData.data());
-                pass = pass &&
+                pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
-                       ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
            }
        }
        else

--- a/profiler/include/profile_gemm_bias_add_reduce_impl.hpp
+++ b/profiler/include/profile_gemm_bias_add_reduce_impl.hpp
@@ -14,6 +14,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 namespace ck {
@@ -75,21 +76,20 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
                                       int StrideD0)
 {
    auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
-        return HostTensorDescriptor(std::vector<std::size_t>({len}),
+        return HostTensorDescriptor({len}, {stride});
-                                    std::vector<std::size_t>({stride}));
    };
    auto f_host_tensor_descriptor2d =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-                                            std::vector<std::size_t>({stride, 1}));
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
-                                            std::vector<std::size_t>({1, stride}));
            }
        };
@@ -99,16 +99,12 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
    Tensor<BiasDataType> bias_n(f_host_tensor_descriptor1d(N, 1));
    Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_host_result(
+    Tensor<ReduceDataType> reduce0_m_host_result({M});
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> reduce1_m_host_result({M});
-    Tensor<ReduceDataType> reduce1_m_host_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_device_result(
+    Tensor<ReduceDataType> reduce0_m_device_result({M});
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> reduce1_m_device_result({M});
-    Tensor<ReduceDataType> reduce1_m_device_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
@@ -347,9 +343,9 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
                reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data());
                reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data());
-                ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+                ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
-                ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData);
+                ck::utils::check_err(reduce0_m_device_result, reduce0_m_host_result);
-                ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData);
+                ck::utils::check_err(reduce1_m_device_result, reduce1_m_host_result);
                if(do_log)
                {

--- a/profiler/include/profile_gemm_bilinear_impl.hpp
+++ b/profiler/include/profile_gemm_bilinear_impl.hpp
@@ -16,6 +16,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 namespace ck {
@@ -46,15 +47,15 @@ bool profile_gemm_bilinear_impl(int do_verification,
 {
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-                                            std::vector<std::size_t>({stride, 1}));
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
-                                            std::vector<std::size_t>({1, stride}));
            }
        };
@@ -116,8 +117,7 @@ bool profile_gemm_bilinear_impl(int do_verification,
    // run reference
    if(do_verification)
    {
-        Tensor<AccDataType> c_m_n(HostTensorDescriptor(
+        Tensor<AccDataType> c_m_n({M, N});
-            std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                BDataType,
@@ -215,8 +215,7 @@ bool profile_gemm_bilinear_impl(int do_verification,
            {
                e_device_buf.FromDevice(e_m_n_device_result.mData.data());
-                pass = pass &&
+                pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
-                       ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
            }
        }
        else

--- a/profiler/include/profile_gemm_impl.hpp
+++ b/profiler/include/profile_gemm_impl.hpp
@@ -18,6 +18,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 namespace ck {
@@ -45,15 +46,15 @@ int profile_gemm_impl(int do_verification,
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-                                            std::vector<std::size_t>({stride, 1}));
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
-                                            std::vector<std::size_t>({1, stride}));
            }
        };
@@ -187,8 +188,7 @@ int profile_gemm_impl(int do_verification,
            {
                c_device_buf.FromDevice(c_m_n_device_result.mData.data());
-                pass =
+                pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
-                    pass & ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
                if(do_log)
                {

--- a/profiler/include/profile_gemm_reduce_impl.hpp
+++ b/profiler/include/profile_gemm_reduce_impl.hpp
@@ -14,6 +14,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 namespace ck {
@@ -75,15 +76,15 @@ bool profile_gemm_reduce_impl(int do_verification,
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-                                            std::vector<std::size_t>({stride, 1}));
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
-                                            std::vector<std::size_t>({1, stride}));
            }
        };
@@ -91,16 +92,12 @@ bool profile_gemm_reduce_impl(int do_verification,
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_host_result(
+    Tensor<ReduceDataType> reduce0_m_host_result({M});
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> reduce1_m_host_result({M});
-    Tensor<ReduceDataType> reduce1_m_host_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-    Tensor<ReduceDataType> reduce0_m_device_result(
+    Tensor<ReduceDataType> reduce0_m_device_result({M});
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
+    Tensor<ReduceDataType> reduce1_m_device_result({M});
-    Tensor<ReduceDataType> reduce1_m_device_result(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
@@ -313,9 +310,9 @@ bool profile_gemm_reduce_impl(int do_verification,
                reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data());
                reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data());
-                ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+                ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
-                ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData);
+                ck::utils::check_err(reduce0_m_device_result, reduce0_m_host_result);
-                ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData);
+                ck::utils::check_err(reduce1_m_device_result, reduce1_m_host_result);
                if(do_log)
                {

--- a/profiler/include/profile_gemm_splitk_impl.hpp
+++ b/profiler/include/profile_gemm_splitk_impl.hpp
@@ -18,6 +18,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 namespace ck {
@@ -46,15 +47,15 @@ bool profile_gemm_splitk_impl(int do_verification,
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-                                            std::vector<std::size_t>({stride, 1}));
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
-                                            std::vector<std::size_t>({1, stride}));
            }
        };
@@ -190,8 +191,7 @@ bool profile_gemm_splitk_impl(int do_verification,
            {
                c_device_buf.FromDevice(c_m_n_device_result.mData.data());
-                pass =
+                pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
-                    pass & ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
                if(do_log)
                {

--- a/profiler/include/profile_conv_bwd_weight_impl.hpp
+++ b/profiler/include/profile_conv_bwd_weight_impl.hpp
@@ -3,9 +3,10 @@
 #pragma once
-#include "ck/ck.hpp"
+#include <algorithm>
 #include <iomanip>
 #include <iostream>
+#include <iterator>
 #include <typeinfo>
 #include "ck/ck.hpp"
@@ -13,7 +14,7 @@
 #include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/library/tensor_operation_instance/gpu/convolution_backward_weight.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -26,32 +27,6 @@
 namespace ck {
 namespace profiler {
-template <typename DataType>
-void show_data_nhwc_layout(Tensor<DataType>& nhwc)
-{
-    std::cout << "[";
-    for(int n = 0; n < ck::type_convert<int>(nhwc.mDesc.GetLengths()[0]); n++)
-    {
-        std::cout << "[";
-        for(int hi = 0; hi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[2]); hi++)
-        {
-            std::cout << "[";
-            for(int wi = 0; wi < ck::type_convert<int>(nhwc.mDesc.GetLengths()[3]); wi++)
-            {
-                std::cout << "[";
-                for(int c = 0; c < ck::type_convert<int>(nhwc.mDesc.GetLengths()[1]); c++)
-                {
-                    std::cout << static_cast<float>(nhwc(n, c, hi, wi)) << "  ";
-                }
-                std::cout << "]";
-            }
-            std::cout << "]";
-        }
-        std::cout << "]";
-    }
-    std::cout << "]";
-}
 template <ck::index_t NDimSpatial,
          typename InLayout,
          typename WeiLayout,
@@ -59,12 +34,12 @@ template <ck::index_t NDimSpatial,
          typename InDataType,
          typename WeiDataType,
          typename OutDataType>
-bool profile_conv_bwd_weight_impl(int do_verification,
+bool profile_grouped_conv_bwd_weight_impl(int do_verification,
-                                  int init_method,
+                                          int init_method,
-                                  bool do_log,
+                                          bool do_log,
-                                  bool time_kernel,
+                                          bool time_kernel,
-                                  const ck::utils::conv::ConvParam& conv_param,
+                                          const ck::utils::conv::ConvParam& conv_param,
-                                  ck::index_t split_k)
+                                          ck::index_t split_k)
 {
    using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
    using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -114,16 +89,14 @@ bool profile_conv_bwd_weight_impl(int do_verification,
    if(do_verification)
    {
-        auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
+        auto ref_conv     = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
                                                                           InDataType,
                                                                           WeiDataType,
                                                                           OutDataType,
                                                                           InElementOp,
                                                                           WeiElementOp,
                                                                           OutElementOp>{};
+        auto ref_invoker  = ref_conv.MakeInvoker();
-        auto ref_invoker = ref_conv.MakeInvoker();
        auto ref_argument = ref_conv.MakeArgument(input,
                                                  weight_host_result,
                                                  output,
@@ -138,16 +111,16 @@ bool profile_conv_bwd_weight_impl(int do_verification,
        ref_invoker.Run(ref_argument);
    }
-    using DeviceOp = ck::tensor_operation::device::DeviceConvBwdWeight<NDimSpatial,
+    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight<NDimSpatial,
-                                                                       InLayout,
+                                                                              InLayout,
-                                                                       WeiLayout,
+                                                                              WeiLayout,
-                                                                       OutLayout,
+                                                                              OutLayout,
-                                                                       InDataType,
+                                                                              InDataType,
-                                                                       WeiDataType,
+                                                                              WeiDataType,
-                                                                       OutDataType,
+                                                                              OutDataType,
-                                                                       InElementOp,
+                                                                              InElementOp,
-                                                                       WeiElementOp,
+                                                                              WeiElementOp,
-                                                                       OutElementOp>;
+                                                                              OutElementOp>;
    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
@@ -163,22 +136,41 @@ bool profile_conv_bwd_weight_impl(int do_verification,
    // profile device Conv instances
    bool all_pass = true;
+    std::array<ck::index_t, NDimSpatial> input_spatial_lengths{}; // fixed-size, zero-initialized copies of the conv_param fields
+    std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+    auto range_copy = [](const auto& from, auto to) { std::copy(begin(from), end(from), to); }; // copies all of `from` to the output iterator `to`
+    range_copy(conv_param.input_spatial_lengths_, begin(input_spatial_lengths)); // NOTE(review): no bounds check - assumes each conv_param range has at most NDimSpatial elements; confirm
+    range_copy(conv_param.filter_spatial_lengths_, begin(filter_spatial_lengths));
+    range_copy(conv_param.output_spatial_lengths_, begin(output_spatial_lengths));
+    range_copy(conv_param.conv_filter_strides_, begin(conv_filter_strides));
+    range_copy(conv_param.conv_filter_dilations_, begin(conv_filter_dilations));
+    range_copy(conv_param.input_left_pads_, begin(input_left_pads));
+    range_copy(conv_param.input_right_pads_, begin(input_right_pads));
    for(auto& op_ptr : op_ptrs)
    {
        auto argument_ptr =
            op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                        static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                        static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+                                        conv_param.G_,
                                        conv_param.N_,
                                        conv_param.K_,
                                        conv_param.C_,
-                                        conv_param.input_spatial_lengths_,
+                                        input_spatial_lengths,
-                                        conv_param.filter_spatial_lengths_,
+                                        filter_spatial_lengths,
-                                        conv_param.output_spatial_lengths_,
+                                        output_spatial_lengths,
-                                        conv_param.conv_filter_strides_,
+                                        conv_filter_strides,
-                                        conv_param.conv_filter_dilations_,
+                                        conv_filter_dilations,
-                                        conv_param.input_left_pads_,
+                                        input_left_pads,
-                                        conv_param.input_right_pads_,
+                                        input_right_pads,
                                        in_element_op,
                                        wei_element_op,
                                        out_element_op,
@@ -217,33 +209,29 @@ bool profile_conv_bwd_weight_impl(int do_verification,
            {
                wei_device_buf.FromDevice(weight_device_result.mData.data());
-                bool pass =
+                bool pass = ck::utils::check_err(weight_device_result, weight_host_result);
-                    ck::utils::check_err(weight_host_result.mData, weight_device_result.mData);
                if(!pass)
                {
-                    std::cout << "Fail info:" << op_ptr->GetTypeString() << std::endl;
+                    std::cout << "Fail info: " << op_ptr->GetTypeString() << std::endl;
                }
                all_pass &= pass;
                if(do_log)
                {
-                    std::cout << "in : ";
+                    LogRangeAsType<float>(std::cout << "output : ", output.mData, ",") << std::endl;
-                    show_data_nhwc_layout(output);
+                    ;
-                    std::cout << std::endl;
+                    LogRangeAsType<float>(
+                        std::cout << "weight (device): ", weight_device_result.mData, ",")
-                    std::cout << "wei: ";
+                        << std::endl;
-                    show_data_nhwc_layout(weight_host_result);
+                    ;
-                    std::cout << std::endl;
+                    LogRangeAsType<float>(
+                        std::cout << "weight (host): ", weight_host_result.mData, ",")
-                    std::cout << "out  : ";
+                        << std::endl;
-                    show_data_nhwc_layout(input);
+                    ;
-                    std::cout << std::endl;
+                    LogRangeAsType<float>(std::cout << "input: ", input.mData, ",") << std::endl;
+                    ;
-                    std::cout << "wei_device: ";
-                    show_data_nhwc_layout(weight_device_result);
-                    std::cout << std::endl;
                }
            }
        }

--- a/profiler/include/profile_grouped_conv_fwd_impl.hpp
+++ b/profiler/include/profile_grouped_conv_fwd_impl.hpp
@@ -9,11 +9,12 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dl.hpp"
+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
@@ -66,7 +67,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
    std::array<ck::index_t, NDimSpatial> input_left_pads{};
    std::array<ck::index_t, NDimSpatial> input_right_pads{};
-    auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
+    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
@@ -136,25 +137,6 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
        ref_invoker.Run(ref_argument);
    }
-    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NDimSpatial,
-                                                                                 InLayout,
-                                                                                 WeiLayout,
-                                                                                 ck::Tuple<>,
-                                                                                 OutLayout,
-                                                                                 InDataType,
-                                                                                 WeiDataType,
-                                                                                 ck::Tuple<>,
-                                                                                 OutDataType,
-                                                                                 InElementOp,
-                                                                                 WeiElementOp,
-                                                                                 OutElementOp>;
-    // get device op instances
-    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-        DeviceOp>::GetInstances();
-    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
    std::string best_op_name;
    float best_avg_time   = 0;
    float best_tflops     = 0;
@@ -163,29 +145,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
    // profile device op instances
    bool pass = true;
-    for(auto& op_ptr : op_ptrs)
+    auto run_impl = [&](auto& op_ptr, auto& argument_ptr) {
-    {
-        auto argument_ptr =
-            op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
-                                        wei_device_buf.GetDeviceBuffer(),
-                                        std::array<const void*, 0>{},
-                                        out_device_buf.GetDeviceBuffer(),
-                                        a_g_n_c_wis_lengths,
-                                        a_g_n_c_wis_strides,
-                                        b_g_k_c_xs_lengths,
-                                        b_g_k_c_xs_strides,
-                                        std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
-                                        std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
-                                        e_g_n_k_wos_lengths,
-                                        e_g_n_k_wos_strides,
-                                        conv_filter_strides,
-                                        conv_filter_dilations,
-                                        input_left_pads,
-                                        input_right_pads,
-                                        in_element_op,
-                                        wei_element_op,
-                                        out_element_op);
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            // re-init output to zero before profiling next kernel
@@ -220,7 +180,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
            {
                out_device_buf.FromDevice(device_output.mData.data());
-                pass = pass & ck::utils::check_err(device_output.mData, host_output.mData);
+                pass = pass & ck::utils::check_err(device_output, host_output);
                if(do_log)
                {
@@ -237,6 +197,95 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
        {
            std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
        }
+    };
+    // xdl: build an argument for every DeviceGroupedConvFwdMultipleD instance and pass it to run_impl
+    {
+        using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NDimSpatial,
+                                                                                     InLayout,
+                                                                                     WeiLayout,
+                                                                                     ck::Tuple<>,
+                                                                                     OutLayout,
+                                                                                     InDataType,
+                                                                                     WeiDataType,
+                                                                                     ck::Tuple<>,
+                                                                                     OutDataType,
+                                                                                     InElementOp,
+                                                                                     WeiElementOp,
+                                                                                     OutElementOp>;
+        // get every registered instance of DeviceOp from the instance factory
+        const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+            DeviceOp>::GetInstances();
+        std::cout << "xdl found " << op_ptrs.size() << " instances" << std::endl;
+        for(auto& op_ptr : op_ptrs)
+        {
+            auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
+                                                            wei_device_buf.GetDeviceBuffer(),
+                                                            {}, // empty: the pre-change call passed std::array<const void*, 0>{} here
+                                                            out_device_buf.GetDeviceBuffer(),
+                                                            a_g_n_c_wis_lengths,
+                                                            a_g_n_c_wis_strides,
+                                                            b_g_k_c_xs_lengths,
+                                                            b_g_k_c_xs_strides,
+                                                            {}, // empty: previously a zero-length array of NDimSpatial + 3 sized index arrays
+                                                            {}, // empty: same as the argument above
+                                                            e_g_n_k_wos_lengths,
+                                                            e_g_n_k_wos_strides,
+                                                            conv_filter_strides,
+                                                            conv_filter_dilations,
+                                                            input_left_pads,
+                                                            input_right_pads,
+                                                            in_element_op,
+                                                            wei_element_op,
+                                                            out_element_op);
+            run_impl(op_ptr, argument_ptr);
+        }
+    }
+    // dl: build an argument for every DeviceGroupedConvFwd instance and pass it to run_impl
+    {
+        using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwd<NDimSpatial,
+                                                                            InLayout,
+                                                                            WeiLayout,
+                                                                            OutLayout,
+                                                                            InDataType,
+                                                                            WeiDataType,
+                                                                            OutDataType,
+                                                                            InElementOp,
+                                                                            WeiElementOp,
+                                                                            OutElementOp>;
+        // get every registered instance of DeviceOp from the instance factory
+        const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+            DeviceOp>::GetInstances();
+        std::cout << "dl found " << op_ptrs.size() << " instances" << std::endl;
+        for(auto& op_ptr : op_ptrs)
+        {
+            auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
+                                                            wei_device_buf.GetDeviceBuffer(),
+                                                            out_device_buf.GetDeviceBuffer(),
+                                                            a_g_n_c_wis_lengths,
+                                                            a_g_n_c_wis_strides,
+                                                            b_g_k_c_xs_lengths,
+                                                            b_g_k_c_xs_strides,
+                                                            e_g_n_k_wos_lengths,
+                                                            e_g_n_k_wos_strides,
+                                                            conv_filter_strides,
+                                                            conv_filter_dilations,
+                                                            input_left_pads,
+                                                            input_right_pads,
+                                                            in_element_op,
+                                                            wei_element_op,
+                                                            out_element_op);
+            run_impl(op_ptr, argument_ptr); // same verify/profile path as the xdl instances
+        }
    }
    std::cout << "Best configuration parameters:"

--- a/profiler/include/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profile_grouped_gemm_impl.hpp
@@ -17,6 +17,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 namespace ck {
@@ -45,15 +46,15 @@ bool profile_grouped_gemm_impl(int do_verification,
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
            if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-                                            std::vector<std::size_t>({stride, 1}));
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
-                                            std::vector<std::size_t>({1, stride}));
            }
        };
@@ -257,8 +258,7 @@ bool profile_grouped_gemm_impl(int do_verification,
                                                              c_element_op);
                    ref_invoker.Run(ref_argument);
-                    pass = pass && ck::utils::check_err(c_m_n_device_results[i].mData,
+                    pass = pass && ck::utils::check_err(c_m_n_device_results[i], c_m_n_host_result);
-                                                        c_m_n_host_result.mData);
                    if(do_log)
                    {

--- a/profiler/include/profile_groupnorm_impl.hpp
+++ b/profiler/include/profile_groupnorm_impl.hpp
@@ -126,6 +126,8 @@ bool profile_groupnorm_impl(int do_verification,
            gamma_dev.GetDeviceBuffer(),
            beta_dev.GetDeviceBuffer(),
            y_dev.GetDeviceBuffer(),
+            nullptr,
+            nullptr,
            PassThrough{});
        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
@@ -163,8 +165,7 @@ bool profile_groupnorm_impl(int do_verification,
        {
            y_dev.FromDevice(y.mData.data());
-            bool pass =
+            bool pass = ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3);
-                ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3);
            if(do_log)
            {
@@ -196,7 +197,7 @@ bool profile_groupnorm_impl(int do_verification,
    if(num_kernel == 0)
    {
-        std::cout << "Error: No kernel is tested" << std::endl;
+        std::cout << "Error: No kernel is applicable" << std::endl;
        return false;
    }

--- a/profiler/include/profile_layernorm_impl.hpp
+++ b/profiler/include/profile_layernorm_impl.hpp
@@ -22,7 +22,7 @@ template <typename XDataType,
          typename AccDataType,
          typename YDataType,
          index_t Rank>
-void profile_layernorm_impl(int do_verification,
+bool profile_layernorm_impl(int do_verification,
                            int init_method,
                            bool do_log,
                            bool time_kernel,
@@ -31,7 +31,7 @@ void profile_layernorm_impl(int do_verification,
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    if(length.size() < 2)
-        return;
+        return false;
    // Assume normalize dimension except for batch (first) dimension
    std::vector<index_t> reduce_length{length.begin() + 1, length.end()};
@@ -52,7 +52,6 @@ void profile_layernorm_impl(int do_verification,
    switch(init_method)
    {
-    // case 0: break;
    case 0:
        x.GenerateTensorValue(GeneratorTensor_1<XDataType>{});
        gamma.GenerateTensorValue(GeneratorTensor_1<GammaDataType>{});
@@ -122,6 +121,8 @@ void profile_layernorm_impl(int do_verification,
        ref_invoker.Run(ref_argument);
    }
+    int num_kernel = 0;
    for(auto& inst_ptr : instance_ptrs)
    {
        auto argument_ptr = inst_ptr->MakeArgumentPointer(length,
@@ -135,12 +136,21 @@ void profile_layernorm_impl(int do_verification,
                                                          gamma_dev.GetDeviceBuffer(),
                                                          beta_dev.GetDeviceBuffer(),
                                                          y_dev.GetDeviceBuffer(),
+                                                          nullptr,
+                                                          nullptr,
                                                          PassThrough{});
-        if(!inst_ptr->IsSupportedArgument(argument_ptr.get()))
+        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            ++num_kernel;
+        }
+        else
        {
-            std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
+            if(time_kernel)
-            LogRange(std::cout << "input lengths = ", length, ", ") << std::endl;
+            {
+                std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
+                LogRange(std::cout << "input lengths = ", length, ", ") << std::endl;
+            }
            continue;
        }
@@ -156,8 +166,9 @@ void profile_layernorm_impl(int do_verification,
        float gb_per_sec = num_bytes / 1.E6 / avg_time;
-        std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
+        if(time_kernel)
-                  << inst_ptr->GetTypeString() << std::endl;
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
+                      << inst_ptr->GetTypeString() << std::endl;
        if(avg_time < best_avg_time)
        {
@@ -184,20 +195,32 @@ void profile_layernorm_impl(int do_verification,
            {
                std::cout << inst_ptr->GetTypeString() << " failed verification: ";
                LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl;
-                return;
+                return false;
            }
            else
            {
-                std::cout << "pass" << std::endl;
+                if(time_kernel)
+                    std::cout << "pass" << std::endl;
            }
        }
    }
-    LogRange(std::cout << "length = ", length, ",") << ", ";
+    if(time_kernel)
-    LogRange(std::cout << "stride = ", strideXY, ",") << ", ";
+    {
-    LogRange(std::cout << "reduce dims ", reduce_dim, ",") << std::endl;
+        LogRange(std::cout << "length = ", length, ",") << ", ";
-    std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
+        LogRange(std::cout << "stride = ", strideXY, ",") << ", ";
-              << best_instance_name << std::endl;
+        LogRange(std::cout << "reduce dims ", reduce_dim, ",") << std::endl;
+        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_instance_name << std::endl;
+    }
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return false;
+    }
+    return true;
 }
 } // namespace profiler

--- a/profiler/include/profile_reduce_impl.hpp
+++ b/profiler/include/profile_reduce_impl.hpp
@@ -6,8 +6,9 @@
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/tensor_operation/gpu/device/device_reduce.hpp"
-#include "ck/library/utility/check_err.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp"
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_reduction.hpp"
 #include "ck/library/utility/host_common_util.hpp"
@@ -18,57 +19,61 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
-template <int Rank, int NumReduceDim, int ReduceOpId, bool PropagateNan, bool UseIndex>
+template <index_t Rank,
+          index_t NumReduceDim,
+          ReduceTensorOp ReduceOpId,
+          bool PropagateNan,
+          bool UseIndex> // compile-time tag type: its only content is the template arguments re-exposed as static members
 struct ReduceDescription
 {
-    static constexpr int Rank_         = Rank;
+    static constexpr index_t Rank_              = Rank; // read via an instance in description_match
-    static constexpr int NumReduceDim_ = NumReduceDim;
+    static constexpr index_t NumReduceDim_      = NumReduceDim; // read via the type in description_match
-    static constexpr int ReduceOpId_   = ReduceOpId;
+    static constexpr ReduceTensorOp ReduceOpId_ = ReduceOpId; // enum-typed, so description_match compares without a cast
-    static constexpr int PropagateNan_ = PropagateNan;
+    static constexpr bool PropagateNan_         = PropagateNan; // stored as bool, matching the template parameter type
-    static constexpr int UseIndex_     = UseIndex;
+    static constexpr bool UseIndex_             = UseIndex; // stored as bool, matching the template parameter type
 };
 using reduce_description_instances =
-    std::tuple<ReduceDescription<4, 3, 0, false, false>, // for ADD
+    std::tuple<ReduceDescription<4, 3, ReduceTensorOp::ADD, false, false>, // for ADD
-               ReduceDescription<4, 4, 0, false, false>,
+               ReduceDescription<4, 4, ReduceTensorOp::ADD, false, false>,
-               ReduceDescription<4, 1, 0, false, false>,
+               ReduceDescription<4, 1, ReduceTensorOp::ADD, false, false>,
-               ReduceDescription<2, 1, 0, false, false>,
+               ReduceDescription<2, 1, ReduceTensorOp::ADD, false, false>,
-               ReduceDescription<4, 3, 5, false, false>, // for AVG
+               ReduceDescription<4, 3, ReduceTensorOp::AVG, false, false>, // for AVG
-               ReduceDescription<4, 4, 5, false, false>,
+               ReduceDescription<4, 4, ReduceTensorOp::AVG, false, false>,
-               ReduceDescription<4, 1, 5, false, false>,
+               ReduceDescription<4, 1, ReduceTensorOp::AVG, false, false>,
-               ReduceDescription<2, 1, 5, false, false>,
+               ReduceDescription<2, 1, ReduceTensorOp::AVG, false, false>,
-               ReduceDescription<4, 3, 7, false, false>, // for NORM2
+               ReduceDescription<4, 3, ReduceTensorOp::NORM2, false, false>, // for NORM2
-               ReduceDescription<4, 4, 7, false, false>,
+               ReduceDescription<4, 4, ReduceTensorOp::NORM2, false, false>,
-               ReduceDescription<4, 1, 7, false, false>,
+               ReduceDescription<4, 1, ReduceTensorOp::NORM2, false, false>,
-               ReduceDescription<2, 1, 7, false, false>,
+               ReduceDescription<2, 1, ReduceTensorOp::NORM2, false, false>,
-               ReduceDescription<4, 3, 2, false, false>, // for MIN
+               ReduceDescription<4, 3, ReduceTensorOp::MIN, false, false>, // for MIN
-               ReduceDescription<4, 4, 2, false, false>,
+               ReduceDescription<4, 4, ReduceTensorOp::MIN, false, false>,
-               ReduceDescription<4, 1, 2, false, false>,
+               ReduceDescription<4, 1, ReduceTensorOp::MIN, false, false>,
-               ReduceDescription<2, 1, 2, false, false>,
+               ReduceDescription<2, 1, ReduceTensorOp::MIN, false, false>,
-               ReduceDescription<4, 3, 3, false, false>, // for MAX
+               ReduceDescription<4, 3, ReduceTensorOp::MAX, false, false>, // for MAX
-               ReduceDescription<4, 4, 3, false, false>,
+               ReduceDescription<4, 4, ReduceTensorOp::MAX, false, false>,
-               ReduceDescription<4, 1, 3, false, false>,
+               ReduceDescription<4, 1, ReduceTensorOp::MAX, false, false>,
-               ReduceDescription<2, 1, 3, false, false>,
+               ReduceDescription<2, 1, ReduceTensorOp::MAX, false, false>,
-               ReduceDescription<4, 3, 4, false, false>, // for AMAX
+               ReduceDescription<4, 3, ReduceTensorOp::AMAX, false, false>, // for AMAX
-               ReduceDescription<4, 4, 4, false, false>,
+               ReduceDescription<4, 4, ReduceTensorOp::AMAX, false, false>,
-               ReduceDescription<4, 1, 4, false, false>,
+               ReduceDescription<4, 1, ReduceTensorOp::AMAX, false, false>,
-               ReduceDescription<2, 1, 4, false, false>,
+               ReduceDescription<2, 1, ReduceTensorOp::AMAX, false, false>,
-               ReduceDescription<4, 3, 2, false, true>, // for MIN
+               ReduceDescription<4, 3, ReduceTensorOp::MIN, false, true>, // for MIN, UseIndex = true
-               ReduceDescription<4, 4, 2, false, true>,
+               ReduceDescription<4, 4, ReduceTensorOp::MIN, false, true>,
-               ReduceDescription<4, 1, 2, false, true>,
+               ReduceDescription<4, 1, ReduceTensorOp::MIN, false, true>,
-               ReduceDescription<2, 1, 2, false, true>,
+               ReduceDescription<2, 1, ReduceTensorOp::MIN, false, true>,
-               ReduceDescription<4, 3, 3, false, true>, // for MAX
+               ReduceDescription<4, 3, ReduceTensorOp::MAX, false, true>, // for MAX, UseIndex = true
-               ReduceDescription<4, 4, 3, false, true>,
+               ReduceDescription<4, 4, ReduceTensorOp::MAX, false, true>,
-               ReduceDescription<4, 1, 3, false, true>,
+               ReduceDescription<4, 1, ReduceTensorOp::MAX, false, true>,
-               ReduceDescription<2, 1, 3, false, true>,
+               ReduceDescription<2, 1, ReduceTensorOp::MAX, false, true>,
-               ReduceDescription<4, 3, 4, false, true>, // for AMAX
+               ReduceDescription<4, 3, ReduceTensorOp::AMAX, false, true>, // for AMAX, UseIndex = true
-               ReduceDescription<4, 4, 4, false, true>,
+               ReduceDescription<4, 4, ReduceTensorOp::AMAX, false, true>,
-               ReduceDescription<4, 1, 4, false, true>,
+               ReduceDescription<4, 1, ReduceTensorOp::AMAX, false, true>,
-               ReduceDescription<2, 1, 4, false, true>>;
+               ReduceDescription<2, 1, ReduceTensorOp::AMAX, false, true>>;
 template <typename DescriptionType>
 bool description_match(const DescriptionType& description,
@@ -78,9 +83,8 @@ bool description_match(const DescriptionType& description,
                       bool PropagateNan,
                       bool UseIndex)
 {
-    if(description.Rank_ != Rank || description.ReduceOpId_ != static_cast<int>(ReduceOpId) ||
+    if(description.Rank_ != Rank || description.ReduceOpId_ != ReduceOpId ||
-       description.PropagateNan_ != static_cast<int>(PropagateNan) ||
+       description.PropagateNan_ != PropagateNan || description.UseIndex_ != UseIndex)
-       description.UseIndex_ != static_cast<int>(UseIndex))
        return (false);
    if(DescriptionType::NumReduceDim_ != reduceDims.size())
@@ -99,11 +103,10 @@ bool description_match(const DescriptionType& description,
 namespace ck {
 namespace profiler {
-template <index_t Rank, index_t NumReduceDim>
+template <int Rank, int NumReduceDim>
-static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
+static inline std::array<int, Rank - NumReduceDim>
+get_invariant_dims(const std::array<int, NumReduceDim>& reduceDims)
 {
-    assert(NumReduceDim == reduceDims.size());
    int reduceFlag = 0;
    // flag the bits for the reduceDims
@@ -112,13 +115,15 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
        reduceFlag |= 1 << reduceDims[i];
    };
-    std::vector<int> invariantDims;
+    std::array<int, Rank - NumReduceDim> invariantDims;
    // collect invariant dimensions
+    int dim = 0;
    for(int i = 0; i < Rank; i++)
        if((reduceFlag & (1 << i)) == 0)
        {
-            invariantDims.push_back(i);
+            invariantDims[dim] = i;
+            dim++;
        };
    return invariantDims;
@@ -137,7 +142,7 @@ bool profile_reduce_impl_impl(bool do_verification,
                              bool do_dumpout,
                              bool time_kernel,
                              const std::vector<size_t>& inLengths,
-                              const std::vector<int>& reduceDims,
+                              const std::array<int, NumReduceDim>& reduceDims,
                              float alpha,
                              float beta)
 {
@@ -145,6 +150,8 @@ bool profile_reduce_impl_impl(bool do_verification,
    using namespace ck::tensor_operation::device::instance;
    using ck::host_common::dumpBufferToFile;
+    constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim;
    constexpr bool op_support_indices =
        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
         ReduceOpId == ReduceTensorOp::AMAX);
@@ -279,28 +286,32 @@ bool profile_reduce_impl_impl(bool do_verification,
            reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
                static_cast<int32_t>(reduce_total_length));
-        using DeviceReduceInstPtr0 =
+        using DeviceReduceInstPtr =
-            DeviceReducePtr<InElementwiseOperation, AccElementwiseOperation>;
+            DeviceReducePtr<Rank, NumReduceDim, InElementwiseOperation, AccElementwiseOperation>;
-        std::vector<DeviceReduceInstPtr0> reduce0_ptrs;
+        std::vector<DeviceReduceInstPtr> reduce_ptrs;
        add_device_reduce_instance_threadwise<InDataType,
                                              AccDataType,
                                              OutDataType,
                                              Rank,
                                              NumReduceDim,
-                                              ReduceOpId,
+                                              ReduceOperation,
+                                              InElementwiseOperation,
+                                              AccElementwiseOperation,
                                              PropagateNan,
-                                              UseIndex>(reduce0_ptrs);
+                                              UseIndex>(reduce_ptrs);
        add_device_reduce_instance_blockwise<InDataType,
                                             AccDataType,
                                             OutDataType,
                                             Rank,
                                             NumReduceDim,
-                                             ReduceOpId,
+                                             ReduceOperation,
+                                             InElementwiseOperation,
+                                             AccElementwiseOperation,
                                             PropagateNan,
-                                             UseIndex>(reduce0_ptrs);
+                                             UseIndex>(reduce_ptrs);
        if constexpr(use_atomic_add)
        {
@@ -309,12 +320,14 @@ bool profile_reduce_impl_impl(bool do_verification,
                                                             OutDataType,
                                                             Rank,
                                                             NumReduceDim,
-                                                             ReduceOpId,
+                                                             ReduceOperation,
+                                                             InElementwiseOperation,
+                                                             AccElementwiseOperation,
                                                             PropagateNan,
-                                                             UseIndex>(reduce0_ptrs);
+                                                             UseIndex>(reduce_ptrs);
        }
-        if(reduce0_ptrs.empty())
+        if(reduce_ptrs.empty())
        {
            throw std::runtime_error("Wrong! No device REDUCE instance found");
        };
@@ -342,22 +355,22 @@ bool profile_reduce_impl_impl(bool do_verification,
                           acc_elementwise_op);
        };
-        std::vector<ck::index_t> i_inLengths;
+        std::array<index_t, Rank> arrInLengths;
-        std::vector<ck::index_t> i_inStrides;
+        std::array<index_t, Rank> arrInStrides;
-        std::vector<ck::index_t> i_outLengths;
+        std::array<index_t, NumOutDim> arrOutLengths;
-        std::vector<ck::index_t> i_outStrides;
+        std::array<index_t, NumOutDim> arrOutStrides;
-        i_inLengths.assign(inLengths.begin(), inLengths.end());
+        ck::ranges::copy(inLengths, arrInLengths.begin());
-        i_inStrides.assign(inStrides.begin(), inStrides.end());
+        ck::ranges::copy(inStrides, arrInStrides.begin());
-        i_outLengths.assign(outLengths.begin(), outLengths.end());
+        ck::ranges::copy(outLengths, arrOutLengths.begin());
-        i_outStrides.assign(outStrides.begin(), outStrides.end());
+        ck::ranges::copy(outStrides, arrOutStrides.begin());
-        for(auto& reduce_ptr : reduce0_ptrs)
+        for(auto& reduce_ptr : reduce_ptrs)
        {
-            auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
+            auto argument_ptr = reduce_ptr->MakeArgumentPointer(arrInLengths,
-                                                                i_inStrides,
+                                                                arrInStrides,
-                                                                i_outLengths,
+                                                                arrOutLengths,
-                                                                i_outStrides,
+                                                                arrOutStrides,
                                                                reduceDims,
                                                                alpha,
                                                                beta,
@@ -399,13 +412,12 @@ bool profile_reduce_impl_impl(bool do_verification,
                bool single_pass;
                out_dev.FromDevice(out.mData.data());
-                single_pass = ck::utils::check_err(out.mData, out_ref.mData);
+                single_pass = ck::utils::check_err(out, out_ref);
                if(OutputIndex)
                {
                    out_indices_dev.FromDevice(out_indices.mData.data());
-                    single_pass = single_pass &&
+                    single_pass = single_pass && ck::utils::check_err(out_indices, out_indices_ref);
-                                  ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
                };
                if(!single_pass)
@@ -478,22 +490,25 @@ bool profile_reduce_impl(bool do_verification,
               descType{}, inLengths.size(), reduceDims, ReduceOpId, PropagateNan, UseIndex))
            return;
-        pass = pass &&
+        std::array<ck::index_t, descType::NumReduceDim_> arrReduceDims;
-               profile_reduce_impl_impl<InDataType,
-                                        AccDataType,
+        ck::ranges::copy(reduceDims, arrReduceDims.begin());
-                                        OutDataType,
-                                        descType::Rank_,
+        pass = pass && profile_reduce_impl_impl<InDataType,
-                                        descType::NumReduceDim_,
+                                                AccDataType,
-                                        static_cast<ReduceTensorOp>(descType::ReduceOpId_),
+                                                OutDataType,
-                                        static_cast<bool>(descType::PropagateNan_),
+                                                descType::Rank_,
-                                        static_cast<bool>(descType::UseIndex_)>(do_verification,
+                                                descType::NumReduceDim_,
-                                                                                init_method,
+                                                static_cast<ReduceTensorOp>(descType::ReduceOpId_),
-                                                                                do_dumpout,
+                                                descType::PropagateNan_,
-                                                                                time_kernel,
+                                                descType::UseIndex_>(do_verification,
-                                                                                inLengths,
+                                                                     init_method,
-                                                                                reduceDims,
+                                                                     do_dumpout,
-                                                                                alpha,
+                                                                     time_kernel,
-                                                                                beta);
+                                                                     inLengths,
+                                                                     arrReduceDims,
+                                                                     alpha,
+                                                                     beta);
        matched = true;
    });

--- a/profiler/include/profile_softmax_impl.hpp
+++ b/profiler/include/profile_softmax_impl.hpp
@@ -3,55 +3,27 @@
 #pragma once
+#include <algorithm>
 #include <iomanip>
+#include <iostream>
+#include <string>
+#include <vector>
 #include "ck/ck.hpp"
 #include "ck/library/utility/check_err.hpp"
-#include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/fill.hpp"
 #include "ck/library/utility/host_tensor.hpp"
-#include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
+#include "ck/library/tensor_operation_instance/gpu/softmax.hpp"
 #include "ck/tensor_operation/gpu/device/device_softmax.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/utility/data_type.hpp"
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-namespace {
-using F16         = ck::half_t;
-using F32         = float;
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-} // namespace
-void add_device_softmax_f16_f16_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>&);
-void add_device_softmax_f16_f16_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>&);
-void add_device_softmax_f32_f32_rank3_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>&);
-void add_device_softmax_f32_f32_rank4_instances(
-    std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>&);
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
 namespace ck {
 namespace profiler {
-enum struct NormType
+enum struct SoftmaxDataType
-{
-    BATCHNORM,
-    SOFTMAX,
-};
-enum struct NormDataType
 {
    F32_F32, // in, out
    F16_F16,
@@ -60,7 +32,7 @@ enum struct NormDataType
 };
 // type_to_string<T>() returns a short lowercase name for the data type T.
 // Visible here: the primary template declaration (no definition) and the explicit
 // specializations for float ("f32"), half_t ("f16") and bhalf_t ("bf16"); more
 // specializations follow outside this hunk.
 // The template parameter name (NormDataType before, SoftmaxDataType after) is only a
 // type-parameter identifier; it happens to spell the same as the enum declared above.
 // NOTE(review): these explicit specializations are defined in a header without `inline`.
 // An explicit specialization of a function template is not implicitly inline, so including
 // this header from more than one translation unit may cause multiple-definition link
 // errors -- confirm, and consider marking them `inline`.
 // clang-format off
-template <typename NormDataType> std::string type_to_string();
+template <typename SoftmaxDataType> std::string type_to_string();
 template <> std::string type_to_string<float>()   { return "f32"; }
 template <> std::string type_to_string<half_t>()  { return "f16"; }
 template <> std::string type_to_string<bhalf_t>() { return "bf16"; }
@@ -69,7 +41,7 @@ template <> std::string type_to_string<int32_t>() { return "int32"; }
 // clang-format on
 template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank>
-void profile_softmax_impl(int do_verification,
+bool profile_softmax_impl(int do_verification,
                          int init_method,
                          bool do_log,
                          bool time_kernel,
@@ -77,8 +49,7 @@ void profile_softmax_impl(int do_verification,
                          std::vector<index_t> in_strides,
                          std::vector<index_t> reduce_dims,
                          AccDataType alpha,
-                          AccDataType beta,
+                          AccDataType beta)
-                          NormType norm_type)
 {
    if(Rank != in_length.size())
    {
@@ -88,62 +59,46 @@ void profile_softmax_impl(int do_verification,
    Tensor<InDataType> in = in_strides.empty() ? Tensor<InDataType>(in_length)
                                               : Tensor<InDataType>(in_length, in_strides);
    Tensor<OutDataType> out(in.mDesc);
+    Tensor<OutDataType> prior_out(in.mDesc);
    switch(init_method)
    {
-    // case 0: break;
+    case 0: break;
-    case 0:
-        in.GenerateTensorValue(GeneratorTensor_1<InDataType>{});
-        out.GenerateTensorValue(GeneratorTensor_1<OutDataType>{});
-        break;
    case 1:
-        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
+        ck::utils::FillUniformDistributionIntegerValue<InDataType>{-5.f, 5.f}(in.begin(), in.end());
-        out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        ck::utils::FillUniformDistributionIntegerValue<OutDataType>{-5.f, 5.f}(prior_out.begin(),
+                                                                               prior_out.end());
        break;
    default:
-        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
+        ck::utils::FillUniformDistribution<InDataType>{0.0f, 1.0f}(in);
-        out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
+        ck::utils::FillUniformDistribution<OutDataType>{-0.5f, 0.5f}(prior_out);
    }
-    Tensor<OutDataType> out_ref(out);
+    Tensor<OutDataType> out_ref(prior_out);
+    if(do_verification)
+    {
+        using ReferenceSoftmax =
+            tensor_operation::host::ReferenceSoftmax<InDataType, OutDataType, AccDataType>;
+        ReferenceSoftmax{}.MakeInvoker().Run({in, out_ref, alpha, beta, reduce_dims});
+    }
-    DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
+    DeviceMem in_dev(in.GetElementSpaceSizeInBytes());
-    DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
+    DeviceMem out_dev(out.GetElementSpaceSizeInBytes());
-    in_dev.ToDevice(in.mData.data());
+    in_dev.ToDevice(in.data());
-    out_dev.ToDevice(out.mData.data());
-    std::vector<index_t> i_in_lengths(in.mDesc.GetLengths().begin(), in.mDesc.GetLengths().end());
+    std::vector<index_t> in_tensor_lengths(in.GetLengths().begin(), in.GetLengths().end());
-    std::vector<index_t> i_in_strides(in.mDesc.GetStrides().begin(), in.mDesc.GetStrides().end());
+    std::vector<index_t> in_tensor_strides(in.GetStrides().begin(), in.GetStrides().end());
    // add device softmax instances
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-    using DeviceOpPtr = tensor_operation::device::
+    using DeviceOp    = tensor_operation::device::
-        DeviceSoftmaxPtr<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
+        DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
-    std::vector<DeviceOpPtr> instances;
-    if(norm_type == NormType::SOFTMAX)
+    // get device op instances
-    {
+    const auto instances = tensor_operation::device::instance::DeviceOperationInstanceFactory<
-        if constexpr(is_same<InDataType, half_t>::value && is_same<OutDataType, half_t>::value &&
+        DeviceOp>::GetInstances();
-                     is_same<AccDataType, float>::value)
+    std::cout << "found " << instances.size() << " instances" << std::endl;
-        {
-            if constexpr(Rank == 3)
-                tensor_operation::device::instance::add_device_softmax_f16_f16_rank3_instances(
-                    instances);
-            else if constexpr(Rank == 4)
-                tensor_operation::device::instance::add_device_softmax_f16_f16_rank4_instances(
-                    instances);
-        }
-        else if constexpr(is_same<InDataType, float>::value && is_same<OutDataType, float>::value &&
-                          is_same<AccDataType, float>::value)
-        {
-            if constexpr(Rank == 3)
-                tensor_operation::device::instance::add_device_softmax_f32_f32_rank3_instances(
-                    instances);
-            else if constexpr(Rank == 4)
-                tensor_operation::device::instance::add_device_softmax_f32_f32_rank4_instances(
-                    instances);
-        }
-    }
    if(instances.size() <= 0)
    {
@@ -153,21 +108,19 @@ void profile_softmax_impl(int do_verification,
    std::string best_instance_name;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;
+    std::vector<bool> instance_pass;
-    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    for(auto& inst_ptr : instances)
    {
        // Is it the user's responsibility to check whether the problem mismatches the kernel
        // instance (i.e. a rank 3 problem passed to a rank 4 kernel) by some means other than
        // invoking IsSupportedArgument()?
-        if(!(inst_ptr->GetRank() == static_cast<index_t>(i_in_lengths.size()) &&
+        if(!(inst_ptr->GetNumReduceDim() == static_cast<index_t>(reduce_dims.size())))
-             inst_ptr->GetNumReduceDim() == static_cast<index_t>(reduce_dims.size())))
        {
            continue;
        }
-        auto argument_ptr = inst_ptr->MakeArgumentPointer(i_in_lengths,
+        auto argument_ptr = inst_ptr->MakeArgumentPointer(in_tensor_lengths,
-                                                          i_in_strides,
+                                                          in_tensor_strides,
                                                          reduce_dims,
                                                          &alpha,
                                                          &beta,
@@ -181,45 +134,42 @@ void profile_softmax_impl(int do_verification,
            std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
            LogRange(std::cout << "input lengths = [", in_length, ", ")
                << "], "
-                << "scaler = [" << alpha << ", " << beta << "]." << std::endl;
+                << "scaler = [" << alpha << ", " << beta << "]";
-            return;
+            LogRange(std::cout << ", reduce dims = [", reduce_dims, ", ") << "]." << std::endl;
+            instance_pass.push_back(true);
+            continue;
        }
+        out_dev.ToDevice(prior_out.data());
        auto invoker_ptr = inst_ptr->MakeInvokerPointer();
+        float avg_time   = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
-        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+        if(time_kernel)
+        {
-        std::size_t num_bytes =
+            std::size_t num_bytes =
-            in.mDesc.GetElementSize() * sizeof(InDataType) +
+                in.GetElementSize() * sizeof(InDataType) +
-            (beta == 0.0f ? 1 : 2) * out.mDesc.GetElementSize() * sizeof(OutDataType);
+                (beta == 0.0f ? 1 : 2) * out.GetElementSize() * sizeof(OutDataType);
+            float gb_per_sec = num_bytes / 1.E6 / avg_time;
-        float gb_per_sec = num_bytes / 1.E6 / avg_time;
-        std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
-                  << inst_ptr->GetTypeString() << std::endl;
+                      << inst_ptr->GetTypeString() << std::endl;
-        if(avg_time < best_avg_time)
+            if(avg_time < best_avg_time)
-        {
+            {
-            best_instance_name = inst_ptr->GetTypeString();
+                best_instance_name = inst_ptr->GetTypeString();
-            best_avg_time      = avg_time;
+                best_avg_time      = avg_time;
-            best_gb_per_sec    = gb_per_sec;
+                best_gb_per_sec    = gb_per_sec;
+            }
        }
        if(do_verification)
        {
-            // TODO: factory method to dynamically switch between different reference normalizations
+            out_dev.FromDevice(out.data());
-            using ReferenceFactory =
+            bool pass = true;
-                tensor_operation::host::ReferenceSoftmax<InDataType, OutDataType, AccDataType>;
-            ReferenceFactory{}.MakeInvoker().Run({in, out_ref, alpha, beta, reduce_dims});
-            out_dev.FromDevice(out.mData.data());
-            bool pass;
            if(std::is_same<InDataType, int8_t>::value)
            {
-                pass = ck::utils::check_err(
+                pass = pass && ck::utils::check_err(
-                    out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1);
+                                   out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1);
                if(do_log)
                {
                    LogRangeAsType<int>(std::cout << "in  : ", in.mData, ",") << std::endl;
@@ -230,7 +180,7 @@ void profile_softmax_impl(int do_verification,
            }
            else
            {
-                pass = ck::utils::check_err(out.mData, out_ref.mData);
+                pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
                if(do_log)
                {
                    LogRangeAsType<float>(std::cout << "in  : ", in.mData, ",") << std::endl;
@@ -247,16 +197,22 @@ void profile_softmax_impl(int do_verification,
                    << "], "
                    << "scaler = [" << alpha << ", " << beta << "]." << std::endl;
            }
+            instance_pass.push_back(pass);
        }
    }
-    std::cout << "Best Perf for datatype = " << type_to_string<InDataType>() << "_"
+    if(time_kernel)
-              << type_to_string<OutDataType>() << ", ";
+    {
-    LogRange(std::cout << "length = ", i_in_lengths, ",") << ", ";
+        std::cout << "Best Perf for datatype = " << type_to_string<InDataType>() << "_"
-    LogRange(std::cout << "stride = ", i_in_strides, ",") << ", ";
+                  << type_to_string<OutDataType>() << ", ";
-    LogRange(std::cout << "reduce dims ", reduce_dims, ",") << ", ";
+        LogRange(std::cout << "length = ", in_tensor_lengths, ",") << ", ";
-    std::cout << "alpha = " << alpha << ", "
+        LogRange(std::cout << "stride = ", in_tensor_strides, ",") << ", ";
-              << "beta = " << beta << ", " << best_avg_time << " ms, " << best_gb_per_sec
+        LogRange(std::cout << "reduce dims ", reduce_dims, ",") << ", ";
-              << " GB/s, " << best_instance_name << std::endl;
+        std::cout << "alpha = " << alpha << ", "
+                  << "beta = " << beta << ", " << best_avg_time << " ms, " << best_gb_per_sec
+                  << " GB/s, " << best_instance_name << std::endl;
+    }
+    return std::all_of(
+        std::begin(instance_pass), std::end(instance_pass), [](bool p) { return p; });
 }
 } // namespace profiler

--- a/profiler/src/profile_conv_bwd_weight.cpp
+++ b/profiler/src/profile_conv_bwd_weight.cpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstdlib>
+#include <initializer_list>
 #include <iostream>
 #include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include "profiler/include/profile_conv_bwd_weight_impl.hpp"
+#include "profiler/include/profile_grouped_conv_bwd_weight_impl.hpp"
 namespace {
 // Tensor-layout selector for this profiler entry point. Each enumerator names the
 // input_weight_output layout triple; the trailing comment gives its numeric value.
 // The values line up with the "arg3: tensor layout (0: ..., 1: ...)" lines of the help
 // text printed by print_helper_msg(); presumably arg3 is cast to this enum -- the parsing
 // code is outside this view, TODO confirm.
 enum struct ConvLayout
 {
-    NCHW_KCYX_NKHW, // 0
+    GNCHW_GKCYX_GNKHW, // 0
-    NHWC_KYXC_NHWK, // 1
+    GNHWC_GKYXC_GNHWK, // 1
 };
 enum struct ConvDataType
@@ -25,24 +25,25 @@ enum struct ConvDataType
 // Writes the command-line usage text to std::cout as one chained stream expression:
 //   arg1 (operation name), arg2 (data-type combination), arg3 (tensor layout),
 //   arg4..arg7 (verification, initialization, tensor printing, kernel timing),
 // followed by the text returned by ck::utils::conv::get_conv_param_parser_helper_msg()
 // and the literal " SplitK". The final std::endl also flushes the stream.
 // The layout lines added in this change list a leading G dimension and are split across
 // adjacent string literals, which the compiler concatenates.
 // NOTE(review): the parentheses opened on the "arg1" and "arg3" lines are never closed in
 // the emitted text.
 static void print_helper_msg()
 {
-    std::cout
+    std::cout << "arg1: tensor operation (conv_bwd_weight: Convolution Backward Weight\n"
-        << "arg1: tensor operation (conv_bwd_weight: Convolution Backward Weight\n"
+              << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
-        << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
+              << "                 1: Input fp16, Weight fp16, Output fp16\n"
-        << "                 1: Input fp16, Weight fp16, Output fp16\n"
+              << "                 2: Input bf16, Weight fp32, Output bf16)\n"
-        << "                 2: Input bf16, Weight fp32, Output bf16)\n"
+              << "arg3: tensor layout (0: Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, "
-        << "arg3: tensor layout (0: Input[N, C, Hi, Wi], Weight[K, C, Y, X], Output[N, K, Ho, Wo]\n"
+                 "N, K, Ho, Wo]\n"
-        << "                     1: Input[N, Hi, Wi, C], Weight[K, Y, X, C], Output[N, Ho, Wo, K]\n"
+              << "                     1: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, "
-        << "arg4: verification (0: no, 1: yes)\n"
+                 "N, Ho, Wo, K]\n"
-        << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
+              << "arg4: verification (0: no, 1: yes)\n"
-        << "arg6: print tensor value (0: no; 1: yes)\n"
+              << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
-        << "arg7: time kernel (0: no, 1: yes)\n"
+              << "arg6: print tensor value (0: no; 1: yes)\n"
-        << ck::utils::conv::get_conv_param_parser_helper_msg() << " SplitK\n"
+              << "arg7: time kernel (0: no, 1: yes)\n"
-        << std::endl;
+              << ck::utils::conv::get_conv_param_parser_helper_msg() << " SplitK\n"
+              << std::endl;
 }
 } // namespace
-int profile_conv_bwd_weight(int argc, char* argv[])
+int profile_grouped_conv_bwd_weight(int argc, char* argv[])
 {
    // 8 for control, 1 for num_dim_spatial
    if(argc < 9)
@@ -75,17 +76,17 @@ int profile_conv_bwd_weight(int argc, char* argv[])
    using F16  = ck::half_t;
    using BF16 = ck::bhalf_t;
-    using NWC   = ck::tensor_layout::convolution::NWC;
+    using GNWC   = ck::tensor_layout::convolution::GNWC;
-    using NHWC  = ck::tensor_layout::convolution::NHWC;
+    using GNHWC  = ck::tensor_layout::convolution::GNHWC;
-    using NDHWC = ck::tensor_layout::convolution::NDHWC;
+    using GNDHWC = ck::tensor_layout::convolution::GNDHWC;
-    using KXC   = ck::tensor_layout::convolution::KXC;
+    using GKXC   = ck::tensor_layout::convolution::GKXC;
-    using KYXC  = ck::tensor_layout::convolution::KYXC;
+    using GKYXC  = ck::tensor_layout::convolution::GKYXC;
-    using KZYXC = ck::tensor_layout::convolution::KZYXC;
+    using GKZYXC = ck::tensor_layout::convolution::GKZYXC;
-    using NWK   = ck::tensor_layout::convolution::NWK;
+    using GNWK   = ck::tensor_layout::convolution::GNWK;
-    using NHWK  = ck::tensor_layout::convolution::NHWK;
+    using GNHWK  = ck::tensor_layout::convolution::GNHWK;
-    using NDHWK = ck::tensor_layout::convolution::NDHWK;
+    using GNDHWK = ck::tensor_layout::convolution::GNDHWK;
    constexpr auto I1 = ck::Number<1>{};
    constexpr auto I2 = ck::Number<2>{};
@@ -108,64 +109,64 @@ int profile_conv_bwd_weight(int argc, char* argv[])
        using WeiDataType = decltype(wei_type);
        using OutDataType = decltype(out_type);
-        bool pass = ck::profiler::profile_conv_bwd_weight_impl<NDimSpatial,
+        bool pass = ck::profiler::profile_grouped_conv_bwd_weight_impl<NDimSpatial,
-                                                               InLayout,
+                                                                       InLayout,
-                                                               WeiLayout,
+                                                                       WeiLayout,
-                                                               OutLayout,
+                                                                       OutLayout,
-                                                               InDataType,
+                                                                       InDataType,
-                                                               WeiDataType,
+                                                                       WeiDataType,
-                                                               OutDataType>(
+                                                                       OutDataType>(
            do_verification, init_method, do_log, time_kernel, params, split_k);
        return pass ? 0 : 1;
    };
-    if(num_dim_spatial == 1 && layout == ConvLayout::NHWC_KYXC_NHWK)
+    if(num_dim_spatial == 1 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
    {
        if(data_type == ConvDataType::F32_F32_F32)
        {
-            return profile(I1, NWC{}, KXC{}, NWK{}, F32{}, F32{}, F32{});
+            return profile(I1, GNWC{}, GKXC{}, GNWK{}, F32{}, F32{}, F32{});
        }
        else if(data_type == ConvDataType::F16_F16_F16)
        {
-            return profile(I1, NWC{}, KXC{}, NWK{}, F16{}, F16{}, F16{});
+            return profile(I1, GNWC{}, GKXC{}, GNWK{}, F16{}, F16{}, F16{});
        }
        else if(data_type == ConvDataType::BF16_F32_BF16)
        {
            // fp32 atomic add is used for weight tensor in bf16 kernel
-            return profile(I1, NWC{}, KXC{}, NWK{}, BF16{}, F32{}, BF16{});
+            return profile(I1, GNWC{}, GKXC{}, GNWK{}, BF16{}, F32{}, BF16{});
        }
    }
-    else if(num_dim_spatial == 2 && layout == ConvLayout::NHWC_KYXC_NHWK)
+    else if(num_dim_spatial == 2 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
    {
        if(data_type == ConvDataType::F32_F32_F32)
        {
-            return profile(I2, NHWC{}, KYXC{}, NHWK{}, F32{}, F32{}, F32{});
+            return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F32{}, F32{}, F32{});
        }
        else if(data_type == ConvDataType::F16_F16_F16)
        {
-            return profile(I2, NHWC{}, KYXC{}, NHWK{}, F16{}, F16{}, F16{});
+            return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F16{}, F16{}, F16{});
        }
        else if(data_type == ConvDataType::BF16_F32_BF16)
        {
            // fp32 atomic add is used for weight tensor in bf16 kernel
-            return profile(I2, NHWC{}, KYXC{}, NHWK{}, BF16{}, F32{}, BF16{});
+            return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, BF16{}, F32{}, BF16{});
        }
    }
-    else if(num_dim_spatial == 3 && layout == ConvLayout::NHWC_KYXC_NHWK)
+    else if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
    {
        if(data_type == ConvDataType::F32_F32_F32)
        {
-            return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F32{}, F32{}, F32{});
+            return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F32{}, F32{}, F32{});
        }
        else if(data_type == ConvDataType::F16_F16_F16)
        {
-            return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F16{}, F16{}, F16{});
+            return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F16{}, F16{}, F16{});
        }
        else if(data_type == ConvDataType::BF16_F32_BF16)
        {
            // fp32 atomic add is used for weight tensor in bf16 kernel
-            return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, BF16{}, F32{}, BF16{});
+            return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, F32{}, BF16{});
        }
    }

--- a/profiler/src/profile_softmax.cpp
+++ b/profiler/src/profile_softmax.cpp
@@ -8,14 +8,10 @@
 #include "profiler/include/profile_softmax_impl.hpp"
 using ck::index_t;
-using ck::profiler::NormDataType;
+using ck::profiler::SoftmaxDataType;
-using ck::profiler::NormType;
 struct ArgParser
 {
-    std::unordered_map<std::string, NormType> norm_dict = {{"batchnorm", NormType::BATCHNORM},
-                                                           {"softmax", NormType::SOFTMAX}};
    std::unordered_map<std::string, std::vector<int>> long_opts = {
        {"length", {}}, {"stride", {}}, {"reduce", {}}, {"alpha", {}}, {"beta", {}}};
@@ -50,7 +46,7 @@ struct ArgParser
 void print_help()
 {
-    std::cout << "arg1: tensor operation (batchnorm/softmax)\n"
+    std::cout << "arg1: tensor operation (softmax)\n"
              << "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n"
              << "arg3: verification (0: no; 1: yes)\n"
              << "arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n"
@@ -64,7 +60,7 @@ void print_help()
              << std::endl;
 }
-int profile_normalization(int argc, char* argv[])
+int profile_softmax(int argc, char* argv[])
 {
    if(argc <= 2)
    {
@@ -75,12 +71,11 @@ int profile_normalization(int argc, char* argv[])
    ArgParser arg_parser;
    // short unnamed options
-    const NormType norm_type     = arg_parser.norm_dict[argv[1]];
+    const SoftmaxDataType data_type = static_cast<SoftmaxDataType>(std::stoi(argv[2]));
-    const NormDataType data_type = static_cast<NormDataType>(std::stoi(argv[2]));
+    const bool do_verification      = std::stoi(argv[3]);
-    const bool do_verification   = std::stoi(argv[3]);
+    const int init_method           = std::stoi(argv[4]);
-    const int init_method        = std::stoi(argv[4]);
+    const bool do_log               = std::stoi(argv[5]);
-    const bool do_log            = std::stoi(argv[5]);
+    const bool time_kernel          = std::stoi(argv[6]);
-    const bool time_kernel       = std::stoi(argv[6]);
    // parse the long options
    arg_parser(argc, argv);
@@ -91,9 +86,10 @@ int profile_normalization(int argc, char* argv[])
        arg_parser.long_opts["alpha"].empty() ? 1 : arg_parser.long_opts["alpha"][0];
    const index_t beta = arg_parser.long_opts["beta"].empty() ? 0 : arg_parser.long_opts["beta"][0];
+    // Rank 3
    if(length.size() == 3)
    {
-        if(data_type == NormDataType::F16_F16)
+        if(data_type == SoftmaxDataType::F16_F16)
        {
            ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3>(do_verification,
                                                                                 init_method,
@@ -103,10 +99,9 @@ int profile_normalization(int argc, char* argv[])
                                                                                 stride,
                                                                                 reduce,
                                                                                 float(alpha),
-                                                                                 float(beta),
+                                                                                 float(beta));
-                                                                                 norm_type);
        }
-        else if(data_type == NormDataType::F32_F32)
+        else if(data_type == SoftmaxDataType::F32_F32)
        {
            ck::profiler::profile_softmax_impl<float, float, float, 3>(do_verification,
                                                                       init_method,
@@ -116,17 +111,17 @@ int profile_normalization(int argc, char* argv[])
                                                                       stride,
                                                                       reduce,
                                                                       float(alpha),
-                                                                       float(beta),
+                                                                       float(beta));
-                                                                       norm_type);
        }
        else
        {
            throw std::runtime_error("not implemented yet");
        }
    }
+    // Rank 4
    else if(length.size() == 4)
    {
-        if(data_type == NormDataType::F16_F16)
+        if(data_type == SoftmaxDataType::F16_F16)
        {
            ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4>(do_verification,
                                                                                 init_method,
@@ -136,10 +131,9 @@ int profile_normalization(int argc, char* argv[])
                                                                                 stride,
                                                                                 reduce,
                                                                                 float(alpha),
-                                                                                 float(beta),
+                                                                                 float(beta));
-                                                                                 norm_type);
        }
-        else if(data_type == NormDataType::F32_F32)
+        else if(data_type == SoftmaxDataType::F32_F32)
        {
            ck::profiler::profile_softmax_impl<float, float, float, 4>(do_verification,
                                                                       init_method,
@@ -149,8 +143,7 @@ int profile_normalization(int argc, char* argv[])
                                                                       stride,
                                                                       reduce,
                                                                       float(alpha),
-                                                                       float(beta),
+                                                                       float(beta));
-                                                                       norm_type);
        }
        else
        {

--- a/profiler/src/profiler.cpp
+++ b/profiler/src/profiler.cpp
@@ -18,9 +18,9 @@ int profile_conv_fwd(int, char*[]);
 int profile_conv_fwd_bias_relu(int, char*[]);
 int profile_conv_fwd_bias_relu_add(int, char*[]);
 int profile_conv_bwd_data(int, char*[]);
-int profile_conv_bwd_weight(int, char*[]);
 int profile_grouped_conv_fwd(int, char*[]);
-int profile_normalization(int, char*[]);
+int profile_grouped_conv_bwd_weight(int, char*[]);
+int profile_softmax(int, char*[]);
 int profile_layernorm(int, char*[]);
 int profile_groupnorm(int, char*[]);
 int profile_reduce(int, char*[]);
@@ -43,8 +43,9 @@ static void print_helper_message()
           "                        conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n"
           "                        conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n"
           "                        conv_bwd_data: Convolution Backward Data\n"
-           "                        conv_bwd_weight: Convolution Backward Weight\n"
           "                        grouped_conv_fwd: Grouped Convolution Forward\n"
+           "                        grouped_conv_bwd_weight: Grouped Convolution Backward Weight\n"
+           "                        softmax: Softmax\n"
           "                        reduce: Reduce\n");
    // clang-format on
 }
@@ -117,21 +118,21 @@ int main(int argc, char* argv[])
    {
        return profile_conv_bwd_data(argc, argv);
    }
-    else if(strcmp(argv[1], "conv_bwd_weight") == 0)
-    {
-        return profile_conv_bwd_weight(argc, argv);
-    }
    else if(strcmp(argv[1], "grouped_conv_fwd") == 0)
    {
        return profile_grouped_conv_fwd(argc, argv);
    }
+    else if(strcmp(argv[1], "grouped_conv_bwd_weight") == 0 || strcmp(argv[1], "conv_bwd_weight") == 0)
+    { // "grouped_conv_bwd_weight" is the name the help text advertises; "conv_bwd_weight" stays as a legacy alias
+        return profile_grouped_conv_bwd_weight(argc, argv);
+    }
    else if(strcmp(argv[1], "reduce") == 0)
    {
        return profile_reduce(argc, argv);
    }
-    else if(strcmp(argv[1], "batchnorm") == 0 || strcmp(argv[1], "softmax") == 0)
+    else if(strcmp(argv[1], "softmax") == 0)
    {
-        return profile_normalization(argc, argv);
+        return profile_softmax(argc, argv);
    }
    else if(strcmp(argv[1], "layernorm") == 0)
    {

--- a/script/cmake-ck-dev.sh
+++ b/script/cmake-ck-dev.sh
@@ -11,7 +11,7 @@ cmake
 -D CMAKE_CXX_FLAGS="-O3 -ftemplate-backtrace-limit=0 -gline-tables-only -save-temps=$PWD"         \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=ON                                                                                   \
-D GPU_TARGETS=gfx908;gfx90a                                                                      \
+-D GPU_TARGETS="gfx908;gfx90a"                                                                    \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
 ${MY_PROJECT_SOURCE}

--- a/script/cmake-ck-release.sh
+++ b/script/cmake-ck-release.sh
@@ -11,7 +11,7 @@ cmake
 -D CMAKE_CXX_FLAGS="-O3"                                                                          \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=OFF                                                                                  \
-D GPU_TARGETS=gfx908;gfx90a                                                                      \
+-D GPU_TARGETS="gfx908;gfx90a"                                                                      \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
 ${MY_PROJECT_SOURCE}