Commit ea5be216 authored by Jun Liu

Merge branch 'develop' into amd-develop

parents e2eb0418 25935b57
# ONLY XDL_KERNELS
set(GROUPED_CONV3D_FWD_CONVSCALE_RELU
    xdl/device_grouped_conv3d_fwd_xdl_convscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_instance.cpp
    xdl/device_grouped_conv3d_fwd_xdl_combconvscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instance.cpp)
add_instance_library(device_grouped_conv3d_fwd_convscale_relu_instance ${GROUPED_CONV3D_FWD_CONVSCALE_RELU})
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_outelementop_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale_relu.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_grouped_conv3d_fwd_xdl_combconvscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
NDHWGC,
GKZYXC,
ck::Tuple<>,
NDHWGK,
F8,
F8,
ck::Tuple<>,
F32,
PassThrough,
PassThrough,
CombConvScaleRelu,
F8,
F8>>>& instances)
{
add_device_operation_instances(
instances,
device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3,
NDHWGC,
GKZYXC,
ck::Tuple<>,
NDHWGK,
ConvFwdDefault,
CombConvScaleRelu>{});
add_device_operation_instances(
instances,
device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3,
NDHWGC,
GKZYXC,
ck::Tuple<>,
NDHWGK,
ConvFwd1x1P0,
CombConvScaleRelu>{});
add_device_operation_instances(
instances,
device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3,
NDHWGC,
GKZYXC,
ck::Tuple<>,
NDHWGK,
ConvFwd1x1S1P0,
CombConvScaleRelu>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
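For orientation, the instance list populated above is consumed through CK's operation factory, in the same way the profiler changes further down do. A minimal caller sketch (hypothetical usage, assuming a factory specialization for this operation signature is declared in grouped_convolution_forward_convscale_relu.hpp):

// Sketch only: enumerate the registered f8 x f8 -> f32 convscale+relu instances.
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
    3, NDHWGC, GKZYXC, ck::Tuple<>, NDHWGK, F8, F8, ck::Tuple<>, F32,
    PassThrough, PassThrough, CombConvScaleRelu, F8, F8>;

const auto op_ptrs = ck::tensor_operation::device::instance::
    DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

for(const auto& op_ptr : op_ptrs)
    std::cout << op_ptr->GetTypeString() << std::endl; // needs <iostream>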
......@@ -3,15 +3,13 @@
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_outelementop_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale_relu.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using ConvScaleRelu = ck::tensor_operation::element_wise::ConvScaleRelu;
void add_device_grouped_conv3d_fwd_xdl_convscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
NDHWGC,
......@@ -56,7 +54,6 @@ void add_device_grouped_conv3d_fwd_xdl_convscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_in
ConvFwd1x1S1P0,
ConvScaleRelu>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
......
add_instance_library(device_permute_scale_instance
device_permute_scale_1d_fp16_instances.cpp
device_permute_scale_2d_fp16_instances.cpp
device_permute_scale_3d_fp16_instances.cpp
......@@ -10,4 +10,5 @@ add_instance_library(device_permute_scale_instance
device_permute_scale_3d_fp32_instances.cpp
device_permute_scale_4d_fp32_instances.cpp
device_permute_scale_5d_fp32_instances.cpp
device_permute_scale_6d_fp32_instances.cpp
device_permute_scale_6d_fp32_fp8_instances.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using Scale = element_wise::Scale;
void add_device_permute_scale_6d_f32_f8_instances(
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F8>, Scale, 6>>>&
instances)
{
#ifdef CK_ENABLE_FP8
add_device_operation_instances(instances, device_permute_scale_f32_f8_instances<6, Scale>{});
#else
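// FP8 support compiled out: there are no instances to add, and assigning the
// argument to ck::ignore suppresses the unused-parameter warning.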
ignore = instances;
#endif
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
......@@ -10,15 +10,24 @@ namespace device {
namespace instance {
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F32, F32, F32, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F32, F32, F32, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 6, 6, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector<DeviceReducePtr<F32, F32, F32, 6, 6, ReduceAMax, UnaryAbs, PassThrough, true, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 5, 5, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector<DeviceReducePtr<F32, F32, F32, 5, 5, ReduceAMax, UnaryAbs, PassThrough, true, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, true, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 6, 3, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector<DeviceReducePtr<F32, F32, F32, 6, 3, ReduceAMax, UnaryAbs, PassThrough, true, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 5, 3, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector<DeviceReducePtr<F32, F32, F32, 5, 3, ReduceAMax, UnaryAbs, PassThrough, true, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, true, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 3, 3, ReduceAMax, PassThrough, PassThrough, true, false>(std::vector<DeviceReducePtr<F32, F32, F32, 3, 3, ReduceAMax, PassThrough, PassThrough, true, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 2, 2, ReduceAMax, PassThrough, PassThrough, true, false>(std::vector<DeviceReducePtr<F32, F32, F32, 2, 2, ReduceAMax, PassThrough, PassThrough, true, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 1, 1, ReduceAMax, PassThrough, PassThrough, true, false>(std::vector<DeviceReducePtr<F32, F32, F32, 1, 1, ReduceAMax, PassThrough, PassThrough, true, false>>&);
// clang-format on
} // namespace instance
......
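Each explicit instantiation above pairs with a declaration in the blockwise reduce instance headers; a caller-side sketch for one of the newly added rank-6 AMAX cases (hypothetical usage, assuming that declaration is in scope):

// Sketch only: collect the rank-6, 6-reduce-dim AMAX instances
// (PropagateNan = true, UseIndex = false) introduced by this commit.
std::vector<DeviceReducePtr<F32, F32, F32, 6, 6, ReduceAMax, UnaryAbs, PassThrough, true, false>>
    reduce_ptrs;

ck::tensor_operation::device::instance::add_device_reduce_instance_blockwise<
    F32, F32, F32, 6, 6, ReduceAMax, UnaryAbs, PassThrough, true, false>(reduce_ptrs);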
......@@ -48,6 +48,7 @@ bool profile_gemm_multiply_multiply_impl(int do_verification,
int StrideD0,
int StrideD1,
int StrideE,
int KBatch,
int n_warmup,
int n_iter,
uint64_t rotating = 0)
......@@ -129,17 +130,17 @@ bool profile_gemm_multiply_multiply_impl(int do_verification,
d1_device_buf.ToDevice(d1_m_n.mData.data());
using DeviceOp =
ck::tensor_operation::device::DeviceGemmMultipleDSplitK<ALayout,
BLayout,
ck::Tuple<D0Layout, D1Layout>,
ELayout,
ADataType,
BDataType,
ck::Tuple<D0DataType, D1DataType>,
EDataType,
AElementOp,
BElementOp,
CElementOp>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
......@@ -182,104 +183,128 @@ bool profile_gemm_multiply_multiply_impl(int do_verification,
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
float best_kbatch = 0;
// profile device GEMM instances
for(auto& op_ptr : op_ptrs)
{
    std::vector<int> kbatch_list = {1, 2, 4, 8, 16, 19, 32, 38};

    if(KBatch > 0)
    {
        kbatch_list = {KBatch};
    }

    for(std::size_t i = 0; i < kbatch_list.size(); i++)
    {
        auto kbatch_curr = kbatch_list[i];

        auto argument_ptr = op_ptr->MakeArgumentPointer(
            static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
            static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
            std::array<const void*, 2>{d0_device_buf.GetDeviceBuffer(),
                                       d1_device_buf.GetDeviceBuffer()},
            static_cast<EDataType*>(c_device_buf.GetDeviceBuffer()),
            M,
            N,
            K,
            StrideA,
            StrideB,
            std::array<ck::index_t, 2>{StrideD0, StrideD1},
            StrideE,
            kbatch_curr,
            a_element_op,
            b_element_op,
            c_element_op);

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            // re-init C to zero before profiling next kernel
            c_device_buf.SetZero();

            invoker_ptr->Run(argument_ptr.get(),
                             StreamConfig{nullptr, false, 0, n_warmup, n_iter});

            if(do_verification)
            {
                c_device_buf.FromDevice(e_m_n_device_result.mData.data());

                pass = pass & ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);

                if(do_log)
                {
                    LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
                    LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
                    LogRangeAsType<float>(
                        std::cout << "c_host : ", e_m_n_host_result.mData, ",")
                        << std::endl;
                    LogRangeAsType<float>(
                        std::cout << "c_device: ", e_m_n_device_result.mData, ",")
                        << std::endl;
                }
            }

            std::string op_name = op_ptr->GetTypeString();

            float ave_time = invoker_ptr->Run(argument_ptr.get(),
                                              StreamConfig{nullptr,
                                                           time_kernel,
                                                           0,
                                                           n_warmup,
                                                           n_iter,
                                                           rotating_count > 1,
                                                           rotating_count});

            std::size_t flop = std::size_t(2) * M * N * K;

            std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
                                    sizeof(EDataType) * M * N;

            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

            float gb_per_sec = num_btype / 1.E6 / ave_time;

            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops
                      << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", KBatch "
                      << kbatch_curr << std::endl;

#if defined CK_ENABLE_FP8
            // set softer tolerances for fp8
            if constexpr(is_same_v<ADataType, f8_t> || is_same_v<BDataType, f8_t> ||
                         is_same_v<EDataType, f8_t>)
            {
                std::string msg = "Error: Incorrect results!";
                double rtol     = 1e-1;
                double atol     = 1e-1;
                pass            = pass & ck::utils::check_err(
                                      e_m_n_device_result, e_m_n_host_result, msg, rtol, atol);
            }
            else
            {
#endif
                pass = pass & ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
#if defined CK_ENABLE_FP8
            }
#endif

            if(tflops > best_tflops && ave_time > 1e-10)
            {
                best_op_name    = op_name;
                best_tflops     = tflops;
                best_ave_time   = ave_time;
                best_gb_per_sec = gb_per_sec;
                best_kbatch     = kbatch_curr;
            }
        }
        else
        {
            std::cout << op_ptr->GetTypeString() << " does not support this problem"
                      << std::endl;
        }
    }
}
if constexpr(is_same<EDataType, float>::value)
......@@ -318,9 +343,9 @@ bool profile_gemm_multiply_multiply_impl(int do_verification,
}
std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA
<< " StrideB = " << StrideB << " StrideE = " << StrideE << " : " << best_ave_time
<< " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "
<< best_op_name << std::endl;
<< " StrideB = " << StrideB << " StrideE = " << StrideE << " KBatch = " << best_kbatch
<< " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
<< " GB/s, " << best_op_name << std::endl;
return pass;
}
......
......@@ -152,7 +152,7 @@ bool profile_gemm_universal_impl(int do_verification,
// profile device GEMM instances
for(auto& op_ptr : op_ptrs)
{
std::vector<int> kbatch_list = {1, 2, 4, 8, 16, 19, 32, 38};
if(KBatch > 0)
{
......@@ -249,7 +249,7 @@ bool profile_gemm_universal_impl(int do_verification,
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", KBatch "
<< kbatch_curr << std::endl;
if(tflops > best_tflops && ave_time > 1e-10)
{
best_op_name = op_name;
best_tflops = tflops;
......
......@@ -136,9 +136,10 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
float best_avg_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
ck::index_t best_split_k = 1;
// profile device Conv instances
bool all_pass = true;
......@@ -167,99 +168,111 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
range_copy(conv_param.input_left_pads_, begin(input_left_pads));
range_copy(conv_param.input_right_pads_, begin(input_right_pads));
std::vector<ck::index_t> split_k_list = {1, 2, 4, 8, 16, 32, 64, 128};
if(split_k > 0)
{
split_k_list = {split_k};
}
for(auto& op_ptr : op_ptrs)
{
    for(std::size_t split_k_id = 0; split_k_id < split_k_list.size(); split_k_id++)
    {
        auto argument_ptr = op_ptr->MakeArgumentPointer(
            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
            static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
            input_lengths,
            input_strides,
            filter_lengths,
            weights_strides,
            output_lengths,
            output_strides,
            conv_filter_strides,
            conv_filter_dilations,
            input_left_pads,
            input_right_pads,
            in_element_op,
            wei_element_op,
            out_element_op,
            split_k_list[split_k_id]);

        const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
        DeviceMem workspace_dev(workspace_sz);
        op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            // using atomic add, so need to reset input
            wei_device_buf.SetZero();

            std::string op_name = op_ptr->GetTypeString();

            auto invoker_ptr = op_ptr->MakeInvokerPointer();

            float avg_time =
                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

            std::size_t flop      = conv_param.GetFlops();
            std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();

            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
            float gb_per_sec = num_btype / 1.E6 / avg_time;

            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
                      << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK "
                      << split_k_list[split_k_id] << std::endl;

            if(tflops > best_tflops)
            {
                best_op_name    = op_name;
                best_tflops     = tflops;
                best_avg_time   = avg_time;
                best_gb_per_sec = gb_per_sec;
                best_split_k    = split_k_list[split_k_id];
            }

            if(do_verification)
            {
                wei_device_buf.FromDevice(weight_device_result.mData.data());

                bool pass = ck::utils::check_err(weight_device_result, weight_host_result);

                if(!pass)
                {
                    std::cout << "Fail info: " << op_ptr->GetTypeString() << std::endl;
                }

                all_pass &= pass;

                if(do_log)
                {
                    LogRangeAsType<float>(std::cout << "output : ", output.mData, ",")
                        << std::endl;
                    LogRangeAsType<float>(
                        std::cout << "weight (device): ", weight_device_result.mData, ",")
                        << std::endl;
                    LogRangeAsType<float>(
                        std::cout << "weight (host): ", weight_host_result.mData, ",")
                        << std::endl;
                    LogRangeAsType<float>(std::cout << "input: ", input.mData, ",")
                        << std::endl;
                }
            }
        }
        else
        {
            std::cout << op_ptr->GetTypeString() << " does not support this problem"
                      << std::endl;
        }
    }
}
std::cout << "Best configuration parameters:"
<< "\nname: " << best_op_name << "\navg_time: " << best_avg_time
<< "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
<< "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << ", SplitK "
<< best_split_k << std::endl;
return all_pass;
}
......
......@@ -34,7 +34,7 @@ enum struct GemmDataType
int profile_gemm_multiply_multiply(int argc, char* argv[])
{
if(argc != 16 && argc != 20)
{
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: "
......@@ -50,9 +50,10 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
printf("arg7: time kernel (0=no, 1=yes)\n");
printf("arg8 to 15: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n");
printf("optional:\n");
printf("arg16: number of warm-up cycles (default 1)\n");
printf("arg17: number of iterations (default 10)\n");
printf("arg18: memory for rotating buffer (default 0, size in MB)\n");
printf("arg16: number of kbatch (default 1)\n");
printf("arg17: number of warm-up cycles (default 1)\n");
printf("arg18: number of iterations (default 10)\n");
printf("arg19: memory for rotating buffer (default 0, size in MB)\n");
exit(1);
}
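// Example of the extended argument list (shape values hypothetical; KBatch is now
// arg16, pushing warm-up / iterations / rotating-buffer size to args 17-19):
//   ckProfiler gemm_multiply_multiply <dtype> <layout> <verify> <init> <log> <time> \
//       M N K StrideA StrideB StrideD0 StrideD1 StrideE KBatch n_warmup n_iter rotating_mb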
......@@ -76,11 +77,13 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
int n_warmup = 1;
int n_iter = 10;
uint64_t rotating = 0;
int KBatch = 1;
if(argc == 20)
{
KBatch = std::stoi(argv[16]);
n_warmup = std::stoi(argv[17]);
n_iter = std::stoi(argv[18]);
rotating = std::stoull(argv[19]) * 1024 * 1024;
}
using F32 = float;
......@@ -146,6 +149,7 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
(StrideD0 < 0) ? DefaultStrideD0 : StrideD0,
(StrideD1 < 0) ? DefaultStrideD1 : StrideD1,
(StrideE < 0) ? DefaultStrideE : StrideE,
KBatch,
n_warmup,
n_iter,
rotating);
......
......@@ -171,6 +171,10 @@ int profile_gemm_universal(int argc, char* argv[])
{
return profile(BF16{}, BF16{}, BF16{}, F32{}, BF16{}, Row{}, Col{}, Row{});
}
else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
{
return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Row{}, Row{});
}
else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
{
return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Col{}, Row{});
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <initializer_list>
......@@ -81,7 +81,6 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv);
ck::index_t split_k = std::stoi(argv[8 + 1 + 4 + 6 * num_dim_spatial]);
using F32 = float;
using F16 = ck::half_t;
......
......@@ -85,9 +85,11 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
const auto StrideCs = argToIntArray(argv[13]);
const int kbatch = argc == 15 ? std::stoi(argv[14]) : 1;
using F32 = float;
using F16 = ck::half_t;
#if defined(CK_ENABLE_FP8)
using F8 = ck::f8_t;
#endif
using BF16 = ck::bhalf_t;
using I8 = int8_t;
......
# SPDX-License-Identifier: MIT
# Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
# Convert miopen driver command to ck Profiler
# Example: python3 ../script/convert_miopen_driver_to_profiler.py
# /opt/rocm/bin/MIOpenDriver conv -n 32 -c 64 -H 28 -W 28 -k 64 -y 3 -x 3
# -p 1 -q 1 -u 2 -v 2 -l 1 -j 1 -m conv -g 32 -F 1 -t 1
import argparse
import subprocess
def init_const_args(args):
args.ck_profiler_cmd = '../build/bin/ckProfiler'
# use decimal values
args.init_method = 2
# don't print tensor values
args.log_value = 0
def run_ck_profiler_cmd(cmd):
print("ckProfiler command:")
print(cmd)
subprocess.run(cmd)
def parse_data_type(args):
if args.data_type == "fp32":
if args.ck_profier_op == "grouped_conv_bwd_weight" or \
args.ck_profier_op == "grouped_conv_bwd_data" or \
args.ck_profier_op == "grouped_conv_fwd":
args.data_type = 0
if args.data_type == "fp16":
if args.ck_profier_op == "grouped_conv_bwd_weight" or \
args.ck_profier_op == "grouped_conv_bwd_data" or \
args.ck_profier_op == "grouped_conv_fwd":
args.data_type = 1
if args.data_type == "int8":
if args.ck_profier_op == "grouped_conv_bwd_weight":
args.data_type = 4
if args.ck_profier_op == "grouped_conv_bwd_data":
print('Not supported data type for grouped_conv_bwd_data')
exit(1)
if args.ck_profier_op == "grouped_conv_fwd":
args.data_type = 3
if args.data_type == "bfp16":
if args.ck_profier_op == "grouped_conv_bwd_weight" or \
args.ck_profier_op == "grouped_conv_bwd_data" or \
args.ck_profier_op == "grouped_conv_fwd":
args.data_type = 2
def add_conv_params_to_cmd(args, cmd):
if args.spatial_dim == 1:
cmd += [str(args.fil_w), str(args.in_w)]
cmd += [str(args.conv_stride_w), str(args.dilation_w)]
cmd += [str(args.pad_w), str(args.pad_w)]
elif args.spatial_dim == 2:
cmd += [str(args.fil_h), str(args.fil_w)]
cmd += [str(args.in_h), str(args.in_w)]
cmd += [str(args.conv_stride_h), str(args.conv_stride_w)]
cmd += [str(args.dilation_h), str(args.dilation_w)]
cmd += [str(args.pad_h), str(args.pad_w)]
cmd += [str(args.pad_h), str(args.pad_w)]
elif args.spatial_dim == 3:
cmd += [str(args.fil_d), str(args.fil_h), str(args.fil_w)]
cmd += [str(args.in_d), str(args.in_h), str(args.in_w)]
cmd += [str(args.conv_stride_d), str(args.conv_stride_h)]
cmd += [str(args.conv_stride_w)]
cmd += [str(args.dilation_d),
str(args.dilation_h),
str(args.dilation_w)]
cmd += [str(args.pad_d), str(args.pad_h), str(args.pad_w)]
cmd += [str(args.pad_d), str(args.pad_h), str(args.pad_w)]
else:
print('Not supported spatial dim (supported: 1, 2, 3)')
exit(1)
def run_ck_grouped_conv_fwd(args):
args.ck_profier_op = "grouped_conv_fwd"
parse_data_type(args)
# default for MIOpen NHWGC
args.layout = 1
# use int32 by default
args.index_type = 0
cmd = [str(args.ck_profiler_cmd), str(args.ck_profier_op)]
cmd += [str(args.data_type), str(args.layout), str(args.index_type)]
cmd += [str(args.verify), str(args.init_method)]
cmd += [str(args.log_value), str(args.time)]
cmd += [str(args.spatial_dim), str(args.group_count)]
cmd += [str(args.batchsize), str(args.out_channels)]
cmd += [str(args.in_channels)]
add_conv_params_to_cmd(args, cmd)
run_ck_profiler_cmd(cmd)
def run_ck_grouped_conv_bwd_data(args):
args.ck_profier_op = "grouped_conv_bwd_data"
parse_data_type(args)
# default for MIOpen NHWGC
args.layout = 1
cmd = [str(args.ck_profiler_cmd), str(args.ck_profier_op)]
cmd += [str(args.data_type), str(args.layout)]
cmd += [str(args.verify), str(args.init_method)]
cmd += [str(args.log_value), str(args.time)]
cmd += [str(args.spatial_dim), str(args.group_count)]
cmd += [str(args.batchsize), str(args.out_channels)]
cmd += [str(args.in_channels)]
add_conv_params_to_cmd(args, cmd)
run_ck_profiler_cmd(cmd)
def run_ck_grouped_conv_bwd_weight(args):
args.ck_profier_op = "grouped_conv_bwd_weight"
parse_data_type(args)
# default for MIOpen NHWGC
args.layout = 2
# Test all split K values from the list {1, 2, 4, 8, 16, 32, 64, 128}
args.split_k_value = -1
cmd = [str(args.ck_profiler_cmd), str(args.ck_profier_op)]
cmd += [str(args.data_type), str(args.layout)]
cmd += [str(args.verify), str(args.init_method)]
cmd += [str(args.log_value), str(args.time)]
cmd += [str(args.spatial_dim), str(args.group_count)]
cmd += [str(args.batchsize), str(args.out_channels)]
cmd += [str(args.in_channels)]
add_conv_params_to_cmd(args, cmd)
cmd += [str(args.split_k_value)]
run_ck_profiler_cmd(cmd)
# Get the name of the MIOpen driver command and remove it from the unknown args
def process_miopen_driver_name(args, unknown):
if "convint8" in unknown:
args.data_type = 'int8'
unknown.remove("convint8")
elif "convbfp16" in unknown:
args.data_type = 'bfp16'
unknown.remove("convbfp16")
elif "convfp16" in unknown:
args.data_type = 'fp16'
unknown.remove("convfp16")
elif "conv" in unknown:
args.data_type = 'fp32'
unknown.remove("conv")
else:
print('Not supported driver (supported: conv, convfp16, convint8,'
' convbfp16).')
exit(1)
def run_ck_profiler(args):
# MIOpen takes the number of channels across all groups; the CK profiler
# takes the number of channels per group
args.in_channels = int(args.in_channels / args.group_count)
args.out_channels = int(args.out_channels / args.group_count)
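# e.g. "-c 64 -g 32" from the example command above becomes 64 / 32 = 2 input channels per group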
if args.forw == 0 or args.forw == 1 or args.forw == 3 or args.forw == 5:
run_ck_grouped_conv_fwd(args)
if args.forw == 0 or args.forw == 2 or args.forw == 3 or args.forw == 6:
run_ck_grouped_conv_bwd_data(args)
if args.forw == 0 or args.forw == 4 or args.forw == 5 or args.forw == 6:
run_ck_grouped_conv_bwd_weight(args)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog="converter",
description="Convert miopen driver command to ck Profiler"
"\nExample: python3 "
"../script/convert_miopen_driver_to_profiler.py "
"/opt/rocm/bin/MIOpenDriver conv -n 32 -c 64 -H 28 -W 28 "
"-k 64 -y 3 -x 3 -p 1 -q 1 -u 1 -v 1 -l 1 -j 1 -m conv -g "
"32 -F 1 -t 1",
)
parser.add_argument(
"-in_layout",
"-I",
default=-1,
type=int,
required=False,
help="Input Layout (Default=NCHW for 2d conv, NCDHW for 3d conv)"
)
parser.add_argument(
"-forw",
"-F",
default=0,
type=int,
required=False,
help="Flag enables fwd, bwd, wrw convolutions"
"\n0 fwd+bwd+wrw (default)"
"\n1 fwd only"
"\n2 bwd only"
"\n4 wrw only"
"\n3 fwd+bwd"
"\n5 fwd+wrw"
"\n6 bwd+wrw"
)
parser.add_argument(
"-spatial_dim",
"-_",
default=2,
type=int,
required=False,
help="convolution spatial dimension (Default-2)"
)
parser.add_argument(
"-batchsize",
"-n",
default=100,
type=int,
required=False,
help="Mini-batch size (Default=100)"
)
parser.add_argument(
"-in_channels",
"-c",
default=3,
type=int,
required=False,
help="Number of Input Channels (Default=3)"
)
parser.add_argument(
"-in_d",
"-!",
default=32,
type=int,
required=False,
help="Input Depth (Default=32)"
)
parser.add_argument(
"-in_h",
"-H",
default=32,
type=int,
required=False,
help="Input Height (Default=32)"
)
parser.add_argument(
"-in_w",
"-W",
default=32,
type=int,
required=False,
help="Input Width (Default=32)"
)
parser.add_argument(
"-out_channels",
"-k",
default=32,
type=int,
required=False,
help="Number of Output Channels (Default=32)"
)
parser.add_argument(
"-fil_d",
"-@",
default=3,
type=int,
required=False,
help="Filter Depth (Default=3)"
)
parser.add_argument(
"-fil_h",
"-y",
default=3,
type=int,
required=False,
help="Filter Height (Default=3)"
)
parser.add_argument(
"-fil_w",
"-x",
default=3,
type=int,
required=False,
help="Filter Width (Default=3)"
)
parser.add_argument(
"-conv_stride_d",
"-#",
default=1,
type=int,
required=False,
help="Convolution Stride for Depth (Default=1)"
)
parser.add_argument(
"-conv_stride_h",
"-u",
default=1,
type=int,
required=False,
help="Convolution Stride for Height (Default=1)"
)
parser.add_argument(
"-conv_stride_w",
"-v",
default=1,
type=int,
required=False,
help="Convolution Stride for Width (Default=1)"
)
parser.add_argument(
"-pad_d",
"-$",
default=1,
type=int,
required=False,
help="Zero Padding for Depth (Default=0)"
)
parser.add_argument(
"-pad_h",
"-p",
default=1,
type=int,
required=False,
help="Zero Padding for Height (Default=0)"
)
parser.add_argument(
"-pad_w",
"-q",
default=1,
type=int,
required=False,
help="Zero Padding for Width (Default=0)"
)
parser.add_argument(
"-verify",
"-V",
default=1,
type=int,
required=False,
help="Verify Each Layer (Default=1)"
)
parser.add_argument(
"-time",
"-t",
default=0,
type=int,
required=False,
help="Time Each Layer (Default=0)"
)
parser.add_argument(
"-dilation_d",
"-^",
default=1,
type=int,
required=False,
help="Dilation of Filter Depth (Default=1)"
)
parser.add_argument(
"-dilation_h",
"-l",
default=1,
type=int,
required=False,
help="Dilation of Filter Height (Default=1)"
)
parser.add_argument(
"-dilation_w",
"-j",
default=1,
type=int,
required=False,
help="Dilation of Filter Width (Default=1)"
)
parser.add_argument(
"-group_count",
"-g",
type=int,
default=1,
required=False,
help="Number of Groups (Default=1)"
)
args, unknown = parser.parse_known_args()
init_const_args(args)
process_miopen_driver_name(args, unknown)
print("Ignored args:")
print(unknown)
run_ck_profiler(args)
......@@ -122,7 +122,7 @@ def parse_logfile(logfile):
#sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
test_list=list(range(1,len(tests)+1))
#parse conv_fwd and conv_bwd performance tests:
elif 'conv_fwd' in logfile or 'conv_bwd' in logfile:
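# 'conv_bwd' matches both the plain and the grouped logs,
# e.g. perf_conv_bwd_data.log and perf_grouped_conv_bwd_weight.log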
for line in open(logfile):
if 'tflops:' in line:
lst=line.split()
......@@ -274,14 +274,26 @@ def main():
for i in range(1,len(results)+1):
testlist.append("Test%i"%i)
table_name="ck_grouped_gemm_tflops"
if 'perf_conv_fwd' in filename:
for i in range(1,len(results)+1):
testlist.append("Test%i"%i)
table_name="ck_conv_fwd_tflops"
if 'perf_conv_bwd_data' in filename:
for i in range(1,len(results)+1):
testlist.append("Test%i"%i)
table_name="ck_conv_bwd_data_tflops"
if 'grouped_conv_fwd' in filename:
for i in range(1,len(results)+1):
testlist.append("Test%i"%i)
table_name="ck_grouped_conv_fwd_tflops"
if 'grouped_conv_bwd_data' in filename:
for i in range(1,len(results)+1):
testlist.append("Test%i"%i)
table_name="ck_grouped_conv_bwd_data_tflops"
if 'grouped_conv_bwd_weight' in filename:
for i in range(1,len(results)+1):
testlist.append("Test%i"%i)
table_name="ck_grouped_conv_bwd_weight_tflops"
if 'gemm_bilinear' in filename:
for i in range(1,len(results)+1):
testlist.append("Test%i"%i)
......
......@@ -15,8 +15,9 @@ python3 process_perf_data.py perf_resnet50_N256.log
python3 process_perf_data.py perf_resnet50_N4.log
python3 process_perf_data.py perf_batched_gemm.log
python3 process_perf_data.py perf_grouped_gemm.log
python3 process_perf_data.py perf_grouped_conv_fwd.log
python3 process_perf_data.py perf_grouped_conv_bwd_data.log
python3 process_perf_data.py perf_grouped_conv_bwd_weight.log
python3 process_perf_data.py perf_gemm_bilinear.log
python3 process_perf_data.py perf_reduction.log
python3 process_perf_data.py perf_splitK_gemm.log
......
......@@ -12,27 +12,28 @@ INIT=$5
LOG=$6
TIME=$7
N=$8
SplitK=$9
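# SplitK <= 0 lets the profiler sweep its built-in split-K list
# {1, 2, 4, 8, 16, 32, 64, 128} instead of pinning a single value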
# Resnet50
######## op datatype layout verify init log time conv_dim G__ N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1 $SplitK
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3 $SplitK
#!/bin/bash
## GPU visibility
export HIP_VISIBLE_DEVICES=0
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
LAYOUT=$3
INDEXTYPE=$4
VERIFY=$5
INIT=$6
LOG=$7
TIME=$8
N=$9
# Resnet50
######## op datatype indextype layout verify init log time conv_dim G__ N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
$DRIVER $OP $DATATYPE $INDEXTYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3
......@@ -90,21 +90,27 @@ print_log_header $gemm_bilinear_log $env_type $branch $host_name
./profile_gemm_bilinear.sh gemm_bilinear 1 2 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log
./profile_gemm_bilinear.sh gemm_bilinear 1 3 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log
#run grouped_fwd tests
export grouped_conv_fwd_log="perf_grouped_conv_fwd.log"
print_log_header $grouped_conv_fwd_log $env_type $branch $host_name
./profile_grouped_conv_fwd.sh grouped_conv_fwd 0 1 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_log
./profile_grouped_conv_fwd.sh grouped_conv_fwd 1 1 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_log
./profile_grouped_conv_fwd.sh grouped_conv_fwd 2 1 0 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_fwd_log
#run grouped_bwd_data tests
export grouped_conv_bwd_data_log="perf_grouped_conv_bwd_data.log"
print_log_header $grouped_conv_bwd_data_log $env_type $branch $host_name
./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 0 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 1 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
./profile_grouped_conv_bwd_data.sh grouped_conv_bwd_data 2 1 $verify 1 0 1 256 2>&1 | tee -a $grouped_conv_bwd_data_log
#run grouped_bwd_weight tests
export grouped_conv_bwd_weight_log="perf_grouped_conv_bwd_weight.log"
print_log_header $grouped_conv_bwd_weight_log $env_type $branch $host_name
./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 0 2 $verify 1 0 1 256 1 2>&1 | tee -a $grouped_conv_bwd_weight_log
./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 1 2 $verify 1 0 1 256 1 2>&1 | tee -a $grouped_conv_bwd_weight_log
./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 2 2 $verify 1 0 1 256 1 2>&1 | tee -a $grouped_conv_bwd_weight_log
./profile_grouped_conv_bwd_weight.sh grouped_conv_bwd_weight 1 2 $verify 1 0 1 256 4 2>&1 | tee -a $grouped_conv_bwd_weight_log
#run resnet50 tests
export resnet256_log="perf_resnet50_N256.log"
......