merge with the develop branch

f6ceef78 · ThomasNing · 536c5458 · 25935b57 · f6ceef78 · f6ceef78
Commit f6ceef78 authored Aug 26, 2024 by ThomasNing
20 changed files
--- a/library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_6d_fp32_fp8_instances.cpp
+++ b/library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_6d_fp32_fp8_instances.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Scale = element_wise::Scale;
+
+// Appends the `device_permute_scale_f32_f8_instances<6, Scale>` set to `instances`.
+// Without CK_ENABLE_FP8 nothing is registered; the argument is only assigned to
+// `ignore` (presumably to keep the parameter from being reported as unused).
+void add_device_permute_scale_6d_f32_f8_instances(
+    std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F8>, Scale, 6>>>&
+        instances)
+{
+#if !defined(CK_ENABLE_FP8)
+    ignore = instances;
+#else
+    add_device_operation_instances(instances, device_permute_scale_f32_f8_instances<6, Scale>{});
+#endif
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp
@@ -10,15 +10,24 @@ namespace device {
 namespace instance {

 // clang-format off
-// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex 
-template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<F32, F32, F32, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F32, F32, F32, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
-template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
-template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
-template void add_device_reduce_instance_blockwise<F32, F32, F32, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
-template void add_device_reduce_instance_blockwise<F32, F32, F32, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F32, F32, F32, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
+//                                                 InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex 
+template void add_device_reduce_instance_blockwise<        F32,          F32,          F32,     4,             3,       ReduceAMax,         UnaryAbs,       PassThrough,        false,     false>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
+template void add_device_reduce_instance_blockwise<        F32,          F32,          F32,     4,             4,       ReduceAMax,         UnaryAbs,       PassThrough,        false,     false>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
+template void add_device_reduce_instance_blockwise<        F32,          F32,          F32,     4,             1,       ReduceAMax,         UnaryAbs,       PassThrough,        false,     false>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
+template void add_device_reduce_instance_blockwise<        F32,          F32,          F32,     2,             1,       ReduceAMax,         UnaryAbs,       PassThrough,        false,     false>(std::vector<DeviceReducePtr<F32, F32, F32, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
+template void add_device_reduce_instance_blockwise<        F32,          F32,          F32,     4,             3,       ReduceAMax,         UnaryAbs,       PassThrough,        false,      true>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
+template void add_device_reduce_instance_blockwise<        F32,          F32,          F32,     4,             4,       ReduceAMax,         UnaryAbs,       PassThrough,        false,      true>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
+template void add_device_reduce_instance_blockwise<        F32,          F32,          F32,     4,             1,       ReduceAMax,         UnaryAbs,       PassThrough,        false,      true>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
+template void add_device_reduce_instance_blockwise<        F32,          F32,          F32,     2,             1,       ReduceAMax,         UnaryAbs,       PassThrough,        false,      true>(std::vector<DeviceReducePtr<F32, F32, F32, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
+template void add_device_reduce_instance_blockwise<        F32,          F32,          F32,     6,             6,       ReduceAMax,         UnaryAbs,       PassThrough,         true,     false>(std::vector<DeviceReducePtr<F32, F32, F32, 6, 6, ReduceAMax, UnaryAbs, PassThrough, true, false>>&);
+template void add_device_reduce_instance_blockwise<        F32,          F32,          F32,     5,             5,       ReduceAMax,         UnaryAbs,       PassThrough,         true,     false>(std::vector<DeviceReducePtr<F32, F32, F32, 5, 5, ReduceAMax, UnaryAbs, PassThrough, true, false>>&);
+template void add_device_reduce_instance_blockwise<        F32,          F32,          F32,     4,             4,       ReduceAMax,         UnaryAbs,       PassThrough,         true,     false>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, true, false>>&);
+template void add_device_reduce_instance_blockwise<        F32,          F32,          F32,     6,             3,       ReduceAMax,         UnaryAbs,       PassThrough,         true,     false>(std::vector<DeviceReducePtr<F32, F32, F32, 6, 3, ReduceAMax, UnaryAbs, PassThrough, true, false>>&);
+template void add_device_reduce_instance_blockwise<        F32,          F32,          F32,     5,             3,       ReduceAMax,         UnaryAbs,       PassThrough,         true,     false>(std::vector<DeviceReducePtr<F32, F32, F32, 5, 3, ReduceAMax, UnaryAbs, PassThrough, true, false>>&);
+template void add_device_reduce_instance_blockwise<        F32,          F32,          F32,     4,             3,       ReduceAMax,         UnaryAbs,       PassThrough,         true,     false>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, true, false>>&);
+template void add_device_reduce_instance_blockwise<        F32,          F32,          F32,     3,             3,       ReduceAMax,      PassThrough,       PassThrough,         true,     false>(std::vector<DeviceReducePtr<F32, F32, F32, 3, 3, ReduceAMax, PassThrough, PassThrough, true, false>>&); // NOTE(review): this and the next two rows (rank 3/2/1) use PassThrough as InElementwiseOp, unlike the UnaryAbs rows above - confirm this is intended
+template void add_device_reduce_instance_blockwise<        F32,          F32,          F32,     2,             2,       ReduceAMax,      PassThrough,       PassThrough,         true,     false>(std::vector<DeviceReducePtr<F32, F32, F32, 2, 2, ReduceAMax, PassThrough, PassThrough, true, false>>&);
+template void add_device_reduce_instance_blockwise<        F32,          F32,          F32,     1,             1,       ReduceAMax,      PassThrough,       PassThrough,         true,     false>(std::vector<DeviceReducePtr<F32, F32, F32, 1, 1, ReduceAMax, PassThrough, PassThrough, true, false>>&);
 // clang-format on

 } // namespace instance

--- a/library/src/utility/convolution_parameter.cpp
+++ b/library/src/utility/convolution_parameter.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "ck/host_utility/io.hpp"

@@ -20,6 +20,63 @@ ConvParam::ConvParam(ck::index_t n_dim,
                     const std::vector<ck::index_t>& dilations,
                     const std::vector<ck::index_t>& left_pads,
                     const std::vector<ck::index_t>& right_pads)
+    : num_dim_spatial_(static_cast<ck::long_index_t>(n_dim)),
+      G_(static_cast<ck::long_index_t>(group_count)),
+      N_(static_cast<ck::long_index_t>(n_batch)),
+      K_(static_cast<ck::long_index_t>(n_out_channels)),
+      C_(static_cast<ck::long_index_t>(n_in_channels)),
+      filter_spatial_lengths_(num_dim_spatial_),
+      input_spatial_lengths_(num_dim_spatial_),
+      output_spatial_lengths_(num_dim_spatial_),
+      conv_filter_strides_(num_dim_spatial_),
+      conv_filter_dilations_(num_dim_spatial_),
+      input_left_pads_(num_dim_spatial_),
+      input_right_pads_(num_dim_spatial_)
+{
+    // Validate the caller-supplied vectors rather than the members: the members were
+    // just sized to num_dim_spatial_ by the initializer list, so testing them can never
+    // fail, and a too-short argument would then be read out of bounds in the loop below.
+    if(static_cast<ck::index_t>(filters_len.size()) != num_dim_spatial_ ||
+       static_cast<ck::index_t>(input_len.size()) != num_dim_spatial_ ||
+       static_cast<ck::index_t>(strides.size()) != num_dim_spatial_ ||
+       static_cast<ck::index_t>(dilations.size()) != num_dim_spatial_ ||
+       static_cast<ck::index_t>(left_pads.size()) != num_dim_spatial_ ||
+       static_cast<ck::index_t>(right_pads.size()) != num_dim_spatial_)
+    {
+        throw(
+            std::runtime_error("ConvParam::ConvParam: "
+                               "parameter size is different from number of declared dimensions!"));
+    }
+
+    // Widen every per-dimension argument into the long_index_t members and derive the
+    // output length of that dimension from them.
+    for(ck::index_t i = 0; i < num_dim_spatial_; ++i)
+    {
+        filter_spatial_lengths_[i] = static_cast<ck::long_index_t>(filters_len[i]);
+        input_spatial_lengths_[i]  = static_cast<ck::long_index_t>(input_len[i]);
+        conv_filter_strides_[i]    = static_cast<ck::long_index_t>(strides[i]);
+        conv_filter_dilations_[i]  = static_cast<ck::long_index_t>(dilations[i]);
+        input_left_pads_[i]        = static_cast<ck::long_index_t>(left_pads[i]);
+        input_right_pads_[i]       = static_cast<ck::long_index_t>(right_pads[i]);
+
+        // XEff = (X - 1) * conv_dilation_w + 1;
+        // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+        const ck::long_index_t x_eff =
+            (filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1;
+
+        output_spatial_lengths_[i] =
+            (input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - x_eff) /
+                conv_filter_strides_[i] +
+            1;
+    }
+}
+
+ConvParam::ConvParam(ck::long_index_t n_dim,
+                     ck::long_index_t group_count,
+                     ck::long_index_t n_batch,
+                     ck::long_index_t n_out_channels,
+                     ck::long_index_t n_in_channels,
+                     const std::vector<ck::long_index_t>& filters_len,
+                     const std::vector<ck::long_index_t>& input_len,
+                     const std::vector<ck::long_index_t>& strides,
+                     const std::vector<ck::long_index_t>& dilations,
+                     const std::vector<ck::long_index_t>& left_pads,
+                     const std::vector<ck::long_index_t>& right_pads)
    : num_dim_spatial_(n_dim),
      G_(group_count),
      N_(n_batch),
@@ -49,7 +106,8 @@ ConvParam::ConvParam(ck::index_t n_dim,
    {
        // XEff = (X - 1) * conv_dilation_w + 1;
        // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
-        const ck::index_t x_eff = (filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1;
+        const ck::long_index_t x_eff =
+            (filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1;

        output_spatial_lengths_[i] =
            (input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - x_eff) /
@@ -63,7 +121,7 @@ ConvParam::ConvParam()
 {
 }

-std::vector<ck::index_t> ConvParam::GetOutputSpatialLengths() const
+std::vector<ck::long_index_t> ConvParam::GetOutputSpatialLengths() const
 {
    return output_spatial_lengths_; // returned by value: every call copies the whole vector
 }
@@ -97,46 +155,46 @@ std::string get_conv_param_parser_helper_msg()

 ck::utils::conv::ConvParam parse_conv_param(int num_dim_spatial, int arg_idx, char* const argv[])
 {
-    const ck::index_t G = std::stoi(argv[arg_idx++]);
-    const ck::index_t N = std::stoi(argv[arg_idx++]);
-    const ck::index_t K = std::stoi(argv[arg_idx++]);
-    const ck::index_t C = std::stoi(argv[arg_idx++]);
-
-    std::vector<ck::index_t> filter_spatial_lengths(num_dim_spatial);
-    std::vector<ck::index_t> input_spatial_lengths(num_dim_spatial);
-    std::vector<ck::index_t> conv_filter_strides(num_dim_spatial);
-    std::vector<ck::index_t> conv_filter_dilations(num_dim_spatial);
-    std::vector<ck::index_t> input_left_pads(num_dim_spatial);
-    std::vector<ck::index_t> input_right_pads(num_dim_spatial);
+    const ck::long_index_t G = std::stol(argv[arg_idx++]);
+    const ck::long_index_t N = std::stol(argv[arg_idx++]);
+    const ck::long_index_t K = std::stol(argv[arg_idx++]);
+    const ck::long_index_t C = std::stol(argv[arg_idx++]);
+
+    std::vector<ck::long_index_t> filter_spatial_lengths(num_dim_spatial);
+    std::vector<ck::long_index_t> input_spatial_lengths(num_dim_spatial);
+    std::vector<ck::long_index_t> conv_filter_strides(num_dim_spatial);
+    std::vector<ck::long_index_t> conv_filter_dilations(num_dim_spatial);
+    std::vector<ck::long_index_t> input_left_pads(num_dim_spatial);
+    std::vector<ck::long_index_t> input_right_pads(num_dim_spatial);

    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        filter_spatial_lengths[i] = std::stol(argv[arg_idx++]);
    }

    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        input_spatial_lengths[i] = std::stol(argv[arg_idx++]);
    }

    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+        conv_filter_strides[i] = std::stol(argv[arg_idx++]);
    }

    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+        conv_filter_dilations[i] = std::stol(argv[arg_idx++]);
    }

    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        input_left_pads[i] = std::stoi(argv[arg_idx++]);
+        input_left_pads[i] = std::stol(argv[arg_idx++]);
    }

    for(int i = 0; i < num_dim_spatial; ++i)
    {
-        input_right_pads[i] = std::stoi(argv[arg_idx++]);
+        input_right_pads[i] = std::stol(argv[arg_idx++]);
    }

    return ck::utils::conv::ConvParam{num_dim_spatial,

--- a/profiler/include/profiler/profile_conv_bwd_data_impl.hpp
+++ b/profiler/include/profiler/profile_conv_bwd_data_impl.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -82,6 +82,29 @@ bool profile_conv_bwd_data_impl(int do_verification,
    Tensor<WeiDataType> weight(wei_g_k_c_xs_desc);
    Tensor<OutDataType> output(out_g_n_k_wos_desc);

+    // ck::index_t copies of the ConvParam spatial vectors; these are what gets passed
+    // to MakeArgumentPointer further down.
+    // NOTE(review): static_cast does not range-check, so a value that does not fit in
+    // ck::index_t is narrowed silently.
+    std::vector<ck::index_t> input_spatial_lengths_i32(NDimSpatial);
+    std::vector<ck::index_t> filter_spatial_lengths_i32(NDimSpatial);
+    std::vector<ck::index_t> output_spatial_lengths_i32(NDimSpatial);
+    std::vector<ck::index_t> conv_filter_strides_i32(NDimSpatial);
+    std::vector<ck::index_t> conv_filter_dilations_i32(NDimSpatial);
+    std::vector<ck::index_t> input_left_pads_i32(NDimSpatial);
+    std::vector<ck::index_t> input_right_pads_i32(NDimSpatial);
+
+    // GetOutputSpatialLengths() hands back its vector by value; fetch it once instead
+    // of copying the whole vector on every loop iteration.
+    const auto output_spatial_lengths_long = conv_param.GetOutputSpatialLengths();
+
+    for(ck::index_t d = 0; d < NDimSpatial; d++)
+    {
+        input_spatial_lengths_i32[d] =
+            static_cast<ck::index_t>(conv_param.input_spatial_lengths_[d]);
+        filter_spatial_lengths_i32[d] =
+            static_cast<ck::index_t>(conv_param.filter_spatial_lengths_[d]);
+        output_spatial_lengths_i32[d] =
+            static_cast<ck::index_t>(output_spatial_lengths_long[d]);
+        conv_filter_strides_i32[d] = static_cast<ck::index_t>(conv_param.conv_filter_strides_[d]);
+        conv_filter_dilations_i32[d] =
+            static_cast<ck::index_t>(conv_param.conv_filter_dilations_[d]);
+        input_left_pads_i32[d]  = static_cast<ck::index_t>(conv_param.input_left_pads_[d]);
+        input_right_pads_i32[d] = static_cast<ck::index_t>(conv_param.input_right_pads_[d]);
+    }
+
    std::cout << "input: " << input_host_result.mDesc << std::endl;
    std::cout << "weight: " << weight.mDesc << std::endl;
    std::cout << "output: " << output.mDesc << std::endl;
@@ -161,16 +184,16 @@ bool profile_conv_bwd_data_impl(int do_verification,
            op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                        static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                        static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                        conv_param.N_,
-                                        conv_param.K_,
-                                        conv_param.C_,
-                                        conv_param.input_spatial_lengths_,
-                                        conv_param.filter_spatial_lengths_,
-                                        conv_param.output_spatial_lengths_,
-                                        conv_param.conv_filter_strides_,
-                                        conv_param.conv_filter_dilations_,
-                                        conv_param.input_left_pads_,
-                                        conv_param.input_right_pads_,
+                                        static_cast<ck::index_t>(conv_param.N_),
+                                        static_cast<ck::index_t>(conv_param.K_),
+                                        static_cast<ck::index_t>(conv_param.C_),
+                                        input_spatial_lengths_i32,
+                                        filter_spatial_lengths_i32,
+                                        output_spatial_lengths_i32,
+                                        conv_filter_strides_i32,
+                                        conv_filter_dilations_i32,
+                                        input_left_pads_i32,
+                                        input_right_pads_i32,
                                        in_element_op,
                                        wei_element_op,
                                        out_element_op);

--- a/profiler/include/profiler/profile_conv_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_conv_fwd_impl.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -60,6 +60,29 @@ bool profile_conv_fwd_impl(int do_verification,
    Tensor<OutDataType> host_output(out_g_n_k_wos_desc);
    Tensor<OutDataType> device_output(out_g_n_k_wos_desc);

+    // ck::index_t copies of the ConvParam spatial vectors; these are what gets passed
+    // to MakeArgumentPointer further down.
+    // NOTE(review): static_cast does not range-check, so a value that does not fit in
+    // ck::index_t is narrowed silently.
+    std::vector<ck::index_t> input_spatial_lengths_i32(NDimSpatial);
+    std::vector<ck::index_t> filter_spatial_lengths_i32(NDimSpatial);
+    std::vector<ck::index_t> output_spatial_lengths_i32(NDimSpatial);
+    std::vector<ck::index_t> conv_filter_strides_i32(NDimSpatial);
+    std::vector<ck::index_t> conv_filter_dilations_i32(NDimSpatial);
+    std::vector<ck::index_t> input_left_pads_i32(NDimSpatial);
+    std::vector<ck::index_t> input_right_pads_i32(NDimSpatial);
+
+    // GetOutputSpatialLengths() hands back its vector by value; fetch it once instead
+    // of copying the whole vector on every loop iteration.
+    const auto output_spatial_lengths_long = conv_param.GetOutputSpatialLengths();
+
+    for(ck::index_t d = 0; d < NDimSpatial; d++)
+    {
+        input_spatial_lengths_i32[d] =
+            static_cast<ck::index_t>(conv_param.input_spatial_lengths_[d]);
+        filter_spatial_lengths_i32[d] =
+            static_cast<ck::index_t>(conv_param.filter_spatial_lengths_[d]);
+        output_spatial_lengths_i32[d] =
+            static_cast<ck::index_t>(output_spatial_lengths_long[d]);
+        conv_filter_strides_i32[d] = static_cast<ck::index_t>(conv_param.conv_filter_strides_[d]);
+        conv_filter_dilations_i32[d] =
+            static_cast<ck::index_t>(conv_param.conv_filter_dilations_[d]);
+        input_left_pads_i32[d]  = static_cast<ck::index_t>(conv_param.input_left_pads_[d]);
+        input_right_pads_i32[d] = static_cast<ck::index_t>(conv_param.input_right_pads_[d]);
+    }
+
    std::cout << "input: " << input.mDesc << std::endl;
    std::cout << "weight: " << weight.mDesc << std::endl;
    std::cout << "output: " << host_output.mDesc << std::endl;
@@ -143,16 +166,16 @@ bool profile_conv_fwd_impl(int do_verification,
            op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                        static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                        static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                        conv_param.N_,
-                                        conv_param.K_,
-                                        conv_param.C_,
-                                        conv_param.input_spatial_lengths_,
-                                        conv_param.filter_spatial_lengths_,
-                                        conv_param.GetOutputSpatialLengths(),
-                                        conv_param.conv_filter_strides_,
-                                        conv_param.conv_filter_dilations_,
-                                        conv_param.input_left_pads_,
-                                        conv_param.input_right_pads_,
+                                        static_cast<ck::index_t>(conv_param.N_),
+                                        static_cast<ck::index_t>(conv_param.K_),
+                                        static_cast<ck::index_t>(conv_param.C_),
+                                        input_spatial_lengths_i32,
+                                        filter_spatial_lengths_i32,
+                                        output_spatial_lengths_i32,
+                                        conv_filter_strides_i32,
+                                        conv_filter_dilations_i32,
+                                        input_left_pads_i32,
+                                        input_right_pads_i32,
                                        in_element_op,
                                        wei_element_op,
                                        out_element_op);

--- a/profiler/include/profiler/profile_gemm_multiply_multiply_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_multiply_multiply_impl.hpp
@@ -48,6 +48,7 @@ bool profile_gemm_multiply_multiply_impl(int do_verification,
                                         int StrideD0,
                                         int StrideD1,
                                         int StrideE,
+                                         int KBatch,
                                         int n_warmup,
                                         int n_iter,
                                         uint64_t rotating = 0)
@@ -129,17 +130,17 @@ bool profile_gemm_multiply_multiply_impl(int do_verification,
    d1_device_buf.ToDevice(d1_m_n.mData.data());

    using DeviceOp =
-        ck::tensor_operation::device::DeviceGemmMultipleD<ALayout,
-                                                          BLayout,
-                                                          ck::Tuple<D0Layout, D1Layout>,
-                                                          ELayout,
-                                                          ADataType,
-                                                          BDataType,
-                                                          ck::Tuple<D0DataType, D1DataType>,
-                                                          EDataType,
-                                                          AElementOp,
-                                                          BElementOp,
-                                                          CElementOp>;
+        ck::tensor_operation::device::DeviceGemmMultipleDSplitK<ALayout,
+                                                                BLayout,
+                                                                ck::Tuple<D0Layout, D1Layout>,
+                                                                ELayout,
+                                                                ADataType,
+                                                                BDataType,
+                                                                ck::Tuple<D0DataType, D1DataType>,
+                                                                EDataType,
+                                                                AElementOp,
+                                                                BElementOp,
+                                                                CElementOp>;

    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
@@ -182,104 +183,128 @@ bool profile_gemm_multiply_multiply_impl(int do_verification,
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;
+    float best_kbatch     = 0;

    // profile device GEMM instances
    for(auto& op_ptr : op_ptrs)
    {
-        auto argument_ptr =
-            op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
-                                        static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
-                                        std::array<const void*, 2>{d0_device_buf.GetDeviceBuffer(),
-                                                                   d1_device_buf.GetDeviceBuffer()},
-                                        static_cast<EDataType*>(c_device_buf.GetDeviceBuffer()),
-                                        M,
-                                        N,
-                                        K,
-                                        StrideA,
-                                        StrideB,
-                                        std::array<ck::index_t, 2>{StrideD0, StrideD1},
-                                        StrideE,
-                                        a_element_op,
-                                        b_element_op,
-                                        c_element_op);
-
-        auto invoker_ptr = op_ptr->MakeInvokerPointer();
-
-        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
-        {
-
-            // re-init C to zero before profiling next kernel
-            c_device_buf.SetZero();
+        std::vector<int> kbatch_list = {1, 2, 4, 8, 16, 19, 32, 38};

-            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false, 0, n_warmup, n_iter});
+        if(KBatch > 0)
+        {
+            kbatch_list = {KBatch};
+        }

-            if(do_verification)
+        for(std::size_t i = 0; i < kbatch_list.size(); i++)
+        {
+            auto kbatch_curr = kbatch_list[i];
+
+            auto argument_ptr = op_ptr->MakeArgumentPointer(
+                static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                std::array<const void*, 2>{d0_device_buf.GetDeviceBuffer(),
+                                           d1_device_buf.GetDeviceBuffer()},
+                static_cast<EDataType*>(c_device_buf.GetDeviceBuffer()),
+                M,
+                N,
+                K,
+                StrideA,
+                StrideB,
+                std::array<ck::index_t, 2>{StrideD0, StrideD1},
+                StrideE,
+                kbatch_curr,
+                a_element_op,
+                b_element_op,
+                c_element_op);
+
+            auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+            if(op_ptr->IsSupportedArgument(argument_ptr.get()))
            {
-                c_device_buf.FromDevice(e_m_n_device_result.mData.data());

-                pass = pass & ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
+                // re-init C to zero before profiling next kernel
+                c_device_buf.SetZero();
+
+                invoker_ptr->Run(argument_ptr.get(),
+                                 StreamConfig{nullptr, false, 0, n_warmup, n_iter});

-                if(do_log)
+                if(do_verification)
                {
-                    LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
-                    LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
-                    LogRangeAsType<float>(std::cout << "c_host  : ", e_m_n_host_result.mData, ",")
-                        << std::endl;
-                    LogRangeAsType<float>(std::cout << "c_device: ", e_m_n_device_result.mData, ",")
-                        << std::endl;
+                    c_device_buf.FromDevice(e_m_n_device_result.mData.data());
+
+                    pass = pass & ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
+
+                    if(do_log)
+                    {
+                        LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
+                        LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
+                        LogRangeAsType<float>(
+                            std::cout << "c_host  : ", e_m_n_host_result.mData, ",")
+                            << std::endl;
+                        LogRangeAsType<float>(
+                            std::cout << "c_device: ", e_m_n_device_result.mData, ",")
+                            << std::endl;
+                    }
                }
-            }

-            std::string op_name = op_ptr->GetTypeString();
+                std::string op_name = op_ptr->GetTypeString();

-            float ave_time = invoker_ptr->Run(
-                argument_ptr.get(),
-                StreamConfig{
-                    nullptr, time_kernel, 0, n_warmup, n_iter, rotating_count > 1, rotating_count});
+                float ave_time = invoker_ptr->Run(argument_ptr.get(),
+                                                  StreamConfig{nullptr,
+                                                               time_kernel,
+                                                               0,
+                                                               n_warmup,
+                                                               n_iter,
+                                                               rotating_count > 1,
+                                                               rotating_count});

-            std::size_t flop = std::size_t(2) * M * N * K;
+                std::size_t flop = std::size_t(2) * M * N * K;

-            std::size_t num_btype =
-                sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+                std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                                        sizeof(EDataType) * M * N;

-            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+                float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

-            float gb_per_sec = num_btype / 1.E6 / ave_time;
+                float gb_per_sec = num_btype / 1.E6 / ave_time;

-            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
-                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+                std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops
+                          << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", KBatch "
+                          << kbatch_curr << std::endl;

 #if defined CK_ENABLE_FP8
-            // set softer tolerances for fp8
-            if constexpr(is_same_v<ADataType, f8_t> || is_same_v<BDataType, f8_t> ||
-                         is_same_v<EDataType, f8_t>)
-            {
-                std::string msg = "Error: Incorrect results!";
-                double rtol     = 1e-1;
-                double atol     = 1e-1;
-                pass            = pass & ck::utils::check_err(
-                                  e_m_n_device_result, e_m_n_host_result, msg, rtol, atol);
-            }
-            else
-            {
+                // set softer tolerances for fp8
+                if constexpr(is_same_v<ADataType, f8_t> || is_same_v<BDataType, f8_t> ||
+                             is_same_v<EDataType, f8_t>)
+                {
+                    std::string msg = "Error: Incorrect results!";
+                    double rtol     = 1e-1;
+                    double atol     = 1e-1;
+                    pass            = pass & ck::utils::check_err(
+                                      e_m_n_device_result, e_m_n_host_result, msg, rtol, atol);
+                }
+                else
+                {
 #endif
-                pass = pass & ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
+                    pass = pass & ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
 #if defined CK_ENABLE_FP8
-            }
+                }
 #endif

-            if(tflops > best_tflops)
+                if(tflops > best_tflops && ave_time > 1e-10)
+                {
+                    best_op_name    = op_name;
+                    best_tflops     = tflops;
+                    best_ave_time   = ave_time;
+                    best_gb_per_sec = gb_per_sec;
+                    best_kbatch     = kbatch_curr;
+                }
+            }
+            else
            {
-                best_op_name    = op_name;
-                best_tflops     = tflops;
-                best_ave_time   = ave_time;
-                best_gb_per_sec = gb_per_sec;
+                std::cout << op_ptr->GetTypeString() << " does not support this problem"
+                          << std::endl;
            }
        }
-        else
-        {
-            std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
-        }
    }

    if constexpr(is_same<EDataType, float>::value)
@@ -318,9 +343,9 @@ bool profile_gemm_multiply_multiply_impl(int do_verification,
    }

    std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA
-              << " StrideB = " << StrideB << " StrideE = " << StrideE << " : " << best_ave_time
-              << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "
-              << best_op_name << std::endl;
+              << " StrideB = " << StrideB << " StrideE = " << StrideE << " KBatch = " << best_kbatch
+              << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
+              << " GB/s, " << best_op_name << std::endl;

    return pass;
 }

--- a/profiler/include/profiler/profile_gemm_universal_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_universal_impl.hpp
@@ -152,7 +152,7 @@ bool profile_gemm_universal_impl(int do_verification,
    // profile device GEMM instances
    for(auto& op_ptr : op_ptrs)
    {
-        std::vector<int> kbatch_list = {1, 2, 4, 8, 12, 16, 19, 20, 32, 38};
+        std::vector<int> kbatch_list = {1, 2, 4, 8, 16, 19, 32, 38};

        if(KBatch > 0)
        {
@@ -249,7 +249,7 @@ bool profile_gemm_universal_impl(int do_verification,
                          << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", KBatch "
                          << kbatch_curr << std::endl;

-                if(tflops > best_tflops)
+                if(tflops > best_tflops && ave_time > 1e-10)
                {
                    best_op_name    = op_name;
                    best_tflops     = tflops;

--- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
@@ -136,9 +136,10 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    std::string best_op_name;
-    float best_avg_time   = 0;
-    float best_tflops     = 0;
-    float best_gb_per_sec = 0;
+    float best_avg_time      = 0;
+    float best_tflops        = 0;
+    float best_gb_per_sec    = 0;
+    ck::index_t best_split_k = 1;

    // profile device Conv instances
    bool all_pass = true;
@@ -167,99 +168,111 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
    range_copy(conv_param.input_left_pads_, begin(input_left_pads));
    range_copy(conv_param.input_right_pads_, begin(input_right_pads));

+    std::vector<ck::index_t> split_k_list = {1, 2, 4, 8, 16, 32, 64, 128};
+
+    if(split_k > 0)
+    {
+        split_k_list = {split_k};
+    }
+
    for(auto& op_ptr : op_ptrs)
    {
-        auto argument_ptr =
-            op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-                                        static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-                                        static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                        input_lengths,
-                                        input_strides,
-                                        filter_lengths,
-                                        weights_strides,
-                                        output_lengths,
-                                        output_strides,
-                                        conv_filter_strides,
-                                        conv_filter_dilations,
-                                        input_left_pads,
-                                        input_right_pads,
-                                        in_element_op,
-                                        wei_element_op,
-                                        out_element_op,
-                                        split_k);
-
-        const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
-        DeviceMem workspace_dev(workspace_sz);
-        op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
-
-        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        for(std::size_t split_k_id = 0; split_k_id < split_k_list.size(); split_k_id++)
        {
-            // using atomic add, so need to reset input
-            wei_device_buf.SetZero();
-
-            std::string op_name = op_ptr->GetTypeString();
-
-            auto invoker_ptr = op_ptr->MakeInvokerPointer();
-
-            float avg_time =
-                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+            auto argument_ptr = op_ptr->MakeArgumentPointer(
+                static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
+                static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
+                static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+                input_lengths,
+                input_strides,
+                filter_lengths,
+                weights_strides,
+                output_lengths,
+                output_strides,
+                conv_filter_strides,
+                conv_filter_dilations,
+                input_left_pads,
+                input_right_pads,
+                in_element_op,
+                wei_element_op,
+                out_element_op,
+                split_k_list[split_k_id]);
+
+            const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+            DeviceMem workspace_dev(workspace_sz);
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
+
+            if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+            {
+                // using atomic add, so need to reset input
+                wei_device_buf.SetZero();

-            std::size_t flop      = conv_param.GetFlops();
-            std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
+                std::string op_name = op_ptr->GetTypeString();

-            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
-            float gb_per_sec = num_btype / 1.E6 / avg_time;
+                auto invoker_ptr = op_ptr->MakeInvokerPointer();

-            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
-                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+                float avg_time =
+                    invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

-            if(tflops > best_tflops)
-            {
-                best_op_name    = op_name;
-                best_tflops     = tflops;
-                best_avg_time   = avg_time;
-                best_gb_per_sec = gb_per_sec;
-            }
+                std::size_t flop      = conv_param.GetFlops();
+                std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();

-            if(do_verification)
-            {
-                wei_device_buf.FromDevice(weight_device_result.mData.data());
+                float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
+                float gb_per_sec = num_btype / 1.E6 / avg_time;

-                bool pass = ck::utils::check_err(weight_device_result, weight_host_result);
+                std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
+                          << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK "
+                          << split_k_list[split_k_id] << std::endl;

-                if(!pass)
+                if(tflops > best_tflops)
                {
-                    std::cout << "Fail info: " << op_ptr->GetTypeString() << std::endl;
+                    best_op_name    = op_name;
+                    best_tflops     = tflops;
+                    best_avg_time   = avg_time;
+                    best_gb_per_sec = gb_per_sec;
+                    best_split_k    = split_k_list[split_k_id];
                }

-                all_pass &= pass;
-
-                if(do_log)
+                if(do_verification)
                {
-                    LogRangeAsType<float>(std::cout << "output : ", output.mData, ",") << std::endl;
-                    ;
-                    LogRangeAsType<float>(
-                        std::cout << "weight (device): ", weight_device_result.mData, ",")
-                        << std::endl;
-                    ;
-                    LogRangeAsType<float>(
-                        std::cout << "weight (host): ", weight_host_result.mData, ",")
-                        << std::endl;
-                    ;
-                    LogRangeAsType<float>(std::cout << "input: ", input.mData, ",") << std::endl;
-                    ;
+                    wei_device_buf.FromDevice(weight_device_result.mData.data());
+
+                    bool pass = ck::utils::check_err(weight_device_result, weight_host_result);
+
+                    if(!pass)
+                    {
+                        std::cout << "Fail info: " << op_ptr->GetTypeString() << std::endl;
+                    }
+
+                    all_pass &= pass;
+
+                    if(do_log)
+                    {
+                        LogRangeAsType<float>(std::cout << "output : ", output.mData, ",")
+                            << std::endl;
+                        LogRangeAsType<float>(
+                            std::cout << "weight (device): ", weight_device_result.mData, ",")
+                            << std::endl;
+                        LogRangeAsType<float>(
+                            std::cout << "weight (host): ", weight_host_result.mData, ",")
+                            << std::endl;
+                        LogRangeAsType<float>(std::cout << "input: ", input.mData, ",")
+                            << std::endl;
+                    }
                }
            }
-        }
-        else
-        {
-            std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
+            else
+            {
+                std::cout << op_ptr->GetTypeString() << " does not support this problem"
+                          << std::endl;
+            }
        }
    }

    std::cout << "Best configuration parameters:"
              << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
-              << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
+              << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << ", SplitK "
+              << best_split_k << std::endl;

    return all_pass;
 }

--- a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
@@ -33,7 +33,8 @@ template <ck::index_t NDimSpatial,
          typename WeiDataType,
          typename OutDataType,
          typename AComputeType = InDataType,
-          typename BComputeType = AComputeType>
+          typename BComputeType = AComputeType,
+          typename IndexType    = ck::index_t>
 bool profile_grouped_conv_fwd_impl(int do_verification,
                                   int init_method,
                                   bool do_log,
@@ -57,16 +58,16 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
    const auto out_g_n_k_wos_desc =
        ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);

-    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
-    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
-    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
-    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
-    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
-    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
-    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
-    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
-    std::array<ck::index_t, NDimSpatial> input_left_pads{};
-    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+    std::array<IndexType, NDimSpatial + 3> a_g_n_c_wis_lengths{};
+    std::array<IndexType, NDimSpatial + 3> a_g_n_c_wis_strides{};
+    std::array<IndexType, NDimSpatial + 3> b_g_k_c_xs_lengths{};
+    std::array<IndexType, NDimSpatial + 3> b_g_k_c_xs_strides{};
+    std::array<IndexType, NDimSpatial + 3> e_g_n_k_wos_lengths{};
+    std::array<IndexType, NDimSpatial + 3> e_g_n_k_wos_strides{};
+    std::array<IndexType, NDimSpatial> conv_filter_strides{};
+    std::array<IndexType, NDimSpatial> conv_filter_dilations{};
+    std::array<IndexType, NDimSpatial> input_left_pads{};
+    std::array<IndexType, NDimSpatial> input_right_pads{};

    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };


--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -46,8 +46,10 @@ if(GPU_TARGETS MATCHES "gfx9")
    list(APPEND PROFILER_SOURCES profile_grouped_gemm_multiply_tile_loop.cpp)
  endif()
  list(APPEND PROFILER_SOURCES profile_gemm_multiply_add.cpp)
-  list(APPEND PROFILER_SOURCES profile_gemm_multiply_multiply.cpp)
-  list(APPEND PROFILER_SOURCES profile_gemm_ab_scale.cpp)
+  if(GPU_TARGETS MATCHES "gfx94")
+    list(APPEND PROFILER_SOURCES profile_gemm_multiply_multiply.cpp)
+    list(APPEND PROFILER_SOURCES profile_gemm_ab_scale.cpp)
+  endif()
  list(APPEND PROFILER_SOURCES profile_batched_gemm.cpp)
  list(APPEND PROFILER_SOURCES profile_batched_gemm_reduce.cpp)
  list(APPEND PROFILER_SOURCES profile_gemm_add_multiply.cpp)
@@ -82,6 +84,11 @@ set(PROFILER_EXECUTABLE ckProfiler)

 add_executable(${PROFILER_EXECUTABLE} ${PROFILER_SOURCES})
 target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors)
+# flags to compress the library (bare variable name keeps if() well-formed when hip_VERSION_FLAT is unset)
+if(NOT WIN32 AND hip_VERSION_FLAT GREATER 600241132)
+  message("Adding --offload-compress flag for ${PROFILER_EXECUTABLE}")
+  target_compile_options(${PROFILER_EXECUTABLE} PRIVATE --offload-compress)
+endif()

 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility getopt::getopt)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance)
@@ -123,8 +130,10 @@ if(GPU_TARGETS MATCHES "gfx9")
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance)
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance)
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance)
-  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_multiply_instance)
-  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_ab_scale_instance)
+  if(GPU_TARGETS MATCHES "gfx94")
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_multiply_instance)
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_ab_scale_instance)
+  endif()
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance)
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_reduce_instance)

--- a/profiler/src/profile_gemm_multiply_multiply.cpp
+++ b/profiler/src/profile_gemm_multiply_multiply.cpp
@@ -34,7 +34,7 @@ enum struct GemmDataType

 int profile_gemm_multiply_multiply(int argc, char* argv[])
 {
-    if(argc != 16 && argc != 19)
+    if(argc != 16 && argc != 20)
    {
        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: "
@@ -50,9 +50,10 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
        printf("arg7: time kernel (0=no, 1=yes)\n");
        printf("arg8 to 15: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n");
        printf("optional:\n");
-        printf("arg16: number of warm-up cycles (default 1)\n");
-        printf("arg17: number of iterations (default 10)\n");
-        printf("arg18: memory for rotating buffer (default 0, size in MB)\n");
+        printf("arg16: number of kbatch (default 1)\n");
+        printf("arg17: number of warm-up cycles (default 1)\n");
+        printf("arg18: number of iterations (default 10)\n");
+        printf("arg19: memory for rotating buffer (default 0, size in MB)\n");
        exit(1);
    }

@@ -76,11 +77,13 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
    int n_warmup      = 1;
    int n_iter        = 10;
    uint64_t rotating = 0;
-    if(argc == 18)
+    int KBatch        = 1;
+    if(argc == 20)
    {
-        n_warmup = std::stoi(argv[16]);
-        n_iter   = std::stoi(argv[17]);
-        rotating = std::stoull(argv[18]) * 1024 * 1024;
+        KBatch   = std::stoi(argv[16]);
+        n_warmup = std::stoi(argv[17]);
+        n_iter   = std::stoi(argv[18]);
+        rotating = std::stoull(argv[19]) * 1024 * 1024;
    }

    using F32  = float;
@@ -146,6 +149,7 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
            (StrideD0 < 0) ? DefaultStrideD0 : StrideD0,
            (StrideD1 < 0) ? DefaultStrideD1 : StrideD1,
            (StrideE < 0) ? DefaultStrideE : StrideE,
+            KBatch,
            n_warmup,
            n_iter,
            rotating);

--- a/profiler/src/profile_gemm_universal.cpp
+++ b/profiler/src/profile_gemm_universal.cpp
@@ -171,6 +171,10 @@ int profile_gemm_universal(int argc, char* argv[])
    {
        return profile(BF16{}, BF16{}, BF16{}, F32{}, BF16{}, Row{}, Col{}, Row{});
    }
+    else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Row{}, Row{});
+    }
    else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
    {
        return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Col{}, Row{});

--- a/profiler/src/profile_grouped_conv_bwd_weight.cpp
+++ b/profiler/src/profile_grouped_conv_bwd_weight.cpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #include <cstdlib>
 #include <initializer_list>
@@ -81,7 +81,6 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
    const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv);

    ck::index_t split_k = std::stoi(argv[8 + 1 + 4 + 6 * num_dim_spatial]);
-    split_k             = std::max(1, split_k);

    using F32  = float;
    using F16  = ck::half_t;

--- a/profiler/src/profile_grouped_conv_fwd.cpp
+++ b/profiler/src/profile_grouped_conv_fwd.cpp
@@ -29,6 +29,12 @@ enum struct ConvDataType
    BF8_F8_F8,      // 7
 };

+enum struct IndexType
+{
+    INDEX_T,      // 0
+    LONG_INDEX_T, // 1
+};
+
 #define OP_NAME "grouped_conv_fwd"
 #define OP_DESC "Grouped Convolution Forward"

@@ -45,12 +51,13 @@ static void print_helper_msg()
        << "                 5: Input bf8, Weight bf8, Output fp8\n"
        << "                 6: Input fp8, Weight bf8, Output fp8\n"
        << "                 7: Input bf8, Weight fp8, Output fp8)\n"
-        << "arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]\n"
+        << "arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]\n"
         << "                     1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K])\n"
+        << "arg4: indexing data type (0: 32-bit, 1: 64-bit)\n"
-        << "arg4: verification (0: no, 1: yes)\n"
-        << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
-        << "arg6: print tensor value (0: no; 1: yes)\n"
-        << "arg7: time kernel (0: no, 1: yes)\n"
+        << "arg5: verification (0: no, 1: yes)\n"
+        << "arg6: initialization (0: no init, 1: integer value, 2: decimal value)\n"
+        << "arg7: print tensor value (0: no; 1: yes)\n"
+        << "arg8: time kernel (0: no, 1: yes)\n"
        << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
    // clang-format on
 }
@@ -60,7 +67,7 @@ static void print_helper_msg()
 int profile_grouped_conv_fwd(int argc, char* argv[])
 {
-    // 8 for control, 1 for num_dim_spatial
+    // 9 for control, 1 for num_dim_spatial
-    if(argc < 9)
+    if(argc < 10)
    {
        print_helper_msg();
        return 1;
@@ -68,20 +75,21 @@ int profile_grouped_conv_fwd(int argc, char* argv[])

    const auto data_type       = static_cast<ConvDataType>(std::stoi(argv[2]));
    const auto layout          = static_cast<ConvLayout>(std::stoi(argv[3]));
-    const bool do_verification = std::stoi(argv[4]);
-    const int init_method      = std::stoi(argv[5]);
-    const bool do_log          = std::stoi(argv[6]);
-    const bool time_kernel     = std::stoi(argv[7]);
-    const int num_dim_spatial  = std::stoi(argv[8]);
+    const auto index_type      = static_cast<IndexType>(std::stoi(argv[4]));
+    const bool do_verification = std::stoi(argv[5]);
+    const int init_method      = std::stoi(argv[6]);
+    const bool do_log          = std::stoi(argv[7]);
+    const bool time_kernel     = std::stoi(argv[8]);
+    const int num_dim_spatial  = std::stoi(argv[9]);

-    // 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
-    if(argc != 8 + 1 + 4 + 6 * num_dim_spatial)
+    // 9 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
+    if(argc != 9 + 1 + 4 + 6 * num_dim_spatial)
    {
        print_helper_msg();
        return 1;
    }

-    const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv);
+    const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 10, argv);

    using F32  = float;
    using F16  = ck::half_t;
@@ -138,18 +146,43 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
        using AComputeType = decltype(a_compute_type);
        using BComputeType = decltype(b_compute_type);

-        bool pass = ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
-                                                                InLayout,
-                                                                WeiLayout,
-                                                                OutLayout,
-                                                                InDataType,
-                                                                WeiDataType,
-                                                                OutDataType,
-                                                                AComputeType,
-                                                                BComputeType>(
-            do_verification, init_method, do_log, time_kernel, params);
+        if(index_type == IndexType::INDEX_T)
+        {
+            bool pass = ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
+                                                                    InLayout,
+                                                                    WeiLayout,
+                                                                    OutLayout,
+                                                                    InDataType,
+                                                                    WeiDataType,
+                                                                    OutDataType,
+                                                                    AComputeType,
+                                                                    BComputeType,
+                                                                    ck::index_t>(
+                do_verification, init_method, do_log, time_kernel, params);
+
+            return pass ? 0 : 1;
+        }
+        else if(index_type == IndexType::LONG_INDEX_T)
+        {
+            bool pass = ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
+                                                                    InLayout,
+                                                                    WeiLayout,
+                                                                    OutLayout,
+                                                                    InDataType,
+                                                                    WeiDataType,
+                                                                    OutDataType,
+                                                                    AComputeType,
+                                                                    BComputeType,
+                                                                    ck::long_index_t>(
+                do_verification, init_method, do_log, time_kernel, params);

-        return pass ? 0 : 1;
+            return pass ? 0 : 1;
+        }
+        else
+        {
+            std::cout << "this indexing data type is not implemented" << std::endl;
+            return 1;
+        }
    };

    // GNHWC_GKYXC_GNHWK

--- a/profiler/src/profile_grouped_gemm_fixed_nk.cpp
+++ b/profiler/src/profile_grouped_gemm_fixed_nk.cpp
@@ -85,9 +85,11 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
    const auto StrideCs = argToIntArray(argv[13]);
    const int kbatch    = argc == 15 ? std::stoi(argv[14]) : 1;

-    using F32  = float;
-    using F16  = ck::half_t;
-    using F8   = ck::f8_t;
+    using F32 = float;
+    using F16 = ck::half_t;
+#if defined(CK_ENABLE_FP8)
+    using F8 = ck::f8_t;
+#endif
    using BF16 = ck::bhalf_t;
    using I8   = int8_t;


--- a/python/ck4inductor/universal_gemm/gen_instances.py
+++ b/python/ck4inductor/universal_gemm/gen_instances.py
@@ -62,17 +62,13 @@ def parse_instances(str_instances: List[str]) -> List[CKGemmOperation]:
                    i_current = i_next + 1
            if i_next == -1:
                break
-        # pad with `None`s for the fields which are not defined in the instance
+
+        template_args.insert(2, tuple())  # ds layout
+        template_args.insert(6, tuple())  # ds dtype
+
        new_instance = CKGemmOperation(
            *template_args,  # type: ignore[arg-type]
-            *((None,) * (len(fields(CKGemmOperation)) - len(template_args))),
        )
-        # the last 2 template parameters are optional
-        # if they are absent, substitute them with default values from Universal Gemm C++ template declaration
-        if new_instance.a_compute_dtype is None:
-            new_instance.a_compute_dtype = new_instance.c_element_dtype
-        if new_instance.b_compute_dtype is None:
-            new_instance.b_compute_dtype = new_instance.c_element_dtype

        op_instances.append(new_instance)
    return op_instances
@@ -208,6 +204,8 @@ def gen_ops_preselected() -> List[CKGemmOperation]:
        a_layout="Row",
        b_layout="Col",
        c_layout="Row",
+        ds_element_dtypes=tuple(),
+        ds_layouts=tuple(),
        a_element_dtype="F16",
        b_element_dtype="F16",
        c_element_dtype="F16",

--- a/python/ck4inductor/universal_gemm/op.py
+++ b/python/ck4inductor/universal_gemm/op.py
@@ -10,10 +10,12 @@ class CKGemmOperation:

    a_layout: str
    b_layout: str
+    ds_layouts: Tuple[str]  # addmm specific
    c_layout: str

    a_element_dtype: str
    b_element_dtype: str
+    ds_element_dtypes: Tuple[str]  # addmm specific
    c_element_dtype: str

    acc_dtype: str
@@ -64,16 +66,15 @@ class CKGemmOperation:
        Tuple[int, int, int, int]
    )
    c_shuffle_block_transfer_scalar_per_vector_n_per_block: int
-
    block_gemm_pipeline_scheduler: str
-    block_gemm_pipeline_version: Optional[str]
+    block_gemm_pipeline_version: str

-    a_compute_dtype: Optional[str]
-    b_compute_dtype: Optional[str]
+    a_compute_dtype: Optional[str] = None
+    b_compute_dtype: Optional[str] = None

    def name(self):
        # cpp alias for template instance
-        return f"ck_devicegemm_xdl_shuffle_v3_{self.key_name()}"
+        return f"ck_devicegemm_multid_xdl_shuffle_v3_{self.key_name()}"

    def key_name(self):
        # TBD; must be unique per instance. Intended to use as dict key

--- a/script/convert_miopen_driver_to_profiler.py
+++ b/script/convert_miopen_driver_to_profiler.py
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+# Convert miopen driver command to ck Profiler
+# Example: python3 ../script/convert_miopen_driver_to_profiler.py
+# /opt/rocm/bin/MIOpenDriver conv -n 32 -c 64 -H 28 -W 28 -k 64 -y 3 -x 3
+# -p 1 -q 1 -u 2 -v 2 -l 1 -j 1 -m conv -g 32 -F 1 -t 1
+
+import argparse
+import subprocess
+
+
+def init_const_args(args):
+    """Attach the fixed ckProfiler settings to ``args``.
+
+    Sets ``ck_profiler_cmd``, ``init_method`` and ``log_value`` as
+    attributes on ``args``; nothing is returned.
+    """
+    fixed_settings = {
+        'ck_profiler_cmd': '../build/bin/ckProfiler',
+        # use decimal values
+        'init_method': 2,
+        # don't print tensor values
+        'log_value': 0,
+    }
+    for attr_name, attr_value in fixed_settings.items():
+        setattr(args, attr_name, attr_value)
+
+
+def run_ck_profiler_cmd(cmd):
+    """Echo the ckProfiler command and execute it.
+
+    ``cmd`` is printed and then handed unchanged to ``subprocess.run``;
+    the result of the run is not inspected.
+    """
+    for shown in ("ckProfiler command:", cmd):
+        print(shown)
+    subprocess.run(cmd)
+
+
+def parse_data_type(args):
+    """Translate the MIOpen data type name into the ckProfiler numeric id.
+
+    Reads ``args.ck_profier_op`` and the data type name and overwrites
+    ``args.data_type`` with the integer passed on the ckProfiler command
+    line for that operation. Prints a message and exits with status 1 for
+    int8 with grouped_conv_bwd_data.
+
+    ``args.data_type`` is replaced by a number here, so the original name
+    is remembered in ``args.data_type_name`` on the first call. Later calls
+    for another operation (several operations run in turn for
+    ``-F 0/3/5/6``) therefore still resolve the id from the name instead of
+    silently reusing the previous operation's number.
+    """
+    if isinstance(args.data_type, str):
+        args.data_type_name = args.data_type
+    type_name = getattr(args, 'data_type_name', None)
+    op = args.ck_profier_op
+
+    conv_ops = ("grouped_conv_bwd_weight",
+                "grouped_conv_bwd_data",
+                "grouped_conv_fwd")
+    # ids that are the same for all three grouped convolution operations
+    shared_ids = {"fp32": 0, "fp16": 1, "bfp16": 2}
+    # int8 has a different id per operation and none for bwd_data
+    int8_ids = {"grouped_conv_bwd_weight": 4, "grouped_conv_fwd": 3}
+
+    if type_name in shared_ids:
+        if op in conv_ops:
+            args.data_type = shared_ids[type_name]
+    elif type_name == "int8":
+        if op == "grouped_conv_bwd_data":
+            print('Not supported data type for grouped_conv_bwd_data')
+            exit(1)
+        if op in int8_ids:
+            args.data_type = int8_ids[op]
+
+
+def add_conv_params_to_cmd(args, cmd):
+    """Append the convolution geometry of ``args`` to the list ``cmd``.
+
+    For ``args.spatial_dim`` 1, 2 or 3 the filter sizes, input sizes,
+    strides, dilations and pads of the used axes (w / h,w / d,h,w) are
+    appended in place as strings. Any other spatial dimension prints a
+    message and exits with status 1.
+    """
+    axes_by_dim = {1: 'w', 2: 'hw', 3: 'dhw'}
+    if args.spatial_dim not in axes_by_dim:
+        print('Not supported spatial dim (supported: 1, 2, 3)')
+        exit(1)
+
+    axes = axes_by_dim[args.spatial_dim]
+    # The pad group is emitted twice with the same values (presumably the
+    # left and right pads expected by ckProfiler - confirm against its
+    # usage text).
+    groups = ('fil_', 'in_', 'conv_stride_', 'dilation_', 'pad_', 'pad_')
+    for prefix in groups:
+        cmd.extend([str(getattr(args, prefix + axis)) for axis in axes])
+
+
+def run_ck_grouped_conv_fwd(args):
+    """Build the ckProfiler grouped_conv_fwd command from ``args``, run it.
+
+    Sets ``ck_profier_op``, ``layout`` and ``index_type`` on ``args`` and
+    converts ``args.data_type`` through ``parse_data_type`` first.
+    """
+    args.ck_profier_op = "grouped_conv_fwd"
+    parse_data_type(args)
+    # default for MIOpen NHWGC
+    args.layout = 1
+    # use int32 by default
+    args.index_type = 0
+
+    # attributes emitted, in order, before the convolution geometry
+    leading_fields = (
+        'ck_profiler_cmd', 'ck_profier_op',
+        'data_type', 'layout', 'index_type',
+        'verify', 'init_method',
+        'log_value', 'time',
+        'spatial_dim', 'group_count',
+        'batchsize', 'out_channels',
+        'in_channels',
+    )
+    cmd = [str(getattr(args, field)) for field in leading_fields]
+    add_conv_params_to_cmd(args, cmd)
+
+    run_ck_profiler_cmd(cmd)
+
+
+def run_ck_grouped_conv_bwd_data(args):
+    """Build the ckProfiler grouped_conv_bwd_data command and run it.
+
+    Sets ``ck_profier_op`` and ``layout`` on ``args`` and converts
+    ``args.data_type`` through ``parse_data_type`` first.
+    """
+    args.ck_profier_op = "grouped_conv_bwd_data"
+    parse_data_type(args)
+    # default for MIOpen NHWGC
+    args.layout = 1
+
+    # attributes emitted, in order, before the convolution geometry
+    leading_fields = (
+        'ck_profiler_cmd', 'ck_profier_op',
+        'data_type', 'layout',
+        'verify', 'init_method',
+        'log_value', 'time',
+        'spatial_dim', 'group_count',
+        'batchsize', 'out_channels',
+        'in_channels',
+    )
+    cmd = [str(getattr(args, field)) for field in leading_fields]
+    add_conv_params_to_cmd(args, cmd)
+
+    run_ck_profiler_cmd(cmd)
+
+
+def run_ck_grouped_conv_bwd_weight(args):
+    """Build the ckProfiler grouped_conv_bwd_weight command and run it.
+
+    Sets ``ck_profier_op``, ``layout`` and ``split_k_value`` on ``args``
+    and converts ``args.data_type`` through ``parse_data_type`` first. The
+    split K value is the last command argument, after the geometry.
+    """
+    args.ck_profier_op = "grouped_conv_bwd_weight"
+    parse_data_type(args)
+    # default for MIOpen NHWGC
+    args.layout = 2
+    # Test all split K value from the list {1, 2, 4, 8, 32, 64, 128}
+    args.split_k_value = -1
+
+    # attributes emitted, in order, before the convolution geometry
+    leading_fields = (
+        'ck_profiler_cmd', 'ck_profier_op',
+        'data_type', 'layout',
+        'verify', 'init_method',
+        'log_value', 'time',
+        'spatial_dim', 'group_count',
+        'batchsize', 'out_channels',
+        'in_channels',
+    )
+    cmd = [str(getattr(args, field)) for field in leading_fields]
+    add_conv_params_to_cmd(args, cmd)
+    cmd.append(str(args.split_k_value))
+
+    run_ck_profiler_cmd(cmd)
+
+# Get name of miopen driver, remove it from unknown
+def process_miopen_driver_name(args, unknown):
+    """Derive ``args.data_type`` from the MIOpen driver name in ``unknown``.
+
+    The first occurrence of the matched driver name is removed from the
+    list ``unknown``. If no supported driver name is present, a message is
+    printed and the script exits with status 1.
+    """
+    # Checked in this order; the first driver name found wins.
+    driver_to_type = (
+        ("convint8", 'int8'),
+        ("convbfp16", 'bfp16'),
+        ("convfp16", 'fp16'),
+        ("conv", 'fp32'),
+    )
+    for driver_name, type_name in driver_to_type:
+        if driver_name in unknown:
+            args.data_type = type_name
+            unknown.remove(driver_name)
+            return
+
+    print('Not supported driver (supported: conv, convfp16, convint8,'
+          ' convbfp16).')
+    exit(1)
+
+
+def run_ck_profiler(args):
+    """Run the ckProfiler operations selected by ``args.forw``.
+
+    ``args.in_channels`` and ``args.out_channels`` are converted in place
+    from totals over all groups to per-group counts before any operation
+    is run. Prints a message and exits with status 1 when the group count
+    is not positive or does not divide both channel counts, instead of
+    silently truncating the per-group value.
+    """
+    groups = args.group_count
+    if groups <= 0 or \
+       args.in_channels % groups != 0 or \
+       args.out_channels % groups != 0:
+        print('Not supported group count (must be positive and divide '
+              'the number of input and output channels)')
+        exit(1)
+
+    # MIOpen get number of channel per all groups, CK profiler get number of
+    # channel per group
+    # Floor division keeps the counts exact integers (no float round trip).
+    args.in_channels = args.in_channels // groups
+    args.out_channels = args.out_channels // groups
+
+    # forw: 0 runs all three operations, 1/2/4 a single one, 3/5/6 a pair
+    if args.forw in (0, 1, 3, 5):
+        run_ck_grouped_conv_fwd(args)
+    if args.forw in (0, 2, 3, 6):
+        run_ck_grouped_conv_bwd_data(args)
+    if args.forw in (0, 4, 5, 6):
+        run_ck_grouped_conv_bwd_weight(args)
+
+
+if __name__ == "__main__":
+    # Entry point: parse the MIOpenDriver-style flags, work out the data
+    # type from the driver name and run the matching ckProfiler commands.
+    parser = argparse.ArgumentParser(
+        prog="converter",
+        description="Convert miopen driver command to ck Profiler"
+                    "\nExample: python3 "
+                    "../script/convert_miopen_driver_to_profiler.py "
+                    "/opt/rocm/bin/MIOpenDriver conv -n 32 -c 64 -H 28 -W 28 "
+                    "-k 64 -y 3 -x 3 -p 1 -q 1 -u 1 -v 1 -l 1 -j 1 -m conv -g "
+                    "32 -F 1 -t 1",
+    )
+
+    # (long flag, short flag, default, help); every option is an optional
+    # integer. The pad defaults are 0, as stated in their help text.
+    int_options = (
+        ("-in_layout", "-I", -1,
+         "Input Layout (Default=NCHW for 2d conv, NCDHW for 3d conv)"),
+        ("-forw", "-F", 0,
+         ("Flag enables fwd, bwd, wrw convolutions"
+          "\n0 fwd+bwd+wrw (default)"
+          "\n1 fwd only"
+          "\n2 bwd only"
+          "\n4 wrw only"
+          "\n3 fwd+bwd"
+          "\n5 fwd+wrw"
+          "\n6 bwd+wrw")),
+        ("-spatial_dim", "-_", 2,
+         "convolution spatial dimension (Default-2)"),
+        ("-batchsize", "-n", 100, "Mini-batch size (Default=100)"),
+        ("-in_channels", "-c", 3, "Number of Input Channels (Default=3)"),
+        ("-in_d", "-!", 32, "Input Depth (Default=32)"),
+        ("-in_h", "-H", 32, "Input Height (Default=32)"),
+        ("-in_w", "-W", 32, "Input Width (Default=32)"),
+        ("-out_channels", "-k", 32,
+         "Number of Output Channels (Default=32)"),
+        ("-fil_d", "-@", 3, "Filter Depth (Default=3)"),
+        ("-fil_h", "-y", 3, "Filter Height (Default=3)"),
+        ("-fil_w", "-x", 3, "Filter Width (Default=3)"),
+        ("-conv_stride_d", "-#", 1,
+         "Convolution Stride for Depth (Default=1)"),
+        ("-conv_stride_h", "-u", 1,
+         "Convolution Stride for Height (Default=1)"),
+        ("-conv_stride_w", "-v", 1,
+         "Convolution Stride for Width (Default=1)"),
+        ("-pad_d", "-$", 0, "Zero Padding for Depth (Default=0)"),
+        ("-pad_h", "-p", 0, "Zero Padding for Height (Default=0)"),
+        ("-pad_w", "-q", 0, "Zero Padding for Width (Default=0)"),
+        ("-verify", "-V", 1, "Verify Each Layer (Default=1)"),
+        ("-time", "-t", 0, "Time Each Layer (Default=0)"),
+        ("-dilation_d", "-^", 1, "Dilation of Filter Depth (Default=1)"),
+        ("-dilation_h", "-l", 1, "Dilation of Filter Height (Default=1)"),
+        ("-dilation_w", "-j", 1, "Dilation of Filter Width (Default=1)"),
+        ("-group_count", "-g", 1, "Number of Groups (Default=1)"),
+    )
+    for long_flag, short_flag, default_value, help_text in int_options:
+        parser.add_argument(
+            long_flag,
+            short_flag,
+            default=default_value,
+            type=int,
+            required=False,
+            help=help_text
+        )
+
+    # Arguments the parser does not know (driver path, driver name and any
+    # other flags) end up in `unknown`.
+    args, unknown = parser.parse_known_args()
+    init_const_args(args)
+    process_miopen_driver_name(args, unknown)
+    print("Ignored args:")
+    print(unknown)
+    run_ck_profiler(args)
--- a/script/process_perf_data.py
+++ b/script/process_perf_data.py
@@ -122,7 +122,7 @@ def parse_logfile(logfile):
        #sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
        test_list=list(range(1,len(tests)+1))
    #parse conv_fwd and conv_bwd performance tests:
-    elif 'conv_fwd' in logfile or 'conv_bwd_data' in logfile:
+    elif 'conv_fwd' in logfile or 'conv_bwd' in logfile:
        for line in open(logfile):
            if 'tflops:' in line:
                lst=line.split()
@@ -143,6 +143,12 @@ def parse_logfile(logfile):
            if 'Best Perf' in line:
                lst=line.split()
                res.append(lst[36])
+    elif 'perf_fmha' in logfile:
+        for line in open(logfile):
+            if 'TFlops' in line:
+                lst=line.split()
+                line_dict=dict(zip(lst[1:],lst))
+                res.append(line_dict['TFlops,'])
    return res


@@ -268,14 +274,26 @@ def main():
            for i in range(1,len(results)+1):
                testlist.append("Test%i"%i)
            table_name="ck_grouped_gemm_tflops"
-        if 'conv_fwd' in filename:
+        if 'perf_conv_fwd' in filename:
            for i in range(1,len(results)+1):
                testlist.append("Test%i"%i)
            table_name="ck_conv_fwd_tflops"
-        if 'conv_bwd_data' in filename:
+        if 'perf_conv_bwd_data' in filename:
            for i in range(1,len(results)+1):
                testlist.append("Test%i"%i)
            table_name="ck_conv_bwd_data_tflops"
+        if 'grouped_conv_fwd' in filename:
+            for i in range(1,len(results)+1):
+                testlist.append("Test%i"%i)
+            table_name="ck_grouped_conv_fwd_tflops"
+        if 'grouped_conv_bwd_data' in filename:
+            for i in range(1,len(results)+1):
+                testlist.append("Test%i"%i)
+            table_name="ck_grouped_conv_bwd_data_tflops"
+        if 'grouped_conv_bwd_weight' in filename:
+            for i in range(1,len(results)+1):
+                testlist.append("Test%i"%i)
+            table_name="ck_grouped_conv_bwd_weight_tflops"
        if 'gemm_bilinear' in filename:
            for i in range(1,len(results)+1):
                testlist.append("Test%i"%i)
@@ -304,6 +322,14 @@ def main():
            for i in range(1,len(results)+1):
                testlist.append("Test%i"%i)
            table_name="ck_mixed_gemm_tflops"
+        if 'fmha_fwd' in filename:
+            for i in range(1,len(results)+1):
+                testlist.append("Test%i"%i)
+            table_name="ck_fmha_fwd_tflops"
+        if 'fmha_bwd' in filename:
+            for i in range(1,len(results)+1):
+                testlist.append("Test%i"%i)
+            table_name="ck_fmha_bwd_tflops"

        tflops_base = get_baseline(table_name,conn)
        store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, conn)

--- a/script/process_perf_data.sh
+++ b/script/process_perf_data.sh
@@ -13,3 +13,20 @@
 python3 process_perf_data.py perf_gemm.log
 python3 process_perf_data.py perf_resnet50_N256.log
 python3 process_perf_data.py perf_resnet50_N4.log
+
+file=./perf_fmha_fwd_gfx942.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_fwd_gfx942.log
+fi
+file=./perf_fmha_bwd_gfx942.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_bwd_gfx942.log
+fi
+file=./perf_fmha_fwd_gfx90a.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_fwd_gfx90a.log
+fi
+file=./perf_fmha_bwd_gfx90a.log
+if [ -e "$file" ]; then
+    python3 process_perf_data.py perf_fmha_bwd_gfx90a.log
+fi