Commit 3e4d0ff3 authored by Jakub Piasecki

Merge remote-tracking branch 'origin/develop' into ggemm_multid_two_stage

parents 1ad29336 9e011bcd
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_permute_scale_4d_f16_instances(
std::vector<std::unique_ptr<
DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 4>>>& instances)
{
add_device_operation_instances(instances, device_permute_scale_f16_instances<4>{});
}
void add_device_permute_scale_4d_f32_instances(
std::vector<std::unique_ptr<
DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 4>>>& instances)
{
add_device_operation_instances(instances, device_permute_scale_f32_instances<4>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_permute_scale_5d_f16_instances(
std::vector<std::unique_ptr<
DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 5>>>& instances)
{
add_device_operation_instances(instances, device_permute_scale_f16_instances<5>{});
}
void add_device_permute_scale_5d_f32_instances(
std::vector<std::unique_ptr<
DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 5>>>& instances)
{
add_device_operation_instances(instances, device_permute_scale_f32_instances<5>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_permute_scale_6d_f16_instances(
std::vector<std::unique_ptr<
DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 6>>>& instances)
{
add_device_operation_instances(instances, device_permute_scale_f16_instances<6>{});
}
void add_device_permute_scale_6d_f32_instances(
std::vector<std::unique_ptr<
DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 6>>>& instances)
{
add_device_operation_instances(instances, device_permute_scale_f32_instances<6>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
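These three translation units populate the instance lists that the operation-instance factory hands back to clients. As a rough sketch (assuming, as in the profiler implementation further down in this commit, that the Pass/UnaryOp/Scale aliases resolve to the PassThrough/UnarySquare/Scale elementwise ops), enumerating the 4-D FP16 instances would look like this:

```cpp
// Sketch only: list the type strings of all registered 4-D FP16
// permute+scale instances. Assumes the permute_scale instance header
// provides the DeviceOperationInstanceFactory specialization used below.
#include <iostream>
#include "ck/library/tensor_operation_instance/gpu/permute_scale.hpp"

int main()
{
    using Pass    = ck::tensor_operation::element_wise::PassThrough;
    using UnaryOp = ck::tensor_operation::element_wise::UnarySquare;
    using Scale   = ck::tensor_operation::element_wise::Scale;

    using DeviceOp = ck::tensor_operation::device::DeviceElementwise<
        ck::Tuple<ck::half_t>, ck::Tuple<ck::half_t>, Pass, UnaryOp, Scale, 4>;

    // The factory returns every instance registered by
    // add_device_permute_scale_4d_f16_instances above.
    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    for(const auto& op : op_ptrs)
        std::cout << op->GetTypeString() << std::endl;
}
```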
@@ -37,9 +37,9 @@ Best Perf: 1.1933 ms, 107.977 TFlops, 79.0848 GB/s
################ op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
./bin/ckProfiler conv2d_fwd 1 1 1 1 1 1 0 5 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1
```
Result (MI100 @ 1087MHz, 133.5 TFlops peak FP16)
```bash
in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
@@ -104,6 +104,7 @@ arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2}
arg.e_grid_desc_m_n_{ 4096, 4096}
....
Best Perf: 58.0306 ms, 37.8942 TFlops, 27.7545 GB/s
```
## Profile grouped convolution backward data kernels
```bash
# arg1: tensor operation (grouped_conv_bwd_data: Grouped Convolution Backward Data)
@@ -129,10 +130,11 @@ Best Perf: 58.0306 ms, 37.8942 TFlops, 27.7545 GB/s
################ op datatype layout verify init log time Ndims G N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx
./bin/ckProfiler grouped_conv_bwd_data 1 0 1 1 0 1 2 32 4 192 192 3 3 28 28 1 1 1 1 1 1 1 1
```
Result (MI100, FP16, GNHWC_GKYXC_GNHWK)
```bash
out: dim 5, lengths {32, 4, 192, 28, 28}, strides {602112, 150528, 1, 5376, 192}
wei: dim 5, lengths {32, 192, 192, 3, 3}, strides {331776, 1728, 1, 576, 192}
in: dim 5, lengths {32, 4, 192, 28, 28}, strides {602112, 150528, 1, 5376, 192}
@@ -173,10 +175,11 @@ GB/s: 127.947
################ op datatype layout verify init log time Ndims G N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx SplitK
./bin/ckProfiler grouped_conv_bwd_weight 1 1 0 1 0 1 2 32 256 256 512 3 3 28 28 1 1 1 1 1 0 0 0 1
```
Result (MI100, FP16, GNHWC_GKYXC_GNHWK)
```bash
input: dim 5, lengths {32, 512, 1024, 28, 28}, strides {411041792, 802816, 1, 28672, 1024}
weight: dim 5, lengths {32, 512, 1024, 3, 3}, strides {4718592, 9216, 1, 3072, 1024}
output: dim 5, lengths {32, 512, 512, 26, 26}, strides {177209344, 346112, 1, 13312, 512}
@@ -190,8 +193,9 @@ GB/s: 69.2301
Note: This kernel uses atomic add, so the output buffer is accumulated multiple times, which breaks verification. To work around this, do not use CK's own timer and run verification at the same time.
## Profile image to column/column to image kernels
```bash
# arg1: tensor operation (conv_tensor_rearrange: Conv Tensor Rearrange)
# arg2: data type (0: Input fp32, Weight fp32, Output fp32
#                  1: Input fp16, Weight fp16, Output fp16
#                  2: Input bf16, Weight bf16, Output bf16
@@ -216,10 +220,11 @@ Note: This kernel uses atomic add, so the output buffer is accumulated multiple
################ op datatype layout verify init log time opType Ndims G N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx
./bin/ckProfiler conv_tensor_rearrange 0 0 0 1 0 1 0 2 1 256 1 512 3 3 28 28 1 1 1 1 0 0 0 0
```
Result (MI210, FP32, NHWC)
```bash
input: dim 5, lengths {1, 256, 512, 28, 28}, strides {102760448, 401408, 1, 14336, 512}
output: dim 2, lengths {173056, 4608}, strides {4608, 1}
....
@@ -229,3 +234,30 @@ avg_time: 3.12326
GB/s: 2042.59
```
Note: The column to image kernel adds to the output memory, so the output buffer is accumulated multiple times, which breaks verification. To work around this, do not use CK's own timer and run verification at the same time.
## Profile permute scale kernels
```bash
# arg1: tensor operation (permute_scale: Permute Scale)
# arg2: data type (0: Input fp32, Output fp32
#                  1: Input fp16, Output fp16)
# arg3: verification (0: no, 1: yes)
# arg4: initialization (0: no init, 1: integer value, 2: decimal value)
# arg5: print tensor value (0: no; 1: yes)
# arg6: time kernel (0: no, 1: yes)
# from arg7: tensor lengths
# input strides
# output strides
################ op datatype verify init log time dim0 dim1 dim2 in_stride0 in_stride1 in_stride2 out_stride0 out_stride1 out_stride2
./bin/ckProfiler permute_scale 0 1 1 0 1 64 64 64 4096 64 1 1 64 4096
```
Result (MI100, FP32)
```bash
A: dim 3, lengths {64, 64, 64}, strides {4096, 64, 1}
B: dim 3, lengths {64, 64, 64}, strides {1, 64, 4096}
....
Best perf = 0.0146878 ms, 142.782 GB/s, DeviceElementwiseNormalizationImpl<3, 2>
```
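The permutation lives entirely in the strides: A and B share the logical lengths {64, 64, 64}, but B's strides {1, 64, 4096} traverse memory with dim0 and dim2 swapped. A minimal sketch of the offset arithmetic (a hypothetical helper, not CK code):

```cpp
// Element (i, j, k) of a tensor with strides {s0, s1, s2} sits at linear
// offset i*s0 + j*s1 + k*s2. With the strides from the example above, the
// same logical index maps to transposed memory locations in A and B.
#include <array>
#include <cstddef>
#include <iostream>

std::size_t offset(const std::array<std::size_t, 3>& idx,
                   const std::array<std::size_t, 3>& strides)
{
    return idx[0] * strides[0] + idx[1] * strides[1] + idx[2] * strides[2];
}

int main()
{
    const std::array<std::size_t, 3> in_strides{4096, 64, 1};  // A: packed row-major
    const std::array<std::size_t, 3> out_strides{1, 64, 4096}; // B: dim0/dim2 swapped
    // For index (2, 3, 5): 2*4096 + 3*64 + 5 = 8389 in A,
    //                      2 + 3*64 + 5*4096 = 20674 in B.
    std::cout << offset({2, 3, 5}, in_strides) << " "
              << offset({2, 3, 5}, out_strides) << std::endl;
}
```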
@@ -233,7 +233,7 @@ bool profile_elementwise_layernorm_impl(int do_verification,
    y_dev.FromDevice(y.mData.data());
    bool pass =
        ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 5e-3, 5e-3);
    if(do_log)
    {
...
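(A hedged guess, not stated in the commit: the fp16 elementwise + layernorm path accumulates enough rounding error that the previous 1e-3 tolerance produced false failures, hence the relaxation to 5e-3.)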
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_gemm_fixed_nk.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck {
namespace profiler {
template <typename ADataType,
typename BDataType,
typename CDataType,
typename AccDataType,
typename ALayout,
typename BLayout,
typename CLayout>
bool profile_grouped_gemm_fixed_nk_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
const std::vector<int>& Ms,
const std::vector<int>& Ns,
const std::vector<int>& Ks,
const std::vector<int>& StrideAs,
const std::vector<int>& StrideBs,
const std::vector<int>& StrideCs,
int kbatch = 1,
int n_warmup = 1,
int n_iter = 10)
{
bool pass = true;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
std::size_t group_count = Ms.size();
if(!(group_count == Ns.size() && group_count == Ks.size() && group_count == StrideAs.size() &&
group_count == StrideBs.size() && group_count == StrideCs.size()))
{
throw std::runtime_error("wrong! inconsistent M/N/Ks, StrideA/B/Cs size\n");
}
std::vector<Tensor<ADataType>> a_m_k;
std::vector<Tensor<BDataType>> b_k_n;
std::vector<Tensor<CDataType>> c_m_n_host_results;
std::vector<Tensor<CDataType>> c_m_n_device_results;
for(std::size_t i = 0; i < group_count; i++)
{
a_m_k.push_back(
Tensor<ADataType>(f_host_tensor_descriptor(Ms[i], Ks[i], StrideAs[i], ALayout{})));
b_k_n.push_back(
Tensor<BDataType>(f_host_tensor_descriptor(Ks[i], Ns[i], StrideBs[i], BLayout{})));
c_m_n_device_results.push_back(
Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));
c_m_n_host_results.push_back(
Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));
#if DEBUG_LOG
std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n[" << i
<< "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
<< "]:" << c_m_n_device_results[i].mDesc << std::endl;
#endif // DEBUG_LOG
std::size_t num_thread = 1;
switch(init_method)
{
case 0: break;
case 1:
a_m_k[i].GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5}, num_thread);
b_k_n[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}, num_thread);
break;
default:
a_m_k[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}, num_thread);
b_k_n[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread);
}
}
using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CElementOp = ck::tensor_operation::element_wise::PassThrough;
const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{};
const auto c_element_op = CElementOp{};
using DeviceMemPtr = std::unique_ptr<DeviceMem>;
std::vector<DeviceMemPtr> a_device_buf, b_device_buf, c_device_buf;
a_device_buf.reserve(group_count);
b_device_buf.reserve(group_count);
c_device_buf.reserve(group_count);
std::vector<const void*> p_a, p_b;
std::vector<void*> p_c;
p_a.reserve(group_count);
p_b.reserve(group_count);
p_c.reserve(group_count);
std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
gemm_descs.reserve(group_count);
std::vector<ck::tensor_operation::device::GroupedGemmKernelArgument<1>>
grouped_gemm_kernel_args_;
grouped_gemm_kernel_args_.reserve(group_count);
for(std::size_t i = 0; i < group_count; i++)
{
a_device_buf.emplace_back(
std::make_unique<DeviceMem>(sizeof(ADataType) * a_m_k[i].mDesc.GetElementSpaceSize()));
b_device_buf.emplace_back(
std::make_unique<DeviceMem>(sizeof(BDataType) * b_k_n[i].mDesc.GetElementSpaceSize()));
c_device_buf.emplace_back(std::make_unique<DeviceMem>(
sizeof(CDataType) * c_m_n_device_results[i].mDesc.GetElementSpaceSize()));
a_device_buf[i]->ToDevice(a_m_k[i].mData.data());
b_device_buf[i]->ToDevice(b_k_n[i].mData.data());
gemm_descs.push_back({Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i], {}});
p_a.push_back(a_device_buf[i]->GetDeviceBuffer());
p_b.push_back(b_device_buf[i]->GetDeviceBuffer());
p_c.push_back(c_device_buf[i]->GetDeviceBuffer());
grouped_gemm_kernel_args_.push_back({a_device_buf[i]->GetDeviceBuffer(),
b_device_buf[i]->GetDeviceBuffer(),
{},
c_device_buf[i]->GetDeviceBuffer(),
Ms[i],
Ns[i],
Ks[i],
StrideAs[i],
StrideBs[i],
{},
StrideCs[i]});
}
using DeviceOp = ck::tensor_operation::device::DeviceGroupedGemmFixedNK<ALayout,
BLayout,
ck::Tuple<>,
CLayout,
ADataType,
BDataType,
ck::Tuple<>,
CDataType,
AElementOp,
BElementOp,
CElementOp>;
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
if(op_ptrs.size() <= 0)
{
throw std::runtime_error("wrong! no device GEMM instance found");
}
std::string best_gemm_name;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
float best_kbatch = 0;
auto p_ds = std::vector<std::array<const void*, 0>>{};
if(do_verification)
{
for(std::size_t i = 0; i < gemm_descs.size(); i++)
{
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
CDataType,
AccDataType,
AElementOp,
BElementOp,
CElementOp>;
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(a_m_k[i],
b_k_n[i],
c_m_n_host_results[i],
a_element_op,
b_element_op,
c_element_op);
ref_invoker.Run(ref_argument);
}
}
// profile device GEMM instances
for(auto& gemm_ptr : op_ptrs)
{
auto argument_ptr =
gemm_ptr->MakeArgumentPointer(p_a,
p_b,
p_ds,
p_c,
gemm_descs,
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{});
auto invoker_ptr = gemm_ptr->MakeInvokerPointer();
DeviceMem gemm_desc_workspace(gemm_ptr->GetWorkSpaceSize(argument_ptr.get()));
DeviceMem grouped_gemm_kernel_args_dev(
gemm_ptr->GetDeviceKernelArgSize(argument_ptr.get()));
hipGetErrorString(hipMemcpy(grouped_gemm_kernel_args_dev.GetDeviceBuffer(),
grouped_gemm_kernel_args_.data(),
gemm_ptr->GetDeviceKernelArgSize(argument_ptr.get()),
hipMemcpyHostToDevice));
gemm_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());
gemm_ptr->SetDeviceKernelArgs(argument_ptr.get(),
grouped_gemm_kernel_args_dev.GetDeviceBuffer());
std::string gemm_name = gemm_ptr->GetTypeString();
std::vector<int> kbatch_list = {1, 2, 4, 8, 12, 16, 20, 24, 32, 48, 64};
if(kbatch > 0)
{
kbatch_list = {kbatch};
}
for(std::size_t j = 0; j < kbatch_list.size(); j++)
{
auto kbatch_curr = kbatch_list[j];
gemm_ptr->SetKBatch(argument_ptr.get(), kbatch_curr);
if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
{
for(std::size_t i = 0; i < gemm_descs.size(); i++)
c_device_buf[i]->SetZero();
invoker_ptr->Run(argument_ptr.get(),
StreamConfig{nullptr, false, 0, n_warmup, n_iter});
if(do_verification)
{
bool instance_pass = true;
for(std::size_t i = 0; i < gemm_descs.size(); i++)
{
c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data());
if(std::is_same_v<CDataType, ck::half_t> && kbatch_curr > 1)
{
instance_pass =
instance_pass && ck::utils::check_err(c_m_n_device_results[i],
c_m_n_host_results[i],
"Error: Incorrect results!",
0.06);
}
else
{
instance_pass =
instance_pass && ck::utils::check_err(c_m_n_device_results[i],
c_m_n_host_results[i]);
}
if(do_log)
{
LogRangeAsType<float>(std::cout << "a : ", a_m_k[i].mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "b: ", b_k_n[i].mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "c_device: ", c_m_n_device_results[i].mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "c_host : ", c_m_n_host_results[i].mData, ",")
<< std::endl;
}
}
std::cout << "Instance: " << gemm_name << " verification "
<< (instance_pass ? "SUCCEED" : "FAILED") << std::endl;
pass = pass && instance_pass;
}
float ave_time = invoker_ptr->Run(
argument_ptr.get(), StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter});
if(time_kernel)
{
std::size_t flop = 0, num_btype = 0;
for(std::size_t i = 0; i < gemm_descs.size(); i++)
{
flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i];
num_btype += sizeof(ADataType) * Ms[i] * Ks[i] +
sizeof(BDataType) * Ks[i] * Ns[i] +
sizeof(CDataType) * Ms[i] * Ns[i];
}
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops
<< " TFlops, " << gb_per_sec << " GB/s, " << gemm_name << ", KBatch "
<< kbatch_curr << std::endl;
if(tflops > best_tflops)
{
best_gemm_name = gemm_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
best_kbatch = kbatch_curr;
}
}
}
else
{
std::cout << "Instance: " << gemm_name << ", does not support this GEMM problem"
<< std::endl;
}
}
}
if(time_kernel)
{
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_gemm_name << ", KBatch = " << best_kbatch
<< std::endl;
}
return pass;
}
} // namespace profiler
} // namespace ck
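Before the next file, a hedged sketch of driving this header directly — the template arguments mirror the fp16, all-row-major branch of the CLI entry point later in this commit, and the two group shapes are invented for illustration:

```cpp
// Hypothetical driver for profile_grouped_gemm_fixed_nk_impl: two fp16
// GEMM groups sharing N and K (the "fixed NK" constraint), row-major
// A/B/C, verification and timing enabled.
#include "profiler/profile_grouped_gemm_fixed_nk_impl.hpp"

int main()
{
    using Row = ck::tensor_layout::gemm::RowMajor;

    const std::vector<int> Ms{256, 512}, Ns{128, 128}, Ks{64, 64};
    // Row-major strides: A (MxK) needs stride >= K, B (KxN) and C (MxN)
    // need stride >= N.
    const std::vector<int> StrideAs{64, 64}, StrideBs{128, 128}, StrideCs{128, 128};

    const bool pass =
        ck::profiler::profile_grouped_gemm_fixed_nk_impl<ck::half_t, // ADataType
                                                         ck::half_t, // BDataType
                                                         ck::half_t, // CDataType
                                                         float,      // AccDataType
                                                         Row, Row, Row>(
            1,     // do_verification
            1,     // init_method: integer values
            false, // do_log
            true,  // time_kernel
            Ms, Ns, Ks, StrideAs, StrideBs, StrideCs,
            1,  // kbatch; 0 would sweep the built-in split-K list
            1,  // n_warmup
            10  // n_iter
        );
    return pass ? 0 : 1;
}
```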
(This file replaces the previous 4-D-only host_elementwise4D reference and test_permute_scale_impl with a rank-generic profiler implementation.)
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include <random>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise_scale.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_scale_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/permute_scale.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
namespace ck {
template <typename HostTensorA,
          typename HostTensorB,
          typename AElementOp,
          typename BElementOp,
          typename ScaleElementOp>
void reference_permute_scale(HostTensorB& b_tensor,
                             const HostTensorA& a_tensor,
                             AElementOp a_tensor_op,
                             BElementOp b_tensor_op,
                             ScaleElementOp scale_op)
{
    b_tensor.ForEach([&](auto& self, auto idx) {
        auto tmp_val = a_tensor(idx);
        b_tensor_op(tmp_val, tmp_val);
        scale_op(tmp_val, tmp_val);
        a_tensor_op(self(idx), tmp_val);
    });
}

namespace profiler {

template <typename ADataType, typename BDataType, index_t NumDim>
bool profile_permute_scale_impl(int do_verification,
                                int init_method,
                                bool do_log,
                                bool time_kernel,
                                std::vector<index_t> lengths_vector,
                                std::vector<index_t> input_strides_vector,
                                std::vector<index_t> output_strides_vector)
{
    bool pass           = true;
    bool instance_found = false;

    using ElementOp = ck::tensor_operation::element_wise::PassThrough;
    using UnaryOp   = ck::tensor_operation::element_wise::UnarySquare;
    using Scale     = ck::tensor_operation::element_wise::Scale;
    float scale     = 2.f;

    Tensor<ADataType> a(lengths_vector, input_strides_vector);
    Tensor<BDataType> b(lengths_vector, output_strides_vector);
    Tensor<BDataType> host_b(lengths_vector, output_strides_vector);

    std::cout << "A: " << a.mDesc << std::endl;
    std::cout << "B: " << b.mDesc << std::endl;

    switch(init_method)
    {
    case 0: break;
    case 1: a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-1, 2}); break;
    default: a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}); break;
    }

    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());

    a_device_buf.ToDevice(a.mData.data());
    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};

    using DeviceOp = ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ADataType>,
                                                                     ck::Tuple<BDataType>,
                                                                     ElementOp,
                                                                     UnaryOp,
                                                                     Scale,
                                                                     NumDim>;
    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    std::string best_instance_name;
    float best_ave_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;
    float best_tflops     = 0;

    if(do_verification)
    {
        reference_permute_scale(host_b, a, ElementOp{}, UnaryOp{}, Scale{scale});
    }

    auto copy = [](const auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
    std::array<ck::index_t, NumDim> lengths{};
    std::array<ck::index_t, NumDim> input_strides{};
    std::array<ck::index_t, NumDim> output_strides{};
    copy(lengths_vector, lengths);
    copy(input_strides_vector, input_strides);
    copy(output_strides_vector, output_strides);

    for(auto& op_ptr : op_ptrs)
    {
        auto argument_ptr = op_ptr->MakeArgumentPointer(lengths,
                                                        {input_strides},
                                                        {output_strides},
                                                        input,
                                                        output,
                                                        ElementOp{},
                                                        UnaryOp{},
                                                        Scale{scale});

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            instance_found = true;

            b_device_buf.SetZero();
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});

            if(do_verification)
            {
                b_device_buf.FromDevice(b.mData.data());

                pass &= ck::utils::check_err(
                    b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);

                if(do_log)
                {
                    LogRangeAsType<float>(std::cout << "a : ", a.mData, ",") << std::endl;
                    LogRangeAsType<float>(std::cout << "b: ", b.mData, ",") << std::endl;
                }
            }

            std::string op_name = op_ptr->GetTypeString();

            float ave_time =
                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

            std::size_t flop = std::size_t(2) * a.mDesc.GetElementSpaceSize() / sizeof(ADataType);

            std::size_t num_btype = sizeof(ADataType) * a.mDesc.GetElementSpaceSize() +
                                    sizeof(BDataType) * b.mDesc.GetElementSpaceSize();

            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

            float gb_per_sec = num_btype / 1.E6 / ave_time;
            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
                      << gb_per_sec << " GB/s, " << op_name << std::endl;

            if(tflops > best_tflops)
            {
                best_instance_name = op_name;
                best_tflops        = tflops;
                best_ave_time      = ave_time;
                best_gb_per_sec    = gb_per_sec;
            }
        }
        else
        {
            std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
        }
    }

    if(time_kernel)
    {
        std::cout << "Best perf = " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
                  << best_instance_name << std::endl;
    }

    return pass && instance_found;
}

} // namespace profiler
} // namespace ck
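And a matching sketch for the permute+scale profiler, reproducing the FP32 example from the README section above (hypothetical driver, not part of the commit):

```cpp
// Mirrors:
//   ./bin/ckProfiler permute_scale 0 1 1 0 1 64 64 64 4096 64 1 1 64 4096
#include "profiler/profile_permute_scale_impl.hpp"

int main()
{
    const std::vector<ck::index_t> lengths{64, 64, 64};
    const std::vector<ck::index_t> in_strides{4096, 64, 1};  // packed row-major
    const std::vector<ck::index_t> out_strides{1, 64, 4096}; // dim0/dim2 swapped

    const bool pass = ck::profiler::profile_permute_scale_impl<float, float, 3>(
        1,     // do_verification
        1,     // init_method: integer values
        false, // do_log
        true,  // time_kernel
        lengths, in_strides, out_strides);
    return pass ? 0 : 1;
}
```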
@@ -32,6 +32,7 @@ set(PROFILER_SOURCES
profile_grouped_conv_bwd_data.cpp
profile_conv_tensor_rearrange.cpp
profile_transpose.cpp
profile_permute_scale.cpp
)
if(DL_KERNELS)
@@ -51,6 +52,7 @@ if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
list(APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp)
list(APPEND PROFILER_SOURCES profile_batched_gemm_add_relu_gemm_add.cpp)
list(APPEND PROFILER_SOURCES profile_grouped_gemm.cpp)
list(APPEND PROFILER_SOURCES profile_grouped_gemm_fixed_nk.cpp)
list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp)
endif()
@@ -99,6 +101,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_d
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_permute_scale_instance)
if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
@@ -124,6 +127,7 @@ if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_gemm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_add_relu_gemm_add_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fixed_nk_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance)
endif()
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
@@ -23,6 +23,7 @@ enum struct ConvDataType
    F16_F16_F16,    // 1
    BF16_BF16_BF16, // 2
    INT8_INT8_INT8, // 3
    F8_F8_F8,       // 4
};
#define OP_NAME "grouped_conv_fwd"
@@ -36,7 +37,8 @@ static void print_helper_msg()
        << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
        << "                 1: Input fp16, Weight fp16, Output fp16\n"
        << "                 2: Input bf16, Weight bf16, Output bf16\n"
        << "                 3: Input int8, Weight int8, Output int8\n"
        << "                 4: Input fp8, Weight fp8, Output fp8)\n"
        << "arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]\n"
        << "                     1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K])\n"
        << "arg4: verification (0: no, 1: yes)\n"
@@ -79,6 +81,7 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
    using F16  = ck::half_t;
    using BF16 = ck::bhalf_t;
    using INT8 = int8_t;
    using F8   = ck::f8_t;
    //
    using GNWC = ck::tensor_layout::convolution::GNWC;
@@ -250,6 +253,10 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
    {
        return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, INT8{}, INT8{}, INT8{});
    }
    else if(data_type == ConvDataType::F8_F8_F8)
    {
        return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F8{}, F8{}, F8{});
    }
}
std::cout << "this data_type & layout is not implemented" << std::endl;
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <sstream> // for std::istringstream used in argToIntArray
#include "profiler/profile_grouped_gemm_fixed_nk_impl.hpp"
#include "profiler_operation_registry.hpp"
enum struct GemmMatrixLayout
{
MK_KN_MN, // 0
MK_NK_MN, // 1
};
enum struct GemmDataType
{
BF16_I8_BF16, // 0
F16_F16_F16, // 1
F16_F8_F16, // 2
F16_I8_F16, // 3
};
#define OP_NAME "grouped_gemm_fixed_nk"
#define OP_DESC "Grouped GEMM Fixed NK"
namespace {
std::vector<int> argToIntArray(char* input)
{
std::vector<int> out;
std::istringstream in(input);
std::string item;
while(std::getline(in, item, ','))
{
out.push_back(std::stoi(item));
}
return out;
}
int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
{
if(argc < 14)
{
std::cout
<< "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
<< "arg2: data type (0: bf16@int8; 1: fp16; 2: fp16@fp8; 3: fp16@int8)\n"
<< "arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"
<< " 1: A[m, k] * B[n, k] = C[m, n];\n"
<< "arg4: verification (0: no; 1: yes)\n"
<< "arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"
<< "arg6: print tensor value (0: no; 1: yes)\n"
<< "arg7: time kernel (0=n0, 1=yes)\n"
<< "arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
"64,64 64,64 128,128)\n"
<< "arg15: kbatch value (default 1)\n"
<< "optional:\n"
<< "arg16: number of warm-up cycles (default 1)\n"
<< "arg17: number of iterations (default 10)\n"
<< std::endl;
exit(1);
}
const auto data_type = static_cast<GemmDataType>(std::stoi(argv[2]));
const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]);
const auto Ms = argToIntArray(argv[8]);
const auto Ns = argToIntArray(argv[9]);
const auto Ks = argToIntArray(argv[10]);
const auto StrideAs = argToIntArray(argv[11]);
const auto StrideBs = argToIntArray(argv[12]);
const auto StrideCs = argToIntArray(argv[13]);
const int kbatch = argc >= 15 ? std::stoi(argv[14]) : 1;
using F32 = float;
using F16 = ck::half_t;
using F8 = ck::f8_t;
using BF16 = ck::bhalf_t;
using I8 = int8_t;
int n_warmup = 1;
int n_iter = 10;
if(argc == 17)
{
// optional trailing args: arg15 = n_warmup, arg16 = n_iter
n_warmup = std::stoi(argv[15]);
n_iter = std::stoi(argv[16]);
}
#if defined(CK_ENABLE_BF16) && defined(CK_ENABLE_INT8)
if(data_type == GemmDataType::BF16_I8_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_grouped_gemm_fixed_nk_impl<BF16,
I8,
BF16,
F32,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
time_kernel,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatch,
n_warmup,
n_iter);
}
else if(data_type == GemmDataType::BF16_I8_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
{
ck::profiler::profile_grouped_gemm_fixed_nk_impl<BF16,
I8,
BF16,
F32,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
time_kernel,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatch,
n_warmup,
n_iter);
}
#endif
#if defined(CK_ENABLE_FP16)
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
F16,
F16,
F32,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
time_kernel,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatch,
n_warmup,
n_iter);
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{
ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
F16,
F16,
F32,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
time_kernel,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatch,
n_warmup,
n_iter);
}
#endif
#if defined(CK_ENABLE_FP16) && defined(CK_ENABLE_FP8)
else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
F8,
F16,
F32,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
time_kernel,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatch,
n_warmup,
n_iter);
}
else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{
ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
F8,
F16,
F32,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
time_kernel,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatch,
n_warmup,
n_iter);
}
#endif
#if defined(CK_ENABLE_FP16) && defined(CK_ENABLE_INT8)
else if(data_type == GemmDataType::F16_I8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
I8,
F16,
F32,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
time_kernel,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatch,
n_warmup,
n_iter);
}
else if(data_type == GemmDataType::F16_I8_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{
ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
I8,
F16,
F32,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
time_kernel,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatch,
n_warmup,
n_iter);
}
#endif
else
{
throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
}
return 0;
}
} // anonymous namespace
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_grouped_gemm_fixed_nk);
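For reference, a plausible invocation assembled from the helper text above (the shapes are illustrative, not taken from the commit): `./bin/ckProfiler grouped_gemm_fixed_nk 1 1 1 2 0 1 256,256 128,128 64,64 64,64 64,64 128,128` profiles two fp16 groups with column-major B, verification on, integer initialization, and kernel timing enabled.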
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_permute_scale_impl.hpp"
#include "profiler_operation_registry.hpp"
namespace {
enum struct DataType
{
F32_F32, // 0
F16_F16 // 1
};
#define OP_NAME "permute_scale"
#define OP_DESC "Permute Scale"
static void print_helper_msg()
{
std::cout
// clang-format off
<< "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
<< "arg2: data type (0: Input fp32, Output fp32\n"
<< " 1: Input fp16, Output fp16\n"
<< "arg4: verification (0: no, 1: yes)\n"
<< "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
<< "arg6: print tensor value (0: no; 1: yes)\n"
<< "arg7: time kernel (0: no, 1: yes)\n"
<< "from arg8: tensor lengths\n"
<< " input strides\n"
<< " output strides\n" << std::endl;
// clang-format on
}
} // namespace
int profile_permute_scale(int argc, char* argv[])
{
constexpr int control_argc = 7;
const int dims_argc = argc - control_argc;
// The numbers of lengths, input strides and output strides must be equal
if(argc <= control_argc || dims_argc % 3 != 0)
{
print_helper_msg();
return 1;
}
const auto data_type = static_cast<DataType>(std::stoi(argv[2]));
const bool do_verification = std::stoi(argv[3]);
const int init_method = std::stoi(argv[4]);
const bool do_log = std::stoi(argv[5]);
const bool time_kernel = std::stoi(argv[6]);
const int num_dims = dims_argc / 3;
std::vector<ck::index_t> lengths(num_dims);
std::vector<ck::index_t> input_strides(num_dims);
std::vector<ck::index_t> output_strides(num_dims);
for(int i = 0; i < num_dims; i++)
{
lengths[i] = std::stoi(argv[control_argc + i]);
input_strides[i] = std::stoi(argv[control_argc + num_dims + i]);
output_strides[i] = std::stoi(argv[control_argc + 2 * num_dims + i]);
}
using F32 = float;
using F16 = ck::half_t;
constexpr auto I1 = ck::Number<1>{};
constexpr auto I2 = ck::Number<2>{};
constexpr auto I3 = ck::Number<3>{};
constexpr auto I4 = ck::Number<4>{};
constexpr auto I5 = ck::Number<5>{};
constexpr auto I6 = ck::Number<6>{};
auto profile = [&](auto num_dim_tmp, auto in_type, auto out_type) {
constexpr ck::index_t NDim = num_dim_tmp.value;
using InDataType = decltype(in_type);
using OutDataType = decltype(out_type);
bool pass =
ck::profiler::profile_permute_scale_impl<InDataType, OutDataType, NDim>(do_verification,
init_method,
do_log,
time_kernel,
lengths,
input_strides,
output_strides);
return pass ? 0 : 1;
};
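Note the bridge being built here: num_dims is only known at runtime, while profile_permute_scale_impl takes the tensor rank as a compile-time template parameter, so the lambda accepts a ck::Number and the if/else chain below maps each supported rank (1 through 6) onto its own instantiation.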
if(num_dims == 1)
{
if(data_type == DataType::F32_F32)
{
return profile(I1, F32{}, F32{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I1, F16{}, F16{});
}
}
else if(num_dims == 2)
{
if(data_type == DataType::F32_F32)
{
return profile(I2, F32{}, F32{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I2, F16{}, F16{});
}
}
else if(num_dims == 3)
{
if(data_type == DataType::F32_F32)
{
return profile(I3, F32{}, F32{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I3, F16{}, F16{});
}
}
else if(num_dims == 4)
{
if(data_type == DataType::F32_F32)
{
return profile(I4, F32{}, F32{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I4, F16{}, F16{});
}
}
else if(num_dims == 5)
{
if(data_type == DataType::F32_F32)
{
return profile(I5, F32{}, F32{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I5, F16{}, F16{});
}
}
else if(num_dims == 6)
{
if(data_type == DataType::F32_F32)
{
return profile(I6, F32{}, F32{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I6, F16{}, F16{});
}
}
std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_permute_scale);
@@ -133,11 +133,16 @@ def parse_logfile(logfile):
        if 'Best Perf' in line:
            lst=line.split()
            res.append(lst[4])
    elif 'onnx_gemm' in logfile or 'mixed_gemm' in logfile:
        for line in open(logfile):
            if 'Best Perf' in line:
                lst=line.split()
                res.append(lst[33])
    elif 'splitK_gemm' in logfile:
        for line in open(logfile):
            if 'Best Perf' in line:
                lst=line.split()
                res.append(lst[36])
    return res
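(The split-K profiler's `Best Perf` line evidently carries extra trailing fields now — the grouped GEMM profiler in this same change set, for example, appends a KBatch report — which is presumably why the value of interest moved from column 33 to column 36.)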
@@ -231,7 +236,7 @@ def main():
    sql_hostname = '127.0.0.1'
    sql_username = os.environ["dbuser"]
    sql_password = os.environ["dbpassword"]
    sql_main_database = os.environ["ck_perf_db"]
    sql_port = 3306
    ssh_host = os.environ["dbsship"]
    ssh_user = os.environ["dbsshuser"]
...
@@ -121,26 +121,16 @@ print_log_header $reduction_log $env_type $branch $host_name
./profile_reduce_no_index.sh $verify 2 10 --half 2>&1 | tee -a $reduction_log

#run splitK_gemm tests, first correctness verification, then performance
(The separate verification-only runs that logged to perf_splitK_gemm_verify.log were removed; $verify now controls verification in the timed runs below.)
export splitK_gemm_log="perf_splitK_gemm.log"
print_log_header $splitK_gemm_log $env_type $branch $host_name
./profile_splitK_gemm.sh gemm_splitk 0 0 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
./profile_splitK_gemm.sh gemm_splitk 0 1 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
./profile_splitK_gemm.sh gemm_splitk 0 2 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
./profile_splitK_gemm.sh gemm_splitk 0 3 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
./profile_splitK_gemm.sh gemm_splitk 1 0 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
./profile_splitK_gemm.sh gemm_splitk 1 1 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
./profile_splitK_gemm.sh gemm_splitk 1 2 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
./profile_splitK_gemm.sh gemm_splitk 1 3 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log

#run ONNX gemm tests
export onnx_log="perf_onnx_gemm.log"
...
list(APPEND gpu_list_xdl gfx908 gfx90a gfx940)
list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102 gfx1103)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list_xdl AND target EQUAL 0)
...
list(APPEND gpu_list_xdl gfx908 gfx90a gfx940 gfx941 gfx942)
list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102 gfx1103)
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
...
(Test update: the old Run() that swept a fixed list of 4-D lengths through test_permute_scale_impl is replaced by rank-templated, stride-aware cases driven by the shared profiler implementation.)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "profiler/profile_permute_scale_impl.hpp"
using F16 = ck::half_t;
using F32 = float;
@@ -15,15 +15,32 @@ class TestPermute : public ::testing::Test
using ADataType = std::tuple_element_t<0, Tuple>;
using BDataType = std::tuple_element_t<1, Tuple>;
constexpr bool skip_case()
{
#ifndef CK_ENABLE_FP16
if constexpr(ck::is_same_v<ADataType, F16> || ck::is_same_v<BDataType, F16>)
{
return true;
}
#endif
#ifndef CK_ENABLE_FP32
if constexpr(ck::is_same_v<ADataType, F32> || ck::is_same_v<BDataType, F32>)
{
return true;
}
#endif
return false;
}
template <ck::index_t NDims>
void Run(std::vector<ck::index_t> lengths,
std::vector<ck::index_t> input_strides,
std::vector<ck::index_t> output_strides)
{
if(!skip_case())
{
bool success = ck::profiler::profile_permute_scale_impl<ADataType, BDataType, NDims>(
true, 2, false, false, lengths, input_strides, output_strides);
EXPECT_TRUE(success);
}
}
@@ -32,5 +49,52 @@ class TestPermute : public ::testing::Test
using KernelTypes = ::testing::Types<std::tuple<F16, F16>, std::tuple<F32, F32>>;
TYPED_TEST_SUITE(TestPermute, KernelTypes);
TYPED_TEST(TestPermute, Test1D)
{
constexpr ck::index_t NumDims = 1;
this->template Run<NumDims>({8}, {1}, {2});
this->template Run<NumDims>({8}, {2}, {1});
this->template Run<NumDims>({1}, {1}, {1});
}
TYPED_TEST(TestPermute, Test2D)
{
constexpr ck::index_t NumDims = 2;
this->template Run<NumDims>({8, 4}, {4, 1}, {1, 8});
this->template Run<NumDims>({8, 4}, {1, 8}, {4, 1});
this->template Run<NumDims>({1, 1}, {1, 1}, {1, 1});
}
TYPED_TEST(TestPermute, Test3D)
{
constexpr ck::index_t NumDims = 3;
this->template Run<NumDims>({2, 4, 4}, {16, 4, 1}, {1, 2, 8});
this->template Run<NumDims>({2, 4, 4}, {1, 2, 8}, {16, 4, 1});
this->template Run<NumDims>({1, 1, 1}, {1, 1, 1}, {1, 1, 1});
}
TYPED_TEST(TestPermute, Test4D)
{
constexpr ck::index_t NumDims = 4;
this->template Run<NumDims>({2, 4, 4, 4}, {64, 16, 4, 1}, {1, 2, 8, 32});
this->template Run<NumDims>({2, 4, 4, 4}, {1, 2, 8, 32}, {64, 16, 4, 1});
this->template Run<NumDims>({1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 1});
}
TYPED_TEST(TestPermute, Test5D)
{
constexpr ck::index_t NumDims = 5;
this->template Run<NumDims>({2, 4, 4, 4, 4}, {256, 64, 16, 4, 1}, {1, 2, 8, 32, 128});
this->template Run<NumDims>({2, 4, 4, 4, 4}, {1, 2, 8, 32, 128}, {256, 64, 16, 4, 1});
this->template Run<NumDims>({1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1});
}
TYPED_TEST(TestPermute, Test6D)
{
constexpr ck::index_t NumDims = 6;
this->template Run<NumDims>(
{2, 4, 4, 4, 4, 4}, {1024, 256, 64, 16, 4, 1}, {1, 2, 8, 32, 128, 512});
this->template Run<NumDims>(
{2, 4, 4, 4, 4, 4}, {1, 2, 8, 32, 128, 512}, {1024, 256, 64, 16, 4, 1});
this->template Run<NumDims>({1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1});
}