Commit 7a3b49e5 authored by Chao Liu's avatar Chao Liu

Merge remote-tracking branch 'origin/develop' into contraction

parents e07b3d8e d3051d75
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/include/profile_gemm_bias_add_reduce_impl.hpp"
int profile_gemm_bias_add_reduce(int argc, char* argv[])
{
enum struct GemmMatrixLayout
{
MK_KN_MN, // 0
MK_NK_MN, // 1
KM_KN_MN, // 2
KM_NK_MN, // 3
};
enum struct GemmReduceDataType
{
F32_F32_F32_F32_F32_F32_F32, // 0
F16_F16_F16_F16_F16_F32_F32, // 1
};
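// Note (editorial inference, not documented in this file): each enumerator name
// appears to list the element types of the fused GEMM tensors (A, B, C and the
// added tensors) followed by the reduction accumulator types, matching the
// template arguments passed to profile_gemm_bias_add_reduce_impl below.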
if(argc != 15)
{
printf("arg1: tensor operation (gemm: GEMM+bias+add+Reduce)\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
printf(" 2: A[k, m] * B[k, n] = C[m, n];\n");
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, StrideC1\n");
exit(1);
}
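// Illustrative invocation (hypothetical problem sizes; -1 selects the default
// strides computed below):
// ./ckProfiler gemm_bias_add_reduce 1 0 1 1 0 1 3840 4096 4096 -1 -1 -1 -1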
const auto data_type = static_cast<GemmReduceDataType>(std::stoi(argv[2]));
const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
const int K = std::stoi(argv[10]);
const int StrideA = std::stoi(argv[11]);
const int StrideB = std::stoi(argv[12]);
const int StrideC = std::stoi(argv[13]);
const int StrideC1 = std::stoi(argv[14]);
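// A negative stride passed on the command line is replaced by a default leading
// dimension via the (Stride < 0) ? ... : Stride expressions in each branch below.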
if(data_type == GemmReduceDataType::F16_F16_F16_F16_F16_F32_F32 &&
layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_gemm_bias_add_reduce_impl<ck::half_t,
ck::half_t,
ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
time_kernel,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC,
(StrideC1 < 0) ? N : StrideC1);
}
else if(data_type == GemmReduceDataType::F16_F16_F16_F16_F16_F32_F32 &&
layout == GemmMatrixLayout::MK_NK_MN)
{
ck::profiler::profile_gemm_bias_add_reduce_impl<ck::half_t,
ck::half_t,
ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
time_kernel,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC,
(StrideC1 < 0) ? N : StrideC1);
}
else if(data_type == GemmReduceDataType::F16_F16_F16_F16_F16_F32_F32 &&
layout == GemmMatrixLayout::KM_KN_MN)
{
ck::profiler::profile_gemm_bias_add_reduce_impl<ck::half_t,
ck::half_t,
ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
time_kernel,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC,
(StrideC1 < 0) ? N : StrideC1);
}
else if(data_type == GemmReduceDataType::F16_F16_F16_F16_F16_F32_F32 &&
layout == GemmMatrixLayout::KM_NK_MN)
{
ck::profiler::profile_gemm_bias_add_reduce_impl<ck::half_t,
ck::half_t,
ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
time_kernel,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC,
(StrideC1 < 0) ? N : StrideC1);
}
else
{
throw std::runtime_error("wrong! this data_type & layout is not implemented");
}
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/include/profile_gemm_bias_relu_impl.hpp"
enum struct GemmMatrixLayout
{
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/include/profile_gemm_bias_relu_add_impl.hpp"
enum struct GemmMatrixLayout
{
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/include/profile_gemm_reduce_impl.hpp"
int profile_gemm_reduce(int argc, char* argv[])
{
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/include/profile_grouped_gemm_impl.hpp"
enum struct GemmMatrixLayout
{
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <fstream>
#include <cstdlib>
@@ -6,11 +9,12 @@
#include <sstream>
#include <getopt.h>
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/host_tensor/host_common_util.hpp"
#include "profiler/include/profile_reduce_impl.hpp"
#include "profiler/include/data_type_enum.hpp"
using namespace std;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <cstring>
#include "profiler/include/profile_convnd_fwd.hpp"
int profile_gemm(int, char*[]);
int profile_gemm_bias_2d(int, char*[]);
int profile_gemm_bias_relu(int, char*[]);
int profile_gemm_bias_relu_add(int, char*[]);
int profile_gemm_reduce(int, char*[]);
int profile_gemm_bias_add_reduce(int, char*[]);
int profile_batched_gemm(int, char*[]);
int profile_grouped_gemm(int, char*[]);
int profile_conv_fwd(int, char*[]);
int profile_conv_fwd_bias_relu(int, char*[]);
int profile_conv_fwd_bias_relu_add(int, char*[]);
int profile_convnd_bwd_data(int, char*[], int);
int profile_reduce(int, char*[]);
int profile_conv_bwd_weight(int, char*[]);
int profile_batched_gemm_reduce(int, char*[]);
int profile_gemm_add_add_fastgelu(int, char*[]);
static void print_helper_message()
{
// clang-format off
printf("arg1: tensor operation (gemm: GEMM\n"
" gemm_bias_2d: GEMM+Bias(2D)\n"
" gemm_bias_relu: GEMM+Bias+ReLU\n"
" gemm_bias_relu_add: GEMM+Bias+ReLU+Add\n"
" gemm_reduce: GEMM+Reduce\n"
" grouped_gemm: Grouped GEMM\n"
" conv_fwd: ForwardConvolution\n"
" conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n"
" conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n"
" conv1d_bwd_data: BackwardConvolution data 1 dim\n"
" conv2d_bwd_data: BackwardConvolution data 2 dim\n"
" conv3d_bwd_data: BackwardConvolution data 3 dim\n"
" reduce: Reduce\n"
" conv2d_bwd_weight: Backward Weight Convolution 2d\n"
" gemm_add_add_fastgelu: GEMM+Add+Add+FastGeLU\n");
// clang-format on
}
int main(int argc, char* argv[])
{
if(argc == 1)
{
print_helper_message();
return 0;
}
if(strcmp(argv[1], "gemm") == 0)
{
return profile_gemm(argc, argv);
@@ -44,6 +76,10 @@ int main(int argc, char* argv[])
{
return profile_gemm_reduce(argc, argv);
}
else if(strcmp(argv[1], "gemm_bias_add_reduce") == 0)
{
return profile_gemm_bias_add_reduce(argc, argv);
}
else if(strcmp(argv[1], "batched_gemm") == 0)
{
return profile_batched_gemm(argc, argv);
@@ -68,10 +104,6 @@ int main(int argc, char* argv[])
{
return profile_conv_fwd_bias_relu_add(argc, argv);
}
else if(strcmp(argv[1], "conv1d_bwd_data") == 0)
{
return profile_convnd_bwd_data(argc, argv, 1);
@@ -92,25 +124,14 @@ int main(int argc, char* argv[])
{
return profile_conv_bwd_weight(argc, argv);
}
else if(strcmp(argv[1], "gemm_add_add_fastgelu") == 0)
{
return profile_gemm_add_add_fastgelu(argc, argv);
}
else
{
print_helper_message();
return 0;
}
}
#!/usr/bin/env python3
import os, io, argparse, datetime, re
import numpy as np
import sqlalchemy
from sqlalchemy.types import NVARCHAR, Float, Integer
@@ -45,66 +45,98 @@ def main():
    StrideB=[]
    StrideC=[]
    #parse results, get the Tflops value for "Best Perf" kernels
    glue=""
    for filename in args.files:
        for line in open(filename):
            if 'Branch name' in line:
                lst=line.split()
                branch_name=lst[2]
            if 'On branch' in line:
                lst=line.split()
                branch_name=lst[2]
            if 'Node name' in line:
                lst=line.split()
                node_id=lst[2]
            if 'GPU_arch' in line:
                lst=line.split()
                gpu_arch=lst[2]
            if 'HIP version' in line:
                lst=line.split()
                hip_vers=lst[2]
            if 'Compute Unit' in line:
                lst=line.split()
                compute_units=lst[2]
            if 'InstalledDir' in line:
                lst=line.split()
                rocm_vers=lst[1][lst[1].find('/opt/rocm-')+len('/opt/rocm-'):lst[1].rfind('/llvm/bin')]
    print("Branch name:",branch_name)
    print("Node name:",node_id)
    print("GPU_arch:",gpu_arch)
    print("Compute units:",compute_units)
    print("ROCM_version:",rocm_vers)
    print("HIP_version:",hip_vers)
    #parse gemm performance tests:
    if 'gemm' in filename:
        for filename in args.files:
            for line in open(filename):
                if 'Best Perf' in line:
                    lst=line.split()
                    if len(lst)>=37: #the line is complete
                        tests.append(glue.join(lst[5:30]))
                        kernels.append(glue.join(lst[37:]))
                        tflops.append(lst[33])
                        dtype.append(lst[5])
                        alayout.append(lst[8])
                        blayout.append(lst[11])
                        M.append(lst[14])
                        N.append(lst[17])
                        K.append(lst[20])
                        StrideA.append(lst[23])
                        StrideB.append(lst[26])
                        StrideC.append(lst[29])
                    elif len(lst)<37 and len(lst)>=33: #the tflops are available
                        tests.append(glue.join(lst[5:30]))
                        kernels.append("N/A")
                        tflops.append(lst[33])
                        dtype.append(lst[5])
                        alayout.append(lst[8])
                        blayout.append(lst[11])
                        M.append(lst[14])
                        N.append(lst[17])
                        K.append(lst[20])
                        StrideA.append(lst[23])
                        StrideB.append(lst[26])
                        StrideC.append(lst[29])
                        print("warning: incomplete line:",lst)
                    elif len(lst)<33: #even the tflops are not available
                        print("Error in ckProfiler output!")
                        print("warning: incomplete line=",lst)
        #sort results
        #sorted_tests = sorted(tests)
        #print("sorted tests:",sorted_tests)
        sorted_tflops = [x for _,x in sorted(zip(tests,tflops))]
        #sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
        test_list=list(range(1,len(tests)+1))
    #parse resnet50 performance tests:
    if 'resnet50' in filename:
        for filename in args.files:
            for line in open(filename):
                if 'Best Perf' in line:
                    lst=line.split()
                    tflops.append(lst[4])
    print("Number of tests:",len(tflops))
    sql_hostname = '127.0.0.1'
    sql_username = os.environ["dbuser"]
    print("sql_username=",sql_username)
    sql_password = os.environ["dbpassword"]
    sql_main_database = 'miopen_perf'
    sql_port = 3306
    ssh_host = os.environ["dbsship"]
    print("ssh_host=",ssh_host)
    ssh_user = os.environ["dbsshuser"]
    print("ssh_user=",ssh_user)
    ssh_port = int(os.environ["dbsshport"])
    ssh_pass = os.environ["dbsshpassword"]
@@ -118,75 +150,140 @@ def main():
        format(sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database))
    conn = sqlEngine.connect()
    #save gemm performance tests:
    if 'gemm' in filename:
        #write the ck_gemm_test_params table
        #only needed once the test set changes
        '''
        sorted_dtypes = [x for _,x in sorted(zip(tests,dtype))]
        sorted_alayout = [x for _,x in sorted(zip(tests,alayout))]
        sorted_blayout = [x for _,x in sorted(zip(tests,blayout))]
        sorted_M = [x for _,x in sorted(zip(tests,M))]
        sorted_N = [x for _,x in sorted(zip(tests,N))]
        sorted_K = [x for _,x in sorted(zip(tests,K))]
        sorted_StrideA = [x for _,x in sorted(zip(tests,StrideA))]
        sorted_StrideB = [x for _,x in sorted(zip(tests,StrideB))]
        sorted_StrideC = [x for _,x in sorted(zip(tests,StrideC))]
        ck_gemm_params=[test_list,sorted_dtypes,sorted_alayout,sorted_blayout,
                        sorted_M,sorted_N,sorted_K,sorted_StrideA,sorted_StrideB,
                        sorted_StrideC]
        df=pd.DataFrame(np.transpose(ck_gemm_params),columns=['Test_number','Data_type',
                        'Alayout','BLayout','M','N','K', 'StrideA','StrideB','StrideC'])
        print(df)
        dtypes = {
            'Test_number': Integer(),
            'Data_type': NVARCHAR(length=5),
            'Alayout': NVARCHAR(length=12),
            'Blayout': NVARCHAR(length=12),
            'M': Integer(),
            'N': Integer(),
            'K': Integer(),
            'StrideA': Integer(),
            'StrideB': Integer(),
            'StrideC': Integer()
        }
        df.to_sql("ck_gemm_test_params",conn,if_exists='replace',index=False, dtype=dtypes)
        '''
        #read baseline results for the latest develop branch
        query = '''SELECT * from ck_gemm_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_gemm_tflops where Branch_ID='develop' );'''
        tflops_base = pd.read_sql_query(query, conn)
        #write new results to the db
        testlist=[]
        for i in range(1,len(tests)+1):
            testlist.append("Test%i"%i)
        ck_gemm_tflops=[str(branch_name),str(node_id),str(gpu_arch),compute_units,str(rocm_vers),str(hip_vers),str(datetime.datetime.now())]
        flops=pd.DataFrame(data=[ck_gemm_tflops],columns=['Branch_ID','Node_ID','GPU_arch','Compute Units','ROCM_version','HIP_version','Datetime'])
        df_add=pd.DataFrame(data=[sorted_tflops],columns=testlist)
        flops=pd.concat([flops,df_add],axis=1)
        print("new tflops for gemm tests:",flops)
        flops.to_sql("ck_gemm_tflops",conn,if_exists='append',index=False)
    #save resnet50 performance tests:
    if 'resnet50' in filename:
        #read baseline results for the latest develop branch
        query = '''SELECT * from ck_resnet50_N256_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_resnet50_N256_tflops where Branch_ID='develop' );'''
        tflops_base_N256 = pd.read_sql_query(query, conn)
        query = '''SELECT * from ck_resnet50_N4_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_resnet50_N4_tflops where Branch_ID='develop' );'''
        tflops_base_N4 = pd.read_sql_query(query, conn)
        #write new results to the db
        testlist=[]
        for i in range(1,50):
            testlist.append("Layer%i"%i)
        ck_resnet_tflops=[str(branch_name),str(node_id),str(gpu_arch),compute_units,str(rocm_vers),str(hip_vers),str(datetime.datetime.now())]
        flops0=pd.DataFrame(data=[ck_resnet_tflops],columns=['Branch_ID','Node_ID','GPU_arch','Compute Units','ROCM_version','HIP_version','Datetime'])
        df_add=pd.DataFrame(data=[tflops[0:49]],columns=testlist)
        flops=pd.concat([flops0,df_add],axis=1)
        print("new tflops for N=256 resnet50 test:",flops)
        flops.to_sql("ck_resnet50_N256_tflops",conn,if_exists='append',index=False)
        df_add=pd.DataFrame(data=[tflops[49:98]],columns=testlist)
        flops=pd.concat([flops0,df_add],axis=1)
        print("new tflops for N=4 resnet50 test:",flops)
        flops.to_sql("ck_resnet50_N4_tflops",conn,if_exists='append',index=False)
    conn.close()
    #compare the results to the baseline if baseline exists
    regression=0
    if 'gemm' in filename:
        if not tflops_base.empty:
            base=tflops_base[testlist].to_numpy(dtype='float')
            base_list=base[0]
            ave_perf=0
            for i in range(len(base_list)):
                # success criterion:
                if base_list[i]>1.01*float(sorted_tflops[i]):
                    print("test # ",i,"shows regression by {:.3f}%".format(
                        (float(sorted_tflops[i])-base_list[i])/base_list[i]*100))
                    regression=1
                ave_perf=ave_perf+float(sorted_tflops[i])/base_list[i]
            if regression==0:
                print("no regressions found")
            ave_perf=ave_perf/len(base_list)
            print("average performance relative to baseline:",ave_perf)
        else:
            print("could not find a baseline")
    if 'resnet50' in filename:
        if not tflops_base_N256.empty:
            base=tflops_base_N256[testlist].to_numpy(dtype='float')
            base_list=base[0]
            ave_perf=0
            for i in range(len(base_list)):
                # success criterion:
                if base_list[i]>1.01*float(tflops[i]):
                    print("layer # ",i,"shows regression by {:.3f}%".format(
                        (float(tflops[i])-base_list[i])/base_list[i]*100))
                    regression=1
                ave_perf=ave_perf+float(tflops[i])/base_list[i]
            if regression==0:
                print("no regressions found")
            ave_perf=ave_perf/len(base_list)
            print("average performance relative to baseline:",ave_perf)
        else:
            print("could not find a baseline for N=256")
        if not tflops_base_N4.empty:
            base=tflops_base_N4[testlist].to_numpy(dtype='float')
            base_list=base[0]
            ave_perf=0
            for i in range(len(base_list)):
                # success criterion:
                if base_list[i]>1.01*float(tflops[i+49]):
                    print("layer # ",i,"shows regression by {:.3f}%".format(
                        (float(tflops[i+49])-base_list[i])/base_list[i]*100))
                    regression=1
                ave_perf=ave_perf+float(tflops[i+49])/base_list[i]
            if regression==0:
                print("no regressions found")
            ave_perf=ave_perf/len(base_list)
            print("average performance relative to baseline:",ave_perf)
        else:
            print("could not find a baseline for N=4")
    #return 0 if performance criteria met, otherwise return 1
    return regression
if __name__ == '__main__':
......
@@ -3,9 +3,9 @@
## GPU visibility
export HIP_VISIBLE_DEVICES=0
# make -j ckProfiler
DRIVER="../build/bin/ckProfiler"
OP=$1
DATATYPE=$2
@@ -26,7 +26,7 @@ REPEAT=$9
N=${10}
# Resnet50 (no duplicated layer)
######## op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0
@@ -47,60 +47,60 @@ REPEAT=$9
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
#$DRIVER $OP $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3
# Resnet50 fusion
####### op_________________ datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C_ Y X Hi_ Wi__ Strides Dilations LeftPads RightPads
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1
$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0
$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1
$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
# Resnet50
......
#!/bin/bash
#
# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
# and make sure the following python packages are installed in your environment:
pip3 install --upgrade pip
pip3 install sqlalchemy pymysql pandas sshtunnel
# you would also need to set up some environment variables in order to
# post your new test results to the database and compare them to the baseline
# please contact Illia.Silin@amd.com for more details
#
export gemm_log="perf_gemm.log"
rm -f $gemm_log
git status | grep -e 'On branch' > ${gemm_log}
echo -n 'Node name: ' >>${gemm_log}; hostname >> ${gemm_log}
#get GPU_arch and number of compute units from rocminfo
echo -n "GPU_arch: " >> ${gemm_log}; rocminfo | grep "Name:" | grep "gfx" >> ${gemm_log}
rocminfo | grep "Compute Unit:" >> ${gemm_log}
hipcc --version | grep -e 'HIP version' >> ${gemm_log}
/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${gemm_log}
./profile_gemm.sh gemm 0 0 0 1 0 5 | tee -a ${gemm_log}
./profile_gemm.sh gemm 1 0 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 0 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 0 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 1 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 2 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 0 3 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 1 3 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 2 3 0 1 0 5 | tee -a $gemm_log
./profile_gemm.sh gemm 3 3 0 1 0 5 | tee -a $gemm_log
python3 parse_perf_data.py ${gemm_log}
#run resnet50 test
export resnet_log="perf_resnet50.log"
rm -f $resnet_log
git status | grep -e 'On branch' > ${resnet_log}
echo -n 'Node name: '>>${resnet_log}; hostname >>${resnet_log}
#get GPU_arch and number of compute units from rocminfo
echo -n "GPU_arch: " >> ${resnet_log}; rocminfo | grep "Name:" | grep "gfx" >> ${resnet_log}
rocminfo | grep "Compute Unit:" >> ${resnet_log}
hipcc --version | grep -e 'HIP version' >> ${resnet_log}
/opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> ${resnet_log}
#first run tests with N=256
./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 256 | tee -a ${resnet_log}
#then run with N=4
./profile_conv.sh conv_fwd_bias_relu 1 1 1 1 0 2 0 1 4 | tee -a ${resnet_log}
#the script will put the results from N=256 and N=4 runs into separate tables
python3 parse_perf_data.py ${resnet_log}
include_directories(BEFORE
${PROJECT_SOURCE_DIR}/
)
include(googletest)
@@ -65,4 +44,4 @@ add_subdirectory(reduce)
add_subdirectory(conv2d_bwd_weight)
add_subdirectory(convnd_bwd_data)
add_subdirectory(block_to_ctile_map)
add_subdirectory(softmax)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "profiler/include/profile_batched_gemm_impl.hpp"
namespace {
using ADataType = ck::half_t;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef BATCHED_GEMM_UTILS_HPP
#define BATCHED_GEMM_UTILS_HPP
......
include_directories(BEFORE
${PROJECT_SOURCE_DIR}/profiler/include
${PROJECT_SOURCE_DIR}/test/include
${PROJECT_SOURCE_DIR}/external/include/half
)
add_test_executable(test_batched_gemm_reduce_fp16 batched_gemm_reduce_fp16.cpp)
target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE host_tensor)
target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE device_batched_gemm_reduce_instance)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "profiler/include/profile_batched_gemm_reduce_impl.hpp"
int main()
{
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <vector>
#include <gtest/gtest.h>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
using namespace ck;
......
cmake_minimum_required(VERSION 3.15)
project(ck_app)
add_compile_options(-std=c++14)
find_package(composable_kernel 1.0.0 COMPONENTS device_operations host_tensor)
find_package(hip REQUIRED PATHS /opt/rocm)
message(STATUS "Build with HIP ${hip_VERSION}")
add_executable(test_client_app client_app.cpp)
target_link_libraries(test_client_app PRIVATE composable_kernel::device_operations composable_kernel::host_tensor hip::host)
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include <vector>
#include "client_app_impl.hpp"
int main(int argc, char* argv[])
{
if(argc != 25)
{
printf("arg1: tensor operation (conv_fwd: ForwardConvolution)\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg9: time kernel (0=n0, 1=yes)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
}
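// Illustrative invocation (hypothetical values; 24 arguments are required):
// ./test_client_app conv_fwd 1 1 1 1 1 1 0 1 256 256 1024 1 1 14 14 1 1 1 1 0 0 0 0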
const ConvDataType data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
const auto in_layout = static_cast<ConvInputLayout>(std::stoi(argv[3]));
const auto wei_layout = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
const auto out_layout = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
const bool time_kernel = std::stoi(argv[9]);
const ck::index_t N = std::stoi(argv[10]);
const ck::index_t K = std::stoi(argv[11]);
const ck::index_t C = std::stoi(argv[12]);
const ck::index_t Y = std::stoi(argv[13]);
const ck::index_t X = std::stoi(argv[14]);
const ck::index_t Hi = std::stoi(argv[15]);
const ck::index_t Wi = std::stoi(argv[16]);
const ck::index_t conv_stride_h = std::stoi(argv[17]);
const ck::index_t conv_stride_w = std::stoi(argv[18]);
const ck::index_t conv_dilation_h = std::stoi(argv[19]);
const ck::index_t conv_dilation_w = std::stoi(argv[20]);
const ck::index_t in_left_pad_h = std::stoi(argv[21]);
const ck::index_t in_left_pad_w = std::stoi(argv[22]);
const ck::index_t in_right_pad_h = std::stoi(argv[23]);
const ck::index_t in_right_pad_w = std::stoi(argv[24]);
const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
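// Illustrative check (editorial example, not in the original): for the 7x7,
// stride-2 ResNet stem with Hi = 224, pads 3/3, Y = 7 and dilation 1, YEff = 7
// and Ho = (224 + 3 + 3 - 7) / 2 + 1 = 112.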
ck::app::profile_conv_fwd_impl(do_verification,
init_method,
do_log,
time_kernel,
data_type,
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
return 0;
}
#pragma once
#include "host_interface.hpp"
enum ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3
};
enum ConvInputLayout
{
NCHW, // 0
NHWC, // 1
};
enum ConvWeightLayout
{
KCYX, // 0
KYXC, // 1
};
enum ConvOutputLayout
{
NKHW, // 0
NHWK, // 1
};
void check_hip_error(void)
{
hipError_t err = hipGetLastError();
if(err != hipSuccess)
{
std::cerr << "Error: " << hipGetErrorString(err) << std::endl;
exit(err);
}
}
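// Note: hipGetLastError (used above) both returns and resets the pending error
// state, mirroring cudaGetLastError, so each check consumes the error it reports.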
std::string getDeviceName(int device)
{
struct hipDeviceProp_t prop;
hipGetDeviceProperties(&prop, device);
check_hip_error();
return std::string(prop.name);
}
int getDriver(void)
{
int driver;
hipDriverGetVersion(&driver);
check_hip_error();
return driver;
}
namespace ck {
namespace app {
struct DeviceMem
{
DeviceMem() = delete;
DeviceMem(std::size_t mem_size);
void* GetDeviceBuffer();
void ToDevice(const void* p);
void FromDevice(void* p);
~DeviceMem();
void* mpDeviceBuf;
std::size_t mMemSize;
};
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
void DeviceMem::ToDevice(const void* p)
{
hipGetErrorString(
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
}
void DeviceMem::FromDevice(void* p)
{
hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
}
DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
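// Illustrative round trip with this wrapper (editorial sketch, not in the original):
//   std::vector<float> host(n);
//   ck::app::DeviceMem buf(n * sizeof(float));
//   buf.ToDevice(host.data());   // copies mMemSize bytes host -> device
//   buf.FromDevice(host.data()); // copies them back device -> host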
void profile_conv_fwd_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
ConvDataType data_type,
ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads)
{
const ck::index_t Y = filter_spatial_lengths[0];
const ck::index_t X = filter_spatial_lengths[1];
const ck::index_t Hi = input_spatial_lengths[0];
const ck::index_t Wi = input_spatial_lengths[1];
const ck::index_t Ho = output_spatial_lengths[0];
const ck::index_t Wo = output_spatial_lengths[1];
const auto in_sz = N * C * Hi * Wi;
const auto wei_sz = K * C * Y * X;
const auto out_sz = N * K * Ho * Wo;
using WeiDataType = float;
using InDataType = float;
using OutDataType = float;
app::DeviceMem in_device_buf(sizeof(InDataType) * in_sz);
app::DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_sz);
app::DeviceMem out_device_buf(sizeof(OutDataType) * out_sz);
// data is already on device!
// add device Conv instances
std::vector<DeviceConvFwdPtr_t> conv_ptrs;
if(data_type == F16_F16_F16)
{
add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t(conv_ptrs);
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t(conv_ptrs);
}
else if(data_type == BF16_BF16_BF16)
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t(conv_ptrs);
else if(data_type == F32_F32_F32)
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t(conv_ptrs);
else if(data_type == INT8_INT8_INT8)
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(conv_ptrs);
else
throw std::runtime_error("wrong! Invalid data type");
if(conv_ptrs.empty())
{
throw std::runtime_error("wrong! no device Conv instance found");
}
std::string best_conv_name;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int deviceIndex = 0;
hipSetDevice(deviceIndex);
check_hip_error();
StreamConfig stream_config{nullptr, time_kernel};
hipStreamCreate(&stream_config.stream_id_);
check_hip_error();
// profile device Conv instances
for(auto& conv_ptr : conv_ptrs)
{
auto argument_ptr =
conv_ptr.MakeArgumentPointer(static_cast<void*>(in_device_buf.GetDeviceBuffer()),
static_cast<void*>(wei_device_buf.GetDeviceBuffer()),
static_cast<void*>(out_device_buf.GetDeviceBuffer()),
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
auto invoker_ptr = conv_ptr.MakeInvokerPointer();
if(conv_ptr.IsSupportedArgument(argument_ptr.get()))
{
std::string conv_name = conv_ptr.GetTypeString();
float ave_time = invoker_ptr->Run(argument_ptr.get(), stream_config);
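// Each of the N*K*Ho*Wo output elements accumulates C*Y*X multiply-adds,
// counted as 2 FLOPs per multiply-add below.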
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
sizeof(WeiDataType) * (K * C * Y * X) +
sizeof(OutDataType) * (N * K * Ho * Wo);
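// ave_time is in ms, so flop / 1e9 / ms = TFLOP/s and bytes / 1e6 / ms = GB/s.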
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s, " << conv_name << std::endl;
if(tflops > best_tflops)
{
best_conv_name = conv_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
}
} // namespace app
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "config.hpp" #include "config.hpp"
#include "device.hpp" #include "device.hpp"
#include "host_tensor.hpp" #include "host_tensor.hpp"
......