Commit f9c478e2 authored by ltqin

Merge branch 'develop' into bmatrix_skip_lds

parents 7d85d04a 91d8b7d6
...@@ -36,8 +36,8 @@ int profile_gemm_bias_2d(int argc, char* argv[]) ...@@ -36,8 +36,8 @@ int profile_gemm_bias_2d(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n"); printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n"); printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n"); printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n"); printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
printf("arg14: alpha\n"); printf("arg14: alpha\n");
printf("arg15: beta\n"); printf("arg15: beta\n");
...@@ -50,7 +50,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) ...@@ -50,7 +50,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]); const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]); const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]); const bool do_log = std::stoi(argv[6]);
const int nrepeat = std::stoi(argv[7]); const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]); const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]); const int N = std::stoi(argv[9]);
...@@ -76,7 +76,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) ...@@ -76,7 +76,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -99,7 +99,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) ...@@ -99,7 +99,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -122,7 +122,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) ...@@ -122,7 +122,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -145,7 +145,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) ...@@ -145,7 +145,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -168,7 +168,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) ...@@ -168,7 +168,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -191,7 +191,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) ...@@ -191,7 +191,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -214,7 +214,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) ...@@ -214,7 +214,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -237,7 +237,7 @@ int profile_gemm_bias_2d(int argc, char* argv[]) ...@@ -237,7 +237,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -252,5 +252,5 @@ int profile_gemm_bias_2d(int argc, char* argv[]) ...@@ -252,5 +252,5 @@ int profile_gemm_bias_2d(int argc, char* argv[])
throw std::runtime_error("wrong! this data_type & layout is not implemented"); throw std::runtime_error("wrong! this data_type & layout is not implemented");
} }
return 1; return 0;
} }
...@@ -36,8 +36,8 @@ int profile_gemm_bias_relu(int argc, char* argv[]) ...@@ -36,8 +36,8 @@ int profile_gemm_bias_relu(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n"); printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n"); printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n"); printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n"); printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
printf("arg14: split k into mulitiple batch\n"); printf("arg14: split k into mulitiple batch\n");
exit(1); exit(1);
...@@ -48,7 +48,7 @@ int profile_gemm_bias_relu(int argc, char* argv[]) ...@@ -48,7 +48,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]); const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]); const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]); const bool do_log = std::stoi(argv[6]);
const int nrepeat = std::stoi(argv[7]); const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]); const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]); const int N = std::stoi(argv[9]);
...@@ -69,7 +69,7 @@ int profile_gemm_bias_relu(int argc, char* argv[]) ...@@ -69,7 +69,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -88,7 +88,7 @@ int profile_gemm_bias_relu(int argc, char* argv[]) ...@@ -88,7 +88,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -107,7 +107,7 @@ int profile_gemm_bias_relu(int argc, char* argv[]) ...@@ -107,7 +107,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -126,7 +126,7 @@ int profile_gemm_bias_relu(int argc, char* argv[]) ...@@ -126,7 +126,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -139,5 +139,5 @@ int profile_gemm_bias_relu(int argc, char* argv[]) ...@@ -139,5 +139,5 @@ int profile_gemm_bias_relu(int argc, char* argv[])
throw std::runtime_error("wrong! this data_type & layout is not implemented"); throw std::runtime_error("wrong! this data_type & layout is not implemented");
} }
return 1; return 0;
} }
...@@ -36,8 +36,8 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) ...@@ -36,8 +36,8 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n"); printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n"); printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n"); printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, StrideC1\n"); printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, StrideC1\n");
printf("arg15: split k into mulitiple batch\n"); printf("arg15: split k into mulitiple batch\n");
exit(1); exit(1);
...@@ -48,7 +48,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) ...@@ -48,7 +48,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]); const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]); const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]); const bool do_log = std::stoi(argv[6]);
const int nrepeat = std::stoi(argv[7]); const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]); const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]); const int N = std::stoi(argv[9]);
...@@ -70,7 +70,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) ...@@ -70,7 +70,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -90,7 +90,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) ...@@ -90,7 +90,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -110,7 +110,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) ...@@ -110,7 +110,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -130,7 +130,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) ...@@ -130,7 +130,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -144,5 +144,5 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) ...@@ -144,5 +144,5 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
throw std::runtime_error("wrong! this data_type & layout is not implemented"); throw std::runtime_error("wrong! this data_type & layout is not implemented");
} }
return 1; return 0;
} }
...@@ -32,8 +32,8 @@ int profile_gemm_reduce(int argc, char* argv[]) ...@@ -32,8 +32,8 @@ int profile_gemm_reduce(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n"); printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n"); printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n"); printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n"); printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
printf("arg14: split k into mulitiple batch\n"); printf("arg14: split k into mulitiple batch\n");
exit(1); exit(1);
...@@ -44,7 +44,7 @@ int profile_gemm_reduce(int argc, char* argv[]) ...@@ -44,7 +44,7 @@ int profile_gemm_reduce(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]); const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]); const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]); const bool do_log = std::stoi(argv[6]);
const int nrepeat = std::stoi(argv[7]); const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]); const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]); const int N = std::stoi(argv[9]);
...@@ -66,7 +66,7 @@ int profile_gemm_reduce(int argc, char* argv[]) ...@@ -66,7 +66,7 @@ int profile_gemm_reduce(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -87,7 +87,7 @@ int profile_gemm_reduce(int argc, char* argv[]) ...@@ -87,7 +87,7 @@ int profile_gemm_reduce(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -108,7 +108,7 @@ int profile_gemm_reduce(int argc, char* argv[]) ...@@ -108,7 +108,7 @@ int profile_gemm_reduce(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -129,7 +129,7 @@ int profile_gemm_reduce(int argc, char* argv[]) ...@@ -129,7 +129,7 @@ int profile_gemm_reduce(int argc, char* argv[])
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
M, M,
N, N,
K, K,
...@@ -142,5 +142,5 @@ int profile_gemm_reduce(int argc, char* argv[]) ...@@ -142,5 +142,5 @@ int profile_gemm_reduce(int argc, char* argv[])
throw std::runtime_error("wrong! this data_type & layout is not implemented"); throw std::runtime_error("wrong! this data_type & layout is not implemented");
} }
return 1; return 0;
} }
...@@ -54,8 +54,8 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -54,8 +54,8 @@ int profile_grouped_gemm(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n"); printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n"); printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n"); printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 " printf("arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
"64,64 64,64 128,128)\n"); "64,64 64,64 128,128)\n");
exit(1); exit(1);
...@@ -66,7 +66,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -66,7 +66,7 @@ int profile_grouped_gemm(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]); const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]); const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]); const bool do_log = std::stoi(argv[6]);
const int nrepeat = std::stoi(argv[7]); const bool time_kernel = std::stoi(argv[7]);
const auto Ms = argToIntArray(argv[8]); const auto Ms = argToIntArray(argv[8]);
const auto Ns = argToIntArray(argv[9]); const auto Ns = argToIntArray(argv[9]);
...@@ -79,6 +79,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -79,6 +79,7 @@ int profile_grouped_gemm(int argc, char* argv[])
if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN) if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{ {
ck::profiler::profile_grouped_gemm_impl<ck::half_t, ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t, ck::half_t,
ck::half_t, ck::half_t,
ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor,
...@@ -86,7 +87,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -86,7 +87,7 @@ int profile_grouped_gemm(int argc, char* argv[])
ck::tensor_layout::gemm::RowMajor>(do_verification, ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
Ms, Ms,
Ns, Ns,
Ks, Ks,
...@@ -97,6 +98,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -97,6 +98,7 @@ int profile_grouped_gemm(int argc, char* argv[])
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN) else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{ {
ck::profiler::profile_grouped_gemm_impl<ck::half_t, ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t, ck::half_t,
ck::half_t, ck::half_t,
ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor,
...@@ -104,7 +106,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -104,7 +106,7 @@ int profile_grouped_gemm(int argc, char* argv[])
ck::tensor_layout::gemm::RowMajor>(do_verification, ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
Ms, Ms,
Ns, Ns,
Ks, Ks,
...@@ -115,6 +117,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -115,6 +117,7 @@ int profile_grouped_gemm(int argc, char* argv[])
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN) else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
{ {
ck::profiler::profile_grouped_gemm_impl<ck::half_t, ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t, ck::half_t,
ck::half_t, ck::half_t,
ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::ColumnMajor,
...@@ -122,7 +125,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -122,7 +125,7 @@ int profile_grouped_gemm(int argc, char* argv[])
ck::tensor_layout::gemm::RowMajor>(do_verification, ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
Ms, Ms,
Ns, Ns,
Ks, Ks,
...@@ -133,6 +136,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -133,6 +136,7 @@ int profile_grouped_gemm(int argc, char* argv[])
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN) else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
{ {
ck::profiler::profile_grouped_gemm_impl<ck::half_t, ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t, ck::half_t,
ck::half_t, ck::half_t,
ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::ColumnMajor,
...@@ -140,7 +144,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -140,7 +144,7 @@ int profile_grouped_gemm(int argc, char* argv[])
ck::tensor_layout::gemm::RowMajor>(do_verification, ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method, init_method,
do_log, do_log,
nrepeat, time_kernel,
Ms, Ms,
Ns, Ns,
Ks, Ks,
...@@ -153,5 +157,5 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -153,5 +157,5 @@ int profile_grouped_gemm(int argc, char* argv[])
throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented"); throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
} }
return 1; return 0;
} }
#include <iostream> #include <iostream>
#include <fstream> #include <fstream>
#include <numeric>
#include <initializer_list>
#include <cstdlib> #include <cstdlib>
#include <vector> #include <vector>
#include <stdexcept> #include <stdexcept>
#include <sstream> #include <sstream>
#include <getopt.h> #include <getopt.h>
#include "config.hpp" #include "data_type_enum.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "reduction_enums.hpp" #include "reduction_enums.hpp"
#include "host_common_util.hpp"
#include "profile_reduce_impl.hpp" #include "profile_reduce_impl.hpp"
using namespace std; using namespace std;
using ck::NanPropagation;
using ck::ReduceTensorIndices;
using ck::ReduceTensorOp; using ck::ReduceTensorOp;
static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'}, static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
...@@ -38,63 +30,9 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr, ...@@ -38,63 +30,9 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,
{"bf16", no_argument, nullptr, '?'}, {"bf16", no_argument, nullptr, '?'},
{"dumpout", required_argument, nullptr, 'o'}, {"dumpout", required_argument, nullptr, 'o'},
{"verify", required_argument, nullptr, 'v'}, {"verify", required_argument, nullptr, 'v'},
{"log", required_argument, nullptr, 'l'},
{"help", no_argument, nullptr, '?'}, {"help", no_argument, nullptr, '?'},
{nullptr, 0, nullptr, 0}}; {nullptr, 0, nullptr, 0}};
template <typename T>
static T getSingleValueFromString(const string& valueStr)
{
std::istringstream iss(valueStr);
T val;
iss >> val;
return (val);
};
template <typename T>
static std::vector<T> getTypeValuesFromString(const char* cstr_values)
{
std::string valuesStr(cstr_values);
std::vector<T> values;
std::size_t pos = 0;
std::size_t new_pos;
new_pos = valuesStr.find(',', pos);
while(new_pos != std::string::npos)
{
const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
T val = getSingleValueFromString<T>(sliceStr);
values.push_back(val);
pos = new_pos + 1;
new_pos = valuesStr.find(',', pos);
};
std::string sliceStr = valuesStr.substr(pos);
T val = getSingleValueFromString<T>(sliceStr);
values.push_back(val);
return (values);
}
enum struct AppDataType
{
appHalf = 0,
appFloat = 1,
appInt32 = 2,
appInt8 = 3,
appInt8x4 = 4,
appBFloat16 = 5,
appDouble = 6,
};
static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims) static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims)
{ {
for(auto dim : reduceDims) for(auto dim : reduceDims)
...@@ -113,7 +51,7 @@ static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims ...@@ -113,7 +51,7 @@ static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims
}; };
}; };
class AppArgs class ReduceProfilerArgs
{ {
private: private:
int option_index = 0; int option_index = 0;
...@@ -130,26 +68,23 @@ class AppArgs ...@@ -130,26 +68,23 @@ class AppArgs
std::vector<float> scales; std::vector<float> scales;
ReduceTensorOp reduceOp = ReduceTensorOp::ADD; ReduceTensorOp reduceOp = ReduceTensorOp::ADD;
AppDataType compTypeId = AppDataType::appFloat; ck::DataTypeEnum compTypeId = ck::DataTypeEnum::Float;
AppDataType outTypeId = AppDataType::appFloat; ck::DataTypeEnum outTypeId = ck::DataTypeEnum::Float;
bool compType_assigned = false; bool compType_assigned = false;
bool outType_assigned = false; bool outType_assigned = false;
NanPropagation nanOpt = NanPropagation::NOT_PROPAGATE_NAN; int nanOpt = 0;
ReduceTensorIndices indicesOpt = ReduceTensorIndices::NO_INDICES; int indicesOpt = 0;
bool do_log = false; bool do_verification = false;
bool do_verification = false; bool do_dumpout = false;
bool do_dumpout = false;
int init_method; int init_method;
int nrepeat; bool time_kernel;
bool need_indices = false; ReduceProfilerArgs() = default;
~ReduceProfilerArgs() = default;
AppArgs() = default;
~AppArgs() = default;
void show_usage(const char* cmd) void show_usage(const char* cmd)
{ {
...@@ -166,8 +101,11 @@ class AppArgs ...@@ -166,8 +101,11 @@ class AppArgs
std::cout << "--outType or -W, optional enum value indicating the type of the reduced " std::cout << "--outType or -W, optional enum value indicating the type of the reduced "
"output, which could be float when the input data is half" "output, which could be float when the input data is half"
<< std::endl; << std::endl;
std::cout << "--nanOpt or -N, enum value indicates the selection for NanOpt" << std::endl; std::cout
std::cout << "--indicesOpt or -I, enum value indicates the selection for IndicesOpt" << "--nanOpt or -N, 1/0 value indicates the selection to use or not use Nan-Propagation"
<< std::endl;
std::cout << "--indicesOpt or -I, 1/0 value indicates the selection to use or not use "
"index in reduction"
<< std::endl; << std::endl;
std::cout << "--scales or -S, comma separated two float values for alpha and beta" std::cout << "--scales or -S, comma separated two float values for alpha and beta"
<< std::endl; << std::endl;
...@@ -181,18 +119,19 @@ class AppArgs ...@@ -181,18 +119,19 @@ class AppArgs
std::cout << "--dumpout or -o, 1/0 to indicate where to save the reduction result to files " std::cout << "--dumpout or -o, 1/0 to indicate where to save the reduction result to files "
"for further analysis" "for further analysis"
<< std::endl; << std::endl;
std::cout << "--log or -l, 1/0 to indicate whether to log some information" << std::endl;
}; };
int processArgs(int argc, char* argv[]) int processArgs(int argc, char* argv[])
{ {
unsigned int ch; using ck::host_common::getTypeValuesFromString;
int ch;
optind++; // to skip the "reduce" module name optind++; // to skip the "reduce" module name
while(1) while(1)
{ {
ch = getopt_long(argc, argv, "D:R:O:C:W:N:I:S:v:o:l:", long_options, &option_index); ch = getopt_long(argc, argv, "D:R:O:C:W:N:I:S:v:o:", long_options, &option_index);
if(ch == -1) if(ch == -1)
break; break;
switch(ch) switch(ch)
...@@ -219,27 +158,27 @@ class AppArgs ...@@ -219,27 +158,27 @@ class AppArgs
if(!optarg) if(!optarg)
throw std::runtime_error("Invalid option format!"); throw std::runtime_error("Invalid option format!");
compTypeId = static_cast<AppDataType>(std::atoi(optarg)); compTypeId = static_cast<ck::DataTypeEnum>(std::atoi(optarg));
compType_assigned = true; compType_assigned = true;
break; break;
case 'W': case 'W':
if(!optarg) if(!optarg)
throw std::runtime_error("Invalid option format!"); throw std::runtime_error("Invalid option format!");
outTypeId = static_cast<AppDataType>(std::atoi(optarg)); outTypeId = static_cast<ck::DataTypeEnum>(std::atoi(optarg));
outType_assigned = true; outType_assigned = true;
break; break;
case 'N': case 'N':
if(!optarg) if(!optarg)
throw std::runtime_error("Invalid option format!"); throw std::runtime_error("Invalid option format!");
nanOpt = static_cast<NanPropagation>(std::atoi(optarg)); nanOpt = std::atoi(optarg);
break; break;
case 'I': case 'I':
if(!optarg) if(!optarg)
throw std::runtime_error("Invalid option format!"); throw std::runtime_error("Invalid option format!");
indicesOpt = static_cast<ReduceTensorIndices>(std::atoi(optarg)); indicesOpt = std::atoi(optarg);
break; break;
case 'S': case 'S':
if(!optarg) if(!optarg)
...@@ -262,12 +201,6 @@ class AppArgs ...@@ -262,12 +201,6 @@ class AppArgs
do_dumpout = static_cast<bool>(std::atoi(optarg)); do_dumpout = static_cast<bool>(std::atoi(optarg));
break; break;
case 'l':
if(!optarg)
throw std::runtime_error("Invalid option format!");
do_log = static_cast<bool>(std::atoi(optarg));
break;
case '?': case '?':
if(std::string(long_options[option_index].name) == "half") if(std::string(long_options[option_index].name) == "half")
use_half = true; use_half = true;
...@@ -295,7 +228,7 @@ class AppArgs ...@@ -295,7 +228,7 @@ class AppArgs
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
init_method = std::atoi(argv[optind++]); init_method = std::atoi(argv[optind++]);
nrepeat = std::atoi(argv[optind]); time_kernel = static_cast<bool>(std::atoi(argv[optind]));
if(scales.empty()) if(scales.empty())
{ {
...@@ -306,9 +239,6 @@ class AppArgs ...@@ -306,9 +239,6 @@ class AppArgs
if(reduceOp == ReduceTensorOp::MIN || reduceOp == ReduceTensorOp::MAX || if(reduceOp == ReduceTensorOp::MIN || reduceOp == ReduceTensorOp::MAX ||
reduceOp == ReduceTensorOp::AMAX) reduceOp == ReduceTensorOp::AMAX)
{ {
if(indicesOpt != ReduceTensorIndices::NO_INDICES)
need_indices = true;
// for indexable operations, no need to assign compType and outType, just let them be // for indexable operations, no need to assign compType and outType, just let them be
// same as inType // same as inType
compType_assigned = false; compType_assigned = false;
...@@ -322,9 +252,10 @@ class AppArgs ...@@ -322,9 +252,10 @@ class AppArgs
int profile_reduce(int argc, char* argv[]) int profile_reduce(int argc, char* argv[])
{ {
using namespace ck::profiler; using ck::DataTypeEnum;
using ck::profiler::profile_reduce_impl;
AppArgs args; ReduceProfilerArgs args;
if(args.processArgs(argc, argv) < 0) if(args.processArgs(argc, argv) < 0)
return (-1); return (-1);
...@@ -339,42 +270,41 @@ int profile_reduce(int argc, char* argv[]) ...@@ -339,42 +270,41 @@ int profile_reduce(int argc, char* argv[])
if(args.use_half) if(args.use_half)
{ {
if(!args.compType_assigned) if(!args.compType_assigned)
args.compTypeId = AppDataType::appHalf; args.compTypeId = DataTypeEnum::Half;
if(args.outType_assigned && if(args.outType_assigned &&
(args.outTypeId != AppDataType::appHalf && args.outTypeId != AppDataType::appFloat)) (args.outTypeId != DataTypeEnum::Half && args.outTypeId != DataTypeEnum::Float))
args.outTypeId = AppDataType::appFloat; args.outTypeId = DataTypeEnum::Float;
if(!args.outType_assigned) if(!args.outType_assigned)
args.outTypeId = AppDataType::appHalf; args.outTypeId = DataTypeEnum::Half;
if(args.compTypeId == AppDataType::appHalf) if(args.compTypeId == DataTypeEnum::Half)
{ {
profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(args.do_verification, profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(
args.init_method, args.do_verification,
args.do_log, args.init_method,
args.do_dumpout, args.do_dumpout,
args.nrepeat, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
else if(args.compTypeId == AppDataType::appFloat) else if(args.compTypeId == DataTypeEnum::Float)
{ {
profile_reduce_impl<ck::half_t, float, ck::half_t>(args.do_verification, profile_reduce_impl<ck::half_t, float, ck::half_t>(args.do_verification,
args.init_method, args.init_method,
args.do_log,
args.do_dumpout, args.do_dumpout,
args.nrepeat, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
...@@ -385,56 +315,53 @@ int profile_reduce(int argc, char* argv[]) ...@@ -385,56 +315,53 @@ int profile_reduce(int argc, char* argv[])
{ {
profile_reduce_impl<double, double, double>(args.do_verification, profile_reduce_impl<double, double, double>(args.do_verification,
args.init_method, args.init_method,
args.do_log,
args.do_dumpout, args.do_dumpout,
args.nrepeat, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
else if(args.use_int8) else if(args.use_int8)
{ {
if(!args.compType_assigned) if(!args.compType_assigned)
args.compTypeId = AppDataType::appInt8; args.compTypeId = DataTypeEnum::Int8;
if(args.outType_assigned && if(args.outType_assigned &&
(args.outTypeId != AppDataType::appInt8 && args.outTypeId != AppDataType::appInt32)) (args.outTypeId != DataTypeEnum::Int8 && args.outTypeId != DataTypeEnum::Int32))
args.outTypeId = AppDataType::appInt32; args.outTypeId = DataTypeEnum::Int32;
if(!args.outType_assigned) if(!args.outType_assigned)
args.outTypeId = AppDataType::appInt8; args.outTypeId = DataTypeEnum::Int8;
if(args.compTypeId == AppDataType::appInt8) if(args.compTypeId == DataTypeEnum::Int8)
{ {
profile_reduce_impl<int8_t, int8_t, int8_t>(args.do_verification, profile_reduce_impl<int8_t, int8_t, int8_t>(args.do_verification,
args.init_method, args.init_method,
args.do_log,
args.do_dumpout, args.do_dumpout,
args.nrepeat, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
else if(args.compTypeId == AppDataType::appInt32) else if(args.compTypeId == DataTypeEnum::Int32)
{ {
profile_reduce_impl<int8_t, int32_t, int8_t>(args.do_verification, profile_reduce_impl<int8_t, int32_t, int8_t>(args.do_verification,
args.init_method, args.init_method,
args.do_log,
args.do_dumpout, args.do_dumpout,
args.nrepeat, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
...@@ -444,54 +371,51 @@ int profile_reduce(int argc, char* argv[]) ...@@ -444,54 +371,51 @@ int profile_reduce(int argc, char* argv[])
else if(args.use_bf16) else if(args.use_bf16)
{ {
if(args.outType_assigned && if(args.outType_assigned &&
(args.outTypeId != AppDataType::appBFloat16 && args.outTypeId != AppDataType::appFloat)) (args.outTypeId != DataTypeEnum::BFloat16 && args.outTypeId != DataTypeEnum::Float))
args.outTypeId = AppDataType::appFloat; args.outTypeId = DataTypeEnum::Float;
if(!args.outType_assigned) if(!args.outType_assigned)
args.outTypeId = AppDataType::appBFloat16; args.outTypeId = DataTypeEnum::BFloat16;
profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(args.do_verification, profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(args.do_verification,
args.init_method, args.init_method,
args.do_log,
args.do_dumpout, args.do_dumpout,
args.nrepeat, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
else else
{ {
if(args.compTypeId == AppDataType::appFloat) if(args.compTypeId == DataTypeEnum::Float)
{ {
profile_reduce_impl<float, float, float>(args.do_verification, profile_reduce_impl<float, float, float>(args.do_verification,
args.init_method, args.init_method,
args.do_log,
args.do_dumpout, args.do_dumpout,
args.nrepeat, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
else if(args.compTypeId == AppDataType::appDouble) else if(args.compTypeId == DataTypeEnum::Double)
{ {
profile_reduce_impl<float, double, float>(args.do_verification, profile_reduce_impl<float, double, float>(args.do_verification,
args.init_method, args.init_method,
args.do_log,
args.do_dumpout, args.do_dumpout,
args.nrepeat, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
......
...@@ -13,6 +13,7 @@ int profile_gemm_bias_relu_add(int, char*[]); ...@@ -13,6 +13,7 @@ int profile_gemm_bias_relu_add(int, char*[]);
int profile_gemm_reduce(int, char*[]); int profile_gemm_reduce(int, char*[]);
int profile_batched_gemm(int, char*[]); int profile_batched_gemm(int, char*[]);
int profile_grouped_gemm(int, char*[]); int profile_grouped_gemm(int, char*[]);
int profile_conv_fwd(int, char*[]);
int profile_conv_fwd_bias_relu(int, char*[]); int profile_conv_fwd_bias_relu(int, char*[]);
int profile_conv_fwd_bias_relu_add(int, char*[]); int profile_conv_fwd_bias_relu_add(int, char*[]);
int profile_conv_fwd_bias_relu_atomic_add(int, char*[]); int profile_conv_fwd_bias_relu_atomic_add(int, char*[]);
...@@ -53,7 +54,7 @@ int main(int argc, char* argv[]) ...@@ -53,7 +54,7 @@ int main(int argc, char* argv[])
} }
else if(strcmp(argv[1], "grouped_gemm") == 0) else if(strcmp(argv[1], "grouped_gemm") == 0)
{ {
profile_grouped_gemm(argc, argv); return profile_grouped_gemm(argc, argv);
} }
else if(strcmp(argv[1], "conv_fwd") == 0) else if(strcmp(argv[1], "conv_fwd") == 0)
{ {
...@@ -107,7 +108,7 @@ int main(int argc, char* argv[]) ...@@ -107,7 +108,7 @@ int main(int argc, char* argv[])
" conv1d_bwd_data: BackwardConvolution data 1 dim\n" " conv1d_bwd_data: BackwardConvolution data 1 dim\n"
" conv2d_bwd_data: BackwardConvolution data 2 dim\n" " conv2d_bwd_data: BackwardConvolution data 2 dim\n"
" conv3d_bwd_data: BackwardConvolution data 3 dim\n" " conv3d_bwd_data: BackwardConvolution data 3 dim\n"
" reduce: REDUCE\n" " reduce: Reduce\n"
" conv2d_bwd_weight: Backward Weight Convolution 2d\n"); " conv2d_bwd_weight: Backward Weight Convolution 2d\n");
// clang-format on // clang-format on
} }
......
#!/usr/bin/env python3
import os, io, sys, argparse, datetime
import numpy as np
import sqlalchemy
from sqlalchemy.types import NVARCHAR, Float, Integer
import pymysql
import pandas as pd
from sshtunnel import SSHTunnelForwarder
def print_to_string(*args, **kwargs):
output = io.StringIO()
print(*args, file=output, **kwargs)
contents = output.getvalue()
output.close()
return contents
def parse_args():
parser = argparse.ArgumentParser(description='Parse results from ckProfiler benchmark runs')
parser.add_argument('filename', type=str, help='Log file to parse or directory containing log files')
args = parser.parse_args()
files = []
if os.path.isdir(args.filename):
all_files = os.listdir(args.filename)
for name in all_files:
if 'log' not in name:
continue
files.append(os.path.join(args.filename, name))
else:
files = [args.filename]
args.files = files
return args
def main():
args = parse_args()
tests = []
kernels=[]
tflops=[]
dtype=[]
alayout=[]
blayout=[]
M=[]
N=[]
K=[]
StrideA=[]
StrideB=[]
StrideC=[]
#parse results, get the Tflops value for "Best Perf" kernels
glue=""
for filename in args.files:
for line in open(filename):
if 'Branch name' in line:
lst=line.split()
branch_name=lst[2]
for filename in args.files:
for line in open(filename):
if 'Best Perf' in line:
lst=line.split()
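#field positions in a whitespace-split "Best Perf" line, as used below: 5=dtype, 8=ALayout, 11=BLayout, 14=M, 17=N, 20=K, 23=StrideA, 26=StrideB, 29=StrideC, 33=TFlops, 37+=kernel name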
if len(lst)>=37: #the line is complete
tests.append(glue.join(lst[5:30]))
kernels.append(glue.join(lst[37:]))
tflops.append(lst[33])
dtype.append(lst[5])
alayout.append(lst[8])
blayout.append(lst[11])
M.append(lst[14])
N.append(lst[17])
K.append(lst[20])
StrideA.append(lst[23])
StrideB.append(lst[26])
StrideC.append(lst[29])
elif len(lst)<37 and len(lst)>=33: #the tflops are available
tests.append(glue.join(lst[5:30]))
kernels.append("N/A")
tflops.append(lst[33])
dtype.append(lst[5])
alayout.append(lst[8])
blayout.append(lst[11])
M.append(lst[14])
N.append(lst[17])
K.append(lst[20])
StrideA.append(lst[23])
StrideB.append(lst[26])
StrideC.append(lst[29])
print("warning: incomplete line:",lst)
elif len(lst)<33: #even the tflops are not available
print("Error in ckProfiler output!")
print("warning: incomplete line=",lst)
#sort results
print("Number of tests:",len(tests))
print("Branch name:",branch_name)
#sorted_tests = sorted(tests)
#print("sorted tests:",sorted_tests)
sorted_tflops = [x for _,x in sorted(zip(tests,tflops))]
#sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
test_list=list(range(1,len(tests)+1))
sql_hostname = '127.0.0.1'
sql_username = os.environ["dbuser"]
print("sql_username=",sql_username)
sql_password = os.environ["dbpassword"]
sql_main_database = 'miopen_perf'
sql_port = 3306
ssh_host = os.environ["dbsship"]
print("ssh_host=",ssh_host)
ssh_user = os.environ["dbsshuser"]
print("ssh_user=",ssh_user)
ssh_port = int(os.environ["dbsshport"])
ssh_pass = os.environ["dbsshpassword"]
with SSHTunnelForwarder(
(ssh_host, ssh_port),
ssh_username=ssh_user,
ssh_password=ssh_pass,
remote_bind_address=(sql_hostname, sql_port)) as tunnel:
sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}:{3}/{4}'.
format(sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database))
conn = sqlEngine.connect()
#write the ck_gemm_test_params table
#only needed once the test set changes
'''
sorted_dtypes = [x for _,x in sorted(zip(tests,dtype))]
sorted_alayout = [x for _,x in sorted(zip(tests,alayout))]
sorted_blayout = [x for _,x in sorted(zip(tests,blayout))]
sorted_M = [x for _,x in sorted(zip(tests,M))]
sorted_N = [x for _,x in sorted(zip(tests,N))]
sorted_K = [x for _,x in sorted(zip(tests,K))]
sorted_StrideA = [x for _,x in sorted(zip(tests,StrideA))]
sorted_StrideB = [x for _,x in sorted(zip(tests,StrideB))]
sorted_StrideC = [x for _,x in sorted(zip(tests,StrideC))]
ck_gemm_params=[test_list,sorted_dtypes,sorted_alayout,sorted_blayout,
sorted_M,sorted_N,sorted_K,sorted_StrideA,sorted_StrideB,
sorted_StrideC]
df=pd.DataFrame(np.transpose(ck_gemm_params),columns=['Test_number','Data_type',
'Alayout','BLayout','M','N','K', 'StrideA','StrideB','StrideC'])
print(df)
dtypes = {
'Test_number': Integer(),
'Data_type': NVARCHAR(length=5),
'Alayout': NVARCHAR(length=12),
'Blayout': NVARCHAR(length=12),
'M': Integer(),
'N': Integer(),
'K': Integer(),
'StrideA': Integer(),
'StrideB': Integer(),
'StrideC': Integer()
}
df.to_sql("ck_gemm_test_params",conn,if_exists='replace',index=False, dtype=dtypes)
'''
#read baseline results for the latest develop branch
query = '''SELECT * from ck_gemm_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_gemm_tflops where Branch_ID='develop' );'''
tflops_base = pd.read_sql_query(query, conn)
#write new results to the db
testlist=[]
for i in range(1,len(tests)+1):
testlist.append("Test%i"%i)
ck_gemm_tflops=[str(branch_name),str(datetime.datetime.now())]
flops=pd.DataFrame(data=[ck_gemm_tflops],columns=['Branch_ID','Datetime'])
df_add=pd.DataFrame(data=[sorted_tflops],columns=testlist)
flops=pd.concat([flops,df_add],axis=1)
print("new tflops results:",flops)
flops.to_sql("ck_gemm_tflops",conn,if_exists='append',index=False)
conn.close()
#compare the results to the baseline
regression=0
base=tflops_base[testlist].to_numpy(dtype='float')
base_list=base[0]
ave_perf=0
for i in range(len(base_list)):
# success criterion:
if base_list[i]>1.01*float(sorted_tflops[i]):
print("test # ",i,"shows regression by {:.3f}%".format(
(float(sorted_tflops[i])-base_list[i])/base_list[i]*100))
regression=1
ave_perf=ave_perf+float(sorted_tflops[i])/base_list[i]
if regression==0:
print("no regressions found")
ave_perf=ave_perf/len(base_list)
print("average performance relative to baseline:",ave_perf)
#return 0 if performance criteria met, otherwise return 1
return regression
if __name__ == '__main__':
    sys.exit(main())
\ No newline at end of file
#!/bin/bash #!/bin/bash
## GPU visibility ## GPU visibility
export HIP_VISIBLE_DEVICES=0 export HIP_VISIBLE_DEVICES=0
#make -j ckProfiler
make -j ckProfiler DRIVER="../build/bin/ckProfiler"
echo $DRIVER
DRIVER="./profiler/ckProfiler"
OP=$1 OP=$1
DATATYPE=$2 DATATYPE=$2
LAYOUT=$3 LAYOUT=$3
...@@ -43,3 +41,13 @@ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1088 1 ...@@ -43,3 +41,13 @@ $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1088 1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2112 2112 2112 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2112 2112 2112
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4160 4160 4160 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4160 4160 4160
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8256 8256 8256 $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8256 8256 8256
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 6656 8192 8192 -1 -1 -1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3328 4096 4096 -1 -1 -1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1664 2048 2048 -1 -1 -1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 832 1024 1024 -1 -1 -1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 7040 8192 8192 -1 -1 -1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 5120 5632 4096 -1 -1 -1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2560 2816 2048 -1 -1 -1
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1280 1408 1024 -1 -1 -1
...@@ -15,6 +15,17 @@ bin/test_reduce_no_index -D 64,4,280,82 -R 1 0 2 ...@@ -15,6 +15,17 @@ bin/test_reduce_no_index -D 64,4,280,82 -R 1 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 0 2 bin/test_reduce_no_index -D 64,4,280,82 -R 2 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 0 2 bin/test_reduce_no_index -D 64,4,280,82 -R 3 0 2
## for float64
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 6 2
## for float16 ## for float16
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 1 2 bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 1 2 bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 1 2
......
...@@ -15,6 +15,17 @@ bin/test_reduce_with_index -D 64,4,280,82 -R 1 0 2 ...@@ -15,6 +15,17 @@ bin/test_reduce_with_index -D 64,4,280,82 -R 1 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 0 2 bin/test_reduce_with_index -D 64,4,280,82 -R 2 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 0 2 bin/test_reduce_with_index -D 64,4,280,82 -R 3 0 2
## for float64
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 6 2
## for float16 ## for float16
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 1 2 bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 1 2 bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 1 2
......
include_directories(BEFORE include_directories(BEFORE
${PROJECT_SOURCE_DIR}/
${PROJECT_SOURCE_DIR}/include/ck ${PROJECT_SOURCE_DIR}/include/ck
${PROJECT_SOURCE_DIR}/include/ck/utility ${PROJECT_SOURCE_DIR}/include/ck/utility
${PROJECT_SOURCE_DIR}/include/ck/host_utility
${PROJECT_SOURCE_DIR}/include/ck/tensor_description ${PROJECT_SOURCE_DIR}/include/ck/tensor_description
${PROJECT_SOURCE_DIR}/include/ck/tensor ${PROJECT_SOURCE_DIR}/include/ck/tensor
${PROJECT_SOURCE_DIR}/include/ck/problem_transform ${PROJECT_SOURCE_DIR}/include/ck/problem_transform
...@@ -21,7 +23,8 @@ include_directories(BEFORE ...@@ -21,7 +23,8 @@ include_directories(BEFORE
${PROJECT_SOURCE_DIR}/external/include/half ${PROJECT_SOURCE_DIR}/external/include/half
) )
add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR}) include(googletest)
add_custom_target(tests) add_custom_target(tests)
...@@ -41,7 +44,7 @@ function(add_gtest_executable TEST_NAME) ...@@ -41,7 +44,7 @@ function(add_gtest_executable TEST_NAME)
add_dependencies(tests ${TEST_NAME}) add_dependencies(tests ${TEST_NAME})
add_dependencies(check ${TEST_NAME}) add_dependencies(check ${TEST_NAME})
# suppress gtest warnings # suppress gtest warnings
target_compile_options(${TEST_NAME} PRIVATE -Wno-global-constructors) target_compile_options(${TEST_NAME} PRIVATE -Wno-global-constructors -Wno-undef)
target_link_libraries(${TEST_NAME} PRIVATE gtest_main) target_link_libraries(${TEST_NAME} PRIVATE gtest_main)
gtest_discover_tests(${TEST_NAME}) gtest_discover_tests(${TEST_NAME})
endfunction(add_gtest_executable TEST_NAME) endfunction(add_gtest_executable TEST_NAME)
...@@ -60,3 +63,6 @@ add_subdirectory(grouped_gemm) ...@@ -60,3 +63,6 @@ add_subdirectory(grouped_gemm)
add_subdirectory(convnd_fwd) add_subdirectory(convnd_fwd)
add_subdirectory(reduce) add_subdirectory(reduce)
add_subdirectory(conv2d_bwd_weight) add_subdirectory(conv2d_bwd_weight)
add_subdirectory(convnd_bwd_data)
add_subdirectory(block_to_ctile_map)
# DO NOT add client_app; it is tested via CI independently
...@@ -22,7 +22,7 @@ int main() ...@@ -22,7 +22,7 @@ int main()
Row, Row,
Row, Row,
Row>( Row>(
true, 1, false, 1, M, N, K, K, N, N, BatchCount); true, 1, false, false, M, N, K, K, N, N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t, pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
ck::half_t, ck::half_t,
...@@ -31,7 +31,7 @@ int main() ...@@ -31,7 +31,7 @@ int main()
Row, Row,
Col, Col,
Row>( Row>(
true, 1, false, 1, M, N, K, K, K, N, BatchCount); true, 1, false, false, M, N, K, K, K, N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t, pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
ck::half_t, ck::half_t,
...@@ -40,7 +40,7 @@ int main() ...@@ -40,7 +40,7 @@ int main()
Col, Col,
Row, Row,
Row>( Row>(
true, 1, false, 1, M, N, K, M, N, N, BatchCount); true, 1, false, false, M, N, K, M, N, N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t, pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
ck::half_t, ck::half_t,
...@@ -49,7 +49,7 @@ int main() ...@@ -49,7 +49,7 @@ int main()
Col, Col,
Col, Col,
Row>( Row>(
true, 1, false, 1, M, N, K, M, K, N, BatchCount); true, 1, false, false, M, N, K, M, K, N, BatchCount);
if(pass) if(pass)
{ {
......
add_gtest_executable(test_block_to_ctile_map test_block_to_ctile_map.cpp)
\ No newline at end of file
#include <ck/config.hpp>
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "gtest/gtest.h"
#include <iostream>
#include <vector>
using namespace ck;
static auto I0 = Number<0>{};
static auto I1 = Number<1>{};
static auto I2 = Number<2>{};
TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck1)
{
const index_t M = 384;
const index_t N = 384;
const index_t MPerBlock = 128;
const index_t NPerBlock = 128;
const index_t MBlock = M / MPerBlock;
const index_t NBlock = N / NPerBlock;
const index_t M01 = 4;
const index_t N01 = 4;
auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
printf("(M, N, MPerBlock, NPerBlock, M01, N01) = (%d, %d, %d, %d, %d, %d)\n",
M,
N,
MPerBlock,
NPerBlock,
M01,
N01);
BlockToCTileMap_M00_N00_M01_N01<MPerBlock, NPerBlock, decltype(c_grid_desc_m_n), true> tile_map(
c_grid_desc_m_n, M01, N01);
EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == true);
EXPECT_TRUE(tile_map.CalculateGridSize(c_grid_desc_m_n) == 16);
// clang-format off
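// each row: {m0 block index, n0 block index, expected ValidCTileIndex result}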
std::vector<std::vector<int>> expected_m0idx_n0idx_valid = {
{0, 0, 1},
{0, 1, 1},
{0, 2, 1},
{0, 3, 0},
{1, 0, 1},
{1, 1, 1},
{1, 2, 1},
{1, 3, 0},
{2, 0, 1},
{2, 1, 1},
{2, 2, 1},
{2, 3, 0},
{3, 0, 0},
{3, 1, 0},
{3, 2, 0},
{3, 3, 0}
};
// clang-format on
for(index_t i = 0; i < tile_map.CalculateGridSize(c_grid_desc_m_n); i++)
{
auto m0n0_idx = tile_map.CalculateBottomIndex(make_multi_index(i));
std::cout << "block_1d_id = " << i << ", m0, n0 = " << m0n0_idx[I0] << ", " << m0n0_idx[I1];
std::cout << ", valid = " << tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))
<< std::endl;
bool equal =
expected_m0idx_n0idx_valid[i] ==
std::vector<int>{m0n0_idx[I0],
m0n0_idx[I1],
tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))};
EXPECT_TRUE(equal);
}
}
TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck0)
{
const index_t M = 384;
const index_t N = 384;
const index_t MPerBlock = 128;
const index_t NPerBlock = 128;
const index_t M01 = 4;
const index_t N01 = 4;
auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
printf("(M, N, MPerBlock, NPerBlock, M01, N01) = (%d, %d, %d, %d, %d, %d)\n",
M,
N,
MPerBlock,
NPerBlock,
M01,
N01);
BlockToCTileMap_M00_N00_M01_N01<MPerBlock, NPerBlock, decltype(c_grid_desc_m_n), false>
tile_map(c_grid_desc_m_n, M01, N01);
EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == false);
}
TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N0_M01_DeviceCTileIndexCheck1)
{
const index_t M = 384;
const index_t N = 512;
const index_t MPerBlock = 128;
const index_t NPerBlock = 128;
const index_t MBlock = M / MPerBlock;
const index_t NBlock = N / NPerBlock;
const index_t M01 = 4;
auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
printf("(M, N, MPerBlock, NPerBlock, M01) = (%d, %d, %d, %d, %d)\n",
M,
N,
MPerBlock,
NPerBlock,
M01);
BlockToCTileMap_M00_N0_M01<MPerBlock, NPerBlock, decltype(c_grid_desc_m_n), true> tile_map(
c_grid_desc_m_n, M01);
EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == true);
EXPECT_TRUE(tile_map.CalculateGridSize(c_grid_desc_m_n) == 16);
// clang-format off
std::vector<std::vector<int>> expected_m0idx_n0idx_valid = {
{0, 0, 1},
{1, 0, 1},
{2, 0, 1},
{3, 0, 0},
{0, 1, 1},
{1, 1, 1},
{2, 1, 1},
{3, 1, 0},
{0, 2, 1},
{1, 2, 1},
{2, 2, 1},
{3, 2, 0},
{0, 3, 1},
{1, 3, 1},
{2, 3, 1},
{3, 3, 0}
};
// clang-format on
for(index_t i = 0; i < tile_map.CalculateGridSize(c_grid_desc_m_n); i++)
{
auto m0n0_idx = tile_map.CalculateBottomIndex(make_multi_index(i));
std::cout << "block_1d_id = " << i << ", m0, n0 = " << m0n0_idx[I0] << ", " << m0n0_idx[I1];
std::cout << ", valid = " << tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))
<< std::endl;
bool equal =
expected_m0idx_n0idx_valid[i] ==
std::vector<int>{m0n0_idx[I0],
m0n0_idx[I1],
tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))};
EXPECT_TRUE(equal);
}
}
TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N0_M01_DeviceCTileIndexCheck0)
{
const index_t M = 512;
const index_t N = 384;
const index_t MPerBlock = 128;
const index_t NPerBlock = 128;
auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
// clang-format off
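// each row: {M01, expected grid size, expected CheckValidity result}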
std::vector<std::tuple<int, int, bool>> expected_m0_gridsize_validity = {
{5, 15, false},
{4, 12, true},
{3, 18, false},
{2, 12, true},
{1, 12, true}
};
// clang-format on
for(auto e : expected_m0_gridsize_validity)
{
const index_t M01 = std::get<0>(e);
printf("(M, N, MPerBlock, NPerBlock, M01) = (%d, %d, %d, %d, %d)\n",
M,
N,
MPerBlock,
NPerBlock,
M01);
BlockToCTileMap_M00_N0_M01<MPerBlock, NPerBlock, decltype(c_grid_desc_m_n), false> tile_map(
c_grid_desc_m_n, M01);
EXPECT_EQ(tile_map.CalculateGridSize(c_grid_desc_m_n), std::get<1>(e));
EXPECT_EQ(tile_map.CheckValidity(c_grid_desc_m_n), std::get<2>(e));
}
}
TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N0_M01Adapt)
{
const index_t M = 768;
const index_t N = 384;
const index_t MPerBlock = 128;
const index_t NPerBlock = 128;
const index_t MBlock = M / MPerBlock;
const index_t NBlock = N / NPerBlock;
constexpr index_t M01 = 4;
auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
printf("(M, N, MPerBlock, NPerBlock, M01) = (%d, %d, %d, %d, %d)\n",
M,
N,
MPerBlock,
NPerBlock,
M01);
BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, decltype(c_grid_desc_m_n)> tile_map(
c_grid_desc_m_n, M01);
EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == true);
EXPECT_TRUE(tile_map.CalculateGridSize(c_grid_desc_m_n) == 18);
// clang-format off
std::vector<std::vector<int>> expected_m0idx_n0idx_valid = {
{0, 0, 1},
{1, 0, 1},
{2, 0, 1},
{3, 0, 1},
{0, 1, 1},
{1, 1, 1},
{2, 1, 1},
{3, 1, 1},
{0, 2, 1},
{1, 2, 1},
{2, 2, 1},
{3, 2, 1},
{4, 0, 1},
{5, 0, 1},
{4, 1, 1},
{5, 1, 1},
{4, 2, 1},
{5, 2, 1},
};
// clang-format on
for(index_t i = 0; i < tile_map.CalculateGridSize(c_grid_desc_m_n); i++)
{
auto m0n0_idx = tile_map.CalculateBottomIndex(make_multi_index(i));
std::cout << "block_1d_id = " << i << ", m0, n0 = " << m0n0_idx[I0] << ", " << m0n0_idx[I1];
std::cout << ", valid = " << tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))
<< std::endl;
bool equal =
expected_m0idx_n0idx_valid[i] ==
std::vector<int>{m0n0_idx[I0],
m0n0_idx[I1],
tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))};
EXPECT_TRUE(equal);
}
}
TEST(BlockToCTileMap, TestBlockToCTileMap_KSplit_M00_N0_M01Adapt)
{
const index_t M = 768;
const index_t N = 384;
const index_t MPerBlock = 128;
const index_t NPerBlock = 128;
const index_t MBlock = M / MPerBlock;
const index_t NBlock = N / NPerBlock;
constexpr index_t M01 = 4;
const index_t KSplit = 3;
auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
printf("(M, N, MPerBlock, NPerBlock, M01) = (%d, %d, %d, %d, %d)\n",
M,
N,
MPerBlock,
NPerBlock,
M01);
BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, decltype(c_grid_desc_m_n)>
tile_map(c_grid_desc_m_n, M01, KSplit);
EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == true);
EXPECT_TRUE(tile_map.CalculateGridSize(c_grid_desc_m_n) == 18 * KSplit);
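// each row: {ksplit index, m0 block index, n0 block index, expected ValidCTileIndex result}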
std::vector<std::vector<int>> expected_ksplitidx_m0idx_n0idx_valid = {
{0, 0, 0, 1}, {0, 1, 0, 1}, {0, 2, 0, 1}, {0, 3, 0, 1}, {0, 0, 1, 1}, {0, 1, 1, 1},
{0, 2, 1, 1}, {0, 3, 1, 1}, {0, 0, 2, 1}, {0, 1, 2, 1}, {0, 2, 2, 1}, {0, 3, 2, 1},
{0, 4, 0, 1}, {0, 5, 0, 1}, {0, 4, 1, 1}, {0, 5, 1, 1}, {0, 4, 2, 1}, {0, 5, 2, 1},
{1, 0, 0, 1}, {1, 1, 0, 1}, {1, 2, 0, 1}, {1, 3, 0, 1}, {1, 0, 1, 1}, {1, 1, 1, 1},
{1, 2, 1, 1}, {1, 3, 1, 1}, {1, 0, 2, 1}, {1, 1, 2, 1}, {1, 2, 2, 1}, {1, 3, 2, 1},
{1, 4, 0, 1}, {1, 5, 0, 1}, {1, 4, 1, 1}, {1, 5, 1, 1}, {1, 4, 2, 1}, {1, 5, 2, 1},
{2, 0, 0, 1}, {2, 1, 0, 1}, {2, 2, 0, 1}, {2, 3, 0, 1}, {2, 0, 1, 1}, {2, 1, 1, 1},
{2, 2, 1, 1}, {2, 3, 1, 1}, {2, 0, 2, 1}, {2, 1, 2, 1}, {2, 2, 2, 1}, {2, 3, 2, 1},
{2, 4, 0, 1}, {2, 5, 0, 1}, {2, 4, 1, 1}, {2, 5, 1, 1}, {2, 4, 2, 1}, {2, 5, 2, 1},
};
for(index_t i = 0; i < tile_map.CalculateGridSize(c_grid_desc_m_n); i++)
{
auto ksplitm0n0_idx = tile_map.CalculateBottomIndex(make_multi_index(i));
std::cout << "block_1d_id = " << i << ", ksplit, m0, n0 = " << ksplitm0n0_idx[I0] << ", "
<< ksplitm0n0_idx[I1] << ", " << ksplitm0n0_idx[I2];
std::cout << ", valid = "
<< tile_map.ValidCTileIndex(ksplitm0n0_idx, make_tuple(MBlock, NBlock))
<< std::endl;
bool equal =
expected_ksplitidx_m0idx_n0idx_valid[i] ==
std::vector<int>{ksplitm0n0_idx[I0],
ksplitm0n0_idx[I1],
ksplitm0n0_idx[I2],
tile_map.ValidCTileIndex(ksplitm0n0_idx, make_tuple(MBlock, NBlock))};
EXPECT_TRUE(equal);
}
}
cmake_minimum_required(VERSION 3.15)
project(ck_app)
add_compile_options(-std=c++14)
find_package(composable_kernel 1.0.0 COMPONENTS device_operations host_tensor)
find_package(hip REQUIRED PATHS /opt/rocm)
message(STATUS "Build with HIP ${hip_VERSION}")
add_executable(test_client_app client_app.cpp)
target_link_libraries(test_client_app PRIVATE composable_kernel::device_operations composable_kernel::host_tensor hip::host)
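# A minimal out-of-tree configure/build/run sequence might look like the following;
# the install prefixes and the conv_fwd arguments are purely illustrative:
#   cmake -D CMAKE_PREFIX_PATH="/opt/rocm;/opt/composable_kernel" <path-to-this-dir>
#   make test_client_app
#   ./test_client_app conv_fwd 1 1 1 1 0 1 0 1 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1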
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <half.hpp>
#include <vector>
#include "client_app_impl.hpp"
int main(int argc, char* argv[])
{
if(argc != 25)
{
printf("arg1: tensor operation (conv_fwd: ForwardConvolution)\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg9: time kernel (0=n0, 1=yes)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
}
// Layout arguments are parsed for completeness but not forwarded below;
// profile_conv_fwd_impl only registers NHWC/KYXC/NHWK instances.
const ConvDataType data_type      = static_cast<ConvDataType>(std::stoi(argv[2]));
const ConvInputLayout in_layout   = static_cast<ConvInputLayout>(std::stoi(argv[3]));
const ConvWeightLayout wei_layout = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
const ConvOutputLayout out_layout = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
const bool time_kernel = std::stoi(argv[9]);
const ck::index_t N = std::stoi(argv[10]);
const ck::index_t K = std::stoi(argv[11]);
const ck::index_t C = std::stoi(argv[12]);
const ck::index_t Y = std::stoi(argv[13]);
const ck::index_t X = std::stoi(argv[14]);
const ck::index_t Hi = std::stoi(argv[15]);
const ck::index_t Wi = std::stoi(argv[16]);
const ck::index_t conv_stride_h = std::stoi(argv[17]);
const ck::index_t conv_stride_w = std::stoi(argv[18]);
const ck::index_t conv_dilation_h = std::stoi(argv[19]);
const ck::index_t conv_dilation_w = std::stoi(argv[20]);
const ck::index_t in_left_pad_h = std::stoi(argv[21]);
const ck::index_t in_left_pad_w = std::stoi(argv[22]);
const ck::index_t in_right_pad_h = std::stoi(argv[23]);
const ck::index_t in_right_pad_w = std::stoi(argv[24]);
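// Standard convolution output-size arithmetic: the dilated filter extent is (Y - 1) * Dy + 1,
// and Ho = (Hi + LeftPy + RightPy - YEff) / Sy + 1 (likewise for the W dimension).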
const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
ck::app::profile_conv_fwd_impl(do_verification,
init_method,
do_log,
time_kernel,
data_type,
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
return 0;
}
#pragma once
#include "host_interface.hpp"
enum ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3
};
enum ConvInputLayout
{
NCHW, // 0
NHWC, // 1
};
enum ConvWeightLayout
{
KCYX, // 0
KYXC, // 1
};
enum ConvOutputLayout
{
NKHW, // 0
NHWK, // 1
};
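// Host-side HIP helpers for the standalone client: last-error checking, device name, and
// driver version queries.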
void check_hip_error(void)
{
hipError_t err = hipGetLastError();
if(err != hipSuccess)
{
std::cerr << "Error: " << hipGetErrorString(err) << std::endl;
exit(err);
}
}
std::string getDeviceName(int device)
{
struct hipDeviceProp_t prop;
hipGetDeviceProperties(&prop, device);
check_hip_error();
return std::string(prop.name);
}
int getDriver(void)
{
int driver;
hipDriverGetVersion(&driver);
check_hip_error();
return driver;
}
namespace ck {
namespace app {
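// Minimal RAII wrapper around a HIP device allocation used by this client app.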
struct DeviceMem
{
DeviceMem() = delete;
DeviceMem(std::size_t mem_size);
void* GetDeviceBuffer();
void ToDevice(const void* p);
void FromDevice(void* p);
~DeviceMem();
void* mpDeviceBuf;
std::size_t mMemSize;
};
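// Out-of-line member definitions. HIP return codes are passed to hipGetErrorString and otherwise
// ignored, so allocation/copy failures are not propagated to the caller.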
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
}
void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
void DeviceMem::ToDevice(const void* p)
{
hipGetErrorString(
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
}
void DeviceMem::FromDevice(void* p)
{
hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
}
DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
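// Enumerate the device Conv instances registered for the requested data type, run every instance
// that supports the given problem, and report the fastest one.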
void profile_conv_fwd_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
ConvDataType data_type,
ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads)
{
const ck::index_t Y = filter_spatial_lengths[0];
const ck::index_t X = filter_spatial_lengths[1];
const ck::index_t Hi = input_spatial_lengths[0];
const ck::index_t Wi = input_spatial_lengths[1];
const ck::index_t Ho = output_spatial_lengths[0];
const ck::index_t Wo = output_spatial_lengths[1];
const auto in_sz = N * C * Hi * Wi;
const auto wei_sz = K * C * Y * X;
const auto out_sz = N * K * Ho * Wo;
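// Note: buffers below are sized for fp32 elements regardless of data_type, which over-allocates
// for f16/bf16/int8 but is large enough for every supported type.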
using WeiDataType = float;
using InDataType = float;
using OutDataType = float;
app::DeviceMem in_device_buf(sizeof(InDataType) * in_sz);
app::DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_sz);
app::DeviceMem out_device_buf(sizeof(OutDataType) * out_sz);
// data is already on device!
// add device Conv instances
std::vector<DeviceConvFwdPtr_t> conv_ptrs;
if(data_type == F16_F16_F16)
{
add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances_t(conv_ptrs);
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances_t(conv_ptrs);
}
else if(data_type == BF16_BF16_BF16)
{
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances_t(conv_ptrs);
}
else if(data_type == F32_F32_F32)
{
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances_t(conv_ptrs);
}
else if(data_type == INT8_INT8_INT8)
{
add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances_t(conv_ptrs);
}
else
{
throw std::runtime_error("wrong! Invalid data type");
}
if(conv_ptrs.empty())
{
throw std::runtime_error("wrong! no device Conv instance found");
}
std::string best_conv_name;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int deviceIndex = 0;
hipSetDevice(deviceIndex);
check_hip_error();
StreamConfig stream_config{nullptr, time_kernel};
hipStreamCreate(&stream_config.stream_id_);
check_hip_error();
// profile device Conv instances
for(auto& conv_ptr : conv_ptrs)
{
auto argument_ptr =
conv_ptr.MakeArgumentPointer(static_cast<void*>(in_device_buf.GetDeviceBuffer()),
static_cast<void*>(wei_device_buf.GetDeviceBuffer()),
static_cast<void*>(out_device_buf.GetDeviceBuffer()),
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
auto invoker_ptr = conv_ptr.MakeInvokerPointer();
if(conv_ptr.IsSupportedArgument(argument_ptr.get()))
{
std::string conv_name = conv_ptr.GetTypeString();
float ave_time = invoker_ptr->Run(argument_ptr.get(), stream_config);
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
sizeof(WeiDataType) * (K * C * Y * X) +
sizeof(OutDataType) * (N * K * Ho * Wo);
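// ave_time is in milliseconds, so GFLOP / ms = TFLOP/s and MB / ms = GB/s.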
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s, " << conv_name << std::endl;
if(tflops > best_tflops)
{
best_conv_name = conv_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
}
} // namespace app
} // namespace ck
...@@ -4,4 +4,4 @@ include_directories(BEFORE ...@@ -4,4 +4,4 @@ include_directories(BEFORE
) )
add_test_executable(test_conv2d_bwd_weight conv2d_bwd_weight.cpp) add_test_executable(test_conv2d_bwd_weight conv2d_bwd_weight.cpp)
target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor device_conv2d_bwd_weight_instance conv_fwd_util) target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor device_conv2d_bwd_weight_instance conv_util)
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include <half.hpp> #include <half.hpp>
#include <vector> #include <vector>
#include "conv_fwd_util.hpp" #include "conv_util.hpp"
#include "profile_conv_bwd_weight_impl.hpp" #include "profile_conv_bwd_weight_impl.hpp"
int test_self() int test_self()
...@@ -28,20 +28,20 @@ int test_self() ...@@ -28,20 +28,20 @@ int test_self()
ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>( ck::tensor_layout::convolution::NHWK>(
1, // do_verification, true, // do_verification
1, // init_method, 1, // init_method
0, // do_log, false, // do_log
1, // nrepeat, false, // time_kernel
param.N, param.N_,
param.K, param.K_,
param.C, param.C_,
param.input_spatial_lengths, param.input_spatial_lengths_,
param.filter_spatial_lengths, param.filter_spatial_lengths_,
param.GetOutputSpatialLengths(), param.GetOutputSpatialLengths(),
param.conv_filter_strides, param.conv_filter_strides_,
param.conv_filter_dilations, param.conv_filter_dilations_,
param.input_left_pads, param.input_left_pads_,
param.input_right_pads, param.input_right_pads_,
2); 2);
// fp16 // fp16
...@@ -52,28 +52,28 @@ int test_self() ...@@ -52,28 +52,28 @@ int test_self()
ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>( ck::tensor_layout::convolution::NHWK>(
1, // do_verification, true, // do_verification
1, // init_method, 1, // init_method
0, // do_log, false, // do_log
1, // nrepeat, false, // time_kernel
param.N, param.N_,
param.K, param.K_,
param.C, param.C_,
param.input_spatial_lengths, param.input_spatial_lengths_,
param.filter_spatial_lengths, param.filter_spatial_lengths_,
param.GetOutputSpatialLengths(), param.GetOutputSpatialLengths(),
param.conv_filter_strides, param.conv_filter_strides_,
param.conv_filter_dilations, param.conv_filter_dilations_,
param.input_left_pads, param.input_left_pads_,
param.input_right_pads, param.input_right_pads_,
2); 2);
} }
return pass; return pass;
} }
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
int data_type = 0; int data_type = 1;
int init_method = 0; int init_method = 1;
// Conv shape // Conv shape
ck::index_t N = 128; ck::index_t N = 128;
...@@ -155,20 +155,20 @@ int main(int argc, char* argv[]) ...@@ -155,20 +155,20 @@ int main(int argc, char* argv[])
ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>( ck::tensor_layout::convolution::NHWK>(
1, true, // do_verification
init_method, init_method,
0, false, // do_log
1, false, // time_kernel
param.N, param.N_,
param.K, param.K_,
param.C, param.C_,
param.input_spatial_lengths, param.input_spatial_lengths_,
param.filter_spatial_lengths, param.filter_spatial_lengths_,
param.GetOutputSpatialLengths(), param.GetOutputSpatialLengths(),
param.conv_filter_strides, param.conv_filter_strides_,
param.conv_filter_dilations, param.conv_filter_dilations_,
param.input_left_pads, param.input_left_pads_,
param.input_right_pads, param.input_right_pads_,
split_k); split_k);
} }
else if(data_type == 1) else if(data_type == 1)
...@@ -180,20 +180,20 @@ int main(int argc, char* argv[]) ...@@ -180,20 +180,20 @@ int main(int argc, char* argv[])
ck::tensor_layout::convolution::NHWC, ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC, ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>( ck::tensor_layout::convolution::NHWK>(
1, true, // do_verification
init_method, init_method,
0, false, // do_log
1, false, // time_kernel
param.N, param.N_,
param.K, param.K_,
param.C, param.C_,
param.input_spatial_lengths, param.input_spatial_lengths_,
param.filter_spatial_lengths, param.filter_spatial_lengths_,
param.GetOutputSpatialLengths(), param.GetOutputSpatialLengths(),
param.conv_filter_strides, param.conv_filter_strides_,
param.conv_filter_dilations, param.conv_filter_dilations_,
param.input_left_pads, param.input_left_pads_,
param.input_right_pads, param.input_right_pads_,
split_k); split_k);
} }
else else
......