Merge commit '3e6c2610' into gemm_norm

b2290854 · rocking · 253f7ef2 · 3e6c2610 · b2290854 · b2290854
Commit b2290854 authored May 27, 2022 by rocking
20 changed files
--- a/profiler/src/profile_reduce.cpp
+++ b/profiler/src/profile_reduce.cpp
 #include <iostream>
 #include <fstream>
-#include <numeric>
-#include <initializer_list>
 #include <cstdlib>
 #include <vector>
 #include <stdexcept>
 #include <sstream>
 #include <getopt.h>
-#include "config.hpp"
+#include "data_type_enum.hpp"
-#include "print.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "device_tensor.hpp"
 #include "reduction_enums.hpp"
+#include "host_common_util.hpp"
 #include "profile_reduce_impl.hpp"
 using namespace std;
-using ck::NanPropagation;
-using ck::ReduceTensorIndices;
 using ck::ReduceTensorOp;
 static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
@@ -38,63 +30,9 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,
                                       {"bf16", no_argument, nullptr, '?'},
                                       {"dumpout", required_argument, nullptr, 'o'},
                                       {"verify", required_argument, nullptr, 'v'},
-                                       {"log", required_argument, nullptr, 'l'},
                                       {"help", no_argument, nullptr, '?'},
                                       {nullptr, 0, nullptr, 0}};
-template <typename T>
-static T getSingleValueFromString(const string& valueStr)
-{
-    std::istringstream iss(valueStr);
-    T val;
-    iss >> val;
-    return (val);
-};
-template <typename T>
-static std::vector<T> getTypeValuesFromString(const char* cstr_values)
-{
-    std::string valuesStr(cstr_values);
-    std::vector<T> values;
-    std::size_t pos = 0;
-    std::size_t new_pos;
-    new_pos = valuesStr.find(',', pos);
-    while(new_pos != std::string::npos)
-    {
-        const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
-        T val = getSingleValueFromString<T>(sliceStr);
-        values.push_back(val);
-        pos     = new_pos + 1;
-        new_pos = valuesStr.find(',', pos);
-    };
-    std::string sliceStr = valuesStr.substr(pos);
-    T val                = getSingleValueFromString<T>(sliceStr);
-    values.push_back(val);
-    return (values);
-}
-enum struct AppDataType
-{
-    appHalf     = 0,
-    appFloat    = 1,
-    appInt32    = 2,
-    appInt8     = 3,
-    appInt8x4   = 4,
-    appBFloat16 = 5,
-    appDouble   = 6,
-};
 static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims)
 {
    for(auto dim : reduceDims)
@@ -113,7 +51,7 @@ static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims
    };
 };
-class AppArgs
+class ReduceProfilerArgs
 {
    private:
    int option_index = 0;
@@ -130,26 +68,23 @@ class AppArgs
    std::vector<float> scales;
-    ReduceTensorOp reduceOp = ReduceTensorOp::ADD;
+    ReduceTensorOp reduceOp     = ReduceTensorOp::ADD;
-    AppDataType compTypeId  = AppDataType::appFloat;
+    ck::DataTypeEnum compTypeId = ck::DataTypeEnum::Float;
-    AppDataType outTypeId   = AppDataType::appFloat;
+    ck::DataTypeEnum outTypeId  = ck::DataTypeEnum::Float;
    bool compType_assigned = false;
    bool outType_assigned  = false;
-    NanPropagation nanOpt          = NanPropagation::NOT_PROPAGATE_NAN;
+    int nanOpt           = 0;
-    ReduceTensorIndices indicesOpt = ReduceTensorIndices::NO_INDICES;
+    int indicesOpt       = 0;
-    bool do_log                    = false;
+    bool do_verification = false;
-    bool do_verification           = false;
+    bool do_dumpout      = false;
-    bool do_dumpout                = false;
    int init_method;
    bool time_kernel;
-    bool need_indices = false;
+    ReduceProfilerArgs()  = default;
+    ~ReduceProfilerArgs() = default;
-    AppArgs()  = default;
-    ~AppArgs() = default;
    void show_usage(const char* cmd)
    {
@@ -166,8 +101,11 @@ class AppArgs
        std::cout << "--outType or -W, optional enum value indicating the type of the reduced "
                     "output, which could be float when the input data is half"
                  << std::endl;
-        std::cout << "--nanOpt or -N, enum value indicates the selection for NanOpt" << std::endl;
+        std::cout
-        std::cout << "--indicesOpt or -I, enum value indicates the selection for IndicesOpt"
+            << "--nanOpt or -N, 1/0 value indicates the selection to use or not use Nan-Propagation"
+            << std::endl;
+        std::cout << "--indicesOpt or -I, 1/0 value indicates the selection to use or not use "
+                     "index in reduction"
                  << std::endl;
        std::cout << "--scales or -S, comma separated two float values for alpha and beta"
                  << std::endl;
@@ -181,18 +119,19 @@ class AppArgs
        std::cout << "--dumpout or -o, 1/0 to indicate where to save the reduction result to files "
                     "for further analysis"
                  << std::endl;
-        std::cout << "--log or -l, 1/0 to indicate whether to log some information" << std::endl;
    };
    int processArgs(int argc, char* argv[])
    {
+        using ck::host_common::getTypeValuesFromString;
        int ch;
        optind++; // to skip the "reduce" module name
        while(1)
        {
-            ch = getopt_long(argc, argv, "D:R:O:C:W:N:I:S:v:o:l:", long_options, &option_index);
+            ch = getopt_long(argc, argv, "D:R:O:C:W:N:I:S:v:o:", long_options, &option_index);
            if(ch == -1)
                break;
            switch(ch)
@@ -219,27 +158,27 @@ class AppArgs
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");
-                compTypeId        = static_cast<AppDataType>(std::atoi(optarg));
+                compTypeId        = static_cast<ck::DataTypeEnum>(std::atoi(optarg));
                compType_assigned = true;
                break;
            case 'W':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");
-                outTypeId        = static_cast<AppDataType>(std::atoi(optarg));
+                outTypeId        = static_cast<ck::DataTypeEnum>(std::atoi(optarg));
                outType_assigned = true;
                break;
            case 'N':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");
-                nanOpt = static_cast<NanPropagation>(std::atoi(optarg));
+                nanOpt = std::atoi(optarg);
                break;
            case 'I':
                if(!optarg)
                    throw std::runtime_error("Invalid option format!");
-                indicesOpt = static_cast<ReduceTensorIndices>(std::atoi(optarg));
+                indicesOpt = std::atoi(optarg);
                break;
            case 'S':
                if(!optarg)
@@ -262,12 +201,6 @@ class AppArgs
                do_dumpout = static_cast<bool>(std::atoi(optarg));
                break;
-            case 'l':
-                if(!optarg)
-                    throw std::runtime_error("Invalid option format!");
-                do_log = static_cast<bool>(std::atoi(optarg));
-                break;
            case '?':
                if(std::string(long_options[option_index].name) == "half")
                    use_half = true;
@@ -295,7 +228,7 @@ class AppArgs
            throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
        init_method = std::atoi(argv[optind++]);
-        time_kernel = std::atoi(argv[optind]);
+        time_kernel = static_cast<bool>(std::atoi(argv[optind]));
        if(scales.empty())
        {
@@ -306,9 +239,6 @@ class AppArgs
        if(reduceOp == ReduceTensorOp::MIN || reduceOp == ReduceTensorOp::MAX ||
           reduceOp == ReduceTensorOp::AMAX)
        {
-            if(indicesOpt != ReduceTensorIndices::NO_INDICES)
-                need_indices = true;
            // for indexable operations, no need to assign compType and outType, just let them be
            // same as inType
            compType_assigned = false;
@@ -322,9 +252,10 @@ class AppArgs
 int profile_reduce(int argc, char* argv[])
 {
-    using namespace ck::profiler;
+    using ck::DataTypeEnum;
+    using ck::profiler::profile_reduce_impl;
-    AppArgs args;
+    ReduceProfilerArgs args;
    if(args.processArgs(argc, argv) < 0)
        return (-1);
@@ -339,42 +270,41 @@ int profile_reduce(int argc, char* argv[])
    if(args.use_half)
    {
        if(!args.compType_assigned)
-            args.compTypeId = AppDataType::appHalf;
+            args.compTypeId = DataTypeEnum::Half;
        if(args.outType_assigned &&
-           (args.outTypeId != AppDataType::appHalf && args.outTypeId != AppDataType::appFloat))
+           (args.outTypeId != DataTypeEnum::Half && args.outTypeId != DataTypeEnum::Float))
-            args.outTypeId = AppDataType::appFloat;
+            args.outTypeId = DataTypeEnum::Float;
        if(!args.outType_assigned)
-            args.outTypeId = AppDataType::appHalf;
+            args.outTypeId = DataTypeEnum::Half;
-        if(args.compTypeId == AppDataType::appHalf)
+        if(args.compTypeId == DataTypeEnum::Half)
        {
-            profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(args.do_verification,
+            profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(
-                                                                    args.init_method,
+                args.do_verification,
-                                                                    args.do_log,
+                args.init_method,
-                                                                    args.do_dumpout,
+                args.do_dumpout,
-                                                                    args.time_kernel,
+                args.time_kernel,
-                                                                    args.inLengths,
+                args.inLengths,
-                                                                    args.reduceDims,
+                args.reduceDims,
-                                                                    args.reduceOp,
+                args.reduceOp,
-                                                                    args.nanOpt,
+                static_cast<bool>(args.nanOpt),
-                                                                    args.indicesOpt,
+                static_cast<bool>(args.indicesOpt),
-                                                                    args.scales[0],
+                args.scales[0],
-                                                                    args.scales[1]);
+                args.scales[1]);
        }
-        else if(args.compTypeId == AppDataType::appFloat)
+        else if(args.compTypeId == DataTypeEnum::Float)
        {
            profile_reduce_impl<ck::half_t, float, ck::half_t>(args.do_verification,
                                                               args.init_method,
-                                                               args.do_log,
                                                               args.do_dumpout,
                                                               args.time_kernel,
                                                               args.inLengths,
                                                               args.reduceDims,
                                                               args.reduceOp,
-                                                               args.nanOpt,
+                                                               static_cast<bool>(args.nanOpt),
-                                                               args.indicesOpt,
+                                                               static_cast<bool>(args.indicesOpt),
                                                               args.scales[0],
                                                               args.scales[1]);
        }
@@ -385,56 +315,53 @@ int profile_reduce(int argc, char* argv[])
    {
        profile_reduce_impl<double, double, double>(args.do_verification,
                                                    args.init_method,
-                                                    args.do_log,
                                                    args.do_dumpout,
                                                    args.time_kernel,
                                                    args.inLengths,
                                                    args.reduceDims,
                                                    args.reduceOp,
-                                                    args.nanOpt,
+                                                    static_cast<bool>(args.nanOpt),
-                                                    args.indicesOpt,
+                                                    static_cast<bool>(args.indicesOpt),
                                                    args.scales[0],
                                                    args.scales[1]);
    }
    else if(args.use_int8)
    {
        if(!args.compType_assigned)
-            args.compTypeId = AppDataType::appInt8;
+            args.compTypeId = DataTypeEnum::Int8;
        if(args.outType_assigned &&
-           (args.outTypeId != AppDataType::appInt8 && args.outTypeId != AppDataType::appInt32))
+           (args.outTypeId != DataTypeEnum::Int8 && args.outTypeId != DataTypeEnum::Int32))
-            args.outTypeId = AppDataType::appInt32;
+            args.outTypeId = DataTypeEnum::Int32;
        if(!args.outType_assigned)
-            args.outTypeId = AppDataType::appInt8;
+            args.outTypeId = DataTypeEnum::Int8;
-        if(args.compTypeId == AppDataType::appInt8)
+        if(args.compTypeId == DataTypeEnum::Int8)
        {
            profile_reduce_impl<int8_t, int8_t, int8_t>(args.do_verification,
                                                        args.init_method,
-                                                        args.do_log,
                                                        args.do_dumpout,
                                                        args.time_kernel,
                                                        args.inLengths,
                                                        args.reduceDims,
                                                        args.reduceOp,
-                                                        args.nanOpt,
+                                                        static_cast<bool>(args.nanOpt),
-                                                        args.indicesOpt,
+                                                        static_cast<bool>(args.indicesOpt),
                                                        args.scales[0],
                                                        args.scales[1]);
        }
-        else if(args.compTypeId == AppDataType::appInt32)
+        else if(args.compTypeId == DataTypeEnum::Int32)
        {
            profile_reduce_impl<int8_t, int32_t, int8_t>(args.do_verification,
                                                         args.init_method,
-                                                         args.do_log,
                                                         args.do_dumpout,
                                                         args.time_kernel,
                                                         args.inLengths,
                                                         args.reduceDims,
                                                         args.reduceOp,
-                                                         args.nanOpt,
+                                                         static_cast<bool>(args.nanOpt),
-                                                         args.indicesOpt,
+                                                         static_cast<bool>(args.indicesOpt),
                                                         args.scales[0],
                                                         args.scales[1]);
        }
@@ -444,54 +371,51 @@ int profile_reduce(int argc, char* argv[])
    else if(args.use_bf16)
    {
        if(args.outType_assigned &&
-           (args.outTypeId != AppDataType::appBFloat16 && args.outTypeId != AppDataType::appFloat))
+           (args.outTypeId != DataTypeEnum::BFloat16 && args.outTypeId != DataTypeEnum::Float))
-            args.outTypeId = AppDataType::appFloat;
+            args.outTypeId = DataTypeEnum::Float;
        if(!args.outType_assigned)
-            args.outTypeId = AppDataType::appBFloat16;
+            args.outTypeId = DataTypeEnum::BFloat16;
        profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(args.do_verification,
                                                             args.init_method,
-                                                             args.do_log,
                                                             args.do_dumpout,
                                                             args.time_kernel,
                                                             args.inLengths,
                                                             args.reduceDims,
                                                             args.reduceOp,
-                                                             args.nanOpt,
+                                                             static_cast<bool>(args.nanOpt),
-                                                             args.indicesOpt,
+                                                             static_cast<bool>(args.indicesOpt),
                                                             args.scales[0],
                                                             args.scales[1]);
    }
    else
    {
-        if(args.compTypeId == AppDataType::appFloat)
+        if(args.compTypeId == DataTypeEnum::Float)
        {
            profile_reduce_impl<float, float, float>(args.do_verification,
                                                     args.init_method,
-                                                     args.do_log,
                                                     args.do_dumpout,
                                                     args.time_kernel,
                                                     args.inLengths,
                                                     args.reduceDims,
                                                     args.reduceOp,
-                                                     args.nanOpt,
+                                                     static_cast<bool>(args.nanOpt),
-                                                     args.indicesOpt,
+                                                     static_cast<bool>(args.indicesOpt),
                                                     args.scales[0],
                                                     args.scales[1]);
        }
-        else if(args.compTypeId == AppDataType::appDouble)
+        else if(args.compTypeId == DataTypeEnum::Double)
        {
            profile_reduce_impl<float, double, float>(args.do_verification,
                                                      args.init_method,
-                                                      args.do_log,
                                                      args.do_dumpout,
                                                      args.time_kernel,
                                                      args.inLengths,
                                                      args.reduceDims,
                                                      args.reduceOp,
-                                                      args.nanOpt,
+                                                      static_cast<bool>(args.nanOpt),
-                                                      args.indicesOpt,
+                                                      static_cast<bool>(args.indicesOpt),
                                                      args.scales[0],
                                                      args.scales[1]);
        }

--- a/profiler/src/profiler.cpp
+++ b/profiler/src/profiler.cpp
@@ -13,6 +13,7 @@ int profile_gemm_bias_relu_add(int, char*[]);
 int profile_gemm_reduce(int, char*[]);
 int profile_batched_gemm(int, char*[]);
 int profile_grouped_gemm(int, char*[]);
+int profile_conv_fwd(int, char*[]);
 int profile_conv_fwd_bias_relu(int, char*[]);
 int profile_conv_fwd_bias_relu_add(int, char*[]);
 int profile_conv_fwd_bias_relu_atomic_add(int, char*[]);
@@ -53,7 +54,7 @@ int main(int argc, char* argv[])
    }
    else if(strcmp(argv[1], "grouped_gemm") == 0)
    {
-        profile_grouped_gemm(argc, argv);
+        return profile_grouped_gemm(argc, argv);
    }
    else if(strcmp(argv[1], "conv_fwd") == 0)
    {
@@ -107,7 +108,7 @@ int main(int argc, char* argv[])
               "                        conv1d_bwd_data: BackwardConvolution data 1 dim\n"
               "                        conv2d_bwd_data: BackwardConvolution data 2 dim\n"
               "                        conv3d_bwd_data: BackwardConvolution data 3 dim\n"
-               "                        reduce: REDUCE\n"
+               "                        reduce: Reduce\n"
               "                        conv2d_bwd_weight: Backward Weight Convolution 2d\n");
        // clang-format on
    }

--- a/script/parse_perf_data.py
+++ b/script/parse_perf_data.py
 #!/usr/bin/env python3
-import os, io
+import os, io, argparse, datetime
-import argparse
+import numpy as np
+import sqlalchemy
-def print_to_string(*args, **kwargs):
+from sqlalchemy.types import NVARCHAR, Float, Integer
-    output = io.StringIO()
+import pymysql
-    print(*args, file=output, **kwargs)
+import pandas as pd
-    contents = output.getvalue()
+from sshtunnel import SSHTunnelForwarder
-    output.close()
-    return contents
+def print_to_string(*args, **kwargs):
+    output = io.StringIO()
-def parse_args():
+    print(*args, file=output, **kwargs)
-    parser = argparse.ArgumentParser(description='Parse results from tf benchmark runs')
+    contents = output.getvalue()
-    parser.add_argument('filename', type=str, help='Log file to prase or directory containing log files')
+    output.close()
-    args = parser.parse_args()
+    return contents
-    files = []
-    if os.path.isdir(args.filename):
+def parse_args():
-        all_files = os.listdir(args.filename)
+    parser = argparse.ArgumentParser(description='Parse results from tf benchmark runs')
-        for name in all_files:
+    parser.add_argument('filename', type=str, help='Log file to prase or directory containing log files')
-            if not 'log' in name:
+    args = parser.parse_args()
-                continue
+    files = []
-            files.append(os.path.join(args.filename, name))
+    if os.path.isdir(args.filename):
-    else:
+        all_files = os.listdir(args.filename)
-        files = [args.filename]
+        for name in all_files:
-    args.files = files
+            if not 'log' in name:
-    return args
+                continue
+            files.append(os.path.join(args.filename, name))
-def main():
+    else:
-    args = parse_args()
+        files = [args.filename]
-    results = []
+    args.files = files
-    #parse results
+    return args
-    glue=""
-    for filename in args.files:
+def main():
-        for line in open(filename):
+    args = parse_args()
-            if 'Best Perf' in line:
+    tests = []
-                lst=line.split()
+    kernels=[]
-                results.append(print_to_string(glue.join(lst[8:]),lst[4]))
+    tflops=[]
+    dtype=[]
-    #sort results    
+    alayout=[]
+    blayout=[]
-    #read baseline results for the latest develop branch    
+    M=[]
+    N=[]
-    #write new results to the db
+    K=[]
+    StrideA=[]
-    #compare the results to the baseline
+    StrideB=[]
+    StrideC=[]
-    #return 0 if performance criteria met, otherwise return 1
+    #parse results, get the Tflops value for "Best Perf" kernels
+    glue=""
-    print(results)
+    for filename in args.files:
-    return 0
+        for line in open(filename):
+            if 'Branch name' in line:
-if __name__ == '__main__':
+                lst=line.split()
+                branch_name=lst[2]
+    for filename in args.files:
+        for line in open(filename):
+            if 'Best Perf' in line:
+                lst=line.split()
+                if len(lst)>=37: #the line is complete
+                    tests.append(glue.join(lst[5:30]))
+                    kernels.append(glue.join(lst[37:]))
+                    tflops.append(lst[33])
+                    dtype.append(lst[5])
+                    alayout.append(lst[8])
+                    blayout.append(lst[11])
+                    M.append(lst[14])
+                    N.append(lst[17])
+                    K.append(lst[20])
+                    StrideA.append(lst[23])
+                    StrideB.append(lst[26])
+                    StrideC.append(lst[29])
+                elif len(lst)<37 and len(lst)>=33: #the tflops are available
+                    tests.append(glue.join(lst[5:30]))
+                    kernels.append("N/A")
+                    tflops.append(lst[33])
+                    dtype.append(lst[5])
+                    alayout.append(lst[8])
+                    blayout.append(lst[11])
+                    M.append(lst[14])
+                    N.append(lst[17])
+                    K.append(lst[20])
+                    StrideA.append(lst[23])
+                    StrideB.append(lst[26])
+                    StrideC.append(lst[29])
+                    print("warning: incomplete line:",lst)
+                elif len(lst)<33: #even the tflops are not available
+                    print("Error in ckProfiler output!")
+                    print("warning: incomplete line=",lst)
+    #sort results
+    print("Number of tests:",len(tests))
+    print("Branch name:",branch_name)
+    #sorted_tests = sorted(tests)
+    #print("sorted tests:",sorted_tests)
+    sorted_tflops = [x for _,x in sorted(zip(tests,tflops))]
+    #sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
+    test_list=list(range(1,len(tests)+1))
+    sql_hostname = '127.0.0.1'
+    sql_username = os.environ["dbuser"]
+    print("sql_username=",sql_username)
+    sql_password = os.environ["dbpassword"]
+    sql_main_database = 'miopen_perf'
+    sql_port = 3306
+    ssh_host = os.environ["dbsship"]
+    print("ssh_host=",ssh_host)
+    ssh_user = os.environ["dbsshuser"]
+    print("ssh_user=",ssh_user)
+    ssh_port = int(os.environ["dbsshport"])
+    ssh_pass = os.environ["dbsshpassword"]
+    with SSHTunnelForwarder(
+            (ssh_host, ssh_port),
+            ssh_username=ssh_user,
+            ssh_password=ssh_pass,
+            remote_bind_address=(sql_hostname, sql_port)) as tunnel:
+        sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}:{3}/{4}'.
+            format(sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database))
+        conn = sqlEngine.connect()
+        #write the ck_gemm_test_params table
+        #only needed once the test set changes
+        '''
+        sorted_dtypes = [x for _,x in sorted(zip(tests,dtype))]
+        sorted_alayout = [x for _,x in sorted(zip(tests,alayout))]
+        sorted_blayout = [x for _,x in sorted(zip(tests,blayout))]
+        sorted_M = [x for _,x in sorted(zip(tests,M))]
+        sorted_N = [x for _,x in sorted(zip(tests,N))]
+        sorted_K = [x for _,x in sorted(zip(tests,K))]
+        sorted_StrideA = [x for _,x in sorted(zip(tests,StrideA))]
+        sorted_StrideB = [x for _,x in sorted(zip(tests,StrideB))]
+        sorted_StrideC = [x for _,x in sorted(zip(tests,StrideC))]
+        ck_gemm_params=[test_list,sorted_dtypes,sorted_alayout,sorted_blayout,
+                    sorted_M,sorted_N,sorted_K,sorted_StrideA,sorted_StrideB,
+                    sorted_StrideC]
+        df=pd.DataFrame(np.transpose(ck_gemm_params),columns=['Test_number','Data_type',
+            'Alayout','BLayout','M','N','K', 'StrideA','StrideB','StrideC'])
+        print(df)
+        dtypes = {
+            'Test_number': Integer(),
+            'Data_type': NVARCHAR(length=5),
+            'Alayout': NVARCHAR(length=12),
+            'Blayout': NVARCHAR(length=12),
+            'M': Integer(),
+            'N': Integer(),
+            'K': Integer(),
+            'StrideA': Integer(),
+            'StrideB': Integer(),
+            'StrideC': Integer()
+            }
+        df.to_sql("ck_gemm_test_params",conn,if_exists='replace',index=False, dtype=dtypes)
+        '''
+        #read baseline results for the latest develop branch
+        query = '''SELECT * from ck_gemm_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_gemm_tflops where Branch_ID='develop' );'''
+        tflops_base = pd.read_sql_query(query, conn)
+        #write new results to the db
+        testlist=[]
+        for i in range(1,len(tests)+1):
+            testlist.append("Test%i"%i)
+        ck_gemm_tflops=[str(branch_name),str(datetime.datetime.now())]
+        flops=pd.DataFrame(data=[ck_gemm_tflops],columns=['Branch_ID','Datetime'])
+        df_add=pd.DataFrame(data=[sorted_tflops],columns=testlist)
+        flops=pd.concat([flops,df_add],axis=1)
+        print("new tflops results:",flops)
+        flops.to_sql("ck_gemm_tflops",conn,if_exists='append',index=False)
+        conn.close()
+    #compare the results to the baseline
+    regression=0
+    base=tflops_base[testlist].to_numpy(dtype='float')
+    base_list=base[0]
+    ave_perf=0
+    for i in range(len(base_list)):
+        # success criterion:
+        if base_list[i]>1.01*float(sorted_tflops[i]):
+            print("test # ",i,"shows regression by {:.3f}%".format(
+                (float(sorted_tflops[i])-base_list[i])/base_list[i]*100))
+            regression=1
+        ave_perf=ave_perf+float(sorted_tflops[i])/base_list[i]
+    if regression==0:
+        print("no regressions found")
+    ave_perf=ave_perf/len(base_list)
+    print("average performance relative to baseline:",ave_perf)
+    #return 0 if performance criteria met, otherwise return 1
+    return regression
+if __name__ == '__main__':
    main()
\ No newline at end of file
--- a/script/test_reduce_no_index.sh
+++ b/script/test_reduce_no_index.sh
@@ -15,6 +15,17 @@ bin/test_reduce_no_index -D 64,4,280,82  -R 1  0 2
 bin/test_reduce_no_index -D 64,4,280,82  -R 2  0 2
 bin/test_reduce_no_index -D 64,4,280,82  -R 3  0 2
+## for float64
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,2,3  6 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,2  6 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,3  6 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,2,3  6 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 1,2,3  6 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0  6 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 1  6 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 2  6 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 3  6 2
 ## for float16
 bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,2,3  1 2
 bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,2  1 2

--- a/script/test_reduce_with_index.sh
+++ b/script/test_reduce_with_index.sh
@@ -15,6 +15,17 @@ bin/test_reduce_with_index -D 64,4,280,82  -R 1  0 2
 bin/test_reduce_with_index -D 64,4,280,82  -R 2  0 2
 bin/test_reduce_with_index -D 64,4,280,82  -R 3  0 2
+## for float64
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2,3  6 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2  6 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,3  6 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,2,3  6 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 1,2,3  6 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0  6 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 1  6 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 2  6 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 3  6 2
 ## for float16
 bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2,3  1 2
 bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2  1 2

--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -2,6 +2,7 @@ include_directories(BEFORE
    ${PROJECT_SOURCE_DIR}/
    ${PROJECT_SOURCE_DIR}/include/ck
    ${PROJECT_SOURCE_DIR}/include/ck/utility
+    ${PROJECT_SOURCE_DIR}/include/ck/host_utility
    ${PROJECT_SOURCE_DIR}/include/ck/tensor_description
    ${PROJECT_SOURCE_DIR}/include/ck/tensor
    ${PROJECT_SOURCE_DIR}/include/ck/problem_transform

--- a/test/block_to_ctile_map/test_block_to_ctile_map.cpp
+++ b/test/block_to_ctile_map/test_block_to_ctile_map.cpp
@@ -8,6 +8,7 @@ using namespace ck;
 static auto I0 = Number<0>{};
 static auto I1 = Number<1>{};
+static auto I2 = Number<2>{};
 TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck1)
 {
@@ -20,7 +21,7 @@ TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck1
    const index_t M01       = 4;
    const index_t N01       = 4;
-    auto c_grid_desc_m_n = make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, I1));
+    auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
    printf("(M, N, MPerBlock, NPerBlock, M01, N01) = (%d, %d, %d, %d, %d, %d)\n",
           M,
@@ -37,7 +38,7 @@ TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck1
    EXPECT_TRUE(tile_map.CalculateGridSize(c_grid_desc_m_n) == 16);
    // clang-format off
-    std::vector<std::vector<int>> expected = {
+    std::vector<std::vector<int>> expected_m0idx_n0idx_valid = {
        {0, 0, 1},
        {0, 1, 1},
        {0, 2, 1},
@@ -64,7 +65,7 @@ TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck1
        std::cout << ", valid = " << tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))
                  << std::endl;
        bool equal =
-            expected[i] ==
+            expected_m0idx_n0idx_valid[i] ==
            std::vector<int>{m0n0_idx[I0],
                             m0n0_idx[I1],
                             tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))};
@@ -78,12 +79,11 @@ TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck0
    const index_t N         = 384;
    const index_t MPerBlock = 128;
    const index_t NPerBlock = 128;
-    // const index_t MBlock    = M / MPerBlock;
-    // const index_t NBlock    = N / NPerBlock;
    const index_t M01 = 4;
    const index_t N01 = 4;
-    auto c_grid_desc_m_n = make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, I1));
+    auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
    printf("(M, N, MPerBlock, NPerBlock, M01, N01) = (%d, %d, %d, %d, %d, %d)\n",
           M,
@@ -98,3 +98,221 @@ TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck0
    EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == false);
 }
+TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N0_M01_DeviceCTileIndexCheck1)
+{
+    const index_t M         = 384;
+    const index_t N         = 512;
+    const index_t MPerBlock = 128;
+    const index_t NPerBlock = 128;
+    const index_t MBlock    = M / MPerBlock;
+    const index_t NBlock    = N / NPerBlock;
+    const index_t M01       = 4;
+    auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
+    printf("(M, N, MPerBlock, NPerBlock, M01) = (%d, %d, %d, %d, %d)\n",
+           M,
+           N,
+           MPerBlock,
+           NPerBlock,
+           M01);
+    BlockToCTileMap_M00_N0_M01<MPerBlock, NPerBlock, decltype(c_grid_desc_m_n), true> tile_map(
+        c_grid_desc_m_n, M01);
+    EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == true);
+    EXPECT_TRUE(tile_map.CalculateGridSize(c_grid_desc_m_n) == 16);
+    // clang-format off
+    std::vector<std::vector<int>> expected_m0idx_n0idx_valid = {
+        {0, 0, 1},
+        {1, 0, 1},
+        {2, 0, 1},
+        {3, 0, 0},
+        {0, 1, 1},
+        {1, 1, 1},
+        {2, 1, 1},
+        {3, 1, 0},
+        {0, 2, 1},
+        {1, 2, 1},
+        {2, 2, 1},
+        {3, 2, 0},
+        {0, 3, 1},
+        {1, 3, 1},
+        {2, 3, 1},
+        {3, 3, 0}
+    };
+    // clang-format on
+    for(index_t i = 0; i < tile_map.CalculateGridSize(c_grid_desc_m_n); i++)
+    {
+        auto m0n0_idx = tile_map.CalculateBottomIndex(make_multi_index(i));
+        std::cout << "block_1d_id = " << i << ", m0, n0 = " << m0n0_idx[I0] << ", " << m0n0_idx[I1];
+        std::cout << ", valid = " << tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))
+                  << std::endl;
+        bool equal =
+            expected_m0idx_n0idx_valid[i] ==
+            std::vector<int>{m0n0_idx[I0],
+                             m0n0_idx[I1],
+                             tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))};
+        EXPECT_TRUE(equal);
+    }
+}
+TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N0_M01_DeviceCTileIndexCheck0)
+{
+    const index_t M         = 512;
+    const index_t N         = 384;
+    const index_t MPerBlock = 128;
+    const index_t NPerBlock = 128;
+    auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
+    // clang-format off
+    std::vector<std::tuple<int, int, bool>> expected_m0_gridsize_validity = {
+        {5, 15, false},
+        {4, 12, true},
+        {3, 18, false},
+        {2, 12, true},
+        {1, 12, true}
+    };
+    // clang-format on
+    for(auto e : expected_m0_gridsize_validity)
+    {
+        const index_t M01 = std::get<0>(e);
+        printf("(M, N, MPerBlock, NPerBlock, M01) = (%d, %d, %d, %d, %d)\n",
+               M,
+               N,
+               MPerBlock,
+               NPerBlock,
+               M01);
+        BlockToCTileMap_M00_N0_M01<MPerBlock, NPerBlock, decltype(c_grid_desc_m_n), false> tile_map(
+            c_grid_desc_m_n, M01);
+        EXPECT_EQ(tile_map.CalculateGridSize(c_grid_desc_m_n), std::get<1>(e));
+        EXPECT_EQ(tile_map.CheckValidity(c_grid_desc_m_n), std::get<2>(e));
+    }
+}
+TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N0_M01Adapt)
+{
+    const index_t M         = 768;
+    const index_t N         = 384;
+    const index_t MPerBlock = 128;
+    const index_t NPerBlock = 128;
+    const index_t MBlock    = M / MPerBlock;
+    const index_t NBlock    = N / NPerBlock;
+    constexpr index_t M01   = 4;
+    auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
+    printf("(M, N, MPerBlock, NPerBlock, M01) = (%d, %d, %d, %d, %d)\n",
+           M,
+           N,
+           MPerBlock,
+           NPerBlock,
+           M01);
+    BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, decltype(c_grid_desc_m_n)> tile_map(
+        c_grid_desc_m_n, M01);
+    EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == true);
+    EXPECT_TRUE(tile_map.CalculateGridSize(c_grid_desc_m_n) == 18);
+    // clang-format off
+    std::vector<std::vector<int>> expected_m0idx_n0idx_valid = {
+        {0, 0, 1},
+        {1, 0, 1},
+        {2, 0, 1},
+        {3, 0, 1},
+        {0, 1, 1},
+        {1, 1, 1},
+        {2, 1, 1},
+        {3, 1, 1},
+        {0, 2, 1},
+        {1, 2, 1},
+        {2, 2, 1},
+        {3, 2, 1},
+        {4, 0, 1},
+        {5, 0, 1},
+        {4, 1, 1},
+        {5, 1, 1},
+        {4, 2, 1},
+        {5, 2, 1},
+    };
+    // clang-format on
+    for(index_t i = 0; i < tile_map.CalculateGridSize(c_grid_desc_m_n); i++)
+    {
+        auto m0n0_idx = tile_map.CalculateBottomIndex(make_multi_index(i));
+        std::cout << "block_1d_id = " << i << ", m0, n0 = " << m0n0_idx[I0] << ", " << m0n0_idx[I1];
+        std::cout << ", valid = " << tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))
+                  << std::endl;
+        bool equal =
+            expected_m0idx_n0idx_valid[i] ==
+            std::vector<int>{m0n0_idx[I0],
+                             m0n0_idx[I1],
+                             tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))};
+        EXPECT_TRUE(equal);
+    }
+}
+TEST(BlockToCTileMap, TestBlockToCTileMap_KSplit_M00_N0_M01Adapt)
+{
+    const index_t M         = 768;
+    const index_t N         = 384;
+    const index_t MPerBlock = 128;
+    const index_t NPerBlock = 128;
+    const index_t MBlock    = M / MPerBlock;
+    const index_t NBlock    = N / NPerBlock;
+    constexpr index_t M01   = 4;
+    const index_t KSplit    = 3;
+    auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
+    printf("(M, N, MPerBlock, NPerBlock, M01) = (%d, %d, %d, %d, %d)\n",
+           M,
+           N,
+           MPerBlock,
+           NPerBlock,
+           M01);
+    BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, decltype(c_grid_desc_m_n)>
+        tile_map(c_grid_desc_m_n, M01, KSplit);
+    EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == true);
+    EXPECT_TRUE(tile_map.CalculateGridSize(c_grid_desc_m_n) == 18 * KSplit);
+    std::vector<std::vector<int>> expected_ksplitidx_m0idx_n0idx_valid = {
+        {0, 0, 0, 1}, {0, 1, 0, 1}, {0, 2, 0, 1}, {0, 3, 0, 1}, {0, 0, 1, 1}, {0, 1, 1, 1},
+        {0, 2, 1, 1}, {0, 3, 1, 1}, {0, 0, 2, 1}, {0, 1, 2, 1}, {0, 2, 2, 1}, {0, 3, 2, 1},
+        {0, 4, 0, 1}, {0, 5, 0, 1}, {0, 4, 1, 1}, {0, 5, 1, 1}, {0, 4, 2, 1}, {0, 5, 2, 1},
+        {1, 0, 0, 1}, {1, 1, 0, 1}, {1, 2, 0, 1}, {1, 3, 0, 1}, {1, 0, 1, 1}, {1, 1, 1, 1},
+        {1, 2, 1, 1}, {1, 3, 1, 1}, {1, 0, 2, 1}, {1, 1, 2, 1}, {1, 2, 2, 1}, {1, 3, 2, 1},
+        {1, 4, 0, 1}, {1, 5, 0, 1}, {1, 4, 1, 1}, {1, 5, 1, 1}, {1, 4, 2, 1}, {1, 5, 2, 1},
+        {2, 0, 0, 1}, {2, 1, 0, 1}, {2, 2, 0, 1}, {2, 3, 0, 1}, {2, 0, 1, 1}, {2, 1, 1, 1},
+        {2, 2, 1, 1}, {2, 3, 1, 1}, {2, 0, 2, 1}, {2, 1, 2, 1}, {2, 2, 2, 1}, {2, 3, 2, 1},
+        {2, 4, 0, 1}, {2, 5, 0, 1}, {2, 4, 1, 1}, {2, 5, 1, 1}, {2, 4, 2, 1}, {2, 5, 2, 1},
+    };
+    for(index_t i = 0; i < tile_map.CalculateGridSize(c_grid_desc_m_n); i++)
+    {
+        auto ksplitm0n0_idx = tile_map.CalculateBottomIndex(make_multi_index(i));
+        std::cout << "block_1d_id = " << i << ", ksplit, m0, n0 = " << ksplitm0n0_idx[I0] << ", "
+                  << ksplitm0n0_idx[I1] << ", " << ksplitm0n0_idx[I2];
+        std::cout << ", valid = "
+                  << tile_map.ValidCTileIndex(ksplitm0n0_idx, make_tuple(MBlock, NBlock))
+                  << std::endl;
+        bool equal =
+            expected_ksplitidx_m0idx_n0idx_valid[i] ==
+            std::vector<int>{ksplitm0n0_idx[I0],
+                             ksplitm0n0_idx[I1],
+                             ksplitm0n0_idx[I2],
+                             tile_map.ValidCTileIndex(ksplitm0n0_idx, make_tuple(MBlock, NBlock))};
+        EXPECT_TRUE(equal);
+    }
+}
--- a/test/gemm/CMakeLists.txt
+++ b/test/gemm/CMakeLists.txt
-add_test_executable(test_gemm_fp32 gemm_fp32.cpp)
+# GEMM XDL
-target_link_libraries(test_gemm_fp32 PRIVATE host_tensor)
+add_test_executable(test_gemm_xdl_fp32 gemm_xdl_fp32.cpp)
-target_link_libraries(test_gemm_fp32 PRIVATE device_gemm_instance)
+target_link_libraries(test_gemm_xdl_fp32 PRIVATE host_tensor)
+target_link_libraries(test_gemm_xdl_fp32 PRIVATE device_gemm_instance)
-add_test_executable(test_gemm_fp16 gemm_fp16.cpp)
+add_test_executable(test_gemm_xdl_fp16 gemm_xdl_fp16.cpp)
-target_link_libraries(test_gemm_fp16 PRIVATE host_tensor)
+target_link_libraries(test_gemm_xdl_fp16 PRIVATE host_tensor)
-target_link_libraries(test_gemm_fp16 PRIVATE device_gemm_instance)
+target_link_libraries(test_gemm_xdl_fp16 PRIVATE device_gemm_instance)
-add_test_executable(test_gemm_bf16 gemm_bf16.cpp)
+add_test_executable(test_gemm_xdl_bf16 gemm_xdl_bf16.cpp)
-target_link_libraries(test_gemm_bf16 PRIVATE host_tensor)
+target_link_libraries(test_gemm_xdl_bf16 PRIVATE host_tensor)
-target_link_libraries(test_gemm_bf16 PRIVATE device_gemm_instance)
+target_link_libraries(test_gemm_xdl_bf16 PRIVATE device_gemm_instance)
-add_test_executable(test_gemm_int8 gemm_int8.cpp)
+add_test_executable(test_gemm_xdl_int8 gemm_xdl_int8.cpp)
-target_link_libraries(test_gemm_int8 PRIVATE host_tensor)
+target_link_libraries(test_gemm_xdl_int8 PRIVATE host_tensor)
-target_link_libraries(test_gemm_int8 PRIVATE device_gemm_instance)
+target_link_libraries(test_gemm_xdl_int8 PRIVATE device_gemm_instance)
+# GEMM DL
+add_test_executable(test_gemm_dl_fp32 gemm_dl_fp32.cpp)
+target_link_libraries(test_gemm_dl_fp32 PRIVATE host_tensor)
+target_link_libraries(test_gemm_dl_fp32 PRIVATE device_gemm_instance)
+add_test_executable(test_gemm_dl_fp16 gemm_dl_fp16.cpp)
+target_link_libraries(test_gemm_dl_fp16 PRIVATE host_tensor)
+target_link_libraries(test_gemm_dl_fp16 PRIVATE device_gemm_instance)
+add_test_executable(test_gemm_dl_int8 gemm_dl_int8.cpp)
+target_link_libraries(test_gemm_dl_int8 PRIVATE host_tensor)
+TArget_link_libraries(test_gemm_dl_int8 PRIVATE device_gemm_instance)
--- a/test/gemm/gemm_int8.cpp
+++ b/test/gemm/gemm_int8.cpp
 #include <algorithm>
 #include <cstdlib>
 #include <half.hpp>
 #include <iostream>
 #include <numeric>
 #include <tuple>
 #include <vector>
-#include "gemm_util.hpp"
+#include "../gemm/gemm_util.hpp"
 #include "config.hpp"
 #include "print.hpp"
 #include "device.hpp"
 #include "host_tensor.hpp"
 #include "host_tensor_generator.hpp"
 #include "host_gemm.hpp"
 #include "device_tensor.hpp"
-#include "device_gemm_xdl.hpp"
+#include "device_gemm_dl.hpp"
-#include "device_gemm_xdl_cshuffle.hpp"
+#include "element_wise_operation.hpp"
-#include "element_wise_operation.hpp"
+#include "reference_gemm.hpp"
-#include "reference_gemm.hpp"
+#include "gemm_specialization.hpp"
-#include "gemm_specialization.hpp"
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using DeviceGemmNoOpPtr =
-using DeviceGemmNoOpPtr =
+    ck::tensor_operation::device::DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
-    ck::tensor_operation::device::DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
+                                                ck::tensor_operation::element_wise::PassThrough,
-                                                ck::tensor_operation::element_wise::PassThrough,
+                                                ck::tensor_operation::element_wise::PassThrough>;
-                                                ck::tensor_operation::element_wise::PassThrough>;
+namespace ck {
-namespace ck {
+namespace tensor_operation {
-namespace tensor_operation {
+namespace device {
-namespace device {
+namespace device_gemm_instance {
-namespace device_gemm_instance {
-void add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_kn_mn_instances(
+void add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
-    std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
-void add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_nk_mn_instances(
+void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
-    std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
-void add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_nk_mn_instances(
-    std::vector<DeviceGemmNoOpPtr>&);
+} // namespace device_gemm_instance
-void add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_kn_mn_instances(
+} // namespace device
-    std::vector<DeviceGemmNoOpPtr>&);
+} // namespace tensor_operation
-} // namespace device_gemm_instance
+} // namespace ck
-} // namespace device
-} // namespace tensor_operation
+int main()
-} // namespace ck
+{
+    using ADataType   = ck::half_t;
-int main()
+    using BDataType   = ck::half_t;
-{
+    using CDataType   = ck::half_t;
-    using ADataType = int8_t;
+    using AccDataType = float;
-    using BDataType = int8_t;
-    using CDataType = int8_t;
+    using RowMajor    = ck::tensor_layout::gemm::RowMajor;
+    using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
-    using RowMajor    = ck::tensor_layout::gemm::RowMajor;
-    using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
+    bool res = true;
    std::vector<DeviceGemmNoOpPtr> gemmPtrs;
-    bool res = true;
+    ck::tensor_operation::device::device_gemm_instance::
-    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(gemmPtrs);
-        add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_kn_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
-    for(auto& gemmPtr : gemmPtrs)
+    {
-    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
-        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
+                                       ADataType,
-                                       ADataType,
+                                       BDataType,
-                                       BDataType,
+                                       CDataType,
-                                       CDataType,
+                                       AccDataType,
                                       ColumnMajor,
                                       RowMajor,
                                       RowMajor,
                                       PassThrough,
                                       PassThrough,
                                       PassThrough>{}(gemmPtr);
    }
    gemmPtrs.clear();
    ck::tensor_operation::device::device_gemm_instance::
-        add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_nk_mn_instances(gemmPtrs);
+        add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(gemmPtrs);
    for(auto& gemmPtr : gemmPtrs)
    {
        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
                                       ADataType,
                                       BDataType,
                                       CDataType,
-                                       ColumnMajor,
+                                       AccDataType,
                                       ColumnMajor,
-                                       RowMajor,
+                                       ColumnMajor,
-                                       PassThrough,
+                                       RowMajor,
                                       PassThrough,
-                                       PassThrough>{}(gemmPtr);
+                                       PassThrough,
-    }
+                                       PassThrough>{}(gemmPtr);
+    }
-    gemmPtrs.clear();
-    ck::tensor_operation::device::device_gemm_instance::
+    gemmPtrs.clear();
-        add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_kn_mn_instances(gemmPtrs);
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(gemmPtrs);
-    for(auto& gemmPtr : gemmPtrs)
-    {
+    for(auto& gemmPtr : gemmPtrs)
-        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
+    {
-                                       ADataType,
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
-                                       BDataType,
+                                       ADataType,
-                                       CDataType,
+                                       BDataType,
-                                       RowMajor,
+                                       CDataType,
-                                       RowMajor,
+                                       AccDataType,
                                       RowMajor,
-                                       PassThrough,
+                                       RowMajor,
-                                       PassThrough,
+                                       RowMajor,
-                                       PassThrough>{}(gemmPtr);
+                                       PassThrough,
-    }
+                                       PassThrough,
+                                       PassThrough>{}(gemmPtr);
-    gemmPtrs.clear();
+    }
-    ck::tensor_operation::device::device_gemm_instance::
-        add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_nk_mn_instances(gemmPtrs);
+    gemmPtrs.clear();
+    ck::tensor_operation::device::device_gemm_instance::
-    for(auto& gemmPtr : gemmPtrs)
+        add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(gemmPtrs);
-    {
-        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
+    for(auto& gemmPtr : gemmPtrs)
-                                       ADataType,
+    {
-                                       BDataType,
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
-                                       CDataType,
+                                       ADataType,
-                                       RowMajor,
+                                       BDataType,
-                                       ColumnMajor,
+                                       CDataType,
-                                       RowMajor,
+                                       AccDataType,
-                                       PassThrough,
+                                       RowMajor,
-                                       PassThrough,
+                                       ColumnMajor,
-                                       PassThrough>{}(gemmPtr);
+                                       RowMajor,
-    }
+                                       PassThrough,
+                                       PassThrough,
-    std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
+                                       PassThrough>{}(gemmPtr);
-    return res ? 0 : 1;
+    }
-}
+    std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
+    return res ? 0 : 1;
+}
--- a/test/gemm/gemm_dl_fp32.cpp
+++ b/test/gemm/gemm_dl_fp32.cpp
+#include <algorithm>
+#include <cstdlib>
+#include <half.hpp>
+#include <iostream>
+#include <numeric>
+#include <tuple>
+#include <vector>
+#include "../gemm/gemm_util.hpp"
+#include "config.hpp"
+#include "print.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "host_gemm.hpp"
+#include "device_tensor.hpp"
+#include "device_gemm_dl.hpp"
+#include "element_wise_operation.hpp"
+#include "reference_gemm.hpp"
+#include "gemm_specialization.hpp"
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using DeviceGemmNoOpPtr =
+    ck::tensor_operation::device::DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
+                                                ck::tensor_operation::element_wise::PassThrough,
+                                                ck::tensor_operation::element_wise::PassThrough>;
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_gemm_instance {
+void add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_dl_f32_f32_f32_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+} // namespace device_gemm_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+int main()
+{
+    using ADataType   = float;
+    using BDataType   = float;
+    using CDataType   = float;
+    using AccDataType = float;
+    using RowMajor    = ck::tensor_layout::gemm::RowMajor;
+    using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
+    bool res = true;
+    std::vector<DeviceGemmNoOpPtr> gemmPtrs;
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
+                                       ADataType,
+                                       BDataType,
+                                       CDataType,
+                                       AccDataType,
+                                       ColumnMajor,
+                                       RowMajor,
+                                       RowMajor,
+                                       PassThrough,
+                                       PassThrough,
+                                       PassThrough>{}(gemmPtr);
+    }
+    gemmPtrs.clear();
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
+                                       ADataType,
+                                       BDataType,
+                                       CDataType,
+                                       AccDataType,
+                                       ColumnMajor,
+                                       ColumnMajor,
+                                       RowMajor,
+                                       PassThrough,
+                                       PassThrough,
+                                       PassThrough>{}(gemmPtr);
+    }
+    gemmPtrs.clear();
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_dl_f32_f32_f32_mk_kn_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
+                                       ADataType,
+                                       BDataType,
+                                       CDataType,
+                                       AccDataType,
+                                       RowMajor,
+                                       RowMajor,
+                                       RowMajor,
+                                       PassThrough,
+                                       PassThrough,
+                                       PassThrough>{}(gemmPtr);
+    }
+    gemmPtrs.clear();
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
+                                       ADataType,
+                                       BDataType,
+                                       CDataType,
+                                       AccDataType,
+                                       RowMajor,
+                                       ColumnMajor,
+                                       RowMajor,
+                                       PassThrough,
+                                       PassThrough,
+                                       PassThrough>{}(gemmPtr);
+    }
+    std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
+    return res ? 0 : 1;
+}
--- a/test/gemm/gemm_dl_int8.cpp
+++ b/test/gemm/gemm_dl_int8.cpp
+#include <algorithm>
+#include <cstdlib>
+#include <half.hpp>
+#include <iostream>
+#include <numeric>
+#include <tuple>
+#include <vector>
+#include "../gemm/gemm_util.hpp"
+#include "config.hpp"
+#include "print.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "host_gemm.hpp"
+#include "device_tensor.hpp"
+#include "device_gemm_dl.hpp"
+#include "element_wise_operation.hpp"
+#include "reference_gemm.hpp"
+#include "gemm_specialization.hpp"
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using DeviceGemmNoOpPtr =
+    ck::tensor_operation::device::DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
+                                                ck::tensor_operation::element_wise::PassThrough,
+                                                ck::tensor_operation::element_wise::PassThrough>;
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_gemm_instance {
+void add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+} // namespace device_gemm_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+int main()
+{
+    using ADataType   = int8_t;
+    using BDataType   = int8_t;
+    using CDataType   = int8_t;
+    using AccDataType = int;
+    using RowMajor    = ck::tensor_layout::gemm::RowMajor;
+    using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
+    bool res = true;
+    std::vector<DeviceGemmNoOpPtr> gemmPtrs;
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
+                                       ADataType,
+                                       BDataType,
+                                       CDataType,
+                                       AccDataType,
+                                       ColumnMajor,
+                                       RowMajor,
+                                       RowMajor,
+                                       PassThrough,
+                                       PassThrough,
+                                       PassThrough>{}(gemmPtr);
+    }
+    gemmPtrs.clear();
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
+                                       ADataType,
+                                       BDataType,
+                                       CDataType,
+                                       AccDataType,
+                                       ColumnMajor,
+                                       ColumnMajor,
+                                       RowMajor,
+                                       PassThrough,
+                                       PassThrough,
+                                       PassThrough>{}(gemmPtr);
+    }
+    gemmPtrs.clear();
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
+                                       ADataType,
+                                       BDataType,
+                                       CDataType,
+                                       AccDataType,
+                                       RowMajor,
+                                       RowMajor,
+                                       RowMajor,
+                                       PassThrough,
+                                       PassThrough,
+                                       PassThrough>{}(gemmPtr);
+    }
+    gemmPtrs.clear();
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
+                                       ADataType,
+                                       BDataType,
+                                       CDataType,
+                                       AccDataType,
+                                       RowMajor,
+                                       ColumnMajor,
+                                       RowMajor,
+                                       PassThrough,
+                                       PassThrough,
+                                       PassThrough>{}(gemmPtr);
+    }
+    std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
+    return res ? 0 : 1;
+}
--- a/test/gemm/gemm_util.hpp
+++ b/test/gemm/gemm_util.hpp
@@ -60,7 +60,7 @@ template <typename DeviceGemmPtr_,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation>
-void RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,
+bool RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,
                   const ck::gemm_util::GemmParams& params,
                   const Tensor<ADataType>& A,
                   const Tensor<BDataType>& B,
@@ -73,9 +73,6 @@ void RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,
    DeviceMem b_k_n_device_buf(sizeof(BDataType) * B.mDesc.GetElementSpace());
    DeviceMem c_m_n_device_buf(sizeof(CDataType) * C.mDesc.GetElementSpace());
-    a_m_k_device_buf.ToDevice(A.mData.data());
-    b_k_n_device_buf.ToDevice(B.mData.data());
    auto invoker_ptr = gemmPtr->MakeInvokerPointer();
    auto argument_ptr =
        gemmPtr->MakeArgumentPointer(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
@@ -91,21 +88,30 @@ void RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,
                                     b_element_op,
                                     c_element_op);
-    if(!gemmPtr->IsSupportedArgument(argument_ptr.get()))
+    if(gemmPtr->IsSupportedArgument(argument_ptr.get()))
    {
-        throw std::runtime_error(
+        a_m_k_device_buf.ToDevice(A.mData.data());
-            "wrong! device_gemm with the specified compilation parameters does "
+        b_k_n_device_buf.ToDevice(B.mData.data());
-            "not support this GEMM problem");
+        invoker_ptr->Run(argument_ptr.get());
+        c_m_n_device_buf.FromDevice(C.mData.data());
+        return true;
    }
+    else
+    {
+        std::cout << "device_gemm with the specified compilation parameters does "
+                     "not support this GEMM problem"
+                  << std::endl;
-    invoker_ptr->Run(argument_ptr.get());
+        return false;
-    c_m_n_device_buf.FromDevice(C.mData.data());
+    }
 }
 template <typename DeviceGemmPtr_,
          typename ADataType,
          typename BDataType,
          typename CDataType,
+          typename AccDataType,
          typename ALayout,
          typename BLayout,
          typename CLayout,
@@ -181,6 +187,7 @@ struct TestGemm
            ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                      BDataType,
                                                      CDataType,
+                                                      AccDataType,
                                                      AElementwiseOperation,
                                                      BElementwiseOperation,
                                                      CElementwiseOperation>;
@@ -188,28 +195,40 @@ struct TestGemm
            a, b, c_host, a_element_op, b_element_op, c_element_op);
        // Act
-        ck::gemm_util::RunDeviceGEMM(
+        bool is_supported = ck::gemm_util::RunDeviceGEMM(
            gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op);
-        // Assert
+        if(is_supported)
-        bool res = false;
-        if(std::is_same<CDataType, float>::value)
        {
-            res = ck::utils::check_err(c_device.mData, c_host.mData);
+            // Assert
-            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
+            bool res = false;
+            if(std::is_same<CDataType, float>::value)
+            {
+                res = ck::utils::check_err(c_device.mData, c_host.mData);
+                std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
+            }
+            else if(std::is_same<CDataType, ck::half_t>::value)
+            {
+                res = ck::utils::check_err(c_device.mData, c_host.mData);
+                std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
+            }
+            else if(std::is_same<CDataType, int8_t>::value)
+            {
+                res = ck::utils::check_err(c_device.mData, c_host.mData);
+                std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
+            }
+            else if(std::is_same<CDataType, double>::value)
+            {
+                res = ck::utils::check_err(c_device.mData, c_host.mData);
+                std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
+            }
+            return res;
        }
-        else if(std::is_same<CDataType, ck::half_t>::value)
+        else
        {
-            res = ck::utils::check_err(c_device.mData, c_host.mData);
+            return true;
-            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
        }
-        else if(std::is_same<CDataType, int8_t>::value)
-        {
-            res = ck::utils::check_err(c_device.mData, c_host.mData);
-            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
-        }
-        return res;
    }
 };
@@ -299,6 +318,7 @@ struct TestGemmBF16
        // use fp32 host kernel to verify bf16 device kernel
        using ReferenceGemmInstance =
            ck::tensor_operation::host::ReferenceGemm<float,
+                                                      float,
                                                      float,
                                                      float,
                                                      AElementwiseOperation,

--- a/test/gemm/gemm_bf16.cpp
+++ b/test/gemm/gemm_bf16.cpp
--- a/test/gemm/gemm_fp16.cpp
+++ b/test/gemm/gemm_fp16.cpp
@@ -52,9 +52,10 @@ void add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(
 int main()
 {
-    using ADataType = ck::half_t;
+    using ADataType   = ck::half_t;
-    using BDataType = ck::half_t;
+    using BDataType   = ck::half_t;
-    using CDataType = ck::half_t;
+    using CDataType   = ck::half_t;
+    using AccDataType = float;
    using RowMajor    = ck::tensor_layout::gemm::RowMajor;
    using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
@@ -74,6 +75,7 @@ int main()
                                       ADataType,
                                       BDataType,
                                       CDataType,
+                                       AccDataType,
                                       ColumnMajor,
                                       RowMajor,
                                       RowMajor,
@@ -96,6 +98,7 @@ int main()
                                       ADataType,
                                       BDataType,
                                       CDataType,
+                                       AccDataType,
                                       ColumnMajor,
                                       ColumnMajor,
                                       RowMajor,
@@ -118,6 +121,7 @@ int main()
                                       ADataType,
                                       BDataType,
                                       CDataType,
+                                       AccDataType,
                                       RowMajor,
                                       RowMajor,
                                       RowMajor,
@@ -142,6 +146,7 @@ int main()
                                       ADataType,
                                       BDataType,
                                       CDataType,
+                                       AccDataType,
                                       RowMajor,
                                       ColumnMajor,
                                       RowMajor,

--- a/test/gemm/gemm_fp32.cpp
+++ b/test/gemm/gemm_fp32.cpp
@@ -53,9 +53,10 @@ void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances(std::vector<De
 int main()
 {
-    using ADataType = float;
+    using ADataType   = float;
-    using BDataType = float;
+    using BDataType   = float;
-    using CDataType = float;
+    using CDataType   = float;
+    using AccDataType = float;
    using RowMajor    = ck::tensor_layout::gemm::RowMajor;
    using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
@@ -75,6 +76,7 @@ int main()
                                       ADataType,
                                       BDataType,
                                       CDataType,
+                                       AccDataType,
                                       ColumnMajor,
                                       RowMajor,
                                       RowMajor,
@@ -97,6 +99,7 @@ int main()
                                       ADataType,
                                       BDataType,
                                       CDataType,
+                                       AccDataType,
                                       ColumnMajor,
                                       ColumnMajor,
                                       RowMajor,
@@ -119,6 +122,7 @@ int main()
                                       ADataType,
                                       BDataType,
                                       CDataType,
+                                       AccDataType,
                                       RowMajor,
                                       RowMajor,
                                       RowMajor,
@@ -141,6 +145,7 @@ int main()
                                       ADataType,
                                       BDataType,
                                       CDataType,
+                                       AccDataType,
                                       RowMajor,
                                       ColumnMajor,
                                       RowMajor,

--- a/test/gemm/gemm_xdl_fp64.cpp
+++ b/test/gemm/gemm_xdl_fp64.cpp
+#include <algorithm>
+#include <cstdlib>
+#include <half.hpp>
+#include <iostream>
+#include <numeric>
+#include <tuple>
+#include <vector>
+#include "gemm_util.hpp"
+#include "config.hpp"
+#include "print.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "host_gemm.hpp"
+#include "device_tensor.hpp"
+#include "device_gemm_xdl.hpp"
+#include "element_wise_operation.hpp"
+#include "reference_gemm.hpp"
+#include "gemm_specialization.hpp"
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using DeviceGemmNoOpPtr =
+    ck::tensor_operation::device::DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
+                                                ck::tensor_operation::element_wise::PassThrough,
+                                                ck::tensor_operation::element_wise::PassThrough>;
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_gemm_instance {
+void add_device_gemm_xdl_f64_f64_f64_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_xdl_f64_f64_f64_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+} // namespace device_gemm_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+inline std::string get_device_name()
+{
+    hipDeviceProp_t props{};
+    int device;
+    auto status = hipGetDevice(&device);
+    if(status != hipSuccess)
+    {
+        return std::string();
+    }
+    status = hipGetDeviceProperties(&props, device);
+    if(status != hipSuccess)
+    {
+        return std::string();
+    }
+    const std::string name(props.gcnArchName);
+    return name;
+}
+int main()
+{
+    if(get_device_name().find("gfx90a") == std::string::npos)
+    {
+        std::cout << "TestGemm ..... SUCCESS" << std::endl;
+        return 0;
+    }
+    using ADataType   = double;
+    using BDataType   = double;
+    using CDataType   = double;
+    using AccDataType = double;
+    using RowMajor    = ck::tensor_layout::gemm::RowMajor;
+    using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
+    bool res = true;
+    std::vector<DeviceGemmNoOpPtr> gemmPtrs;
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_f64_f64_f64_km_kn_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
+                                       ADataType,
+                                       BDataType,
+                                       CDataType,
+                                       AccDataType,
+                                       ColumnMajor,
+                                       RowMajor,
+                                       RowMajor,
+                                       PassThrough,
+                                       PassThrough,
+                                       PassThrough>{}(gemmPtr);
+    }
+    gemmPtrs.clear();
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_f64_f64_f64_km_nk_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
+                                       ADataType,
+                                       BDataType,
+                                       CDataType,
+                                       AccDataType,
+                                       ColumnMajor,
+                                       ColumnMajor,
+                                       RowMajor,
+                                       PassThrough,
+                                       PassThrough,
+                                       PassThrough>{}(gemmPtr);
+    }
+    gemmPtrs.clear();
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
+                                       ADataType,
+                                       BDataType,
+                                       CDataType,
+                                       AccDataType,
+                                       RowMajor,
+                                       RowMajor,
+                                       RowMajor,
+                                       PassThrough,
+                                       PassThrough,
+                                       PassThrough>{}(gemmPtr);
+    }
+    gemmPtrs.clear();
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
+                                       ADataType,
+                                       BDataType,
+                                       CDataType,
+                                       AccDataType,
+                                       RowMajor,
+                                       ColumnMajor,
+                                       RowMajor,
+                                       PassThrough,
+                                       PassThrough,
+                                       PassThrough>{}(gemmPtr);
+    }
+    std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
+    return res ? 0 : 1;
+}
--- a/test/gemm/gemm_xdl_int8.cpp
+++ b/test/gemm/gemm_xdl_int8.cpp
+#include <algorithm>
+#include <cstdlib>
+#include <half.hpp>
+#include <iostream>
+#include <numeric>
+#include <tuple>
+#include <vector>
+#include "gemm_util.hpp"
+#include "config.hpp"
+#include "print.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "host_gemm.hpp"
+#include "device_tensor.hpp"
+#include "device_gemm_xdl.hpp"
+#include "device_gemm_xdl_cshuffle.hpp"
+#include "element_wise_operation.hpp"
+#include "reference_gemm.hpp"
+#include "gemm_specialization.hpp"
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using DeviceGemmNoOpPtr =
+    ck::tensor_operation::device::DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
+                                                ck::tensor_operation::element_wise::PassThrough,
+                                                ck::tensor_operation::element_wise::PassThrough>;
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_gemm_instance {
+void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
+} // namespace device_gemm_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+int main()
+{
+    using ADataType   = int8_t;
+    using BDataType   = int8_t;
+    using CDataType   = int8_t;
+    using AccDataType = int32_t;
+    using RowMajor    = ck::tensor_layout::gemm::RowMajor;
+    using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
+    std::vector<DeviceGemmNoOpPtr> gemmPtrs;
+    bool res = true;
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
+                                       ADataType,
+                                       BDataType,
+                                       CDataType,
+                                       AccDataType,
+                                       ColumnMajor,
+                                       RowMajor,
+                                       RowMajor,
+                                       PassThrough,
+                                       PassThrough,
+                                       PassThrough>{}(gemmPtr);
+    }
+    gemmPtrs.clear();
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
+                                       ADataType,
+                                       BDataType,
+                                       CDataType,
+                                       AccDataType,
+                                       ColumnMajor,
+                                       ColumnMajor,
+                                       RowMajor,
+                                       PassThrough,
+                                       PassThrough,
+                                       PassThrough>{}(gemmPtr);
+    }
+    gemmPtrs.clear();
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
+                                       ADataType,
+                                       BDataType,
+                                       CDataType,
+                                       AccDataType,
+                                       RowMajor,
+                                       RowMajor,
+                                       RowMajor,
+                                       PassThrough,
+                                       PassThrough,
+                                       PassThrough>{}(gemmPtr);
+    }
+    gemmPtrs.clear();
+    ck::tensor_operation::device::device_gemm_instance::
+        add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(gemmPtrs);
+    for(auto& gemmPtr : gemmPtrs)
+    {
+        res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
+                                       ADataType,
+                                       BDataType,
+                                       CDataType,
+                                       AccDataType,
+                                       RowMajor,
+                                       ColumnMajor,
+                                       RowMajor,
+                                       PassThrough,
+                                       PassThrough,
+                                       PassThrough>{}(gemmPtr);
+    }
+    std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
+    return res ? 0 : 1;
+}
--- a/test/grouped_gemm/grouped_gemm_fp16.cpp
+++ b/test/grouped_gemm/grouped_gemm_fp16.cpp
@@ -151,8 +151,13 @@ bool TestGroupedGemm(DeviceGroupedGemmPtr_& groupedGemmPtr)
    {
        c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data());
-        using ReferenceGemmInstance = ck::tensor_operation::host::
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
-            ReferenceGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
+                                                                                BDataType,
+                                                                                CDataType,
+                                                                                AccDataType,
+                                                                                PassThrough,
+                                                                                PassThrough,
+                                                                                PassThrough>;
        auto ref_gemm    = ReferenceGemmInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();

--- a/test/reduce/reduce_no_index.cpp
+++ b/test/reduce/reduce_no_index.cpp
 #include "getopt.h"
-#include "check_err.hpp"
+#include "host_common_util.hpp"
-#include "device_reduce_instance.hpp"
+#include "profile_reduce_impl.hpp"
-#include "reduction_enums.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "host_reduction.hpp"
-#include "reduce_util.hpp"
 using namespace ck;
-namespace {
-template <index_t Rank, index_t NumReduceDim>
-static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
-{
-    assert(NumReduceDim == reduceDims.size());
-    int reduceFlag = 0;
-    // flag the bits for the reduceDims
-    for(int i = 0; i < NumReduceDim; i++)
-    {
-        reduceFlag |= 1 << reduceDims[i];
-    };
-    std::vector<int> invariantDims;
-    // collect invariant dimensions
-    for(int i = 0; i < Rank; i++)
-        if((reduceFlag & (1 << i)) == 0)
-        {
-            invariantDims.push_back(i);
-        };
-    return invariantDims;
-};
-constexpr int Rank = 4;
-constexpr ReduceTensorOp ReduceOpId      = ReduceTensorOp::AVG;
-constexpr NanPropagation NanOpt          = NanPropagation::PROPAGATE_NAN;
-constexpr bool PropagateNan              = false;
-constexpr ReduceTensorIndices IndicesOpt = ReduceTensorIndices::NO_INDICES;
-constexpr bool NeedIndices               = false;
-template <typename InDataType,
-          typename AccDataType,
-          typename OutDataType,
-          int Rank,
-          int NumReduceDim>
-bool test_reduce_no_index_impl(int init_method,
-                               const std::vector<size_t>& inLengths,
-                               const std::vector<int>& reduceDims,
-                               float alpha,
-                               float beta)
-{
-    using namespace ck::tensor_operation::device;
-    using namespace ck::tensor_operation::device::device_reduce_instance;
-    using namespace ck::host_reduce;
-    constexpr bool out_support_atomic_add = std::is_same<OutDataType, float>::value;
-    constexpr bool op_support_atomic_add  = true;
-    constexpr bool use_atomic_add         = (out_support_atomic_add && op_support_atomic_add);
-    Tensor<InDataType> in(inLengths);
-    std::vector<size_t> outLengths;
-    const auto invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);
-    if(reduceDims.size() == Rank)
-        outLengths.push_back(1);
-    else
-        for(auto dim : invariantDims)
-            outLengths.push_back(inLengths[dim]);
-    Tensor<OutDataType> out_ref(outLengths);
-    Tensor<OutDataType> out(outLengths);
-    // only used when the OutDataType is bhalf_t
-    Tensor<float> out_ref_fp32(outLengths);
-    Tensor<float> out_fp32(outLengths);
-    auto inStrides  = in.mDesc.GetStrides();
-    auto outStrides = out.mDesc.GetStrides();
-    size_t invariant_total_length = out.mDesc.GetElementSize();
-    size_t reduce_total_length    = in.mDesc.GetElementSize() / invariant_total_length;
-    std::size_t num_thread = 1;
-    switch(init_method)
-    {
-    case 0: break;
-    case 1:
-        in.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread);
-        if(beta != 0.0f)
-            out_ref.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread);
-        break;
-    case 2:
-        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
-        if(beta != 0.0f)
-            out_ref.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
-        break;
-    default:
-        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
-        if(beta != 0.0f)
-            out_ref.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
-    }
-    if(beta != 0.0f)
-        for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++)
-            out.mData[i] = out_ref.mData[i];
-    // these buffers are usually provided by the user application
-    DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace());
-    DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace());
-    in_dev.ToDevice(in.mData.data());
-    if(beta != 0.0f)
-        out_dev.ToDevice(out.mData.data());
-    using InElementwiseOperation_0 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
-    using AccElementwiseOperation_0 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
-            AccElementwiseOperation;
-    using InElementwiseOperation_1 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::
-            InElementwiseOperation;
-    using AccElementwiseOperation_1 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::
-            AccElementwiseOperation;
-    using InElementwiseOperation_2 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::
-            InElementwiseOperation;
-    using AccElementwiseOperation_2 =
-        typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::
-            AccElementwiseOperation;
-    using DeviceReduceInstPtr0 =
-        DeviceReducePtr<InElementwiseOperation_0, AccElementwiseOperation_0>;
-    using DeviceReduceInstPtr1 =
-        DeviceReducePtr<InElementwiseOperation_1, AccElementwiseOperation_1>;
-    using DeviceReduceInstPtr2 =
-        DeviceReducePtr<InElementwiseOperation_2, AccElementwiseOperation_2>;
-    std::vector<DeviceReduceInstPtr0> reduce0_ptrs;
-    std::vector<DeviceReduceInstPtr1> reduce1_ptrs;
-    std::vector<DeviceReduceInstPtr2> reduce2_ptrs;
-    add_device_reduce_instance_threadwise<InDataType,
-                                          AccDataType,
-                                          OutDataType,
-                                          Rank,
-                                          NumReduceDim,
-                                          ReduceOpId,
-                                          NanOpt,
-                                          IndicesOpt>(reduce0_ptrs);
-    add_device_reduce_instance_blockwise<InDataType,
-                                         AccDataType,
-                                         OutDataType,
-                                         Rank,
-                                         NumReduceDim,
-                                         ReduceOpId,
-                                         NanOpt,
-                                         IndicesOpt>(reduce0_ptrs);
-    if constexpr(use_atomic_add)
-    {
-        add_device_reduce_instance_multiblock_atomic_add<InDataType,
-                                                         AccDataType,
-                                                         OutDataType,
-                                                         Rank,
-                                                         NumReduceDim,
-                                                         ReduceOpId,
-                                                         NanOpt,
-                                                         IndicesOpt>(reduce0_ptrs);
-    }
-    else
-    {
-        add_device_reduce_instance_multiblock_partial_reduce<InDataType,
-                                                             AccDataType,
-                                                             OutDataType,
-                                                             Rank,
-                                                             NumReduceDim,
-                                                             ReduceOpId,
-                                                             NanOpt,
-                                                             IndicesOpt>(reduce1_ptrs);
-    };
-    // used for secondary reduction
-    if constexpr(!use_atomic_add)
-    {
-        add_device_reduce_instance_blockwise_second_call<AccDataType,
-                                                         AccDataType,
-                                                         OutDataType,
-                                                         Rank,
-                                                         NumReduceDim,
-                                                         ReduceOpId,
-                                                         NanOpt,
-                                                         IndicesOpt>(reduce2_ptrs);
-    };
-    if(reduce0_ptrs.empty() && reduce1_ptrs.empty())
-    {
-        throw std::runtime_error("Wrong! No device REDUCE instance found");
-    };
-    bool result = true;
-    ReductionHost<InDataType,
-                  AccDataType,
-                  OutDataType,
-                  ReduceOpId,
-                  Rank,
-                  NumReduceDim,
-                  PropagateNan,
-                  NeedIndices>
-        hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
-    hostReduce.Run(alpha, in.mData.data(), beta, out_ref.mData.data(), nullptr);
-    const auto i_inLengths  = to_int_vector(inLengths);
-    const auto i_inStrides  = to_int_vector(inStrides);
-    const auto i_outLengths = to_int_vector(outLengths);
-    const auto i_outStrides = to_int_vector(outStrides);
-    for(auto& reduce_ptr : reduce0_ptrs)
-    {
-        auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
-        DeviceMem ws_dev(wsSizeInBytes);
-        InElementwiseOperation_0 in_elementwise_op_0(static_cast<int32_t>(reduce_total_length));
-        AccElementwiseOperation_0 acc_elementwise_op_0(static_cast<int32_t>(reduce_total_length));
-        auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
-                                                            i_inStrides,
-                                                            i_outLengths,
-                                                            i_outStrides,
-                                                            reduceDims,
-                                                            alpha,
-                                                            beta,
-                                                            in_dev.GetDeviceBuffer(),
-                                                            out_dev.GetDeviceBuffer(),
-                                                            nullptr,
-                                                            ws_dev.GetDeviceBuffer(),
-                                                            in_elementwise_op_0,
-                                                            acc_elementwise_op_0);
-        if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
-            continue;
-        auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
-        (void)invoker_ptr->Run(argument_ptr.get());
-        out_dev.FromDevice(out.mData.data());
-        bool single_result = true;
-        if constexpr(std::is_same<OutDataType, ck::half_t>::value ||
-                     std::is_same<OutDataType, ck::bhalf_t>::value)
-        {
-            reduce_util::to_f32_vector(out, out_fp32);
-            reduce_util::to_f32_vector(out_ref, out_ref_fp32);
-            single_result = ck::utils::check_err(
-                out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!");
-        }
-        else
-        {
-            single_result =
-                ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
-        };
-        if(!single_result)
-        {
-            std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << std::endl;
-            result = false;
-        }
-    };
-    for(auto& reduce_ptr : reduce1_ptrs)
-    {
-        auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
-        DeviceMem ws_dev(wsSizeInBytes);
-        InElementwiseOperation_1 in_elementwise_op_1(static_cast<int32_t>(reduce_total_length));
-        AccElementwiseOperation_1 acc_elementwise_op_1(static_cast<int32_t>(reduce_total_length));
-        auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
-                                                            i_inStrides,
-                                                            i_outLengths,
-                                                            i_outStrides,
-                                                            reduceDims,
-                                                            alpha,
-                                                            beta,
-                                                            in_dev.GetDeviceBuffer(),
-                                                            out_dev.GetDeviceBuffer(),
-                                                            nullptr,
-                                                            ws_dev.GetDeviceBuffer(),
-                                                            in_elementwise_op_1,
-                                                            acc_elementwise_op_1);
-        if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
-            continue;
-        auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
-        (void)invoker_ptr->Run(argument_ptr.get());
-        std::vector<int> inLengths2 = reduce_ptr->GetWorkspace2dLengths(argument_ptr.get());
-        std::vector<int> inStrides2{inLengths2[1], 1};
-        for(auto& reduce2_ptr : reduce2_ptrs)
-        {
-            InElementwiseOperation_2 in_elementwise_op_2(static_cast<int32_t>(reduce_total_length));
-            AccElementwiseOperation_2 acc_elementwise_op_2(
-                static_cast<int32_t>(reduce_total_length));
-            auto argument2_ptr = reduce2_ptr->MakeArgumentPointer(inLengths2,
-                                                                  inStrides2,
-                                                                  i_outLengths,
-                                                                  i_outStrides,
-                                                                  reduceDims,
-                                                                  alpha,
-                                                                  beta,
-                                                                  ws_dev.GetDeviceBuffer(),
-                                                                  out_dev.GetDeviceBuffer(),
-                                                                  nullptr,
-                                                                  ws_dev.GetDeviceBuffer(),
-                                                                  in_elementwise_op_2,
-                                                                  acc_elementwise_op_2);
-            if(!reduce2_ptr->IsSupportedArgument(argument2_ptr.get()))
-                continue;
-            std::string reduce2_name = reduce2_ptr->GetTypeString();
-            auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer();
-            (void)invoker2_ptr->Run(argument2_ptr.get());
-            out_dev.FromDevice(out.mData.data());
-            bool single_result = true;
-            if constexpr(std::is_same<OutDataType, ck::half_t>::value ||
-                         std::is_same<OutDataType, ck::bhalf_t>::value)
-            {
-                reduce_util::to_f32_vector(out, out_fp32);
-                reduce_util::to_f32_vector(out_ref, out_ref_fp32);
-                single_result = ck::utils::check_err(
-                    out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!");
-            }
-            else
-            {
-                single_result =
-                    ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
-            };
-            if(!single_result)
-            {
-                std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << " => "
-                          << reduce2_ptr->GetTypeString() << std::endl;
-                result = false;
-            }
-        };
-    };
-    return (result);
-};
-} // anonymous namespace
 static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
                                       {"reduceDimensions", required_argument, nullptr, 'R'},
                                       {"scales", required_argument, nullptr, 'S'},
@@ -387,48 +13,6 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,
 class SimpleAppArgs
 {
-    template <typename T>
-    static T getSingleValueFromString(const std::string& valueStr)
-    {
-        std::istringstream iss(valueStr);
-        T ret;
-        iss >> ret;
-        return (ret);
-    };
-    template <typename T>
-    static std::vector<T> getTypeValuesFromString(const char* cstr_values)
-    {
-        std::string valuesStr(cstr_values);
-        std::vector<T> values;
-        std::size_t pos = 0;
-        std::size_t new_pos;
-        new_pos = valuesStr.find(',', pos);
-        while(new_pos != std::string::npos)
-        {
-            const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
-            T val = getSingleValueFromString<T>(sliceStr);
-            values.push_back(val);
-            pos     = new_pos + 1;
-            new_pos = valuesStr.find(',', pos);
-        };
-        std::string sliceStr = valuesStr.substr(pos);
-        T val                = getSingleValueFromString<T>(sliceStr);
-        values.push_back(val);
-        return (values);
-    };
    private:
    int option_index = 0;
@@ -460,6 +44,8 @@ class SimpleAppArgs
    int processArgs(int argc, char* argv[])
    {
+        using ck::host_common::getTypeValuesFromString;
        int ch;
        while(1)
@@ -514,7 +100,7 @@ class SimpleAppArgs
           (reduceDims.size() != 1 && reduceDims.size() != 3 && reduceDims.size() != 4))
            return (-1);
-        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5)
+        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
            return (-1);
        return (0);
@@ -525,87 +111,92 @@ bool test_reduce_no_index(int data_type,
                          int init_method,
                          std::vector<int> reduceDims,
                          std::vector<size_t> inLengths,
+                          ReduceTensorOp reduceOpId,
+                          bool propagateNan,
                          float alpha,
                          float beta)
 {
+    using ck::profiler::profile_reduce_impl;
    bool result = true;
    if(data_type == 0)
    {
-        switch(reduceDims.size())
+        result = profile_reduce_impl<float, float, float>(true,
-        {
+                                                          init_method,
-        case 1:
+                                                          false,
-            result = test_reduce_no_index_impl<float, float, float, Rank, 1>(
+                                                          false,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                          inLengths,
-            break;
+                                                          reduceDims,
-        case 3:
+                                                          reduceOpId,
-            result = test_reduce_no_index_impl<float, float, float, Rank, 3>(
+                                                          propagateNan,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                          false,
-            break;
+                                                          alpha,
-        case 4:
+                                                          beta);
-            result = test_reduce_no_index_impl<float, float, float, Rank, 4>(
-                init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
    }
    else if(data_type == 1)
    {
-        switch(reduceDims.size())
+        result = profile_reduce_impl<ck::half_t, float, ck::half_t>(true,
-        {
+                                                                    init_method,
-        case 1:
+                                                                    false,
-            result = test_reduce_no_index_impl<ck::half_t, float, ck::half_t, Rank, 1>(
+                                                                    false,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                                    inLengths,
-            break;
+                                                                    reduceDims,
-        case 3:
+                                                                    reduceOpId,
-            result = test_reduce_no_index_impl<ck::half_t, float, ck::half_t, Rank, 3>(
+                                                                    propagateNan,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                                    false,
-            break;
+                                                                    alpha,
-        case 4:
+                                                                    beta);
-            result = test_reduce_no_index_impl<ck::half_t, float, ck::half_t, Rank, 4>(
-                init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
    }
    else if(data_type == 3)
    {
-        switch(reduceDims.size())
+        result = profile_reduce_impl<int8_t, int32_t, int8_t>(true,
-        {
+                                                              init_method,
-        case 1:
+                                                              false,
-            result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 1>(
+                                                              false,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                              inLengths,
-            break;
+                                                              reduceDims,
-        case 3:
+                                                              reduceOpId,
-            result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 3>(
+                                                              propagateNan,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                              false,
-            break;
+                                                              alpha,
-        case 4:
+                                                              beta);
-            result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 4>(
-                init_method, inLengths, reduceDims, alpha, beta);
-            break;
-        };
    }
    else if(data_type == 5)
    {
-        switch(reduceDims.size())
+        result = profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(true,
-        {
+                                                                      init_method,
-        case 1:
+                                                                      false,
-            result = test_reduce_no_index_impl<ck::bhalf_t, float, ck::bhalf_t, Rank, 1>(
+                                                                      false,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                                      inLengths,
-            break;
+                                                                      reduceDims,
-        case 3:
+                                                                      reduceOpId,
-            result = test_reduce_no_index_impl<ck::bhalf_t, float, ck::bhalf_t, Rank, 3>(
+                                                                      propagateNan,
-                init_method, inLengths, reduceDims, alpha, beta);
+                                                                      false,
-            break;
+                                                                      alpha,
-        case 4:
+                                                                      beta);
-            result = test_reduce_no_index_impl<ck::bhalf_t, float, ck::bhalf_t, Rank, 4>(
+    }
-                init_method, inLengths, reduceDims, alpha, beta);
+    else if(data_type == 6)
-            break;
+    {
-        };
+        result = profile_reduce_impl<double, double, double>(true,
+                                                             init_method,
+                                                             false,
+                                                             false,
+                                                             inLengths,
+                                                             reduceDims,
+                                                             reduceOpId,
+                                                             propagateNan,
+                                                             false,
+                                                             alpha,
+                                                             beta);
    }
    return (result);
 };
+constexpr ReduceTensorOp reduceOpId = ReduceTensorOp::AVG;
+constexpr bool propagateNan         = false;
 int main(int argc, char* argv[])
 {
    SimpleAppArgs args;
@@ -621,8 +212,14 @@ int main(int argc, char* argv[])
            {0, 1, 2, 3}, {0, 1, 2}, {1, 2, 3}, {0, 1, 3}, {0, 2, 3}, {0}, {1}, {2}, {3}};
        for(auto& reduceDims : v_reduceDims)
-            result = result && test_reduce_no_index(
+            result = result && test_reduce_no_index(data_type,
-                                   data_type, init_method, reduceDims, inLengths, 1.0f, 0.0f);
+                                                    init_method,
+                                                    reduceDims,
+                                                    inLengths,
+                                                    reduceOpId,
+                                                    propagateNan,
+                                                    1.0f,
+                                                    0.0f);
    }
    else
    {
@@ -636,6 +233,8 @@ int main(int argc, char* argv[])
                                      args.init_method,
                                      args.reduceDims,
                                      args.inLengths,
+                                      reduceOpId,
+                                      propagateNan,
                                      args.scales[0],
                                      args.scales[1]);
    }

--- a/test/reduce/reduce_util.hpp
+++ b/test/reduce/reduce_util.hpp
-#ifndef REDUCE_UTILS_HPP
-#define REDUCE_UTILS_HPP
-#include "data_type.hpp"
-namespace ck {
-namespace reduce_util {
-template <typename T>
-void to_f32_vector(const Tensor<T>& src, Tensor<float>& dst)
-{
-    for(std::size_t i = 0; i < src.mData.size(); ++i)
-        dst.mData[i] = type_convert<float>(src.mData[i]);
-}
-} // namespace reduce_util
-} // namespace ck
-#endif