Commit d8f1458f authored by Jing Zhang

Merge remote-tracking branch 'origin/develop' into grouped_gemm_args_const_buff

parents 6e983ba2 40b59a63
......@@ -144,5 +144,5 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
throw std::runtime_error("wrong! this data_type & layout is not implemented");
}
return 1;
return 0;
}
......@@ -142,5 +142,5 @@ int profile_gemm_reduce(int argc, char* argv[])
throw std::runtime_error("wrong! this data_type & layout is not implemented");
}
return 1;
return 0;
}
......@@ -153,5 +153,5 @@ int profile_grouped_gemm(int argc, char* argv[])
throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
}
return 1;
return 0;
}
#include <iostream>
#include <fstream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <vector>
#include <stdexcept>
#include <sstream>
#include <getopt.h>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "data_type_enum.hpp"
#include "reduction_enums.hpp"
#include "host_common_util.hpp"
#include "profile_reduce_impl.hpp"
using namespace std;
using ck::NanPropagation;
using ck::ReduceTensorIndices;
using ck::ReduceTensorOp;
static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
......@@ -38,63 +30,9 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,
{"bf16", no_argument, nullptr, '?'},
{"dumpout", required_argument, nullptr, 'o'},
{"verify", required_argument, nullptr, 'v'},
{"log", required_argument, nullptr, 'l'},
{"help", no_argument, nullptr, '?'},
{nullptr, 0, nullptr, 0}};
template <typename T>
static T getSingleValueFromString(const string& valueStr)
{
std::istringstream iss(valueStr);
T val;
iss >> val;
return (val);
};
template <typename T>
static std::vector<T> getTypeValuesFromString(const char* cstr_values)
{
std::string valuesStr(cstr_values);
std::vector<T> values;
std::size_t pos = 0;
std::size_t new_pos;
new_pos = valuesStr.find(',', pos);
while(new_pos != std::string::npos)
{
const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
T val = getSingleValueFromString<T>(sliceStr);
values.push_back(val);
pos = new_pos + 1;
new_pos = valuesStr.find(',', pos);
};
std::string sliceStr = valuesStr.substr(pos);
T val = getSingleValueFromString<T>(sliceStr);
values.push_back(val);
return (values);
}
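// Usage sketch for the helper above (illustrative values): the -D option payload
// "64,4,280,82" parses to std::vector<int>{64, 4, 280, 82}, and the -S payload
// "1.0,0.0" parses to std::vector<float>{1.0f, 0.0f} for alpha and beta.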
enum struct AppDataType
{
appHalf = 0,
appFloat = 1,
appInt32 = 2,
appInt8 = 3,
appInt8x4 = 4,
appBFloat16 = 5,
appDouble = 6,
};
static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims)
{
for(auto dim : reduceDims)
......@@ -113,7 +51,7 @@ static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims
};
};
class AppArgs
class ReduceProfilerArgs
{
private:
int option_index = 0;
......@@ -130,26 +68,23 @@ class AppArgs
std::vector<float> scales;
ReduceTensorOp reduceOp = ReduceTensorOp::ADD;
AppDataType compTypeId = AppDataType::appFloat;
AppDataType outTypeId = AppDataType::appFloat;
ReduceTensorOp reduceOp = ReduceTensorOp::ADD;
ck::DataTypeEnum compTypeId = ck::DataTypeEnum::Float;
ck::DataTypeEnum outTypeId = ck::DataTypeEnum::Float;
bool compType_assigned = false;
bool outType_assigned = false;
NanPropagation nanOpt = NanPropagation::NOT_PROPAGATE_NAN;
ReduceTensorIndices indicesOpt = ReduceTensorIndices::NO_INDICES;
bool do_log = false;
bool do_verification = false;
bool do_dumpout = false;
int nanOpt = 0;
int indicesOpt = 0;
bool do_verification = false;
bool do_dumpout = false;
int init_method;
bool time_kernel;
bool need_indices = false;
AppArgs() = default;
~AppArgs() = default;
ReduceProfilerArgs() = default;
~ReduceProfilerArgs() = default;
void show_usage(const char* cmd)
{
......@@ -166,8 +101,11 @@ class AppArgs
std::cout << "--outType or -W, optional enum value indicating the type of the reduced "
"output, which could be float when the input data is half"
<< std::endl;
std::cout << "--nanOpt or -N, enum value indicates the selection for NanOpt" << std::endl;
std::cout << "--indicesOpt or -I, enum value indicates the selection for IndicesOpt"
std::cout
<< "--nanOpt or -N, 1/0 value indicates the selection to use or not use Nan-Propagation"
<< std::endl;
std::cout << "--indicesOpt or -I, 1/0 value indicates the selection to use or not use "
"index in reduction"
<< std::endl;
std::cout << "--scales or -S, comma separated two float values for alpha and beta"
<< std::endl;
......@@ -181,18 +119,19 @@ class AppArgs
std::cout << "--dumpout or -o, 1/0 to indicate where to save the reduction result to files "
"for further analysis"
<< std::endl;
std::cout << "--log or -l, 1/0 to indicate whether to log some information" << std::endl;
};
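// Example invocation (illustrative values; the two trailing positional
// arguments are init_method and time_kernel):
//   ckProfiler reduce -D 64,4,280,82 -R 0,1 -S 1.0,0.0 -v 1 2 1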
int processArgs(int argc, char* argv[])
{
using ck::host_common::getTypeValuesFromString;
int ch;
optind++; // to skip the "reduce" module name
while(1)
{
ch = getopt_long(argc, argv, "D:R:O:C:W:N:I:S:v:o:l:", long_options, &option_index);
ch = getopt_long(argc, argv, "D:R:O:C:W:N:I:S:v:o:", long_options, &option_index);
if(ch == -1)
break;
switch(ch)
......@@ -219,27 +158,27 @@ class AppArgs
if(!optarg)
throw std::runtime_error("Invalid option format!");
compTypeId = static_cast<AppDataType>(std::atoi(optarg));
compTypeId = static_cast<ck::DataTypeEnum>(std::atoi(optarg));
compType_assigned = true;
break;
case 'W':
if(!optarg)
throw std::runtime_error("Invalid option format!");
outTypeId = static_cast<AppDataType>(std::atoi(optarg));
outTypeId = static_cast<ck::DataTypeEnum>(std::atoi(optarg));
outType_assigned = true;
break;
case 'N':
if(!optarg)
throw std::runtime_error("Invalid option format!");
nanOpt = static_cast<NanPropagation>(std::atoi(optarg));
nanOpt = std::atoi(optarg);
break;
case 'I':
if(!optarg)
throw std::runtime_error("Invalid option format!");
indicesOpt = static_cast<ReduceTensorIndices>(std::atoi(optarg));
indicesOpt = std::atoi(optarg);
break;
case 'S':
if(!optarg)
......@@ -262,12 +201,6 @@ class AppArgs
do_dumpout = static_cast<bool>(std::atoi(optarg));
break;
case 'l':
if(!optarg)
throw std::runtime_error("Invalid option format!");
do_log = static_cast<bool>(std::atoi(optarg));
break;
case '?':
if(std::string(long_options[option_index].name) == "half")
use_half = true;
......@@ -295,7 +228,7 @@ class AppArgs
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
init_method = std::atoi(argv[optind++]);
time_kernel = std::atoi(argv[optind]);
time_kernel = static_cast<bool>(std::atoi(argv[optind]));
if(scales.empty())
{
......@@ -306,9 +239,6 @@ class AppArgs
if(reduceOp == ReduceTensorOp::MIN || reduceOp == ReduceTensorOp::MAX ||
reduceOp == ReduceTensorOp::AMAX)
{
if(indicesOpt != ReduceTensorIndices::NO_INDICES)
need_indices = true;
// for indexable operations, no need to assign compType and outType, just let them be
// same as inType
compType_assigned = false;
......@@ -322,9 +252,10 @@ class AppArgs
int profile_reduce(int argc, char* argv[])
{
using namespace ck::profiler;
using ck::DataTypeEnum;
using ck::profiler::profile_reduce_impl;
AppArgs args;
ReduceProfilerArgs args;
if(args.processArgs(argc, argv) < 0)
return (-1);
......@@ -339,42 +270,41 @@ int profile_reduce(int argc, char* argv[])
if(args.use_half)
{
if(!args.compType_assigned)
args.compTypeId = AppDataType::appHalf;
args.compTypeId = DataTypeEnum::Half;
if(args.outType_assigned &&
(args.outTypeId != AppDataType::appHalf && args.outTypeId != AppDataType::appFloat))
args.outTypeId = AppDataType::appFloat;
(args.outTypeId != DataTypeEnum::Half && args.outTypeId != DataTypeEnum::Float))
args.outTypeId = DataTypeEnum::Float;
if(!args.outType_assigned)
args.outTypeId = AppDataType::appHalf;
args.outTypeId = DataTypeEnum::Half;
if(args.compTypeId == AppDataType::appHalf)
if(args.compTypeId == DataTypeEnum::Half)
{
profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(args.do_verification,
args.init_method,
args.do_log,
args.do_dumpout,
args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
args.scales[0],
args.scales[1]);
profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(
args.do_verification,
args.init_method,
args.do_dumpout,
args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
static_cast<bool>(args.nanOpt),
static_cast<bool>(args.indicesOpt),
args.scales[0],
args.scales[1]);
}
else if(args.compTypeId == AppDataType::appFloat)
else if(args.compTypeId == DataTypeEnum::Float)
{
profile_reduce_impl<ck::half_t, float, ck::half_t>(args.do_verification,
args.init_method,
args.do_log,
args.do_dumpout,
args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
static_cast<bool>(args.nanOpt),
static_cast<bool>(args.indicesOpt),
args.scales[0],
args.scales[1]);
}
......@@ -385,56 +315,53 @@ int profile_reduce(int argc, char* argv[])
{
profile_reduce_impl<double, double, double>(args.do_verification,
args.init_method,
args.do_log,
args.do_dumpout,
args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
static_cast<bool>(args.nanOpt),
static_cast<bool>(args.indicesOpt),
args.scales[0],
args.scales[1]);
}
else if(args.use_int8)
{
if(!args.compType_assigned)
args.compTypeId = AppDataType::appInt8;
args.compTypeId = DataTypeEnum::Int8;
if(args.outType_assigned &&
(args.outTypeId != AppDataType::appInt8 && args.outTypeId != AppDataType::appInt32))
args.outTypeId = AppDataType::appInt32;
(args.outTypeId != DataTypeEnum::Int8 && args.outTypeId != DataTypeEnum::Int32))
args.outTypeId = DataTypeEnum::Int32;
if(!args.outType_assigned)
args.outTypeId = AppDataType::appInt8;
args.outTypeId = DataTypeEnum::Int8;
if(args.compTypeId == AppDataType::appInt8)
if(args.compTypeId == DataTypeEnum::Int8)
{
profile_reduce_impl<int8_t, int8_t, int8_t>(args.do_verification,
args.init_method,
args.do_log,
args.do_dumpout,
args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
static_cast<bool>(args.nanOpt),
static_cast<bool>(args.indicesOpt),
args.scales[0],
args.scales[1]);
}
else if(args.compTypeId == AppDataType::appInt32)
else if(args.compTypeId == DataTypeEnum::Int32)
{
profile_reduce_impl<int8_t, int32_t, int8_t>(args.do_verification,
args.init_method,
args.do_log,
args.do_dumpout,
args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
static_cast<bool>(args.nanOpt),
static_cast<bool>(args.indicesOpt),
args.scales[0],
args.scales[1]);
}
......@@ -444,54 +371,51 @@ int profile_reduce(int argc, char* argv[])
else if(args.use_bf16)
{
if(args.outType_assigned &&
(args.outTypeId != AppDataType::appBFloat16 && args.outTypeId != AppDataType::appFloat))
args.outTypeId = AppDataType::appFloat;
(args.outTypeId != DataTypeEnum::BFloat16 && args.outTypeId != DataTypeEnum::Float))
args.outTypeId = DataTypeEnum::Float;
if(!args.outType_assigned)
args.outTypeId = AppDataType::appBFloat16;
args.outTypeId = DataTypeEnum::BFloat16;
profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(args.do_verification,
args.init_method,
args.do_log,
args.do_dumpout,
args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
static_cast<bool>(args.nanOpt),
static_cast<bool>(args.indicesOpt),
args.scales[0],
args.scales[1]);
}
else
{
if(args.compTypeId == AppDataType::appFloat)
if(args.compTypeId == DataTypeEnum::Float)
{
profile_reduce_impl<float, float, float>(args.do_verification,
args.init_method,
args.do_log,
args.do_dumpout,
args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
static_cast<bool>(args.nanOpt),
static_cast<bool>(args.indicesOpt),
args.scales[0],
args.scales[1]);
}
else if(args.compTypeId == AppDataType::appDouble)
else if(args.compTypeId == DataTypeEnum::Double)
{
profile_reduce_impl<float, double, float>(args.do_verification,
args.init_method,
args.do_log,
args.do_dumpout,
args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
static_cast<bool>(args.nanOpt),
static_cast<bool>(args.indicesOpt),
args.scales[0],
args.scales[1]);
}
......
......@@ -13,6 +13,7 @@ int profile_gemm_bias_relu_add(int, char*[]);
int profile_gemm_reduce(int, char*[]);
int profile_batched_gemm(int, char*[]);
int profile_grouped_gemm(int, char*[]);
int profile_conv_fwd(int, char*[]);
int profile_conv_fwd_bias_relu(int, char*[]);
int profile_conv_fwd_bias_relu_add(int, char*[]);
int profile_conv_fwd_bias_relu_atomic_add(int, char*[]);
......@@ -25,7 +26,8 @@ int main(int argc, char* argv[])
{
if(strcmp(argv[1], "gemm") == 0)
{
return profile_gemm(argc, argv);
int stat = profile_gemm(argc, argv);
return stat;
}
else if(strcmp(argv[1], "gemm_bias_2d") == 0)
{
......@@ -107,7 +109,7 @@ int main(int argc, char* argv[])
" conv1d_bwd_data: BackwardConvolution data 1 dim\n"
" conv2d_bwd_data: BackwardConvolution data 2 dim\n"
" conv3d_bwd_data: BackwardConvolution data 3 dim\n"
" reduce: REDUCE\n"
" reduce: Reduce\n"
" conv2d_bwd_weight: Backward Weight Convolution 2d\n");
// clang-format on
}
......
#!/usr/bin/env python3
import os, io
import argparse
def print_to_string(*args, **kwargs):
output = io.StringIO()
print(*args, file=output, **kwargs)
contents = output.getvalue()
output.close()
return contents
def parse_args():
parser = argparse.ArgumentParser(description='Parse results from ckProfiler benchmark runs')
parser.add_argument('filename', type=str, help='Log file to parse or directory containing log files')
args = parser.parse_args()
files = []
if os.path.isdir(args.filename):
all_files = os.listdir(args.filename)
for name in all_files:
if 'log' not in name:
continue
files.append(os.path.join(args.filename, name))
else:
files = [args.filename]
args.files = files
return args
def main():
args = parse_args()
results = []
#parse results
glue=""
for filename in args.files:
for line in open(filename):
if 'Best Perf' in line:
lst=line.split()
results.append(print_to_string(glue.join(lst[8:]),lst[4]))
#sort results
#read baseline results for the latest develop branch
#write new results to the db
#compare the results to the baseline
#return 0 if performance criteria met, otherwise return 1
print(results)
return 0
if __name__ == '__main__':
    main()
#!/usr/bin/env python3
import os, io, argparse, datetime
import numpy as np
import sqlalchemy
from sqlalchemy.types import NVARCHAR, Float, Integer
import pymysql
import pandas as pd
from sshtunnel import SSHTunnelForwarder
def print_to_string(*args, **kwargs):
output = io.StringIO()
print(*args, file=output, **kwargs)
contents = output.getvalue()
output.close()
return contents
def parse_args():
parser = argparse.ArgumentParser(description='Parse results from ckProfiler benchmark runs')
parser.add_argument('filename', type=str, help='Log file to parse or directory containing log files')
args = parser.parse_args()
files = []
if os.path.isdir(args.filename):
all_files = os.listdir(args.filename)
for name in all_files:
if 'log' not in name:
continue
files.append(os.path.join(args.filename, name))
else:
files = [args.filename]
args.files = files
return args
def main():
args = parse_args()
tests = []
kernels=[]
tflops=[]
dtype=[]
alayout=[]
blayout=[]
M=[]
N=[]
K=[]
StrideA=[]
StrideB=[]
StrideC=[]
#parse results, get the Tflops value for "Best Perf" kernels
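#note: the field offsets below (lst[5], lst[8], ..., lst[33], lst[37:]) assume the
#fixed token layout of ckProfiler's "Best Perf" output line; if that line format
#changes, these indices must be updated together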
glue=""
for filename in args.files:
for line in open(filename):
if 'Branch name' in line:
lst=line.split()
branch_name=lst[2]
for filename in args.files:
for line in open(filename):
if 'Best Perf' in line:
lst=line.split()
if len(lst)>=37: #the line is complete
tests.append(glue.join(lst[5:30]))
kernels.append(glue.join(lst[37:]))
tflops.append(lst[33])
dtype.append(lst[5])
alayout.append(lst[8])
blayout.append(lst[11])
M.append(lst[14])
N.append(lst[17])
K.append(lst[20])
StrideA.append(lst[23])
StrideB.append(lst[26])
StrideC.append(lst[29])
elif len(lst)<37 and len(lst)>=33: #the tflops are available
tests.append(glue.join(lst[5:30]))
kernels.append("N/A")
tflops.append(lst[33])
dtype.append(lst[5])
alayout.append(lst[8])
blayout.append(lst[11])
M.append(lst[14])
N.append(lst[17])
K.append(lst[20])
StrideA.append(lst[23])
StrideB.append(lst[26])
StrideC.append(lst[29])
print("warning: incomplete line:",lst)
elif len(lst)<33: #even the tflops are not available
print("Error in ckProfiler output!")
print("warning: incomplete line=",lst)
#sort results
print("Number of tests:",len(tests))
print("Branch name:",branch_name)
#sorted_tests = sorted(tests)
#print("sorted tests:",sorted_tests)
sorted_tflops = [x for _,x in sorted(zip(tests,tflops))]
#sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
test_list=list(range(1,len(tests)+1))
sql_hostname = '127.0.0.1'
sql_username = os.environ["dbuser"]
print("sql_username=",sql_username)
sql_password = os.environ["dbpassword"]
sql_main_database = 'miopen_perf'
sql_port = 3306
ssh_host = os.environ["dbsship"]
print("ssh_host=",ssh_host)
ssh_user = os.environ["dbsshuser"]
print("ssh_user=",ssh_user)
ssh_port = int(os.environ["dbsshport"])
ssh_pass = os.environ["dbsshpassword"]
with SSHTunnelForwarder(
(ssh_host, ssh_port),
ssh_username=ssh_user,
ssh_password=ssh_pass,
remote_bind_address=(sql_hostname, sql_port)) as tunnel:
sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}:{3}/{4}'.
format(sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database))
conn = sqlEngine.connect()
#write the ck_gemm_test_params table
#only needed once the test set changes
'''
sorted_dtypes = [x for _,x in sorted(zip(tests,dtype))]
sorted_alayout = [x for _,x in sorted(zip(tests,alayout))]
sorted_blayout = [x for _,x in sorted(zip(tests,blayout))]
sorted_M = [x for _,x in sorted(zip(tests,M))]
sorted_N = [x for _,x in sorted(zip(tests,N))]
sorted_K = [x for _,x in sorted(zip(tests,K))]
sorted_StrideA = [x for _,x in sorted(zip(tests,StrideA))]
sorted_StrideB = [x for _,x in sorted(zip(tests,StrideB))]
sorted_StrideC = [x for _,x in sorted(zip(tests,StrideC))]
ck_gemm_params=[test_list,sorted_dtypes,sorted_alayout,sorted_blayout,
sorted_M,sorted_N,sorted_K,sorted_StrideA,sorted_StrideB,
sorted_StrideC]
df=pd.DataFrame(np.transpose(ck_gemm_params),columns=['Test_number','Data_type',
'Alayout','BLayout','M','N','K', 'StrideA','StrideB','StrideC'])
print(df)
dtypes = {
'Test_number': Integer(),
'Data_type': NVARCHAR(length=5),
'Alayout': NVARCHAR(length=12),
'Blayout': NVARCHAR(length=12),
'M': Integer(),
'N': Integer(),
'K': Integer(),
'StrideA': Integer(),
'StrideB': Integer(),
'StrideC': Integer()
}
df.to_sql("ck_gemm_test_params",conn,if_exists='replace',index=False, dtype=dtypes)
'''
#read baseline results for the latest develop branch
query = '''SELECT * from ck_gemm_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_gemm_tflops where Branch_ID='develop' );'''
tflops_base = pd.read_sql_query(query, conn)
#write new results to the db
testlist=[]
for i in range(1,len(tests)+1):
testlist.append("Test%i"%i)
ck_gemm_tflops=[str(branch_name),str(datetime.datetime.now())]
flops=pd.DataFrame(data=[ck_gemm_tflops],columns=['Branch_ID','Datetime'])
df_add=pd.DataFrame(data=[sorted_tflops],columns=testlist)
flops=pd.concat([flops,df_add],axis=1)
print("new tflops results:",flops)
flops.to_sql("ck_gemm_tflops",conn,if_exists='append',index=False)
conn.close()
#compare the results to the baseline
regression=0
base=tflops_base[testlist].to_numpy(dtype='float')
base_list=base[0]
ave_perf=0
for i in range(len(base_list)):
# success criterion:
if base_list[i]>1.01*float(sorted_tflops[i]):
print("test # ",i,"shows regression by {:.3f}%".format(
(float(sorted_tflops[i])-base_list[i])/base_list[i]*100))
regression=1
ave_perf=ave_perf+float(sorted_tflops[i])/base_list[i]
if regression==0:
print("no regressions found")
ave_perf=ave_perf/len(base_list)
print("average performance relative to baseline:",ave_perf)
#return 0 if performance criteria met, otherwise return 1
return regression
if __name__ == '__main__':
main()
\ No newline at end of file
......@@ -15,6 +15,17 @@ bin/test_reduce_no_index -D 64,4,280,82 -R 1 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 0 2
## for float64
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 6 2
## for float16
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 1 2
......
......@@ -15,6 +15,17 @@ bin/test_reduce_with_index -D 64,4,280,82 -R 1 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 0 2
## for float64
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 6 2
## for float16
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 1 2
......
......@@ -2,6 +2,7 @@ include_directories(BEFORE
${PROJECT_SOURCE_DIR}/
${PROJECT_SOURCE_DIR}/include/ck
${PROJECT_SOURCE_DIR}/include/ck/utility
${PROJECT_SOURCE_DIR}/include/ck/host_utility
${PROJECT_SOURCE_DIR}/include/ck/tensor_description
${PROJECT_SOURCE_DIR}/include/ck/tensor
${PROJECT_SOURCE_DIR}/include/ck/problem_transform
......
add_test_executable(test_gemm_fp32 gemm_fp32.cpp)
target_link_libraries(test_gemm_fp32 PRIVATE host_tensor)
target_link_libraries(test_gemm_fp32 PRIVATE device_gemm_instance)
# GEMM XDL
add_test_executable(test_gemm_xdl_fp32 gemm_xdl_fp32.cpp)
target_link_libraries(test_gemm_xdl_fp32 PRIVATE host_tensor)
target_link_libraries(test_gemm_xdl_fp32 PRIVATE device_gemm_instance)
add_test_executable(test_gemm_fp16 gemm_fp16.cpp)
target_link_libraries(test_gemm_fp16 PRIVATE host_tensor)
target_link_libraries(test_gemm_fp16 PRIVATE device_gemm_instance)
add_test_executable(test_gemm_xdl_fp16 gemm_xdl_fp16.cpp)
target_link_libraries(test_gemm_xdl_fp16 PRIVATE host_tensor)
target_link_libraries(test_gemm_xdl_fp16 PRIVATE device_gemm_instance)
add_test_executable(test_gemm_bf16 gemm_bf16.cpp)
target_link_libraries(test_gemm_bf16 PRIVATE host_tensor)
target_link_libraries(test_gemm_bf16 PRIVATE device_gemm_instance)
add_test_executable(test_gemm_xdl_bf16 gemm_xdl_bf16.cpp)
target_link_libraries(test_gemm_xdl_bf16 PRIVATE host_tensor)
target_link_libraries(test_gemm_xdl_bf16 PRIVATE device_gemm_instance)
add_test_executable(test_gemm_int8 gemm_int8.cpp)
target_link_libraries(test_gemm_int8 PRIVATE host_tensor)
target_link_libraries(test_gemm_int8 PRIVATE device_gemm_instance)
add_test_executable(test_gemm_xdl_int8 gemm_xdl_int8.cpp)
target_link_libraries(test_gemm_xdl_int8 PRIVATE host_tensor)
target_link_libraries(test_gemm_xdl_int8 PRIVATE device_gemm_instance)
# GEMM DL
add_test_executable(test_gemm_dl_fp32 gemm_dl_fp32.cpp)
target_link_libraries(test_gemm_dl_fp32 PRIVATE host_tensor)
target_link_libraries(test_gemm_dl_fp32 PRIVATE device_gemm_instance)
add_test_executable(test_gemm_dl_fp16 gemm_dl_fp16.cpp)
target_link_libraries(test_gemm_dl_fp16 PRIVATE host_tensor)
target_link_libraries(test_gemm_dl_fp16 PRIVATE device_gemm_instance)
add_test_executable(test_gemm_dl_int8 gemm_dl_int8.cpp)
target_link_libraries(test_gemm_dl_int8 PRIVATE host_tensor)
target_link_libraries(test_gemm_dl_int8 PRIVATE device_gemm_instance)
#include <algorithm>
#include <cstdlib>
#include <half.hpp>
#include <iostream>
#include <numeric>
#include <tuple>
#include <vector>
#include "../gemm/gemm_util.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
#include "device_gemm_dl.hpp"
#include "element_wise_operation.hpp"
#include "reference_gemm.hpp"
#include "gemm_specialization.hpp"
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceGemmNoOpPtr =
ck::tensor_operation::device::DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_gemm_instance {
void add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
} // namespace device_gemm_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
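// Layout encoding in the instance names: for A, "mk" is RowMajor and "km" is
// ColumnMajor; for B, "kn" is RowMajor and "nk" is ColumnMajor; the "mn" output
// is always RowMajor. The TestGemm template arguments below follow this order.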
int main()
{
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using RowMajor = ck::tensor_layout::gemm::RowMajor;
using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
bool res = true;
std::vector<DeviceGemmNoOpPtr> gemmPtrs;
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
ColumnMajor,
RowMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
ColumnMajor,
ColumnMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
RowMajor,
RowMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
RowMajor,
ColumnMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
return res ? 0 : 1;
}
#include <algorithm>
#include <cstdlib>
#include <half.hpp>
#include <iostream>
#include <numeric>
#include <tuple>
#include <vector>
#include "../gemm/gemm_util.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
#include "device_gemm_dl.hpp"
#include "element_wise_operation.hpp"
#include "reference_gemm.hpp"
#include "gemm_specialization.hpp"
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceGemmNoOpPtr =
ck::tensor_operation::device::DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_gemm_instance {
void add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_dl_f32_f32_f32_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
} // namespace device_gemm_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
int main()
{
using ADataType = float;
using BDataType = float;
using CDataType = float;
using RowMajor = ck::tensor_layout::gemm::RowMajor;
using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
bool res = true;
std::vector<DeviceGemmNoOpPtr> gemmPtrs;
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
ColumnMajor,
RowMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
ColumnMajor,
ColumnMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_dl_f32_f32_f32_mk_kn_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
RowMajor,
RowMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
RowMajor,
ColumnMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
return res ? 0 : 1;
}
#include <algorithm>
#include <cstdlib>
#include <half.hpp>
#include <iostream>
#include <numeric>
#include <tuple>
#include <vector>
#include "../gemm/gemm_util.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
#include "device_gemm_dl.hpp"
#include "element_wise_operation.hpp"
#include "reference_gemm.hpp"
#include "gemm_specialization.hpp"
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceGemmNoOpPtr =
ck::tensor_operation::device::DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_gemm_instance {
void add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
} // namespace device_gemm_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
int main()
{
using ADataType = int8_t;
using BDataType = int8_t;
using CDataType = int8_t;
using RowMajor = ck::tensor_layout::gemm::RowMajor;
using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
bool res = true;
std::vector<DeviceGemmNoOpPtr> gemmPtrs;
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
ColumnMajor,
RowMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
ColumnMajor,
ColumnMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
RowMajor,
RowMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
RowMajor,
ColumnMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
return res ? 0 : 1;
}
......@@ -60,7 +60,7 @@ template <typename DeviceGemmPtr_,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
void RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,
bool RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,
const ck::gemm_util::GemmParams& params,
const Tensor<ADataType>& A,
const Tensor<BDataType>& B,
......@@ -73,9 +73,6 @@ void RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,
DeviceMem b_k_n_device_buf(sizeof(BDataType) * B.mDesc.GetElementSpace());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * C.mDesc.GetElementSpace());
a_m_k_device_buf.ToDevice(A.mData.data());
b_k_n_device_buf.ToDevice(B.mData.data());
auto invoker_ptr = gemmPtr->MakeInvokerPointer();
auto argument_ptr =
gemmPtr->MakeArgumentPointer(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
......@@ -91,15 +88,23 @@ void RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,
b_element_op,
c_element_op);
if(!gemmPtr->IsSupportedArgument(argument_ptr.get()))
if(gemmPtr->IsSupportedArgument(argument_ptr.get()))
{
throw std::runtime_error(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem");
a_m_k_device_buf.ToDevice(A.mData.data());
b_k_n_device_buf.ToDevice(B.mData.data());
invoker_ptr->Run(argument_ptr.get());
c_m_n_device_buf.FromDevice(C.mData.data());
return true;
}
else
{
std::cout << "device_gemm with the specified compilation parameters does "
"not support this GEMM problem"
<< std::endl;
invoker_ptr->Run(argument_ptr.get());
c_m_n_device_buf.FromDevice(C.mData.data());
return false;
}
}
template <typename DeviceGemmPtr_,
......@@ -188,28 +193,35 @@ struct TestGemm
a, b, c_host, a_element_op, b_element_op, c_element_op);
// Act
ck::gemm_util::RunDeviceGEMM(
bool is_supported = ck::gemm_util::RunDeviceGEMM(
gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op);
// Assert
bool res = false;
if(std::is_same<CDataType, float>::value)
if(is_supported)
{
res = ck::utils::check_err(c_device.mData, c_host.mData);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
// Assert
bool res = false;
if(std::is_same<CDataType, float>::value)
{
res = ck::utils::check_err(c_device.mData, c_host.mData);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
}
else if(std::is_same<CDataType, ck::half_t>::value)
{
res = ck::utils::check_err(c_device.mData, c_host.mData);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
}
else if(std::is_same<CDataType, int8_t>::value)
{
res = ck::utils::check_err(c_device.mData, c_host.mData);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
}
return res;
}
else if(std::is_same<CDataType, ck::half_t>::value)
else
{
res = ck::utils::check_err(c_device.mData, c_host.mData);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
return true;
}
else if(std::is_same<CDataType, int8_t>::value)
{
res = ck::utils::check_err(c_device.mData, c_host.mData);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
}
return res;
}
};
......
......@@ -31,14 +31,10 @@ namespace ck {
namespace tensor_operation {
namespace device {
namespace device_gemm_instance {
void add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_kn_mn_instances(
std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_nk_mn_instances(
std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_nk_mn_instances(
std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_kn_mn_instances(
std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
} // namespace device_gemm_instance
} // namespace device
} // namespace tensor_operation
......@@ -57,7 +53,7 @@ int main()
bool res = true;
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_kn_mn_instances(gemmPtrs);
add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
......@@ -75,7 +71,7 @@ int main()
gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_nk_mn_instances(gemmPtrs);
add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
......@@ -93,7 +89,7 @@ int main()
gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_kn_mn_instances(gemmPtrs);
add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
......@@ -111,7 +107,7 @@ int main()
gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_nk_mn_instances(gemmPtrs);
add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
......
#include "getopt.h"
#include "check_err.hpp"
#include "device_reduce_instance.hpp"
#include "reduction_enums.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_reduction.hpp"
#include "reduce_util.hpp"
#include "host_common_util.hpp"
#include "profile_reduce_impl.hpp"
using namespace ck;
namespace {
template <index_t Rank, index_t NumReduceDim>
static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
{
assert(NumReduceDim == reduceDims.size());
int reduceFlag = 0;
// flag the bits for the reduceDims
for(int i = 0; i < NumReduceDim; i++)
{
reduceFlag |= 1 << reduceDims[i];
};
std::vector<int> invariantDims;
// collect invariant dimensions
for(int i = 0; i < Rank; i++)
if((reduceFlag & (1 << i)) == 0)
{
invariantDims.push_back(i);
};
return invariantDims;
};
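// Worked example: with Rank = 4 and reduceDims = {1, 3}, reduceFlag becomes
// 0b1010, so only dimensions 0 and 2 fail the bit test and invariantDims = {0, 2}.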
constexpr int Rank = 4;
constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AVG;
constexpr NanPropagation NanOpt = NanPropagation::PROPAGATE_NAN;
constexpr bool PropagateNan = false;
constexpr ReduceTensorIndices IndicesOpt = ReduceTensorIndices::NO_INDICES;
constexpr bool NeedIndices = false;
template <typename InDataType,
typename AccDataType,
typename OutDataType,
int Rank,
int NumReduceDim>
bool test_reduce_no_index_impl(int init_method,
const std::vector<size_t>& inLengths,
const std::vector<int>& reduceDims,
float alpha,
float beta)
{
using namespace ck::tensor_operation::device;
using namespace ck::tensor_operation::device::device_reduce_instance;
using namespace ck::host_reduce;
constexpr bool out_support_atomic_add = std::is_same<OutDataType, float>::value;
constexpr bool op_support_atomic_add = true;
constexpr bool use_atomic_add = (out_support_atomic_add && op_support_atomic_add);
Tensor<InDataType> in(inLengths);
std::vector<size_t> outLengths;
const auto invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);
if(reduceDims.size() == Rank)
outLengths.push_back(1);
else
for(auto dim : invariantDims)
outLengths.push_back(inLengths[dim]);
Tensor<OutDataType> out_ref(outLengths);
Tensor<OutDataType> out(outLengths);
// only used when the OutDataType is bhalf_t
Tensor<float> out_ref_fp32(outLengths);
Tensor<float> out_fp32(outLengths);
auto inStrides = in.mDesc.GetStrides();
auto outStrides = out.mDesc.GetStrides();
size_t invariant_total_length = out.mDesc.GetElementSize();
size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length;
std::size_t num_thread = 1;
switch(init_method)
{
case 0: break;
case 1:
in.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread);
break;
case 2:
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
break;
default:
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
}
if(beta != 0.0f)
for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++)
out.mData[i] = out_ref.mData[i];
// these buffers are usually provided by the user application
DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace());
DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace());
in_dev.ToDevice(in.mData.data());
if(beta != 0.0f)
out_dev.ToDevice(out.mData.data());
using InElementwiseOperation_0 =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
using AccElementwiseOperation_0 =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
AccElementwiseOperation;
using InElementwiseOperation_1 =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::
InElementwiseOperation;
using AccElementwiseOperation_1 =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::
AccElementwiseOperation;
using InElementwiseOperation_2 =
typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::
InElementwiseOperation;
using AccElementwiseOperation_2 =
typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::
AccElementwiseOperation;
using DeviceReduceInstPtr0 =
DeviceReducePtr<InElementwiseOperation_0, AccElementwiseOperation_0>;
using DeviceReduceInstPtr1 =
DeviceReducePtr<InElementwiseOperation_1, AccElementwiseOperation_1>;
using DeviceReduceInstPtr2 =
DeviceReducePtr<InElementwiseOperation_2, AccElementwiseOperation_2>;
std::vector<DeviceReduceInstPtr0> reduce0_ptrs;
std::vector<DeviceReduceInstPtr1> reduce1_ptrs;
std::vector<DeviceReduceInstPtr2> reduce2_ptrs;
add_device_reduce_instance_threadwise<InDataType,
AccDataType,
OutDataType,
Rank,
NumReduceDim,
ReduceOpId,
NanOpt,
IndicesOpt>(reduce0_ptrs);
add_device_reduce_instance_blockwise<InDataType,
AccDataType,
OutDataType,
Rank,
NumReduceDim,
ReduceOpId,
NanOpt,
IndicesOpt>(reduce0_ptrs);
if constexpr(use_atomic_add)
{
add_device_reduce_instance_multiblock_atomic_add<InDataType,
AccDataType,
OutDataType,
Rank,
NumReduceDim,
ReduceOpId,
NanOpt,
IndicesOpt>(reduce0_ptrs);
}
else
{
add_device_reduce_instance_multiblock_partial_reduce<InDataType,
AccDataType,
OutDataType,
Rank,
NumReduceDim,
ReduceOpId,
NanOpt,
IndicesOpt>(reduce1_ptrs);
};
// used for secondary reduction
if constexpr(!use_atomic_add)
{
add_device_reduce_instance_blockwise_second_call<AccDataType,
AccDataType,
OutDataType,
Rank,
NumReduceDim,
ReduceOpId,
NanOpt,
IndicesOpt>(reduce2_ptrs);
};
if(reduce0_ptrs.empty() && reduce1_ptrs.empty())
{
throw std::runtime_error("Wrong! No device REDUCE instance found");
};
bool result = true;
ReductionHost<InDataType,
AccDataType,
OutDataType,
ReduceOpId,
Rank,
NumReduceDim,
PropagateNan,
NeedIndices>
hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
hostReduce.Run(alpha, in.mData.data(), beta, out_ref.mData.data(), nullptr);
const auto i_inLengths = to_int_vector(inLengths);
const auto i_inStrides = to_int_vector(inStrides);
const auto i_outLengths = to_int_vector(outLengths);
const auto i_outStrides = to_int_vector(outStrides);
for(auto& reduce_ptr : reduce0_ptrs)
{
auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
DeviceMem ws_dev(wsSizeInBytes);
InElementwiseOperation_0 in_elementwise_op_0(static_cast<int32_t>(reduce_total_length));
AccElementwiseOperation_0 acc_elementwise_op_0(static_cast<int32_t>(reduce_total_length));
auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
i_inStrides,
i_outLengths,
i_outStrides,
reduceDims,
alpha,
beta,
in_dev.GetDeviceBuffer(),
out_dev.GetDeviceBuffer(),
nullptr,
ws_dev.GetDeviceBuffer(),
in_elementwise_op_0,
acc_elementwise_op_0);
if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
continue;
auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
(void)invoker_ptr->Run(argument_ptr.get());
out_dev.FromDevice(out.mData.data());
bool single_result = true;
if constexpr(std::is_same<OutDataType, ck::half_t>::value ||
std::is_same<OutDataType, ck::bhalf_t>::value)
{
reduce_util::to_f32_vector(out, out_fp32);
reduce_util::to_f32_vector(out_ref, out_ref_fp32);
single_result = ck::utils::check_err(
out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!");
}
else
{
single_result =
ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
};
if(!single_result)
{
std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << std::endl;
result = false;
}
};
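// Two-stage path: each multiblock partial-reduce instance writes its per-block
// partial results into a 2-D workspace (see GetWorkspace2dLengths below); a
// blockwise second-call instance then reduces that workspace into the final output.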
for(auto& reduce_ptr : reduce1_ptrs)
{
auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
DeviceMem ws_dev(wsSizeInBytes);
InElementwiseOperation_1 in_elementwise_op_1(static_cast<int32_t>(reduce_total_length));
AccElementwiseOperation_1 acc_elementwise_op_1(static_cast<int32_t>(reduce_total_length));
auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
i_inStrides,
i_outLengths,
i_outStrides,
reduceDims,
alpha,
beta,
in_dev.GetDeviceBuffer(),
out_dev.GetDeviceBuffer(),
nullptr,
ws_dev.GetDeviceBuffer(),
in_elementwise_op_1,
acc_elementwise_op_1);
if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
continue;
auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
(void)invoker_ptr->Run(argument_ptr.get());
std::vector<int> inLengths2 = reduce_ptr->GetWorkspace2dLengths(argument_ptr.get());
std::vector<int> inStrides2{inLengths2[1], 1};
for(auto& reduce2_ptr : reduce2_ptrs)
{
InElementwiseOperation_2 in_elementwise_op_2(static_cast<int32_t>(reduce_total_length));
AccElementwiseOperation_2 acc_elementwise_op_2(
static_cast<int32_t>(reduce_total_length));
auto argument2_ptr = reduce2_ptr->MakeArgumentPointer(inLengths2,
inStrides2,
i_outLengths,
i_outStrides,
reduceDims,
alpha,
beta,
ws_dev.GetDeviceBuffer(),
out_dev.GetDeviceBuffer(),
nullptr,
ws_dev.GetDeviceBuffer(),
in_elementwise_op_2,
acc_elementwise_op_2);
if(!reduce2_ptr->IsSupportedArgument(argument2_ptr.get()))
continue;
std::string reduce2_name = reduce2_ptr->GetTypeString();
auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer();
(void)invoker2_ptr->Run(argument2_ptr.get());
out_dev.FromDevice(out.mData.data());
bool single_result = true;
if constexpr(std::is_same<OutDataType, ck::half_t>::value ||
std::is_same<OutDataType, ck::bhalf_t>::value)
{
reduce_util::to_f32_vector(out, out_fp32);
reduce_util::to_f32_vector(out_ref, out_ref_fp32);
single_result = ck::utils::check_err(
out_fp32.mData, out_ref_fp32.mData, "Error: incorrect data result!");
}
else
{
single_result =
ck::utils::check_err(out.mData, out_ref.mData, "Error: incorrect data result!");
};
if(!single_result)
{
std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << " => "
<< reduce2_ptr->GetTypeString() << std::endl;
result = false;
}
};
};
return (result);
};
} // anonymous namespace
static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
{"reduceDimensions", required_argument, nullptr, 'R'},
{"scales", required_argument, nullptr, 'S'},
......@@ -387,48 +13,6 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,
class SimpleAppArgs
{
template <typename T>
static T getSingleValueFromString(const std::string& valueStr)
{
std::istringstream iss(valueStr);
T ret;
iss >> ret;
return (ret);
};
template <typename T>
static std::vector<T> getTypeValuesFromString(const char* cstr_values)
{
std::string valuesStr(cstr_values);
std::vector<T> values;
std::size_t pos = 0;
std::size_t new_pos;
new_pos = valuesStr.find(',', pos);
while(new_pos != std::string::npos)
{
const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
T val = getSingleValueFromString<T>(sliceStr);
values.push_back(val);
pos = new_pos + 1;
new_pos = valuesStr.find(',', pos);
};
std::string sliceStr = valuesStr.substr(pos);
T val = getSingleValueFromString<T>(sliceStr);
values.push_back(val);
return (values);
};
private:
int option_index = 0;
......@@ -460,6 +44,8 @@ class SimpleAppArgs
int processArgs(int argc, char* argv[])
{
using ck::host_common::getTypeValuesFromString;
int ch;
while(1)
......@@ -514,7 +100,7 @@ class SimpleAppArgs
(reduceDims.size() != 1 && reduceDims.size() != 3 && reduceDims.size() != 4))
return (-1);
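// accepted data_type codes follow the profiler convention used below:
// 0 = float, 1 = half, 3 = int8, 5 = bhalf, 6 = double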
if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5)
if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
return (-1);
return (0);
......@@ -525,87 +111,92 @@ bool test_reduce_no_index(int data_type,
int init_method,
std::vector<int> reduceDims,
std::vector<size_t> inLengths,
ReduceTensorOp reduceOpId,
bool propagateNan,
float alpha,
float beta)
{
using ck::profiler::profile_reduce_impl;
bool result = true;
if(data_type == 0)
{
switch(reduceDims.size())
{
case 1:
result = test_reduce_no_index_impl<float, float, float, Rank, 1>(
init_method, inLengths, reduceDims, alpha, beta);
break;
case 3:
result = test_reduce_no_index_impl<float, float, float, Rank, 3>(
init_method, inLengths, reduceDims, alpha, beta);
break;
case 4:
result = test_reduce_no_index_impl<float, float, float, Rank, 4>(
init_method, inLengths, reduceDims, alpha, beta);
break;
};
result = profile_reduce_impl<float, float, float>(true,
init_method,
false,
false,
inLengths,
reduceDims,
reduceOpId,
propagateNan,
false,
alpha,
beta);
}
else if(data_type == 1)
{
switch(reduceDims.size())
{
case 1:
result = test_reduce_no_index_impl<ck::half_t, float, ck::half_t, Rank, 1>(
init_method, inLengths, reduceDims, alpha, beta);
break;
case 3:
result = test_reduce_no_index_impl<ck::half_t, float, ck::half_t, Rank, 3>(
init_method, inLengths, reduceDims, alpha, beta);
break;
case 4:
result = test_reduce_no_index_impl<ck::half_t, float, ck::half_t, Rank, 4>(
init_method, inLengths, reduceDims, alpha, beta);
break;
};
result = profile_reduce_impl<ck::half_t, float, ck::half_t>(true,
init_method,
false,
false,
inLengths,
reduceDims,
reduceOpId,
propagateNan,
false,
alpha,
beta);
}
else if(data_type == 3)
{
switch(reduceDims.size())
{
case 1:
result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 1>(
init_method, inLengths, reduceDims, alpha, beta);
break;
case 3:
result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 3>(
init_method, inLengths, reduceDims, alpha, beta);
break;
case 4:
result = test_reduce_no_index_impl<int8_t, int32_t, int8_t, Rank, 4>(
init_method, inLengths, reduceDims, alpha, beta);
break;
};
result = profile_reduce_impl<int8_t, int32_t, int8_t>(true,
init_method,
false,
false,
inLengths,
reduceDims,
reduceOpId,
propagateNan,
false,
alpha,
beta);
}
else if(data_type == 5)
{
switch(reduceDims.size())
{
case 1:
result = test_reduce_no_index_impl<ck::bhalf_t, float, ck::bhalf_t, Rank, 1>(
init_method, inLengths, reduceDims, alpha, beta);
break;
case 3:
result = test_reduce_no_index_impl<ck::bhalf_t, float, ck::bhalf_t, Rank, 3>(
init_method, inLengths, reduceDims, alpha, beta);
break;
case 4:
result = test_reduce_no_index_impl<ck::bhalf_t, float, ck::bhalf_t, Rank, 4>(
init_method, inLengths, reduceDims, alpha, beta);
break;
};
result = profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(true,
init_method,
false,
false,
inLengths,
reduceDims,
reduceOpId,
propagateNan,
false,
alpha,
beta);
}
else if(data_type == 6)
{
result = profile_reduce_impl<double, double, double>(true,
init_method,
false,
false,
inLengths,
reduceDims,
reduceOpId,
propagateNan,
false,
alpha,
beta);
}
return (result);
};
constexpr ReduceTensorOp reduceOpId = ReduceTensorOp::AVG;
constexpr bool propagateNan = false;
int main(int argc, char* argv[])
{
SimpleAppArgs args;
......@@ -621,8 +212,14 @@ int main(int argc, char* argv[])
{0, 1, 2, 3}, {0, 1, 2}, {1, 2, 3}, {0, 1, 3}, {0, 2, 3}, {0}, {1}, {2}, {3}};
for(auto& reduceDims : v_reduceDims)
result = result && test_reduce_no_index(
data_type, init_method, reduceDims, inLengths, 1.0f, 0.0f);
result = result && test_reduce_no_index(data_type,
init_method,
reduceDims,
inLengths,
reduceOpId,
propagateNan,
1.0f,
0.0f);
}
else
{
......@@ -636,6 +233,8 @@ int main(int argc, char* argv[])
args.init_method,
args.reduceDims,
args.inLengths,
reduceOpId,
propagateNan,
args.scales[0],
args.scales[1]);
}
......
#ifndef REDUCE_UTILS_HPP
#define REDUCE_UTILS_HPP
#include "data_type.hpp"
namespace ck {
namespace reduce_util {
template <typename T>
void to_f32_vector(const Tensor<T>& src, Tensor<float>& dst)
{
for(std::size_t i = 0; i < src.mData.size(); ++i)
dst.mData[i] = type_convert<float>(src.mData[i]);
}
} // namespace reduce_util
} // namespace ck
#endif