Commit bccc6d8b authored by wangshaojie6's avatar wangshaojie6
Browse files

merge develop and resolve conflict

parents c6b52884 91d8b7d6
...@@ -139,5 +139,5 @@ int profile_gemm_bias_relu(int argc, char* argv[]) ...@@ -139,5 +139,5 @@ int profile_gemm_bias_relu(int argc, char* argv[])
throw std::runtime_error("wrong! this data_type & layout is not implemented"); throw std::runtime_error("wrong! this data_type & layout is not implemented");
} }
return 1; return 0;
} }
...@@ -144,5 +144,5 @@ int profile_gemm_bias_relu_add(int argc, char* argv[]) ...@@ -144,5 +144,5 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
throw std::runtime_error("wrong! this data_type & layout is not implemented"); throw std::runtime_error("wrong! this data_type & layout is not implemented");
} }
return 1; return 0;
} }
...@@ -142,5 +142,5 @@ int profile_gemm_reduce(int argc, char* argv[]) ...@@ -142,5 +142,5 @@ int profile_gemm_reduce(int argc, char* argv[])
throw std::runtime_error("wrong! this data_type & layout is not implemented"); throw std::runtime_error("wrong! this data_type & layout is not implemented");
} }
return 1; return 0;
} }
...@@ -79,6 +79,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -79,6 +79,7 @@ int profile_grouped_gemm(int argc, char* argv[])
if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN) if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{ {
ck::profiler::profile_grouped_gemm_impl<ck::half_t, ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t, ck::half_t,
ck::half_t, ck::half_t,
ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor,
...@@ -97,6 +98,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -97,6 +98,7 @@ int profile_grouped_gemm(int argc, char* argv[])
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN) else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{ {
ck::profiler::profile_grouped_gemm_impl<ck::half_t, ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t, ck::half_t,
ck::half_t, ck::half_t,
ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor,
...@@ -115,6 +117,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -115,6 +117,7 @@ int profile_grouped_gemm(int argc, char* argv[])
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN) else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
{ {
ck::profiler::profile_grouped_gemm_impl<ck::half_t, ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t, ck::half_t,
ck::half_t, ck::half_t,
ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::ColumnMajor,
...@@ -133,6 +136,7 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -133,6 +136,7 @@ int profile_grouped_gemm(int argc, char* argv[])
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN) else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
{ {
ck::profiler::profile_grouped_gemm_impl<ck::half_t, ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t, ck::half_t,
ck::half_t, ck::half_t,
ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::ColumnMajor,
...@@ -153,5 +157,5 @@ int profile_grouped_gemm(int argc, char* argv[]) ...@@ -153,5 +157,5 @@ int profile_grouped_gemm(int argc, char* argv[])
throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented"); throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
} }
return 1; return 0;
} }
#include <iostream> #include <iostream>
#include <fstream> #include <fstream>
#include <numeric>
#include <initializer_list>
#include <cstdlib> #include <cstdlib>
#include <vector> #include <vector>
#include <stdexcept> #include <stdexcept>
#include <sstream> #include <sstream>
#include <getopt.h> #include <getopt.h>
#include "config.hpp" #include "data_type_enum.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "reduction_enums.hpp" #include "reduction_enums.hpp"
#include "host_common_util.hpp"
#include "profile_reduce_impl.hpp" #include "profile_reduce_impl.hpp"
using namespace std; using namespace std;
using ck::NanPropagation;
using ck::ReduceTensorIndices;
using ck::ReduceTensorOp; using ck::ReduceTensorOp;
static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'}, static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
...@@ -38,63 +30,9 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr, ...@@ -38,63 +30,9 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,
{"bf16", no_argument, nullptr, '?'}, {"bf16", no_argument, nullptr, '?'},
{"dumpout", required_argument, nullptr, 'o'}, {"dumpout", required_argument, nullptr, 'o'},
{"verify", required_argument, nullptr, 'v'}, {"verify", required_argument, nullptr, 'v'},
{"log", required_argument, nullptr, 'l'},
{"help", no_argument, nullptr, '?'}, {"help", no_argument, nullptr, '?'},
{nullptr, 0, nullptr, 0}}; {nullptr, 0, nullptr, 0}};
template <typename T>
static T getSingleValueFromString(const string& valueStr)
{
std::istringstream iss(valueStr);
T val;
iss >> val;
return (val);
};
template <typename T>
static std::vector<T> getTypeValuesFromString(const char* cstr_values)
{
std::string valuesStr(cstr_values);
std::vector<T> values;
std::size_t pos = 0;
std::size_t new_pos;
new_pos = valuesStr.find(',', pos);
while(new_pos != std::string::npos)
{
const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
T val = getSingleValueFromString<T>(sliceStr);
values.push_back(val);
pos = new_pos + 1;
new_pos = valuesStr.find(',', pos);
};
std::string sliceStr = valuesStr.substr(pos);
T val = getSingleValueFromString<T>(sliceStr);
values.push_back(val);
return (values);
}
enum struct AppDataType
{
appHalf = 0,
appFloat = 1,
appInt32 = 2,
appInt8 = 3,
appInt8x4 = 4,
appBFloat16 = 5,
appDouble = 6,
};
static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims) static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims)
{ {
for(auto dim : reduceDims) for(auto dim : reduceDims)
...@@ -113,7 +51,7 @@ static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims ...@@ -113,7 +51,7 @@ static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims
}; };
}; };
class AppArgs class ReduceProfilerArgs
{ {
private: private:
int option_index = 0; int option_index = 0;
...@@ -130,26 +68,23 @@ class AppArgs ...@@ -130,26 +68,23 @@ class AppArgs
std::vector<float> scales; std::vector<float> scales;
ReduceTensorOp reduceOp = ReduceTensorOp::ADD; ReduceTensorOp reduceOp = ReduceTensorOp::ADD;
AppDataType compTypeId = AppDataType::appFloat; ck::DataTypeEnum compTypeId = ck::DataTypeEnum::Float;
AppDataType outTypeId = AppDataType::appFloat; ck::DataTypeEnum outTypeId = ck::DataTypeEnum::Float;
bool compType_assigned = false; bool compType_assigned = false;
bool outType_assigned = false; bool outType_assigned = false;
NanPropagation nanOpt = NanPropagation::NOT_PROPAGATE_NAN; int nanOpt = 0;
ReduceTensorIndices indicesOpt = ReduceTensorIndices::NO_INDICES; int indicesOpt = 0;
bool do_log = false; bool do_verification = false;
bool do_verification = false; bool do_dumpout = false;
bool do_dumpout = false;
int init_method; int init_method;
bool time_kernel; bool time_kernel;
bool need_indices = false; ReduceProfilerArgs() = default;
~ReduceProfilerArgs() = default;
AppArgs() = default;
~AppArgs() = default;
void show_usage(const char* cmd) void show_usage(const char* cmd)
{ {
...@@ -166,8 +101,11 @@ class AppArgs ...@@ -166,8 +101,11 @@ class AppArgs
std::cout << "--outType or -W, optional enum value indicating the type of the reduced " std::cout << "--outType or -W, optional enum value indicating the type of the reduced "
"output, which could be float when the input data is half" "output, which could be float when the input data is half"
<< std::endl; << std::endl;
std::cout << "--nanOpt or -N, enum value indicates the selection for NanOpt" << std::endl; std::cout
std::cout << "--indicesOpt or -I, enum value indicates the selection for IndicesOpt" << "--nanOpt or -N, 1/0 value indicates the selection to use or not use Nan-Propagation"
<< std::endl;
std::cout << "--indicesOpt or -I, 1/0 value indicates the selection to use or not use "
"index in reduction"
<< std::endl; << std::endl;
std::cout << "--scales or -S, comma separated two float values for alpha and beta" std::cout << "--scales or -S, comma separated two float values for alpha and beta"
<< std::endl; << std::endl;
...@@ -181,18 +119,19 @@ class AppArgs ...@@ -181,18 +119,19 @@ class AppArgs
std::cout << "--dumpout or -o, 1/0 to indicate where to save the reduction result to files " std::cout << "--dumpout or -o, 1/0 to indicate where to save the reduction result to files "
"for further analysis" "for further analysis"
<< std::endl; << std::endl;
std::cout << "--log or -l, 1/0 to indicate whether to log some information" << std::endl;
}; };
int processArgs(int argc, char* argv[]) int processArgs(int argc, char* argv[])
{ {
using ck::host_common::getTypeValuesFromString;
int ch; int ch;
optind++; // to skip the "reduce" module name optind++; // to skip the "reduce" module name
while(1) while(1)
{ {
ch = getopt_long(argc, argv, "D:R:O:C:W:N:I:S:v:o:l:", long_options, &option_index); ch = getopt_long(argc, argv, "D:R:O:C:W:N:I:S:v:o:", long_options, &option_index);
if(ch == -1) if(ch == -1)
break; break;
switch(ch) switch(ch)
...@@ -219,27 +158,27 @@ class AppArgs ...@@ -219,27 +158,27 @@ class AppArgs
if(!optarg) if(!optarg)
throw std::runtime_error("Invalid option format!"); throw std::runtime_error("Invalid option format!");
compTypeId = static_cast<AppDataType>(std::atoi(optarg)); compTypeId = static_cast<ck::DataTypeEnum>(std::atoi(optarg));
compType_assigned = true; compType_assigned = true;
break; break;
case 'W': case 'W':
if(!optarg) if(!optarg)
throw std::runtime_error("Invalid option format!"); throw std::runtime_error("Invalid option format!");
outTypeId = static_cast<AppDataType>(std::atoi(optarg)); outTypeId = static_cast<ck::DataTypeEnum>(std::atoi(optarg));
outType_assigned = true; outType_assigned = true;
break; break;
case 'N': case 'N':
if(!optarg) if(!optarg)
throw std::runtime_error("Invalid option format!"); throw std::runtime_error("Invalid option format!");
nanOpt = static_cast<NanPropagation>(std::atoi(optarg)); nanOpt = std::atoi(optarg);
break; break;
case 'I': case 'I':
if(!optarg) if(!optarg)
throw std::runtime_error("Invalid option format!"); throw std::runtime_error("Invalid option format!");
indicesOpt = static_cast<ReduceTensorIndices>(std::atoi(optarg)); indicesOpt = std::atoi(optarg);
break; break;
case 'S': case 'S':
if(!optarg) if(!optarg)
...@@ -262,12 +201,6 @@ class AppArgs ...@@ -262,12 +201,6 @@ class AppArgs
do_dumpout = static_cast<bool>(std::atoi(optarg)); do_dumpout = static_cast<bool>(std::atoi(optarg));
break; break;
case 'l':
if(!optarg)
throw std::runtime_error("Invalid option format!");
do_log = static_cast<bool>(std::atoi(optarg));
break;
case '?': case '?':
if(std::string(long_options[option_index].name) == "half") if(std::string(long_options[option_index].name) == "half")
use_half = true; use_half = true;
...@@ -295,7 +228,7 @@ class AppArgs ...@@ -295,7 +228,7 @@ class AppArgs
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
init_method = std::atoi(argv[optind++]); init_method = std::atoi(argv[optind++]);
time_kernel = std::atoi(argv[optind]); time_kernel = static_cast<bool>(std::atoi(argv[optind]));
if(scales.empty()) if(scales.empty())
{ {
...@@ -306,9 +239,6 @@ class AppArgs ...@@ -306,9 +239,6 @@ class AppArgs
if(reduceOp == ReduceTensorOp::MIN || reduceOp == ReduceTensorOp::MAX || if(reduceOp == ReduceTensorOp::MIN || reduceOp == ReduceTensorOp::MAX ||
reduceOp == ReduceTensorOp::AMAX) reduceOp == ReduceTensorOp::AMAX)
{ {
if(indicesOpt != ReduceTensorIndices::NO_INDICES)
need_indices = true;
// for indexable operations, no need to assign compType and outType, just let them be // for indexable operations, no need to assign compType and outType, just let them be
// same as inType // same as inType
compType_assigned = false; compType_assigned = false;
...@@ -322,9 +252,10 @@ class AppArgs ...@@ -322,9 +252,10 @@ class AppArgs
int profile_reduce(int argc, char* argv[]) int profile_reduce(int argc, char* argv[])
{ {
using namespace ck::profiler; using ck::DataTypeEnum;
using ck::profiler::profile_reduce_impl;
AppArgs args; ReduceProfilerArgs args;
if(args.processArgs(argc, argv) < 0) if(args.processArgs(argc, argv) < 0)
return (-1); return (-1);
...@@ -339,42 +270,41 @@ int profile_reduce(int argc, char* argv[]) ...@@ -339,42 +270,41 @@ int profile_reduce(int argc, char* argv[])
if(args.use_half) if(args.use_half)
{ {
if(!args.compType_assigned) if(!args.compType_assigned)
args.compTypeId = AppDataType::appHalf; args.compTypeId = DataTypeEnum::Half;
if(args.outType_assigned && if(args.outType_assigned &&
(args.outTypeId != AppDataType::appHalf && args.outTypeId != AppDataType::appFloat)) (args.outTypeId != DataTypeEnum::Half && args.outTypeId != DataTypeEnum::Float))
args.outTypeId = AppDataType::appFloat; args.outTypeId = DataTypeEnum::Float;
if(!args.outType_assigned) if(!args.outType_assigned)
args.outTypeId = AppDataType::appHalf; args.outTypeId = DataTypeEnum::Half;
if(args.compTypeId == AppDataType::appHalf) if(args.compTypeId == DataTypeEnum::Half)
{ {
profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(args.do_verification, profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(
args.init_method, args.do_verification,
args.do_log, args.init_method,
args.do_dumpout, args.do_dumpout,
args.time_kernel, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
else if(args.compTypeId == AppDataType::appFloat) else if(args.compTypeId == DataTypeEnum::Float)
{ {
profile_reduce_impl<ck::half_t, float, ck::half_t>(args.do_verification, profile_reduce_impl<ck::half_t, float, ck::half_t>(args.do_verification,
args.init_method, args.init_method,
args.do_log,
args.do_dumpout, args.do_dumpout,
args.time_kernel, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
...@@ -385,56 +315,53 @@ int profile_reduce(int argc, char* argv[]) ...@@ -385,56 +315,53 @@ int profile_reduce(int argc, char* argv[])
{ {
profile_reduce_impl<double, double, double>(args.do_verification, profile_reduce_impl<double, double, double>(args.do_verification,
args.init_method, args.init_method,
args.do_log,
args.do_dumpout, args.do_dumpout,
args.time_kernel, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
else if(args.use_int8) else if(args.use_int8)
{ {
if(!args.compType_assigned) if(!args.compType_assigned)
args.compTypeId = AppDataType::appInt8; args.compTypeId = DataTypeEnum::Int8;
if(args.outType_assigned && if(args.outType_assigned &&
(args.outTypeId != AppDataType::appInt8 && args.outTypeId != AppDataType::appInt32)) (args.outTypeId != DataTypeEnum::Int8 && args.outTypeId != DataTypeEnum::Int32))
args.outTypeId = AppDataType::appInt32; args.outTypeId = DataTypeEnum::Int32;
if(!args.outType_assigned) if(!args.outType_assigned)
args.outTypeId = AppDataType::appInt8; args.outTypeId = DataTypeEnum::Int8;
if(args.compTypeId == AppDataType::appInt8) if(args.compTypeId == DataTypeEnum::Int8)
{ {
profile_reduce_impl<int8_t, int8_t, int8_t>(args.do_verification, profile_reduce_impl<int8_t, int8_t, int8_t>(args.do_verification,
args.init_method, args.init_method,
args.do_log,
args.do_dumpout, args.do_dumpout,
args.time_kernel, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
else if(args.compTypeId == AppDataType::appInt32) else if(args.compTypeId == DataTypeEnum::Int32)
{ {
profile_reduce_impl<int8_t, int32_t, int8_t>(args.do_verification, profile_reduce_impl<int8_t, int32_t, int8_t>(args.do_verification,
args.init_method, args.init_method,
args.do_log,
args.do_dumpout, args.do_dumpout,
args.time_kernel, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
...@@ -444,54 +371,51 @@ int profile_reduce(int argc, char* argv[]) ...@@ -444,54 +371,51 @@ int profile_reduce(int argc, char* argv[])
else if(args.use_bf16) else if(args.use_bf16)
{ {
if(args.outType_assigned && if(args.outType_assigned &&
(args.outTypeId != AppDataType::appBFloat16 && args.outTypeId != AppDataType::appFloat)) (args.outTypeId != DataTypeEnum::BFloat16 && args.outTypeId != DataTypeEnum::Float))
args.outTypeId = AppDataType::appFloat; args.outTypeId = DataTypeEnum::Float;
if(!args.outType_assigned) if(!args.outType_assigned)
args.outTypeId = AppDataType::appBFloat16; args.outTypeId = DataTypeEnum::BFloat16;
profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(args.do_verification, profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(args.do_verification,
args.init_method, args.init_method,
args.do_log,
args.do_dumpout, args.do_dumpout,
args.time_kernel, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
else else
{ {
if(args.compTypeId == AppDataType::appFloat) if(args.compTypeId == DataTypeEnum::Float)
{ {
profile_reduce_impl<float, float, float>(args.do_verification, profile_reduce_impl<float, float, float>(args.do_verification,
args.init_method, args.init_method,
args.do_log,
args.do_dumpout, args.do_dumpout,
args.time_kernel, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
else if(args.compTypeId == AppDataType::appDouble) else if(args.compTypeId == DataTypeEnum::Double)
{ {
profile_reduce_impl<float, double, float>(args.do_verification, profile_reduce_impl<float, double, float>(args.do_verification,
args.init_method, args.init_method,
args.do_log,
args.do_dumpout, args.do_dumpout,
args.time_kernel, args.time_kernel,
args.inLengths, args.inLengths,
args.reduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, static_cast<bool>(args.nanOpt),
args.indicesOpt, static_cast<bool>(args.indicesOpt),
args.scales[0], args.scales[0],
args.scales[1]); args.scales[1]);
} }
......
...@@ -13,6 +13,7 @@ int profile_gemm_bias_relu_add(int, char*[]); ...@@ -13,6 +13,7 @@ int profile_gemm_bias_relu_add(int, char*[]);
int profile_gemm_reduce(int, char*[]); int profile_gemm_reduce(int, char*[]);
int profile_batched_gemm(int, char*[]); int profile_batched_gemm(int, char*[]);
int profile_grouped_gemm(int, char*[]); int profile_grouped_gemm(int, char*[]);
int profile_conv_fwd(int, char*[]);
int profile_conv_fwd_bias_relu(int, char*[]); int profile_conv_fwd_bias_relu(int, char*[]);
int profile_conv_fwd_bias_relu_add(int, char*[]); int profile_conv_fwd_bias_relu_add(int, char*[]);
int profile_conv_fwd_bias_relu_atomic_add(int, char*[]); int profile_conv_fwd_bias_relu_atomic_add(int, char*[]);
...@@ -53,7 +54,7 @@ int main(int argc, char* argv[]) ...@@ -53,7 +54,7 @@ int main(int argc, char* argv[])
} }
else if(strcmp(argv[1], "grouped_gemm") == 0) else if(strcmp(argv[1], "grouped_gemm") == 0)
{ {
profile_grouped_gemm(argc, argv); return profile_grouped_gemm(argc, argv);
} }
else if(strcmp(argv[1], "conv_fwd") == 0) else if(strcmp(argv[1], "conv_fwd") == 0)
{ {
...@@ -107,7 +108,7 @@ int main(int argc, char* argv[]) ...@@ -107,7 +108,7 @@ int main(int argc, char* argv[])
" conv1d_bwd_data: BackwardConvolution data 1 dim\n" " conv1d_bwd_data: BackwardConvolution data 1 dim\n"
" conv2d_bwd_data: BackwardConvolution data 2 dim\n" " conv2d_bwd_data: BackwardConvolution data 2 dim\n"
" conv3d_bwd_data: BackwardConvolution data 3 dim\n" " conv3d_bwd_data: BackwardConvolution data 3 dim\n"
" reduce: REDUCE\n" " reduce: Reduce\n"
" conv2d_bwd_weight: Backward Weight Convolution 2d\n"); " conv2d_bwd_weight: Backward Weight Convolution 2d\n");
// clang-format on // clang-format on
} }
......
#!/usr/bin/env python3 #!/usr/bin/env python3
import os, io import os, io, argparse, datetime
import argparse import numpy as np
import sqlalchemy
def print_to_string(*args, **kwargs): from sqlalchemy.types import NVARCHAR, Float, Integer
output = io.StringIO() import pymysql
print(*args, file=output, **kwargs) import pandas as pd
contents = output.getvalue() from sshtunnel import SSHTunnelForwarder
output.close()
return contents def print_to_string(*args, **kwargs):
output = io.StringIO()
def parse_args(): print(*args, file=output, **kwargs)
parser = argparse.ArgumentParser(description='Parse results from tf benchmark runs') contents = output.getvalue()
parser.add_argument('filename', type=str, help='Log file to prase or directory containing log files') output.close()
args = parser.parse_args() return contents
files = []
if os.path.isdir(args.filename): def parse_args():
all_files = os.listdir(args.filename) parser = argparse.ArgumentParser(description='Parse results from tf benchmark runs')
for name in all_files: parser.add_argument('filename', type=str, help='Log file to prase or directory containing log files')
if not 'log' in name: args = parser.parse_args()
continue files = []
files.append(os.path.join(args.filename, name)) if os.path.isdir(args.filename):
else: all_files = os.listdir(args.filename)
files = [args.filename] for name in all_files:
args.files = files if not 'log' in name:
return args continue
files.append(os.path.join(args.filename, name))
def main(): else:
args = parse_args() files = [args.filename]
results = [] args.files = files
#parse results return args
glue=""
for filename in args.files: def main():
for line in open(filename): args = parse_args()
if 'Best Perf' in line: tests = []
lst=line.split() kernels=[]
results.append(print_to_string(glue.join(lst[8:]),lst[4])) tflops=[]
dtype=[]
#sort results alayout=[]
blayout=[]
#read baseline results for the latest develop branch M=[]
N=[]
#write new results to the db K=[]
StrideA=[]
#compare the results to the baseline StrideB=[]
StrideC=[]
#return 0 if performance criteria met, otherwise return 1 #parse results, get the Tflops value for "Best Perf" kernels
glue=""
print(results) for filename in args.files:
return 0 for line in open(filename):
if 'Branch name' in line:
if __name__ == '__main__': lst=line.split()
branch_name=lst[2]
for filename in args.files:
for line in open(filename):
if 'Best Perf' in line:
lst=line.split()
if len(lst)>=37: #the line is complete
tests.append(glue.join(lst[5:30]))
kernels.append(glue.join(lst[37:]))
tflops.append(lst[33])
dtype.append(lst[5])
alayout.append(lst[8])
blayout.append(lst[11])
M.append(lst[14])
N.append(lst[17])
K.append(lst[20])
StrideA.append(lst[23])
StrideB.append(lst[26])
StrideC.append(lst[29])
elif len(lst)<37 and len(lst)>=33: #the tflops are available
tests.append(glue.join(lst[5:30]))
kernels.append("N/A")
tflops.append(lst[33])
dtype.append(lst[5])
alayout.append(lst[8])
blayout.append(lst[11])
M.append(lst[14])
N.append(lst[17])
K.append(lst[20])
StrideA.append(lst[23])
StrideB.append(lst[26])
StrideC.append(lst[29])
print("warning: incomplete line:",lst)
elif len(lst)<33: #even the tflops are not available
print("Error in ckProfiler output!")
print("warning: incomplete line=",lst)
#sort results
print("Number of tests:",len(tests))
print("Branch name:",branch_name)
#sorted_tests = sorted(tests)
#print("sorted tests:",sorted_tests)
sorted_tflops = [x for _,x in sorted(zip(tests,tflops))]
#sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
test_list=list(range(1,len(tests)+1))
sql_hostname = '127.0.0.1'
sql_username = os.environ["dbuser"]
print("sql_username=",sql_username)
sql_password = os.environ["dbpassword"]
sql_main_database = 'miopen_perf'
sql_port = 3306
ssh_host = os.environ["dbsship"]
print("ssh_host=",ssh_host)
ssh_user = os.environ["dbsshuser"]
print("ssh_user=",ssh_user)
ssh_port = int(os.environ["dbsshport"])
ssh_pass = os.environ["dbsshpassword"]
with SSHTunnelForwarder(
(ssh_host, ssh_port),
ssh_username=ssh_user,
ssh_password=ssh_pass,
remote_bind_address=(sql_hostname, sql_port)) as tunnel:
sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}:{3}/{4}'.
format(sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database))
conn = sqlEngine.connect()
#write the ck_gemm_test_params table
#only needed once the test set changes
'''
sorted_dtypes = [x for _,x in sorted(zip(tests,dtype))]
sorted_alayout = [x for _,x in sorted(zip(tests,alayout))]
sorted_blayout = [x for _,x in sorted(zip(tests,blayout))]
sorted_M = [x for _,x in sorted(zip(tests,M))]
sorted_N = [x for _,x in sorted(zip(tests,N))]
sorted_K = [x for _,x in sorted(zip(tests,K))]
sorted_StrideA = [x for _,x in sorted(zip(tests,StrideA))]
sorted_StrideB = [x for _,x in sorted(zip(tests,StrideB))]
sorted_StrideC = [x for _,x in sorted(zip(tests,StrideC))]
ck_gemm_params=[test_list,sorted_dtypes,sorted_alayout,sorted_blayout,
sorted_M,sorted_N,sorted_K,sorted_StrideA,sorted_StrideB,
sorted_StrideC]
df=pd.DataFrame(np.transpose(ck_gemm_params),columns=['Test_number','Data_type',
'Alayout','BLayout','M','N','K', 'StrideA','StrideB','StrideC'])
print(df)
dtypes = {
'Test_number': Integer(),
'Data_type': NVARCHAR(length=5),
'Alayout': NVARCHAR(length=12),
'Blayout': NVARCHAR(length=12),
'M': Integer(),
'N': Integer(),
'K': Integer(),
'StrideA': Integer(),
'StrideB': Integer(),
'StrideC': Integer()
}
df.to_sql("ck_gemm_test_params",conn,if_exists='replace',index=False, dtype=dtypes)
'''
#read baseline results for the latest develop branch
query = '''SELECT * from ck_gemm_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_gemm_tflops where Branch_ID='develop' );'''
tflops_base = pd.read_sql_query(query, conn)
#write new results to the db
testlist=[]
for i in range(1,len(tests)+1):
testlist.append("Test%i"%i)
ck_gemm_tflops=[str(branch_name),str(datetime.datetime.now())]
flops=pd.DataFrame(data=[ck_gemm_tflops],columns=['Branch_ID','Datetime'])
df_add=pd.DataFrame(data=[sorted_tflops],columns=testlist)
flops=pd.concat([flops,df_add],axis=1)
print("new tflops results:",flops)
flops.to_sql("ck_gemm_tflops",conn,if_exists='append',index=False)
conn.close()
#compare the results to the baseline
regression=0
base=tflops_base[testlist].to_numpy(dtype='float')
base_list=base[0]
ave_perf=0
for i in range(len(base_list)):
# success criterion:
if base_list[i]>1.01*float(sorted_tflops[i]):
print("test # ",i,"shows regression by {:.3f}%".format(
(float(sorted_tflops[i])-base_list[i])/base_list[i]*100))
regression=1
ave_perf=ave_perf+float(sorted_tflops[i])/base_list[i]
if regression==0:
print("no regressions found")
ave_perf=ave_perf/len(base_list)
print("average performance relative to baseline:",ave_perf)
#return 0 if performance criteria met, otherwise return 1
return regression
if __name__ == '__main__':
main() main()
\ No newline at end of file
...@@ -15,6 +15,17 @@ bin/test_reduce_no_index -D 64,4,280,82 -R 1 0 2 ...@@ -15,6 +15,17 @@ bin/test_reduce_no_index -D 64,4,280,82 -R 1 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 0 2 bin/test_reduce_no_index -D 64,4,280,82 -R 2 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 0 2 bin/test_reduce_no_index -D 64,4,280,82 -R 3 0 2
## for float64
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 6 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 6 2
## for float16 ## for float16
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 1 2 bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 1 2 bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 1 2
......
...@@ -15,6 +15,17 @@ bin/test_reduce_with_index -D 64,4,280,82 -R 1 0 2 ...@@ -15,6 +15,17 @@ bin/test_reduce_with_index -D 64,4,280,82 -R 1 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 0 2 bin/test_reduce_with_index -D 64,4,280,82 -R 2 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 0 2 bin/test_reduce_with_index -D 64,4,280,82 -R 3 0 2
## for float64
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 6 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 6 2
## for float16 ## for float16
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 1 2 bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 1 2 bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 1 2
......
...@@ -2,6 +2,7 @@ include_directories(BEFORE ...@@ -2,6 +2,7 @@ include_directories(BEFORE
${PROJECT_SOURCE_DIR}/ ${PROJECT_SOURCE_DIR}/
${PROJECT_SOURCE_DIR}/include/ck ${PROJECT_SOURCE_DIR}/include/ck
${PROJECT_SOURCE_DIR}/include/ck/utility ${PROJECT_SOURCE_DIR}/include/ck/utility
${PROJECT_SOURCE_DIR}/include/ck/host_utility
${PROJECT_SOURCE_DIR}/include/ck/tensor_description ${PROJECT_SOURCE_DIR}/include/ck/tensor_description
${PROJECT_SOURCE_DIR}/include/ck/tensor ${PROJECT_SOURCE_DIR}/include/ck/tensor
${PROJECT_SOURCE_DIR}/include/ck/problem_transform ${PROJECT_SOURCE_DIR}/include/ck/problem_transform
......
...@@ -8,6 +8,7 @@ using namespace ck; ...@@ -8,6 +8,7 @@ using namespace ck;
static auto I0 = Number<0>{}; static auto I0 = Number<0>{};
static auto I1 = Number<1>{}; static auto I1 = Number<1>{};
static auto I2 = Number<2>{};
TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck1) TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck1)
{ {
...@@ -20,7 +21,7 @@ TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck1 ...@@ -20,7 +21,7 @@ TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck1
const index_t M01 = 4; const index_t M01 = 4;
const index_t N01 = 4; const index_t N01 = 4;
auto c_grid_desc_m_n = make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, I1)); auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
printf("(M, N, MPerBlock, NPerBlock, M01, N01) = (%d, %d, %d, %d, %d, %d)\n", printf("(M, N, MPerBlock, NPerBlock, M01, N01) = (%d, %d, %d, %d, %d, %d)\n",
M, M,
...@@ -37,7 +38,7 @@ TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck1 ...@@ -37,7 +38,7 @@ TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck1
EXPECT_TRUE(tile_map.CalculateGridSize(c_grid_desc_m_n) == 16); EXPECT_TRUE(tile_map.CalculateGridSize(c_grid_desc_m_n) == 16);
// clang-format off // clang-format off
std::vector<std::vector<int>> expected = { std::vector<std::vector<int>> expected_m0idx_n0idx_valid = {
{0, 0, 1}, {0, 0, 1},
{0, 1, 1}, {0, 1, 1},
{0, 2, 1}, {0, 2, 1},
...@@ -64,7 +65,7 @@ TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck1 ...@@ -64,7 +65,7 @@ TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck1
std::cout << ", valid = " << tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock)) std::cout << ", valid = " << tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))
<< std::endl; << std::endl;
bool equal = bool equal =
expected[i] == expected_m0idx_n0idx_valid[i] ==
std::vector<int>{m0n0_idx[I0], std::vector<int>{m0n0_idx[I0],
m0n0_idx[I1], m0n0_idx[I1],
tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))}; tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))};
...@@ -78,12 +79,11 @@ TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck0 ...@@ -78,12 +79,11 @@ TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck0
const index_t N = 384; const index_t N = 384;
const index_t MPerBlock = 128; const index_t MPerBlock = 128;
const index_t NPerBlock = 128; const index_t NPerBlock = 128;
// const index_t MBlock = M / MPerBlock;
// const index_t NBlock = N / NPerBlock;
const index_t M01 = 4; const index_t M01 = 4;
const index_t N01 = 4; const index_t N01 = 4;
auto c_grid_desc_m_n = make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, I1)); auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
printf("(M, N, MPerBlock, NPerBlock, M01, N01) = (%d, %d, %d, %d, %d, %d)\n", printf("(M, N, MPerBlock, NPerBlock, M01, N01) = (%d, %d, %d, %d, %d, %d)\n",
M, M,
...@@ -98,3 +98,221 @@ TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck0 ...@@ -98,3 +98,221 @@ TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck0
EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == false); EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == false);
} }
TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N0_M01_DeviceCTileIndexCheck1)
{
const index_t M = 384;
const index_t N = 512;
const index_t MPerBlock = 128;
const index_t NPerBlock = 128;
const index_t MBlock = M / MPerBlock;
const index_t NBlock = N / NPerBlock;
const index_t M01 = 4;
auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
printf("(M, N, MPerBlock, NPerBlock, M01) = (%d, %d, %d, %d, %d)\n",
M,
N,
MPerBlock,
NPerBlock,
M01);
BlockToCTileMap_M00_N0_M01<MPerBlock, NPerBlock, decltype(c_grid_desc_m_n), true> tile_map(
c_grid_desc_m_n, M01);
EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == true);
EXPECT_TRUE(tile_map.CalculateGridSize(c_grid_desc_m_n) == 16);
// clang-format off
std::vector<std::vector<int>> expected_m0idx_n0idx_valid = {
{0, 0, 1},
{1, 0, 1},
{2, 0, 1},
{3, 0, 0},
{0, 1, 1},
{1, 1, 1},
{2, 1, 1},
{3, 1, 0},
{0, 2, 1},
{1, 2, 1},
{2, 2, 1},
{3, 2, 0},
{0, 3, 1},
{1, 3, 1},
{2, 3, 1},
{3, 3, 0}
};
// clang-format on
for(index_t i = 0; i < tile_map.CalculateGridSize(c_grid_desc_m_n); i++)
{
auto m0n0_idx = tile_map.CalculateBottomIndex(make_multi_index(i));
std::cout << "block_1d_id = " << i << ", m0, n0 = " << m0n0_idx[I0] << ", " << m0n0_idx[I1];
std::cout << ", valid = " << tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))
<< std::endl;
bool equal =
expected_m0idx_n0idx_valid[i] ==
std::vector<int>{m0n0_idx[I0],
m0n0_idx[I1],
tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))};
EXPECT_TRUE(equal);
}
}
TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N0_M01_DeviceCTileIndexCheck0)
{
const index_t M = 512;
const index_t N = 384;
const index_t MPerBlock = 128;
const index_t NPerBlock = 128;
auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
// clang-format off
std::vector<std::tuple<int, int, bool>> expected_m0_gridsize_validity = {
{5, 15, false},
{4, 12, true},
{3, 18, false},
{2, 12, true},
{1, 12, true}
};
// clang-format on
for(auto e : expected_m0_gridsize_validity)
{
const index_t M01 = std::get<0>(e);
printf("(M, N, MPerBlock, NPerBlock, M01) = (%d, %d, %d, %d, %d)\n",
M,
N,
MPerBlock,
NPerBlock,
M01);
BlockToCTileMap_M00_N0_M01<MPerBlock, NPerBlock, decltype(c_grid_desc_m_n), false> tile_map(
c_grid_desc_m_n, M01);
EXPECT_EQ(tile_map.CalculateGridSize(c_grid_desc_m_n), std::get<1>(e));
EXPECT_EQ(tile_map.CheckValidity(c_grid_desc_m_n), std::get<2>(e));
}
}
TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N0_M01Adapt)
{
const index_t M = 768;
const index_t N = 384;
const index_t MPerBlock = 128;
const index_t NPerBlock = 128;
const index_t MBlock = M / MPerBlock;
const index_t NBlock = N / NPerBlock;
constexpr index_t M01 = 4;
auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
printf("(M, N, MPerBlock, NPerBlock, M01) = (%d, %d, %d, %d, %d)\n",
M,
N,
MPerBlock,
NPerBlock,
M01);
BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, decltype(c_grid_desc_m_n)> tile_map(
c_grid_desc_m_n, M01);
EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == true);
EXPECT_TRUE(tile_map.CalculateGridSize(c_grid_desc_m_n) == 18);
// clang-format off
std::vector<std::vector<int>> expected_m0idx_n0idx_valid = {
{0, 0, 1},
{1, 0, 1},
{2, 0, 1},
{3, 0, 1},
{0, 1, 1},
{1, 1, 1},
{2, 1, 1},
{3, 1, 1},
{0, 2, 1},
{1, 2, 1},
{2, 2, 1},
{3, 2, 1},
{4, 0, 1},
{5, 0, 1},
{4, 1, 1},
{5, 1, 1},
{4, 2, 1},
{5, 2, 1},
};
// clang-format on
for(index_t i = 0; i < tile_map.CalculateGridSize(c_grid_desc_m_n); i++)
{
auto m0n0_idx = tile_map.CalculateBottomIndex(make_multi_index(i));
std::cout << "block_1d_id = " << i << ", m0, n0 = " << m0n0_idx[I0] << ", " << m0n0_idx[I1];
std::cout << ", valid = " << tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))
<< std::endl;
bool equal =
expected_m0idx_n0idx_valid[i] ==
std::vector<int>{m0n0_idx[I0],
m0n0_idx[I1],
tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))};
EXPECT_TRUE(equal);
}
}
TEST(BlockToCTileMap, TestBlockToCTileMap_KSplit_M00_N0_M01Adapt)
{
const index_t M = 768;
const index_t N = 384;
const index_t MPerBlock = 128;
const index_t NPerBlock = 128;
const index_t MBlock = M / MPerBlock;
const index_t NBlock = N / NPerBlock;
constexpr index_t M01 = 4;
const index_t KSplit = 3;
auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N));
printf("(M, N, MPerBlock, NPerBlock, M01) = (%d, %d, %d, %d, %d)\n",
M,
N,
MPerBlock,
NPerBlock,
M01);
BlockToCTileMap_KSplit_M00_N0_M01Adapt<MPerBlock, NPerBlock, decltype(c_grid_desc_m_n)>
tile_map(c_grid_desc_m_n, M01, KSplit);
EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == true);
EXPECT_TRUE(tile_map.CalculateGridSize(c_grid_desc_m_n) == 18 * KSplit);
std::vector<std::vector<int>> expected_ksplitidx_m0idx_n0idx_valid = {
{0, 0, 0, 1}, {0, 1, 0, 1}, {0, 2, 0, 1}, {0, 3, 0, 1}, {0, 0, 1, 1}, {0, 1, 1, 1},
{0, 2, 1, 1}, {0, 3, 1, 1}, {0, 0, 2, 1}, {0, 1, 2, 1}, {0, 2, 2, 1}, {0, 3, 2, 1},
{0, 4, 0, 1}, {0, 5, 0, 1}, {0, 4, 1, 1}, {0, 5, 1, 1}, {0, 4, 2, 1}, {0, 5, 2, 1},
{1, 0, 0, 1}, {1, 1, 0, 1}, {1, 2, 0, 1}, {1, 3, 0, 1}, {1, 0, 1, 1}, {1, 1, 1, 1},
{1, 2, 1, 1}, {1, 3, 1, 1}, {1, 0, 2, 1}, {1, 1, 2, 1}, {1, 2, 2, 1}, {1, 3, 2, 1},
{1, 4, 0, 1}, {1, 5, 0, 1}, {1, 4, 1, 1}, {1, 5, 1, 1}, {1, 4, 2, 1}, {1, 5, 2, 1},
{2, 0, 0, 1}, {2, 1, 0, 1}, {2, 2, 0, 1}, {2, 3, 0, 1}, {2, 0, 1, 1}, {2, 1, 1, 1},
{2, 2, 1, 1}, {2, 3, 1, 1}, {2, 0, 2, 1}, {2, 1, 2, 1}, {2, 2, 2, 1}, {2, 3, 2, 1},
{2, 4, 0, 1}, {2, 5, 0, 1}, {2, 4, 1, 1}, {2, 5, 1, 1}, {2, 4, 2, 1}, {2, 5, 2, 1},
};
for(index_t i = 0; i < tile_map.CalculateGridSize(c_grid_desc_m_n); i++)
{
auto ksplitm0n0_idx = tile_map.CalculateBottomIndex(make_multi_index(i));
std::cout << "block_1d_id = " << i << ", ksplit, m0, n0 = " << ksplitm0n0_idx[I0] << ", "
<< ksplitm0n0_idx[I1] << ", " << ksplitm0n0_idx[I2];
std::cout << ", valid = "
<< tile_map.ValidCTileIndex(ksplitm0n0_idx, make_tuple(MBlock, NBlock))
<< std::endl;
bool equal =
expected_ksplitidx_m0idx_n0idx_valid[i] ==
std::vector<int>{ksplitm0n0_idx[I0],
ksplitm0n0_idx[I1],
ksplitm0n0_idx[I2],
tile_map.ValidCTileIndex(ksplitm0n0_idx, make_tuple(MBlock, NBlock))};
EXPECT_TRUE(equal);
}
}
add_test_executable(test_gemm_fp32 gemm_fp32.cpp) # GEMM XDL
target_link_libraries(test_gemm_fp32 PRIVATE host_tensor) add_test_executable(test_gemm_xdl_fp32 gemm_xdl_fp32.cpp)
target_link_libraries(test_gemm_fp32 PRIVATE device_gemm_instance) target_link_libraries(test_gemm_xdl_fp32 PRIVATE host_tensor)
target_link_libraries(test_gemm_xdl_fp32 PRIVATE device_gemm_instance)
add_test_executable(test_gemm_fp16 gemm_fp16.cpp) add_test_executable(test_gemm_xdl_fp16 gemm_xdl_fp16.cpp)
target_link_libraries(test_gemm_fp16 PRIVATE host_tensor) target_link_libraries(test_gemm_xdl_fp16 PRIVATE host_tensor)
target_link_libraries(test_gemm_fp16 PRIVATE device_gemm_instance) target_link_libraries(test_gemm_xdl_fp16 PRIVATE device_gemm_instance)
add_test_executable(test_gemm_bf16 gemm_bf16.cpp) add_test_executable(test_gemm_xdl_bf16 gemm_xdl_bf16.cpp)
target_link_libraries(test_gemm_bf16 PRIVATE host_tensor) target_link_libraries(test_gemm_xdl_bf16 PRIVATE host_tensor)
target_link_libraries(test_gemm_bf16 PRIVATE device_gemm_instance) target_link_libraries(test_gemm_xdl_bf16 PRIVATE device_gemm_instance)
add_test_executable(test_gemm_int8 gemm_int8.cpp) add_test_executable(test_gemm_xdl_int8 gemm_xdl_int8.cpp)
target_link_libraries(test_gemm_int8 PRIVATE host_tensor) target_link_libraries(test_gemm_xdl_int8 PRIVATE host_tensor)
target_link_libraries(test_gemm_int8 PRIVATE device_gemm_instance) target_link_libraries(test_gemm_xdl_int8 PRIVATE device_gemm_instance)
# GEMM DL
add_test_executable(test_gemm_dl_fp32 gemm_dl_fp32.cpp)
target_link_libraries(test_gemm_dl_fp32 PRIVATE host_tensor)
target_link_libraries(test_gemm_dl_fp32 PRIVATE device_gemm_instance)
add_test_executable(test_gemm_dl_fp16 gemm_dl_fp16.cpp)
target_link_libraries(test_gemm_dl_fp16 PRIVATE host_tensor)
target_link_libraries(test_gemm_dl_fp16 PRIVATE device_gemm_instance)
add_test_executable(test_gemm_dl_int8 gemm_dl_int8.cpp)
target_link_libraries(test_gemm_dl_int8 PRIVATE host_tensor)
TArget_link_libraries(test_gemm_dl_int8 PRIVATE device_gemm_instance)
#include <algorithm> #include <algorithm>
#include <cstdlib> #include <cstdlib>
#include <half.hpp> #include <half.hpp>
#include <iostream> #include <iostream>
#include <numeric> #include <numeric>
#include <tuple> #include <tuple>
#include <vector> #include <vector>
#include "gemm_util.hpp" #include "../gemm/gemm_util.hpp"
#include "config.hpp" #include "config.hpp"
#include "print.hpp" #include "print.hpp"
#include "device.hpp" #include "device.hpp"
#include "host_tensor.hpp" #include "host_tensor.hpp"
#include "host_tensor_generator.hpp" #include "host_tensor_generator.hpp"
#include "host_gemm.hpp" #include "host_gemm.hpp"
#include "device_tensor.hpp" #include "device_tensor.hpp"
#include "device_gemm_xdl.hpp" #include "device_gemm_dl.hpp"
#include "device_gemm_xdl_cshuffle.hpp" #include "element_wise_operation.hpp"
#include "element_wise_operation.hpp" #include "reference_gemm.hpp"
#include "reference_gemm.hpp" #include "gemm_specialization.hpp"
#include "gemm_specialization.hpp"
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceGemmNoOpPtr =
using DeviceGemmNoOpPtr = ck::tensor_operation::device::DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::device::DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::element_wise::PassThrough>;
ck::tensor_operation::element_wise::PassThrough>;
namespace ck {
namespace ck { namespace tensor_operation {
namespace tensor_operation { namespace device {
namespace device { namespace device_gemm_instance {
namespace device_gemm_instance {
void add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_kn_mn_instances( void add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
std::vector<DeviceGemmNoOpPtr>&); void add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_nk_mn_instances( void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
std::vector<DeviceGemmNoOpPtr>&); void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_nk_mn_instances(
std::vector<DeviceGemmNoOpPtr>&); } // namespace device_gemm_instance
void add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_kn_mn_instances( } // namespace device
std::vector<DeviceGemmNoOpPtr>&); } // namespace tensor_operation
} // namespace device_gemm_instance } // namespace ck
} // namespace device
} // namespace tensor_operation int main()
} // namespace ck {
using ADataType = ck::half_t;
int main() using BDataType = ck::half_t;
{ using CDataType = ck::half_t;
using ADataType = int8_t; using AccDataType = float;
using BDataType = int8_t;
using CDataType = int8_t; using RowMajor = ck::tensor_layout::gemm::RowMajor;
using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
using RowMajor = ck::tensor_layout::gemm::RowMajor;
using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor; bool res = true;
std::vector<DeviceGemmNoOpPtr> gemmPtrs; std::vector<DeviceGemmNoOpPtr> gemmPtrs;
bool res = true;
ck::tensor_operation::device::device_gemm_instance::
ck::tensor_operation::device::device_gemm_instance:: add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(gemmPtrs);
add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_kn_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
for(auto& gemmPtr : gemmPtrs) {
{ res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr, ADataType,
ADataType, BDataType,
BDataType, CDataType,
CDataType, AccDataType,
ColumnMajor, ColumnMajor,
RowMajor, RowMajor,
RowMajor, RowMajor,
PassThrough, PassThrough,
PassThrough, PassThrough,
PassThrough>{}(gemmPtr); PassThrough>{}(gemmPtr);
} }
gemmPtrs.clear(); gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance:: ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_c_shuffle_int8_int8_int8_km_nk_mn_instances(gemmPtrs); add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs) for(auto& gemmPtr : gemmPtrs)
{ {
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr, res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType, ADataType,
BDataType, BDataType,
CDataType, CDataType,
ColumnMajor, AccDataType,
ColumnMajor, ColumnMajor,
RowMajor, ColumnMajor,
PassThrough, RowMajor,
PassThrough, PassThrough,
PassThrough>{}(gemmPtr); PassThrough,
} PassThrough>{}(gemmPtr);
}
gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance:: gemmPtrs.clear();
add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_kn_mn_instances(gemmPtrs); ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{ for(auto& gemmPtr : gemmPtrs)
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr, {
ADataType, res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
BDataType, ADataType,
CDataType, BDataType,
RowMajor, CDataType,
RowMajor, AccDataType,
RowMajor, RowMajor,
PassThrough, RowMajor,
PassThrough, RowMajor,
PassThrough>{}(gemmPtr); PassThrough,
} PassThrough,
PassThrough>{}(gemmPtr);
gemmPtrs.clear(); }
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_c_shuffle_int8_int8_int8_mk_nk_mn_instances(gemmPtrs); gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance::
for(auto& gemmPtr : gemmPtrs) add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(gemmPtrs);
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr, for(auto& gemmPtr : gemmPtrs)
ADataType, {
BDataType, res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
CDataType, ADataType,
RowMajor, BDataType,
ColumnMajor, CDataType,
RowMajor, AccDataType,
PassThrough, RowMajor,
PassThrough, ColumnMajor,
PassThrough>{}(gemmPtr); RowMajor,
} PassThrough,
PassThrough,
std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; PassThrough>{}(gemmPtr);
return res ? 0 : 1; }
}
std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
return res ? 0 : 1;
}
#include <algorithm>
#include <cstdlib>
#include <half.hpp>
#include <iostream>
#include <numeric>
#include <tuple>
#include <vector>
#include "../gemm/gemm_util.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
#include "device_gemm_dl.hpp"
#include "element_wise_operation.hpp"
#include "reference_gemm.hpp"
#include "gemm_specialization.hpp"
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceGemmNoOpPtr =
ck::tensor_operation::device::DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_gemm_instance {
void add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_dl_f32_f32_f32_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
} // namespace device_gemm_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
int main()
{
using ADataType = float;
using BDataType = float;
using CDataType = float;
using AccDataType = float;
using RowMajor = ck::tensor_layout::gemm::RowMajor;
using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
bool res = true;
std::vector<DeviceGemmNoOpPtr> gemmPtrs;
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
AccDataType,
ColumnMajor,
RowMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
AccDataType,
ColumnMajor,
ColumnMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_dl_f32_f32_f32_mk_kn_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
AccDataType,
RowMajor,
RowMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
AccDataType,
RowMajor,
ColumnMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
return res ? 0 : 1;
}
#include <algorithm>
#include <cstdlib>
#include <half.hpp>
#include <iostream>
#include <numeric>
#include <tuple>
#include <vector>
#include "../gemm/gemm_util.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
#include "device_gemm_dl.hpp"
#include "element_wise_operation.hpp"
#include "reference_gemm.hpp"
#include "gemm_specialization.hpp"
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceGemmNoOpPtr =
ck::tensor_operation::device::DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_gemm_instance {
void add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
} // namespace device_gemm_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
int main()
{
using ADataType = int8_t;
using BDataType = int8_t;
using CDataType = int8_t;
using AccDataType = int;
using RowMajor = ck::tensor_layout::gemm::RowMajor;
using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
bool res = true;
std::vector<DeviceGemmNoOpPtr> gemmPtrs;
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
AccDataType,
ColumnMajor,
RowMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
AccDataType,
ColumnMajor,
ColumnMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
AccDataType,
RowMajor,
RowMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
AccDataType,
RowMajor,
ColumnMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
return res ? 0 : 1;
}
...@@ -60,7 +60,7 @@ template <typename DeviceGemmPtr_, ...@@ -60,7 +60,7 @@ template <typename DeviceGemmPtr_,
typename AElementwiseOperation, typename AElementwiseOperation,
typename BElementwiseOperation, typename BElementwiseOperation,
typename CElementwiseOperation> typename CElementwiseOperation>
void RunDeviceGEMM(DeviceGemmPtr_& gemmPtr, bool RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,
const ck::gemm_util::GemmParams& params, const ck::gemm_util::GemmParams& params,
const Tensor<ADataType>& A, const Tensor<ADataType>& A,
const Tensor<BDataType>& B, const Tensor<BDataType>& B,
...@@ -73,9 +73,6 @@ void RunDeviceGEMM(DeviceGemmPtr_& gemmPtr, ...@@ -73,9 +73,6 @@ void RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,
DeviceMem b_k_n_device_buf(sizeof(BDataType) * B.mDesc.GetElementSpace()); DeviceMem b_k_n_device_buf(sizeof(BDataType) * B.mDesc.GetElementSpace());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * C.mDesc.GetElementSpace()); DeviceMem c_m_n_device_buf(sizeof(CDataType) * C.mDesc.GetElementSpace());
a_m_k_device_buf.ToDevice(A.mData.data());
b_k_n_device_buf.ToDevice(B.mData.data());
auto invoker_ptr = gemmPtr->MakeInvokerPointer(); auto invoker_ptr = gemmPtr->MakeInvokerPointer();
auto argument_ptr = auto argument_ptr =
gemmPtr->MakeArgumentPointer(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()), gemmPtr->MakeArgumentPointer(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
...@@ -91,21 +88,30 @@ void RunDeviceGEMM(DeviceGemmPtr_& gemmPtr, ...@@ -91,21 +88,30 @@ void RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,
b_element_op, b_element_op,
c_element_op); c_element_op);
if(!gemmPtr->IsSupportedArgument(argument_ptr.get())) if(gemmPtr->IsSupportedArgument(argument_ptr.get()))
{ {
throw std::runtime_error( a_m_k_device_buf.ToDevice(A.mData.data());
"wrong! device_gemm with the specified compilation parameters does " b_k_n_device_buf.ToDevice(B.mData.data());
"not support this GEMM problem"); invoker_ptr->Run(argument_ptr.get());
c_m_n_device_buf.FromDevice(C.mData.data());
return true;
} }
else
{
std::cout << "device_gemm with the specified compilation parameters does "
"not support this GEMM problem"
<< std::endl;
invoker_ptr->Run(argument_ptr.get()); return false;
c_m_n_device_buf.FromDevice(C.mData.data()); }
} }
template <typename DeviceGemmPtr_, template <typename DeviceGemmPtr_,
typename ADataType, typename ADataType,
typename BDataType, typename BDataType,
typename CDataType, typename CDataType,
typename AccDataType,
typename ALayout, typename ALayout,
typename BLayout, typename BLayout,
typename CLayout, typename CLayout,
...@@ -181,6 +187,7 @@ struct TestGemm ...@@ -181,6 +187,7 @@ struct TestGemm
ck::tensor_operation::host::ReferenceGemm<ADataType, ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType, BDataType,
CDataType, CDataType,
AccDataType,
AElementwiseOperation, AElementwiseOperation,
BElementwiseOperation, BElementwiseOperation,
CElementwiseOperation>; CElementwiseOperation>;
...@@ -188,28 +195,40 @@ struct TestGemm ...@@ -188,28 +195,40 @@ struct TestGemm
a, b, c_host, a_element_op, b_element_op, c_element_op); a, b, c_host, a_element_op, b_element_op, c_element_op);
// Act // Act
ck::gemm_util::RunDeviceGEMM( bool is_supported = ck::gemm_util::RunDeviceGEMM(
gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op); gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op);
// Assert if(is_supported)
bool res = false;
if(std::is_same<CDataType, float>::value)
{ {
res = ck::utils::check_err(c_device.mData, c_host.mData); // Assert
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; bool res = false;
if(std::is_same<CDataType, float>::value)
{
res = ck::utils::check_err(c_device.mData, c_host.mData);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
}
else if(std::is_same<CDataType, ck::half_t>::value)
{
res = ck::utils::check_err(c_device.mData, c_host.mData);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
}
else if(std::is_same<CDataType, int8_t>::value)
{
res = ck::utils::check_err(c_device.mData, c_host.mData);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
}
else if(std::is_same<CDataType, double>::value)
{
res = ck::utils::check_err(c_device.mData, c_host.mData);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
}
return res;
} }
else if(std::is_same<CDataType, ck::half_t>::value) else
{ {
res = ck::utils::check_err(c_device.mData, c_host.mData); return true;
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
} }
else if(std::is_same<CDataType, int8_t>::value)
{
res = ck::utils::check_err(c_device.mData, c_host.mData);
std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
}
return res;
} }
}; };
...@@ -299,6 +318,7 @@ struct TestGemmBF16 ...@@ -299,6 +318,7 @@ struct TestGemmBF16
// use fp32 host kernel to verify bf16 device kernel // use fp32 host kernel to verify bf16 device kernel
using ReferenceGemmInstance = using ReferenceGemmInstance =
ck::tensor_operation::host::ReferenceGemm<float, ck::tensor_operation::host::ReferenceGemm<float,
float,
float, float,
float, float,
AElementwiseOperation, AElementwiseOperation,
......
...@@ -52,9 +52,10 @@ void add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances( ...@@ -52,9 +52,10 @@ void add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(
int main() int main()
{ {
using ADataType = ck::half_t; using ADataType = ck::half_t;
using BDataType = ck::half_t; using BDataType = ck::half_t;
using CDataType = ck::half_t; using CDataType = ck::half_t;
using AccDataType = float;
using RowMajor = ck::tensor_layout::gemm::RowMajor; using RowMajor = ck::tensor_layout::gemm::RowMajor;
using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor; using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
...@@ -74,6 +75,7 @@ int main() ...@@ -74,6 +75,7 @@ int main()
ADataType, ADataType,
BDataType, BDataType,
CDataType, CDataType,
AccDataType,
ColumnMajor, ColumnMajor,
RowMajor, RowMajor,
RowMajor, RowMajor,
...@@ -96,6 +98,7 @@ int main() ...@@ -96,6 +98,7 @@ int main()
ADataType, ADataType,
BDataType, BDataType,
CDataType, CDataType,
AccDataType,
ColumnMajor, ColumnMajor,
ColumnMajor, ColumnMajor,
RowMajor, RowMajor,
...@@ -118,6 +121,7 @@ int main() ...@@ -118,6 +121,7 @@ int main()
ADataType, ADataType,
BDataType, BDataType,
CDataType, CDataType,
AccDataType,
RowMajor, RowMajor,
RowMajor, RowMajor,
RowMajor, RowMajor,
...@@ -142,6 +146,7 @@ int main() ...@@ -142,6 +146,7 @@ int main()
ADataType, ADataType,
BDataType, BDataType,
CDataType, CDataType,
AccDataType,
RowMajor, RowMajor,
ColumnMajor, ColumnMajor,
RowMajor, RowMajor,
......
...@@ -53,9 +53,10 @@ void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances(std::vector<De ...@@ -53,9 +53,10 @@ void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances(std::vector<De
int main() int main()
{ {
using ADataType = float; using ADataType = float;
using BDataType = float; using BDataType = float;
using CDataType = float; using CDataType = float;
using AccDataType = float;
using RowMajor = ck::tensor_layout::gemm::RowMajor; using RowMajor = ck::tensor_layout::gemm::RowMajor;
using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor; using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
...@@ -75,6 +76,7 @@ int main() ...@@ -75,6 +76,7 @@ int main()
ADataType, ADataType,
BDataType, BDataType,
CDataType, CDataType,
AccDataType,
ColumnMajor, ColumnMajor,
RowMajor, RowMajor,
RowMajor, RowMajor,
...@@ -97,6 +99,7 @@ int main() ...@@ -97,6 +99,7 @@ int main()
ADataType, ADataType,
BDataType, BDataType,
CDataType, CDataType,
AccDataType,
ColumnMajor, ColumnMajor,
ColumnMajor, ColumnMajor,
RowMajor, RowMajor,
...@@ -119,6 +122,7 @@ int main() ...@@ -119,6 +122,7 @@ int main()
ADataType, ADataType,
BDataType, BDataType,
CDataType, CDataType,
AccDataType,
RowMajor, RowMajor,
RowMajor, RowMajor,
RowMajor, RowMajor,
...@@ -141,6 +145,7 @@ int main() ...@@ -141,6 +145,7 @@ int main()
ADataType, ADataType,
BDataType, BDataType,
CDataType, CDataType,
AccDataType,
RowMajor, RowMajor,
ColumnMajor, ColumnMajor,
RowMajor, RowMajor,
......
#include <algorithm>
#include <cstdlib>
#include <half.hpp>
#include <iostream>
#include <numeric>
#include <tuple>
#include <vector>
#include "gemm_util.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
#include "device_gemm_xdl.hpp"
#include "element_wise_operation.hpp"
#include "reference_gemm.hpp"
#include "gemm_specialization.hpp"
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceGemmNoOpPtr =
ck::tensor_operation::device::DeviceGemmPtr<ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_gemm_instance {
void add_device_gemm_xdl_f64_f64_f64_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_f64_f64_f64_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
} // namespace device_gemm_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
inline std::string get_device_name()
{
hipDeviceProp_t props{};
int device;
auto status = hipGetDevice(&device);
if(status != hipSuccess)
{
return std::string();
}
status = hipGetDeviceProperties(&props, device);
if(status != hipSuccess)
{
return std::string();
}
const std::string name(props.gcnArchName);
return name;
}
int main()
{
if(get_device_name().find("gfx90a") == std::string::npos)
{
std::cout << "TestGemm ..... SUCCESS" << std::endl;
return 0;
}
using ADataType = double;
using BDataType = double;
using CDataType = double;
using AccDataType = double;
using RowMajor = ck::tensor_layout::gemm::RowMajor;
using ColumnMajor = ck::tensor_layout::gemm::ColumnMajor;
bool res = true;
std::vector<DeviceGemmNoOpPtr> gemmPtrs;
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_f64_f64_f64_km_kn_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
AccDataType,
ColumnMajor,
RowMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_f64_f64_f64_km_nk_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
AccDataType,
ColumnMajor,
ColumnMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
AccDataType,
RowMajor,
RowMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
gemmPtrs.clear();
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances(gemmPtrs);
for(auto& gemmPtr : gemmPtrs)
{
res &= ck::gemm_util::TestGemm<DeviceGemmNoOpPtr,
ADataType,
BDataType,
CDataType,
AccDataType,
RowMajor,
ColumnMajor,
RowMajor,
PassThrough,
PassThrough,
PassThrough>{}(gemmPtr);
}
std::cout << "TestGemm ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl;
return res ? 0 : 1;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment