Commit dd6a8de4 authored by Jehandad Khan

Merge branch 'develop' into jd/dev_pkg

parents 0aa899aa abf4bdb9
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <half.hpp>
#include "profile_gemm_reduce_impl.hpp"
int profile_gemm_reduce(int argc, char* argv[])
{
enum struct GemmMatrixLayout
{
MK_KN_MN, // 0
MK_NK_MN, // 1
KM_KN_MN, // 2
KM_NK_MN, // 3
};
enum struct GemmReduceDataType
{
F32_F32_F32_F32_F32, // 0
F16_F16_F16_F32_F32, // 1
};
if(!(argc == 14 || argc == 15))
{
printf("arg1: tensor operation (gemm: GEMM+Reduce)\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
printf(" 2: A[k, m] * B[k, n] = C[m, n];\n");
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
printf("arg14: split k into mulitiple batch\n");
exit(1);
}
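// A sketch of one possible invocation matching the usage text above
// (problem sizes are illustrative; -1 selects the packed default stride):
//   ckProfiler gemm_reduce 1 0 1 1 0 5 3840 4096 4096 -1 -1 -1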
const auto data_type = static_cast<GemmReduceDataType>(std::stoi(argv[2]));
const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const int nrepeat = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
const int K = std::stoi(argv[10]);
const int StrideA = std::stoi(argv[11]);
const int StrideB = std::stoi(argv[12]);
const int StrideC = std::stoi(argv[13]);
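// Negative strides are replaced below with the packed defaults for the
// chosen layout (e.g. StrideA = K for row-major A, M for column-major A).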
if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_gemm_reduce_impl<ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? K : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 &&
layout == GemmMatrixLayout::MK_NK_MN)
{
ck::profiler::profile_gemm_reduce_impl<ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? K : StrideA,
(StrideB < 0) ? K : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 &&
layout == GemmMatrixLayout::KM_KN_MN)
{
ck::profiler::profile_gemm_reduce_impl<ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 &&
layout == GemmMatrixLayout::KM_NK_MN)
{
ck::profiler::profile_gemm_reduce_impl<ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? K : StrideB,
(StrideC < 0) ? N : StrideC);
}
else
{
throw std::runtime_error("wrong! this data_type & layout is not implemented");
}
return 1;
}
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <sstream>
#include <string>
#include <vector>
#include <half.hpp>
#include "profile_grouped_gemm_impl.hpp"
enum struct GemmMatrixLayout
{
MK_KN_MN, // 0
MK_NK_MN, // 1
KM_KN_MN, // 2
KM_NK_MN, // 3
MK_KN_NM, // 4
MK_NK_NM, // 5
KM_KN_NM, // 6
KM_NK_NM, // 7
};
enum struct GemmDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3
};
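// Parses a comma-separated list such as "256,256" into a vector of ints;
// each grouped-GEMM size/stride argument supplies one value per group.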
std::vector<int> argToIntArray(char* input)
{
std::vector<int> out;
std::istringstream in(input);
std::string item;
while(std::getline(in, item, ','))
{
out.push_back(std::stoi(item));
}
return out;
}
int profile_grouped_gemm(int argc, char* argv[])
{
if(!(argc == 14))
{
printf("arg1: tensor operation (grouped_gemm: Grouped GEMM)\n");
printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
printf(" 2: A[k, m] * B[k, n] = C[m, n];\n");
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n");
printf("arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
"64,64 64,64 128,128)\n");
exit(1);
}
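// A sketch of one possible invocation with two groups, using the values
// from the usage text above (all sizes are illustrative):
//   ckProfiler grouped_gemm 1 0 1 1 0 5 256,256 128,128 64,64 64,64 64,64 128,128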
const auto data_type = static_cast<GemmDataType>(std::stoi(argv[2]));
const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const int nrepeat = std::stoi(argv[7]);
const auto Ms = argToIntArray(argv[8]);
const auto Ns = argToIntArray(argv[9]);
const auto Ks = argToIntArray(argv[10]);
const auto StrideAs = argToIntArray(argv[11]);
const auto StrideBs = argToIntArray(argv[12]);
const auto StrideCs = argToIntArray(argv[13]);
if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs);
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{
ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs);
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
{
ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs);
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
{
ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs);
}
else
{
throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
}
return 1;
}
@@ -20,12 +20,12 @@
using namespace std;
using ck::NanPropagation_t;
using ck::ReduceTensorIndices_t;
using ck::ReduceTensorOp_t;
using ck::NanPropagation;
using ck::ReduceTensorIndices;
using ck::ReduceTensorOp;
static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
{"toReduceDims", required_argument, nullptr, 'R'},
{"reduceDims", required_argument, nullptr, 'R'},
{"reduceOp", required_argument, nullptr, 'O'},
{"compType", required_argument, nullptr, 'C'},
{"outType", required_argument, nullptr, 'W'},
@@ -34,6 +34,8 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,
{"scales", required_argument, nullptr, 'S'},
{"half", no_argument, nullptr, '?'},
{"double", no_argument, nullptr, '?'},
{"int8", no_argument, nullptr, '?'},
{"bf16", no_argument, nullptr, '?'},
{"dumpout", required_argument, nullptr, 'o'},
{"verify", required_argument, nullptr, 'v'},
{"log", required_argument, nullptr, 'l'},
@@ -82,7 +84,7 @@ static std::vector<T> getTypeValuesFromString(const char* cstr_values)
return (values);
}
typedef enum
enum struct AppDataType
{
appHalf = 0,
appFloat = 1,
@@ -91,11 +93,11 @@ typedef enum
appInt8x4 = 4,
appBFloat16 = 5,
appDouble = 6,
} appDataType_t;
};
static void check_reduce_dims(const int rank, const std::vector<int>& toReduceDims)
static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims)
{
for(auto dim : toReduceDims)
for(auto dim : reduceDims)
{
if(dim < 0 || dim >= rank)
throw std::runtime_error("Invalid dimension index specified for Reducing");
@@ -103,7 +105,7 @@ static void check_reduce_dims(const int rank, const std::vector<int>& toReduceDi
unsigned int flag = 0;
for(auto dim : toReduceDims)
for(auto dim : reduceDims)
{
if(flag & (0x1 << dim))
throw std::runtime_error("All toReduce dimensions should be different!");
@@ -119,25 +121,27 @@ class AppArgs
public:
bool use_half = false;
bool use_double = false;
bool use_int8 = false;
bool use_bf16 = false;
std::vector<size_t> inLengths;
std::vector<size_t> outLengths;
std::vector<int> toReduceDims;
std::vector<int> reduceDims;
std::vector<float> scales;
ReduceTensorOp_t reduceOp = ReduceTensorOp_t::ADD;
appDataType_t compTypeId = appFloat;
appDataType_t outTypeId = appFloat;
ReduceTensorOp reduceOp = ReduceTensorOp::ADD;
AppDataType compTypeId = AppDataType::appFloat;
AppDataType outTypeId = AppDataType::appFloat;
bool compType_assigned = false;
bool outType_assigned = false;
NanPropagation_t nanOpt = NanPropagation_t::NOT_PROPAGATE_NAN;
ReduceTensorIndices_t indicesOpt = ReduceTensorIndices_t::NO_INDICES;
bool do_log = false;
bool do_verification = false;
bool do_dumpout = false;
NanPropagation nanOpt = NanPropagation::NOT_PROPAGATE_NAN;
ReduceTensorIndices indicesOpt = ReduceTensorIndices::NO_INDICES;
bool do_log = false;
bool do_verification = false;
bool do_dumpout = false;
int init_method;
int nrepeat;
@@ -152,7 +156,7 @@ class AppArgs
std::cout << "Usage of " << cmd << std::endl;
std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths"
<< std::endl;
std::cout << "--toReduceDims or -R, comma separated list of to-reduce dimensions"
std::cout << "--reduceDims or -R, comma separated list of to-reduce dimensions"
<< std::endl;
std::cout << "--reduceOp or -O, enum value indicating the reduction operations"
<< std::endl;
@@ -169,6 +173,8 @@ class AppArgs
<< std::endl;
std::cout << "--half, use fp16 for the input and output tensor data types" << std::endl;
std::cout << "--double, use fp64 for the input and output tensor data types" << std::endl;
std::cout << "--int8, use int8 for the input and output tensor data types" << std::endl;
std::cout << "--bf16, use bfloat16 for the input and output tensor data types" << std::endl;
std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
"comparing with the host-based reduction"
<< std::endl;
@@ -201,39 +207,39 @@ class AppArgs
if(!optarg)
throw std::runtime_error("Invalid option format!");
toReduceDims = getTypeValuesFromString<int>(optarg);
reduceDims = getTypeValuesFromString<int>(optarg);
break;
case 'O':
if(!optarg)
throw std::runtime_error("Invalid option format!");
reduceOp = static_cast<ReduceTensorOp_t>(std::atoi(optarg));
reduceOp = static_cast<ReduceTensorOp>(std::atoi(optarg));
break;
case 'C':
if(!optarg)
throw std::runtime_error("Invalid option format!");
compTypeId = static_cast<appDataType_t>(std::atoi(optarg));
compTypeId = static_cast<AppDataType>(std::atoi(optarg));
compType_assigned = true;
break;
case 'W':
if(!optarg)
throw std::runtime_error("Invalid option format!");
outTypeId = static_cast<appDataType_t>(std::atoi(optarg));
outTypeId = static_cast<AppDataType>(std::atoi(optarg));
outType_assigned = true;
break;
case 'N':
if(!optarg)
throw std::runtime_error("Invalid option format!");
nanOpt = static_cast<NanPropagation_t>(std::atoi(optarg));
nanOpt = static_cast<NanPropagation>(std::atoi(optarg));
break;
case 'I':
if(!optarg)
throw std::runtime_error("Invalid option format!");
indicesOpt = static_cast<ReduceTensorIndices_t>(std::atoi(optarg));
indicesOpt = static_cast<ReduceTensorIndices>(std::atoi(optarg));
break;
case 'S':
if(!optarg)
@@ -267,6 +273,10 @@ class AppArgs
use_half = true;
else if(std::string(long_options[option_index].name) == "double")
use_double = true;
else if(std::string(long_options[option_index].name) == "int8")
use_int8 = true;
else if(std::string(long_options[option_index].name) == "bf16")
use_bf16 = true;
else if(std::string(long_options[option_index].name) == "help")
{
show_usage(argv[0]);
@@ -293,10 +303,10 @@ class AppArgs
scales.push_back(0.0f);
};
if(reduceOp == ReduceTensorOp_t::MIN || reduceOp == ReduceTensorOp_t::MAX ||
reduceOp == ReduceTensorOp_t::AMAX)
if(reduceOp == ReduceTensorOp::MIN || reduceOp == ReduceTensorOp::MAX ||
reduceOp == ReduceTensorOp::AMAX)
{
if(indicesOpt != ReduceTensorIndices_t::NO_INDICES)
if(indicesOpt != ReduceTensorIndices::NO_INDICES)
need_indices = true;
// for indexable operations, no need to assign compType and outType, just let them be
@@ -321,23 +331,24 @@ int profile_reduce(int argc, char* argv[])
int rank = args.inLengths.size();
check_reduce_dims(rank, args.toReduceDims);
check_reduce_dims(rank, args.reduceDims);
if(args.reduceOp == ReduceTensorOp_t::MUL || args.reduceOp == ReduceTensorOp_t::NORM1)
if(args.reduceOp == ReduceTensorOp::MUL || args.reduceOp == ReduceTensorOp::NORM1)
throw std::runtime_error("MUL and NORM1 are not supported by composable kernel!");
if(args.use_half)
{
if(!args.compType_assigned)
args.compTypeId = appHalf;
args.compTypeId = AppDataType::appHalf;
if(args.outType_assigned && (args.outTypeId != appHalf && args.outTypeId != appFloat))
args.outTypeId = appFloat;
if(args.outType_assigned &&
(args.outTypeId != AppDataType::appHalf && args.outTypeId != AppDataType::appFloat))
args.outTypeId = AppDataType::appFloat;
if(!args.outType_assigned)
args.outTypeId = appHalf;
args.outTypeId = AppDataType::appHalf;
if(args.compTypeId == appHalf)
if(args.compTypeId == AppDataType::appHalf)
{
profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(args.do_verification,
args.init_method,
@@ -345,14 +356,14 @@ int profile_reduce(int argc, char* argv[])
args.do_dumpout,
args.nrepeat,
args.inLengths,
args.toReduceDims,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
args.scales[0],
args.scales[1]);
}
else if(args.compTypeId == appFloat)
else if(args.compTypeId == AppDataType::appFloat)
{
profile_reduce_impl<ck::half_t, float, ck::half_t>(args.do_verification,
args.init_method,
@@ -360,7 +371,7 @@ int profile_reduce(int argc, char* argv[])
args.do_dumpout,
args.nrepeat,
args.inLengths,
args.toReduceDims,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
@@ -378,16 +389,83 @@ int profile_reduce(int argc, char* argv[])
args.do_dumpout,
args.nrepeat,
args.inLengths,
args.toReduceDims,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
args.scales[0],
args.scales[1]);
}
else if(args.use_int8)
{
if(!args.compType_assigned)
args.compTypeId = AppDataType::appInt8;
if(args.outType_assigned &&
(args.outTypeId != AppDataType::appInt8 && args.outTypeId != AppDataType::appInt32))
args.outTypeId = AppDataType::appInt32;
if(!args.outType_assigned)
args.outTypeId = AppDataType::appInt8;
if(args.compTypeId == AppDataType::appInt8)
{
profile_reduce_impl<int8_t, int8_t, int8_t>(args.do_verification,
args.init_method,
args.do_log,
args.do_dumpout,
args.nrepeat,
args.inLengths,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
args.scales[0],
args.scales[1]);
}
else if(args.compTypeId == AppDataType::appInt32)
{
profile_reduce_impl<int8_t, int32_t, int8_t>(args.do_verification,
args.init_method,
args.do_log,
args.do_dumpout,
args.nrepeat,
args.inLengths,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
args.scales[0],
args.scales[1]);
}
else
throw std::runtime_error("Invalid compType assignment!");
}
else if(args.use_bf16)
{
if(args.outType_assigned &&
(args.outTypeId != AppDataType::appBFloat16 && args.outTypeId != AppDataType::appFloat))
args.outTypeId = AppDataType::appFloat;
if(!args.outType_assigned)
args.outTypeId = AppDataType::appBFloat16;
profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(args.do_verification,
args.init_method,
args.do_log,
args.do_dumpout,
args.nrepeat,
args.inLengths,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
args.scales[0],
args.scales[1]);
}
else
{
if(args.compTypeId == appFloat)
if(args.compTypeId == AppDataType::appFloat)
{
profile_reduce_impl<float, float, float>(args.do_verification,
args.init_method,
@@ -395,14 +473,14 @@ int profile_reduce(int argc, char* argv[])
args.do_dumpout,
args.nrepeat,
args.inLengths,
args.toReduceDims,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
args.scales[0],
args.scales[1]);
}
else if(args.compTypeId == appDouble)
else if(args.compTypeId == AppDataType::appDouble)
{
profile_reduce_impl<float, double, float>(args.do_verification,
args.init_method,
@@ -410,7 +488,7 @@ int profile_reduce(int argc, char* argv[])
args.do_dumpout,
args.nrepeat,
args.inLengths,
args.toReduceDims,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
......
@@ -5,16 +5,20 @@
#include <cstring>
int profile_gemm(int, char*[]);
int profile_batched_gemm(int, char*[]);
int profile_gemm_bias_2d(int, char*[]);
int profile_gemm_bias_relu(int, char*[]);
int profile_gemm_bias_relu_add(int, char*[]);
int profile_gemm_reduce(int, char*[]);
int profile_batched_gemm(int, char*[]);
int profile_grouped_gemm(int, char*[]);
int profile_conv_fwd(int, char*[]);
int profile_conv_fwd_bias_relu(int, char*[]);
int profile_conv_fwd_bias_relu_add(int, char*[]);
int profile_conv_fwd_bias_relu_atomic_add(int, char*[]);
int profile_conv_bwd_data(int, char*[]);
int profile_convnd_bwd_data(int, char*[], int);
int profile_reduce(int, char*[]);
int profile_conv_bwd_weight(int, char*[]);
int profile_batched_gemm_reduce(int, char*[]);
int main(int argc, char* argv[])
{
@@ -34,10 +38,22 @@ int main(int argc, char* argv[])
{
return profile_gemm_bias_relu_add(argc, argv);
}
else if(strcmp(argv[1], "gemm_reduce") == 0)
{
return profile_gemm_reduce(argc, argv);
}
else if(strcmp(argv[1], "batched_gemm") == 0)
{
return profile_batched_gemm(argc, argv);
}
else if(strcmp(argv[1], "batched_gemm_reduce") == 0)
{
return profile_batched_gemm_reduce(argc, argv);
}
else if(strcmp(argv[1], "grouped_gemm") == 0)
{
return profile_grouped_gemm(argc, argv);
}
else if(strcmp(argv[1], "conv_fwd") == 0)
{
return profile_conv_fwd(argc, argv);
@@ -54,14 +70,26 @@ int main(int argc, char* argv[])
{
return profile_conv_fwd_bias_relu_atomic_add(argc, argv);
}
else if(strcmp(argv[1], "conv_bwd") == 0)
else if(strcmp(argv[1], "conv1d_bwd_data") == 0)
{
return profile_conv_bwd_data(argc, argv);
return profile_convnd_bwd_data(argc, argv, 1);
}
else if(strcmp(argv[1], "conv2d_bwd_data") == 0)
{
return profile_convnd_bwd_data(argc, argv, 2);
}
else if(strcmp(argv[1], "conv3d_bwd_data") == 0)
{
return profile_convnd_bwd_data(argc, argv, 3);
}
else if(strcmp(argv[1], "reduce") == 0)
{
return profile_reduce(argc, argv);
}
else if(strcmp(argv[1], "conv2d_bwd_weight") == 0)
{
return profile_conv_bwd_weight(argc, argv);
}
else
{
// clang-format off
@@ -69,14 +97,18 @@ int main(int argc, char* argv[])
" gemm_bias_2d: GEMM+Bias(2D)\n"
" gemm_bias_relu: GEMM+Bias+ReLU\n"
" gemm_bias_relu_add: GEMM+Bias+ReLU+Add\n"
" gemm_reduce: GEMM+Reduce\n"
" grouped_gemm: Grouped GEMM\n"
" conv_fwd: ForwardConvolution\n"
" conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n"
" conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n"
" conv_fwd_bias_relu_atomic_add: ForwardConvolution+Bias+ReLU+AtomicAdd\n"
" conv_bwd: BackwardConvolution\n"
" reduce: REDUCE\n");
" conv1d_bwd_data: BackwardConvolution data 1 dim\n"
" conv2d_bwd_data: BackwardConvolution data 2 dim\n"
" conv3d_bwd_data: BackwardConvolution data 3 dim\n"
" reduce: REDUCE\n"
" conv2d_bwd_weight: Backward Weight Convolution 2d\n");
// clang-format on
return 0;
}
return 0;
}
@@ -3,16 +3,18 @@ rm -f CMakeCache.txt
rm -f *.cmake
rm -rf CMakeFiles
MY_PROJECT_SOURCE=../../..
MY_PROJECT_SOURCE=../
MY_PROJECT_INSTALL=../install.dir
cmake \
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD" \
-D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 --offload-arch=gfx90a -O3 -ftemplate-backtrace-limit=0 -gline-tables-only -save-temps=$PWD" \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
${MY_PROJECT_SOURCE}
#-D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 --offload-arch=gfx90a -O3 -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD" \
#-D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 --offload-arch=gfx90a -O3 -ftemplate-backtrace-limit=0 -gline-tables-only -save-temps=$PWD" \
#!/bin/bash
FILE=$1
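# Counts references to each vector register v0..v255 in an ISA listing,
# e.g. one of the assembly files kept by a -save-temps build.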
for num in {0..255}
do
echo v${num} $( grep -w "v${num}" "$FILE" | wc -l )
done
for num in {0..255}
do
base_pattern="(\[?${num}\b|\[\d*:${num}\])"
spattern="s${base_pattern}"
vpattern="v${base_pattern}"
apattern="a${base_pattern}"
scount=$(grep -P "$spattern" "$FILE" | wc -l)
vcount=$(grep -P "$vpattern" "$FILE" | wc -l)
acount=$(grep -P "$apattern" "$FILE" | wc -l)
echo -n "v${num} $vcount, s${num} $scount, a${num} $acount"
if [[ $scount -ne 0 || $vcount -ne 0 || $acount -ne 0 ]]; then
echo -n " *"
fi
echo ""
done
#!/bin/bash
PRECISION= ##--half
PRECISION=
##PRECISION=--half
##PRECISION=--double
##PRECISION=--int8
##PRECISION=--bf16
if test -n $PRECISION && test "$PRECISION" = "--half"; then
CTYPE="-C 1"
else
CTYPE=""
if [ -n $PRECISION ] && [ "$PRECISION" = "--half" -o "$PRECISION" = "--bf16" ]; then
ACCTYPE="-C 1"
elif [ -n $PRECISION ] && [ "$PRECISION" = "--int8" ]; then
ACCTYPE="-C 2"
fi
WTYPE=
if [ $# -ge 1 ] ; then
NREPEAT=$1
else
NREPEAT=1
fi
driver="./bin/ckProfiler"
VERIFY="-v $1"
INIT=$2
NREPEAT=$3
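# Expected positional arguments (a sketch; the script name is illustrative):
#   <script> <verify 0|1> <init method> <repeat count>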
Operation=7
#### 0 - ADD, 5 - AVG, 7 - NORM2
Operations="0 5 7"
#### 0 - ADD, 5 - AVG, for int8, no NORM2 supported
if [ -n "$PRECISION" ] && [ "$PRECISION" = "--int8" ]; then
Operations=5
fi
## for generic validation
for op in $Operation; do
for op in $Operations; do
set -x
./bin/ckProfiler reduce $PRECISION -D 64,4,280,82 -R 0 -O $op $CTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 4,64,280,82 -R 0 -O $op $CTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 280,4,64,82 -R 0 -O $op $CTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 64,4,280,82 -R 0,1,2 -O $op $CTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 4,64,280,82 -R 0,1,2 -O $op $CTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 64,280,82,4 -R 0,1,2 -O $op $CTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 700,8192 -R 1 -O $op $CTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 700,1024 -R 1 -O $op $CTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 700,4 -R 1 -O $op $CTYPE -v 1 1 $NREPEAT
####### datatype layout reduce dims op acctype verify init repeats
$driver reduce $PRECISION -D 64,4,280,82 -R 0,1,2,3 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 64,4,280,82 -R 0 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 64,4,280,82 -R 1 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 64,4,280,82 -R 2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 64,4,280,82 -R 3 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 64,4,280,82 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 64,4,280,82 -R 1,2,3 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 64,4,280,82 -R 0,2,3 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 64,4,280,82 -R 0,1,3 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,22960 -R 0 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,22960 -R 1 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 4,1469440 -R 0 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 4,1469440 -R 1 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
set +x
done
Operation=5
#### 0 - ADD, 5 - AVG, 7 - NORM2
Operations=5
## for performance evaluation (resnet50 NHWC => C)
for op in $Operation; do
for op in $Operations; do
set -x
./bin/ckProfiler reduce $PRECISION -D 256,14,14,1024 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,28,28,128 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,58,58,128 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,7,7,2048 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,14,14,256 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,30,30,256 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,56,56,256 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,16,16,512 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,28,28,512 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,7,7,512 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,56,56,64 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,230,230,3 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,14,14,1024 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,28,28,128 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,58,58,128 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,7,7,2048 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,14,14,256 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,30,30,256 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,56,56,256 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,16,16,512 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,28,28,512 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,7,7,512 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,56,56,64 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT
####### datatype layout reduce dims op acctype verify init repeats
$driver reduce $PRECISION -D 256,14,14,1024 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,28,28,128 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,58,58,128 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,7,7,2048 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,14,14,256 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,30,30,256 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,56,56,256 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,16,16,512 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,28,28,512 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,7,7,512 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,56,56,64 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,230,230,3 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,14,14,1024 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,28,28,128 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,58,58,128 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,7,7,2048 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,14,14,256 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,30,30,256 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,56,56,256 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,16,16,512 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,28,28,512 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,7,7,512 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,56,56,64 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
set +x
done
#!/bin/bash
PRECISION= ##--half
PRECISION=
##PRECISION=--half
##PRECISION=--double
##PRECISION=--int8
##PRECISION=--bf16
if [ $# -ge 1 ] ; then
NREPEAT=$1
else
NREPEAT=1
fi
driver="./bin/ckProfiler"
Operation=4
VERIFY="-v $1"
INIT=$2
NREPEAT=$3
LENGTHS=64,4,280,82
#### 2 - MIN, 3 - MAX, 4 - AMAX
Operations="2 4"
## for generic validation
for op in $Operation; do
for op in $Operations; do
for use_idx in 0 1; do
set -x
./bin/ckProfiler reduce $PRECISION -D 64,4,280,82 -R 0 -O $op $CTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 4,64,280,82 -R 0 -O $op $CTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 280,4,64,82 -R 0 -O $op $CTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 64,4,280,82 -R 0,1,2 -O $op $CTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 4,64,280,82 -R 0,1,2 -O $op $CTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 64,280,82,4 -R 0,1,2 -O $op $CTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 700,8192 -R 1 -O $op $CTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 700,1024 -R 1 -O $op $CTYPE -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 700,4 -R 1 -O $op $CTYPE -v 1 1 $NREPEAT
####### datatype layout reduce dims op use index verify init repeats
$driver reduce $PRECISION -D 64,4,280,82 -R 0,1,2,3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 64,4,280,82 -R 0 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 64,4,280,82 -R 1 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 64,4,280,82 -R 2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 64,4,280,82 -R 3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 64,4,280,82 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 64,4,280,82 -R 1,2,3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 64,4,280,82 -R 0,2,3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 64,4,280,82 -R 0,1,3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,22960 -R 0 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,22960 -R 1 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 4,1469440 -R 0 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 4,1469440 -R 1 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
set +x
done
done
Operations=2
## for performance evaluation (resnet50 NHWC => C)
for op in $Operation; do
for op in $Operations; do
for use_idx in 0 1; do
set -x
./bin/ckProfiler reduce $PRECISION -D 256,14,14,1024 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,28,28,128 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,58,58,128 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,7,7,2048 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,14,14,256 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,30,30,256 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,56,56,256 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,16,16,512 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,28,28,512 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,7,7,512 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,56,56,64 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,230,230,3 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,14,14,1024 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,28,28,128 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,58,58,128 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,7,7,2048 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,14,14,256 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,30,30,256 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,56,56,256 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,16,16,512 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,28,28,512 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,7,7,512 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,56,56,64 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT
####### datatype layout reduce dims op use index verify init repeats
$driver reduce $PRECISION -D 256,14,14,1024 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,28,28,128 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,58,58,128 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,7,7,2048 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,14,14,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,30,30,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,56,56,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,16,16,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,28,28,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,7,7,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,56,56,64 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,230,230,3 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,14,14,1024 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,28,28,128 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,58,58,128 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,7,7,2048 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,14,14,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,30,30,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,56,56,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,16,16,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,28,28,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,7,7,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,56,56,64 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
set +x
done
done
......
#!/usr/bin/env bash
# set -e
DIM1=False
DIM2=True
DIM3=False
DATE=220317
GIT_HASH=4e6dfda
LOG_DIR=${DATE}_${GIT_HASH}
SUFFIX=${GIT_HASH}
#--------------------------------------------------------------------------
# Commandline arguments parsing
# like: cmd -key[--key] value
#--------------------------------------------------------------------------
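# Example (script name is hypothetical):
#   ./run_conv_fwd_tests.sh -d1 -d3 -s trial1
# runs the 1D and 3D tests and appends "_trial1" to the log-file suffix.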
POSITIONAL=()
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
-d1|--d1)
DIM1=True
echo DIM1: "${DIM1}"
shift # past argument
;;
-d2|--d2)
DIM2=True
echo DIM2: "${DIM2}"
shift # past argument
;;
-d3|--d3)
DIM3=True
echo DIM3: "${DIM3}"
shift # past argument
;;
-all|--all)
DIM1=True
DIM2=True
DIM3=True
echo DIM1: "${DIM1}"
echo DIM2: "${DIM2}"
echo DIM3: "${DIM3}"
shift # past argument
;;
-s|--suffix)
SUFFIX=${SUFFIX}_"$2"
echo SUFFIX: "${SUFFIX}"
shift # past argument
shift # past value
;;
*) # unknown option
POSITIONAL+=("$1") # save it in an array for later
shift # past argument
;;
esac
done
set -- "${POSITIONAL[@]}" # restore positional parameters
#--------------------------------------------------------------------------
# NUMACTL="numactl --cpunodebind=1 --membind=1"
NUMACTL=
# ENV_CONF=
GPU=mi100
PROF_ITER_COUNT=10000
LOG_DIR_PATH=../log/${LOG_DIR}
set -x
#-------------------------------------------------------------------------------
# 1D
#-------------------------------------------------------------------------------
if [[ "${DIM1}" == "True" ]]; then
mkdir -p ${LOG_DIR_PATH}
echo ">>>>>>>> RUN test conv1d nwc <<<<<<<<<<"
CMD="./../build/bin/test_conv1d_fwd"
${NUMACTL} ${CMD} 2>&1 \
| tee ${LOG_DIR_PATH}/test_conv1d_fwd_nwc_${SUFFIX}_${GPU}.log
fi
#-------------------------------------------------------------------------------
# 2D
#-------------------------------------------------------------------------------
if [[ "${DIM2}" == "True" ]]; then
mkdir -p ${LOG_DIR_PATH}
echo ">>>>>>>> RUN test conv2d nhwc <<<<<<<<<<"
CMD="./../build/bin/test_conv2d_fwd"
${NUMACTL} ${CMD} 2>&1 \
| tee ${LOG_DIR_PATH}/test_conv2d_fwd_nhwc_${SUFFIX}_${GPU}.log
fi
#-------------------------------------------------------------------------------
# 3D
#-------------------------------------------------------------------------------
if [[ "${DIM3}" == "True" ]]; then
mkdir -p ${LOG_DIR_PATH}
echo ">>>>>>>> RUN test conv3d ndhwc <<<<<<<<<<"
CMD="./../build/bin/test_conv3d_fwd"
${NUMACTL} ${CMD} 2>&1 \
| tee ${LOG_DIR_PATH}/test_conv3d_fwd_ndhwc_${SUFFIX}_${GPU}.log
fi
#!/bin/bash
## The following will be used for CI
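## Trailing positional args: <data-type code> <init method>; the codes used
## below follow the per-type comments (0: float, 1: fp16, 3: int8, 5: bf16).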
set -x
## for float
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 0 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 0 2
## for float16
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 1 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 1 2
## for int8_t
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 3 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 3 2
## for bfloat16
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 0 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 1 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 2 5 2
bin/test_reduce_no_index -D 64,4,280,82 -R 3 5 2
set +x
#!/bin/bash
## The following will be used for CI
set -x
## for float
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 0 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 0 2
## for float16
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 1 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 1 2
## for int8_t
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 3 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 3 2
## for bfloat16
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 0 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 1 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 2 5 2
bin/test_reduce_with_index -D 64,4,280,82 -R 3 5 2
set +x
@@ -15,7 +15,9 @@ include_directories(BEFORE
${PROJECT_SOURCE_DIR}/library/include/ck/library/tensor_operation_instance/gpu/reduce
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu
${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/gpu
${PROJECT_SOURCE_DIR}/library/include/ck/library/utility
${PROJECT_SOURCE_DIR}/test/include
${PROJECT_SOURCE_DIR}/profiler/include
${PROJECT_SOURCE_DIR}/external/include/half
)
@@ -37,6 +39,10 @@ add_subdirectory(conv_util)
add_subdirectory(reference_conv_fwd)
add_subdirectory(gemm)
add_subdirectory(gemm_split_k)
add_subdirectory(conv2d_fwd)
add_subdirectory(gemm_reduce)
add_subdirectory(batched_gemm)
add_subdirectory(batched_gemm_reduce)
add_subdirectory(grouped_gemm)
add_subdirectory(convnd_fwd)
add_subdirectory(conv2d_bwd_data)
add_subdirectory(reduce)
add_subdirectory(conv2d_bwd_weight)
add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp)
target_link_libraries(test_batched_gemm_fp16 PRIVATE host_tensor)
target_link_libraries(test_batched_gemm_fp16 PRIVATE device_batched_gemm_instance)
#include <iostream>
#include "profile_batched_gemm_impl.hpp"
namespace {
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using CDataType = ck::half_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
} // namespace
int main()
{
int M = 512;
int N = 256;
int K = 128;
int BatchCount = 3;
bool pass = true;
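// Each call below passes the packed strides for its layout combination:
// row-major A uses StrideA = K, column-major A uses M; row-major B uses
// StrideB = N, column-major B uses K; row-major C always uses StrideC = N.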
pass = pass &&
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
true, 1, false, 1, M, N, K, K, N, N, BatchCount);
pass = pass &&
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
true, 1, false, 1, M, N, K, K, K, N, BatchCount);
pass = pass &&
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
true, 1, false, 1, M, N, K, M, N, N, BatchCount);
pass = pass &&
ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
true, 1, false, 1, M, N, K, M, K, N, BatchCount);
std::cout << "test BatchedGEMM fp16: " << (pass ? "Pass" : "Fail") << std::endl;
return pass ? 0 : 1;
}
#ifndef BATCHED_GEMM_UTILS_HPP
#define BATCHED_GEMM_UTILS_HPP
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
namespace ck {
namespace batched_gemm_util {
struct GemmParams
{
GemmParams()
: M(1024), N(1024), K(1024), StrideA(1024), StrideB(1024), StrideC(1024), alpha(1), beta(0)
{
}
ck::index_t M;
ck::index_t N;
ck::index_t K;
ck::index_t StrideA;
ck::index_t StrideB;
ck::index_t StrideC;
float alpha;
float beta;
};
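// A short usage sketch (values illustrative): override the 1024 defaults
// with packed strides for a row-major 512 x 256 x 128 problem.
//   ck::batched_gemm_util::GemmParams params;
//   params.M = 512; params.N = 256; params.K = 128;
//   params.StrideA = 128; params.StrideB = 256; params.StrideC = 256;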
template <typename BatchedGemmInstance,
typename ADataType,
typename BDataType,
typename CDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
void RunHostBatchedGemm(const Tensor<ADataType>& A,
const Tensor<BDataType>& B,
Tensor<CDataType>& C,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op)
{
auto ref_batched_gemm = BatchedGemmInstance{};
auto ref_invoker = ref_batched_gemm.MakeInvoker();
auto ref_argument =
ref_batched_gemm.MakeArgument(A, B, C, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
}
template <typename DeviceGemmPtr,
typename ADataType,
typename BDataType,
typename CDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
void RunDeviceBatchedGemm(DeviceGemmPtr& batched_gemm_ptr,
const ck::batched_gemm_util::GemmParams& params,
const Tensor<ADataType>& A,
const Tensor<BDataType>& B,
Tensor<CDataType>& C,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op)
{
DeviceMem a_g_m_k_device_buf(sizeof(ADataType) * A.mDesc.GetElementSpace());
DeviceMem b_g_k_n_device_buf(sizeof(BDataType) * B.mDesc.GetElementSpace());
DeviceMem c_g_m_n_device_buf(sizeof(CDataType) * C.mDesc.GetElementSpace());
a_g_m_k_device_buf.ToDevice(A.mData.data());
b_g_k_n_device_buf.ToDevice(B.mData.data());
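// The batch count is carried in the leading (G) dimension of the tensor descriptors.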
const auto batch_count = A.mDesc.GetLengths()[0];
auto invoker_ptr = batched_gemm_ptr->MakeInvokerPointer();
auto argument_ptr = batched_gemm_ptr->MakeArgumentPointer(
static_cast<ADataType*>(a_g_m_k_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_g_k_n_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_g_m_n_device_buf.GetDeviceBuffer()),
params.M,
params.N,
params.K,
params.StrideA,
params.StrideB,
params.StrideC,
a_element_op,
b_element_op,
c_element_op,
batch_count);
if(!batched_gemm_ptr->IsSupportedArgument(argument_ptr.get()))
{
throw std::runtime_error(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem");
}
invoker_ptr->Run(argument_ptr.get());
c_g_m_n_device_buf.FromDevice(C.mData.data());
}
} // namespace batched_gemm_util
} // namespace ck
#endif // BATCHED_GEMM_UTILS_HPP
include_directories(BEFORE
${PROJECT_SOURCE_DIR}/profiler/include
${PROJECT_SOURCE_DIR}/test/include
${PROJECT_SOURCE_DIR}/external/include/half
)
add_test_executable(test_batched_gemm_reduce_fp16 batched_gemm_reduce_fp16.cpp)
target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE host_tensor)
target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE device_batched_gemm_reduce_instance)
#include <iostream>
#include "profile_batched_gemm_reduce_impl.hpp"
int main()
{
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
int M = 512;
int N = 256;
int K = 128;
int BatchCount = 3;
bool pass = true;
pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
ck::half_t,
ck::half_t,
float,
Row,
Row,
Row>(
true, 1, false, 1, M, N, K, K, N, N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
ck::half_t,
ck::half_t,
float,
Row,
Col,
Row>(
true, 1, false, 1, M, N, K, K, K, N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
ck::half_t,
ck::half_t,
float,
Col,
Row,
Row>(
true, 1, false, 1, M, N, K, M, N, N, BatchCount);
pass = pass && ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
ck::half_t,
ck::half_t,
float,
Col,
Col,
Row>(
true, 1, false, 1, M, N, K, M, K, N, BatchCount);
if(pass)
{
std::cout << "test BatchedGEMM+Reduce fp16: Pass" << std::endl;
return 0;
}
else
{
std::cout << "test BatchedGEMM+Reduce fp16: Fail" << std::endl;
return -1;
}
}
@@ -121,15 +121,17 @@ int main(int argc, char* argv[])
exit(1);
}
auto Run = [&](auto input_type, auto wei_type, auto out_type) {
auto Run = [&](auto input_type, auto wei_type, auto out_type, auto acc_type) {
using InDataType = decltype(input_type);
using WeiDataType = decltype(wei_type);
using OutDataType = decltype(out_type);
using AccDataType = decltype(acc_type);
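// AccDataType is now threaded into the reference op so the host reference accumulates
// in the same precision as the device kernel (fp32 for fp16/bf16 inputs, int for int8).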
using ReferenceConvBwdInstance =
ck::tensor_operation::host::ReferenceConvBwdData<InDataType,
WeiDataType,
OutDataType,
AccDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
@@ -182,8 +184,8 @@ int main(int argc, char* argv[])
out_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
// reset input to zero
in_n_c_hi_wi_device_result.GenerateTensorValue(GeneratorTensor_1<InDataType>{0});
in_device_buf.ToDevice(in_n_c_hi_wi_device_result.mData.data());
// get host result
@@ -225,9 +227,9 @@ int main(int argc, char* argv[])
ck::tensor_operation::device::device_conv2d_bwd_data_instance::
add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
}
else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::bhalf_t> &&
ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::bhalf_t> &&
ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::bhalf_t>)
{
ck::tensor_operation::device::device_conv2d_bwd_data_instance::
add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
@@ -293,33 +295,33 @@ int main(int argc, char* argv[])
if(success)
{
std::cout << "test conv2d bwd : Pass" << std::endl;
return 0;
}
else
{
std::cout << "test conv2d bwd: Fail " << std::endl;
return -1;
}
};
if(data_type == 0)
{
return Run(F32(), F32(), F32(), F32());
}
else if(data_type == 1)
{
return Run(F16(), F16(), F16(), F32());
}
else if(data_type == 2)
{
return Run(BF16(), BF16(), BF16(), F32());
}
else if(data_type == 3)
{
return Run(INT8(), INT8(), INT8(), int());
}
else
{
return 1;
}
return 0;
}
include_directories(BEFORE
${PROJECT_SOURCE_DIR}/profiler/include
${PROJECT_SOURCE_DIR}/external/include/half
)
add_test_executable(test_conv2d_bwd_weight conv2d_bwd_weight.cpp)
target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor)
target_link_libraries(test_conv2d_bwd_weight PRIVATE device_conv2d_bwd_weight_instance)
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include <vector>
#include "conv_fwd_util.hpp"
#include "profile_conv_bwd_weight_impl.hpp"
bool test_self()
{
bool pass = true;
std::vector<ck::utils::conv::ConvParams> params;
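// ConvParams initializer order: {spatial dims, N, K, C, {Y, X}, {Hi, Wi},
// strides, dilations, left pads, right pads} (cf. the named construction in main below).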
params.push_back({2, 128, 256, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
params.push_back({2, 128, 256, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
params.push_back({2, 128, 256, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
for(auto& param : params)
{
// f32
pass &= ck::profiler::profile_conv_bwd_weight_impl<2,
float,
float,
float,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
1, // do_verification,
1, // init_method,
0, // do_log,
1, // nrepeat,
param.N,
param.K,
param.C,
param.input_spatial_lengths,
param.filter_spatial_lengths,
param.GetOutputSpatialLengths(),
param.conv_filter_strides,
param.conv_filter_dilations,
param.input_left_pads,
param.input_right_pads,
2); // split_k
// fp16
pass &= ck::profiler::profile_conv_bwd_weight_impl<2,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
1, // do_verification,
1, // init_method,
0, // do_log,
1, // nrepeat,
param.N,
param.K,
param.C,
param.input_spatial_lengths,
param.filter_spatial_lengths,
param.GetOutputSpatialLengths(),
param.conv_filter_strides,
param.conv_filter_dilations,
param.input_left_pads,
param.input_right_pads,
2); // split_k
}
return pass;
}
int main(int argc, char* argv[])
{
int data_type = 0;
int init_method = 0;
// Conv shape
ck::index_t N = 128;
ck::index_t K = 256;
ck::index_t C = 192;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 71;
ck::index_t Wi = 71;
ck::index_t conv_stride_h = 2;
ck::index_t conv_stride_w = 2;
ck::index_t conv_dilation_h = 1;
ck::index_t conv_dilation_w = 1;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
ck::index_t split_k = 1;
bool pass = true;
if(argc == 1)
{
pass = test_self();
}
else
{
if(argc == 3)
{
data_type = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
}
else if(argc == 19)
{
data_type = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
N = std::stoi(argv[3]);
K = std::stoi(argv[4]);
C = std::stoi(argv[5]);
Y = std::stoi(argv[6]);
X = std::stoi(argv[7]);
Hi = std::stoi(argv[8]);
Wi = std::stoi(argv[9]);
conv_stride_h = std::stoi(argv[10]);
conv_stride_w = std::stoi(argv[11]);
conv_dilation_h = std::stoi(argv[12]);
conv_dilation_w = std::stoi(argv[13]);
in_left_pad_h = std::stoi(argv[14]);
in_left_pad_w = std::stoi(argv[15]);
in_right_pad_h = std::stoi(argv[16]);
in_right_pad_w = std::stoi(argv[17]);
split_k = std::stoi(argv[18]);
}
else
{
printf("arg1: data type (0=fp32, 1=fp16, 2= bfp16, 3= int8_t )\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3 to 17: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
}
ck::utils::conv::ConvParams param{2,
N,
K,
C,
{Y, X},
{Hi, Wi},
{conv_stride_h, conv_stride_w},
{conv_dilation_h, conv_dilation_w},
{in_left_pad_h, in_left_pad_w},
{in_right_pad_h, in_right_pad_w}};
if(data_type == 0)
{
pass = ck::profiler::profile_conv_bwd_weight_impl<2,
float,
float,
float,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
1,
init_method,
0,
1,
param.N,
param.K,
param.C,
param.input_spatial_lengths,
param.filter_spatial_lengths,
param.GetOutputSpatialLengths(),
param.conv_filter_strides,
param.conv_filter_dilations,
param.input_left_pads,
param.input_right_pads,
split_k);
}
else if(data_type == 1)
{
pass = ck::profiler::profile_conv_bwd_weight_impl<2,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
1,
init_method,
0,
1,
param.N,
param.K,
param.C,
param.input_spatial_lengths,
param.filter_spatial_lengths,
param.GetOutputSpatialLengths(),
param.conv_filter_strides,
param.conv_filter_dilations,
param.input_left_pads,
param.input_right_pads,
split_k);
}
else
{
std::cout << "Not support data type" << std::endl;
return 1;
}
}
if(pass)
{
std::cout << "test conv2d bwd weight : Pass" << std::endl;
return 0;
}
else
{
std::cout << "test conv2d bwd weight: Fail " << std::endl;
return -1;
}
}