"driver/conv_driver.cpp" did not exist on "bf975428460a27b46912d1c4293b407febb92de0"
Commit b134b7d6 authored by carlushuang

Merge remote-tracking branch 'origin/develop' into cpu_avx2

parents 090ba885 9f71ff48
@@ -5,7 +5,7 @@
#include "check_err.hpp"
#include "config.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
#include "device.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
@@ -112,7 +112,7 @@ void print_use_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-<< "arg3: run kernel # of times (>1)\n"
+<< "arg3: time kernel (0=n0, 1=yes)\n"
<< "arg4: N spatial dimensions (default 2)\n"
<< "Following arguments (depending on number of spatial dims):\n"
<< " N, K, C, \n"
@@ -139,40 +139,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, cha
ck::utils::conv::ConvParams params;
int arg_idx = 5;
-params.num_dim_spatial = num_dim_spatial;
+params.num_dim_spatial_ = num_dim_spatial;
-params.N = std::stoi(argv[arg_idx++]);
+params.N_ = std::stoi(argv[arg_idx++]);
-params.K = std::stoi(argv[arg_idx++]);
+params.K_ = std::stoi(argv[arg_idx++]);
-params.C = std::stoi(argv[arg_idx++]);
+params.C_ = std::stoi(argv[arg_idx++]);
-params.filter_spatial_lengths.resize(num_dim_spatial);
+params.filter_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
-params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
-params.input_spatial_lengths.resize(num_dim_spatial);
+params.input_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
-params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
-params.conv_filter_strides.resize(num_dim_spatial);
+params.conv_filter_strides_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
-params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
}
-params.conv_filter_dilations.resize(num_dim_spatial);
+params.conv_filter_dilations_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
-params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
}
-params.input_left_pads.resize(num_dim_spatial);
+params.input_left_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
-params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
+params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
}
-params.input_right_pads.resize(num_dim_spatial);
+params.input_right_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
-params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
}
return params;
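For readers skimming the diff, parse_conv_params reads the positional arguments in a fixed order: N, K, C, then num_dim_spatial values each for the filter lengths, input lengths, strides, dilations, left pads and right pads. Below is a self-contained sketch of that same walk for the 2-D case; the argument values are made up and the helper is hypothetical, not part of the library.

#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

int main()
{
    const int num_dim_spatial = 2;
    // Hypothetical 2-D argument list in the order parse_conv_params consumes it:
    // N K C  Y X  Hi Wi  Sy Sx  Dy Dx  LeftPy LeftPx  RightPy RightPx
    std::vector<std::string> args{"128", "256", "192", "3", "3", "71", "71", "2", "2",
                                  "1", "1", "1", "1", "1", "1"};

    std::size_t idx = 0;
    auto next = [&] { return std::stoi(args.at(idx++)); };

    const int N = next();
    const int K = next();
    const int C = next();
    std::vector<int> filter(num_dim_spatial), input(num_dim_spatial), strides(num_dim_spatial),
        dilations(num_dim_spatial), left_pads(num_dim_spatial), right_pads(num_dim_spatial);
    for(auto* field : {&filter, &input, &strides, &dilations, &left_pads, &right_pads})
        for(int& v : *field)
            v = next(); // same consumption order as the driver's arg_idx walk

    std::printf("N=%d K=%d C=%d filter=%dx%d input=%dx%d\n",
                N, K, C, filter[0], filter[1], input[0], input[1]);
    return 0;
}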
@@ -184,9 +184,9 @@ int main(int argc, char* argv[])
{
using namespace ck::utils::conv;
-bool do_verification = 0;
+bool do_verification = true;
-int init_method = 0;
+int init_method = 1;
-int nrepeat = 5;
+bool time_kernel = false;
int num_dim_spatial = 2;
ck::utils::conv::ConvParams params;
@@ -195,7 +195,7 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
-nrepeat = std::stoi(argv[3]);
+time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]);
}
@@ -204,21 +204,21 @@ int main(int argc, char* argv[])
params = parse_conv_params(num_dim_spatial, argc, argv);
}
-std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
+std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
-static_cast<std::size_t>(params.C)};
+static_cast<std::size_t>(params.C_)};
input_dims.insert(std::end(input_dims),
-std::begin(params.input_spatial_lengths),
+std::begin(params.input_spatial_lengths_),
-std::end(params.input_spatial_lengths));
+std::end(params.input_spatial_lengths_));
-std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
+std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
-static_cast<std::size_t>(params.C)};
+static_cast<std::size_t>(params.C_)};
filter_dims.insert(std::end(filter_dims),
-std::begin(params.filter_spatial_lengths),
+std::begin(params.filter_spatial_lengths_),
-std::end(params.filter_spatial_lengths));
+std::end(params.filter_spatial_lengths_));
const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
+std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
-static_cast<std::size_t>(params.K)};
+static_cast<std::size_t>(params.K_)};
output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
@@ -258,16 +258,16 @@ int main(int argc, char* argv[])
conv->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-params.N,
+params.N_,
-params.K,
+params.K_,
-params.C,
+params.C_,
-params.input_spatial_lengths,
+params.input_spatial_lengths_,
-params.filter_spatial_lengths,
+params.filter_spatial_lengths_,
output_spatial_lengths,
-params.conv_filter_strides,
+params.conv_filter_strides_,
-params.conv_filter_dilations,
+params.conv_filter_dilations_,
-params.input_left_pads,
+params.input_left_pads_,
-params.input_right_pads,
+params.input_right_pads_,
InElementOp{},
WeiElementOp{},
OutElementOp{});
@@ -279,16 +279,16 @@ int main(int argc, char* argv[])
"not support this Conv problem");
}
-float ave_time = invoker->Run(argument.get(), nrepeat);
+float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = get_flops(
-params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
+params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
std::size_t num_btype = get_btype<InDataType, WeiDataType, OutDataType>(
-params.N,
+params.N_,
-params.C,
+params.C_,
-params.K,
+params.K_,
-params.input_spatial_lengths,
+params.input_spatial_lengths_,
-params.filter_spatial_lengths,
+params.filter_spatial_lengths_,
output_spatial_lengths);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
@@ -304,18 +304,18 @@ int main(int argc, char* argv[])
auto ref_argument = ref_conv.MakeArgument(input,
weights,
host_output,
-params.conv_filter_strides,
+params.conv_filter_strides_,
-params.conv_filter_dilations,
+params.conv_filter_dilations_,
-params.input_left_pads,
+params.input_left_pads_,
-params.input_right_pads,
+params.input_right_pads_,
InElementOp{},
WeiElementOp{},
OutElementOp{});
ref_invoker.Run(ref_argument);
out_device_buf.FromDevice(device_output.mData.data());
-ck::utils::check_err(
+return ck::utils::check_err(
-host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
+host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f) ? 0 : 1;
};
switch(num_dim_spatial)
@@ -340,4 +340,5 @@ int main(int argc, char* argv[])
}
}
}
+return 0;
}
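The recurring change in these drivers is that the third command-line argument no longer sets a repeat count; it toggles timing, and the invoker now receives a StreamConfig instead of nrepeat. The sketch below only illustrates the calling convention with stand-in types; the real StreamConfig and invoker come from the composable_kernel headers, and the field meanings are assumptions.

#include <cstdio>

// Stand-ins for the CK types; only the calling convention is illustrated here.
struct StreamConfig
{
    void* stream_id_;  // assumed: stream handle, nullptr meaning the default stream
    bool time_kernel_; // assumed: when true, the invoker times the kernel launch
};

struct Invoker
{
    // Pretend run: returns an average time in ms when timing is requested, 0 otherwise.
    float Run(const StreamConfig& cfg) const { return cfg.time_kernel_ ? 0.123f : 0.0f; }
};

int main()
{
    bool time_kernel = true; // replaces the old "int nrepeat = 5;" knob
    Invoker invoker;
    float ave_time = invoker.Run(StreamConfig{nullptr, time_kernel});
    std::printf("ave_time = %.3f ms\n", ave_time);
    return 0;
}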
add_example_executable(example_conv2d_bwd_data_xdl conv2d_bwd_data_xdl.cpp)
-target_link_libraries(example_conv2d_bwd_data_xdl PRIVATE conv_fwd_util)
+target_link_libraries(example_conv2d_bwd_data_xdl PRIVATE conv_util)
...@@ -77,9 +77,9 @@ using ReferenceConvBwdInstance = ck::tensor_operation::host::ReferenceConvBwdDat ...@@ -77,9 +77,9 @@ using ReferenceConvBwdInstance = ck::tensor_operation::host::ReferenceConvBwdDat
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
bool do_verification = 0; bool do_verification = true;
int init_method = 0; int init_method = 1;
int nrepeat = 5; bool time_kernel = false;
// Conv shape // Conv shape
ck::index_t N = 128; ck::index_t N = 128;
...@@ -102,13 +102,13 @@ int main(int argc, char* argv[]) ...@@ -102,13 +102,13 @@ int main(int argc, char* argv[])
{ {
do_verification = std::stoi(argv[1]); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]); time_kernel = std::stoi(argv[3]);
} }
else if(argc == 19) else if(argc == 19)
{ {
do_verification = std::stoi(argv[1]); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]); time_kernel = std::stoi(argv[3]);
N = std::stoi(argv[4]); N = std::stoi(argv[4]);
K = std::stoi(argv[5]); K = std::stoi(argv[5]);
...@@ -130,7 +130,7 @@ int main(int argc, char* argv[]) ...@@ -130,7 +130,7 @@ int main(int argc, char* argv[])
{ {
printf("arg1: verification (0=no, 1=yes)\n"); printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: run kernel # of times (>1)\n"); printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n"); "RightPx\n");
exit(0); exit(0);
...@@ -214,7 +214,7 @@ int main(int argc, char* argv[]) ...@@ -214,7 +214,7 @@ int main(int argc, char* argv[])
"not support this Conv problem"); "not support this Conv problem");
} }
float ave_time = invoker.Run(argument, nrepeat); float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
...@@ -249,6 +249,10 @@ int main(int argc, char* argv[]) ...@@ -249,6 +249,10 @@ int main(int argc, char* argv[])
in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data()); in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data());
ck::utils::check_err(in_n_c_hi_wi_device_result.mData, in_n_c_hi_wi_host_result.mData); return ck::utils::check_err(in_n_c_hi_wi_device_result.mData,
in_n_c_hi_wi_host_result.mData)
? 0
: 1;
} }
return 0;
} }
add_example_executable(example_conv2d_bwd_weight_xdl conv2d_bwd_weight_xdl.cpp)
-target_link_libraries(example_conv2d_bwd_weight_xdl PRIVATE conv_fwd_util)
+target_link_libraries(example_conv2d_bwd_weight_xdl PRIVATE conv_util)
...@@ -82,9 +82,9 @@ using ReferenceConvBwdWeightInstance = ...@@ -82,9 +82,9 @@ using ReferenceConvBwdWeightInstance =
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
bool do_verification = 0; bool do_verification = true;
int init_method = 0; int init_method = 1;
int nrepeat = 5; bool time_kernel = false;
int do_log = 0; int do_log = 0;
int split_k = 4; int split_k = 4;
...@@ -109,7 +109,7 @@ int main(int argc, char* argv[]) ...@@ -109,7 +109,7 @@ int main(int argc, char* argv[])
{ {
do_verification = std::stoi(argv[1]); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]); time_kernel = std::stoi(argv[3]);
do_log = std::stoi(argv[4]); do_log = std::stoi(argv[4]);
split_k = std::stoi(argv[5]); split_k = std::stoi(argv[5]);
} }
...@@ -117,7 +117,7 @@ int main(int argc, char* argv[]) ...@@ -117,7 +117,7 @@ int main(int argc, char* argv[])
{ {
do_verification = std::stoi(argv[1]); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]); time_kernel = std::stoi(argv[3]);
do_log = std::stoi(argv[4]); do_log = std::stoi(argv[4]);
split_k = std::stoi(argv[5]); split_k = std::stoi(argv[5]);
...@@ -141,7 +141,7 @@ int main(int argc, char* argv[]) ...@@ -141,7 +141,7 @@ int main(int argc, char* argv[])
{ {
printf("arg1: verification (0=no, 1=yes)\n"); printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: run kernel # of times (>1)\n"); printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4: is show log (0=no, 1=yes)\n"); printf("arg4: is show log (0=no, 1=yes)\n");
printf("arg5: split-k \n"); printf("arg5: split-k \n");
printf("arg6 to 19: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " printf("arg6 to 19: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
...@@ -246,7 +246,7 @@ int main(int argc, char* argv[]) ...@@ -246,7 +246,7 @@ int main(int argc, char* argv[])
return 1; return 1;
} }
float ave_time = invoker.Run(argument, nrepeat); float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
...@@ -291,6 +291,9 @@ int main(int argc, char* argv[]) ...@@ -291,6 +291,9 @@ int main(int argc, char* argv[])
LogRangeAsType<float>(std::cout << "wei_host : ", wei_k_c_y_x_host_result.mData, ",") LogRangeAsType<float>(std::cout << "wei_host : ", wei_k_c_y_x_host_result.mData, ",")
<< std::endl; << std::endl;
} }
ck::utils::check_err(wei_k_c_y_x_device_result.mData, wei_k_c_y_x_host_result.mData); return ck::utils::check_err(wei_k_c_y_x_device_result.mData, wei_k_c_y_x_host_result.mData)
? 0
: 1;
} }
return 0;
} }
-add_example_executable(example_reduce_blockwise reduce_blockwise.cpp)
+add_example_executable(example_reduce_blockwise reduce_blockwise.cpp -D 16,64,32,960 -v 1 1 10)
...@@ -116,10 +116,9 @@ class SimpleAppArgs ...@@ -116,10 +116,9 @@ class SimpleAppArgs
std::vector<size_t> inLengths; std::vector<size_t> inLengths;
std::vector<float> scales; std::vector<float> scales;
bool do_verification = false; bool do_verification = true;
int init_method = 1; int init_method = 1;
int nrepeat = 5; bool time_kernel = false;
public: public:
void show_usage(const char* cmd) void show_usage(const char* cmd)
...@@ -135,12 +134,12 @@ class SimpleAppArgs ...@@ -135,12 +134,12 @@ class SimpleAppArgs
std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer " std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer "
"value, 3=decimal value)" "value, 3=decimal value)"
<< std::endl; << std::endl;
std::cout << "Arg2 -- number of repeats to run the kernel" << std::endl; std::cout << "Arg2 -- time kernel (0=n0, 1=yes)" << std::endl;
}; };
int processArgs(int argc, char* argv[]) int processArgs(int argc, char* argv[])
{ {
unsigned int ch; int ch;
while(1) while(1)
{ {
...@@ -182,7 +181,7 @@ class SimpleAppArgs ...@@ -182,7 +181,7 @@ class SimpleAppArgs
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
init_method = std::atoi(argv[optind++]); init_method = std::atoi(argv[optind++]);
nrepeat = std::atoi(argv[optind]); time_kernel = std::atoi(argv[optind]);
if(scales.empty()) if(scales.empty())
{ {
...@@ -352,7 +351,7 @@ int main(int argc, char* argv[]) ...@@ -352,7 +351,7 @@ int main(int argc, char* argv[])
auto invoker_ptr = reduce.MakeInvokerPointer(); auto invoker_ptr = reduce.MakeInvokerPointer();
float avg_time = invoker_ptr->Run(argument_ptr.get(), args.nrepeat); float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, args.time_kernel});
std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) + std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) +
invariant_total_length * sizeof(OutDataType); invariant_total_length * sizeof(OutDataType);
...@@ -362,16 +361,17 @@ int main(int argc, char* argv[]) ...@@ -362,16 +361,17 @@ int main(int argc, char* argv[])
std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name
<< std::endl; << std::endl;
bool pass = true;
if(args.do_verification) if(args.do_verification)
{ {
out_dev.FromDevice(out.mData.data()); out_dev.FromDevice(out.mData.data());
ck::utils::check_err(out.mData, out_ref.mData); pass &= ck::utils::check_err(out.mData, out_ref.mData);
if(NeedIndices) if(NeedIndices)
{ {
out_indices_dev.FromDevice(out_indices.mData.data()); out_indices_dev.FromDevice(out_indices.mData.data());
ck::utils::check_err(out_indices.mData, out_indices_ref.mData); pass &= ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
;
}; };
}; };
return pass ? 0 : 1;
} }
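Another pattern this commit applies across the examples: every verification result is folded into a single pass flag, which is then turned into the process exit code. A minimal standalone illustration follows; check_close is a made-up stand-in for ck::utils::check_err.

#include <cmath>
#include <cstdlib>

// Made-up stand-in for ck::utils::check_err: true when the two values agree within tol.
static bool check_close(float a, float b, float tol = 1e-5f) { return std::fabs(a - b) <= tol; }

int main()
{
    bool pass = true;
    pass &= check_close(1.0f, 1.0f);      // e.g. the output tensor
    pass &= check_close(2.0f, 2.000001f); // e.g. the index tensor of an index-producing reduce
    return pass ? EXIT_SUCCESS : EXIT_FAILURE; // non-zero exit marks the example as failed
}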
@@ -80,8 +80,8 @@ static void pool_host_verify(const Tensor<InDataType>& in,
for(int x = 0; x < window_spatial_lengths[1]; ++x)
{
int wi = wo * window_strides[1] + x - in_left_pads[1];
-if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
+if(hi >= 0 && hi < ck::type_convert<int>(in.mDesc.GetLengths()[2]) && wi >= 0 &&
-wi < in.mDesc.GetLengths()[3])
+wi < ck::type_convert<int>(in.mDesc.GetLengths()[3]))
{
AccDataType currVal = static_cast<AccDataType>(in(n, c, hi, wi));
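The ck::type_convert<int> casts above keep the bounds test in signed arithmetic; GetLengths() presumably returns an unsigned extent, so the mixed signed/unsigned comparison is flagged by the compiler and, without the hi >= 0 / wi >= 0 guards, would silently wrap a negative coordinate. A tiny demonstration of that wrap, using static_cast in place of the library's type_convert:

#include <cstddef>
#include <iostream>

int main()
{
    int wi = -2;          // a padded window coordinate can be negative
    std::size_t len = 16; // tensor extent, unsigned like mDesc.GetLengths()[3]

    // Mixed comparison: wi is converted to unsigned and wraps to a huge value.
    std::cout << (wi < len) << "\n";                   // prints 0
    // Converting the extent keeps the comparison in signed arithmetic.
    std::cout << (wi < static_cast<int>(len)) << "\n"; // prints 1
    return 0;
}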
...@@ -149,9 +149,9 @@ int main(int argc, char* argv[]) ...@@ -149,9 +149,9 @@ int main(int argc, char* argv[])
{ {
using namespace ck::host_reduce; using namespace ck::host_reduce;
bool do_verification = 0; bool do_verification = true;
int init_method = 0; int init_method = 1;
int nrepeat = 5; bool time_kernel = false;
// Pool shape // Pool shape
ck::index_t N = 128; ck::index_t N = 128;
...@@ -171,13 +171,13 @@ int main(int argc, char* argv[]) ...@@ -171,13 +171,13 @@ int main(int argc, char* argv[])
{ {
do_verification = std::stoi(argv[1]); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]); time_kernel = std::stoi(argv[3]);
} }
else if(argc == 16) else if(argc == 16)
{ {
do_verification = std::stoi(argv[1]); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]); time_kernel = std::stoi(argv[3]);
N = std::stoi(argv[4]); N = std::stoi(argv[4]);
C = std::stoi(argv[5]); C = std::stoi(argv[5]);
...@@ -196,7 +196,7 @@ int main(int argc, char* argv[]) ...@@ -196,7 +196,7 @@ int main(int argc, char* argv[])
{ {
printf("arg1: verification (0=no, 1=yes)\n"); printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: run kernel # of times (>1)\n"); printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, " printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, "
"RightPx\n"); "RightPx\n");
exit(0); exit(0);
...@@ -271,7 +271,7 @@ int main(int argc, char* argv[]) ...@@ -271,7 +271,7 @@ int main(int argc, char* argv[])
"not support this problem"); "not support this problem");
} }
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * N * C * Ho * Wo * Y * X; std::size_t flop = std::size_t(2) * N * C * Ho * Wo * Y * X;
...@@ -285,6 +285,7 @@ int main(int argc, char* argv[]) ...@@ -285,6 +285,7 @@ int main(int argc, char* argv[])
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl; << std::endl;
bool pass = true;
if(do_verification) if(do_verification)
{ {
pool_host_verify<InDataType, pool_host_verify<InDataType,
...@@ -302,14 +303,15 @@ int main(int argc, char* argv[]) ...@@ -302,14 +303,15 @@ int main(int argc, char* argv[])
out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data()); out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());
ck::utils::check_err(out_n_c_ho_wo_device.mData, out_n_c_ho_wo_host.mData); pass &= ck::utils::check_err(out_n_c_ho_wo_device.mData, out_n_c_ho_wo_host.mData);
if constexpr(NeedIndices) if constexpr(NeedIndices)
{ {
out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data()); out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data());
// ck::utils::check_err(out_indices_n_c_ho_wo_device.mData, pass &= ck::utils::check_err(out_indices_n_c_ho_wo_device.mData,
// out_indices_n_c_ho_wo_host.mData);; out_indices_n_c_ho_wo_host.mData);
}; };
} }
return pass ? 0 : 1;
} }
@@ -13,74 +13,91 @@
#include "host_tensor_generator.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
-#include "device_gemm_xdl.hpp"
-#include "device_gemm_xdl_c_shuffle.hpp"
+#include "device_gemm_xdl_cshuffle.hpp"
#include "element_wise_operation.hpp"
#include "reference_gemm.hpp"
#include "gemm_specialization.hpp"
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-using F32 = float;
-using Row = ck::tensor_layout::gemm::RowMajor;
-using Col = ck::tensor_layout::gemm::ColumnMajor;
+struct RequantReluRequant
+{
+    // FIXME: We just need one scale for Relu / Leaky Relu / PRelu
+    RequantReluRequant(float scaleGemm, float scaleRelu)
+        : scaleGemm_(scaleGemm), scaleRelu_(scaleRelu)
+    {
+    }
+
+    __host__ __device__ constexpr void operator()(float& y, const float& x) const
+    {
+        float gemm_requant = scaleGemm_ * x;
+        float relu = gemm_requant > 0 ? gemm_requant : 0;
+        float relu_requant = scaleRelu_ * relu;
+        y = relu_requant > 127 ? 127 : relu_requant < -128 ? -128 : relu_requant;
+    }
+
+    float scaleGemm_;
+    float scaleRelu_;
+};
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-using RequantReluRequant = ck::tensor_operation::element_wise::RequantReluRequant;
using ADataType = int8_t;
using BDataType = int8_t;
using CDataType = int8_t;
using AccDataType = int32_t;
-using CShuffleDataType = int32_t;
+using CShuffleDataType = float;
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
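The new RequantReluRequant functor defined above rescales the GEMM accumulator, applies ReLU, rescales again, and saturates to the int8 range [-128, 127]. Below is a host-only copy of the same arithmetic (HIP qualifiers dropped, scale values invented) that compiles and runs on its own:

#include <cstdio>

// Host-only copy of the functor's arithmetic; the scale values used below are invented.
struct RequantReluRequantHost
{
    RequantReluRequantHost(float scaleGemm, float scaleRelu)
        : scaleGemm_(scaleGemm), scaleRelu_(scaleRelu)
    {
    }

    void operator()(float& y, const float& x) const
    {
        float gemm_requant = scaleGemm_ * x;
        float relu = gemm_requant > 0 ? gemm_requant : 0;
        float relu_requant = scaleRelu_ * relu;
        y = relu_requant > 127 ? 127 : relu_requant < -128 ? -128 : relu_requant;
    }

    float scaleGemm_;
    float scaleRelu_;
};

int main()
{
    RequantReluRequantHost op{0.5f, 2.0f};
    for(float x : {-40.0f, 10.0f, 300.0f})
    {
        float y = 0;
        op(y, x);
        std::printf("x = %6.1f -> y = %6.1f\n", x, y); // negatives clamp to 0, large values to 127
    }
    return 0;
}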
// clang-format off // clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle< using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle<
ADataType, // ADataType ALayout, // typename ALayout,
BDataType, // BDataType BLayout, // typename BLayout,
CDataType, // CDataType CLayout, // typename CLayout,
AccDataType, // AccDataType ADataType, // typename ADataType,
CShuffleDataType, // CShuffleDataType BDataType, // typename BDataType,
ALayout, // ALayout CDataType, // typename CDataType,
BLayout, // BLayout AccDataType, // typename GemmAccDataType,
CLayout, // CLayout CShuffleDataType, // typename CShuffleDataType,
PassThrough, // AElementwiseOperation PassThrough, // typename AElementwiseOperation,
PassThrough, // BElementwiseOperation PassThrough, // typename BElementwiseOperation,
RequantReluRequant, // CElementwiseOperation RequantReluRequant, // typename CElementwiseOperation,
256, // BlockSize GemmDefault, // GemmSpecialization GemmSpec,
256, // MPerBlock 1, // index_t NumGemmKPrefetchStage,
128, // NPerBlock 256, // index_t BlockSize,
64, // KPerBlock 256, // index_t MPerBlock,
16, // AK1 128, // index_t NPerBlock,
16, // BK1 64, // index_t KPerBlock,
32, // MPerXDL 16, // index_t AK1,
32, // NPerXDL 16, // index_t BK1,
4, // MXdlPerWave 32, // index_t MPerXDL,
2, // NXdlPerWave 32, // index_t NPerXDL,
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 4, // index_t MXdlPerWave,
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder 2, // index_t NXdlPerWave,
S<1, 0, 2>, // ABlockTransferSrcAccessOrder S<4, 64, 1>, // typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
2, // ABlockTransferSrcVectorDim S<1, 0, 2>, // typename ABlockTransferThreadClusterArrangeOrder,
16, // ABlockTransferSrcScalarPerVector S<1, 0, 2>, // typename ABlockTransferSrcAccessOrder,
16, // ABlockTransferDstScalarPerVector_K1 2, // index_t ABlockTransferSrcVectorDim,
true, // ABlockLdsAddExtraM 16, // index_t ABlockTransferSrcScalarPerVector,
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1 16, // index_t ABlockTransferDstScalarPerVector_AK1,
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder 1, // bool ABlockLdsExtraM,
S<1, 0, 2>, // BBlockTransferSrcAccessOrder S<4, 64, 1>, // typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
2, // BBlockTransferSrcVectorDim S<1, 0, 2>, // typename BBlockTransferThreadClusterArrangeOrder,
16, // BBlockTransferSrcScalarPerVector S<1, 0, 2>, // typename BBlockTransferSrcAccessOrder,
16, // BBlockTransferDstScalarPerVector_K1 2, // index_t BBlockTransferSrcVectorDim,
true, // BBlockLdsAddExtraN 8, // index_t BBlockTransferSrcScalarPerVector,
1, // CShuffleMXdlPerWavePerShuffle 8, // index_t BBlockTransferDstScalarPerVector_BK1,
1, // CShuffleNXdlPerWavePerShuffle 1, // bool BBlockLdsExtraN,
S<1, 1, 64, 1, 1, 4>, // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl 1, // index_t CShuffleMXdlPerWavePerShuffle,
16>; // CBlockTransferScalarPerVector_NWaveNPerXdl 1, // index_t CShuffleNXdlPerWavePerShuffle,
S<1, 64, 1, 4>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
16>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
// clang-format on // clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host:: using ReferenceGemmInstance = ck::tensor_operation::host::
...@@ -88,9 +105,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host:: ...@@ -88,9 +105,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
bool do_verification = 0; bool do_verification = true;
int init_method = 0; int init_method = 1;
int nrepeat = 5; bool time_kernel = false;
// GEMM shape // GEMM shape
ck::index_t M = 3840; ck::index_t M = 3840;
...@@ -108,13 +125,13 @@ int main(int argc, char* argv[]) ...@@ -108,13 +125,13 @@ int main(int argc, char* argv[])
{ {
do_verification = std::stoi(argv[1]); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]); time_kernel = std::stoi(argv[3]);
} }
else if(argc == 10) else if(argc == 10)
{ {
do_verification = std::stoi(argv[1]); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]); time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]); M = std::stoi(argv[4]);
N = std::stoi(argv[5]); N = std::stoi(argv[5]);
...@@ -128,7 +145,7 @@ int main(int argc, char* argv[]) ...@@ -128,7 +145,7 @@ int main(int argc, char* argv[])
{ {
printf("arg1: verification (0=no, 1=yes)\n"); printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: run kernel # of times (>1)\n"); printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0); exit(0);
} }
...@@ -202,7 +219,7 @@ int main(int argc, char* argv[]) ...@@ -202,7 +219,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem"); "not support this GEMM problem");
} }
float ave_time = invoker.Run(argument, nrepeat); float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K; std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype = std::size_t num_btype =
...@@ -227,7 +244,7 @@ int main(int argc, char* argv[]) ...@@ -227,7 +244,7 @@ int main(int argc, char* argv[])
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1;
} }
return 0; return 0;
...
...@@ -60,21 +60,21 @@ using ReferenceGemmInstance = ck::tensor_operation::host:: ...@@ -60,21 +60,21 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
bool do_verification = 0; bool do_verification = true;
int init_method = 0; int init_method = 1;
int nrepeat = 5; bool time_kernel = false;
if(argc == 4) if(argc == 4)
{ {
do_verification = std::stoi(argv[1]); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]); time_kernel = std::stoi(argv[3]);
} }
else else
{ {
printf("arg1: verification (0=no, 1=yes)\n"); printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: run kernel # of times (>1)\n"); printf("arg3: time kernel (0=n0, 1=yes)\n");
exit(0); exit(0);
} }
...@@ -131,7 +131,7 @@ int main(int argc, char* argv[]) ...@@ -131,7 +131,7 @@ int main(int argc, char* argv[])
std::size_t flop = 0, num_btype = 0; std::size_t flop = 0, num_btype = 0;
for(int i = 0; i < gemm_shapes.size(); i++) for(std::size_t i = 0; i < gemm_shapes.size(); i++)
{ {
a_tensors.push_back(Tensor<ADataType>(f_host_tensor_descriptor( a_tensors.push_back(Tensor<ADataType>(f_host_tensor_descriptor(
gemm_shapes[i].M, gemm_shapes[i].K, gemm_shapes[i].StrideA, ALayout{}))); gemm_shapes[i].M, gemm_shapes[i].K, gemm_shapes[i].StrideA, ALayout{})));
...@@ -168,7 +168,7 @@ int main(int argc, char* argv[]) ...@@ -168,7 +168,7 @@ int main(int argc, char* argv[])
} }
} }
for(int i = 0; i < gemm_shapes.size(); i++) for(std::size_t i = 0; i < gemm_shapes.size(); i++)
{ {
a_tensors_device.emplace_back( a_tensors_device.emplace_back(
std::make_unique<DeviceMem>(sizeof(ADataType) * a_tensors[i].mDesc.GetElementSpace())); std::make_unique<DeviceMem>(sizeof(ADataType) * a_tensors[i].mDesc.GetElementSpace()));
...@@ -202,7 +202,7 @@ int main(int argc, char* argv[]) ...@@ -202,7 +202,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem"); "not support this GEMM problem");
} }
float ave_time = invoker.Run(argument, nrepeat); float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
float tflops = static_cast<float>(flop) / 1.E9 / ave_time; float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
...@@ -211,9 +211,10 @@ int main(int argc, char* argv[]) ...@@ -211,9 +211,10 @@ int main(int argc, char* argv[])
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl; << gemm.GetTypeString() << std::endl;
bool pass = true;
if(do_verification) if(do_verification)
{ {
for(int i = 0; i < gemm_shapes.size(); i++) for(std::size_t i = 0; i < gemm_shapes.size(); i++)
{ {
c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data()); c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data());
auto ref_gemm = ReferenceGemmInstance{}; auto ref_gemm = ReferenceGemmInstance{};
...@@ -227,9 +228,9 @@ int main(int argc, char* argv[]) ...@@ -227,9 +228,9 @@ int main(int argc, char* argv[])
c_element_op); c_element_op);
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
ck::utils::check_err(c_device_tensors[i].mData, c_host_tensors[i].mData); pass &= ck::utils::check_err(c_device_tensors[i].mData, c_host_tensors[i].mData);
} }
} }
return 0; return pass ? 0 : 1;
} }
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include <cstdlib> #include <cstdlib>
#include <stdlib.h> #include <stdlib.h>
#include <half.hpp> #include <half.hpp>
#include "check_err.hpp"
#include "config.hpp" #include "config.hpp"
#include "device.hpp" #include "device.hpp"
#include "host_tensor.hpp" #include "host_tensor.hpp"
...@@ -11,9 +12,10 @@ ...@@ -11,9 +12,10 @@
#include "device_tensor.hpp" #include "device_tensor.hpp"
#include "device_gemm_reduce_xdl_cshuffle.hpp" #include "device_gemm_reduce_xdl_cshuffle.hpp"
#include "element_wise_operation.hpp" #include "element_wise_operation.hpp"
#include "reduction_operator.hpp"
#include "reference_gemm.hpp" #include "reference_gemm.hpp"
#include "gemm_specialization.hpp" #include "gemm_specialization.hpp"
#include "element_wise_reduce_operation.hpp" #include "reduction_operator.hpp"
template <ck::index_t... Is> template <ck::index_t... Is>
using S = ck::Sequence<Is...>; using S = ck::Sequence<Is...>;
...@@ -36,19 +38,20 @@ using CLayout = ck::tensor_layout::gemm::RowMajor; ...@@ -36,19 +38,20 @@ using CLayout = ck::tensor_layout::gemm::RowMajor;
using AElementOp = ck::tensor_operation::element_wise::PassThrough; using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CElementOp = ck::tensor_operation::element_wise::PassThrough; using CElementOp = ck::tensor_operation::element_wise::PassThrough;
using D0ReduceOp = ck::tensor_operation::element_wise::ReduceSum; using D0ReduceOp = ck::reduce::Add<float>;
using D1ReduceOp = ck::tensor_operation::element_wise::ReduceSquareSum; using D1ReduceOp = ck::reduce::Add<float>;
using D1ElementOp = ck::tensor_operation::element_wise::UnarySquare<float, float, false>;
static constexpr auto GemmSpecialization = static constexpr auto GemmSpecialization =
ck::tensor_operation::device::GemmSpecialization::Default; ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off // clang-format off
using DeviceGemmReduceInstance = ck::tensor_operation::device::DeviceGemmReduce_Xdl_CShuffle using DeviceGemmReduceInstance = ck::tensor_operation::device::DeviceGemmReduce_Xdl_CShuffle
//######| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| D0| D1| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy| //######| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| D0| D1| D1EleOp| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
//######| | | | Type| Type| Type| DataType| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Reduce| Reduce| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector| //######| | | | Type| Type| Type| DataType| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Reduce| Reduce| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock| //######| | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | //######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< Row, Col, Row, F16, F16, F16, F32, F32, F32, F32, AElementOp, BElementOp, CElementOp, D0ReduceOp, D1ReduceOp, GemmSpecialization, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>; < Row, Col, Row, F16, F16, F16, F32, F32, F32, F32, AElementOp, BElementOp, CElementOp, D0ReduceOp, D1ReduceOp, D1ElementOp, GemmSpecialization, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>;
// clang-format on // clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host:: using ReferenceGemmInstance = ck::tensor_operation::host::
...@@ -56,9 +59,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host:: ...@@ -56,9 +59,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
bool do_verification = 1; bool do_verification = true;
int init_method = 1; int init_method = 1;
int nrepeat = 5; bool time_kernel = false;
// GEMM shape // GEMM shape
ck::index_t M = 3840; ck::index_t M = 3840;
...@@ -77,13 +80,13 @@ int main(int argc, char* argv[]) ...@@ -77,13 +80,13 @@ int main(int argc, char* argv[])
{ {
do_verification = std::stoi(argv[1]); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]); time_kernel = std::stoi(argv[3]);
} }
else if(argc == 10) else if(argc == 10)
{ {
do_verification = std::stoi(argv[1]); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]); time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]); M = std::stoi(argv[4]);
N = std::stoi(argv[5]); N = std::stoi(argv[5]);
...@@ -97,7 +100,7 @@ int main(int argc, char* argv[]) ...@@ -97,7 +100,7 @@ int main(int argc, char* argv[])
{ {
printf("arg1: verification (0=no, 1=yes)\n"); printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: run kernel # of times (>1)\n"); printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0); exit(0);
} }
...@@ -162,8 +165,7 @@ int main(int argc, char* argv[]) ...@@ -162,8 +165,7 @@ int main(int argc, char* argv[])
auto a_element_op = AElementOp{}; auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{}; auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{}; auto c_element_op = CElementOp{};
auto d0_reduce_op = D0ReduceOp{}; auto d1_element_op = D1ElementOp{};
auto d1_reduce_op = D1ReduceOp{};
// do GEMM // do GEMM
auto gemm = DeviceGemmReduceInstance{}; auto gemm = DeviceGemmReduceInstance{};
...@@ -182,8 +184,7 @@ int main(int argc, char* argv[]) ...@@ -182,8 +184,7 @@ int main(int argc, char* argv[])
a_element_op, a_element_op,
b_element_op, b_element_op,
c_element_op, c_element_op,
d0_reduce_op, d1_element_op);
d1_reduce_op);
if(!gemm.IsSupportedArgument(argument)) if(!gemm.IsSupportedArgument(argument))
{ {
...@@ -192,30 +193,13 @@ int main(int argc, char* argv[]) ...@@ -192,30 +193,13 @@ int main(int argc, char* argv[])
"not support this GEMM problem"); "not support this GEMM problem");
} }
-// warm up
-invoker.Run(argument);
-// timing
-float total_time = 0;
-for(int i = 0; i < nrepeat; ++i)
-{
// init DO, D1 to 0
d0_device_buf.SetZero();
d1_device_buf.SetZero();
-KernelTimer timer;
-timer.Start();
-invoker.Run(argument);
-timer.End();
-total_time += timer.GetElapsedTime();
-}
-float ave_time = total_time / nrepeat;
+// if time_kernel == true, kernel will run multiple times. This kernel use atomic-add so result
+// will not be correct. need to set time_kernel = false for correctness test
+float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
@@ -228,6 +212,7 @@ int main(int argc, char* argv[])
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
+bool pass = true;
if(do_verification)
{
c_device_buf.FromDevice(c_m_n_device_result.mData.data());
@@ -242,25 +227,41 @@ int main(int argc, char* argv[])
ref_invoker.Run(ref_argument);
+auto d0_reduce_op = D0ReduceOp{};
+auto d1_reduce_op = D1ReduceOp{};
for(int m = 0; m < M; ++m)
{
-float d0_acc = d0_reduce_op.GetReduceZeroValue();
+float d0_acc = d0_reduce_op.GetReductionZeroVal();
-float d1_acc = d1_reduce_op.GetReduceZeroValue();
+float d1_acc = d1_reduce_op.GetReductionZeroVal();
for(int n = 0; n < N; ++n)
{
-d0_reduce_op.Reduce(d0_acc, c_m_n_host_result(m, n));
-d1_reduce_op.Reduce(d1_acc, c_m_n_host_result(m, n));
+float d0_val = ck::type_convert<float>(c_m_n_host_result(m, n));
+float d1_val;
+d1_element_op(d1_val, d0_val);
+d0_reduce_op(d0_acc, d0_val);
+d1_reduce_op(d1_acc, d1_val);
}
-d0_m_host_result(m) = d0_acc;
+d0_m_host_result(m) = ck::type_convert<DDataType>(d0_acc);
-d1_m_host_result(m) = d1_acc;
+d1_m_host_result(m) = ck::type_convert<DDataType>(d1_acc);
}
-check_error(c_m_n_host_result, c_m_n_device_result);
-check_error(d0_m_host_result, d0_m_device_result);
-check_error(d1_m_host_result, d1_m_device_result);
+pass &= ck::utils::check_err(
+c_m_n_device_result.mData, c_m_n_host_result.mData, "Error: Incorrect results c");
+pass &= ck::utils::check_err(d0_m_device_result.mData,
+d0_m_host_result.mData,
+"Error: Incorrect results d0",
+1e-3,
+1e-3);
+pass &= ck::utils::check_err(d1_m_device_result.mData,
+d1_m_host_result.mData,
+"Error: Incorrect results d1",
+1e-3,
+1e-3);
}
-return 0;
+return pass ? 0 : 1;
}
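The reworked host check above builds the two row reductions from generic pieces: D1ElementOp squares each GEMM output element, and both D0ReduceOp and D1ReduceOp are plain additions, so d0 accumulates the row sum of c(m,n) and d1 the row sum of c(m,n)^2. A small standalone sketch of that composition, with stand-in lambdas for the CK functors:

#include <cstdio>
#include <vector>

int main()
{
    // One row of a toy GEMM output.
    std::vector<float> c_row{1.0f, 2.0f, 3.0f, 4.0f};

    // Stand-ins for the CK functors: Add accumulates in place, UnarySquare squares its input.
    auto add    = [](float& acc, float v) { acc += v; };
    auto square = [](float& out, float in) { out = in * in; };

    float d0_acc = 0.0f; // assumed identity element of the Add reduction
    float d1_acc = 0.0f;
    for(float v : c_row)
    {
        float sq = 0;
        square(sq, v);   // D1ElementOp
        add(d0_acc, v);  // D0ReduceOp: running sum
        add(d1_acc, sq); // D1ReduceOp: running sum of squares
    }
    std::printf("d0 = %.1f, d1 = %.1f\n", d0_acc, d1_acc); // d0 = 10.0, d1 = 30.0
    return 0;
}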
add_example_executable(example_convnd_bwd_data_xdl convnd_bwd_data_xdl.cpp)
-target_link_libraries(example_convnd_bwd_data_xdl PRIVATE conv_fwd_util)
+target_link_libraries(example_convnd_bwd_data_xdl PRIVATE conv_util)
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include <half.hpp> #include <half.hpp>
#include "config.hpp" #include "config.hpp"
#include "conv_fwd_util.hpp" #include "conv_util.hpp"
#include "print.hpp" #include "print.hpp"
#include "device.hpp" #include "device.hpp"
#include "host_tensor.hpp" #include "host_tensor.hpp"
...@@ -87,7 +87,7 @@ void print_use_msg() ...@@ -87,7 +87,7 @@ void print_use_msg()
{ {
std::cout << "arg1: verification (0=no, 1=yes)\n" std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=random value, 2= init to 1 )\n" << "arg2: initialization (0=no init, 1=random value, 2= init to 1 )\n"
<< "arg3: run kernel # of times (>1)\n" << "arg3: time kernel (0=n0, 1=yes)\n"
<< "arg4: N spatial dimensions (default 2)\n" << "arg4: N spatial dimensions (default 2)\n"
<< "Following arguments (depending on number of spatial dims):\n" << "Following arguments (depending on number of spatial dims):\n"
<< " N, K, C, \n" << " N, K, C, \n"
...@@ -105,40 +105,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[]) ...@@ -105,40 +105,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[])
ck::utils::conv::ConvParams params; ck::utils::conv::ConvParams params;
int arg_idx = 5; int arg_idx = 5;
params.num_dim_spatial = num_dim_spatial; params.num_dim_spatial_ = num_dim_spatial;
params.N = std::stoi(argv[arg_idx++]); params.N_ = std::stoi(argv[arg_idx++]);
params.K = std::stoi(argv[arg_idx++]); params.K_ = std::stoi(argv[arg_idx++]);
params.C = std::stoi(argv[arg_idx++]); params.C_ = std::stoi(argv[arg_idx++]);
params.filter_spatial_lengths.resize(num_dim_spatial); params.filter_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]); params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
} }
params.input_spatial_lengths.resize(num_dim_spatial); params.input_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]); params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
} }
params.conv_filter_strides.resize(num_dim_spatial); params.conv_filter_strides_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]); params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
} }
params.conv_filter_dilations.resize(num_dim_spatial); params.conv_filter_dilations_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]); params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
} }
params.input_left_pads.resize(num_dim_spatial); params.input_left_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.input_left_pads[i] = std::stoi(argv[arg_idx++]); params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
} }
params.input_right_pads.resize(num_dim_spatial); params.input_right_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i) for(int i = 0; i < num_dim_spatial; ++i)
{ {
params.input_right_pads[i] = std::stoi(argv[arg_idx++]); params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
} }
return params; return params;
...@@ -165,25 +165,25 @@ DeviceConvBwdDataBasePtr get_conv_instance(int num_dim_spatial) ...@@ -165,25 +165,25 @@ DeviceConvBwdDataBasePtr get_conv_instance(int num_dim_spatial)
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
bool do_verification = 0; bool do_verification = true;
int init_method = 0; int init_method = 1;
int nrepeat = 5; bool time_kernel = false;
int num_dim_spatial = 2; int num_dim_spatial = 2;
ck::utils::conv::ConvParams params; ck::utils::conv::ConvParams params;
params.C = 128; params.C_ = 128;
if(argc == 4) if(argc == 4)
{ {
do_verification = std::stoi(argv[1]); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]); time_kernel = std::stoi(argv[3]);
} }
else if(argc > 4) else if(argc > 4)
{ {
do_verification = std::stoi(argv[1]); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]); time_kernel = std::stoi(argv[3]);
num_dim_spatial = std::stoi(argv[4]); num_dim_spatial = std::stoi(argv[4]);
// check args number // check args number
int conv_args = 3 + num_dim_spatial * 6; int conv_args = 3 + num_dim_spatial * 6;
...@@ -202,21 +202,21 @@ int main(int argc, char* argv[]) ...@@ -202,21 +202,21 @@ int main(int argc, char* argv[])
exit(1); exit(1);
} }
std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N), std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
static_cast<std::size_t>(params.C)}; static_cast<std::size_t>(params.C_)};
input_dims.insert(std::end(input_dims), input_dims.insert(std::end(input_dims),
std::begin(params.input_spatial_lengths), std::begin(params.input_spatial_lengths_),
std::end(params.input_spatial_lengths)); std::end(params.input_spatial_lengths_));
std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K), std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
static_cast<std::size_t>(params.C)}; static_cast<std::size_t>(params.C_)};
filter_dims.insert(std::end(filter_dims), filter_dims.insert(std::end(filter_dims),
std::begin(params.filter_spatial_lengths), std::begin(params.filter_spatial_lengths_),
std::end(params.filter_spatial_lengths)); std::end(params.filter_spatial_lengths_));
const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths(); const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N), std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
static_cast<std::size_t>(params.K)}; static_cast<std::size_t>(params.K_)};
output_dims.insert(std::end(output_dims), output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths), std::begin(output_spatial_lengths),
std::end(output_spatial_lengths)); std::end(output_spatial_lengths));
...@@ -263,16 +263,16 @@ int main(int argc, char* argv[]) ...@@ -263,16 +263,16 @@ int main(int argc, char* argv[])
conv->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()), conv->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()), static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()), static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
params.N, params.N_,
params.K, params.K_,
params.C, params.C_,
params.input_spatial_lengths, params.input_spatial_lengths_,
params.filter_spatial_lengths, params.filter_spatial_lengths_,
output_spatial_lengths, output_spatial_lengths,
params.conv_filter_strides, params.conv_filter_strides_,
params.conv_filter_dilations, params.conv_filter_dilations_,
params.input_left_pads, params.input_left_pads_,
params.input_right_pads, params.input_right_pads_,
InElementOp{}, InElementOp{},
WeiElementOp{}, WeiElementOp{},
OutElementOp{}); OutElementOp{});
...@@ -284,16 +284,16 @@ int main(int argc, char* argv[]) ...@@ -284,16 +284,16 @@ int main(int argc, char* argv[])
"not support this Conv problem"); "not support this Conv problem");
} }
float ave_time = invoker->Run(argument.get(), nrepeat); float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = ck::utils::conv::get_flops( std::size_t flop = ck::utils::conv::get_flops(
params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths); params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
std::size_t num_btype = ck::utils::conv::get_btype<InDataType, WeiDataType, OutDataType>( std::size_t num_btype = ck::utils::conv::get_btype<InDataType, WeiDataType, OutDataType>(
params.N, params.N_,
params.C, params.C_,
params.K, params.K_,
params.input_spatial_lengths, params.input_spatial_lengths_,
params.filter_spatial_lengths, params.filter_spatial_lengths_,
output_spatial_lengths); output_spatial_lengths);
float tflops = static_cast<float>(flop) / 1.E9 / ave_time; float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
...@@ -310,10 +310,10 @@ int main(int argc, char* argv[]) ...@@ -310,10 +310,10 @@ int main(int argc, char* argv[])
auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi_host_result, auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi_host_result,
wei_k_c_y_x, wei_k_c_y_x,
out_n_k_ho_wo, out_n_k_ho_wo,
params.conv_filter_strides, params.conv_filter_strides_,
params.conv_filter_dilations, params.conv_filter_dilations_,
params.input_left_pads, params.input_left_pads_,
params.input_right_pads, params.input_right_pads_,
InElementOp{}, InElementOp{},
WeiElementOp{}, WeiElementOp{},
OutElementOp{}); OutElementOp{});
...@@ -322,7 +322,10 @@ int main(int argc, char* argv[]) ...@@ -322,7 +322,10 @@ int main(int argc, char* argv[])
in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data()); in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data());
check_error(in_n_c_hi_wi_host_result, in_n_c_hi_wi_device_result); return ck::utils::check_err(in_n_c_hi_wi_device_result.mData,
in_n_c_hi_wi_host_result.mData)
? 0
: 1;
}; };
switch(num_dim_spatial) switch(num_dim_spatial)
...@@ -347,4 +350,5 @@ int main(int argc, char* argv[]) ...@@ -347,4 +350,5 @@ int main(int argc, char* argv[])
} }
} }
} }
return 0;
} }
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include <cstdlib> #include <cstdlib>
#include <stdlib.h> #include <stdlib.h>
#include <half.hpp> #include <half.hpp>
#include "check_err.hpp"
#include "config.hpp" #include "config.hpp"
#include "device.hpp" #include "device.hpp"
#include "host_tensor.hpp" #include "host_tensor.hpp"
...@@ -11,9 +12,9 @@ ...@@ -11,9 +12,9 @@
#include "device_tensor.hpp" #include "device_tensor.hpp"
#include "device_batched_gemm_reduce_xdl_cshuffle.hpp" #include "device_batched_gemm_reduce_xdl_cshuffle.hpp"
#include "element_wise_operation.hpp" #include "element_wise_operation.hpp"
#include "reduction_operator.hpp"
#include "reference_batched_gemm.hpp" #include "reference_batched_gemm.hpp"
#include "gemm_specialization.hpp" #include "gemm_specialization.hpp"
#include "element_wise_reduce_operation.hpp"
template <ck::index_t... Is> template <ck::index_t... Is>
using S = ck::Sequence<Is...>; using S = ck::Sequence<Is...>;
...@@ -36,19 +37,20 @@ using CLayout = ck::tensor_layout::gemm::RowMajor; ...@@ -36,19 +37,20 @@ using CLayout = ck::tensor_layout::gemm::RowMajor;
using AElementOp = ck::tensor_operation::element_wise::PassThrough; using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CElementOp = ck::tensor_operation::element_wise::PassThrough; using CElementOp = ck::tensor_operation::element_wise::PassThrough;
using D0ReduceOp = ck::tensor_operation::element_wise::ReduceSum; using D0ReduceOp = ck::reduce::Add<float>;
using D1ReduceOp = ck::tensor_operation::element_wise::ReduceSquareSum; using D1ReduceOp = ck::reduce::Add<float>;
using D1ElementOp = ck::tensor_operation::element_wise::UnarySquare<float, float, false>;
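With both reduce operators instantiated as ck::reduce::Add<float> and the D1 elementwise operator as UnarySquare, each (batch, m) row of C is expected to reduce to D0 = sum_n C(m, n) and D1 = sum_n C(m, n)^2. The host verification loop further below follows exactly this recipe; as a standalone illustration (plain C++, not the CK operator types themselves), the per-row computation is:
// Standalone sketch of the per-row D0/D1 semantics (sum and sum of squares);
// this mirrors the host verification loop below, not the device kernel.
#include <cstddef>
#include <vector>
void reduce_row(const std::vector<float>& c_row, float& d0, float& d1)
{
    d0 = 0.f; // identity element of ck::reduce::Add<float>
    d1 = 0.f;
    for(std::size_t n = 0; n < c_row.size(); ++n)
    {
        const float v = c_row[n];
        d0 += v;     // D0ReduceOp: Add
        d1 += v * v; // D1ElementOp (UnarySquare) feeding D1ReduceOp (Add)
    }
}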
static constexpr auto GemmSpecialization = static constexpr auto GemmSpecialization =
ck::tensor_operation::device::GemmSpecialization::Default; ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off // clang-format off
using DeviceBatchedGemmReduceInstance = ck::tensor_operation::device::DeviceBatchedGemmReduce_Xdl_CShuffle using DeviceBatchedGemmReduceInstance = ck::tensor_operation::device::DeviceBatchedGemmReduce_Xdl_CShuffle
//######| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| D0| D1| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy| //######| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| D0| D1| D1EleOp| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
//######| | | | Type| Type| Type| DataType| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Reduce| Reduce| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector| //######| | | | Type| Type| Type| DataType| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Reduce| Reduce| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock| //######| | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | //######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< Row, Col, Row, F16, F16, F16, F32, F32, F32, F32, AElementOp, BElementOp, CElementOp, D0ReduceOp, D1ReduceOp, GemmSpecialization, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>; < Row, Col, Row, F16, F16, F16, F32, F32, F32, F32, AElementOp, BElementOp, CElementOp, D0ReduceOp, D1ReduceOp, D1ElementOp, GemmSpecialization, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>;
// clang-format on // clang-format on
using ReferenceBatchedGemmInstance = ck::tensor_operation::host:: using ReferenceBatchedGemmInstance = ck::tensor_operation::host::
...@@ -56,18 +58,18 @@ using ReferenceBatchedGemmInstance = ck::tensor_operation::host:: ...@@ -56,18 +58,18 @@ using ReferenceBatchedGemmInstance = ck::tensor_operation::host::
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
bool do_verification = 1; bool do_verification = true;
int init_method = 1; int init_method = 1;
int nrepeat = 5; bool time_kernel = false;
// GEMM shape // GEMM shape
ck::index_t M = 3840; ck::index_t M = 2048;
ck::index_t N = 4096; ck::index_t N = 1920;
ck::index_t K = 4096; ck::index_t K = 2048;
ck::index_t StrideA = 4096; ck::index_t StrideA = 2048;
ck::index_t StrideB = 4096; ck::index_t StrideB = 2048;
ck::index_t StrideC = 4096; ck::index_t StrideC = 1920;
ck::index_t BatchCount = 4; ck::index_t BatchCount = 4;
...@@ -79,13 +81,13 @@ int main(int argc, char* argv[]) ...@@ -79,13 +81,13 @@ int main(int argc, char* argv[])
{ {
do_verification = std::stoi(argv[1]); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]); time_kernel = std::stoi(argv[3]);
} }
else if(argc == 11) else if(argc == 11)
{ {
do_verification = std::stoi(argv[1]); do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]); init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]); time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]); M = std::stoi(argv[4]);
N = std::stoi(argv[5]); N = std::stoi(argv[5]);
...@@ -95,13 +97,13 @@ int main(int argc, char* argv[]) ...@@ -95,13 +97,13 @@ int main(int argc, char* argv[])
StrideB = std::stoi(argv[8]); StrideB = std::stoi(argv[8]);
StrideC = std::stoi(argv[9]); StrideC = std::stoi(argv[9]);
BatchCount = std::stoi(argv[9]); BatchCount = std::stoi(argv[10]);
} }
else else
{ {
printf("arg1: verification (0=no, 1=yes)\n"); printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: run kernel # of times (>1)\n"); printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, BatchCount\n"); printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, BatchCount\n");
exit(0); exit(0);
} }
...@@ -173,6 +175,7 @@ int main(int argc, char* argv[]) ...@@ -173,6 +175,7 @@ int main(int argc, char* argv[])
auto c_element_op = CElementOp{}; auto c_element_op = CElementOp{};
auto d0_reduce_op = D0ReduceOp{}; auto d0_reduce_op = D0ReduceOp{};
auto d1_reduce_op = D1ReduceOp{}; auto d1_reduce_op = D1ReduceOp{};
auto d1_element_op = D1ElementOp{};
// do GEMM // do GEMM
auto batched_gemm = DeviceBatchedGemmReduceInstance{}; auto batched_gemm = DeviceBatchedGemmReduceInstance{};
...@@ -192,8 +195,7 @@ int main(int argc, char* argv[]) ...@@ -192,8 +195,7 @@ int main(int argc, char* argv[])
a_element_op, a_element_op,
b_element_op, b_element_op,
c_element_op, c_element_op,
d0_reduce_op, d1_element_op,
d1_reduce_op,
BatchCount); BatchCount);
if(!batched_gemm.IsSupportedArgument(argument)) if(!batched_gemm.IsSupportedArgument(argument))
...@@ -203,30 +205,13 @@ int main(int argc, char* argv[]) ...@@ -203,30 +205,13 @@ int main(int argc, char* argv[])
"not support this GEMM problem"); "not support this GEMM problem");
} }
// warm up
invoker.Run(argument);
// timing
float total_time = 0;
for(int i = 0; i < nrepeat; ++i)
{
// init D0, D1 to 0 // init D0, D1 to 0
d0_device_buf.SetZero(); d0_device_buf.SetZero();
d1_device_buf.SetZero(); d1_device_buf.SetZero();
KernelTimer timer; // if time_kernel == true, the kernel will run multiple times. This kernel uses atomic add, so the result
// will not be correct; time_kernel must be set to false for the correctness test
timer.Start(); float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
invoker.Run(argument);
timer.End();
total_time += timer.GetElapsedTime();
}
float ave_time = total_time / nrepeat;
std::size_t flop = std::size_t(2) * BatchCount * M * N * K; std::size_t flop = std::size_t(2) * BatchCount * M * N * K;
std::size_t num_btype = sizeof(ADataType) * BatchCount * M * K + std::size_t num_btype = sizeof(ADataType) * BatchCount * M * K +
...@@ -240,6 +225,7 @@ int main(int argc, char* argv[]) ...@@ -240,6 +225,7 @@ int main(int argc, char* argv[])
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< batched_gemm.GetTypeString() << std::endl; << batched_gemm.GetTypeString() << std::endl;
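For the default problem above (BatchCount = 4, M = 2048, N = 1920, K = 2048) the FLOP count is 2 * 4 * 2048 * 1920 * 2048 ≈ 6.44e10; dividing by 1.E9 and then by the kernel time in milliseconds yields TFLOP/s directly. A tiny check of that unit handling, with an assumed (made-up) 1 ms kernel time:
// Arithmetic check only; ave_time_ms is an assumed figure, not a measurement.
#include <cstddef>
#include <cstdio>
int main()
{
    const std::size_t flop = std::size_t(2) * 4 * 2048 * 1920 * 2048;    // 64,424,509,440
    const float ave_time_ms = 1.0f;                                      // assumed
    const float tflops = static_cast<float>(flop) / 1.E9 / ave_time_ms;  // ~64.4 TFLOP/s
    std::printf("%.1f TFlops\n", tflops);
    return 0;
}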
bool pass = true;
if(do_verification) if(do_verification)
{ {
c_device_buf.FromDevice(c_g_m_n_device_result.mData.data()); c_device_buf.FromDevice(c_g_m_n_device_result.mData.data());
...@@ -258,24 +244,36 @@ int main(int argc, char* argv[]) ...@@ -258,24 +244,36 @@ int main(int argc, char* argv[])
{ {
for(int m = 0; m < M; ++m) for(int m = 0; m < M; ++m)
{ {
float d0_acc = d0_reduce_op.GetReduceZeroValue(); float d0_acc = d0_reduce_op.GetReductionZeroVal();
float d1_acc = d1_reduce_op.GetReduceZeroValue(); float d1_acc = d1_reduce_op.GetReductionZeroVal();
for(int n = 0; n < N; ++n) for(int n = 0; n < N; ++n)
{ {
d0_reduce_op.Reduce(d0_acc, c_g_m_n_host_result(batch, m, n)); float d0_val = ck::type_convert<float>(c_g_m_n_host_result(batch, m, n));
d1_reduce_op.Reduce(d1_acc, c_g_m_n_host_result(batch, m, n)); float d1_val;
d1_element_op(d1_val, d0_val);
d0_reduce_op(d0_acc, d0_val);
d1_reduce_op(d1_acc, d1_val);
} }
d0_g_m_host_result(batch, m) = d0_acc; d0_g_m_host_result(batch, m) = ck::type_convert<DDataType>(d0_acc);
d1_g_m_host_result(batch, m) = d1_acc; d1_g_m_host_result(batch, m) = ck::type_convert<DDataType>(d1_acc);
} }
} }
check_error(c_g_m_n_host_result, c_g_m_n_device_result); pass &= ck::utils::check_err(c_g_m_n_host_result.mData, c_g_m_n_device_result.mData);
check_error(d0_g_m_host_result, d0_g_m_device_result); pass &= ck::utils::check_err(d0_g_m_device_result.mData,
check_error(d1_g_m_host_result, d1_g_m_device_result); d0_g_m_host_result.mData,
"Error: Incorrect results! D0",
1e-3,
1e-3);
pass &= ck::utils::check_err(d1_g_m_device_result.mData,
d1_g_m_host_result.mData,
"Error: Incorrect results! D1",
1e-3,
1e-3);
} }
return 0; return pass ? 0 : 1;
} }
...@@ -19,9 +19,18 @@ include_directories(BEFORE ...@@ -19,9 +19,18 @@ include_directories(BEFORE
add_custom_target(examples) add_custom_target(examples)
function(add_example_executable EXAMPLE_NAME) function(add_example_executable EXAMPLE_NAME FILE_NAME)
message("adding example ${EXAMPLE_NAME}") message("adding example ${EXAMPLE_NAME}")
add_executable(${EXAMPLE_NAME} ${ARGN}) add_executable(${EXAMPLE_NAME} ${FILE_NAME})
target_link_libraries(${EXAMPLE_NAME} PRIVATE host_tensor)
add_test(NAME ${EXAMPLE_NAME} COMMAND $<TARGET_FILE:${EXAMPLE_NAME}> ${ARGN})
add_dependencies(examples ${EXAMPLE_NAME})
add_dependencies(check ${EXAMPLE_NAME})
endfunction(add_example_executable EXAMPLE_NAME)
function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
message("adding example ${EXAMPLE_NAME}")
add_executable(${EXAMPLE_NAME} ${FILE_NAME})
target_link_libraries(${EXAMPLE_NAME} PRIVATE host_tensor) target_link_libraries(${EXAMPLE_NAME} PRIVATE host_tensor)
add_dependencies(examples ${EXAMPLE_NAME}) add_dependencies(examples ${EXAMPLE_NAME})
endfunction(add_example_executable EXAMPLE_NAME) endfunction(add_example_executable EXAMPLE_NAME)
...@@ -26,17 +26,14 @@ ...@@ -26,17 +26,14 @@
#endif #endif
#endif #endif
// buffer resourse, wave size // buffer resource
#ifndef __HIP_DEVICE_COMPILE__ // for host code #ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_BUFFER_RESOURCE_3RD_DWORD -1 #define CK_BUFFER_RESOURCE_3RD_DWORD -1
#define CK_GPU_WAVE_SIZE -1
#elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \ #elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
defined(__gfx90a__) // for GPU code defined(__gfx90a__) // for GPU code
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
#define CK_GPU_WAVE_SIZE 64
#elif defined(__gfx1030__) // for GPU code #elif defined(__gfx1030__) // for GPU code
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
#define CK_GPU_WAVE_SIZE 32
#endif #endif
// FMA instruction // FMA instruction
...@@ -112,6 +109,10 @@ ...@@ -112,6 +109,10 @@
// experimental feature: use __builtin_memcpy instead of union to do bit_cast // experimental feature: use __builtin_memcpy instead of union to do bit_cast
#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1 #define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1
// experimental feature: optimize for inter-wave scheduling policy
#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING 0
#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS 1
// hack: has underlying assumptions that need to be satisfied, otherwise it's a bug // hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be // hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
// thread-invariant, otherwise it's a bug // thread-invariant, otherwise it's a bug
#pragma once
// "_PACKAGE_" to avoid name contentions: the macros like
// HIP_VERSION_MAJOR are defined in HIP_VERSION.h.
// clang-format off
#define CK_HIP_PACKAGE_VERSION_MAJOR @CK_HIP_VERSION_MAJOR@
#define CK_HIP_PACKAGE_VERSION_MINOR @CK_HIP_VERSION_MINOR@
#define CK_HIP_PACKAGE_VERSION_PATCH @CK_HIP_VERSION_PATCH@
// clang-format on
#ifndef CK_HIP_PACKAGE_VERSION_MAJOR
#define CK_HIP_PACKAGE_VERSION_MAJOR 0
#endif
#ifndef CK_HIP_PACKAGE_VERSION_MINOR
#define CK_HIP_PACKAGE_VERSION_MINOR 0
#endif
#ifndef CK_HIP_PACKAGE_VERSION_PATCH
#define CK_HIP_PACKAGE_VERSION_PATCH 0
#endif
// 3 decimal digits for major and minor, 6 digits for patch number.
// Max number is 999,999,999999 == 0xE8,D4A5,0FFF that fits into 64-bit math.
#if CK_HIP_PACKAGE_VERSION_MAJOR > 999 || CK_HIP_PACKAGE_VERSION_MINOR > 999 || \
CK_HIP_PACKAGE_VERSION_PATCH > 999999
#error "Too big HIP version number(s)"
#endif
#define CK_HIP_PACKAGE_VERSION_FLAT \
((CK_HIP_PACKAGE_VERSION_MAJOR * 1000ULL + CK_HIP_PACKAGE_VERSION_MINOR) * 1000000 + \
CK_HIP_PACKAGE_VERSION_PATCH)
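As a quick sanity check of the flattening formula, a hypothetical HIP package version 5.2.21153 (digits chosen only for illustration) maps to 5,002,021,153:
// Hypothetical version 5.2.21153, used only to exercise the formula above.
static_assert((5ULL * 1000 + 2) * 1000000 + 21153 == 5002021153ULL,
              "flattened HIP version for 5.2.21153");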
#pragma once
#cmakedefine01 CK_TIME_KERNEL
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
struct StreamConfig
{
hipStream_t stream_id_ = nullptr;
bool time_kernel_ = false;
};
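Both examples above pass this struct straight into Invoker::Run, e.g. invoker.Run(argument, StreamConfig{nullptr, time_kernel}). A minimal host-side sketch of the two common configurations, assuming the StreamConfig definition above is visible and using a hypothetical caller-created stream:
// Sketch only: constructs the two usual configurations. "invoker" and
// "argument" from a device op instance are assumed to exist elsewhere.
#include <hip/hip_runtime.h>
int main()
{
    StreamConfig timed_default{nullptr, true}; // null stream, kernel is timed
    hipStream_t stream{};
    (void)hipStreamCreate(&stream);
    StreamConfig untimed_user{stream, false};  // hypothetical user stream, single launch
    // float ave_ms = invoker.Run(argument, timed_default);
    (void)timed_default;
    (void)untimed_user;
    (void)hipStreamDestroy(stream);
    return 0;
}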
#ifndef CK_TENSOR_DESCRIPTOR_HELPER_HPP #pragma once
#define CK_TENSOR_DESCRIPTOR_HELPER_HPP
#include "common_header.hpp" #include "common_header.hpp"
#include "tensor_descriptor.hpp" #include "tensor_descriptor.hpp"
#include "multi_index_transform_helper.hpp" #include "multi_index_transform_helper.hpp"
...@@ -35,6 +33,12 @@ __host__ __device__ constexpr auto calculate_element_space_size_impl(const Lengt ...@@ -35,6 +33,12 @@ __host__ __device__ constexpr auto calculate_element_space_size_impl(const Lengt
} }
#endif #endif
// Lengths..., Strides... could be:
// 1) index_t, which is known at run-time, or
// 2) Number<>, which is known at compile-time
// element_space_size could be:
// 1) long_index_t, or
// 2) LongNumber<>
template <typename... Lengths, template <typename... Lengths,
typename... Strides, typename... Strides,
typename enable_if<sizeof...(Lengths) == sizeof...(Strides), bool>::type = false> typename enable_if<sizeof...(Lengths) == sizeof...(Strides), bool>::type = false>
...@@ -68,10 +72,10 @@ __host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple<Leng ...@@ -68,10 +72,10 @@ __host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple<Leng
} }
}; };
const auto element_space_size = f(f, Number<0>{}, Number<1>{}); const auto element_space_size = f(f, Number<0>{}, LongNumber<1>{});
#else #else
const auto element_space_size = const auto element_space_size =
calculate_element_space_size_impl(lengths, strides, Number<0>{}, Number<1>{}); calculate_element_space_size_impl(lengths, strides, Number<0>{}, LongNumber<1>{});
#endif #endif
return TensorDescriptor<remove_cv_t<decltype(transforms)>, return TensorDescriptor<remove_cv_t<decltype(transforms)>,
...@@ -82,9 +86,12 @@ __host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple<Leng ...@@ -82,9 +86,12 @@ __host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple<Leng
element_space_size}; element_space_size};
} }
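A usage sketch of the mixed run-time/compile-time arguments described above (shapes and the helper name are illustrative; it assumes the CK headers included by this file):
// Illustrative only: a 2D view with a run-time row stride and a
// compile-time unit column stride, mixing index_t and Number<> as
// the comment above allows.
auto example_strided_desc(ck::index_t M, ck::index_t N, ck::index_t row_stride)
{
    return ck::make_naive_tensor_descriptor(
        ck::make_tuple(M, N),                          // lengths: run-time index_t
        ck::make_tuple(row_stride, ck::Number<1>{}));  // strides: run-time + compile-time
}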
// Lengths... can be: // Lengths... could be:
// 1) index_t, which is known at run-time // 1) index_t, which is known at run-time, or
// 2) Number<>, which is known at compile-time // 2) Number<>, which is known at compile-time
// element_space_size could be:
// 1) long_index_t, or
// 2) LongNumber<>
template <typename... Lengths> template <typename... Lengths>
__host__ __device__ constexpr auto __host__ __device__ constexpr auto
make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths) make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths)
...@@ -100,7 +107,7 @@ make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths) ...@@ -100,7 +107,7 @@ make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths)
constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{}; constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{};
const auto element_space_size = container_reduce(lengths, math::multiplies{}, Number<1>{}); const auto element_space_size = container_reduce(lengths, math::multiplies{}, LongNumber<1>{});
return TensorDescriptor<remove_cv_t<decltype(transforms)>, return TensorDescriptor<remove_cv_t<decltype(transforms)>,
remove_cv_t<decltype(low_dim_hidden_idss)>, remove_cv_t<decltype(low_dim_hidden_idss)>,
...@@ -110,6 +117,12 @@ make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths) ...@@ -110,6 +117,12 @@ make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths)
element_space_size}; element_space_size};
} }
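And for the packed variant, a sketch in the same spirit (sizes arbitrary; strides are derived from the lengths, with the rightmost dimension assumed to be the fastest-varying one):
// Illustrative only: a contiguous 3D descriptor whose innermost length is a
// compile-time Number<>; element_space_size is the product of the lengths.
auto example_packed_desc(ck::index_t batch, ck::index_t rows)
{
    return ck::make_naive_tensor_descriptor_packed(
        ck::make_tuple(batch, rows, ck::Number<64>{}));
}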
// Lengths... could be:
// 1) index_t, which is known at run-time, or
// 2) Number<>, which is known at compile-time
// align could be:
// 1) index_t, or
// 2) Number<>
template <typename... Lengths, typename Align> template <typename... Lengths, typename Align>
__host__ __device__ constexpr auto __host__ __device__ constexpr auto
make_naive_tensor_descriptor_aligned(const Tuple<Lengths...>& lengths, Align align) make_naive_tensor_descriptor_aligned(const Tuple<Lengths...>& lengths, Align align)
...@@ -146,4 +159,3 @@ make_naive_tensor_descriptor_aligned(const Tuple<Lengths...>& lengths, Align ali ...@@ -146,4 +159,3 @@ make_naive_tensor_descriptor_aligned(const Tuple<Lengths...>& lengths, Align ali
} }
} // namespace ck } // namespace ck
#endif