Commit d789a53d authored by Chao Liu

update example

parent 92a0945d
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_convnd_fwd_nwc_kxc_nwk_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "parse_conv_parameter.hpp"
template <ck::index_t NDimSpatial,
          typename InDataType,
          typename WeiDataType,
          typename OutDataType,
          typename AccDataType,
          typename InLayout,
          typename WeiLayout,
          typename OutLayout,
          typename InElementOp,
          typename WeiElementOp,
          typename OutElementOp,
          typename DeviceConvNDFwdInstance,
          typename ReferenceConvNDFwdInstance>
int run_conv_fwd(const ck::tensor_operation::device::ConvParams& params,
                 bool do_verification,
                 int init_method,
                 bool time_kernel)
{
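    // Build a host tensor descriptor whose lengths are given in (N, C, spatial...) order
    // while the underlying memory stays in (N, spatial..., C) order, i.e. NHWC for 2-D
    // convolution: the lengths are assembled in NHWC order and then re-indexed through
    // transpose_host_tensor_descriptor_given_new2old. Note that the {0, 3, 1, 2} mapping
    // assumes exactly two spatial dimensions.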
    auto f_nchw_host_tensor_descriptor =
        [](ck::index_t n, ck::index_t c, std::vector<ck::index_t> spatial_lengths) {
            std::vector<std::size_t> nhwc_lengths{static_cast<std::size_t>(n),
                                                  static_cast<std::size_t>(c)};
            nhwc_lengths.insert(
                nhwc_lengths.begin() + 1, spatial_lengths.begin(), spatial_lengths.end());

            return transpose_host_tensor_descriptor_given_new2old(
                HostTensorDescriptor(nhwc_lengths), std::vector<std::size_t>({0, 3, 1, 2}));
        };
    Tensor<InDataType> input(
        f_nchw_host_tensor_descriptor(params.N_, params.C_, params.input_spatial_lengths_));
    Tensor<WeiDataType> weights(
        f_nchw_host_tensor_descriptor(params.K_, params.C_, params.filter_spatial_lengths_));
    Tensor<OutDataType> host_output(
        f_nchw_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
    Tensor<OutDataType> device_output(
        f_nchw_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
std::cout << "input: " << input.mDesc << std::endl;
std::cout << "weights: " << weights.mDesc << std::endl;
std::cout << "output: " << host_output.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
break;
default:
input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
weights.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
}
DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace());
in_device_buf.ToDevice(input.mData.data());
wei_device_buf.ToDevice(weights.mData.data());
    // do Conv
    auto conv     = DeviceConvNDFwdInstance{};
    auto invoker  = conv.MakeInvoker();
    auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                      static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                      static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
                                      params.N_,
                                      params.K_,
                                      params.C_,
                                      params.input_spatial_lengths_,
                                      params.filter_spatial_lengths_,
                                      params.GetOutputSpatialLengths(),
                                      params.conv_filter_strides_,
                                      params.conv_filter_dilations_,
                                      params.input_left_pads_,
                                      params.input_right_pads_,
                                      InElementOp{},
                                      WeiElementOp{},
                                      OutElementOp{});
    if(!conv.IsSupportedArgument(argument))
    {
        throw std::runtime_error(
            "wrong! device_conv with the specified compilation parameters does "
            "not support this Conv problem");
    }

    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

    std::size_t flop      = params.GetFlops();
    std::size_t num_btype = params.GetByte<InDataType, WeiDataType, OutDataType>();

    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_btype / 1.E6 / ave_time;

    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
              << conv.GetTypeString() << std::endl;
    if(do_verification)
    {
        auto ref_conv    = ReferenceConvNDFwdInstance();
        auto ref_invoker = ref_conv.MakeInvoker();

        auto ref_argument = ref_conv.MakeArgument(input,
                                                  weights,
                                                  host_output,
                                                  params.conv_filter_strides_,
                                                  params.conv_filter_dilations_,
                                                  params.input_left_pads_,
                                                  params.input_right_pads_,
                                                  InElementOp{},
                                                  WeiElementOp{},
                                                  OutElementOp{});

        ref_invoker.Run(ref_argument);

        out_device_buf.FromDevice(device_output.mData.data());

        return ck::utils::check_err(host_output.mData,
                                    device_output.mData,
                                    "Error: incorrect results!",
                                    1e-5f,
                                    1e-4f)
                   ? 0
                   : 1;
    }

    return 0;
}
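// Usage sketch (illustrative only, not part of this helper): an example translation unit is
// expected to define the data types, layouts, element-wise operations and the device/reference
// instances, and then dispatch on the spatial rank, e.g.
//
//     if(num_dim_spatial == 2)
//     {
//         return run_conv_fwd<2,
//                             InDataType, WeiDataType, OutDataType, AccDataType,
//                             InLayout, WeiLayout, OutLayout,
//                             InElementOp, WeiElementOp, OutElementOp,
//                             DeviceConvNDFwdInstance<2>,
//                             ReferenceConvNDFwdInstance<2>>(
//             params, do_verification, init_method, time_kernel);
//     }
//
// as shown in the refactored example further down in this commit.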
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_convnd_fwd_nwc_kxc_nwk_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "parse_conv_parameter.hpp"
using InDataType = ck::bhalf_t;
using WeiDataType = ck::bhalf_t;
using OutDataType = ck::bhalf_t;
using AccDataType = float;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InLayout = ck::tensor_layout::convolution::NHWC;
using WeiLayout = ck::tensor_layout::convolution::KYXC;
using OutLayout = ck::tensor_layout::convolution::NHWK;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

template <ck::index_t NumDimSpatial>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwcKxcNwk_Xdl<
    InDataType,     //
    WeiDataType,    //
    OutDataType,    //
    AccDataType,    //
    InElementOp,    // Input Elementwise Operation
    WeiElementOp,   // Weights Elementwise Operation
    OutElementOp,   // Output Elementwise Operation
    ConvFwdDefault, // ConvForwardSpecialization
    NumDimSpatial,  // NumDimSpatial
    256,            // BlockSize
    128,            // MPerBlock
    256,            // NPerBlock
    4,              // K0PerBlock
    8,              // K1
    32,             // MPerXdl
    32,             // NPerXdl
    2,              // MXdlPerWave
    4,              // NXdlPerWave
    S<4, 64, 1>,    // ABlockTransferThreadClusterLengths_K0_M_K1
    S<1, 0, 2>,     // ABlockTransferThreadClusterArrangeOrder
    S<1, 0, 2>,     // ABlockTransferSrcAccessOrder
    2,              // ABlockTransferSrcVectorDim
    8,              // ABlockTransferSrcScalarPerVector
    8,              // ABlockTransferDstScalarPerVector_K1
    true,           // ABlockLdsAddExtraM
    S<4, 64, 1>,    // BBlockTransferThreadClusterLengths_K0_N_K1
    S<1, 0, 2>,     // BBlockTransferThreadClusterArrangeOrder
    S<1, 0, 2>,     // BBlockTransferSrcAccessOrder
    2,              // BBlockTransferSrcVectorDim
    8,              // BBlockTransferSrcScalarPerVector
    8,              // BBlockTransferDstScalarPerVector_K1
    true,           // BBlockLdsAddExtraN
    7,              // CThreadTransferSrcDstVectorDim
    1>;             // CThreadTransferDstScalarPerVector

template <ck::index_t NumDimSpatial>
using ReferenceConvNDFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
                                                                                WeiDataType,
                                                                                OutDataType,
                                                                                InElementOp,
                                                                                WeiElementOp,
                                                                                OutElementOp,
                                                                                NumDimSpatial>;
int main(int argc, char* argv[])
{
    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = true;
    int num_dim_spatial  = 2;

    ck::tensor_operation::device::ConvParams params;

    if(argc >= 5)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
        num_dim_spatial = std::stoi(argv[4]);
    }

    if(argc >= 6)
    {
        params = parse_conv_params(num_dim_spatial, argc, argv);
    }
    auto f_nchw_host_tensor_descriptor =
        [](ck::index_t n, ck::index_t c, std::vector<ck::index_t> spatial_lengths) {
            std::vector<std::size_t> nhwc_lengths{static_cast<std::size_t>(n),
                                                  static_cast<std::size_t>(c)};
            nhwc_lengths.insert(
                nhwc_lengths.begin() + 1, spatial_lengths.begin(), spatial_lengths.end());

            return transpose_host_tensor_descriptor_given_new2old(
                HostTensorDescriptor(nhwc_lengths), std::vector<std::size_t>({0, 3, 1, 2}));
        };
    Tensor<InDataType> input(
        f_nchw_host_tensor_descriptor(params.N_, params.C_, params.input_spatial_lengths_));
    Tensor<WeiDataType> weights(
        f_nchw_host_tensor_descriptor(params.K_, params.C_, params.filter_spatial_lengths_));
    Tensor<OutDataType> host_output(
        f_nchw_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));
    Tensor<OutDataType> device_output(
        f_nchw_host_tensor_descriptor(params.N_, params.K_, params.GetOutputSpatialLengths()));

    std::cout << "input: " << input.mDesc << std::endl;
    std::cout << "weights: " << weights.mDesc << std::endl;
    std::cout << "output: " << host_output.mDesc << std::endl;
    switch(init_method)
    {
    case 0: break;
    case 1:
        input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
        weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
        break;
    default:
        input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
        weights.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
    }

    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
    DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace());

    in_device_buf.ToDevice(input.mData.data());
    wei_device_buf.ToDevice(weights.mData.data());

    // do Conv
    auto conv     = DeviceConvNDFwdInstance<2>{};
    auto invoker  = conv.MakeInvoker();
    auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                      static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                      static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
                                      params.N_,
                                      params.K_,
                                      params.C_,
                                      params.input_spatial_lengths_,
                                      params.filter_spatial_lengths_,
                                      params.GetOutputSpatialLengths(),
                                      params.conv_filter_strides_,
                                      params.conv_filter_dilations_,
                                      params.input_left_pads_,
                                      params.input_right_pads_,
                                      InElementOp{},
                                      WeiElementOp{},
                                      OutElementOp{});

    if(!conv.IsSupportedArgument(argument))
    {
        throw std::runtime_error(
            "wrong! device_conv with the specified compilation parameters does "
            "not support this Conv problem");
    }

    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

    std::size_t flop      = params.GetFlops();
    std::size_t num_btype = params.GetByte<InDataType, WeiDataType, OutDataType>();

    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_btype / 1.E6 / ave_time;

    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
              << conv.GetTypeString() << std::endl;
    if(do_verification)
    {
        auto verify_f = [&input, &weights, &host_output, &params, &out_device_buf, &device_output](
                            const auto& ref_conv) {
            auto ref_invoker  = ref_conv.MakeInvoker();
            auto ref_argument = ref_conv.MakeArgument(input,
                                                      weights,
                                                      host_output,
                                                      params.conv_filter_strides_,
                                                      params.conv_filter_dilations_,
                                                      params.input_left_pads_,
                                                      params.input_right_pads_,
                                                      InElementOp{},
                                                      WeiElementOp{},
                                                      OutElementOp{});

            ref_invoker.Run(ref_argument);

            out_device_buf.FromDevice(device_output.mData.data());

            return ck::utils::check_err(host_output.mData,
                                        device_output.mData,
                                        "Error: incorrect results!",
                                        1e-5f,
                                        1e-4f)
                       ? 0
                       : 1;
        };

        switch(num_dim_spatial)
        {
        case 1: {
            auto ref_conv = ReferenceConvNDFwdInstance<1>();
            return verify_f(ref_conv);
        }
        case 2: {
            auto ref_conv = ReferenceConvNDFwdInstance<2>();
            return verify_f(ref_conv);
        }
        case 3: {
            auto ref_conv = ReferenceConvNDFwdInstance<3>();
            return verify_f(ref_conv);
        }
        default: {
            throw std::runtime_error("Unsupported number of spatial dimensions provided!");
        }
        }
    }

    return 0;
}
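// Hypothetical invocation of the resulting example binary (the binary name is illustrative;
// the trailing convolution shape arguments are consumed by parse_conv_params):
//
//     ./convnd_fwd_xdl_bf16            // run the default 2-D problem
//     ./convnd_fwd_xdl_bf16 1 1 1 2    // arg1: verification (0=no, 1=yes)
//                                      // arg2: initialization (0=no init, 1=integer, 2=decimal)
//                                      // arg3: time kernel (0=no, 1=yes)
//                                      // arg4: number of spatial dimensions
//     ./convnd_fwd_xdl_bf16 1 1 1 2 <convolution shape arguments...>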
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "convnd_fwd_common.hpp"

using InDataType  = ck::half_t;
using WeiDataType = ck::half_t;

@@ -87,151 +70,84 @@ using ReferenceConvNDFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<

int main(int argc, char* argv[])
{
    print_helper_msg();

    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = false;
    int num_dim_spatial  = 2;

    ck::tensor_operation::device::ConvParams params;

    if(argc == 1)
    {
        // use default
    }
    else if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
    }
    else
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
        num_dim_spatial = std::stoi(argv[4]);

        params = parse_conv_params(num_dim_spatial, argc, argv);
    }

    if(num_dim_spatial == 1)
    {
        return run_conv_fwd<1,
                            InDataType,
                            WeiDataType,
                            OutDataType,
                            AccDataType,
                            InLayout,
                            WeiLayout,
                            OutLayout,
                            InElementOp,
                            WeiElementOp,
                            OutElementOp,
                            DeviceConvNDFwdInstance<1>,
                            ReferenceConvNDFwdInstance<1>>(
            params, do_verification, init_method, time_kernel);
    }
    else if(num_dim_spatial == 2)
    {
        return run_conv_fwd<2,
                            InDataType,
                            WeiDataType,
                            OutDataType,
                            AccDataType,
                            InLayout,
                            WeiLayout,
                            OutLayout,
                            InElementOp,
                            WeiElementOp,
                            OutElementOp,
                            DeviceConvNDFwdInstance<2>,
                            ReferenceConvNDFwdInstance<2>>(
            params, do_verification, init_method, time_kernel);
    }
    else if(num_dim_spatial == 3)
    {
        return run_conv_fwd<3,
                            InDataType,
                            WeiDataType,
                            OutDataType,
                            AccDataType,
                            InLayout,
                            WeiLayout,
                            OutLayout,
                            InElementOp,
                            WeiElementOp,
                            OutElementOp,
                            DeviceConvNDFwdInstance<3>,
                            ReferenceConvNDFwdInstance<3>>(
            params, do_verification, init_method, time_kernel);
    }

    return 0;
}
@@ -52,7 +52,7 @@ parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[])
     return params;
 }
-void print_use_msg()
+void print_helper_msg()
 {
     std::cout << "arg1: verification (0=no, 1=yes)\n"
               << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
...
@@ -2,6 +2,7 @@
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/device_utility/hip_check_error.hpp"
#include "ck/library/utility/device_memory.hpp"

DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
...