Commit 95a83c6e authored by Adam Osewski

Merge remote-tracking branch 'origin/develop' into wavelet_model

parents 5b7c2432 892a8d76
@@ -6,6 +6,8 @@
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp"
+#include "ck/library/utility/literals.hpp"

 using F16 = ck::half_t;
 using F32 = float;
@@ -135,15 +137,15 @@ int main(int argc, char* argv[])
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
             if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
             }
             else
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
             }
         };
@@ -240,7 +242,7 @@ int main(int argc, char* argv[])
             show_2d_matrix(std::cout << "c_host :", c_m_n_host_result) << std::endl;
         }
 #endif
-        ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+        ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
     }

     return 0;
......
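The tidied HostTensorDescriptor calls above rely on the `_uz` user-defined literal pulled in via ck/library/utility/literals.hpp, so that every element of the stride initializer list deduces to std::size_t. A minimal sketch of such a literal; the in-tree definition may differ in detail:

#include <cstddef>

namespace ck {
namespace literals {
// 1_uz, 128_uz, ... yield std::size_t instead of int.
constexpr std::size_t operator""_uz(unsigned long long n)
{
    return static_cast<std::size_t>(n);
}
} // namespace literals
} // namespace ck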
@@ -32,14 +32,12 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
     {
     case 0: break;
     case 1:
-        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k.begin(),
-                                                                             a_m_k.end());
-        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n.begin(),
-                                                                             b_k_n.end());
+        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
+        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
         break;
     default:
-        ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k.begin(), a_m_k.end());
-        ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n.begin(), b_k_n.end());
+        ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
+        ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
     }

     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
@@ -133,11 +131,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
     c_m_n_device_result = c_m_n_device_result_converted.CopyAsType<CDataType>();

-    return ck::utils::check_err(c_m_n_device_result_converted.mData, c_m_n_host_result.mData);
+    return ck::utils::check_err(c_m_n_device_result_converted, c_m_n_host_result);
 #else
     c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());

-    return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+    return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
 #endif
 }
......
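The fill functors are now invoked on the tensor itself instead of an iterator pair. A sketch of how a functor can support both call styles; this is a hypothetical shape, the real ck::utils functors may differ:

#include <algorithm>
#include <random>

template <typename T>
struct FillUniformDistribution
{
    float a_{-1.f};
    float b_{1.f};

    // Original iterator-pair interface.
    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        std::mt19937 gen(11939); // fixed seed keeps test inputs reproducible
        std::uniform_real_distribution<float> dis(a_, b_);
        std::generate(first, last, [&] { return static_cast<T>(dis(gen)); });
    }

    // Range overload: lets call sites pass the Tensor directly.
    template <typename ForwardRange>
    void operator()(ForwardRange&& range) const
    {
        (*this)(range.begin(), range.end());
    }
};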
@@ -14,6 +14,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -177,15 +178,15 @@ int main(int argc, char* argv[])
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
             if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
            }
             else
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
             }
         };
@@ -271,8 +272,7 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
-        Tensor<CShuffleDataType> c_m_n(HostTensorDescriptor(
-            std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
+        Tensor<CShuffleDataType> c_m_n({M, N});

         using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                 BDataType,
@@ -299,7 +299,7 @@ int main(int argc, char* argv[])
         e_device_buf.FromDevice(e_m_n_device_result.mData.data());

-        return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData) ? 0 : 1;
+        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
     }

     return 0;
......
@@ -15,6 +15,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -155,15 +156,15 @@ int main(int argc, char* argv[])
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
             if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
             }
             else
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
             }
         };
@@ -275,7 +276,7 @@ int main(int argc, char* argv[])
         }
     }

-    return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData) ? 0 : 1;
+    return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
 }

 return 0;
......
@@ -124,7 +124,7 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionConfig& config)
     if(config.do_verification)
     {
-        Tensor<AccDataType> c_m_n(HostTensorDescriptor{M, N});
+        Tensor<AccDataType> c_m_n({M, N});

         auto ref_gemm    = ReferenceGemmInstance{};
         auto ref_invoker = ref_gemm.MakeInvoker();
@@ -147,9 +147,9 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionConfig& config)
 #ifdef BUILD_INT4_EXAMPLE
         const Tensor<EDataType> e_m_n_device_result_converted(e_m_n_device_result);

-        return ck::utils::check_err(e_m_n_device_result_converted.mData, e_m_n_host_result.mData);
+        return ck::utils::check_err(e_m_n_device_result_converted, e_m_n_host_result);
 #else
-        return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
+        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
 #endif
     }
......
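Every verification site in this commit now hands the Tensor objects straight to ck::utils::check_err instead of their .mData vectors. A plausible range-based declaration for such an overload; the actual signature and default tolerances live in ck/library/utility/check_err.hpp and are assumptions here:

#include <string>

namespace ck {
namespace utils {
// Compares two ranges element-wise within relative/absolute tolerances,
// printing msg on the first mismatch.
template <typename Range, typename RefRange>
bool check_err(const Range& out,
               const RefRange& ref,
               const std::string& msg = "Error: incorrect results!",
               double rtol            = 1e-5,
               double atol            = 3e-6);
} // namespace utils
} // namespace ck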
@@ -4,3 +4,7 @@ add_example_executable(example_convnd_fwd_xdl_bf16 convnd_fwd_xdl_bf16.cpp)
 add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)
 # FIXME: re-enable this exampe as test when SWDEV-335738 is fixed
 add_example_executable_no_testing(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
+add_example_executable(example_convnd_fwd_dl_fp16 convnd_fwd_dl_fp16.cpp)
+add_example_executable(example_convnd_fwd_dl_fp32 convnd_fwd_dl_fp32.cpp)
+add_example_executable(example_convnd_fwd_dl_int8 convnd_fwd_dl_int8.cpp)
@@ -10,6 +10,7 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
@@ -84,7 +85,7 @@ bool run_grouped_conv_fwd(bool do_verification,
     std::array<ck::index_t, NDimSpatial> input_left_pads{};
     std::array<ck::index_t, NDimSpatial> input_right_pads{};

-    auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
+    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };

     copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
     copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
@@ -164,7 +165,7 @@ bool run_grouped_conv_fwd(bool do_verification,
         out_device_buf.FromDevice(out_device.mData.data());

         return ck::utils::check_err(
-            out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
+            out_device, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
     }

     return true;
......
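The iterator-pair std::copy calls across these examples are replaced by ck::ranges::copy from the new ck/library/utility/algorithm.hpp header. The local copy helper this commit deletes from the conv common header further down has exactly this shape, so a faithful sketch is:

#include <algorithm>
#include <iterator>

namespace ck {
namespace ranges {
// Range-based wrapper over std::copy: copies all of range to iter.
template <typename InputRange, typename OutputIterator>
auto copy(InputRange&& range, OutputIterator iter)
    -> decltype(std::copy(std::begin(range), std::end(range), iter))
{
    return std::copy(std::begin(range), std::end(range), iter);
}
} // namespace ranges
} // namespace ck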
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

+#include <cstdlib>
 #include <iostream>
 #include <numeric>
-#include <initializer_list>
-#include <cstdlib>
+#include <type_traits>

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"

 void print_helper_msg()
 {
@@ -33,8 +34,8 @@ template <ck::index_t NDimSpatial,
           typename InElementOp,
           typename WeiElementOp,
           typename OutElementOp,
-          typename DeviceConvBwdWeightInstance>
-int run_conv_bwd_weight(bool do_verification,
+          typename DeviceConvNDFwdInstance>
+bool run_grouped_conv_fwd_dl(bool do_verification,
                         int init_method,
                         bool time_kernel,
                         const ck::utils::conv::ConvParam& conv_param,
@@ -43,67 +44,87 @@ int run_conv_bwd_weight(bool do_verification,
                         const HostTensorDescriptor& out_g_n_k_wos_desc,
                         const InElementOp& in_element_op,
                         const WeiElementOp& wei_element_op,
-                        const OutElementOp& out_element_op,
-                        ck::index_t split_k)
+                        const OutElementOp& out_element_op)
 {
     Tensor<InDataType> in(in_g_n_c_wis_desc);
-    Tensor<WeiDataType> wei_host_result(wei_g_k_c_xs_desc);
-    Tensor<WeiDataType> wei_device_result(wei_g_k_c_xs_desc);
-    Tensor<OutDataType> out(out_g_n_k_wos_desc);
+    Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
+    Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
+    Tensor<OutDataType> out_device(out_g_n_k_wos_desc);

     std::cout << "in: " << in.mDesc << std::endl;
-    std::cout << "wei: " << wei_host_result.mDesc << std::endl;
-    std::cout << "out: " << out.mDesc << std::endl;
+    std::cout << "wei: " << wei.mDesc << std::endl;
+    std::cout << "out: " << out_host.mDesc << std::endl;

     switch(init_method)
     {
     case 0: break;
     case 1:
         in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
-        out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
         break;
-    default:
+    case 2:
         in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
-        out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
+        wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+        break;
+    default:
+        in.GenerateTensorValue(GeneratorTensor_1<InDataType>{1});
+        wei.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
     }

     DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_device_result.mDesc.GetElementSpaceSize());
-    DeviceMem out_device_buf(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());

     in_device_buf.ToDevice(in.mData.data());
-    out_device_buf.ToDevice(out.mData.data());
-
-    // init to 0
-    wei_device_buf.SetZero();
-
-    // do GEMM
-    auto conv = DeviceConvBwdWeightInstance{};
+    wei_device_buf.ToDevice(wei.mData.data());
+
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> c_g_n_k_wos_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> c_g_n_k_wos_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+
+    auto copy = [](auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
+
+    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
+    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
+    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
+    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
+    copy(out_g_n_k_wos_desc.GetLengths(), c_g_n_k_wos_lengths);
+    copy(out_g_n_k_wos_desc.GetStrides(), c_g_n_k_wos_strides);
+    copy(conv_param.conv_filter_strides_, conv_filter_strides);
+    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_param.input_left_pads_, input_left_pads);
+    copy(conv_param.input_right_pads_, input_right_pads);
+
+    // do Conv
+    auto conv = DeviceConvNDFwdInstance{};
     auto invoker = conv.MakeInvoker();

-    auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-                                      static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-                                      static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                      conv_param.N_,
-                                      conv_param.K_,
-                                      conv_param.C_,
-                                      conv_param.input_spatial_lengths_,
-                                      conv_param.filter_spatial_lengths_,
-                                      conv_param.output_spatial_lengths_,
-                                      conv_param.conv_filter_strides_,
-                                      conv_param.conv_filter_dilations_,
-                                      conv_param.input_left_pads_,
-                                      conv_param.input_right_pads_,
-                                      in_element_op,
-                                      wei_element_op,
-                                      out_element_op,
-                                      split_k);
+    auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
+                                      wei_device_buf.GetDeviceBuffer(),
+                                      out_device_buf.GetDeviceBuffer(),
+                                      a_g_n_c_wis_lengths,
+                                      a_g_n_c_wis_strides,
+                                      b_g_k_c_xs_lengths,
+                                      b_g_k_c_xs_strides,
+                                      c_g_n_k_wos_lengths,
+                                      c_g_n_k_wos_strides,
+                                      conv_filter_strides,
+                                      conv_filter_dilations,
+                                      input_left_pads,
+                                      input_right_pads,
+                                      in_element_op,
+                                      wei_element_op,
+                                      out_element_op);

     if(!conv.IsSupportedArgument(argument))
     {
-        std::cout << "wrong! device_conv with the specified compilation parameters does "
-                     "not support this Conv problem"
-                  << std::endl;
-
-        return 1;
+        return true;
     }

     float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
@@ -112,41 +133,39 @@ int run_conv_bwd_weight(bool do_verification,
     std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();

     float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
     float gb_per_sec = num_btype / 1.E6 / avg_time;
-    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
               << conv.GetTypeString() << std::endl;

     if(do_verification)
     {
-        auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
+        auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
                                                                      InDataType,
                                                                      WeiDataType,
                                                                      OutDataType,
                                                                      InElementOp,
                                                                      WeiElementOp,
-                                                                     OutElementOp>{};
+                                                                     OutElementOp>();

         auto ref_invoker  = ref_conv.MakeInvoker();
         auto ref_argument = ref_conv.MakeArgument(in,
-                                                  wei_host_result,
-                                                  out,
+                                                  wei,
+                                                  out_host,
                                                   conv_param.conv_filter_strides_,
                                                   conv_param.conv_filter_dilations_,
                                                   conv_param.input_left_pads_,
                                                   conv_param.input_right_pads_,
-                                                  InElementOp{},
-                                                  WeiElementOp{},
-                                                  OutElementOp{});
+                                                  in_element_op,
+                                                  wei_element_op,
+                                                  out_element_op);

         ref_invoker.Run(ref_argument);

-        wei_device_buf.FromDevice(wei_device_result.mData.data());
-
-        return ck::utils::check_err(wei_device_result.mData, wei_host_result.mData) ? 0 : 1;
+        out_device_buf.FromDevice(out_device.mData.data());
+
+        return ck::utils::check_err(
+            out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
     }

-    return 0;
+    return true;
 }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "convnd_fwd_dl_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using AccDataType = float;
using OutDataType = ck::half_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmPadingSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
// clang-format off
using DeviceGroupedConvNDFwdInstance = ck::tensor_operation::device::DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK
// ######| NDim| InData| WeiData| OutData| AccData| InLayout| WeiLayout| OutLayout| In| Wei| Out| Convolution| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ######| Spatial| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Forward| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ######| | | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | |
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< NDimSpatial, InDataType, WeiDataType, OutDataType, AccDataType, InLayout, WeiLayout, OutLayout, InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmPadingSpec, 256, 128, 128, 16, 2, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>;
// clang-format on
#include "run_convnd_fwd_dl_example.inc"
int main(int argc, char* argv[]) { return run_convnd_fwd_dl_example(argc, argv) ? 0 : 1; }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "convnd_fwd_dl_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = float;
using WeiDataType = float;
using AccDataType = float;
using OutDataType = float;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmPadingSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
// clang-format off
using DeviceGroupedConvNDFwdInstance = ck::tensor_operation::device::DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK
// ######| NDim| InData| WeiData| OutData| AccData| InLayout| WeiLayout| OutLayout| In| Wei| Out| Convolution| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ######| Spatial| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Forward| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ######| | | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | |
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< NDimSpatial, InDataType, WeiDataType, OutDataType, AccDataType, InLayout, WeiLayout, OutLayout, InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmPadingSpec, 256, 128, 128, 16, 1, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 1>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 1, 1, 1>, S<8, 1, 1, 1>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>, 5, 4>;
// clang-format on
#include "run_convnd_fwd_dl_example.inc"
int main(int argc, char* argv[]) { return run_convnd_fwd_dl_example(argc, argv) ? 0 : 1; }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "convnd_fwd_dl_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = int8_t;
using WeiDataType = int8_t;
using AccDataType = int32_t;
using OutDataType = int8_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmPadingSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
// clang-format off
using DeviceGroupedConvNDFwdInstance = ck::tensor_operation::device::DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK
// ######| NDim| InData| WeiData| OutData| AccData| InLayout| WeiLayout| OutLayout| In| Wei| Out| Convolution| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ######| Spatial| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Forward| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ######| | | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | |
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< NDimSpatial, InDataType, WeiDataType, OutDataType, AccDataType, InLayout, WeiLayout, OutLayout, InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmPadingSpec, 256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>, 5, 4>;
// clang-format on
#include "run_convnd_fwd_dl_example.inc"
int main(int argc, char* argv[]) { return run_convnd_fwd_dl_example(argc, argv) ? 0 : 1; }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
bool run_convnd_fwd_dl_example(int argc, char* argv[])
{
print_helper_msg();
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
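// Default problem. Assumed ck::utils::conv::ConvParam field order:
// num_dim_spatial, G, N, K, C, filter_spatial_lengths, input_spatial_lengths,
// conv_filter_strides, conv_filter_dilations, input_left_pads, input_right_pads.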
ck::utils::conv::ConvParam conv_param{
2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc == 1)
{
// use default
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
}
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
const auto run = [&](auto ndim_spatial, auto in_layout, auto wei_layout, auto out_layout) {
constexpr ck::index_t ndim_spatial_value = ndim_spatial.value;
std::cout << "ndim_spatial_value: " << ndim_spatial_value << std::endl;
using InLayout = decltype(in_layout);
using WeiLayout = decltype(wei_layout);
using OutLayout = decltype(out_layout);
const auto in_g_n_c_wis_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
conv_param);
const auto wei_g_k_c_xs_desc =
ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
conv_param);
const auto out_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
conv_param);
return run_grouped_conv_fwd_dl<
ndim_spatial_value,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<ndim_spatial_value, InLayout, WeiLayout, OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
};
namespace ctc = ck::tensor_layout::convolution;
if(conv_param.num_dim_spatial_ == 1)
{
return run(ck::Number<1>{}, ctc::GNWC{}, ctc::GKXC{}, ctc::GNWK{});
}
else if(conv_param.num_dim_spatial_ == 2)
{
return run(ck::Number<2>{}, ctc::GNHWC{}, ctc::GKYXC{}, ctc::GNHWK{});
}
else if(conv_param.num_dim_spatial_ == 3)
{
return run(ck::Number<3>{}, ctc::GNDHWC{}, ctc::GKZYXC{}, ctc::GNDHWK{});
}
return true;
}
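Given the argument handling above, each new example runs either with built-in defaults or as `<exe> <do_verification> <init_method> <time_kernel>`, optionally followed by `<num_dim_spatial>` and the convolution sizes consumed by ck::utils::conv::parse_conv_param. A hypothetical invocation of the fp16 variant with verification, integer init, and kernel timing enabled:

./example_convnd_fwd_dl_fp16 1 1 1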
@@ -16,6 +16,7 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
@@ -140,9 +141,7 @@ make_r0_host_tensor_descriptor(const ck::utils::conv::ConvParam& problem_size)
 {
     std::vector<ck::index_t> dimensions{problem_size.G_, problem_size.N_};
-    std::copy(begin(problem_size.output_spatial_lengths_),
-              end(problem_size.output_spatial_lengths_),
-              std::back_inserter(dimensions));
+    ck::ranges::copy(problem_size.output_spatial_lengths_, std::back_inserter(dimensions));

     return HostTensorDescriptor(dimensions);
 }
@@ -158,10 +157,3 @@ void unpack_host_tensor_descriptor(const HostTensorDescriptor& descriptor,
     assert(size(descriptor.GetStrides()) == size(strides));
     std::copy_n(begin(descriptor.GetStrides()), size(descriptor.GetStrides()), begin(strides));
 }
-
-template <typename Range, typename OutputIterator>
-auto copy(const Range& range, OutputIterator iter)
-    -> decltype(std::copy(std::begin(range), std::end(range), iter))
-{
-    return std::copy(std::begin(range), std::end(range), iter);
-}
@@ -77,15 +77,12 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
     {
     case 0: break;
     case 1:
-        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-8, 7}(conv_input.begin(),
-                                                                         conv_input.end());
-        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-8, 7}(conv_weight.begin(),
-                                                                         conv_weight.end());
+        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-8, 7}(conv_input);
+        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-8, 7}(conv_weight);
         break;
     default:
-        ck::utils::FillUniformDistribution<ADataType>{-5, 5}(conv_input.begin(), conv_input.end());
-        ck::utils::FillUniformDistribution<BDataType>{-5, 5}(conv_weight.begin(),
-                                                             conv_weight.end());
+        ck::utils::FillUniformDistribution<ADataType>{-5, 5}(conv_input);
+        ck::utils::FillUniformDistribution<BDataType>{-5, 5}(conv_weight);
     }

     DeviceMem conv_input_device_buf(sizeof(ADataType) * conv_input.mDesc.GetElementSpaceSize());
@@ -123,10 +120,10 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
         conv_output_g_n_k_wos_desc, conv_output_g_n_k_wos_lengths, conv_output_g_n_k_wos_strides);
     unpack_host_tensor_descriptor(r0_desc, r0_lengths, r0_strides);

-    copy(problem_size.conv_filter_strides_, begin(conv_filter_strides));
-    copy(problem_size.conv_filter_dilations_, begin(conv_filter_dilations));
-    copy(problem_size.input_left_pads_, begin(input_left_pads));
-    copy(problem_size.input_right_pads_, begin(input_right_pads));
+    ck::ranges::copy(problem_size.conv_filter_strides_, begin(conv_filter_strides));
+    ck::ranges::copy(problem_size.conv_filter_dilations_, begin(conv_filter_dilations));
+    ck::ranges::copy(problem_size.input_left_pads_, begin(input_left_pads));
+    ck::ranges::copy(problem_size.input_right_pads_, begin(input_right_pads));

     // run Conv + Reduction on device
     auto conv = DeviceInstance<NDimSpatial>{};
@@ -276,16 +273,13 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
         conv_output_device_buf.FromDevice(conv_output_device.mData.data());
         r0_device_buf.FromDevice(r0_device.mData.data());

-        return ck::utils::check_err(conv_output_device.mData,
-                                    conv_output_host.mData,
+        return ck::utils::check_err(conv_output_device,
+                                    conv_output_host,
                                     "Error: incorrect results! (Matrix E)",
                                     1e-5f,
                                     1e-4f) &&
-               ck::utils::check_err(r0_device.mData,
-                                    r0_host.mData,
-                                    "Error: incorrect results! (Matrix R0)",
-                                    1e-5f,
-                                    1e-4f);
+               ck::utils::check_err(
+                   r0_device, r0_host, "Error: incorrect results! (Matrix R0)", 1e-5f, 1e-4f);
     }

     return true;
......
@@ -140,6 +140,10 @@ bool reduce_blockwise_test(bool do_verification,
         if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size())
             return;

+        std::array<int, ShapeType::NumReduceDim_> arrReduceDims;
+
+        ck::ranges::copy(reduceDims, arrReduceDims.begin());
+
         result = reduce_blockwise_impl<InOutDataType,
                                        AccDataType,
                                        ReduceOpId,
@@ -147,7 +151,7 @@ bool reduce_blockwise_test(bool do_verification,
                                        ShapeType::NumReduceDim_,
                                        PropagateNan,
                                        OutputIndex>(
-            do_verification, init_method, time_kernel, inLengths, reduceDims, alpha, beta);
+            do_verification, init_method, time_kernel, inLengths, arrReduceDims, alpha, beta);

         matched = true;
     });
......
@@ -10,6 +10,7 @@
 #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
@@ -30,7 +31,7 @@ int reduce_blockwise_impl(bool do_verification,
                           int init_method,
                           bool time_kernel,
                           const std::vector<size_t>& inLengths,
-                          const std::vector<int>& reduceDims,
+                          const std::array<int, NumReduceDim>& reduceDims,
                           float alpha,
                           float beta)
@@ -38,6 +39,8 @@ int reduce_blockwise_impl(bool do_verification,
     using namespace ck;
     using namespace ck::tensor_operation::device;

+    constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim;
+
     constexpr bool op_support_indices =
         (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
          ReduceOpId == ReduceTensorOp::AMAX);
@@ -143,7 +146,7 @@ int reduce_blockwise_impl(bool do_verification,
     std::vector<size_t> outLengths;

-    std::vector<int> invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);
+    auto invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);

     if(invariantDims.empty())
         outLengths.push_back(1);
@@ -256,22 +259,22 @@ int reduce_blockwise_impl(bool do_verification,
                               acc_elementwise_op);
     };

-    std::vector<ck::index_t> i_inLengths;
-    std::vector<ck::index_t> i_inStrides;
-    std::vector<ck::index_t> i_outLengths;
-    std::vector<ck::index_t> i_outStrides;
-
-    i_inLengths.assign(inLengths.begin(), inLengths.end());
-    i_inStrides.assign(inStrides.begin(), inStrides.end());
-    i_outLengths.assign(outLengths.begin(), outLengths.end());
-    i_outStrides.assign(outStrides.begin(), outStrides.end());
+    std::array<index_t, Rank> arrInLengths;
+    std::array<index_t, Rank> arrInStrides;
+    std::array<index_t, NumOutDim> arrOutLengths;
+    std::array<index_t, NumOutDim> arrOutStrides;
+
+    ck::ranges::copy(inLengths, arrInLengths.begin());
+    ck::ranges::copy(inStrides, arrInStrides.begin());
+    ck::ranges::copy(outLengths, arrOutLengths.begin());
+    ck::ranges::copy(outStrides, arrOutStrides.begin());

     auto reduce = DeviceReduceInstance{};

-    auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths,
-                                                   i_inStrides,
-                                                   i_outLengths,
-                                                   i_outStrides,
+    auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths,
+                                                   arrInStrides,
+                                                   arrOutLengths,
+                                                   arrOutStrides,
                                                    reduceDims,
                                                    alpha,
                                                    beta,
@@ -322,12 +325,12 @@ int reduce_blockwise_impl(bool do_verification,
 #endif

         out_dev.FromDevice(out.mData.data());
-        pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
+        pass = pass && ck::utils::check_err(out, out_ref);

         if(OutputIndex)
         {
             out_index_dev.FromDevice(out_indices.mData.data());
-            pass = pass && ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
+            pass = pass && ck::utils::check_err(out_indices, out_indices_ref);
         };
     };
......
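The reduction impls now hand MakeArgumentPointer fixed-size std::arrays instead of std::vectors, so the tensor rank becomes a compile-time property of the call rather than a runtime size to validate. The conversion pattern used throughout the commit, as a fragment that assumes the ck::ranges::copy sketched earlier:

const std::vector<size_t> inLengths = {64, 320, 80, 4, 128}; // runtime configuration
std::array<ck::index_t, 5> arrInLengths{};                   // Rank = 5, fixed at compile time
ck::ranges::copy(inLengths, arrInLengths.begin());           // caller guarantees matching sizes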
@@ -90,15 +90,15 @@ static bool time_kernel;
 int main(int argc, char* argv[])
 {
     // used by the device reduction
-    const std::vector<int> reduceDims_1    = {4};
-    const std::vector<int> invariantDims_1 = {0, 1, 2, 3};
+    const std::array<int, 1> reduceDims_1 = {4};
+    // const std::array<int, 4> invariantDims_1 = {0, 1, 2, 3};

-    const std::vector<int> reduceDims_2    = {3};
-    const std::vector<int> invariantDims_2 = {0, 1, 2};
+    const std::array<int, 1> reduceDims_2 = {3};
+    // const std::array<int, 3> invariantDims_2 = {0, 1, 2};

     // used by the host reduction
-    const std::vector<int> reduceDims    = {3, 4};
-    const std::vector<int> invariantDims = {0, 1, 2};
+    const std::array<int, 2> reduceDims    = {3, 4};
+    const std::array<int, 3> invariantDims = {0, 1, 2};

     const std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};
@@ -214,26 +214,26 @@ int main(int argc, char* argv[])
                               acc_elementwise_op);
     };

-    std::vector<ck::index_t> i_inLengths_1;
-    std::vector<ck::index_t> i_inStrides_1;
-    std::vector<ck::index_t> i_inLengths_2;
-    std::vector<ck::index_t> i_inStrides_2;
-    std::vector<ck::index_t> i_outLengths;
-    std::vector<ck::index_t> i_outStrides;
-
-    i_inLengths_1.assign(inLengths_1.begin(), inLengths_1.end());
-    i_inStrides_1.assign(inStrides_1.begin(), inStrides_1.end());
-    i_inLengths_2.assign(inLengths_2.begin(), inLengths_2.end());
-    i_inStrides_2.assign(inStrides_2.begin(), inStrides_2.end());
-    i_outLengths.assign(outLengths.begin(), outLengths.end());
-    i_outStrides.assign(outStrides.begin(), outStrides.end());
+    std::array<index_t, 5> arrInLengths_1;
+    std::array<index_t, 5> arrInStrides_1;
+    std::array<index_t, 4> arrInLengths_2;
+    std::array<index_t, 4> arrInStrides_2;
+    std::array<index_t, 3> arrOutLengths;
+    std::array<index_t, 3> arrOutStrides;
+
+    ck::ranges::copy(inLengths_1, arrInLengths_1.begin());
+    ck::ranges::copy(inStrides_1, arrInStrides_1.begin());
+    ck::ranges::copy(inLengths_2, arrInLengths_2.begin());
+    ck::ranges::copy(inStrides_2, arrInStrides_2.begin());
+    ck::ranges::copy(outLengths, arrOutLengths.begin());
+    ck::ranges::copy(outStrides, arrOutStrides.begin());

     auto reduce_1 = DeviceReduceInstance_1{};

-    auto argument_ptr_1 = reduce_1.MakeArgumentPointer(i_inLengths_1,
-                                                       i_inStrides_1,
-                                                       i_inLengths_2,
-                                                       i_inStrides_2,
+    auto argument_ptr_1 = reduce_1.MakeArgumentPointer(arrInLengths_1,
+                                                       arrInStrides_1,
+                                                       arrInLengths_2,
+                                                       arrInStrides_2,
                                                        reduceDims_1,
                                                        1.0f,
                                                        0.0f,
@@ -255,10 +255,10 @@ int main(int argc, char* argv[])
     auto reduce_2 = DeviceReduceInstance_2{};

-    auto argument_ptr_2 = reduce_2.MakeArgumentPointer(i_inLengths_2,
-                                                       i_inStrides_2,
-                                                       i_outLengths,
-                                                       i_outStrides,
+    auto argument_ptr_2 = reduce_2.MakeArgumentPointer(arrInLengths_2,
+                                                       arrInStrides_2,
+                                                       arrOutLengths,
+                                                       arrOutStrides,
                                                        reduceDims_2,
                                                        alpha,
                                                        beta,
@@ -294,7 +294,7 @@ int main(int argc, char* argv[])
     if(do_verify)
     {
         out_dev.FromDevice(out.mData.data());
-        pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
+        pass = pass && ck::utils::check_err(out, out_ref);
     };

     return (pass ? 0 : 1);
......
@@ -5,11 +5,10 @@

 #include "ck/ck.hpp"

-template <ck::index_t Rank, ck::index_t NumReduceDim>
-std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
+template <int Rank, int NumReduceDim>
+static inline std::array<int, Rank - NumReduceDim>
+get_invariant_dims(const std::array<int, NumReduceDim>& reduceDims)
 {
-    assert(NumReduceDim == reduceDims.size());
-
     int reduceFlag = 0;

     // flag the bits for the reduceDims
@@ -18,13 +17,15 @@ std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
         reduceFlag |= 1 << reduceDims[i];
     };

-    std::vector<int> invariantDims;
+    std::array<int, Rank - NumReduceDim> invariantDims;

     // collect invariant dimensions
+    int dim = 0;
     for(int i = 0; i < Rank; i++)
         if((reduceFlag & (1 << i)) == 0)
         {
-            invariantDims.push_back(i);
+            invariantDims[dim] = i;
+            dim++;
         };

     return invariantDims;
......
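get_invariant_dims now sizes its result at compile time (Rank - NumReduceDim), so callers get a std::array instead of a heap-allocated vector. Usage, with the values from the multiple-reduction example above:

const std::array<int, 2> reduceDims = {3, 4};
const auto invariantDims = get_invariant_dims<5, 2>(reduceDims); // yields {0, 1, 2}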
@@ -138,13 +138,17 @@ bool reduce_multiblock_atomic_add_test(bool do_verification,
         if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size())
             return;

+        std::array<int, ShapeType::NumReduceDim_> a_reduceDims;
+
+        ck::ranges::copy(reduceDims, a_reduceDims.begin());
+
         result = reduce_multiblock_atomic_add_impl<InOutDataType,
                                                    AccDataType,
                                                    ReduceOpId,
                                                    ShapeType::Rank_,
                                                    ShapeType::NumReduceDim_,
                                                    PropagateNan>(
-            do_verification, init_method, time_kernel, inLengths, reduceDims, alpha, beta);
+            do_verification, init_method, time_kernel, inLengths, a_reduceDims, alpha, beta);

         matched = true;
     });
......
@@ -10,6 +10,7 @@
 #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
@@ -29,7 +30,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
                                       int init_method,
                                       bool time_kernel,
                                       const std::vector<size_t>& inLengths,
-                                      const std::vector<int>& reduceDims,
+                                      const std::array<int, NumReduceDim>& reduceDims,
                                       float alpha,
                                       float beta)
@@ -37,6 +38,8 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
     using namespace ck;
     using namespace ck::tensor_operation::device;

+    constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim;
+
     constexpr bool op_support_atomic_add =
         (ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG);
@@ -84,7 +87,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
     std::vector<size_t> outLengths;

-    std::vector<int> invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);
+    auto invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);

     if(invariantDims.empty())
         outLengths.push_back(1);
@@ -169,22 +172,22 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
                               acc_elementwise_op);
     };

-    std::vector<ck::index_t> i_inLengths;
-    std::vector<ck::index_t> i_inStrides;
-    std::vector<ck::index_t> i_outLengths;
-    std::vector<ck::index_t> i_outStrides;
-
-    i_inLengths.assign(inLengths.begin(), inLengths.end());
-    i_inStrides.assign(inStrides.begin(), inStrides.end());
-    i_outLengths.assign(outLengths.begin(), outLengths.end());
-    i_outStrides.assign(outStrides.begin(), outStrides.end());
+    std::array<index_t, Rank> arrInLengths;
+    std::array<index_t, Rank> arrInStrides;
+    std::array<index_t, NumOutDim> arrOutLengths;
+    std::array<index_t, NumOutDim> arrOutStrides;
+
+    ck::ranges::copy(inLengths, arrInLengths.begin());
+    ck::ranges::copy(inStrides, arrInStrides.begin());
+    ck::ranges::copy(outLengths, arrOutLengths.begin());
+    ck::ranges::copy(outStrides, arrOutStrides.begin());

     auto reduce = DeviceReduceInstance{};

-    auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths,
-                                                   i_inStrides,
-                                                   i_outLengths,
-                                                   i_outStrides,
+    auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths,
+                                                   arrInStrides,
+                                                   arrOutLengths,
+                                                   arrOutStrides,
                                                    reduceDims,
                                                    alpha,
                                                    beta,
@@ -223,7 +226,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
     if(do_verification)
     {
         out_dev.FromDevice(out.mData.data());
-        pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
+        pass = pass && ck::utils::check_err(out, out_ref);
     };

     return (pass ? 0 : 1);
......