Unverified commit e7be2fe8, authored by pmaybank, committed by GitHub

Merge branch 'develop' into sphinx_doc

parents: f68fa79a, f7d28f3e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
void print_helper_msg()
{
std::cout << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg3: time kernel (0=no, 1=yes)\n"
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
}
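// Example invocations (the binary name is illustrative, not from this commit):
//   ./example_grouped_conv_fwd_dl 1 1 0                     // verify, integer init, no timing
//   ./example_grouped_conv_fwd_dl 1 1 1 2 <conv params...>  // 2-D problem given on the command line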
template <ck::index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename DsDataType,
typename OutDataType,
typename InElementOp,
typename WeiElementOp,
typename OutElementOp,
typename DeviceConvNDFwdInstance>
bool run_grouped_conv_fwd_dl(bool do_verification,
int init_method,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param,
const HostTensorDescriptor& in_g_n_c_wis_desc,
const HostTensorDescriptor& wei_g_k_c_xs_desc,
const HostTensorDescriptor& out_g_n_k_wos_desc,
const InElementOp& in_element_op,
const WeiElementOp& wei_element_op,
const OutElementOp& out_element_op)
{
using DDataType = ck::remove_cvref_t<ck::tuple_element_t<0, DsDataType>>;
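// The single D operand serves as a bias; note that it uses the full output descriptor here
// rather than a broadcast (per-channel) shape.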
Tensor<InDataType> in(in_g_n_c_wis_desc);
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
Tensor<DDataType> bias(out_g_n_k_wos_desc);
Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
std::cout << "in: " << in.mDesc << std::endl;
std::cout << "wei: " << wei.mDesc << std::endl;
std::cout << "out: " << out_host.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 3});
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 3});
bias.GenerateTensorValue(GeneratorTensor_2<DDataType>{-2, 3});
break;
case 2:
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
bias.GenerateTensorValue(GeneratorTensor_3<DDataType>{-0.5, 0.5});
break;
default:
in.GenerateTensorValue(GeneratorTensor_1<InDataType>{1});
wei.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{-1});
bias.GenerateTensorValue(GeneratorTensor_1<DDataType>{1});
}
DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
DeviceMem bias_device_buf(sizeof(DDataType) * bias.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
in_device_buf.ToDevice(in.mData.data());
wei_device_buf.ToDevice(wei.mData.data());
bias_device_buf.ToDevice(bias.mData.data());
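// The device op takes lengths/strides as plain std::array, so unpack the host tensor
// descriptors and convolution parameters into fixed-size arrays.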
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
std::array<ck::index_t, NDimSpatial + 3> d_g_n_k_wos_lengths{};
std::array<ck::index_t, NDimSpatial + 3> d_g_n_k_wos_strides{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto copy = [](auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
copy(out_g_n_k_wos_desc.GetLengths(), d_g_n_k_wos_lengths);
copy(out_g_n_k_wos_desc.GetStrides(), d_g_n_k_wos_strides);
copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
copy(conv_param.conv_filter_strides_, conv_filter_strides);
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
copy(conv_param.input_left_pads_, input_left_pads);
copy(conv_param.input_right_pads_, input_right_pads);
// do Conv
auto conv = DeviceConvNDFwdInstance{};
auto invoker = conv.MakeInvoker();
auto argument = conv.MakeArgument(
in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
std::array<const void*, 1>{bias_device_buf.GetDeviceBuffer()},
out_device_buf.GetDeviceBuffer(),
a_g_n_c_wis_lengths,
a_g_n_c_wis_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{{d_g_n_k_wos_lengths}},
std::array<std::array<ck::index_t, NDimSpatial + 3>, 1>{{d_g_n_k_wos_strides}},
e_g_n_k_wos_lengths,
e_g_n_k_wos_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
if(!conv.IsSupportedArgument(argument))
{
std::cout << "wrong! device_conv with the specified compilation parameters does not "
"support this Conv problem"
<< std::endl;
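// Returning true here treats an unsupported instance as a skipped test rather than a failure.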
return true;
}
float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = conv_param.GetFlops();
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
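// avg_time is in ms, so flop / 1e9 / ms == TFLOP/s and bytes / 1e6 / ms == GB/s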
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< conv.GetTypeString() << std::endl;
if(do_verification)
{
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<
NDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
ck::tensor_operation::element_wise::PassThrough>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument =
ref_conv.MakeArgument(in,
wei,
out_host,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_,
in_element_op,
wei_element_op,
ck::tensor_operation::element_wise::PassThrough{});
ref_invoker.Run(ref_argument);
// apply the CDE elementwise op on the host reference: out = OutElementOp(conv_out, bias)
out_host.ForEach(
[&](auto&, auto idx) { out_element_op(out_host(idx), out_host(idx), bias(idx)); });
out_device_buf.FromDevice(out_device.mData.data());
return ck::utils::check_err(
out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
}
return true;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "convnd_fwd_dl_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using AccDataType = float;
using DsDataType = ck::Tuple<ck::half_t>;
using OutDataType = ck::half_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmPaddingSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
// clang-format off
using DeviceGroupedConvNDFwdInstance = ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
// ######| NDim| InData| WeiData| MultipleD| OutData| AccData| InLayout| WeiLayout| MultipleD| OutLayout| In| Wei| Out| Convolution| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ######| Spatial| Type| Type| Type| Type| Type| | | Layout| | Elementwise| Elementwise| Elementwise| Forward| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ######| | | | | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | |
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< NDimSpatial, InDataType, WeiDataType, DsDataType, OutDataType, AccDataType, InLayout, WeiLayout, ck::Tuple<OutLayout>, OutLayout, InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmPaddingSpec, 256, 128, 128, 16, 2, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>;
// clang-format on
#include "run_convnd_fwd_dl_example.inc"
int main(int argc, char* argv[]) { return run_convnd_fwd_dl_example(argc, argv) ? 0 : 1; }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "convnd_fwd_dl_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = float;
using WeiDataType = float;
using AccDataType = float;
using DsDataType = ck::Tuple<float>;
using OutDataType = float;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmPaddingSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
// clang-format off
using DeviceGroupedConvNDFwdInstance = ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
// ######| NDim| InData| WeiData| MultipleD| OutData| AccData| InLayout| WeiLayout| MultipleD| OutLayout| In| Wei| Out| Convolution| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ######| Spatial| Type| Type| Type| Type| Type| | | Layout| | Elementwise| Elementwise| Elementwise| Forward| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ######| | | | | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | |
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< NDimSpatial, InDataType, WeiDataType, DsDataType, OutDataType, AccDataType, InLayout, WeiLayout, ck::Tuple<OutLayout>, OutLayout, InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmPaddingSpec, 256, 128, 128, 16, 1, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 1>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 1, 1, 1>, S<8, 1, 1, 1>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>, 5, 4>;
// clang-format on
#include "run_convnd_fwd_dl_example.inc"
int main(int argc, char* argv[]) { return run_convnd_fwd_dl_example(argc, argv) ? 0 : 1; }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "convnd_fwd_dl_common.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
using InDataType = int8_t;
using WeiDataType = int8_t;
using AccDataType = int32_t;
using DsDataType = ck::Tuple<int8_t>;
using OutDataType = int8_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmPaddingSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
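// Note: K1 and the vector-transfer lengths below scale with element width across these three
// examples (fp32: 1, fp16: 2, int8: 4), keeping each vectorized access 4 bytes wide.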
template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
// clang-format off
using DeviceGroupedConvNDFwdInstance = ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
// ######| NDim| InData| WeiData| MultipleD| OutData| AccData| InLayout| WeiLayout| MultipleD| OutLayout| In| Wei| Out| Convolution| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
// ######| Spatial| Type| Type| Type| Type| Type| | | Layout| | Elementwise| Elementwise| Elementwise| Forward| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
// ######| | | | | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | |
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< NDimSpatial, InDataType, WeiDataType, DsDataType, OutDataType, AccDataType, InLayout, WeiLayout, ck::Tuple<OutLayout>, OutLayout, InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmPaddingSpec, 256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>, 5, 4>;
// clang-format on
#include "run_convnd_fwd_dl_example.inc"
int main(int argc, char* argv[]) { return run_convnd_fwd_dl_example(argc, argv) ? 0 : 1; }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
bool run_convnd_fwd_dl_example(int argc, char* argv[])
{
print_helper_msg();
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
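// Default problem, reading ConvParam's positional arguments as (ndim, G, N, K, C,
// filter lengths, input lengths, strides, dilations, left pads, right pads):
// a 2-D conv, G=1, N=128, K=256, C=192, 3x3 filter on a 71x71 image, stride 2, pad 1.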
ck::utils::conv::ConvParam conv_param{
2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
if(argc == 1)
{
// use default
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
}
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
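// run() binds the runtime-selected spatial rank and layouts to compile-time template
// parameters and builds the matching packed host tensor descriptors.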
const auto run = [&](auto ndim_spatial, auto in_layout, auto wei_layout, auto out_layout) {
constexpr ck::index_t ndim_spatial_value = ndim_spatial.value;
std::cout << "ndim_spatial_value: " << ndim_spatial_value << std::endl;
using InLayout = decltype(in_layout);
using WeiLayout = decltype(wei_layout);
using OutLayout = decltype(out_layout);
const auto in_g_n_c_wis_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
conv_param);
const auto wei_g_k_c_xs_desc =
ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
conv_param);
const auto out_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
conv_param);
return run_grouped_conv_fwd_dl<
ndim_spatial_value,
InDataType,
WeiDataType,
DsDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
DeviceGroupedConvNDFwdInstance<ndim_spatial_value, InLayout, WeiLayout, OutLayout>>(
do_verification,
init_method,
time_kernel,
conv_param,
in_g_n_c_wis_desc,
wei_g_k_c_xs_desc,
out_g_n_k_wos_desc,
in_element_op,
wei_element_op,
out_element_op);
};
namespace ctc = ck::tensor_layout::convolution;
if(conv_param.num_dim_spatial_ == 1)
{
return run(ck::Number<1>{}, ctc::GNWC{}, ctc::GKXC{}, ctc::GNWK{});
}
else if(conv_param.num_dim_spatial_ == 2)
{
return run(ck::Number<2>{}, ctc::GNHWC{}, ctc::GKYXC{}, ctc::GNHWK{});
}
else if(conv_param.num_dim_spatial_ == 3)
{
return run(ck::Number<3>{}, ctc::GNDHWC{}, ctc::GKZYXC{}, ctc::GNDHWK{});
}
return true;
}
@@ -16,6 +16,7 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
@@ -140,9 +141,7 @@ make_r0_host_tensor_descriptor(const ck::utils::conv::ConvParam& problem_size)
 {
     std::vector<ck::index_t> dimensions{problem_size.G_, problem_size.N_};
-    std::copy(begin(problem_size.output_spatial_lengths_),
-              end(problem_size.output_spatial_lengths_),
-              std::back_inserter(dimensions));
+    ck::ranges::copy(problem_size.output_spatial_lengths_, std::back_inserter(dimensions));
     return HostTensorDescriptor(dimensions);
 }
@@ -158,10 +157,3 @@ void unpack_host_tensor_descriptor(const HostTensorDescriptor& descriptor,
     assert(size(descriptor.GetStrides()) == size(strides));
     std::copy_n(begin(descriptor.GetStrides()), size(descriptor.GetStrides()), begin(strides));
 }
-
-template <typename Range, typename OutputIterator>
-auto copy(const Range& range, OutputIterator iter)
-    -> decltype(std::copy(std::begin(range), std::end(range), iter))
-{
-    return std::copy(std::begin(range), std::end(range), iter);
-}
@@ -77,15 +77,12 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
 {
 case 0: break;
 case 1:
-    ck::utils::FillUniformDistributionIntegerValue<ADataType>{-8, 7}(conv_input.begin(),
-                                                                     conv_input.end());
-    ck::utils::FillUniformDistributionIntegerValue<BDataType>{-8, 7}(conv_weight.begin(),
-                                                                     conv_weight.end());
+    ck::utils::FillUniformDistributionIntegerValue<ADataType>{-8, 7}(conv_input);
+    ck::utils::FillUniformDistributionIntegerValue<BDataType>{-8, 7}(conv_weight);
     break;
 default:
-    ck::utils::FillUniformDistribution<ADataType>{-5, 5}(conv_input.begin(), conv_input.end());
-    ck::utils::FillUniformDistribution<BDataType>{-5, 5}(conv_weight.begin(),
-                                                         conv_weight.end());
+    ck::utils::FillUniformDistribution<ADataType>{-5, 5}(conv_input);
+    ck::utils::FillUniformDistribution<BDataType>{-5, 5}(conv_weight);
 }

 DeviceMem conv_input_device_buf(sizeof(ADataType) * conv_input.mDesc.GetElementSpaceSize());
@@ -123,10 +120,10 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
     conv_output_g_n_k_wos_desc, conv_output_g_n_k_wos_lengths, conv_output_g_n_k_wos_strides);
 unpack_host_tensor_descriptor(r0_desc, r0_lengths, r0_strides);

-copy(problem_size.conv_filter_strides_, begin(conv_filter_strides));
-copy(problem_size.conv_filter_dilations_, begin(conv_filter_dilations));
-copy(problem_size.input_left_pads_, begin(input_left_pads));
-copy(problem_size.input_right_pads_, begin(input_right_pads));
+ck::ranges::copy(problem_size.conv_filter_strides_, begin(conv_filter_strides));
+ck::ranges::copy(problem_size.conv_filter_dilations_, begin(conv_filter_dilations));
+ck::ranges::copy(problem_size.input_left_pads_, begin(input_left_pads));
+ck::ranges::copy(problem_size.input_right_pads_, begin(input_right_pads));

 // run Conv + Reduction on device
 auto conv = DeviceInstance<NDimSpatial>{};
@@ -276,16 +273,13 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
     conv_output_device_buf.FromDevice(conv_output_device.mData.data());
     r0_device_buf.FromDevice(r0_device.mData.data());

-    return ck::utils::check_err(conv_output_device.mData,
-                                conv_output_host.mData,
+    return ck::utils::check_err(conv_output_device,
+                                conv_output_host,
                                 "Error: incorrect results! (Matrix E)",
                                 1e-5f,
                                 1e-4f) &&
-           ck::utils::check_err(r0_device.mData,
-                                r0_host.mData,
-                                "Error: incorrect results! (Matrix R0)",
-                                1e-5f,
-                                1e-4f);
+           ck::utils::check_err(
+               r0_device, r0_host, "Error: incorrect results! (Matrix R0)", 1e-5f, 1e-4f);
 }

 return true;
@@ -142,7 +142,7 @@ bool reduce_blockwise_test(bool do_verification,
     std::array<int, ShapeType::NumReduceDim_> arrReduceDims;

-    std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin());
+    ck::ranges::copy(reduceDims, arrReduceDims.begin());

     result = reduce_blockwise_impl<InOutDataType,
                                    AccDataType,
@@ -9,13 +9,14 @@
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp"
+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/host_common_util.hpp"
-#include "ck/library/utility/host_reduction.hpp"

 #include "reduce_example_common.hpp"
@@ -235,38 +236,57 @@ int reduce_blockwise_impl(bool do_verification,
     reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
         static_cast<int32_t>(reduce_total_length));

-    if(do_verification)
-    {
-        ReductionHost<InOutDataType,
-                      AccDataType,
-                      InOutDataType,
-                      ReduceOperation,
-                      InElementwiseOperation,
-                      AccElementwiseOperation,
-                      Rank,
-                      NumReduceDim,
-                      PropagateNan,
-                      OutputIndex>
-            hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
-        hostReduce.Run(alpha,
-                       in.mData.data(),
-                       beta,
-                       out_ref.mData.data(),
-                       out_indices_ref.mData.data(),
-                       in_elementwise_op,
-                       acc_elementwise_op);
-    };
-
     std::array<index_t, Rank> arrInLengths;
     std::array<index_t, Rank> arrInStrides;
     std::array<index_t, NumOutDim> arrOutLengths;
     std::array<index_t, NumOutDim> arrOutStrides;

-    std::copy(inLengths.begin(), inLengths.end(), arrInLengths.begin());
-    std::copy(inStrides.begin(), inStrides.end(), arrInStrides.begin());
-    std::copy(outLengths.begin(), outLengths.end(), arrOutLengths.begin());
-    std::copy(outStrides.begin(), outStrides.end(), arrOutStrides.begin());
+    ck::ranges::copy(inLengths, arrInLengths.begin());
+    ck::ranges::copy(inStrides, arrInStrides.begin());
+    ck::ranges::copy(outLengths, arrOutLengths.begin());
+    ck::ranges::copy(outStrides, arrOutStrides.begin());
+
+    if(do_verification)
+    {
+        using ReferenceReduceInstance =
+            ck::tensor_operation::host::ReferenceReduce<InOutDataType,
+                                                        AccDataType,
+                                                        InOutDataType,
+                                                        Rank,
+                                                        NumReduceDim,
+                                                        ReduceOperation,
+                                                        InElementwiseOperation,
+                                                        AccElementwiseOperation,
+                                                        PropagateNan,
+                                                        OutputIndex>;
+
+        auto reduce_ref = ReferenceReduceInstance{};
+
+        auto argument_ptr_ref = reduce_ref.MakeArgumentPointer(arrInLengths,
+                                                               arrInStrides,
+                                                               arrOutLengths,
+                                                               arrOutStrides,
+                                                               reduceDims,
+                                                               static_cast<double>(alpha),
+                                                               static_cast<double>(beta),
+                                                               in.mData.data(),
+                                                               nullptr,
+                                                               out_ref.mData.data(),
+                                                               out_indices_ref.mData.data(),
+                                                               in_elementwise_op,
+                                                               acc_elementwise_op);
+
+        if(!reduce_ref.IsSupportedArgument(argument_ptr_ref.get()))
+        {
+            std::cout << "The runtime parameters not supported by the reduce reference, exiting!"
+                      << std::endl;
+            return (false);
+        };
+
+        auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer();
+        invoker_ptr_ref->Run(argument_ptr_ref.get());
+    };

     auto reduce = DeviceReduceInstance{};
@@ -275,8 +295,8 @@ int reduce_blockwise_impl(bool do_verification,
                                     arrOutLengths,
                                     arrOutStrides,
                                     reduceDims,
-                                    alpha,
-                                    beta,
+                                    static_cast<double>(alpha),
+                                    static_cast<double>(beta),
                                     in_dev.GetDeviceBuffer(),
                                     nullptr,
                                     out_dev.GetDeviceBuffer(),
@@ -286,9 +306,8 @@ int reduce_blockwise_impl(bool do_verification,
     if(!reduce.IsSupportedArgument(argument_ptr.get()))
     {
-        std::cerr
-            << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
-            << std::endl;
+        std::cerr << "The runtime parameters not supported by the DeviceReduce instance, exiting!"
+                  << std::endl;
         return (-2);
     };
@@ -324,12 +343,12 @@ int reduce_blockwise_impl(bool do_verification,
 #endif
         out_dev.FromDevice(out.mData.data());
-        pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
+        pass = pass && ck::utils::check_err(out, out_ref);

         if(OutputIndex)
         {
             out_index_dev.FromDevice(out_indices.mData.data());
-            pass = pass && ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
+            pass = pass && ck::utils::check_err(out_indices, out_indices_ref);
         };
     };
@@ -12,13 +12,13 @@
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/host_common_util.hpp"
-#include "ck/library/utility/host_reduction.hpp"

 using namespace ck;
 using namespace ck::tensor_operation::device;
@@ -97,8 +97,8 @@ int main(int argc, char* argv[])
     // const std::array<int, 3> invariantDims_2 = {0, 1, 2};

     // used by the host reduction
     const std::array<int, 2> reduceDims = {3, 4};
-    const std::array<int, 3> invariantDims = {0, 1, 2};
+    // const std::array<int, 3> invariantDims = {0, 1, 2};

     const std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};
@@ -191,29 +191,6 @@ int main(int argc, char* argv[])
         reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
             static_cast<int32_t>(reduce_total_length));

-    if(do_verify)
-    {
-        ReductionHost<InOutDataType,
-                      AccDataType,
-                      InOutDataType,
-                      ReduceOperation,
-                      InElementwiseOperation,
-                      AccElementwiseOperation,
-                      5, // Rank
-                      2, // NumReduceDim
-                      PropagateNan,
-                      OutputIndex>
-            hostReduce(in_1.mDesc, out_ref.mDesc, invariantDims, reduceDims);
-        hostReduce.Run(alpha,
-                       in_1.mData.data(),
-                       beta,
-                       out_ref.mData.data(),
-                       nullptr,
-                       in_elementwise_op,
-                       acc_elementwise_op);
-    };
-
     std::array<index_t, 5> arrInLengths_1;
     std::array<index_t, 5> arrInStrides_1;
     std::array<index_t, 4> arrInLengths_2;
@@ -221,12 +198,54 @@ int main(int argc, char* argv[])
     std::array<index_t, 4> arrInStrides_2;
     std::array<index_t, 3> arrOutLengths;
     std::array<index_t, 3> arrOutStrides;

-    std::copy(inLengths_1.begin(), inLengths_1.end(), arrInLengths_1.begin());
-    std::copy(inStrides_1.begin(), inStrides_1.end(), arrInStrides_1.begin());
-    std::copy(inLengths_2.begin(), inLengths_2.end(), arrInLengths_2.begin());
-    std::copy(inStrides_2.begin(), inStrides_2.end(), arrInStrides_2.begin());
-    std::copy(outLengths.begin(), outLengths.end(), arrOutLengths.begin());
-    std::copy(outStrides.begin(), outStrides.end(), arrOutStrides.begin());
+    ck::ranges::copy(inLengths_1, arrInLengths_1.begin());
+    ck::ranges::copy(inStrides_1, arrInStrides_1.begin());
+    ck::ranges::copy(inLengths_2, arrInLengths_2.begin());
+    ck::ranges::copy(inStrides_2, arrInStrides_2.begin());
+    ck::ranges::copy(outLengths, arrOutLengths.begin());
+    ck::ranges::copy(outStrides, arrOutStrides.begin());
+
+    if(do_verify)
+    {
+        using ReferenceReduceInstance =
+            ck::tensor_operation::host::ReferenceReduce<InOutDataType,
+                                                        AccDataType,
+                                                        InOutDataType,
+                                                        5,
+                                                        2,
+                                                        ReduceOperation,
+                                                        InElementwiseOperation,
+                                                        AccElementwiseOperation,
+                                                        PropagateNan,
+                                                        OutputIndex>;
+
+        auto reduce_ref = ReferenceReduceInstance{};
+
+        auto argument_ptr_ref = reduce_ref.MakeArgumentPointer(arrInLengths_1,
+                                                               arrInStrides_1,
+                                                               arrOutLengths,
+                                                               arrOutStrides,
+                                                               reduceDims,
+                                                               static_cast<double>(alpha),
+                                                               static_cast<double>(beta),
+                                                               in_1.mData.data(),
+                                                               nullptr,
+                                                               out_ref.mData.data(),
+                                                               nullptr,
+                                                               in_elementwise_op,
+                                                               acc_elementwise_op);
+
+        if(!reduce_ref.IsSupportedArgument(argument_ptr_ref.get()))
+        {
+            std::cout << "The runtime parameters not supported by the reduce reference, exiting!"
+                      << std::endl;
+            return (false);
+        };
+
+        auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer();
+        invoker_ptr_ref->Run(argument_ptr_ref.get());
+    };

     auto reduce_1 = DeviceReduceInstance_1{};
@@ -235,8 +254,8 @@ int main(int argc, char* argv[])
                                       arrInLengths_2,
                                       arrInStrides_2,
                                       reduceDims_1,
-                                      1.0f,
-                                      0.0f,
+                                      1.0,
+                                      0.0,
                                       in_1_dev.GetDeviceBuffer(),
                                       nullptr,
                                       in_2_dev.GetDeviceBuffer(),
@@ -246,9 +265,8 @@ int main(int argc, char* argv[])
     if(!reduce_1.IsSupportedArgument(argument_ptr_1.get()))
     {
-        std::cout
-            << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
-            << std::endl;
+        std::cout << "The runtime parameters not supported by the DeviceReduce instance, exiting!"
+                  << std::endl;
     };

     auto invoker_ptr_1 = reduce_1.MakeInvokerPointer();
@@ -260,8 +278,8 @@ int main(int argc, char* argv[])
                                       arrOutLengths,
                                       arrOutStrides,
                                       reduceDims_2,
-                                      alpha,
-                                      beta,
+                                      static_cast<double>(alpha),
+                                      static_cast<double>(beta),
                                       in_2_dev.GetDeviceBuffer(),
                                       nullptr,
                                       out_dev.GetDeviceBuffer(),
@@ -294,7 +312,7 @@ int main(int argc, char* argv[])
     if(do_verify)
     {
         out_dev.FromDevice(out.mData.data());
-        pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
+        pass = pass && ck::utils::check_err(out, out_ref);
     };

     return (pass ? 0 : 1);
@@ -140,7 +140,7 @@ bool reduce_multiblock_atomic_add_test(bool do_verification,
     std::array<int, ShapeType::NumReduceDim_> a_reduceDims;

-    std::copy(reduceDims.begin(), reduceDims.end(), a_reduceDims.begin());
+    ck::ranges::copy(reduceDims, a_reduceDims.begin());

     result = reduce_multiblock_atomic_add_impl<InOutDataType,
                                                AccDataType,
@@ -9,13 +9,14 @@
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp"
+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/host_common_util.hpp"
-#include "ck/library/utility/host_reduction.hpp"

 #include "reduce_example_common.hpp"
@@ -148,38 +149,57 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
     reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
         static_cast<int32_t>(reduce_total_length));

-    if(do_verification)
-    {
-        ReductionHost<InOutDataType,
-                      AccDataType,
-                      InOutDataType,
-                      ReduceOperation,
-                      InElementwiseOperation,
-                      AccElementwiseOperation,
-                      Rank,
-                      NumReduceDim,
-                      PropagateNan,
-                      false>
-            hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
-        hostReduce.Run(alpha,
-                       in.mData.data(),
-                       beta,
-                       out_ref.mData.data(),
-                       nullptr,
-                       in_elementwise_op,
-                       acc_elementwise_op);
-    };
-
     std::array<index_t, Rank> arrInLengths;
     std::array<index_t, Rank> arrInStrides;
     std::array<index_t, NumOutDim> arrOutLengths;
     std::array<index_t, NumOutDim> arrOutStrides;

-    std::copy(inLengths.begin(), inLengths.end(), arrInLengths.begin());
-    std::copy(inStrides.begin(), inStrides.end(), arrInStrides.begin());
-    std::copy(outLengths.begin(), outLengths.end(), arrOutLengths.begin());
-    std::copy(outStrides.begin(), outStrides.end(), arrOutStrides.begin());
+    ck::ranges::copy(inLengths, arrInLengths.begin());
+    ck::ranges::copy(inStrides, arrInStrides.begin());
+    ck::ranges::copy(outLengths, arrOutLengths.begin());
+    ck::ranges::copy(outStrides, arrOutStrides.begin());
+
+    if(do_verification)
+    {
+        using ReferenceReduceInstance =
+            ck::tensor_operation::host::ReferenceReduce<InOutDataType,
+                                                        AccDataType,
+                                                        InOutDataType,
+                                                        Rank,
+                                                        NumReduceDim,
+                                                        ReduceOperation,
+                                                        InElementwiseOperation,
+                                                        AccElementwiseOperation,
+                                                        PropagateNan,
+                                                        false>;
+
+        auto reduce_ref = ReferenceReduceInstance{};
+
+        auto argument_ptr_ref = reduce_ref.MakeArgumentPointer(arrInLengths,
+                                                               arrInStrides,
+                                                               arrOutLengths,
+                                                               arrOutStrides,
+                                                               reduceDims,
+                                                               static_cast<double>(alpha),
+                                                               static_cast<double>(beta),
+                                                               in.mData.data(),
+                                                               nullptr,
+                                                               out_ref.mData.data(),
+                                                               nullptr,
+                                                               in_elementwise_op,
+                                                               acc_elementwise_op);
+
+        if(!reduce_ref.IsSupportedArgument(argument_ptr_ref.get()))
+        {
+            std::cout << "The runtime parameters not supported by the reduce reference, exiting!"
+                      << std::endl;
+            return (false);
+        };
+
+        auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer();
+        invoker_ptr_ref->Run(argument_ptr_ref.get());
+    };

     auto reduce = DeviceReduceInstance{};
@@ -188,8 +208,8 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
                                     arrOutLengths,
                                     arrOutStrides,
                                     reduceDims,
-                                    alpha,
-                                    beta,
+                                    static_cast<double>(alpha),
+                                    static_cast<double>(beta),
                                     in_dev.GetDeviceBuffer(),
                                     nullptr,
                                     out_dev.GetDeviceBuffer(),
@@ -199,9 +219,8 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
     if(!reduce.IsSupportedArgument(argument_ptr.get()))
     {
-        std::cerr
-            << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
-            << std::endl;
+        std::cerr << "The runtime parameters not supported by the DeviceReduce instance, exiting!"
+                  << std::endl;
         return (-2);
     };
@@ -225,7 +244,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
     if(do_verification)
     {
         out_dev.FromDevice(out.mData.data());
-        pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
+        pass = pass && ck::utils::check_err(out, out_ref);
     };

     return (pass ? 0 : 1);
@@ -16,6 +16,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"

 template <typename InDataType,
           typename OutDataType,
@@ -172,16 +173,16 @@ bool pool_test(bool do_verification,
     // tensor layout
     auto f_host_tensor_descriptor =
         [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
+            using namespace ck::literals;
+
             if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                            std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
+                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
             }
             else if constexpr(ck::is_same<decltype(layout),
                                           ck::tensor_layout::convolution::NHWC>::value)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                            std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
+                return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
             }
         };
@@ -267,14 +268,14 @@ bool pool_test(bool do_verification,
         out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());

-        pass = pass && ck::utils::check_err(out_n_c_ho_wo_device.mData, out_n_c_ho_wo_host.mData);
+        pass = pass && ck::utils::check_err(out_n_c_ho_wo_device, out_n_c_ho_wo_host);

         if constexpr(OutputIndex)
        {
             out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data());

-            pass = pass && ck::utils::check_err(out_indices_n_c_ho_wo_device.mData,
-                                                out_indices_n_c_ho_wo_host.mData);
+            pass = pass &&
+                   ck::utils::check_err(out_indices_n_c_ho_wo_device, out_indices_n_c_ho_wo_host);
         };
     }
add_example_executable(example_gemm_xdl_bias_relu_quantization_int8 gemm_xdl_bias_relu_quantization_int8.cpp)
add_example_executable(example_gemm_xdl_quantization_int8 gemm_xdl_quantization_int8.cpp)
...@@ -9,80 +9,74 @@ ...@@ -9,80 +9,74 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
struct RequantReluRequant
{
// FIXME: We just need one scale for Relu / Leaky Relu / PRelu
RequantReluRequant(float scaleGemm, float scaleRelu)
: scaleGemm_(scaleGemm), scaleRelu_(scaleRelu)
{
}
__host__ __device__ constexpr void operator()(float& y, const float& x) const
{
float gemm_requant = scaleGemm_ * x;
float relu = gemm_requant > 0 ? gemm_requant : 0;
float relu_requant = scaleRelu_ * relu;
y = relu_requant > 127 ? 127 : relu_requant < -128 ? -128 : relu_requant;
}
float scaleGemm_;
float scaleRelu_;
};
template <ck::index_t... Is> template <ck::index_t... Is>
using S = ck::Sequence<Is...>; using S = ck::Sequence<Is...>;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using I8 = int8_t;
using I32 = int32_t;
using ADataType = int8_t; using Row = ck::tensor_layout::gemm::RowMajor;
using BDataType = int8_t; using Col = ck::tensor_layout::gemm::ColumnMajor;
using CDataType = int8_t;
using AccDataType = int32_t; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using CShuffleDataType = float; using ActivationOp = ck::tensor_operation::element_wise::Relu;
using CDEElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<ActivationOp>;
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor; using ADataType = I8;
using CLayout = ck::tensor_layout::gemm::RowMajor; using BDataType = I8;
using AccDataType = I32;
using CShuffleDataType = I32;
using BiasDataType = I32;
using DsDataType = ck::Tuple<BiasDataType>;
using EDataType = I8;
using ALayout = Row;
using BLayout = Col;
using BiasLayout = Row;
using DsLayout = ck::Tuple<BiasLayout>;
using ELayout = Row;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off // clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle< using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle<
ALayout, // typename ALayout, ALayout,
BLayout, // typename BLayout, BLayout,
CLayout, // typename CLayout, DsLayout,
ADataType, // typename ADataType, ELayout,
BDataType, // typename BDataType, ADataType,
CDataType, // typename CDataType, BDataType,
AccDataType, // typename GemmAccDataType, AccDataType,
CShuffleDataType, // typename CShuffleDataType, CShuffleDataType,
PassThrough, // typename AElementwiseOperation, DsDataType,
PassThrough, // typename BElementwiseOperation, EDataType,
RequantReluRequant, // typename CElementwiseOperation, PassThrough, // AElementwiseOperation,
PassThrough, // BElementwiseOperation,
CDEElementOp, // CDEElementwiseOperation,
GemmDefault, // GemmSpecialization GemmSpec, GemmDefault, // GemmSpecialization GemmSpec,
1, // index_t NumGemmKPrefetchStage, 1, // NumGemmKPrefetchStage,
256, // index_t BlockSize, 256, // BlockSize,
256, // index_t MPerBlock, 256, // MPerBlock,
128, // index_t NPerBlock, 128, // NPerBlock,
64, // index_t KPerBlock, 64, // KPerBlock,
16, // index_t AK1, 16, // AK1,
16, // index_t BK1, 16, // BK1,
32, // index_t MPerXDL, 32, // MPerXDL,
32, // index_t NPerXDL, 32, // NPerXDL,
4, // index_t MXdlPerWave, 4, // MXdlPerWave,
2, // index_t NXdlPerWave, 2, // NXdlPerWave,
S<4, 64, 1>, // typename ABlockTransferThreadClusterLengths_AK0_M_AK1, S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1,
S<1, 0, 2>, // typename ABlockTransferThreadClusterArrangeOrder, S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder,
S<1, 0, 2>, // typename ABlockTransferSrcAccessOrder, S<1, 0, 2>, // ABlockTransferSrcAccessOrder,
2, // index_t ABlockTransferSrcVectorDim, 2, // index_t ABlockTransferSrcVectorDim,
16, // index_t ABlockTransferSrcScalarPerVector, 16, // index_t ABlockTransferSrcScalarPerVector,
16, // index_t ABlockTransferDstScalarPerVector_AK1, 16, // index_t ABlockTransferDstScalarPerVector_AK1,
...@@ -97,125 +91,100 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle ...@@ -97,125 +91,100 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
1, // index_t CShuffleMXdlPerWavePerShuffle, 1, // index_t CShuffleMXdlPerWavePerShuffle,
1, // index_t CShuffleNXdlPerWavePerShuffle, 1, // index_t CShuffleNXdlPerWavePerShuffle,
S<1, 64, 1, 4>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, S<1, 64, 1, 4>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
16>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock> 8>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
// clang-format on // clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType, using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType, BDataType,
CDataType, AccDataType,
float, AccDataType,
PassThrough, PassThrough,
PassThrough, PassThrough,
RequantReluRequant>; PassThrough>;
int main(int argc, char* argv[]) int main()
{ {
bool do_verification = true; bool do_verification = true;
int init_method = 1;
bool time_kernel = false; bool time_kernel = false;
// GEMM shape // GEMM shape
ck::index_t M = 3840; ck::index_t M = 1024;
ck::index_t N = 4096; ck::index_t N = 1024;
ck::index_t K = 4096; ck::index_t K = 1024;
ck::index_t StrideA = 4096;
ck::index_t StrideB = 4096;
ck::index_t StrideC = 4096;
float scale_gemm = 0.03; ck::index_t StrideA = 1024;
float scale_relu = 1; ck::index_t StrideB = 1024;
ck::index_t StrideBias = 0;
ck::index_t StrideE = 1024;
if(argc == 4) float requant_scale = 0.03;
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]); auto f_host_tensor_descriptor2d =
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideC = std::stoi(argv[9]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0);
}
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value) if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                            std::vector<std::size_t>({stride, 1_uz}));
        }
        else
        {
            return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                        std::vector<std::size_t>({1_uz, stride}));
        }
    };
    auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
        return HostTensorDescriptor(std::vector<std::size_t>({len}),
                                    std::vector<std::size_t>({stride}));
    };
    Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
    Tensor<BiasDataType> bias_n(f_host_tensor_descriptor1d(N, 1));
    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
    std::cout << "bias_n: " << bias_n.mDesc << std::endl;
    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
    a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-128, 127});
    b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-128, 127});
    bias_n.GenerateTensorValue(GeneratorTensor_2<BiasDataType>{-128, 127});
    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
    DeviceMem bias_device_buf(sizeof(BiasDataType) * bias_n.mDesc.GetElementSpaceSize());
    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
    a_device_buf.ToDevice(a_m_k.mData.data());
    b_device_buf.ToDevice(b_k_n.mData.data());
    bias_device_buf.ToDevice(bias_n.mData.data());
    auto a_element_op   = PassThrough{};
    auto b_element_op   = PassThrough{};
    auto cde_element_op = CDEElementOp{requant_scale, ActivationOp{}};
    // do GEMM
    auto gemm     = DeviceGemmInstance{};
    auto invoker  = gemm.MakeInvoker();
    auto argument = gemm.MakeArgument(a_device_buf.GetDeviceBuffer(),
                                      b_device_buf.GetDeviceBuffer(),
                                      {bias_device_buf.GetDeviceBuffer()},
                                      e_device_buf.GetDeviceBuffer(),
                                      M,
                                      N,
                                      K,
                                      StrideA,
                                      StrideB,
                                      {StrideBias},
                                      StrideE,
                                      a_element_op,
                                      b_element_op,
                                      cde_element_op);
    if(!gemm.IsSupportedArgument(argument))
    {
@@ -228,7 +197,7 @@ int main(int argc, char* argv[])
    std::size_t flop = std::size_t(2) * M * N * K;
    std::size_t num_btype =
        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
@@ -237,19 +206,29 @@ int main(int argc, char* argv[])
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
              << gemm.GetTypeString() << std::endl;
    e_device_buf.FromDevice(e_m_n_device_result.mData.data());
    if(do_verification)
    {
        Tensor<AccDataType> c_m_n(HostTensorDescriptor{M, N});
        auto ref_gemm    = ReferenceGemmInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();
        auto ref_argument =
            ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
        ref_invoker.Run(ref_argument);
        for(int m = 0; m < M; ++m)
        {
            for(int n = 0; n < N; ++n)
            {
                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), bias_n(n));
            }
        }
        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
    }
    return 0;
...
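The verification path above runs the reference GEMM into an int32 accumulator tensor and then applies the CDE epilogue on the host, element by element. A minimal self-contained sketch of that per-element epilogue, assuming the bias is added before the scale-and-clamp step (saturate_cast_to_int8 and requant_epilogue are hypothetical helpers, not CK APIs):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hypothetical helper: round to nearest, then clamp into the int8 range.
inline std::int8_t saturate_cast_to_int8(float x)
{
    return static_cast<std::int8_t>(std::clamp<long>(std::lround(x), -128, 127));
}

// Assumed per-element epilogue: e = clamp(scale * activation(acc + bias)),
// with activation = PassThrough in this example.
inline std::int8_t requant_epilogue(std::int32_t acc, std::int32_t bias, float scale)
{
    return saturate_cast_to_int8(scale * static_cast<float>(acc + bias));
}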
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using I8 = int8_t;
using I32 = int32_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ActivationOp = PassThrough;
using CDEElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
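// Assumed semantics with ActivationOp = PassThrough: the epilogue rescales the
// int32 accumulator and clamps it into the int8 output range, roughly
// e = saturate_cast<int8_t>(requant_scale * acc).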
using ADataType = I8;
using BDataType = I8;
using AccDataType = I32;
using CShuffleDataType = I32;
using DsDataType = ck::Tuple<>;
using EDataType = I8;
using ALayout = Row;
using BLayout = Col;
using DsLayout = ck::Tuple<>;
using ELayout = Row;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle<
ALayout,
BLayout,
DsLayout,
ELayout,
ADataType,
BDataType,
AccDataType,
CShuffleDataType,
DsDataType,
EDataType,
PassThrough, // AElementwiseOperation,
PassThrough, // BElementwiseOperation,
CDEElementOp, // CDEElementwiseOperation,
GemmDefault, // GemmSpecialization GemmSpec,
1, // NumGemmKPrefetchStage,
256, // BlockSize,
256, // MPerBlock,
128, // NPerBlock,
64, // KPerBlock,
16, // AK1,
16, // BK1,
32, // MPerXDL,
32, // NPerXDL,
4, // MXdlPerWave,
2, // NXdlPerWave,
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1,
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder,
S<1, 0, 2>, // ABlockTransferSrcAccessOrder,
2, // index_t ABlockTransferSrcVectorDim,
16, // index_t ABlockTransferSrcScalarPerVector,
16, // index_t ABlockTransferDstScalarPerVector_AK1,
1, // bool ABlockLdsExtraM,
S<4, 64, 1>, // typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
S<1, 0, 2>, // typename BBlockTransferThreadClusterArrangeOrder,
S<1, 0, 2>, // typename BBlockTransferSrcAccessOrder,
2, // index_t BBlockTransferSrcVectorDim,
8, // index_t BBlockTransferSrcScalarPerVector,
8, // index_t BBlockTransferDstScalarPerVector_BK1,
1, // bool BBlockLdsExtraN,
1, // index_t CShuffleMXdlPerWavePerShuffle,
1, // index_t CShuffleNXdlPerWavePerShuffle,
S<1, 64, 1, 4>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
16>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
// clang-format on
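// For orientation, a minimal consistency sketch of the tile parameters above,
// assuming the usual XDL decomposition and a 64-lane wavefront:
// MPerBlock = MWaves * MXdlPerWave * MPerXDL (and likewise for N).
namespace tile_sketch {
constexpr ck::index_t WaveSize = 64;
constexpr ck::index_t MWaves   = 256 / (4 * 32); // MPerBlock / (MXdlPerWave * MPerXDL) = 2
constexpr ck::index_t NWaves   = 128 / (2 * 32); // NPerBlock / (NXdlPerWave * NPerXDL) = 2
static_assert(MWaves * NWaves * WaveSize == 256, "waves per block must match BlockSize");
} // namespace tile_sketch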
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, EDataType, float, PassThrough, PassThrough, CDEElementOp>;
int main()
{
bool do_verification = true;
bool time_kernel = false;
// GEMM shape
ck::index_t M = 1024;
ck::index_t N = 1024;
ck::index_t K = 1024;
ck::index_t StrideA = 1024;
ck::index_t StrideB = 1024;
ck::index_t StrideE = 1024;
float requant_scale = 0.03f;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1_uz}));
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1_uz, stride}));
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-128, 127});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-128, 127});
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = PassThrough{};
auto b_element_op = PassThrough{};
auto cde_element_op = CDEElementOp{requant_scale, ActivationOp{}};
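// Worked example of the assumed requant math: an accumulator value of 1000
// with requant_scale = 0.03f maps to clamp(round(0.03f * 1000)) = 30 in int8
// (assumed rounding behavior of Activation_Mul_Clamp).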
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
{},
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
{},
StrideE,
a_element_op,
b_element_op,
cde_element_op);
if(!gemm.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem");
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
if(do_verification)
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, e_m_n_host_result, a_element_op, b_element_op, cde_element_op);
ref_invoker.Run(ref_argument);
return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
}
return 0;
}
add_example_executable(example_gemm_xdl_requant_relu_requant_int8 gemm_xdl_requant_relu_requant_int8.cpp)
@@ -16,6 +16,7 @@
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template <ck::index_t... Is>
...
@@ -16,6 +16,7 @@
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template <ck::index_t... Is>
...
@@ -16,6 +16,7 @@
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template <ck::index_t... Is>
...
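The literals.hpp include added in these hunks supplies the _uz user-defined literal used in the stride vectors above. A minimal stand-in sketch of the idiom (uz_sketch is a hypothetical namespace; CK defines the real literal under ck::literals):

#include <cstddef>
#include <type_traits>

namespace uz_sketch { // hypothetical; CK's version lives in ck::literals
// Maps an integer literal directly to std::size_t, avoiding narrowing
// warnings when mixing literals with std::size_t strides.
constexpr std::size_t operator""_uz(unsigned long long v)
{
    return static_cast<std::size_t>(v);
}
} // namespace uz_sketch

using namespace uz_sketch;
static_assert(std::is_same_v<decltype(1_uz), std::size_t>, "1_uz yields std::size_t");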