Unverified Commit 46a675aa authored by Po Yen Chen, committed by GitHub

Add examples of Conv + reduction (data types: int4, int8, bf16, fp16, fp32) (#380)



* Refactor the design of DeviceGemmMultipleDMultipleR_Xdl_CShuffle

* Add 'DeviceGroupedConvFwdMultipleDMultipleR' interface

* Add DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle

* Remove 'GridwiseConvFwdMultipleDMultipleR_xdl_cshuffle'

* Add 'TransformConvFwdToGemm<>' utility class (from Chao)

* Use 'TransformConvFwdToGemm<>' to shorten code

* Fix ill-formed method declaration

* Re-implement MakeRGridDescriptor_M() function

* Change problem description

* Use macro to define layout types

* Define K-reduced output tensor layout types

* Let user decide the R output tensor layout

* Rename variables

* Add padding to the reduced output tensor if necessary

* Extract common code as helper method

* Remove debug message

* Add missing include directive

* Add partial fp16 Conv + Reduction example

* Add example verification code for 2D Conv problem

* Use type alias to simplify code

* Share code across different-dimension Conv problems

* Rename file/functions from run_conv_fwd* to run_convnd_fwd*

* Make example code more verbose

* Add code to support 1D & 3D Conv + Reduction on host

* Add more examples for data type: bf16, fp32

* Add example for int8

* Add custom target to group examples

* Use more general custom target name

* Change the description in the error message

* Disable testing for examples other than fp32

* Add example for int4 (copied from int8)

* Fix wrong data type

* Use larger data type for intermediate tensors

* Finish int4 example

* Undefine macro PP_DEFINE_LAYOUT_TYPE() after use

* Use named variables to replace magic numbers

* Remove debug messages

* Use same A/B data type for host Conv in int4 example

* Add check for the 'RLayout' type argument

* Group same-dim-layouts together in 'LayoutSetting<>'

* Add 'final' specifier to utility classes

* Use different initialization method for examples

* Remove macro PP_DEFINE_LAYOUT_TYPE()

* Fix code-comment mismatch

* Use more reasonable initialization value for all data types

* Use init_method=1 by default for all examples

* Remove never-used code

* Remove confusing out-of-date comments

* Clean up
Co-authored-by: Chao Liu <chao.liu2@amd.com>
Co-authored-by: Chao Liu <lc.roy86@gmail.com>
parent 4df6d93f
add_custom_target(example_convnd_fwd_reduce_xdl)
add_example_executable(example_convnd_fwd_max_xdl_int8 convnd_fwd_max_xdl_int8.cpp)
add_example_executable_no_testing(example_convnd_fwd_max_xdl_bf16 convnd_fwd_max_xdl_bf16.cpp)
add_example_executable_no_testing(example_convnd_fwd_max_xdl_fp16 convnd_fwd_max_xdl_fp16.cpp)
add_example_executable(example_convnd_fwd_max_xdl_fp32 convnd_fwd_max_xdl_fp32.cpp)
add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_int8)
add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_bf16)
add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp16)
add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp32)
if(USE_BITINT_EXTENSION_INT4)
add_example_executable(example_convnd_fwd_max_xdl_int4 convnd_fwd_max_xdl_int4.cpp)
add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_int4)
endif(USE_BITINT_EXTENSION_INT4)
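# Illustrative build invocation (informal sketch; assumes a CMake build directory
# named `build` at the repository root): the examples above can be built together
# through the grouping custom target defined at the top of this file:
#   cmake --build build --target example_convnd_fwd_reduce_xdl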
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <iterator>
#include <numeric>
#include <type_traits>
#include <vector>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
using BF16 = ck::bhalf_t;
using FP16 = ck::half_t;
using FP32 = float;
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
using I4 = ck::int4_t;
#endif
using I8 = std::int8_t;
using I32 = std::int32_t;
template <typename ALay, typename BLay, typename DELay, typename RLay>
struct LayoutSetting
{
using ALayout = ALay;
using BLayout = BLay;
using DELayout = DELay;
using RLayout = RLay;
};
template <ck::index_t NDimSpatial>
struct LayoutSettingSelector;
namespace ctl = ck::tensor_layout::convolution;
template <>
struct LayoutSettingSelector<1> final : LayoutSetting<ctl::GNWC, ctl::GKXC, ctl::GNWK, ctl::GNW>
{
};
template <>
struct LayoutSettingSelector<2> final : LayoutSetting<ctl::GNHWC, ctl::GKYXC, ctl::GNHWK, ctl::GNHW>
{
};
template <>
struct LayoutSettingSelector<3> final
: LayoutSetting<ctl::GNDHWC, ctl::GKZYXC, ctl::GNDHWK, ctl::GNDHW>
{
};
template <ck::index_t NDimSpatial>
using ALayout = typename LayoutSettingSelector<NDimSpatial>::ALayout;
template <ck::index_t NDimSpatial>
using BLayout = typename LayoutSettingSelector<NDimSpatial>::BLayout;
template <ck::index_t NDimSpatial>
using DELayout = typename LayoutSettingSelector<NDimSpatial>::DELayout;
template <ck::index_t NDimSpatial>
using RLayout = typename LayoutSettingSelector<NDimSpatial>::RLayout;
struct ExecutionConfig final
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
};
inline void print_help_msg()
{
std::cerr << "arg1: verification (0=no, 1=yes)\n"
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg3: time kernel (0=no, 1=yes)\n"
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
}
inline bool parse_cmd_args(int argc,
char* argv[],
ck::utils::conv::ConvParam& problem_size,
ExecutionConfig& config)
{
constexpr int num_execution_config_args =
3; // arguments for do_verification, init_method, time_kernel
constexpr int num_conv_param_leading_args = 5; // arguments for num_dim_spatial_, G_, N_, K_, C_
constexpr int threshold_to_catch_partial_args = 1 + num_execution_config_args;
constexpr int threshold_to_catch_all_args =
threshold_to_catch_partial_args + num_conv_param_leading_args;
if(argc == 1)
{
// use default
}
// catch only ExecutionConfig arguments
else if(argc == threshold_to_catch_partial_args)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
}
// catch both ExecutionConfig & ConvParam arguments
else if(threshold_to_catch_all_args < argc && ((argc - threshold_to_catch_all_args) % 3 == 0))
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
problem_size = ck::utils::conv::parse_conv_param(
num_dim_spatial, threshold_to_catch_partial_args, argv);
}
else
{
print_help_msg();
return false;
}
return true;
}
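// Illustrative invocations (informal; the exact ConvParam argument order is the one
// printed by ck::utils::conv::get_conv_param_parser_helper_msg()):
//   ./example_convnd_fwd_max_xdl_fp32              -> run with the built-in defaults
//   ./example_convnd_fwd_max_xdl_fp32 1 1 0        -> ExecutionConfig only (verify, init_method=1, no timing)
//   ./example_convnd_fwd_max_xdl_fp32 1 1 0 2 1 128 256 192 ...
//       -> ExecutionConfig followed by a 2D ConvParam (G=1, N=128, K=256, C=192, ...)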
inline HostTensorDescriptor
make_r0_host_tensor_descriptor(const ck::utils::conv::ConvParam& problem_size)
{
std::vector<ck::index_t> dimensions{problem_size.G_, problem_size.N_};
std::copy(begin(problem_size.output_spatial_lengths_),
end(problem_size.output_spatial_lengths_),
std::back_inserter(dimensions));
return HostTensorDescriptor(dimensions);
}
template <typename Lengths, typename Strides>
void unpack_host_tensor_descriptor(const HostTensorDescriptor& descriptor,
Lengths& lengths,
Strides& strides)
{
assert(size(descriptor.GetLengths()) == size(lengths));
std::copy_n(begin(descriptor.GetLengths()), size(descriptor.GetLengths()), begin(lengths));
assert(size(descriptor.GetStrides()) == size(strides));
std::copy_n(begin(descriptor.GetStrides()), size(descriptor.GetStrides()), begin(strides));
}
template <typename Range, typename OutputIterator>
auto copy(const Range& range, OutputIterator iter)
-> decltype(std::copy(std::begin(range), std::end(range), iter))
{
return std::copy(std::begin(range), std::end(range), iter);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
using ADataType = BF16;
using BDataType = BF16;
using AccDataType = FP32;
using CShuffleDataType = FP32;
using DsDataType = ck::Tuple<>;
using EDataType = BF16;
using ReduceAccDataType = FP32;
using R0DataType = FP32;
using RsDataType = ck::Tuple<R0DataType>;
#include "run_convnd_fwd_max_example.inc"
int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
using ADataType = FP16;
using BDataType = FP16;
using AccDataType = FP32;
using CShuffleDataType = FP32;
using DsDataType = ck::Tuple<>;
using EDataType = FP16;
using ReduceAccDataType = FP32;
using R0DataType = FP32;
using RsDataType = ck::Tuple<R0DataType>;
#include "run_convnd_fwd_max_example.inc"
int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
using ADataType = FP32;
using BDataType = FP32;
using AccDataType = FP32;
using CShuffleDataType = FP32;
using DsDataType = ck::Tuple<>;
using EDataType = FP32;
using ReduceAccDataType = FP32;
using R0DataType = FP32;
using RsDataType = ck::Tuple<R0DataType>;
#include "run_convnd_fwd_max_example.inc"
int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
#error This file must be compiled with ck::int4_t support
#endif
#define BUILD_INT4_EXAMPLE
#include "common.hpp"
using ADataType = I4;
using BDataType = I4;
using KernelADataType = I8;
using KernelBDataType = I8;
using AccDataType = I32;
using CShuffleDataType = I32;
using DsDataType = ck::Tuple<>;
using EDataType = I32;
using ReduceAccDataType = I32;
using R0DataType = I32;
using RsDataType = ck::Tuple<R0DataType>;
#include "run_convnd_fwd_max_example.inc"
int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
using ADataType = I8;
using BDataType = I8;
using AccDataType = I32;
using CShuffleDataType = I32;
using DsDataType = ck::Tuple<>;
using EDataType = I32;
using ReduceAccDataType = I32;
using R0DataType = I32;
using RsDataType = ck::Tuple<R0DataType>;
#include "run_convnd_fwd_max_example.inc"
int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = PassThrough;
using QsElementOp = ck::Tuple<PassThrough>;
using RsElementOp = ck::Tuple<PassThrough>;
// ReduceOp
using RsThreadReduceOp = ck::Tuple<ck::reduce::Max>;
using RsGlobalReduceOp =
ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicMax>;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
template <ck::index_t NDimSpatial>
using DeviceInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
//######| NDimSpatial| ALayout| BLayout| DELayout| RLayout| AData| BData| AccData| CShuffle| DsData| EData| ReduceAccData| RsData| A| B| CDE| Qs| Rs| Thread| Global| Conv| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CDRThreadTransfer| CDE| RThreadTransfer|
//######| | | | | | Type| Type| Type| DataType| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Reduce| Reduce| Fwd|Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ReduceThreadTransfer| DstScalarPerVector|
//######| | | | | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| Operation| Operation| Specialization| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _MPerBlock_NPerBlock| ScalarPerVector| _MPerBlock|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | _NPerBlock| |
#ifdef BUILD_INT4_EXAMPLE
< NDimSpatial, ALayout<NDimSpatial>, BLayout<NDimSpatial>, DELayout<NDimSpatial>, RLayout<NDimSpatial>, KernelADataType, KernelBDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementOp, BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp, ConvSpec, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<64, 4>, 4, 1>;
#else
< NDimSpatial, ALayout<NDimSpatial>, BLayout<NDimSpatial>, DELayout<NDimSpatial>, RLayout<NDimSpatial>, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementOp, BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp, ConvSpec, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<64, 4>, 4, 1>;
#endif
template <ck::index_t NDimSpatial>
using HostInstance = ck::tensor_operation::host::ReferenceConvFwd
<NDimSpatial, ADataType, BDataType, EDataType, AElementOp, BElementOp, PassThrough>;
// clang-format on
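// Reading aid (informal, mapped from the column headers above): the selected instance
// uses BlockSize = 256, a 256 x 128 x 32 (MPerBlock x NPerBlock x KPerBlock) tile,
// 32 x 32 XDL instructions at 4 x 2 Xdl-per-wave, and an S<64, 4> CDR thread cluster
// over MPerBlock x NPerBlock for the CShuffle/reduction stage.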
template <ck::index_t NDimSpatial>
bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
const ExecutionConfig& config)
{
static_assert(1 <= NDimSpatial && NDimSpatial <= 3, "Unsupported NDimSpatial");
#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
#endif
const auto conv_input_g_n_c_wis_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ALayout<NDimSpatial>>(
problem_size);
const auto conv_weight_g_k_c_xs_desc =
ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<BLayout<NDimSpatial>>(
problem_size);
const auto conv_output_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<DELayout<NDimSpatial>>(
problem_size);
const auto r0_desc = make_r0_host_tensor_descriptor(problem_size);
Tensor<ADataType> conv_input(conv_input_g_n_c_wis_desc);
Tensor<BDataType> conv_weight(conv_weight_g_k_c_xs_desc);
Tensor<EDataType> conv_output_device(conv_output_g_n_k_wos_desc);
Tensor<R0DataType> r0_device(r0_desc);
switch(config.init_method)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-8, 7}(conv_input.begin(),
conv_input.end());
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-8, 7}(conv_weight.begin(),
conv_weight.end());
break;
default:
ck::utils::FillUniformDistribution<ADataType>{-5, 5}(conv_input.begin(), conv_input.end());
ck::utils::FillUniformDistribution<BDataType>{-5, 5}(conv_weight.begin(),
conv_weight.end());
}
DeviceMem conv_input_device_buf(sizeof(ADataType) * conv_input.mDesc.GetElementSpaceSize());
DeviceMem conv_weight_device_buf(sizeof(BDataType) * conv_weight.mDesc.GetElementSpaceSize());
DeviceMem conv_output_device_buf(sizeof(EDataType) *
conv_output_device.mDesc.GetElementSpaceSize());
DeviceMem r0_device_buf(sizeof(R0DataType) * r0_device.mDesc.GetElementSpaceSize());
#ifdef BUILD_INT4_EXAMPLE
const Tensor<KernelADataType> conv_input_converted(conv_input);
const Tensor<KernelBDataType> conv_weight_converted(conv_weight);
conv_input_device_buf.ToDevice(conv_input_converted.mData.data());
conv_weight_device_buf.ToDevice(conv_weight_converted.mData.data());
#else
conv_input_device_buf.ToDevice(conv_input.mData.data());
conv_weight_device_buf.ToDevice(conv_weight.mData.data());
#endif
std::array<ck::index_t, NDimSpatial + 3> conv_input_g_n_c_wis_lengths{},
conv_input_g_n_c_wis_strides{};
std::array<ck::index_t, NDimSpatial + 3> conv_weight_g_k_c_xs_lengths{},
conv_weight_g_k_c_xs_strides{};
std::array<ck::index_t, NDimSpatial + 3> conv_output_g_n_k_wos_lengths{},
conv_output_g_n_k_wos_strides{};
std::array<ck::index_t, NDimSpatial + 2> r0_lengths{}, r0_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{}, conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{}, input_right_pads{};
unpack_host_tensor_descriptor(
conv_input_g_n_c_wis_desc, conv_input_g_n_c_wis_lengths, conv_input_g_n_c_wis_strides);
unpack_host_tensor_descriptor(
conv_weight_g_k_c_xs_desc, conv_weight_g_k_c_xs_lengths, conv_weight_g_k_c_xs_strides);
unpack_host_tensor_descriptor(
conv_output_g_n_k_wos_desc, conv_output_g_n_k_wos_lengths, conv_output_g_n_k_wos_strides);
unpack_host_tensor_descriptor(r0_desc, r0_lengths, r0_strides);
copy(problem_size.conv_filter_strides_, begin(conv_filter_strides));
copy(problem_size.conv_filter_dilations_, begin(conv_filter_dilations));
copy(problem_size.input_left_pads_, begin(input_left_pads));
copy(problem_size.input_right_pads_, begin(input_right_pads));
// run Conv + Reduction on device
auto conv = DeviceInstance<NDimSpatial>{};
auto invoker = conv.MakeInvoker();
auto argument = conv.MakeArgument(conv_input_device_buf.GetDeviceBuffer(),
conv_weight_device_buf.GetDeviceBuffer(),
std::array<const void*, 0>{},
conv_output_device_buf.GetDeviceBuffer(),
{r0_device_buf.GetDeviceBuffer()},
conv_input_g_n_c_wis_lengths,
conv_input_g_n_c_wis_strides,
conv_weight_g_k_c_xs_lengths,
conv_weight_g_k_c_xs_strides,
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
conv_output_g_n_k_wos_lengths,
conv_output_g_n_k_wos_strides,
r0_lengths,
r0_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
AElementOp{},
BElementOp{},
CDEElementOp{},
QsElementOp{},
RsElementOp{});
if(!conv.IsSupportedArgument(argument))
{
std::cerr << "wrong! device_conv with the specified compilation parameters does "
"not support this Conv problem"
<< std::endl;
return false;
}
const float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
const std::size_t flop = problem_size.GetFlops();
const std::size_t num_btype = problem_size.GetByte<ADataType, BDataType, EDataType>();
const float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
const float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< conv.GetTypeString() << std::endl;
if(config.do_verification)
{
Tensor<EDataType> conv_output_host(conv_output_g_n_k_wos_desc);
// run Conv + Reduction on host
auto ref_conv = HostInstance<NDimSpatial>{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(conv_input,
conv_weight,
conv_output_host,
problem_size.conv_filter_strides_,
problem_size.conv_filter_dilations_,
problem_size.input_left_pads_,
problem_size.input_right_pads_,
AElementOp{},
BElementOp{},
PassThrough{});
ref_invoker.Run(ref_argument);
Tensor<R0DataType> r0_host(r0_device.mDesc);
auto reduce0_op = RsThreadReduceOp{}[ck::Number<0>{}];
auto& output_dims = conv_output_g_n_k_wos_desc.GetLengths();
if constexpr(NDimSpatial == 1)
{
for(std::size_t g = 0; g < output_dims[0]; ++g)
{
for(std::size_t n = 0; n < output_dims[1]; ++n)
{
for(std::size_t w = 0; w < output_dims[3]; ++w)
{
auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
for(std::size_t k = 0; k < output_dims[2]; ++k)
{
auto e_val =
ck::type_convert<ReduceAccDataType>(conv_output_host(g, n, k, w));
reduce0_op(reduce0_acc, e_val);
}
r0_host(g, n, w) = ck::type_convert<R0DataType>(reduce0_acc);
}
}
}
}
else if constexpr(NDimSpatial == 2)
{
for(std::size_t g = 0; g < output_dims[0]; ++g)
{
for(std::size_t n = 0; n < output_dims[1]; ++n)
{
for(std::size_t h = 0; h < output_dims[3]; ++h)
{
for(std::size_t w = 0; w < output_dims[4]; ++w)
{
auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
for(std::size_t k = 0; k < output_dims[2]; ++k)
{
auto e_val = ck::type_convert<ReduceAccDataType>(
conv_output_host(g, n, k, h, w));
reduce0_op(reduce0_acc, e_val);
}
r0_host(g, n, h, w) = ck::type_convert<R0DataType>(reduce0_acc);
}
}
}
}
}
else if constexpr(NDimSpatial == 3)
{
for(std::size_t g = 0; g < output_dims[0]; ++g)
{
for(std::size_t n = 0; n < output_dims[1]; ++n)
{
for(std::size_t d = 0; d < output_dims[3]; ++d)
{
for(std::size_t h = 0; h < output_dims[4]; ++h)
{
for(std::size_t w = 0; w < output_dims[5]; ++w)
{
auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
for(std::size_t k = 0; k < output_dims[2]; ++k)
{
auto e_val = ck::type_convert<ReduceAccDataType>(
conv_output_host(g, n, k, d, h, w));
reduce0_op(reduce0_acc, e_val);
}
r0_host(g, n, d, h, w) = ck::type_convert<R0DataType>(reduce0_acc);
}
}
}
}
}
}
conv_output_device_buf.FromDevice(conv_output_device.mData.data());
r0_device_buf.FromDevice(r0_device.mData.data());
return ck::utils::check_err(conv_output_device.mData,
conv_output_host.mData,
"Error: incorrect results! (Matrix E)",
1e-5f,
1e-4f) &&
ck::utils::check_err(r0_device.mData,
r0_host.mData,
"Error: incorrect results! (Matrix R0)",
1e-5f,
1e-4f);
}
return true;
}
bool run_convnd_fwd_max_example(int argc, char* argv[])
{
ck::utils::conv::ConvParam problem_size{
2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
ExecutionConfig config;
if(!parse_cmd_args(argc, argv, problem_size, config))
{
return false;
}
switch(problem_size.num_dim_spatial_)
{
case 1: return run_convnd_fwd_max<1>(problem_size, config);
case 2: return run_convnd_fwd_max<2>(problem_size, config);
case 3: return run_convnd_fwd_max<3>(problem_size, config);
}
return false;
}
@@ -26,6 +26,7 @@ add_subdirectory(02_gemm_bilinear)
add_subdirectory(03_gemm_bias_relu)
add_subdirectory(04_gemm_add_add_fastgelu)
add_subdirectory(09_convnd_fwd)
add_subdirectory(10_convnd_fwd_multiple_d_multiple_reduce)
add_subdirectory(12_reduce)
add_subdirectory(13_pool2d_fwd)
add_subdirectory(14_gemm_xdl_requant_relu_requant)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
// Grouped Convolution Forward:
// input : input image A[G, N, C, Hi, Wi],
// input : weight B[G, K, C, Y, X],
// input : D0[G, N, K, Ho, Wo], D1[G, N, K, Ho, Wo], ...
// output : output image E[G, N, K, Ho, Wo]
// output : R0[G, N, Ho, Wo], R1[G, N, Ho, Wo], ...
// C = a_op(A) * b_op(B)
// E = cde_op(C, D0, D1, ...)
// Q0 = reduce0(q_op0(E)), Q1 = reduce1(q_op0(E)), ...
// R0 = r_op0(Q0), R1 = r_op1(Q1), ...
// Assume:
// D0, D1, ... and E have the same layout
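// Illustrative instantiation (informal, matching the Max-reduction examples): with a
// single R tensor and reduce0 = Max over the K dimension of E[G, N, K, Ho, Wo],
//   R0(g, n, ho, wo) = r_op0( max_k( q_op0( E(g, n, k, ho, wo) ) ) )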
template <index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename DELayout,
typename RLayout,
typename ADataType,
typename BDataType,
typename DsDataType,
typename EDataType,
typename RsDataType,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation,
typename QsElementwiseOperation,
typename RsElementwiseOperation>
struct DeviceGroupedConvFwdMultipleDMultipleR : public BaseOperator
{
static constexpr index_t NumDTensor = DsDataType::Size();
static constexpr index_t NumRTensor = RsDataType::Size();
virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(
const void* p_a,
const void* p_b,
const std::array<const void*, NumDTensor>& p_ds,
void* p_e,
std::array<void*, NumRTensor> p_rs,
const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_lengths,
const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_strides,
const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths,
const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides,
const std::array<index_t, NDimSpatial + 2>& r_g_n_wos_lengths,
const std::array<index_t, NDimSpatial + 2>& r_g_n_wos_strides,
const std::array<index_t, NDimSpatial>& conv_filter_strides,
const std::array<index_t, NDimSpatial>& conv_filter_dilations,
const std::array<index_t, NDimSpatial>& input_left_pads,
const std::array<index_t, NDimSpatial>& input_right_pads,
const AElementwiseOperation& a_element_op,
const BElementwiseOperation& b_element_op,
const CDEElementwiseOperation& cde_element_op,
const QsElementwiseOperation& qs_element_op,
const RsElementwiseOperation& rs_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -93,7 +93,7 @@ struct GNDHWC : public BaseTensorLayout
};
// input tensor
// packed NWGC/NHWGC/NDHWGC
struct NWGC : public BaseTensorLayout
{
static constexpr const char* name = "NWGC";
@@ -330,6 +330,54 @@ struct G_NDHW_K : public BaseTensorLayout
static constexpr const char* name = "G_NDHW_K";
};
// K-reduced output tensor (packed)
struct GNW : public BaseTensorLayout
{
static constexpr const char* name = "GNW";
};
struct GNHW : public BaseTensorLayout
{
static constexpr const char* name = "GNHW";
};
struct GNDHW : public BaseTensorLayout
{
static constexpr const char* name = "GNDHW";
};
// K-reduced output tensor (packed)
struct NWG : public BaseTensorLayout
{
static constexpr const char* name = "NWG";
};
struct NHWG : public BaseTensorLayout
{
static constexpr const char* name = "NHWG";
};
struct NDHWG : public BaseTensorLayout
{
static constexpr const char* name = "NDHWG";
};
// K-reduced output tensor (strided)
struct G_NW : public BaseTensorLayout
{
static constexpr const char* name = "G_NW";
};
struct G_NHW : public BaseTensorLayout
{
static constexpr const char* name = "G_NHW";
};
struct G_NDHW : public BaseTensorLayout
{
static constexpr const char* name = "G_NDHW";
};
} // namespace convolution
template <
@@ -7,6 +7,7 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/host_tensor.hpp"
namespace ck {
namespace utils {