Unverified commit f63a23ac authored by Chao Liu, committed by GitHub
[MIOpen Downstream] Initial MIOpen integration (#52)

* update online kernel wrapper; bundle all descriptors in a tuple

* change __CONSTANT__ to CONSTANT

* rename

* adding tuning

* added IsValidCompileParameter

* reorganize

* adding tunable for fp16 and int8

* fix kernel compile warning and bug fixes

* suppress warning about casting CONSTANT (address space 4) pointers

* fix building issue
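
For reference, the CONSTANT qualifier mentioned above marks pointers into the AMDGPU constant address space (address space 4). A minimal sketch of what the renamed macro and the suppressed cast look like (an illustration inferred from the commit notes, not code taken from this change):

// hypothetical sketch: qualifier for data living in constant memory on AMDGPU
#define CONSTANT __attribute__((address_space(4)))

// casting a CONSTANT pointer back to a generic pointer is the operation whose
// compiler warning this commit suppresses, e.g.:
// const auto* desc = reinterpret_cast<const DescType*>(p_const_desc);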
#ifndef CONV_IGEMM_FWD_V6R1_DLOPS_NCHW_KCYX_NKHW_HPP
#define CONV_IGEMM_FWD_V6R1_DLOPS_NCHW_KCYX_NKHW_HPP
#include <array>
#include <functional>
#include <numeric>
#include <string>
#include <tuple>
#include <vector>
namespace ck_driver {
struct CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw
{
ck::DataTypeEnum_t ABDataTypeEnum;
ck::DataTypeEnum_t AccDataTypeEnum;
ck::DataTypeEnum_t CDataTypeEnum;
int BlockSize;
int GN0;
int GK1;
int GM1PerBlockGM11;
int GN1PerBlockGN11;
int GK0PerBlock;
int BM1PerThreadBM11;
int BN1PerThreadBN11;
int BK0PerThread;
std::array<int, 2> BM10BN10ThreadClusterBM10Xs;
std::array<int, 2> BM10BN10ThreadClusterBN10Xs;
std::array<int, 5> ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1;
std::array<int, 5> ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1;
std::array<int, 5> ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
std::array<int, 5> ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
std::array<int, 5> BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1;
std::array<int, 5> BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1;
std::array<int, 5> BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
std::array<int, 5> BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
int CThreadTransferDstScalarPerVector;
bool HasMainKBlockLoop;
bool HasDoubleTailKBlockLoop;
auto GetCompileParameterString() const
{
// clang-format off
return
" -DCK_PARAM_ABDataTypeEnum=" +
std::to_string(ABDataTypeEnum) +
" -DCK_PARAM_AccDataTypeEnum=" +
std::to_string(AccDataTypeEnum) +
" -DCK_PARAM_CDataTypeEnum=" +
std::to_string(CDataTypeEnum) +
" -DCK_PARAM_BlockSize=" +
std::to_string(BlockSize) +
" -DCK_PARAM_GN0=" +
std::to_string(GN0) +
" -DCK_PARAM_GK1=" +
std::to_string(GK1) +
" -DCK_PARAM_GM1PerBlockGM11=" +
std::to_string(GM1PerBlockGM11) +
" -DCK_PARAM_GN1PerBlockGN11=" +
std::to_string(GN1PerBlockGN11) +
" -DCK_PARAM_GK0PerBlock=" +
std::to_string(GK0PerBlock) +
" -DCK_PARAM_BM1PerThreadBM11=" +
std::to_string(BM1PerThreadBM11) +
" -DCK_PARAM_BN1PerThreadBN11=" +
std::to_string(BN1PerThreadBN11) +
" -DCK_PARAM_BK0PerThread=" +
std::to_string(BK0PerThread) +
" -DCK_PARAM_BM10BN10ThreadClusterBM10Xs=" +
std::to_string(BM10BN10ThreadClusterBM10Xs[0]) + "," +
std::to_string(BM10BN10ThreadClusterBM10Xs[1]) +
" -DCK_PARAM_BM10BN10ThreadClusterBN10Xs=" +
std::to_string(BM10BN10ThreadClusterBN10Xs[0]) + "," +
std::to_string(BM10BN10ThreadClusterBN10Xs[1]) +
" -DCK_PARAM_ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[0]) + "," +
std::to_string(ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," +
std::to_string(ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," +
std::to_string(ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," +
std::to_string(ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[4]) +
" -DCK_PARAM_ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[0]) + "," +
std::to_string(ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," +
std::to_string(ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," +
std::to_string(ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," +
std::to_string(ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[4]) +
" -DCK_PARAM_ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) + "," +
std::to_string(ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," +
std::to_string(ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," +
std::to_string(ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," +
std::to_string(ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]) +
" -DCK_PARAM_ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) + "," +
std::to_string(ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," +
std::to_string(ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," +
std::to_string(ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," +
std::to_string(ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]) +
" -DCK_PARAM_BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," +
std::to_string(BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," +
std::to_string(BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," +
std::to_string(BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," +
std::to_string(BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[4]) +
" -DCK_PARAM_BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," +
std::to_string(BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," +
std::to_string(BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," +
std::to_string(BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," +
std::to_string(BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[4]) +
" -DCK_PARAM_BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," +
std::to_string(BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," +
std::to_string(BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," +
std::to_string(BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," +
std::to_string(BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]) +
" -DCK_PARAM_BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," +
std::to_string(BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," +
std::to_string(BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," +
std::to_string(BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," +
std::to_string(BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]) +
" -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
std::to_string(CThreadTransferDstScalarPerVector) +
" -DCK_PARAM_HasMainKBlockLoop=" +
std::to_string(HasMainKBlockLoop) +
" -DCK_PARAM_HasDoubleTailKBlockLoop=" +
std::to_string(HasDoubleTailKBlockLoop);
// clang-format on
}
};
struct TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw
{
ck::DataTypeEnum_t ABDataTypeEnum;
ck::DataTypeEnum_t CDataTypeEnum;
int BlockSize;
int GN0;
int GK1;
int GM1PerBlockGM11;
int GN1PerBlockGN11;
int GK0PerBlock;
int BM1PerThreadBM11;
int BN1PerThreadBN11;
int BK0PerThread;
std::array<int, 2> BM10BN10ThreadClusterBM10Xs;
std::array<int, 2> BM10BN10ThreadClusterBN10Xs;
std::array<int, 5> ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1;
std::array<int, 5> ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1;
std::array<int, 5> ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
std::array<int, 5> ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
std::array<int, 5> BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1;
std::array<int, 5> BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1;
std::array<int, 5> BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
std::array<int, 5> BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
};
inline static auto generate_tunable_list_conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw()
{
constexpr auto f32 = ck::DataTypeEnum_t::Float;
constexpr auto f16 = ck::DataTypeEnum_t::Half;
constexpr auto i8 = ck::DataTypeEnum_t::Int8;
return std::vector<TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw>{
// clang-format off
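// each row initializes, in field-declaration order: ABDataTypeEnum, CDataTypeEnum,
// BlockSize, GN0, GK1, GM1PerBlockGM11, GN1PerBlockGN11, GK0PerBlock,
// BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs,
// BM10BN10ThreadClusterBN10Xs, then the A- and B-block-transfer thread-slice,
// thread-cluster, src-vector and dst-vector lengths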
// fp32
{f32, f32, 256, 1, 1, 128, 128, 16, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 2, 1}, {4, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 4, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{f32, f32, 256, 1, 1, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{f32, f32, 256, 1, 1, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 2, 1}, {1, 1, 1, 4, 1}},
{f32, f32, 256, 1, 1, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}},
{f32, f32, 256, 1, 1, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {4, 1, 1, 1, 1}, { 2, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f32, f32, 256, 2, 1, 128, 64, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 2, 1, 1, 1}, { 4, 1, 1, 64, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f32, f32, 256, 4, 1, 128, 32, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 4, 1, 1, 1}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f32, f32, 256, 8, 1, 128, 16, 16, 4, 4, 1, {8, 2}, {8, 2}, {8, 1, 1, 1, 1}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 8, 1, 1, 1}, {16, 1, 1, 16, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f32, f32, 128, 1, 1, 64, 128, 8, 4, 4, 1, {4, 2}, {8, 2}, {4, 1, 1, 1, 1}, {2, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {8, 1, 1, 1, 1}, { 1, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
// fp16
{f16, f16, 256, 1, 2, 128, 128, 16, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 2, 2}, {4, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 4, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{f16, f16, 256, 1, 2, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{f16, f16, 256, 1, 2, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 2, 1}, {1, 1, 1, 4, 1}},
{f16, f16, 256, 1, 2, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}},
{f16, f16, 256, 1, 2, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {4, 1, 1, 1, 2}, { 2, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f16, f16, 256, 2, 2, 128, 64, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 2, 1, 1, 2}, { 4, 1, 1, 64, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f16, f16, 256, 4, 2, 128, 32, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 4, 1, 1, 2}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f16, f16, 256, 8, 2, 128, 16, 16, 4, 4, 1, {8, 2}, {8, 2}, {8, 1, 1, 1, 2}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 8, 1, 1, 2}, {16, 1, 1, 16, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{f16, f16, 128, 1, 2, 64, 128, 8, 4, 4, 1, {4, 2}, {8, 2}, {4, 1, 1, 1, 2}, {2, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {8, 1, 1, 1, 2}, { 1, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
// i8
{ i8, i8, 256, 1, 4, 128, 128, 16, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 2, 4}, {4, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 4, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{ i8, i8, 256, 1, 4, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 4, 1}, {1, 1, 1, 4, 1}},
{ i8, i8, 256, 1, 4, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 2, 1}, {1, 1, 1, 4, 1}},
{ i8, i8, 256, 1, 4, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 4, 1}},
{ i8, i8, 256, 1, 4, 128, 128, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {4, 1, 1, 1, 4}, { 2, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{ i8, i8, 256, 2, 4, 128, 64, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 2, 1, 1, 4}, { 4, 1, 1, 64, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{ i8, i8, 256, 4, 4, 128, 32, 8, 4, 4, 1, {8, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 4, 1, 1, 4}, { 8, 1, 1, 32, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{ i8, i8, 256, 8, 4, 128, 16, 16, 4, 4, 1, {8, 2}, {8, 2}, {8, 1, 1, 1, 4}, {2, 1, 1, 128, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {1, 8, 1, 1, 4}, {16, 1, 1, 16, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
{ i8, i8, 128, 1, 4, 64, 128, 8, 4, 4, 1, {4, 2}, {8, 2}, {4, 1, 1, 1, 4}, {2, 1, 1, 64, 1}, {4, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {8, 1, 1, 1, 4}, { 1, 1, 1, 128, 1}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}}
// clang-format on
};
}
// TODO: make this a common interface and write specs for it
struct ConvIgemmFwdV6r1DlopsNchwKcyxNkhw
{
static auto
CalculateCompileParameterBasedOnTunable(const ConvolutionProblemDescriptor& conv_problem_desc,
const TunableConvIgemmFwdV6r1DlopsNchwKcyxNkhw& tunable)
{
using namespace ck;
const int C = conv_problem_desc.C;
const int Y = conv_problem_desc.Y;
const int X = conv_problem_desc.X;
const int Ho = conv_problem_desc.Ho;
const int Wo = conv_problem_desc.Wo;
if(!(conv_problem_desc.InDataTypeEnum == tunable.ABDataTypeEnum &&
conv_problem_desc.WeiDataTypeEnum == tunable.ABDataTypeEnum &&
conv_problem_desc.OutDataTypeEnum == tunable.CDataTypeEnum))
return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);
const auto ABDataTypeEnum = conv_problem_desc.InDataTypeEnum;
const auto CDataTypeEnum = conv_problem_desc.OutDataTypeEnum;
DataTypeEnum_t AccDataTypeEnum;
switch(ABDataTypeEnum)
{
case DataTypeEnum_t::Float:
case DataTypeEnum_t::Half: AccDataTypeEnum = DataTypeEnum_t::Float; break;
case DataTypeEnum_t::Int8: AccDataTypeEnum = DataTypeEnum_t::Int32; break;
default: return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);
}
const int BlockSize = tunable.BlockSize;
const int GN0 = tunable.GN0;
const int GK1 = tunable.GK1;
const int GM11 = tunable.GM1PerBlockGM11;
const int GN11 = tunable.GN1PerBlockGN11;
const int GK0PerBlock = tunable.GK0PerBlock;
const int BM11 = tunable.BM1PerThreadBM11;
const int BN11 = tunable.BN1PerThreadBN11;
const int BK0PerThread = tunable.BK0PerThread;
const auto BM10BN10ThreadClusterBM10Xs = tunable.BM10BN10ThreadClusterBM10Xs;
const auto BM10BN10ThreadClusterBN10Xs = tunable.BM10BN10ThreadClusterBN10Xs;
const auto ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 =
tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1;
const auto ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 =
tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1;
const auto ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 =
tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
const auto ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 =
tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
const auto BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 =
tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1;
const auto BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 =
tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1;
const auto BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 =
tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
const auto BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 =
tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
// C threadwise copy: {BN11} or {BN} or {BN1} or {GN11} is Dst vector dim
const int CThreadTransferDstScalarPerVector = gcd(4, GN11, BN11, Ho * Wo);
const int C0 = GK1;
if(!(C % C0 == 0))
return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);
const int C1 = C / C0;
const int GK0 = C1 * Y * X;
if(!(GK0 % GK0PerBlock == 0))
return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);
const bool HasMainKBlockLoop = ((GK0 + GK0PerBlock) / (2 * GK0PerBlock) > 1);
const bool HasDoubleTailKBlockLoop = ((GK0 / GK0PerBlock) % 2 == 0);
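// worked example (illustrative): GK0 = 32 with GK0PerBlock = 8 gives
// (32 + 8) / 16 = 2 > 1, so a main K-block loop exists, and (32 / 8) % 2 == 0,
// so the kernel also takes the double-tail K-block loop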
return std::make_tuple(
CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{
ABDataTypeEnum,
AccDataTypeEnum,
CDataTypeEnum,
BlockSize,
GN0,
GK1,
GM11,
GN11,
GK0PerBlock,
BM11,
BN11,
BK0PerThread,
BM10BN10ThreadClusterBM10Xs,
BM10BN10ThreadClusterBN10Xs,
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
CThreadTransferDstScalarPerVector,
HasMainKBlockLoop,
HasDoubleTailKBlockLoop},
true);
}
static auto GetDefaultCompileParameter(const ConvolutionProblemDescriptor& conv_problem_desc)
{
for(const auto& tunable : generate_tunable_list_conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw())
{
CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param;
bool found = false;
std::tie(compile_param, found) =
CalculateCompileParameterBasedOnTunable(conv_problem_desc, tunable);
if(found && IsValidCompileParameter(conv_problem_desc, compile_param))
return std::make_tuple(compile_param, true);
}
return std::make_tuple(CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw{}, false);
}
static bool IsApplicable(const ConvolutionProblemDescriptor& conv_problem_desc)
{
bool found = false;
std::tie(std::ignore, found) = GetDefaultCompileParameter(conv_problem_desc);
return found;
}
static bool
IsValidCompileParameter(const ConvolutionProblemDescriptor& conv_problem_desc,
const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param)
{
using namespace ck;
const int N = conv_problem_desc.N;
const int K = conv_problem_desc.K;
const int C = conv_problem_desc.C;
const int Y = conv_problem_desc.Y;
const int X = conv_problem_desc.X;
const int Ho = conv_problem_desc.Ho;
const int Wo = conv_problem_desc.Wo;
const int GK1 = compile_param.GK1;
const int GN0 = compile_param.GN0;
const int GM11 = compile_param.GM1PerBlockGM11;
const int GN11 = compile_param.GN1PerBlockGN11;
const int BM11 = compile_param.BM1PerThreadBM11;
const int BN11 = compile_param.BN1PerThreadBN11;
const int C0 = GK1;
const int N0 = GN0;
if(!(C % C0 == 0))
return false;
const int C1 = C / C0;
if(!(N % N0 == 0))
return false;
const int N1 = N / N0;
const int GM0 = 1;
const int GM1 = K;
const int GN1 = N1 * Ho * Wo;
const int GK0 = C1 * Y * X;
// check data type
{
if(!(conv_problem_desc.InDataTypeEnum == conv_problem_desc.WeiDataTypeEnum &&
conv_problem_desc.InDataTypeEnum == compile_param.ABDataTypeEnum))
return false;
if(compile_param.ABDataTypeEnum == DataTypeEnum_t::Float ||
compile_param.ABDataTypeEnum == DataTypeEnum_t::Half)
{
if(!(compile_param.AccDataTypeEnum == DataTypeEnum_t::Float))
return false;
}
else if(compile_param.ABDataTypeEnum == DataTypeEnum_t::Int8)
{
if(!(compile_param.AccDataTypeEnum == DataTypeEnum_t::Int32))
return false;
}
}
// check gridwise contraction
{
if(!(GM1 % GM11 == 0 && GN1 % GN11 == 0 && GK0 % compile_param.GK0PerBlock == 0))
return false;
const bool has_main_k_block_loop =
((GK0 + compile_param.GK0PerBlock) / (2 * compile_param.GK0PerBlock) > 1);
const bool has_double_tail_k_block_loop = ((GK0 / compile_param.GK0PerBlock) % 2 == 0);
if(!(has_main_k_block_loop == compile_param.HasMainKBlockLoop &&
has_double_tail_k_block_loop == compile_param.HasDoubleTailKBlockLoop))
return false;
}
// check A blockwise copy
{
const auto block_slice_lengths =
std::array<int, 5>{compile_param.GK0PerBlock, GM0, 1, GM11, GK1};
const auto& cluster_lengths =
compile_param.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1;
const auto& thread_slice_lengths =
compile_param.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1;
const auto& src_vector_lengths =
compile_param.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
const auto& dst_vector_lengths =
compile_param.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1;
// check number of working threads
const int num_work_thread = std::accumulate(
cluster_lengths.begin(), cluster_lengths.end(), 1, std::multiplies<int>{});
if(!(compile_param.BlockSize >= num_work_thread))
return false;
// check block slice lengths vs thread slice lengths vs cluster lengths
for(int i = 0; i < 5; ++i)
{
if(!(cluster_lengths[i] * thread_slice_lengths[i] == block_slice_lengths[i]))
return false;
}
// check thread slice lengths vs vector lengths
for(int i = 0; i < 5; ++i)
{
if(!(thread_slice_lengths[i] % src_vector_lengths[i] == 0))
return false;
if(!(thread_slice_lengths[i] % dst_vector_lengths[i] == 0))
return false;
}
// check Src vectorization, GK0 is global mem vector dim
if(!(src_vector_lengths[1] == 1 && src_vector_lengths[2] == 1 &&
src_vector_lengths[3] == 1 && src_vector_lengths[4] == 1))
return false;
// check Dst vectorization, {GM11, GK1} are LDS vector dims
if(dst_vector_lengths[4] == GK1)
{ // vectorize on {GM11, GK1}
if(!(GM11 % dst_vector_lengths[3] == 0))
return false;
}
else
{ // vectorize on {GK1} only
if(!(GK1 % dst_vector_lengths[4] == 0))
return false;
if(!(dst_vector_lengths[3] == 1))
return false;
}
}
// check B blockwise copy
{
const auto block_slice_lengths =
std::array<int, 5>{compile_param.GK0PerBlock, GN0, 1, GN11, GK1};
const auto& cluster_lengths =
compile_param.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1;
const auto& thread_slice_lengths =
compile_param.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1;
const auto& src_vector_lengths =
compile_param.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
const auto& dst_vector_lengths =
compile_param.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1;
// check number of working threads
const int num_work_thread = std::accumulate(
cluster_lengths.begin(), cluster_lengths.end(), 1, std::multiplies<int>{});
if(!(compile_param.BlockSize >= num_work_thread))
return false;
// check block slice lengths vs thread slice lengths vs cluster lengths
for(int i = 0; i < 5; ++i)
{
if(!(cluster_lengths[i] * thread_slice_lengths[i] == block_slice_lengths[i]))
return false;
}
// check thread slice lengths vs vector lengths
for(int i = 0; i < 5; ++i)
{
if(!(thread_slice_lengths[i] % src_vector_lengths[i] == 0 &&
thread_slice_lengths[i] % dst_vector_lengths[i] == 0))
return false;
}
// check Src vectorization: {GN11} is global mem vector dim
if(!(src_vector_lengths[0] == 1 && src_vector_lengths[1] == 1 &&
src_vector_lengths[2] == 1 && src_vector_lengths[4] == 1))
return false;
// check Src tensor layout related vectorization
if(Y == 1 && X == 1 && conv_problem_desc.ConvStrideH == 1 &&
conv_problem_desc.ConvStrideW == 1 && conv_problem_desc.InLeftPadH == 0 &&
conv_problem_desc.InLeftPadW == 0 && conv_problem_desc.InRightPadH == 0 &&
conv_problem_desc.InRightPadW == 0)
{
if(!((Ho * Wo) % src_vector_lengths[3] == 0))
return false;
}
else if(conv_problem_desc.ConvStrideW == 1 && conv_problem_desc.InLeftPadW == 0 &&
conv_problem_desc.InRightPadW == 0)
{
if(!(Wo % src_vector_lengths[3] == 0))
return false;
}
else
{
if(!(src_vector_lengths[3] == 1))
return false;
}
// check Dst vectorization: {GN11, GK1} are LDS vector dims
if(dst_vector_lengths[4] == GK1)
{ // vectorize on {GN11, GK1}
if(!(GN11 % dst_vector_lengths[3] == 0))
return false;
}
else
{ // vectorize on {GK1} only
if(!(dst_vector_lengths[3] == 1))
return false;
if(!(GK1 % dst_vector_lengths[4] == 0))
return false;
}
}
// check blockwise GEMM
{
const int BM10 = std::accumulate(compile_param.BM10BN10ThreadClusterBM10Xs.begin(),
compile_param.BM10BN10ThreadClusterBM10Xs.end(),
1,
std::multiplies<int>{});
const int BN10 = std::accumulate(compile_param.BM10BN10ThreadClusterBN10Xs.begin(),
compile_param.BM10BN10ThreadClusterBN10Xs.end(),
1,
std::multiplies<int>{});
if(!(compile_param.BlockSize == BM10 * BN10))
return false;
const int BM = GM0 * GM11;
const int BN = GN0 * GN11;
const int BM1 = BM10 * BM11;
const int BN1 = BN10 * BN11;
if(!(BM % BM1 == 0 && BN % BN1 == 0))
return false;
const int BM0 = BM / BM1;
const int BN0 = BN / BN1;
// blockwise GEMM currently only supports BM0 == 2 && BN0 == 2
if(!(BM0 == 2 && BN0 == 2))
return false;
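// e.g. the first fp32 tunable: BM10 = 8 * 2 = 16, BM1 = 16 * 4 = 64 and
// BM = 1 * 128 = 128, so BM0 == 2 (and symmetrically BN0 == 2), with
// BlockSize == BM10 * BN10 == 256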
if(!(compile_param.GK0PerBlock % compile_param.BK0PerThread == 0))
return false;
}
// check C threadwise copy
{
// {BN11} or {BN} or {BN1} or {GN11} is Dst vector dim
const int dst_vector_len_gn11 = compile_param.CThreadTransferDstScalarPerVector;
// check slice length vs Dst vector length:
if(!(BN11 % dst_vector_len_gn11 == 0 && GN11 % dst_vector_len_gn11 == 0))
return false;
// check Dst memory layout related vectorization:
if(!((Ho * Wo) % compile_param.CThreadTransferDstScalarPerVector == 0))
return false;
}
return true;
}
static int GetBlockSize(const ConvolutionProblemDescriptor&,
const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param)
{
return compile_param.BlockSize;
}
static int GetGridSize(const ConvolutionProblemDescriptor& conv_problem_desc,
const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param)
{
const int N = conv_problem_desc.N;
const int K = conv_problem_desc.K;
const int Ho = conv_problem_desc.Ho;
const int Wo = conv_problem_desc.Wo;
const int N0 = compile_param.GN0;
const int N1 = N / N0;
const int GM1 = K;
const int GN1 = N1 * Ho * Wo;
const int GM11 = compile_param.GM1PerBlockGM11;
const int GN11 = compile_param.GN1PerBlockGN11;
const int GM10 = GM1 / GM11;
const int GN10 = GN1 / GN11;
return GM10 * GN10;
}
static std::size_t GetWorkSpaceSize(const ConvolutionProblemDescriptor&,
const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw&)
{
// workspace is used to save the transformed tensor descriptors created by the prepare kernel
return 4096L;
}
static std::size_t GetMaxWorkSpaceSize(const ConvolutionProblemDescriptor&) { return 4096L; }
static auto GetTunableList()
{
return generate_tunable_list_conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw();
}
};
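// Hedged usage sketch (illustrative only; built solely from this header's
// declarations, and the helper name is hypothetical): pick the default compile
// parameter for a problem and turn it into the flag string handed to the
// online compiler.
//
// inline std::string GetV6r1BuildFlags(const ConvolutionProblemDescriptor& desc)
// {
//     CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw param{};
//     bool ok = false;
//     std::tie(param, ok) =
//         ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetDefaultCompileParameter(desc);
//     return ok ? " -std=c++17 " + param.GetCompileParameterString() : std::string{};
// }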
} // namespace ck_driver
#endif
#ifndef CONV_TUNABLE_FWD_V4R4_DLOPS_NCHW_KCYX_NKHW_HPP
#define CONV_TUNABLE_FWD_V4R4_DLOPS_NCHW_KCYX_NKHW_HPP

#include <array>
struct tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw
{
int BlockSize;
int MPerBlock;
int NPerBlock;
int KPerBlock;
int M1PerThread;
int N1PerThread;
int KPerThread;
int M1N1ThreadClusterM10;
int M1N1ThreadClusterN10;
int M1N1ThreadClusterM11;
int M1N1ThreadClusterN11;
std::array<int, 3> ABlockTransferThreadSliceLengths_K_M0_M1;
std::array<int, 3> ABlockTransferThreadClusterLengths_K_M0_M1;
std::array<int, 3> ABlockTransferThreadClusterArrangeOrder;
std::array<int, 3> ABlockTransferSrcAccessOrder;
int ABlockTransferSrcVectorDim;
int ABlockTransferSrcScalarPerVector;
int ABlockTransferDstScalarPerVector_M1;
bool AThreadTransferSrcResetCoordinateAfterRun;
std::array<int, 3> BBlockTransferThreadSliceLengths_K_N0_N1;
std::array<int, 3> BBlockTransferThreadClusterLengths_K_N0_N1;
std::array<int, 3> BBlockTransferThreadClusterArrangeOrder;
std::array<int, 3> BBlockTransferSrcAccessOrder;
int BBlockTransferSrcVectorDim;
int BBlockTransferSrcScalarPerVector;
int BBlockTransferDstScalarPerVector_N1;
bool BThreadTransferSrcResetCoordinateAfterRun;
std::array<int, 6> CThreadTransferSrcDstAccessOrder;
int CThreadTransferSrcDstVectorDim;
int CThreadTransferDstScalarPerVector;
};
static tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw
default_tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw = {
256, 128, 128, 8, 4, 4, 1,
8, 8, 2, 2, {4, 1, 1}, {2, 1, 128}, {2, 1, 0},
{2, 1, 0}, 0, 4, 1, false, {4, 1, 1}, {2, 1, 128},
{0, 1, 2}, {0, 1, 2}, 2, 1, 1, false, {3, 4, 5, 0, 1, 2},
5, 1};
#endif
#ifndef CONV_TUNABLE_FWD_V4R4_NCHW_KCYX_NKHW_HPP
#define CONV_TUNABLE_FWD_V4R4_NCHW_KCYX_NKHW_HPP

#include <array>
#include <cstdint>
struct tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw
{
int32_t BlockSize;
int32_t MPerBlock;
int32_t NPerBlock;
int32_t KPerBlock;
int32_t M1PerThread;
int32_t N1PerThread;
int32_t KPerThread;
int32_t M1N1ThreadClusterM10;
int32_t M1N1ThreadClusterN10;
int32_t M1N1ThreadClusterM11;
int32_t M1N1ThreadClusterN11;
std::array<int32_t, 3> ABlockTransferThreadSliceLengths_K_M0_M1;
std::array<int32_t, 3> ABlockTransferThreadClusterLengths_K_M0_M1;
std::array<int32_t, 3> ABlockTransferThreadClusterArrangeOrder;
std::array<int32_t, 3> ABlockTransferSrcAccessOrder;
int32_t ABlockTransferSrcVectorDim;
int32_t ABlockTransferSrcScalarPerVector;
int32_t ABlockTransferDstScalarPerVector_M1;
bool AThreadTransferSrcResetCoordinateAfterRun;
std::array<int32_t, 3> BBlockTransferThreadSliceLengths_K_N0_N1;
std::array<int32_t, 3> BBlockTransferThreadClusterLengths_K_N0_N1;
std::array<int32_t, 3> BBlockTransferThreadClusterArrangeOrder;
std::array<int32_t, 3> BBlockTransferSrcAccessOrder;
int32_t BBlockTransferSrcVectorDim;
int32_t BBlockTransferSrcScalarPerVector;
int32_t BBlockTransferDstScalarPerVector_N1;
bool BThreadTransferSrcResetCoordinateAfterRun;
std::array<int32_t, 6> CThreadTransferSrcDstAccessOrder;
int32_t CThreadTransferSrcDstVectorDim;
int32_t CThreadTransferDstScalarPerVector;
};
static tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw default_tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw = {
256, 128, 128, 8, 4, 4, 1,
8, 8, 2, 2, {4, 1, 1}, {2, 1, 128}, {2, 1, 0},
{2, 1, 0}, 0, 4, 1, false, {4, 1, 1}, {2, 1, 128},
{0, 1, 2}, {0, 1, 2}, 2, 1, 1, false, {3, 4, 5, 0, 1, 2},
5, 1};
#endif
@@ -3,40 +3,40 @@
 struct tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
 {
-    int32_t BlockSize;
-    int32_t MPerBlock;
-    int32_t NPerBlock;
-    int32_t KPerBlock;
-    int32_t MPerWave;
-    int32_t NPerWave;
-    int32_t K1;
-    int32_t MRepeat;
-    int32_t NRepeat;
-    std::array<int32_t, 3> ABlockTransferThreadSliceLengths_K0_M_K1;
-    std::array<int32_t, 3> ABlockTransferThreadClusterLengths_K0_M_K1;
-    std::array<int32_t, 3> ABlockTransferThreadClusterArrangeOrder;
-    std::array<int32_t, 3> ABlockTransferSrcAccessOrder;
-    int32_t ABlockTransferSrcVectorDim;
-    int32_t ABlockTransferSrcScalarPerVector;
-    int32_t ABlockTransferDstScalarPerVector_K1;
+    int BlockSize;
+    int MPerBlock;
+    int NPerBlock;
+    int KPerBlock;
+    int MPerWave;
+    int NPerWave;
+    int K1;
+    int MRepeat;
+    int NRepeat;
+    std::array<int, 3> ABlockTransferThreadSliceLengths_K0_M_K1;
+    std::array<int, 3> ABlockTransferThreadClusterLengths_K0_M_K1;
+    std::array<int, 3> ABlockTransferThreadClusterArrangeOrder;
+    std::array<int, 3> ABlockTransferSrcAccessOrder;
+    int ABlockTransferSrcVectorDim;
+    int ABlockTransferSrcScalarPerVector;
+    int ABlockTransferDstScalarPerVector_K1;
     bool AThreadTransferSrcResetCoordinateAfterRun;
-    std::array<int32_t, 3> BBlockTransferThreadSliceLengths_K0_N_K1;
-    std::array<int32_t, 3> BBlockTransferThreadClusterLengths_K0_N_K1;
-    std::array<int32_t, 3> BBlockTransferThreadClusterArrangeOrder;
-    std::array<int32_t, 3> BBlockTransferSrcAccessOrder;
-    int32_t BBlockTransferSrcVectorDim;
-    int32_t BBlockTransferSrcScalarPerVector;
-    int32_t BBlockTransferDstScalarPerVector_K1;
+    std::array<int, 3> BBlockTransferThreadSliceLengths_K0_N_K1;
+    std::array<int, 3> BBlockTransferThreadClusterLengths_K0_N_K1;
+    std::array<int, 3> BBlockTransferThreadClusterArrangeOrder;
+    std::array<int, 3> BBlockTransferSrcAccessOrder;
+    int BBlockTransferSrcVectorDim;
+    int BBlockTransferSrcScalarPerVector;
+    int BBlockTransferDstScalarPerVector_K1;
     bool BThreadTransferSrcResetCoordinateAfterRun;
-    std::array<int32_t, 8> CThreadTransferSrcDstAccessOrder;
-    int32_t CThreadTransferSrcDstVectorDim;
-    int32_t CThreadTransferDstScalarPerVector;
+    std::array<int, 8> CThreadTransferSrcDstAccessOrder;
+    int CThreadTransferSrcDstVectorDim;
+    int CThreadTransferDstScalarPerVector;
 };

 static tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
...
@@ -3,40 +3,40 @@
 struct tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
 {
-    int32_t BlockSize;
-    int32_t MPerBlock;
-    int32_t NPerBlock;
-    int32_t KPerBlock;
-    int32_t MPerWave;
-    int32_t NPerWave;
-    int32_t K1;
-    int32_t MRepeat;
-    int32_t NRepeat;
-    std::array<int32_t, 3> ABlockTransferThreadSliceLengths_K0_M_K1;
-    std::array<int32_t, 3> ABlockTransferThreadClusterLengths_K0_M_K1;
-    std::array<int32_t, 3> ABlockTransferThreadClusterArrangeOrder;
-    std::array<int32_t, 3> ABlockTransferSrcAccessOrder;
-    int32_t ABlockTransferSrcVectorDim;
-    int32_t ABlockTransferSrcScalarPerVector;
-    int32_t ABlockTransferDstScalarPerVector_K1;
+    int BlockSize;
+    int MPerBlock;
+    int NPerBlock;
+    int KPerBlock;
+    int MPerWave;
+    int NPerWave;
+    int K1;
+    int MRepeat;
+    int NRepeat;
+    std::array<int, 3> ABlockTransferThreadSliceLengths_K0_M_K1;
+    std::array<int, 3> ABlockTransferThreadClusterLengths_K0_M_K1;
+    std::array<int, 3> ABlockTransferThreadClusterArrangeOrder;
+    std::array<int, 3> ABlockTransferSrcAccessOrder;
+    int ABlockTransferSrcVectorDim;
+    int ABlockTransferSrcScalarPerVector;
+    int ABlockTransferDstScalarPerVector_K1;
     bool AThreadTransferSrcResetCoordinateAfterRun;
-    std::array<int32_t, 3> BBlockTransferThreadSliceLengths_K0_N_K1;
-    std::array<int32_t, 3> BBlockTransferThreadClusterLengths_K0_N_K1;
-    std::array<int32_t, 3> BBlockTransferThreadClusterArrangeOrder;
-    std::array<int32_t, 3> BBlockTransferSrcAccessOrder;
-    int32_t BBlockTransferSrcVectorDim;
-    int32_t BBlockTransferSrcScalarPerVector;
-    int32_t BBlockTransferDstScalarPerVector_K1;
+    std::array<int, 3> BBlockTransferThreadSliceLengths_K0_N_K1;
+    std::array<int, 3> BBlockTransferThreadClusterLengths_K0_N_K1;
+    std::array<int, 3> BBlockTransferThreadClusterArrangeOrder;
+    std::array<int, 3> BBlockTransferSrcAccessOrder;
+    int BBlockTransferSrcVectorDim;
+    int BBlockTransferSrcScalarPerVector;
+    int BBlockTransferDstScalarPerVector_K1;
     bool BThreadTransferSrcResetCoordinateAfterRun;
-    std::array<int32_t, 8> CThreadTransferSrcDstAccessOrder;
-    int32_t CThreadTransferSrcDstVectorDim;
-    int32_t CThreadTransferDstScalarPerVector;
+    std::array<int, 8> CThreadTransferSrcDstAccessOrder;
+    int CThreadTransferSrcDstVectorDim;
+    int CThreadTransferDstScalarPerVector;
 };

 static tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
...
#ifndef CONV_TUNABLE_FWD_V6R1_NCHW_KCYX_NKHW_HPP
#define CONV_TUNABLE_FWD_V6R1_NCHW_KCYX_NKHW_HPP

#include <array>
#include <cstdint>
struct tunable_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw
{
int32_t BlockSize = 256;
int32_t GN0 = 4;
int32_t GK1 = 1;
int32_t GM1PerBlockGM11 = 128;
int32_t GN1PerBlockGN11 = 32;
int32_t GK0PerBlock = 8;
int32_t BM1PerThreadBM11 = 4;
int32_t BN1PerThreadBN11 = 4;
int32_t BK0PerThread = 1;
int32_t BM10BN10ThreadClusterBM100 = 2;
int32_t BM10BN10ThreadClusterBN100 = 2;
int32_t BM10BN10ThreadClusterBM101 = 8;
int32_t BM10BN10ThreadClusterBN101 = 8;
std::array<int32_t, 5> ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 = {4, 1, 1, 1, 1};
std::array<int32_t, 5> ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 = {
2, 1, 1, 128, 1};
std::array<int32_t, 5> ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = {
4, 1, 1, 1, 1};
std::array<int32_t, 5> ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 = {
1, 1, 1, 1, 1};
std::array<int32_t, 5> BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 = {1, 4, 1, 1, 1};
std::array<int32_t, 5> BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 = {
8, 1, 1, 32, 1};
std::array<int32_t, 5> BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = {
1, 1, 1, 1, 1};
std::array<int32_t, 5> BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 = {
1, 1, 1, 1, 1};
int32_t CThreadTransferDstScalarPerVector = 1;
};
#endif
#ifndef CONVOLUTION_PROBLEM_DESCRIPTOR
#define CONVOLUTION_PROBLEM_DESCRIPTOR

#include <cstddef>
namespace ck_driver {
struct ConvolutionProblemDescriptor
{
ConvolutionProblemDescriptor() = default;
ConvolutionProblemDescriptor(int N_,
int K_,
int C_,
int Y_,
int X_,
int Hi_,
int Wi_,
int Ho_,
int Wo_,
int ConvStrideH_,
int ConvStrideW_,
int ConvDilationH_,
int ConvDilationW_,
int InLeftPadH_,
int InLeftPadW_,
int InRightPadH_,
int InRightPadW_,
ck::DataTypeEnum_t InDataTypeEnum_,
ck::DataTypeEnum_t WeiDataTypeEnum_,
ck::DataTypeEnum_t OutDataTypeEnum_)
: N{N_},
K{K_},
C{C_},
Y{Y_},
X{X_},
Hi{Hi_},
Wi{Wi_},
Ho{Ho_},
Wo{Wo_},
ConvStrideH{ConvStrideH_},
ConvStrideW{ConvStrideW_},
ConvDilationH{ConvDilationH_},
ConvDilationW{ConvDilationW_},
InLeftPadH{InLeftPadH_},
InLeftPadW{InLeftPadW_},
InRightPadH{InRightPadH_},
InRightPadW{InRightPadW_},
InDataTypeEnum{InDataTypeEnum_},
WeiDataTypeEnum{WeiDataTypeEnum_},
OutDataTypeEnum{OutDataTypeEnum_}
{
}
int N;
int K;
int C;
int Y;
int X;
int Hi;
int Wi;
int Ho;
int Wo;
int ConvStrideH;
int ConvStrideW;
int ConvDilationH;
int ConvDilationW;
int InLeftPadH;
int InLeftPadW;
int InRightPadH;
int InRightPadW;
ck::DataTypeEnum_t InDataTypeEnum;
ck::DataTypeEnum_t WeiDataTypeEnum;
ck::DataTypeEnum_t OutDataTypeEnum;
std::size_t CalculateFlop() const { return 2L * N * K * C * Y * X * Ho * Wo; }
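// the factor 2 counts one multiply plus one add per MAC of the direct convolution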
};
} // namespace ck_driver
#endif
+#pragma once
 #include "device.hpp"
 #include "host_tensor.hpp"
 #include "handle.hpp"
@@ -5,24 +6,26 @@
 #include "dynamic_tensor_descriptor.hpp"
 #include "dynamic_tensor_descriptor_helper.hpp"
 #include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp"
-#include "conv_tunable_fwd_v4r4_nchw_kcyx_nkhw.hpp"
+#include "conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp"

 namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw {

 template <typename TInWei, typename TAcc, typename TOut>
 static std::string get_network_config_string_from_types()
 {
+    using namespace ck;
+
     std::string out;

-    out += static_cast<char>(Driver::get_typeid_from_type<TInWei>()) +
-           static_cast<char>(Driver::get_typeid_from_type<TAcc>()) +
-           static_cast<char>(Driver::get_typeid_from_type<TOut>());
+    out += std::to_string(get_datatype_enum_from_type<TInWei>::value) + "_" +
+           std::to_string(get_datatype_enum_from_type<TAcc>::value) + "_" +
+           std::to_string(get_datatype_enum_from_type<TOut>::value);

     return (out);
 };

 static std::string
-get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw* pt)
+get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* pt)
 {
     std::string out("TUN_");
@@ -95,17 +98,20 @@
 template <typename TInWei, typename TAcc, typename TOut>
 static std::string get_definition_string_from_types()
 {
+    using namespace ck;
+
     std::string out;

-    out += " -DCK_PARAM_IN_WEI_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TInWei>()) +
-           " -DCK_PARAM_CONV_COMPTYPE=" + std::to_string(Driver::get_typeid_from_type<TAcc>()) +
-           " -DCK_PARAM_OUT_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TOut>());
+    out +=
+        " -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TInWei>::value) +
+        " -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TAcc>::value) +
+        " -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TOut>::value);

     return (out);
 };

 static std::string
-get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw* pt)
+get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* pt)
 {
     std::string out;
@@ -209,7 +215,7 @@
           typename ConvDilations,
           typename InLeftPads,
           typename InRightPads>
-void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(
+void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
     olCompile::Handle* handle,
     const InLengths& in_n_c_hi_wi_lengths,
     const WeiLengths& wei_k_c_y_x_lengths,
@@ -221,10 +227,11 @@
     const Tensor<TInWei>& in_n_c_hi_wi,
     const Tensor<TInWei>& wei_k_c_y_x,
     Tensor<TOut>& out_n_k_ho_wo,
-    const tunable_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw* tunable,
+    const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* tunable,
     ck::index_t nrepeat)
 {
     using namespace ck;
+    using namespace ck_driver;
     using namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw;

     using size_t = std::size_t;
@@ -288,8 +295,9 @@
     const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable->BlockSize), 1, 1};
     const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable->BlockSize), 1, 1};

-    std::string program_name = "dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.cpp";
-    std::string algo_name    = "implicit_gemm_conv_fwd_v4r4_nchw";
+    std::string program_name =
+        "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp";
+    std::string algo_name = "implicit_gemm_conv_fwd_v4r4_dlops_nchw";

     std::string param = " -std=c++17 ";
     std::string network_config;
@@ -311,7 +319,7 @@
     KernelTimer timer1, timer2;
     std::string kernel_name;

-    kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw_prepare";
+    kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare";
     auto network_config_1 = network_config + "_1";

     timer1.Start();
@@ -337,7 +345,7 @@
         c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf);
     timer1.End();

-    kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw";
+    kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw";
     auto network_config_2 = network_config + "_2";

     timer2.Start();
@@ -356,8 +364,14 @@
     }

     {
-        auto ave_time1 = Driver::get_effective_average(kernel1_times);
-        auto ave_time2 = Driver::get_effective_average(kernel2_times);
+        auto ave_time1 =
+            std::accumulate(
+                std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
+            (nrepeat - 1);
+        auto ave_time2 =
+            std::accumulate(
+                std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
+            (nrepeat - 1);

         const auto N = in_n_c_hi_wi_lengths[I0];
         const auto C = in_n_c_hi_wi_lengths[I1];
...
@@ -11,11 +11,13 @@ namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw {
 template <typename TInWei, typename TAcc, typename TOut>
 static std::string get_network_config_string_from_types()
 {
+    using namespace ck;
+
     std::string out;

-    out += static_cast<char>(Driver::get_typeid_from_type<TInWei>()) +
-           static_cast<char>(Driver::get_typeid_from_type<TAcc>()) +
-           static_cast<char>(Driver::get_typeid_from_type<TOut>());
+    out += std::to_string(get_datatype_enum_from_type<TInWei>::value) + "_" +
+           std::to_string(get_datatype_enum_from_type<TAcc>::value) + "_" +
+           std::to_string(get_datatype_enum_from_type<TOut>::value);

     return (out);
 };
@@ -93,11 +95,14 @@
 template <typename TInWei, typename TAcc, typename TOut>
 static std::string get_definition_string_from_types()
 {
+    using namespace ck;
+
     std::string out;

-    out += " -DCK_PARAM_IN_WEI_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TInWei>()) +
-           " -DCK_PARAM_CONV_COMPTYPE=" + std::to_string(Driver::get_typeid_from_type<TAcc>()) +
-           " -DCK_PARAM_OUT_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TOut>());
+    out +=
+        " -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TInWei>::value) +
+        " -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TAcc>::value) +
+        " -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TOut>::value);

     return (out);
 };
@@ -222,6 +227,7 @@
     ck::index_t nrepeat)
 {
     using namespace ck;
+    using namespace ck_driver;
     using namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw;

     using size_t = std::size_t;
@@ -349,8 +355,14 @@
     }

     {
-        auto ave_time1 = Driver::get_effective_average(kernel1_times);
-        auto ave_time2 = Driver::get_effective_average(kernel2_times);
+        auto ave_time1 =
+            std::accumulate(
+                std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
+            (nrepeat - 1);
+        auto ave_time2 =
+            std::accumulate(
+                std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
+            (nrepeat - 1);

         const auto N = in_n_c_hi_wi_lengths[I0];
         const auto C = in_n_c_hi_wi_lengths[I1];
...
@@ -12,11 +12,13 @@ namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk {
 template <typename TInWei, typename TAcc, typename TOut>
 static std::string get_network_config_string_from_types()
 {
+    using namespace ck;
+
     std::string out;

-    out += static_cast<char>(Driver::get_typeid_from_type<TInWei>()) +
-           static_cast<char>(Driver::get_typeid_from_type<TAcc>()) +
-           static_cast<char>(Driver::get_typeid_from_type<TOut>());
+    out += std::to_string(get_datatype_enum_from_type<TInWei>::value) + "_" +
+           std::to_string(get_datatype_enum_from_type<TAcc>::value) + "_" +
+           std::to_string(get_datatype_enum_from_type<TOut>::value);

     return (out);
 };
@@ -94,11 +96,14 @@
 template <typename TInWei, typename TAcc, typename TOut>
 static std::string get_definition_string_from_types()
 {
+    using namespace ck;
+
     std::string out;

-    out += " -DCK_PARAM_IN_WEI_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TInWei>()) +
-           " -DCK_PARAM_CONV_COMPTYPE=" + std::to_string(Driver::get_typeid_from_type<TAcc>()) +
-           " -DCK_PARAM_OUT_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TOut>());
+    out +=
+        " -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TInWei>::value) +
+        " -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TAcc>::value) +
+        " -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TOut>::value);

     return (out);
 };
@@ -302,15 +307,16 @@
     std::vector<float> kernel1_times;
     std::vector<float> kernel2_times;

-    KernelTimer timer1, timer2;
-    std::string kernel_name;
-
-    kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_prepare";
-    auto network_config_1 = network_config + "_1";
-
-    timer1.Start();
     for(index_t i = 0; i < nrepeat; ++i)
     {
+        KernelTimer timer1, timer2;
+        std::string kernel_name;
+
+        kernel_name =
+            "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_prepare";
+        auto network_config_1 = network_config + "_1";
+
+        timer1.Start();
         handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)(
             static_cast<index_t>(in_n_hi_wi_c_lengths[I0]),
             static_cast<index_t>(in_n_hi_wi_c_lengths[I1]),
@@ -331,15 +337,12 @@
             b_k0_n_k1_grid_desc_dev_buf,
             c_m0_m1_m2_n_grid_desc_dev_buf,
             c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf);
-    }
-    timer1.End();
+        timer1.End();

         kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk";
         auto network_config_2 = network_config + "_2";

         timer2.Start();
-    for(index_t i = 0; i < nrepeat; ++i)
-    {
         handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)(
             reinterpret_cast<const TInWei*>(in_n_hi_wi_c_dev_buf.GetDeviceBuffer()),
             reinterpret_cast<const TInWei*>(wei_k_y_x_c_dev_buf.GetDeviceBuffer()),
@@ -348,12 +351,21 @@
             (const void*)(b_k0_n_k1_grid_desc_dev_buf),
             (const void*)(c_m0_m1_m2_n_grid_desc_dev_buf),
             (const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf));
+        timer2.End();
+
+        kernel1_times.push_back(timer1.GetElapsedTime());
+        kernel2_times.push_back(timer2.GetElapsedTime());
     }
-    timer2.End();

     {
-        auto ave_time1 = timer1.GetElapsedTime() / nrepeat;
-        auto ave_time2 = timer2.GetElapsedTime() / nrepeat;
+        auto ave_time1 =
+            std::accumulate(
+                std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
+            (nrepeat - 1);
+        auto ave_time2 =
+            std::accumulate(
+                std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
+            (nrepeat - 1);

         const auto N = in_n_hi_wi_c_lengths[I0];
         const auto C = in_n_hi_wi_c_lengths[I3];
...
#pragma once

#include <numeric>

#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "convolution_problem_descriptor.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp"
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
olCompile::Handle* handle,
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
const OutLengths& out_n_k_ho_wo_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c_hi_wi,
const Tensor<TInWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
const ck_driver::CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param,
ck::index_t nrepeat)
{
using namespace ck;
using namespace ck_driver;
using size_t = std::size_t;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
ConvolutionProblemDescriptor conv_problem_desc{in_n_c_hi_wi_lengths[I0],
out_n_k_ho_wo_lengths[I1],
in_n_c_hi_wi_lengths[I1],
wei_k_c_y_x_lengths[I2],
wei_k_c_y_x_lengths[I3],
in_n_c_hi_wi_lengths[I2],
in_n_c_hi_wi_lengths[I3],
out_n_k_ho_wo_lengths[I2],
out_n_k_ho_wo_lengths[I3],
conv_strides[I0],
conv_strides[I1],
conv_dilations[I0],
conv_dilations[I1],
in_left_pads[I0],
in_left_pads[I1],
in_right_pads[I0],
in_right_pads[I1],
get_datatype_enum_from_type<TInWei>::value,
get_datatype_enum_from_type<TInWei>::value,
get_datatype_enum_from_type<TOut>::value};
if(!ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::IsValidCompileParameter(conv_problem_desc,
compile_param))
{
throw std::runtime_error("wrong! IsValidCompileParameter fail");
}
DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data());
out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data());
// the workspace saves the transformed tensor descriptors created by the prepare kernel
DeviceMem workspace_dev_buf(
ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetWorkSpaceSize(conv_problem_desc, compile_param));
const auto block_size = std::size_t(
ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetBlockSize(conv_problem_desc, compile_param));
const auto grid_size = std::size_t(
ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetGridSize(conv_problem_desc, compile_param));
const std::vector<size_t> vld1 = {1, 1, 1};
const std::vector<size_t> vgd1 = {1, 1, 1};
const std::vector<size_t> vld2 = {static_cast<size_t>(block_size), 1, 1};
const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * block_size), 1, 1};
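// launch configs: the prepare kernel runs as a single work-item (vld1/vgd1),
// while the main kernel uses block_size work-items per block across grid_size blocks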
std::string program_name =
"dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp";
std::string algo_name = "implicit_gemm_conv_fwd_v6r1_dlops_nchw";
std::string compile_param_string = " -std=c++17 " + compile_param.GetCompileParameterString();
std::string network_config = compile_param_string;
std::vector<float> kernel1_times;
std::vector<float> kernel2_times;
for(index_t i = 0; i < nrepeat; ++i)
{
KernelTimer timer1, timer2;
std::string kernel_name;
kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare";
auto network_config_1 = network_config + "_1";
timer1.Start();
handle->AddKernel(algo_name,
network_config_1,
program_name,
kernel_name,
vld1,
vgd1,
compile_param_string)(static_cast<index_t>(in_n_c_hi_wi_lengths[I0]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I1]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I2]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I3]),
static_cast<index_t>(wei_k_c_y_x_lengths[I0]),
static_cast<index_t>(wei_k_c_y_x_lengths[I2]),
static_cast<index_t>(wei_k_c_y_x_lengths[I3]),
conv_strides[I0],
conv_strides[I1],
conv_dilations[I0],
conv_dilations[I1],
in_left_pads[I0],
in_left_pads[I1],
in_right_pads[I0],
in_right_pads[I1],
(void*)(workspace_dev_buf.GetDeviceBuffer()));
timer1.End();
kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw";
auto network_config_2 = network_config + "_2";
timer2.Start();
handle->AddKernel(algo_name,
network_config_2,
program_name,
kernel_name,
vld2,
vgd2,
compile_param_string)(
reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
(const void*)(workspace_dev_buf.GetDeviceBuffer()));
timer2.End();
kernel1_times.push_back(timer1.GetElapsedTime());
kernel2_times.push_back(timer2.GetElapsedTime());
}
{
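// the first sample is dropped: on the first iteration AddKernel typically pays
// the one-time online-compilation cost, which would skew the average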
auto ave_time1 =
std::accumulate(
std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
(nrepeat - 1);
auto ave_time2 =
std::accumulate(
std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
(nrepeat - 1);
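// CalculateFlop() / 1e9 / time-in-ms == TFlop/s (1 GFlop per ms is 1 TFlop/s)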
float perf = (float)(conv_problem_desc.CalculateFlop()) /
(std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2);
std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
<< ave_time2 << "), " << perf << " TFlop/s" << std::endl;
};
// copy result back to host
out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data());
}
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "conv_tunable_fwd_v6r1_nchw_kcyx_nkhw.hpp"
namespace detail_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw {
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_network_config_string_from_types()
{
std::string out("DAT_");
// append each typeid character separately: summing three chars, as before,
// collapses them into a single (overflowed) character
out += static_cast<char>(Driver::get_typeid_from_type<TInWei>());
out += static_cast<char>(Driver::get_typeid_from_type<TAcc>());
out += static_cast<char>(Driver::get_typeid_from_type<TOut>());
return (out);
};
static std::string
get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw& tunable)
{
std::string out("TUN_");
out += std::to_string(tunable.BlockSize) + "_";
out += std::to_string(tunable.GN0) + "x" + std::to_string(tunable.GK1) + "_";
out += std::to_string(tunable.GM1PerBlockGM11) + "x" + std::to_string(tunable.GN1PerBlockGN11) +
"x" + std::to_string(tunable.GK0PerBlock) + "_";
out += std::to_string(tunable.BM1PerThreadBM11) + "x" +
std::to_string(tunable.BN1PerThreadBN11) + "x" + std::to_string(tunable.BK0PerThread) +
"_";
out += std::to_string(tunable.BM10BN10ThreadClusterBM100) + "x" +
std::to_string(tunable.BM10BN10ThreadClusterBN100) + "x" +
std::to_string(tunable.BM10BN10ThreadClusterBM101) + "x" +
std::to_string(tunable.BM10BN10ThreadClusterBN101) + "_";
out += std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[0]) + "x" +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[1]) + "x" +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[2]) + "x" +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[3]) + "x" +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[4]) + "_";
out +=
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[0]) + "x" +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[1]) + "x" +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[2]) + "x" +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[3]) + "x" +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[4]) + "_";
out += std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) +
"x" +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) +
"x" +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) +
"x" +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) +
"x" +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]) +
"_";
out += std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) +
"x" +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) +
"x" +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) +
"x" +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) +
"x" +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]) +
"_";
out += std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[0]) + "x" +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[1]) + "x" +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[2]) + "x" +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[3]) + "x" +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[4]) + "_";
out +=
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[0]) + "x" +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[1]) + "x" +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[2]) + "x" +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[3]) + "x" +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[4]) + "_";
out += std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) +
"x" +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) +
"x" +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) +
"x" +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) +
"x" +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]) +
"_";
out += std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) +
"x" +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) +
"x" +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) +
"x" +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) +
"x" +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]) +
"_";
out += std::to_string(tunable.CThreadTransferDstScalarPerVector);
return (out);
};
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_definition_string_from_types()
{
std::string out;
out += " -DCK_PARAM_IN_WEI_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TInWei>()) +
" -DCK_PARAM_ACC_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TAcc>()) +
" -DCK_PARAM_OUT_DATATYPE=" + std::to_string(Driver::get_typeid_from_type<TOut>());
return (out);
};
static std::string
get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw& tunable)
{
std::string out;
out += " -DCK_PARAM_BlockSize=" + std::to_string(tunable.BlockSize);
out += " -DCK_PARAM_GN0=" + std::to_string(tunable.GN0);
out += " -DCK_PARAM_GK1=" + std::to_string(tunable.GK1);
out += " -DCK_PARAM_GM1PerBlockGM11=" + std::to_string(tunable.GM1PerBlockGM11) +
" -DCK_PARAM_GN1PerBlockGN11=" + std::to_string(tunable.GN1PerBlockGN11) +
" -DCK_PARAM_GK0PerBlock=" + std::to_string(tunable.GK0PerBlock);
out += " -DCK_PARAM_BM1PerThreadBM11=" + std::to_string(tunable.BM1PerThreadBM11) +
" -DCK_PARAM_BN1PerThreadBN11=" + std::to_string(tunable.BN1PerThreadBN11) +
" -DCK_PARAM_BK0PerThread=" + std::to_string(tunable.BK0PerThread);
out += " -DCK_PARAM_BM10BN10ThreadClusterBM100=" +
std::to_string(tunable.BM10BN10ThreadClusterBM100) +
" -DCK_PARAM_BM10BN10ThreadClusterBN100=" +
std::to_string(tunable.BM10BN10ThreadClusterBN100) +
" -DCK_PARAM_BM10BN10ThreadClusterBM101=" +
std::to_string(tunable.BM10BN10ThreadClusterBM101) +
" -DCK_PARAM_BM10BN10ThreadClusterBN101=" +
std::to_string(tunable.BM10BN10ThreadClusterBN101);
out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[0]) + "," +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," +
std::to_string(tunable.ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1[4]);
out +=
" -DCK_PARAM_ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[0]) + "," +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[1]) + "," +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[2]) + "," +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[3]) + "," +
std::to_string(tunable.ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1[4]);
out += " -DCK_PARAM_ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) +
"," +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) +
"," +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) +
"," +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) +
"," +
std::to_string(tunable.ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]);
out += " -DCK_PARAM_ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1=" +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[0]) +
"," +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[1]) +
"," +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[2]) +
"," +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[3]) +
"," +
std::to_string(tunable.ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1[4]);
out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," +
std::to_string(tunable.BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1[4]);
out +=
" -DCK_PARAM_BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[0]) + "," +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[1]) + "," +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[2]) + "," +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[3]) + "," +
std::to_string(tunable.BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1[4]);
out += " -DCK_PARAM_BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) +
"," +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) +
"," +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) +
"," +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) +
"," +
std::to_string(tunable.BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]);
out += " -DCK_PARAM_BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1=" +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[0]) +
"," +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[1]) +
"," +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[2]) +
"," +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[3]) +
"," +
std::to_string(tunable.BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1[4]);
out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
std::to_string(tunable.CThreadTransferDstScalarPerVector);
return (out);
};
} // namespace detail_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void online_device_dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw(
olCompile::Handle* handle,
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
const OutLengths& out_n_k_ho_wo_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c_hi_wi,
const Tensor<TInWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
const tunable_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw& tunable,
ck::index_t nrepeat)
{
using namespace ck;
using namespace detail_dyn_conv_fwd_v6r1_nchw_kcyx_nkhw;
using size_t = std::size_t;
////////////////////////////////////////////////////////////////////////////////////////////////////////////
// The following code is only used to compute grid_size, hasMainKBlockLoop and
// hasDoubleTailKBlockLoop
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const auto in_n_c_hi_wi_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
const auto wei_k_c_y_x_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
const auto out_n_k_ho_wo_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);
const auto descs =
transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
tunable.GN0,
tunable.GK1);
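// descs bundles the transformed contraction descriptors; only A (descs[I0], for GK)
// and C (descs[I2], for GM1/GN1) are inspected on the host here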
const auto a_grid_desc_gk0_gm0_gm1_gk1 = descs[I0];
const auto c_grid_desc_gm0_gm1_gn0_gn1 = descs[I2];
const auto GM1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I1);
const auto GN1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I3);
const auto GK = a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I0);
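// one workgroup per (GM11, GN11) output tile; the two flags below describe the
// double-buffered K loop: whether a main loop runs at all, and whether
// GK / GK0PerBlock being even leaves a double-tail iteration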
const index_t grid_size = (GM1 / tunable.GM1PerBlockGM11) * (GN1 / tunable.GN1PerBlockGN11);
const bool hasMainKBlockLoop = ((GK + tunable.GK0PerBlock) / (2 * tunable.GK0PerBlock) > 1);
const bool hasDoubleTailKBlockLoop = ((GK / tunable.GK0PerBlock) % 2 == 0);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// these buffers are usually provided by the user application
DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data());
out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data());
// these are workspace buffers that should be expressed to the user by the corresponding
// workspace API
DeviceMem workspace_buf(4096);
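// the 4 KiB workspace is split into four 1 KiB slots (offsets 0/1024/2048/3072),
// one per transformed descriptor written by the prepare kernel below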
void* a_grid_desc_gk0_gm0_gm10_gm11_gk1_dev_buf = workspace_buf.GetDeviceBuffer();
void* b_grid_desc_gk0_gn0_gn10_gn11_gk1_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 1024);
void* c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 2048);
void* c_grid_block_cluster_blockid_to_gm10_gn10_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 3072);
const std::vector<size_t> vld = {static_cast<size_t>(tunable.BlockSize), 1, 1};
const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable.BlockSize), 1, 1};
const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable.BlockSize), 1, 1};
std::string program_name = "dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw.cpp";
std::string algo_name = "implicit_gemm_conv_fwd_v6r1_nchw";
std::string param = " -std=c++17 ";
std::string network_config;
param += get_definition_string_from_types<TInWei, TAcc, TOut>() +
" -DCK_PARAM_HAS_MAIN_KBLOCK_LOOP=" + std::to_string(hasMainKBlockLoop) +
" -DCK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP=" + std::to_string(hasDoubleTailKBlockLoop) +
get_definition_string_from_tunable(tunable);
network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_" +
std::to_string(hasDoubleTailKBlockLoop) + "_" +
get_network_config_string_from_tunable(tunable);
std::vector<float> kernel1_times;
std::vector<float> kernel2_times;
for(index_t i = 0; i < nrepeat; ++i)
{
KernelTimer timer1, timer2;
std::string kernel_name;
kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw_prepare";
auto network_config_1 = network_config + "_1";
timer1.Start();
handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)(
static_cast<index_t>(in_n_c_hi_wi_lengths[I0]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I1]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I2]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I3]),
static_cast<index_t>(wei_k_c_y_x_lengths[I0]),
static_cast<index_t>(wei_k_c_y_x_lengths[I2]),
static_cast<index_t>(wei_k_c_y_x_lengths[I3]),
conv_strides[I0],
conv_strides[I1],
conv_dilations[I0],
conv_dilations[I1],
in_left_pads[I0],
in_left_pads[I1],
in_right_pads[I0],
in_right_pads[I1],
a_grid_desc_gk0_gm0_gm10_gm11_gk1_dev_buf,
b_grid_desc_gk0_gn0_gn10_gn11_gk1_dev_buf,
c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1_dev_buf,
c_grid_block_cluster_blockid_to_gm10_gn10_dev_buf);
timer1.End(); // ends the prepare-kernel timer (timer2 has not started yet)
kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw";
auto network_config_2 = network_config + "_2";
timer2.Start();
handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)(
reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
(const void*)(a_grid_desc_gk0_gm0_gm10_gm11_gk1_dev_buf),
(const void*)(b_grid_desc_gk0_gn0_gn10_gn11_gk1_dev_buf),
(const void*)(c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1_dev_buf),
(const void*)(c_grid_block_cluster_blockid_to_gm10_gn10_dev_buf));
timer2.End();
kernel1_times.push_back(timer1.GetElapsedTime());
kernel2_times.push_back(timer2.GetElapsedTime());
}
{
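// get_effective_average drops the largest (slowest) sample and averages the rest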
auto ave_time1 = Driver::get_effective_average(kernel1_times);
auto ave_time2 = Driver::get_effective_average(kernel2_times);
const auto N = in_n_c_hi_wi_lengths[I0];
const auto C = in_n_c_hi_wi_lengths[I1];
const auto K = out_n_k_ho_wo_lengths[I1];
const auto Ho = out_n_k_ho_wo_lengths[I2];
const auto Wo = out_n_k_ho_wo_lengths[I3];
const auto Y = wei_k_c_y_x_lengths[I2];
const auto X = wei_k_c_y_x_lengths[I3];
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2);
std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
<< ave_time2 << "), " << perf << " TFlop/s" << std::endl;
};
// copy result back to host
out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data());
}
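For concreteness, the helpers above compose one flag string that is handed to the online compiler through AddKernel. For float in/acc/out (typeid 'F' == 70) it begins roughly as follows; the tunable values shown are hypothetical:

 -std=c++17 -DCK_PARAM_IN_WEI_DATATYPE=70 -DCK_PARAM_ACC_DATATYPE=70 -DCK_PARAM_OUT_DATATYPE=70
 -DCK_PARAM_HAS_MAIN_KBLOCK_LOOP=1 -DCK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP=0
 -DCK_PARAM_BlockSize=256 -DCK_PARAM_GN0=4 -DCK_PARAM_GK1=1
 -DCK_PARAM_GM1PerBlockGM11=128 -DCK_PARAM_GN1PerBlockGN11=128 -DCK_PARAM_GK0PerBlock=8 ...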
-#ifndef OLC_DRIVER_COMMON_HPP
-#define OLC_DRIVER_COMMON_HPP
-
-#include <half.hpp>
-#include <vector>
-#include <cassert>
-
-// this enumerate should be synchronized with include/miopen.h
-typedef enum {
-    appHalf     = 0,
-    appFloat    = 1,
-    appInt32    = 2,
-    appInt8     = 3,
-    appInt8x4   = 4,
-    appBFloat16 = 5,
-    appDouble   = 6,
-} appDataType_t;
-
-namespace Driver {
-
-template <appDataType_t typeNum>
-struct get_type_from_type_enum
-{
-    using type = float;
-};
-
-template <>
-struct get_type_from_type_enum<appHalf>
-{
-    using type = half_float::half;
-};
-
-template <>
-struct get_type_from_type_enum<appFloat>
-{
-    using type = float;
-};
-
-template <>
-struct get_type_from_type_enum<appDouble>
-{
-    using type = double;
-};
-
-template <>
-struct get_type_from_type_enum<appInt32>
-{
-    using type = int;
-};
-
-static inline int get_typeid_from_type_enum(appDataType_t t)
-{
-    switch(t)
-    {
-    case appHalf: return (static_cast<int>('H'));
-    case appFloat: return (static_cast<int>('F'));
-    case appBFloat16: return (static_cast<int>('B'));
-    case appDouble: return (static_cast<int>('D'));
-    case appInt8:
-    case appInt8x4:
-    case appInt32: return (static_cast<int>('O'));
-    default: throw std::runtime_error("Only float, half, bfloat16 data type is supported."); break;
-    };
-};
-
-template <typename T>
-static inline int get_typeid_from_type()
-{
-    throw std::runtime_error("Unsupported typeid conversion for this type!");
-};
-
-template <>
-inline int get_typeid_from_type<float>()
-{
-    return (static_cast<int>('F'));
-};
-
-template <>
-inline int get_typeid_from_type<half_float::half>()
-{
-    return (static_cast<int>('H'));
-};
-
-template <>
-inline int get_typeid_from_type<double>()
-{
-    return (static_cast<int>('D'));
-};
-
-static inline float get_effective_average(std::vector<float>& values)
-{
-    assert(!values.empty());
-
-    if(values.size() == 1)
-        return (values[0]);
-    else
-    {
-        float sum    = 0.0f;
-        float maxVal = 0.0f;
-
-        for(const auto val : values)
-        {
-            if(maxVal < val)
-                maxVal = val;
-            sum += val;
-        };
-
-        return ((sum - maxVal) / (values.size() - 1));
-    };
-};
-
-} // namespace Driver
-
-#endif
+#ifndef ONLINE_DRIVER_COMMON_HPP
+#define ONLINE_DRIVER_COMMON_HPP
+
+#include <type_traits> // added editorially: the variadic gcd below uses std::enable_if
+
+namespace ck_driver {
+
+// greatest common divisor, aka highest common factor
+inline int gcd(int x, int y)
+{
+    if(x < 0)
+    {
+        return gcd(-x, y);
+    }
+    else if(y < 0)
+    {
+        return gcd(x, -y);
+    }
+    else if(x == y || x == 0)
+    {
+        return y;
+    }
+    else if(y == 0)
+    {
+        return x;
+    }
+    else if(x > y)
+    {
+        return gcd(x % y, y);
+    }
+    else
+    {
+        return gcd(x, y % x);
+    }
+}
+
+template <typename X,
+          typename... Ys,
+          typename std::enable_if<sizeof...(Ys) >= 2, bool>::type = false>
+auto gcd(X x, Ys... ys)
+{
+    return gcd(x, gcd(ys...));
+}
+
+} // namespace ck_driver
+
+#endif
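A quick usage sketch for the gcd helpers above (standalone; assumes the header as reconstructed here):

#include <iostream>
#include "online_driver_common.hpp"

int main()
{
    std::cout << ck_driver::gcd(24, 36) << '\n';        // pairwise form: 12
    std::cout << ck_driver::gcd(24, 36, 60, 8) << '\n'; // variadic fold: 4
    return 0;
}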
@@ -77,6 +77,7 @@ message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}")
 ## HIP_COMPILER_FLAGS will be used for on-line compiling of the HIP kernels
 add_definitions("-DHIP_COMPILER_FLAGS=${HIP_COMPILER_FLAGS}")
+set(HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS} ${HIP_ONLINE_COMPILER_FLAGS}")
 file(GLOB_RECURSE COMPOSABLE_KERNEL_INCLUDE_1 "${PROJECT_SOURCE_DIR}/composable_kernel/include/*/*.hpp")
 file(GLOB COMPOSABLE_KERNEL_INCLUDE_2 "${PROJECT_SOURCE_DIR}/external/rocm/include/bfloat16_dev.hpp")
...
@@ -6,21 +6,16 @@ rm -rf CMakeFiles
 MY_PROJECT_SOURCE=../../../
 MY_PROJECT_INSTALL=../install.dir

 cmake \
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
 -D CMAKE_BUILD_TYPE=Release \
--D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx908 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD" \
+-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX906 -O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD" \
+-D HIP_ONLINE_COMPILER_FLAGS="-DCK_AMD_GPU_GFX906" \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
 -D CMAKE_PREFIX_PATH=/opt/rocm \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
 ${MY_PROJECT_SOURCE}

-#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
-#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
-#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0" \
-#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps=$CWD" \
-#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -v -gline-tables-only -save-temps=$CWD" \
 #CXX_FLAG_TMP=-Weverything
 # -Wno-c++98-compat \
 # -Wno-c++98-compat-pedantic \
...