"...composable_kernel.git" did not exist on "504e97e382ad941744b19245fa0ae950b3b487e0"
Commit 40b7d356 authored by Jing Zhang's avatar Jing Zhang
Browse files

unify static and dynamic v5r1

parent 5e627be5
...@@ -53,16 +53,15 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad ...@@ -53,16 +53,15 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
constexpr auto I3 = Number<3>{}; constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{}; constexpr auto I4 = Number<4>{};
const auto N = in_n_c_hi_wi_global_desc.GetLength(I0); const auto N = in_n_c_hi_wi_global_desc.GetLength(I0);
const auto C = in_n_c_hi_wi_global_desc.GetLength(I1); const auto C = in_n_c_hi_wi_global_desc.GetLength(I1);
const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
const auto Hi = in_n_c_hi_wi_global_desc.GetLength(I2); const auto Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
const auto Wi = in_n_c_hi_wi_global_desc.GetLength(I3); const auto Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2); const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3); const auto Wo = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4); const auto K1 = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
const auto K = wei_k_c_y_x_global_desc.GetLength(I0); const auto K = wei_k_c_y_x_global_desc.GetLength(I0);
......
...@@ -38,7 +38,7 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad ...@@ -38,7 +38,7 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
typename InRightPads> typename InRightPads>
__host__ void Run(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc, __host__ void Run(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc, const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
const DynamicTensorDescriptor<Out...>& out_n_k0_ho_wo_global_desc, const DynamicTensorDescriptor<Out...>& out_n_k0_ho_wo_k1_global_desc,
const ConvStrides& conv_strides, const ConvStrides& conv_strides,
const ConvDilations& conv_dilations, const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads, const InLeftPads& in_left_pads,
...@@ -53,15 +53,16 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad ...@@ -53,15 +53,16 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
constexpr auto I3 = Number<3>{}; constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{}; constexpr auto I4 = Number<4>{};
const auto N_ = in_n_c_hi_wi_global_desc.GetLength(I0); const auto N_ = in_n_c_hi_wi_global_desc.GetLength(I0);
const auto C_ = in_n_c_hi_wi_global_desc.GetLength(I1); const auto C_ = in_n_c_hi_wi_global_desc.GetLength(I1);
const auto K0_ = out_n_k0_ho_wo_global_desc.GetLength(I1);
const auto Hi_ = in_n_c_hi_wi_global_desc.GetLength(I2); const auto Hi_ = in_n_c_hi_wi_global_desc.GetLength(I2);
const auto Wi_ = in_n_c_hi_wi_global_desc.GetLength(I3); const auto Wi_ = in_n_c_hi_wi_global_desc.GetLength(I3);
const auto Ho_ = out_n_k0_ho_wo_global_desc.GetLength(I2); const auto K0_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
const auto Wo_ = out_n_k0_ho_wo_global_desc.GetLength(I3); const auto Ho_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
const auto Wo_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I3);
const auto K1_ = out_n_k0_ho_wo_k1_global_desc.GetLength(I4);
const auto K_ = wei_k_c_y_x_global_desc.GetLength(I0); const auto K_ = wei_k_c_y_x_global_desc.GetLength(I0);
const auto Y_ = wei_k_c_y_x_global_desc.GetLength(I2); const auto Y_ = wei_k_c_y_x_global_desc.GetLength(I2);
...@@ -70,6 +71,7 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad ...@@ -70,6 +71,7 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
constexpr auto N = Number<N_>{}; constexpr auto N = Number<N_>{};
constexpr auto C = Number<C_>{}; constexpr auto C = Number<C_>{};
constexpr auto K0 = Number<K0_>{}; constexpr auto K0 = Number<K0_>{};
constexpr auto K1 = Number<K1_>{};
constexpr auto Hi = Number<Hi_>{}; constexpr auto Hi = Number<Hi_>{};
constexpr auto Wi = Number<Wi_>{}; constexpr auto Wi = Number<Wi_>{};
...@@ -168,12 +170,12 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad ...@@ -168,12 +170,12 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
// output tensor // output tensor
const auto out_k_n_hop_wop_global_desc = transform_dynamic_tensor_descriptor( const auto out_k_n_hop_wop_global_desc = transform_dynamic_tensor_descriptor(
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo)), make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1)),
make_tuple(make_pass_through_transform(K0), make_tuple(make_merge_transform(make_tuple(K0, K1)),
make_pass_through_transform(N), make_pass_through_transform(N),
make_pad_transform(Ho, I0, OutRightPadH), make_pad_transform(Ho, I0, OutRightPadH),
make_pad_transform(Wo, I0, OutRightPadW)), make_pad_transform(Wo, I0, OutRightPadW)),
make_tuple(Sequence<1>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<1, 4>{}, Sequence<0>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
static_assert(out_k_n_hop_wop_global_desc.IsKnownAtCompileTime(), static_assert(out_k_n_hop_wop_global_desc.IsKnownAtCompileTime(),
...@@ -212,11 +214,11 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad ...@@ -212,11 +214,11 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor // hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
// hack for NKHW format // hack for NKHW format
constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks =
make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{}, make_tuple(make_tuple(Sequence<0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}), Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{}, make_tuple(Sequence<0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{})); Sequence<0, 0, 0, 0, 0>{}));
...@@ -369,7 +371,7 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad ...@@ -369,7 +371,7 @@ struct DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc, float perf = (float)calculate_convolution_flops(in_n_c_hi_wi_global_desc,
wei_k_c_y_x_global_desc, wei_k_c_y_x_global_desc,
out_n_k0_ho_wo_global_desc) / out_n_k0_ho_wo_k1_global_desc) /
(std::size_t(1000) * 1000 * 1000) / ave_time; (std::size_t(1000) * 1000 * 1000) / ave_time;
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
......
...@@ -30,64 +30,31 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw( ...@@ -30,64 +30,31 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
{ {
using namespace ck; using namespace ck;
std::cout << "device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw" std::cout << __func__ << std::endl;
<< std::endl;
DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_n_k_ho_wo_device_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{}; constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{}; constexpr auto I3 = Number<3>{};
constexpr auto N = OutLengths{}[I0]; const auto N = out_n_k_ho_wo_lengths[I0];
constexpr auto K = OutLengths{}[I1]; const auto K = out_n_k_ho_wo_lengths[I1];
constexpr auto C = WeiLengths{}[I1]; const auto C = wei_k_c_y_x_lengths[I1];
constexpr auto Hi = InLengths{}[I2];
constexpr auto Wi = InLengths{}[I3];
constexpr auto Ho = OutLengths{}[I2]; const auto Hi = in_n_c_hi_wi_lengths[I2];
constexpr auto Wo = OutLengths{}[I3]; const auto Wi = in_n_c_hi_wi_lengths[I3];
constexpr auto Y = WeiLengths{}[I2]; const auto Ho = out_n_k_ho_wo_lengths[I2];
constexpr auto X = WeiLengths{}[I3]; const auto Wo = out_n_k_ho_wo_lengths[I3];
constexpr auto C0 = C / Number<InWeiVectorSize>{}; const auto Y = wei_k_c_y_x_lengths[I2];
constexpr auto C1 = Number<InWeiVectorSize>{}; const auto X = wei_k_c_y_x_lengths[I3];
constexpr auto K0 = K / Number<InWeiVectorSize>{}; const auto C0 = C / Number<InWeiVectorSize>{};
constexpr auto K1 = Number<InWeiVectorSize>{}; const auto C1 = Number<InWeiVectorSize>{};
#if 0 const auto K0 = K / Number<InWeiVectorSize>{};
// run-time variables const auto K1 = Number<InWeiVectorSize>{};
const auto in_n_c_hi_wi_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(to_multi_index(InDesc::GetLengths()));
const auto wei_k_c_y_x_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(to_multi_index(WeiDesc::GetLengths()));
const auto out_n_k_ho_wo_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(to_multi_index(OutDesc::GetLengths()));
const auto conv_strides = to_multi_index(ConvStrides{});
const auto conv_dilations = to_multi_index(ConvDilations{});
const auto in_left_pads = to_multi_index(InLeftPads{});
const auto in_right_pads = to_multi_index(InRightPads{});
#else
// compile-time variables
constexpr auto in_n_c0_hi_wi_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
make_tuple(Number<N>{}, Number<C0>{}, Number<Hi>{}, Number<Wi>{}));
constexpr auto wei_k_c0_y_x_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
make_tuple(Number<K>{}, Number<C0>{}, Number<Y>{}, Number<X>{}));
constexpr auto out_n_k0_ho_wo_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
make_tuple(Number<N>{}, Number<K0>{}, Number<Ho>{}, Number<Wo>{}));
// constexpr auto conv_strides = sequence_to_tuple_of_number(ConvStrides{});
// constexpr auto conv_dilations = sequence_to_tuple_of_number(ConvDilations{});
// constexpr auto in_left_pads = sequence_to_tuple_of_number(InLeftPads{});
// constexpr auto in_right_pads = sequence_to_tuple_of_number(InRightPads{});
#endif
Tensor<TInWei> in_n_c0_hi_wi_c1( Tensor<TInWei> in_n_c0_hi_wi_c1(
HostTensorDescriptor(std::initializer_list<index_t>{N, C0, Hi, Wi, C1})); HostTensorDescriptor(std::initializer_list<index_t>{N, C0, Hi, Wi, C1}));
...@@ -109,9 +76,23 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw( ...@@ -109,9 +76,23 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
make_ParallelTensorFunctor(f_nchw2nc0hwc1, N, Hi, Wi, C)(); make_ParallelTensorFunctor(f_nchw2nc0hwc1, N, Hi, Wi, C)();
make_ParallelTensorFunctor(f_kcyx2kc0yxc1, K, Y, X, C)(); make_ParallelTensorFunctor(f_kcyx2kc0yxc1, K, Y, X, C)();
in_n_c_hi_wi_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data()); DeviceMem in_n_c0_hi_wi_c1_device_buf(sizeof(TInWei) *
wei_k_c_y_x_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data()); in_n_c0_hi_wi_c1.mDesc.GetElementSpace());
DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) *
out_n_k0_ho_wo_k1.mDesc.GetElementSpace());
in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
const auto in_n_c0_hi_wi_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, C0, Hi, Wi));
const auto wei_k_c0_y_x_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(K, C0, Y, X));
const auto out_n_k0_ho_wo_k1_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K0, Ho, Wo, K1));
#if 1
// cdata = 64, BlockSize = 64, 16x8x32x4 // cdata = 64, BlockSize = 64, 16x8x32x4
constexpr index_t BlockSize = 64; constexpr index_t BlockSize = 64;
...@@ -133,23 +114,45 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw( ...@@ -133,23 +114,45 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
constexpr index_t BThreadTransferSrcScalarPerVector_W = 1; constexpr index_t BThreadTransferSrcScalarPerVector_W = 1;
constexpr index_t CThreadTransferDstScalarPerVector_W = 1; constexpr index_t CThreadTransferDstScalarPerVector_W = 8;
std::cerr << "conv_fp16_nchwc" << C1 << "_n" << N << "c" << C << "h" << Hi << "w" << Wi << "-k" static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, "");
<< K << "c" << C << "y" << Y << "x" << X << "-u" << conv_strides[I0] << "v" #else
<< conv_strides[I1] << "l" << conv_dilations[I0] << "j" << conv_dilations[I1] << "q" constexpr index_t BlockSize = 64;
<< in_left_pads[I0] << "p" << in_right_pads[I0] << std::endl;
constexpr index_t KPerBlock = 16;
constexpr index_t HoPerBlock = 8;
constexpr index_t WoPerBlock = 32;
constexpr index_t EPerBlock = 1;
constexpr index_t KPerThread = 16;
constexpr index_t HoPerThread = 2;
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = EPerBlock;
using ABlockTransferThreadSliceLengths_E_K = Sequence<9, 1>;
using ABlockTransferThreadClusterLengths_E_K = Sequence<EPerBlock, 16>;
constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
constexpr index_t ABlockTransferDstScalarPerVector_K = 1;
constexpr index_t BThreadTransferSrcScalarPerVector_W = 1;
constexpr index_t CThreadTransferDstScalarPerVector_W = K1;
static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, "");
#endif
constexpr auto conv_driver = constexpr auto conv_driver =
#if 0 #if 0
DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
#else #else
DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
#endif #endif
<BlockSize, <BlockSize,
typename vector_type<TInWei, InWeiVectorSize>::type, typename vector_type<TInWei, InWeiVectorSize>::type,
TAcc, TAcc,
typename vector_type<TOut, InWeiVectorSize>::type, TOut,
KPerBlock, KPerBlock,
HoPerBlock, HoPerBlock,
WoPerBlock, WoPerBlock,
...@@ -167,26 +170,23 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw( ...@@ -167,26 +170,23 @@ void device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
conv_driver.Run(wei_k_c0_y_x_desc, conv_driver.Run(wei_k_c0_y_x_desc,
in_n_c0_hi_wi_desc, in_n_c0_hi_wi_desc,
out_n_k0_ho_wo_desc, out_n_k0_ho_wo_k1_desc,
conv_strides, conv_strides,
conv_dilations, conv_dilations,
in_left_pads, in_left_pads,
in_right_pads, in_right_pads,
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>( static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
wei_k_c_y_x_device_buf.GetDeviceBuffer()), wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>( static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
in_n_c_hi_wi_device_buf.GetDeviceBuffer()), in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
static_cast<typename vector_type<TOut, InWeiVectorSize>::type*>( static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()));
out_n_k_ho_wo_device_buf.GetDeviceBuffer()));
out_n_k_ho_wo_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data()); out_n_k0_ho_wo_k1_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());
#if 1
auto f_nk0hwk1_to_nkhw = [&](auto n, auto k, auto ho, auto wo) { auto f_nk0hwk1_to_nkhw = [&](auto n, auto k, auto ho, auto wo) {
out_n_k_ho_wo(n, k, ho, wo) = out_n_k_ho_wo(n, k, ho, wo) =
out_n_k0_ho_wo_k1(n, k / InWeiVectorSize, ho, wo, k % InWeiVectorSize); out_n_k0_ho_wo_k1(n, k / InWeiVectorSize, ho, wo, k % InWeiVectorSize);
}; };
make_ParallelTensorFunctor(f_nk0hwk1_to_nkhw, N, K, Ho, Wo)(); make_ParallelTensorFunctor(f_nk0hwk1_to_nkhw, N, K, Ho, Wo)();
#endif
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment