"...git@developer.sourcefind.cn:OpenDAS/mmdetection3d.git" did not exist on "304f17b3d838e049d0d6c103c6f91e210ffdf9c5"
Commit 1098ced2 authored by Jing Zhang's avatar Jing Zhang
Browse files

initial implementation of add fusion

parent 316fcc3f
......@@ -45,6 +45,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
const InRightPads& in_right_pads,
const FloatAB* __restrict__ p_wei_global,
const FloatAB* __restrict__ p_in_global,
const FloatC* __restrict__ p_d_global,
FloatC* __restrict__ p_out_global) const
{
constexpr auto I0 = Number<0>{};
......@@ -250,6 +251,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_hop_wop_global_desc),
const FloatC*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, true>>;
......@@ -264,6 +266,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
in_e_n_ho_wo_global_desc,
p_in_global,
out_k_n_hop_wop_global_desc,
p_d_global,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, true>{});
......@@ -277,6 +280,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_hop_wop_global_desc),
const FloatC*,
FloatC*,
integral_constant<bool, true>,
integral_constant<bool, false>>;
......@@ -291,6 +295,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
in_e_n_ho_wo_global_desc,
p_in_global,
out_k_n_hop_wop_global_desc,
p_d_global,
p_out_global,
integral_constant<bool, true>{},
integral_constant<bool, false>{});
......@@ -304,6 +309,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_hop_wop_global_desc),
const FloatC*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, true>>;
......@@ -318,6 +324,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
in_e_n_ho_wo_global_desc,
p_in_global,
out_k_n_hop_wop_global_desc,
p_d_global,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, true>{});
......@@ -331,6 +338,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
decltype(in_e_n_ho_wo_global_desc),
const FloatAB*,
decltype(out_k_n_hop_wop_global_desc),
const FloatC*,
FloatC*,
integral_constant<bool, false>,
integral_constant<bool, false>>;
......@@ -345,6 +353,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
in_e_n_ho_wo_global_desc,
p_in_global,
out_k_n_hop_wop_global_desc,
p_d_global,
p_out_global,
integral_constant<bool, false>{},
integral_constant<bool, false>{});
......
......@@ -74,6 +74,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
const BGlobalDesc& b_e_n_ho_wo_global_desc,
const FloatAB* __restrict__ p_b_global,
const CGlobalDesc& c_k_n_ho_wo_global_desc,
const FloatC* __restrict__ p_d_global,
FloatC* __restrict__ p_c_global,
FloatAB* __restrict__ p_shared_block,
integral_constant<bool, HasMainKBlockLoop>,
......@@ -352,6 +353,48 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
}
#endif
FloatC p_d_thread[c_k_n_ho_wo_thread_desc.GetElementSpaceSize()];
threadwise_matrix_set_zero_v3(c_k_n_ho_wo_thread_desc, p_d_thread);
#if 1
{
constexpr auto c_k_n_ho_wo_global_tensor_iterator_hacks = CGlobalIteratorHacks{};
const index_t k_thread_data_on_global =
k_block_data_on_global + k_thread_id * KPerThread;
ThreadwiseDynamicTensorSliceTransfer_v2<
FloatC,
FloatC,
decltype(c_k_n_ho_wo_global_desc),
decltype(c_k_n_ho_wo_thread_desc),
Sequence<KPerThread, 1, HoPerThread, WoPerThread>,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AddressSpace::Global,
AddressSpace::Vgpr,
InMemoryDataOperation::Set,
1,
true>(
c_k_n_ho_wo_global_desc,
make_multi_index(
k_thread_data_on_global, 0, ho_thread_data_on_global, wo_thread_data_on_global))
.Run(c_k_n_ho_wo_global_desc,
p_d_global,
c_k_n_ho_wo_thread_desc,
make_tuple(I0, I0, I0, I0),
p_d_thread,
c_k_n_ho_wo_global_tensor_iterator_hacks);
}
#endif
for(index_t i = 0; i < c_k_n_ho_wo_thread_desc.GetElementSpaceSize(); i++)
{
p_d_thread[i] += p_c_thread[i];
}
#if 1
// output: register to global memory
{
......@@ -362,7 +405,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
k_block_data_on_global + k_thread_id * KPerThread;
ThreadwiseDynamicTensorSliceTransfer_v1r3<
FloatAcc,
FloatC,
FloatC,
decltype(c_k_n_ho_wo_thread_desc),
decltype(c_k_n_ho_wo_global_desc),
......@@ -380,7 +423,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
k_thread_data_on_global, 0, ho_thread_data_on_global, wo_thread_data_on_global))
.Run(c_k_n_ho_wo_thread_desc,
make_tuple(I0, I0, I0, I0),
p_c_thread,
p_d_thread,
c_k_n_ho_wo_global_desc,
p_c_global,
c_k_n_ho_wo_global_tensor_iterator_hacks);
......@@ -395,6 +438,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
const BGlobalDesc& b_e_n_ho_wo_global_desc,
const FloatAB* __restrict__ p_b_global,
const CGlobalDesc& c_k_n_ho_wo_global_desc,
const FloatC* __restrict__ p_d_global,
FloatC* __restrict__ p_c_global,
integral_constant<bool, HasMainKBlockLoop>,
integral_constant<bool, HasDoubleTailKBlockLoop>) const
......@@ -408,6 +452,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
b_e_n_ho_wo_global_desc,
p_b_global,
c_k_n_ho_wo_global_desc,
p_d_global,
p_c_global,
p_shared_block,
integral_constant<bool, HasMainKBlockLoop>{},
......@@ -421,6 +466,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
const BGlobalDesc* p_b_e_n_ho_wo_global_desc,
const FloatAB* __restrict__ p_b_global,
const CGlobalDesc* p_c_k_n_ho_wo_global_desc,
const FloatC* __restrict__ p_d_global,
FloatC* __restrict__ p_c_global,
integral_constant<bool, HasMainKBlockLoop>,
integral_constant<bool, HasDoubleTailKBlockLoop>) const
......@@ -434,6 +480,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
b_e_n_ho_wo_global_desc,
p_b_global,
c_k_n_ho_wo_global_desc,
p_d_global,
p_c_global,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
......@@ -446,6 +493,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
const void* p_b_e_n_ho_wo_global_desc,
const FloatAB* __restrict__ p_b_global,
const void* p_c_k_n_ho_wo_global_desc,
const FloatC* __restrict__ p_d_global,
FloatC* __restrict__ p_c_global,
integral_constant<bool, HasMainKBlockLoop>,
integral_constant<bool, HasDoubleTailKBlockLoop>) const
......@@ -461,6 +509,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
b_e_n_ho_wo_global_desc,
p_b_global,
c_k_n_ho_wo_global_desc,
p_d_global,
p_c_global,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
......
......@@ -21,6 +21,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
WeiDesc,
const Tensor<TInWei>& wei_k_c_y_x,
OutDesc,
Tensor<TOut>& add_n_k_ho_wo,
Tensor<TOut>& out_n_k_ho_wo,
ConvStrides,
ConvDilations,
......@@ -35,6 +36,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
DeviceMem in_n_c_hi_wi_device_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_k_c_y_x_device_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem add_n_k_ho_wo_device_buf(sizeof(TOut) * add_n_k_ho_wo.mDesc.GetElementSpace());
DeviceMem out_n_k_ho_wo_device_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
constexpr auto I0 = Number<0>{};
......@@ -93,6 +95,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
make_native_tensor_descriptor_packed(Sequence<N, C0, Hi, Wi, C1>{})));
Tensor<TInWei> wei_k_c0_y_x_c1(make_HostTensorDescriptor(
make_native_tensor_descriptor_packed(Sequence<K, C0, Y, X, C1>{})));
Tensor<TOut> add_n_k0_ho_wo_k1(make_HostTensorDescriptor(
make_native_tensor_descriptor_packed(Sequence<N, K0, Ho, Wo, K1>{})));
Tensor<TOut> out_n_k0_ho_wo_k1(make_HostTensorDescriptor(
make_native_tensor_descriptor_packed(Sequence<N, K0, Ho, Wo, K1>{})));
......@@ -106,11 +110,18 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
wei_k_c_y_x(k, c, y, x);
};
auto f_nkhw_to_nk0hwk1 = [&](auto n, auto k, auto ho, auto wo) {
add_n_k0_ho_wo_k1(n, k / InWeiVectorSize, ho, wo, k % InWeiVectorSize) =
add_n_k_ho_wo(n, k, ho, wo);
};
make_ParallelTensorFunctor(f_nchw2nc0hwc1, N, Hi, Wi, C)();
make_ParallelTensorFunctor(f_kcyx2kc0yxc1, K, Y, X, C)();
make_ParallelTensorFunctor(f_nkhw_to_nk0hwk1, N, K, Ho, Wo)();
in_n_c_hi_wi_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
wei_k_c_y_x_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
add_n_k_ho_wo_device_buf.ToDevice(add_n_k0_ho_wo_k1.mData.data());
#if 1
// cdata = 64, BlockSize = 64, 16x8x32x4
......@@ -126,15 +137,15 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = EPerBlock;
using ABlockTransferThreadSliceLengths_E_K = Sequence<3, 1>;
using ABlockTransferThreadClusterLengths_E_K = Sequence<3 * EPerBlock, KPerBlock>;
using ABlockTransferThreadSliceLengths_E_K = Sequence<9, 1>;
using ABlockTransferThreadClusterLengths_E_K = Sequence<EPerBlock, KPerBlock>;
constexpr index_t ABlockTransferSrcScalarPerVector_E = 1;
constexpr index_t ABlockTransferDstScalarPerVector_K = 1;
constexpr index_t BThreadTransferSrcScalarPerVector_W = 1;
constexpr index_t CThreadTransferDstScalarPerVector_W = K1;
constexpr index_t CThreadTransferDstScalarPerVector_W = 1;
static_assert(KPerThread % CThreadTransferDstScalarPerVector_W == 0, "");
#else
......@@ -196,6 +207,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
wei_k_c_y_x_device_buf.GetDeviceBuffer()),
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
in_n_c_hi_wi_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(add_n_k_ho_wo_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_n_k_ho_wo_device_buf.GetDeviceBuffer()));
out_n_k_ho_wo_device_buf.FromDevice(out_n_k0_ho_wo_k1.mData.data());
......
......@@ -40,7 +40,7 @@ void host_direct_convolution(const Tensor<TIn>& in_nchw,
}
}
}
out_nkhw(n, k, ho, wo) = v;
out_nkhw(n, k, ho, wo) += v;
};
auto f_par = make_ParallelTensorFunctor(f,
......
......@@ -64,7 +64,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 1
#elif 0
constexpr index_t N = 1;
constexpr index_t C = 16;
constexpr index_t HI = 1080;
......@@ -76,13 +76,13 @@ int main(int argc, char* argv[])
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>;
#elif 0
using LeftPads = Sequence<1, 1>;
using RightPads = Sequence<1, 1>;
#elif 1
constexpr index_t N = 1;
constexpr index_t C = 1;
constexpr index_t HI = 1024;
constexpr index_t WI = 2048;
constexpr index_t C = 4;
constexpr index_t HI = 64;
constexpr index_t WI = 64;
constexpr index_t K = 4;
constexpr index_t Y = 3;
constexpr index_t X = 3;
......@@ -630,11 +630,16 @@ int main(int argc, char* argv[])
print_array("ConvStrides", to_multi_index(ConvStrides{}));
print_array("ConvDilations", to_multi_index(ConvDilations{}));
#if 0
#if 1
using in_data_t = float;
constexpr index_t in_vector_size = 1;
using acc_data_t = float;
using out_data_t = float;
#elif 0
using in_data_t = half_t;
constexpr index_t in_vector_size = 16;
using acc_data_t = float;
using out_data_t = half_t;
#elif 0
using in_data_t = float;
constexpr index_t in_vector_size = 1;
......@@ -650,6 +655,8 @@ int main(int argc, char* argv[])
Tensor<in_data_t> in_nchw(make_HostTensorDescriptor(in_nchw_desc));
Tensor<in_data_t> wei_kcyx(make_HostTensorDescriptor(wei_kcyx_desc));
Tensor<out_data_t> out_nkhw_host(make_HostTensorDescriptor(out_nkhw_desc));
Tensor<out_data_t> add_nkhw_device(make_HostTensorDescriptor(out_nkhw_desc));
Tensor<out_data_t> out_nkhw_device(make_HostTensorDescriptor(out_nkhw_desc));
std::size_t num_thread = std::thread::hardware_concurrency();
......@@ -686,6 +693,9 @@ int main(int argc, char* argv[])
};
wei_kcyx.GenerateTensorValue(gen_wei, num_thread);
#endif
out_nkhw_host.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
add_nkhw_device.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
}
#if 0
......@@ -768,6 +778,7 @@ int main(int argc, char* argv[])
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
add_nkhw_device,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
......@@ -788,6 +799,7 @@ int main(int argc, char* argv[])
check_error(out_nkhw_host, out_nkhw_device);
#if 1
if(do_log)
{
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
......@@ -795,5 +807,6 @@ int main(int argc, char* argv[])
LogRange(std::cout << "out_nkhw_host : ", out_nkhw_host.mData, ",") << std::endl;
LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl;
}
#endif
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment