"...git@developer.sourcefind.cn:OpenDAS/mmdetection3d.git" did not exist on "78ee07eabd876fd6702443859c46d7f9a67dc351"
Commit 4e5e68a1 authored by Jing Zhang's avatar Jing Zhang
Browse files

add bias

parents 157ce4cc af84fba3
...@@ -17,7 +17,7 @@ template <typename GridwiseGemm, ...@@ -17,7 +17,7 @@ template <typename GridwiseGemm,
typename FloatC, typename FloatC,
typename AGridDesc_E0_E1_K0_K1_E2, typename AGridDesc_E0_E1_K0_K1_E2,
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2, typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CGridDesc_K_N_H0_H1_H2_W0_W1_W2, typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W, typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop> bool HasMainE0BlockLoop>
__global__ void __global__ void
...@@ -27,10 +27,11 @@ __global__ void ...@@ -27,10 +27,11 @@ __global__ void
kernel_gemm_dlops_v2( kernel_gemm_dlops_v2(
const FloatAB* __restrict__ p_a_grid, const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid, const FloatAB* __restrict__ p_b_grid,
const FloatC* __restrict__ p_bias_grid,
FloatC* __restrict__ p_c_grid, FloatC* __restrict__ p_c_grid,
const AGridDesc_E0_E1_K0_K1_E2 a_e0_e1_k0_k1_e2_grid_desc, const AGridDesc_E0_E1_K0_K1_E2 a_e0_e1_k0_k1_e2_grid_desc,
const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
const CGridDesc_K_N_H0_H1_H2_W0_W1_W2 c_k_n_h0_h1_h2_w0_w1_w2_grid_desc, const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
const CBlockIdToBlockClusterAdaptor_K_N_H_W c_blockid_to_k_n_h_w_block_cluster_adaptor) const CBlockIdToBlockClusterAdaptor_K_N_H_W c_blockid_to_k_n_h_w_block_cluster_adaptor)
{ {
constexpr index_t shared_block_size = constexpr index_t shared_block_size =
...@@ -40,11 +41,12 @@ __global__ void ...@@ -40,11 +41,12 @@ __global__ void
GridwiseGemm::Run(p_a_grid, GridwiseGemm::Run(p_a_grid,
p_b_grid, p_b_grid,
p_bias_grid,
p_c_grid, p_c_grid,
p_shared_block, p_shared_block,
a_e0_e1_k0_k1_e2_grid_desc, a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k_n_h0_h1_h2_w0_w1_w2_grid_desc, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor, c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{}); integral_constant<bool, HasMainE0BlockLoop>{});
} }
...@@ -57,7 +59,7 @@ template <typename GridwiseGemm, ...@@ -57,7 +59,7 @@ template <typename GridwiseGemm,
typename FloatC, typename FloatC,
typename AGridDesc_E0_E1_K0_K1_E2, typename AGridDesc_E0_E1_K0_K1_E2,
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2, typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CGridDesc_K_N_H0_H1_H2_W0_W1_W2, typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W, typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop> bool HasMainE0BlockLoop>
__global__ void __global__ void
...@@ -66,10 +68,11 @@ __global__ void ...@@ -66,10 +68,11 @@ __global__ void
#endif #endif
kernel_gemm_dlops_v2(const FloatAB* __restrict__ p_a_grid, kernel_gemm_dlops_v2(const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid, const FloatAB* __restrict__ p_b_grid,
const FloatC* __restrict__ p_bias_grid,
FloatC* __restrict__ p_c_grid, FloatC* __restrict__ p_c_grid,
const void CONSTANT* p_a_e0_e1_k0_k1_e2_grid_desc, const void CONSTANT* p_a_e0_e1_k0_k1_e2_grid_desc,
const void CONSTANT* p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, const void CONSTANT* p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
const void CONSTANT* p_c_k_n_h0_h1_h2_w0_w1_w2_grid_desc, const void CONSTANT* p_c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
const void CONSTANT* p_c_blockid_to_k_n_h_w_block_cluster_adaptor) const void CONSTANT* p_c_blockid_to_k_n_h_w_block_cluster_adaptor)
{ {
// first cast void CONSTANT void* to void* // first cast void CONSTANT void* to void*
...@@ -80,9 +83,9 @@ __global__ void ...@@ -80,9 +83,9 @@ __global__ void
const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc = const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
*reinterpret_cast<const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2*>( *reinterpret_cast<const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2*>(
cast_pointer_to_generic_address_space(p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc)); cast_pointer_to_generic_address_space(p_b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc));
const auto c_k_n_h0_h1_h2_w0_w1_w2_grid_desc = const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc =
*reinterpret_cast<const CGridDesc_K_N_H0_H1_H2_W0_W1_W2*>( *reinterpret_cast<const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2*>(
cast_pointer_to_generic_address_space(p_c_k_n_h0_h1_h2_w0_w1_w2_grid_desc)); cast_pointer_to_generic_address_space(p_c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc));
const auto c_blockid_to_k_n_h_w_block_cluster_adaptor = const auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
*reinterpret_cast<const CBlockIdToBlockClusterAdaptor_K_N_H_W*>( *reinterpret_cast<const CBlockIdToBlockClusterAdaptor_K_N_H_W*>(
cast_pointer_to_generic_address_space(p_c_blockid_to_k_n_h_w_block_cluster_adaptor)); cast_pointer_to_generic_address_space(p_c_blockid_to_k_n_h_w_block_cluster_adaptor));
...@@ -94,11 +97,12 @@ __global__ void ...@@ -94,11 +97,12 @@ __global__ void
GridwiseGemm::Run(p_a_grid, GridwiseGemm::Run(p_a_grid,
p_b_grid, p_b_grid,
p_bias_grid,
p_c_grid, p_c_grid,
p_shared_block, p_shared_block,
a_e0_e1_k0_k1_e2_grid_desc, a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k_n_h0_h1_h2_w0_w1_w2_grid_desc, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor, c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{}); integral_constant<bool, HasMainE0BlockLoop>{});
} }
...@@ -143,7 +147,7 @@ template <index_t BlockSize, ...@@ -143,7 +147,7 @@ template <index_t BlockSize,
typename CGlobalStepHacks, typename CGlobalStepHacks,
typename AGlobalMoveSliceWindowStepHacks, typename AGlobalMoveSliceWindowStepHacks,
typename BGlobalMoveSliceWindowStepHacks, typename BGlobalMoveSliceWindowStepHacks,
index_t activ_type = 0> ActivTypeEnum_t activ_type = ActivTypeEnum_t::None>
struct GridwiseGemmDlops_km_kn_mn_v3 struct GridwiseGemmDlops_km_kn_mn_v3
{ {
static constexpr auto I0 = Number<0>{}; static constexpr auto I0 = Number<0>{};
...@@ -159,6 +163,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -159,6 +163,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
static constexpr auto NPerBlock = I1; static constexpr auto NPerBlock = I1;
static constexpr FloatC alpha = 0.3;
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{ {
constexpr auto max_lds_align = Number<ABlockTransferDstScalarPerVector_E2>{}; constexpr auto max_lds_align = Number<ABlockTransferDstScalarPerVector_E2>{};
...@@ -338,10 +344,20 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -338,10 +344,20 @@ struct GridwiseGemmDlops_km_kn_mn_v3
using CBlockIdToBlockClusterAdaptor_K_N_H_W = using CBlockIdToBlockClusterAdaptor_K_N_H_W =
decltype(MakeCBlockIdToKNHoWoBlockClusterAdaptor(CGridDesc_K_N_Ho_Wo{})); decltype(MakeCBlockIdToKNHoWoBlockClusterAdaptor(CGridDesc_K_N_Ho_Wo{}));
__host__ __device__ static constexpr auto MakeBiasK0K1GridDescriptor(
const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc)
{
const auto K0 = c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetLength(I0);
const auto K1 = c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetLength(I1);
return make_naive_tensor_descriptor_packed(make_tuple(K0, K1));
}
template <bool HasMainE0BlockLoop> template <bool HasMainE0BlockLoop>
__device__ static void __device__ static void
Run(const FloatAB* __restrict__ p_a_global, Run(const FloatAB* __restrict__ p_a_global,
const FloatAB* __restrict__ p_b_global, const FloatAB* __restrict__ p_b_global,
const FloatC* __restrict__ p_bias_global,
FloatC* __restrict__ p_c_global, FloatC* __restrict__ p_c_global,
FloatAB* __restrict__ p_shared_block, FloatAB* __restrict__ p_shared_block,
const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc, const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc,
...@@ -350,10 +366,16 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -350,10 +366,16 @@ struct GridwiseGemmDlops_km_kn_mn_v3
const CBlockIdToBlockClusterAdaptor_K_N_H_W& c_blockid_to_k_n_h_w_block_cluster_adaptor, const CBlockIdToBlockClusterAdaptor_K_N_H_W& c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>) integral_constant<bool, HasMainE0BlockLoop>)
{ {
const auto bias_k0_k1_grid_desc =
MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>( const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize()); p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize());
const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>( const auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize()); p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize());
auto bias_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize());
auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>( auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_c_global, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize()); p_c_global, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize());
...@@ -729,7 +751,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -729,7 +751,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
static_for<0, c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(), 1>{}([&](auto i) { static_for<0, c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(), 1>{}([&](auto i) {
if constexpr(activ_type == 1) if constexpr(activ_type == 1)
{ {
c_thread_buf(i) = c_thread_buf[i] >= 0 ? c_thread_buf[i] : 0.0; c_thread_buf(i) =
c_thread_buf[i] >= 0 ? c_thread_buf[i] : alpha * c_thread_buf[i];
} }
else if constexpr(activ_type == 2) else if constexpr(activ_type == 2)
{ {
...@@ -745,6 +768,55 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -745,6 +768,55 @@ struct GridwiseGemmDlops_km_kn_mn_v3
}); });
} }
// bias
{
constexpr auto bias_k0_k1_thread_desc =
make_naive_tensor_descriptor_packed(make_tuple(I1, Number<KPerThread>{}));
StaticBuffer<AddressSpaceEnum_t::Vgpr,
FloatC,
bias_k0_k1_thread_desc.GetElementSpaceSize(),
true>
bias_thread_buf;
const index_t k_thread_data_on_global = k_thread_id * KPerThread;
auto bias_threadwise_transfer =
ThreadwiseTensorSliceTransfer_v2<FloatC,
FloatC,
decltype(bias_k0_k1_grid_desc),
decltype(bias_k0_k1_thread_desc),
Sequence<I1, Number<KPerThread>{}>,
Sequence<0, 1>,
1,
CThreadTransferDstScalarPerVector,
false,
true>(
bias_k0_k1_grid_desc,
make_multi_index(k_block_work_id, k_thread_data_on_global));
constexpr auto bias_k0_k1_global_tensor_step_hacks = make_tuple(
make_tuple(Sequence<0>{}, Sequence<0>{}), make_tuple(Sequence<0>{}, Sequence<0>{}));
bias_threadwise_transfer.Run(bias_k0_k1_grid_desc,
bias_global_buf,
bias_k0_k1_thread_desc,
make_tuple(I0, I0),
bias_thread_buf,
bias_k0_k1_global_tensor_step_hacks);
static_for<0, KPerThread, 1>{}([&](auto ki) {
static_for<0, HoPerThread, 1>{}([&](auto hi) {
static_for<0, WoPerThread, 1>{}([&](auto wi) {
constexpr index_t c_offset = c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset(
make_tuple(ki, 0, hi, wi));
c_thread_buf(Number<c_offset>{}) =
c_thread_buf[Number<c_offset>{}] + bias_thread_buf[ki];
});
});
});
}
// output: register to global memory // output: register to global memory
{ {
// hack to control index calculation when iterating over c_k_n_h0_h1_h2_w0_w1_w2_global // hack to control index calculation when iterating over c_k_n_h0_h1_h2_w0_w1_w2_global
......
...@@ -127,6 +127,13 @@ enum InMemoryDataOperationEnum_t ...@@ -127,6 +127,13 @@ enum InMemoryDataOperationEnum_t
AtomicAdd AtomicAdd
}; };
enum ActivTypeEnum_t
{
None = 0,
LeakyRelu,
Sigmoid
};
// index type // index type
using index_t = int32_t; using index_t = int32_t;
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
template <typename TInWei, template <typename TInWei,
typename TAcc, typename TAcc,
typename TOut, typename TOut,
ck::index_t activ_type, ck::ActivTypeEnum_t activ_type,
typename InLengths, typename InLengths,
typename WeiLengths, typename WeiLengths,
typename OutLengths, typename OutLengths,
...@@ -24,6 +24,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1 ...@@ -24,6 +24,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1
const InRightPads& in_right_pads, const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c0_hi_wi_c1, const Tensor<TInWei>& in_n_c0_hi_wi_c1,
const Tensor<TInWei>& wei_k_c0_y_x_c1, const Tensor<TInWei>& wei_k_c0_y_x_c1,
const Tensor<TOut>& bias_k0_k1,
Tensor<TOut>& out_n_k0_ho_wo_k1, Tensor<TOut>& out_n_k0_ho_wo_k1,
ck::index_t nrepeat) ck::index_t nrepeat)
{ {
...@@ -57,9 +58,11 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1 ...@@ -57,9 +58,11 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1
DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace()); DeviceMem wei_k_c0_y_x_c1_device_buf(sizeof(TInWei) * wei_k_c0_y_x_c1.mDesc.GetElementSpace());
DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) * DeviceMem out_n_k0_ho_wo_k1_device_buf(sizeof(TOut) *
out_n_k0_ho_wo_k1.mDesc.GetElementSpace()); out_n_k0_ho_wo_k1.mDesc.GetElementSpace());
DeviceMem bias_k0_k1_device_buf(sizeof(TOut) * bias_k0_k1.mDesc.GetElementSpace());
in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data()); in_n_c0_hi_wi_c1_device_buf.ToDevice(in_n_c0_hi_wi_c1.mData.data());
wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data()); wei_k_c0_y_x_c1_device_buf.ToDevice(wei_k_c0_y_x_c1.mData.data());
bias_k0_k1_device_buf.ToDevice(bias_k0_k1.mData.data());
constexpr index_t InWeiVectorSize = 8; constexpr index_t InWeiVectorSize = 8;
...@@ -173,6 +176,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1 ...@@ -173,6 +176,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1
wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()), wei_k_c0_y_x_c1_device_buf.GetDeviceBuffer()),
static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>( static_cast<typename vector_type<TInWei, InWeiVectorSize>::type*>(
in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()), in_n_c0_hi_wi_c1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(bias_k0_k1_device_buf.GetDeviceBuffer()),
static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()), static_cast<TOut*>(out_n_k0_ho_wo_k1_device_buf.GetDeviceBuffer()),
nrepeat); nrepeat);
......
...@@ -27,7 +27,7 @@ template <ck::index_t BlockSize, ...@@ -27,7 +27,7 @@ template <ck::index_t BlockSize,
ck::index_t ABlockTransferDstScalarPerVector_E2, ck::index_t ABlockTransferDstScalarPerVector_E2,
ck::index_t BThreadTransferSrcScalarPerVector_E2, ck::index_t BThreadTransferSrcScalarPerVector_E2,
ck::index_t CThreadTransferDstScalarPerVector_K, ck::index_t CThreadTransferDstScalarPerVector_K,
ck::index_t activ_type> ck::ActivTypeEnum_t activ_type>
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_outpad struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_outpad
{ {
template <typename... Wei, template <typename... Wei,
...@@ -46,6 +46,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -46,6 +46,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
const InRightPads& in_right_pads, const InRightPads& in_right_pads,
const FloatAB* __restrict__ p_a_grid, const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid, const FloatAB* __restrict__ p_b_grid,
const FloatC* __restrict__ p_bias_grid,
FloatC* __restrict__ p_c_grid, FloatC* __restrict__ p_c_grid,
const int nrepeat) const const int nrepeat) const
{ {
...@@ -61,7 +62,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -61,7 +62,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
const auto C0 = in_n_c0_hi_wi_c1_global_desc.GetLength(I1); const auto C0 = in_n_c0_hi_wi_c1_global_desc.GetLength(I1);
const auto Hi = in_n_c0_hi_wi_c1_global_desc.GetLength(I2); const auto Hi = in_n_c0_hi_wi_c1_global_desc.GetLength(I2);
const auto Wi = in_n_c0_hi_wi_c1_global_desc.GetLength(I3); const auto Wi = in_n_c0_hi_wi_c1_global_desc.GetLength(I3);
//const auto C1 = in_n_c0_hi_wi_c1_global_desc.GetLength(I4); // const auto C1 = in_n_c0_hi_wi_c1_global_desc.GetLength(I4);
const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1); const auto K0 = out_n_k0_ho_wo_k1_global_desc.GetLength(I1);
const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2); const auto Ho = out_n_k0_ho_wo_k1_global_desc.GetLength(I2);
...@@ -348,6 +349,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -348,6 +349,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
0, 0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_bias_grid,
p_c_grid, p_c_grid,
a_e0_e1_k0_k1_e2_grid_desc, a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
...@@ -373,6 +375,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -373,6 +375,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
0, 0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_bias_grid,
p_c_grid, p_c_grid,
a_e0_e1_k0_k1_e2_grid_desc, a_e0_e1_k0_k1_e2_grid_desc,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
...@@ -418,6 +421,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -418,6 +421,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
0, 0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_bias_grid,
p_c_grid, p_c_grid,
cast_pointer_to_constant_address_space( cast_pointer_to_constant_address_space(
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()), a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
...@@ -449,6 +453,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -449,6 +453,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
0, 0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_bias_grid,
p_c_grid, p_c_grid,
cast_pointer_to_constant_address_space( cast_pointer_to_constant_address_space(
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()), a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
......
...@@ -92,19 +92,17 @@ int main(int argc, char* argv[]) ...@@ -92,19 +92,17 @@ int main(int argc, char* argv[])
const bool do_log = std::stoi(argv[4]); const bool do_log = std::stoi(argv[4]);
const int nrepeat = std::stoi(argv[5]); const int nrepeat = std::stoi(argv[5]);
constexpr index_t activ_type = 0;
#if 0 #if 0
constexpr auto N = Number<1>{}; constexpr auto N = Number<1>{};
constexpr auto Hi = Number<1080>{}; constexpr auto Hi = Number<1080>{};
constexpr auto Wi = Number<1920>{}; constexpr auto Wi = Number<1920>{};
constexpr auto Y = Number<3>{}; constexpr auto Y = Number<3>{};
constexpr auto X = Number<3>{}; constexpr auto X = Number<3>{};
constexpr auto C0 = Number<2>{}; constexpr auto C0 = Number<2>{};
constexpr auto C1 = Number<8>{}; constexpr auto C1 = Number<8>{};
constexpr auto K1 = Number<8>{}; constexpr auto K1 = Number<8>{};
constexpr auto K0 = Number<8>{}; constexpr auto K0 = Number<8>{};
#elif 1 #elif 0
constexpr auto N = Number<1>{}; constexpr auto N = Number<1>{};
constexpr auto Hi = Number<540>{}; constexpr auto Hi = Number<540>{};
constexpr auto Wi = Number<960>{}; constexpr auto Wi = Number<960>{};
...@@ -114,7 +112,7 @@ int main(int argc, char* argv[]) ...@@ -114,7 +112,7 @@ int main(int argc, char* argv[])
constexpr auto C1 = Number<8>{}; constexpr auto C1 = Number<8>{};
constexpr auto K1 = Number<8>{}; constexpr auto K1 = Number<8>{};
constexpr auto K0 = Number<8>{}; constexpr auto K0 = Number<8>{};
#elif 0 #elif 1
constexpr auto N = Number<1>{}; constexpr auto N = Number<1>{};
constexpr auto Hi = Number<270>{}; constexpr auto Hi = Number<270>{};
constexpr auto Wi = Number<480>{}; constexpr auto Wi = Number<480>{};
...@@ -166,7 +164,8 @@ int main(int argc, char* argv[]) ...@@ -166,7 +164,8 @@ int main(int argc, char* argv[])
using out_data_t = int8_t; using out_data_t = int8_t;
#endif #endif
std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5); std::vector<std::size_t> in_lengths_host(5), wei_lengths_host(5), out_lengths_host(5),
bias_lengths_host(2);
in_lengths_host[0] = static_cast<std::size_t>(N); in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(C0); in_lengths_host[1] = static_cast<std::size_t>(C0);
...@@ -186,13 +185,18 @@ int main(int argc, char* argv[]) ...@@ -186,13 +185,18 @@ int main(int argc, char* argv[])
out_lengths_host[3] = static_cast<std::size_t>(Wo); out_lengths_host[3] = static_cast<std::size_t>(Wo);
out_lengths_host[4] = static_cast<std::size_t>(K1); out_lengths_host[4] = static_cast<std::size_t>(K1);
bias_lengths_host[0] = static_cast<std::size_t>(K0);
bias_lengths_host[1] = static_cast<std::size_t>(K1);
Tensor<in_data_t> in(in_lengths_host); Tensor<in_data_t> in(in_lengths_host);
Tensor<in_data_t> wei(wei_lengths_host); Tensor<in_data_t> wei(wei_lengths_host);
Tensor<out_data_t> bias(bias_lengths_host);
Tensor<out_data_t> out_host(out_lengths_host); Tensor<out_data_t> out_host(out_lengths_host);
Tensor<out_data_t> out_device(out_lengths_host); Tensor<out_data_t> out_device(out_lengths_host);
ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: "); ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: "); ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
ostream_HostTensorDescriptor(bias.mDesc, std::cout << "bias: ");
ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: "); ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w)); print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
...@@ -210,22 +214,27 @@ int main(int argc, char* argv[]) ...@@ -210,22 +214,27 @@ int main(int argc, char* argv[])
case 1: case 1:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
break; break;
case 2: case 2:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread); in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
break; break;
case 3: case 3:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread); wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
bias.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
break; break;
case 4: case 4:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
break; break;
case 5: case 5:
in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread); in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread); wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
bias.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
break; break;
default: default:
in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread); in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
...@@ -254,6 +263,8 @@ int main(int argc, char* argv[]) ...@@ -254,6 +263,8 @@ int main(int argc, char* argv[])
in_right_pads_dev); in_right_pads_dev);
}; };
constexpr ck::ActivTypeEnum_t activ_type = ActivTypeEnum_t::None;
#if USE_CONV_FWD_V5R1_NCHWC #if USE_CONV_FWD_V5R1_NCHWC
if(algo == ConvForwardAlgo::V5R1NCHWC) if(algo == ConvForwardAlgo::V5R1NCHWC)
{ {
...@@ -272,6 +283,7 @@ int main(int argc, char* argv[]) ...@@ -272,6 +283,7 @@ int main(int argc, char* argv[])
tmp[I6], tmp[I6],
in, in,
wei, wei,
bias,
out_device, out_device,
nrepeat); nrepeat);
} }
...@@ -281,6 +293,7 @@ int main(int argc, char* argv[]) ...@@ -281,6 +293,7 @@ int main(int argc, char* argv[])
{ {
host_direct_convolution_nchwc(in, host_direct_convolution_nchwc(in,
wei, wei,
bias,
out_host, out_host,
make_tuple(conv_stride_h, conv_stride_w), make_tuple(conv_stride_h, conv_stride_w),
make_tuple(conv_dilation_h, conv_dilation_w), make_tuple(conv_dilation_h, conv_dilation_w),
...@@ -294,6 +307,7 @@ int main(int argc, char* argv[]) ...@@ -294,6 +307,7 @@ int main(int argc, char* argv[])
{ {
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "bias: ", bias.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
} }
......
...@@ -110,6 +110,7 @@ template <typename TIn, ...@@ -110,6 +110,7 @@ template <typename TIn,
typename InRightPads> typename InRightPads>
void host_direct_convolution_nchwc(const Tensor<TIn>& in, void host_direct_convolution_nchwc(const Tensor<TIn>& in,
const Tensor<TWei>& wei, const Tensor<TWei>& wei,
const Tensor<TOut>& bias,
Tensor<TOut>& out, Tensor<TOut>& out,
const ConvStrides& conv_strides, const ConvStrides& conv_strides,
const ConvDilations& conv_dilations, const ConvDilations& conv_dilations,
...@@ -123,7 +124,8 @@ void host_direct_convolution_nchwc(const Tensor<TIn>& in, ...@@ -123,7 +124,8 @@ void host_direct_convolution_nchwc(const Tensor<TIn>& in,
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) { auto f_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) {
double v = 0; double v = 0;
const int k = k0 * out.mDesc.GetLengths()[4] + k1;
for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0) for(int c0 = 0; c0 < wei.mDesc.GetLengths()[1]; ++c0)
{ {
for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1) for(int c1 = 0; c1 < wei.mDesc.GetLengths()[4]; ++c1)
...@@ -138,14 +140,13 @@ void host_direct_convolution_nchwc(const Tensor<TIn>& in, ...@@ -138,14 +140,13 @@ void host_direct_convolution_nchwc(const Tensor<TIn>& in,
wi < in.mDesc.GetLengths()[3]) wi < in.mDesc.GetLengths()[3])
{ {
v += static_cast<const double>(in(n, c0, hi, wi, c1)) * v += static_cast<const double>(in(n, c0, hi, wi, c1)) *
static_cast<const double>( static_cast<const double>(wei(k, c0, y, x, c1));
wei(k0 * out.mDesc.GetLengths()[4] + k1, c0, y, x, c1));
} }
} }
} }
} }
} }
out(n, k0, ho, wo, k1) = activ(v, activ_type); out(n, k0, ho, wo, k1) = activ(v, activ_type) + bias(k0, k1);
}; };
make_ParallelTensorFunctor(f_nchw, make_ParallelTensorFunctor(f_nchw,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment