"...git@developer.sourcefind.cn:OpenDAS/mmdetection3d.git" did not exist on "e970a3189eddd724ea772bfe7f41ab416c8a0396"
Commit 7d0a5412 authored by root's avatar root
Browse files

threadwise transfer

parent b3a012bc
...@@ -535,7 +535,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v2 ...@@ -535,7 +535,8 @@ struct ThreadwiseDynamicTensorSliceTransfer_v2
dst_desc.CalculateOffset(to_multi_index(dst_slice_origin_idx) + src_data_idx + dst_desc.CalculateOffset(to_multi_index(dst_slice_origin_idx) + src_data_idx +
i * src_scalar_step_in_vector); i * src_scalar_step_in_vector);
p_dst[Number<dst_offset>{}] = src_vector[i]; // p_dst[Number<dst_offset>{}] = src_vector[i];
p_dst[Number<dst_offset>{}] = src_vector.Scalars()(i);
}); });
constexpr auto move_on_dim = [&]() constexpr constexpr auto move_on_dim = [&]() constexpr
......
...@@ -28,33 +28,6 @@ __device__ void threadwise_matrix_set_zero_v3(Desc, Float* __restrict__ p_thread ...@@ -28,33 +28,6 @@ __device__ void threadwise_matrix_set_zero_v3(Desc, Float* __restrict__ p_thread
}); });
} }
// Copies a 2-D slice from p_src to p_dst using vector-typed accesses.
//
// Template parameters:
//   SrcDesc / DstDesc - descriptor types used to turn a (row, col) index into a
//                       linear offset; both must be known at compile time.
//   NSliceRow         - end bound of the row loop (step 1).
//   NSliceCol         - end bound of the column loop (step DataPerAccess).
//   DataPerAccess     - number of Data elements moved by one vector access.
//
// NOTE(review): the reinterpret_cast to vector_t presumably requires every computed
// offset to be suitably aligned for vector_t - confirm against the callers.
template <typename SrcDesc,
          typename DstDesc,
          index_t NSliceRow,
          index_t NSliceCol,
          index_t DataPerAccess>
struct ThreadwiseMatrixSliceCopy_v3
{
    // p_src: base pointer of the source data (read only).
    // p_dst: base pointer of the destination data (written).
    template <typename Data>
    __device__ static void Run(const Data* p_src, Data* p_dst)
    {
        static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
                      "wrong! Desc should be known at compile-time");

        // One vector_t covers DataPerAccess consecutive Data elements.
        using vector_t = typename vector_type<Data, DataPerAccess>::type;

        static_for<0, NSliceRow, 1>{}([&](auto row_idx) {
            static_for<0, NSliceCol, DataPerAccess>{}([&](auto col_idx) {
                // Both offsets are evaluated as compile-time constants.
                constexpr auto src_offset =
                    SrcDesc{}.CalculateOffset(make_tuple(row_idx, col_idx));
                constexpr auto dst_offset =
                    DstDesc{}.CalculateOffset(make_tuple(row_idx, col_idx));

                // Reinterpret the element addresses as vector addresses and
                // move the whole vector in a single assignment.
                const vector_t* src_vec = reinterpret_cast<const vector_t*>(&p_src[src_offset]);
                vector_t* dst_vec       = reinterpret_cast<vector_t*>(&p_dst[dst_offset]);

                *dst_vec = *src_vec;
            });
        });
    }
};
// C[M, N] += transpose(A[K, M]) * B[K, N] // C[M, N] += transpose(A[K, M]) * B[K, N]
// Element of matrix can be vectorized data // Element of matrix can be vectorized data
template <typename ADesc, template <typename ADesc,
...@@ -75,9 +48,9 @@ struct ThreadwiseGemm_km_kn_mn_v3 ...@@ -75,9 +48,9 @@ struct ThreadwiseGemm_km_kn_mn_v3
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
constexpr auto M = CDesc{}[I0]; constexpr auto M = CDesc{}.GetLength(I0);
constexpr auto N = CDesc{}[I1]; constexpr auto N = CDesc{}.GetLength(I1);
constexpr auto K = ADesc{}[I0]; constexpr auto K = ADesc{}.GetLength(I0);
static_for<0, K, 1>{}([&](auto k) { static_for<0, K, 1>{}([&](auto k) {
static_for<0, M, 1>{}([&](auto m) { static_for<0, M, 1>{}([&](auto m) {
......
...@@ -76,7 +76,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc ...@@ -76,7 +76,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc
constexpr index_t GemmMPerThread = 16; constexpr index_t GemmMPerThread = 16;
constexpr index_t GemmNPerThread = 1; constexpr index_t GemmNPerThread = 1;
constexpr index_t GemmKPerThread = 1; constexpr index_t GemmKPerThread = 4;
constexpr index_t GemmMLevel0Cluster = 1; constexpr index_t GemmMLevel0Cluster = 1;
constexpr index_t GemmNLevel0Cluster = 1; constexpr index_t GemmNLevel0Cluster = 1;
......
...@@ -779,7 +779,7 @@ int main(int argc, char* argv[]) ...@@ -779,7 +779,7 @@ int main(int argc, char* argv[])
#if 1 #if 1
// LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl; // LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
// LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl; // LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl;
LogRange(std::cout << "out_nkhw_host : ", out_nkhw_host.mData, ",") << std::endl; // LogRange(std::cout << "out_nkhw_host : ", out_nkhw_host.mData, ",") << std::endl;
LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl; LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl;
#endif #endif
} }
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment