Commit a68b16a5 authored by Chao Liu's avatar Chao Liu
Browse files

refactor

parent 0a386c46
...@@ -646,9 +646,9 @@ int main(int argc, char* argv[]) ...@@ -646,9 +646,9 @@ int main(int argc, char* argv[])
device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
#elif 0 #elif 0
device_convolution_implicit_gemm_v2_chwn_cyxk_khwn device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
#elif 1
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
#elif 0 #elif 0
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
#elif 1
device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw
#endif #endif
(in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
......
...@@ -203,7 +203,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -203,7 +203,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths); make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) { static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
#if 1 #if 0
constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){}); constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims; const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
...@@ -215,7 +215,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -215,7 +215,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
const index_t clipboard_offset = const index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin); thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
#else #else // HIP compiler performs better with these codes
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
constexpr auto src_thread_data_multi_id_begin = constexpr auto src_thread_data_multi_id_begin =
...@@ -256,7 +256,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -256,7 +256,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths); make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) { static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
#if 1 #if 0
constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){}); constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths; const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
...@@ -267,7 +267,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -267,7 +267,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin); thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin); const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin);
#else #else // HIP compiler performs better with these codes
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
constexpr auto clipboard_data_multi_id_begin = constexpr auto clipboard_data_multi_id_begin =
...@@ -338,27 +338,6 @@ struct BlockwiseGenericTensorSliceCopy_v1 ...@@ -338,27 +338,6 @@ struct BlockwiseGenericTensorSliceCopy_v1
src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex( src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex(
old_src_partial_original_multi_id, StepSize, direction); old_src_partial_original_multi_id, StepSize, direction);
#if 0
{
if(debug_flag && get_block_1d_id() == 0)
{
printf("id %5u %5u: "
"old_src_partial_original_multi_id %u %u %u, "
"new_src_partial_original_multi_id %u %u %u, "
"mThreadSrcOffset %u, mThreadDstOffset %u \n",
get_block_1d_id(),
get_thread_local_1d_id(),
old_src_partial_original_multi_id[0],
old_src_partial_original_multi_id[1],
old_src_partial_original_multi_id[2],
new_src_partial_original_multi_id[0],
new_src_partial_original_multi_id[1],
new_src_partial_original_multi_id[2]
);
}
}
#endif
// update "mThreadSrcOriginalMultiId" // update "mThreadSrcOriginalMultiId"
static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I_) { static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I_) {
constexpr auto I = decltype(I_){}; constexpr auto I = decltype(I_){};
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment