Commit 2a48812e authored by Chao Liu's avatar Chao Liu
Browse files

Behavior has changed (better in some cases, worse in others); investigating why.

parent acd7082f
...@@ -57,7 +57,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, ...@@ -57,7 +57,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data()); wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 1 #if 0
// for 3x3, 34x34, v1r3, Pascal // for 3x3, 34x34, v1r3, Pascal
constexpr index_t BlockSize = 128; constexpr index_t BlockSize = 128;
...@@ -162,7 +162,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, ...@@ -162,7 +162,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
constexpr index_t WeiBlockCopyDataPerRead_K = 4; constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_W = 2; constexpr index_t OutThreadCopyDataPerWrite_W = 2;
#elif 0 #elif 1
// for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 8 // for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 8
constexpr index_t BlockSize = 256; constexpr index_t BlockSize = 256;
......
...@@ -286,10 +286,9 @@ struct ConstantTensorDescriptor ...@@ -286,10 +286,9 @@ struct ConstantTensorDescriptor
"wrong! dimensions to be unfolded need to be packed"); "wrong! dimensions to be unfolded need to be packed");
// checkt ranks // checkt ranks
static_assert(GetMemoryRank(IDim_p1) = GetMemoryRank(IDim) + 1, static_assert(GetMemoryRank(IDim_p1) == GetMemoryRank(IDim) + 1,
"wrong! ranks of dimensions to be " "wrong! ranks of dimensions to be unfolded need to be in increasing and "
"unfolded need to be in increasing " "continuous ranks");
"and continuous ranks");
}); });
// left and right // left and right
......
...@@ -39,7 +39,8 @@ struct BlockwiseTensorSliceReorderCopy_v3 ...@@ -39,7 +39,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr auto thread_cluster_lengths = constexpr auto thread_cluster_lengths =
src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster); src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster);
constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(thread_cluster_lengths); constexpr auto thread_cluster_desc =
make_packed_ConstantTensorDescriptor(thread_cluster_lengths);
// sanity check: data type // sanity check: data type
static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n"); static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");
...@@ -147,7 +148,8 @@ struct BlockwiseTensorSliceReorderCopy_v3 ...@@ -147,7 +148,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths); constexpr auto thread_tensor_desc =
make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
return thread_tensor_desc.GetElementSpace(); return thread_tensor_desc.GetElementSpace();
} }
...@@ -167,7 +169,8 @@ struct BlockwiseTensorSliceReorderCopy_v3 ...@@ -167,7 +169,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths); constexpr auto thread_tensor_desc =
make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) { static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
...@@ -204,7 +207,8 @@ struct BlockwiseTensorSliceReorderCopy_v3 ...@@ -204,7 +207,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;
constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths); constexpr auto thread_tensor_desc =
make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) { static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
......
...@@ -362,8 +362,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn ...@@ -362,8 +362,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
const index_t n_thread_data_begin = c_thread_mtx_begin.col % NPerBlock; const index_t n_thread_data_begin = c_thread_mtx_begin.col % NPerBlock;
static_if<GemmNPerThreadSubC <= NPerBlock>{}([&](auto fwd) { // fwd do nothing but static_if<GemmNPerThreadSubC <= NPerBlock>{}([&](auto fwd) { // fwd do nothing but
// perfect forwarding. // perfect forwarding.
// Using this trick to // Using this trick to
// make this lambda a generic lambda, so it won't be compiled until // make this lambda a generic lambda, so it won't be compiled until
// instantiated // instantiated
static_assert( static_assert(
......
...@@ -196,7 +196,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw ...@@ -196,7 +196,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
// choose GEMM implementation here // choose GEMM implementation here
const auto run_blockwise_batch_gemm = [&](auto... Xs) { const auto run_blockwise_batch_gemm = [&](auto... Xs) {
#if 1 #if 0
return blockwise_batch_gemm.Run(Xs...); return blockwise_batch_gemm.Run(Xs...);
#elif 0 #elif 0
return blockwise_batch_gemm.Run_asm(Xs...); return blockwise_batch_gemm.Run_asm(Xs...);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment