"...resnet50_tensorflow.git" did not exist on "2b2d4820ca868e427e9b36cd9827aabd866ebdc2"
Commit 3567bf79 authored by Jing Zhang
Browse files

clean code

parent e8f5ca1a
...@@ -530,9 +530,9 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1 ...@@ -530,9 +530,9 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v1
decltype(c_m0_m1_m2_n_thread_desc), decltype(c_m0_m1_m2_n_thread_desc),
decltype(c_m0_m1_m2_n_global_desc), decltype(c_m0_m1_m2_n_global_desc),
Sequence<M0, 1, M2, 1>, Sequence<M0, 1, M2, 1>,
Sequence<0, 1, 2, 3>, // CThreadTransferSrcDstAccessOrder, CThreadTransferSrcDstAccessOrder,
3, // CThreadTransferSrcDstVectorDim, CThreadTransferSrcDstVectorDim,
1, // CThreadTransferDstScalarPerVector, CThreadTransferDstScalarPerVector,
CGlobalMemoryDataOperation, CGlobalMemoryDataOperation,
1, 1,
true>{c_m0_m1_m2_n_global_desc, true>{c_m0_m1_m2_n_global_desc,
......
...@@ -445,9 +445,9 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v2 ...@@ -445,9 +445,9 @@ struct GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v2
decltype(c_m0_m1_m2_n_thread_desc), decltype(c_m0_m1_m2_n_thread_desc),
decltype(c_m0_m1_m2_n_global_desc), decltype(c_m0_m1_m2_n_global_desc),
Sequence<M0, 1, M2, 1>, Sequence<M0, 1, M2, 1>,
Sequence<0, 1, 2, 3>, // CThreadTransferSrcDstAccessOrder, CThreadTransferSrcDstAccessOrder,
3, // CThreadTransferSrcDstVectorDim, CThreadTransferSrcDstVectorDim,
1, // CThreadTransferDstScalarPerVector, CThreadTransferDstScalarPerVector,
CGlobalMemoryDataOperation, CGlobalMemoryDataOperation,
1, 1,
true>{c_m0_m1_m2_n_global_desc, true>{c_m0_m1_m2_n_global_desc,
......
...@@ -152,61 +152,65 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw ...@@ -152,61 +152,65 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw
for(index_t i = 0; i < 5; ++i) for(index_t i = 0; i < 5; ++i)
{ {
float ave_time = launch_kernel_dynamic_gemm_xdlops_v2< #if 0
BlockSize, float ave_time = launch_kernel_dynamic_gemm_xdlops_v1
TInWei, #else
TAcc, float ave_time = launch_kernel_dynamic_gemm_xdlops_v2
TOut, #endif
InMemoryDataOperation::Set, <BlockSize,
decltype(descs[I0]), TInWei,
decltype(descs[I1]), TAcc,
decltype(descs[I2]), TOut,
decltype(descs[I3]), InMemoryDataOperation::Set,
GemmMPerBlock, decltype(descs[I0]),
GemmNPerBlock, decltype(descs[I1]),
GemmKPerBlock, decltype(descs[I2]),
GemmMPerWave, decltype(descs[I3]),
GemmNPerWave, GemmMPerBlock,
GemmKPack, GemmNPerBlock,
MRepeat, GemmKPerBlock,
NRepeat, GemmMPerWave,
GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1, GemmNPerWave,
GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1, GemmKPack,
Sequence<1, 0, 2>, MRepeat,
Sequence<1, 0, 2>, NRepeat,
2, GemmABlockTransferThreadSliceLengths_GemmK0_GemmM_GemmK1,
GemmABlockTransferSrcScalarPerVector_GemmK, GemmABlockTransferThreadClusterLengths_GemmK0_GemmM_GemmK1,
GemmABlockTransferDstScalarPerVector_KPack, Sequence<1, 0, 2>,
false, // don't move back src coordinate after threadwise copy Sequence<1, 0, 2>,
GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1, 2,
GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1, GemmABlockTransferSrcScalarPerVector_GemmK,
Sequence<0, 2, 1>, GemmABlockTransferDstScalarPerVector_KPack,
Sequence<1, 0, 2>, false, // don't move back src coordinate after threadwise copy
1, GemmBBlockTransferThreadSliceLengths_GemmK0_GemmN_GemmK1,
GemmBBlockTransferSrcScalarPerVector_GemmN, GemmBBlockTransferThreadClusterLengths_GemmK0_GemmN_GemmK1,
GemmBBlockTransferDstScalarPerVector_KPack, Sequence<0, 2, 1>,
false, // don't move back src coordinate after threadwise copy, which will be fused with Sequence<1, 0, 2>,
// MoveSrcSliceWindow() to save addr computation 1,
Sequence<2, 3, 0, 1>, GemmBBlockTransferSrcScalarPerVector_GemmN,
3, GemmBBlockTransferDstScalarPerVector_KPack,
GemmCThreadTransferDstScalarPerVector_GemmN1, false, // don't move back src coordinate after threadwise copy, which will be fused
decltype(descs[I4]), // with MoveSrcSliceWindow() to save addr computation
decltype(descs[I5]), Sequence<2, 3, 0, 1>,
decltype(descs[I6]), 3,
decltype(descs[I7]), GemmCThreadTransferDstScalarPerVector_GemmN1,
decltype(descs[I8])>(static_cast<TInWei*>(wei_k_c_y_x_device_buf.GetDeviceBuffer()), decltype(descs[I4]),
static_cast<TInWei*>(in_n_c_hi_wi_device_buf.GetDeviceBuffer()), decltype(descs[I5]),
static_cast<TOut*>(out_n_k_ho_wo_device_buf.GetDeviceBuffer()), decltype(descs[I6]),
descs[I0], decltype(descs[I7]),
descs[I1], decltype(descs[I8])>(static_cast<TInWei*>(wei_k_c_y_x_device_buf.GetDeviceBuffer()),
descs[I2], static_cast<TInWei*>(in_n_c_hi_wi_device_buf.GetDeviceBuffer()),
descs[I3], static_cast<TOut*>(out_n_k_ho_wo_device_buf.GetDeviceBuffer()),
descs[I4], descs[I0],
descs[I5], descs[I1],
descs[I6], descs[I2],
descs[I7], descs[I3],
descs[I8], descs[I4],
nrepeat); descs[I5],
descs[I6],
descs[I7],
descs[I8],
nrepeat);
float perf = (float)calculate_convolution_flops( float perf = (float)calculate_convolution_flops(
in_n_c_hi_wi_desc, wei_k_c_y_x_desc, out_n_k_ho_wo_desc) / in_n_c_hi_wi_desc, wei_k_c_y_x_desc, out_n_k_ho_wo_desc) /
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment