Commit 9ed2de0b authored by rocking's avatar rocking
Browse files

[What] Refine perf evaluation in example of gemm + reduction

[Why] Evaluation of gemm + reduction may cause verification failure, because the evaluation run does not initialize global memory.
parent 086625dc
...@@ -67,6 +67,20 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataTyp ...@@ -67,6 +67,20 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataTyp
BElementOp, BElementOp,
CElementOp>; CElementOp>;
template <typename ADataType, typename BDataType, typename CDataType, typename DDataType>
void DumpGemmLayerNormPerf(float gemm_reduce_time, int M, int N, int K)
{
std::size_t gemm_flop = std::size_t(2) * M * N * K;
std::size_t gemm_num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(CDataType) * M * N + sizeof(DDataType) * M;
float tflops = static_cast<float>(gemm_flop) / 1.E9 / gemm_reduce_time;
float gemm_gb_per_sec = gemm_num_byte / 1.E6 / gemm_reduce_time;
std::cout << "gemm + reduceMax Perf: " << gemm_reduce_time << " ms, " << tflops << " TFlops, "
<< gemm_gb_per_sec << " GB/s, " << std::endl;
}
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
bool do_verification = true; bool do_verification = true;
...@@ -198,21 +212,10 @@ int main(int argc, char* argv[]) ...@@ -198,21 +212,10 @@ int main(int argc, char* argv[])
"not support this GEMM problem"); "not support this GEMM problem");
} }
// init D	// [CAUTION]: launch_and_time_kernel will not initialize D.
	// If we evaluate the kernel multiple times without initializing D, verification will fail.
d_device_buf.SetValue(ck::NumericLimits<DDataType>::Lowest()); d_device_buf.SetValue(ck::NumericLimits<DDataType>::Lowest());
invoker.Run(argument, StreamConfig{nullptr, false});
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
bool pass = true; bool pass = true;
...@@ -251,5 +254,13 @@ int main(int argc, char* argv[]) ...@@ -251,5 +254,13 @@ int main(int argc, char* argv[])
1e-3); 1e-3);
} }
if(time_kernel)
{
float gemm_reduceMax_ave_time = invoker.Run(argument, StreamConfig{nullptr, true});
DumpGemmLayerNormPerf<ADataType, BDataType, CDataType, DDataType>(
gemm_reduceMax_ave_time, M, N, K);
}
return pass ? 0 : 1; return pass ? 0 : 1;
} }
...@@ -78,6 +78,21 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataTyp ...@@ -78,6 +78,21 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataTyp
BElementOp, BElementOp,
CElementOp>; CElementOp>;
template <typename ADataType, typename BDataType, typename CDataType, typename DDataType>
void DumpGemmLayerNormPerf(float gemm_reduce_time, int M, int N, int K)
{
std::size_t gemm_flop = std::size_t(2) * M * N * K;
std::size_t gemm_num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(CDataType) * M * N + sizeof(DDataType) * M +
sizeof(DDataType) * M;
float tflops = static_cast<float>(gemm_flop) / 1.E9 / gemm_reduce_time;
float gemm_gb_per_sec = gemm_num_byte / 1.E6 / gemm_reduce_time;
std::cout << "gemm + reduce_mean + reduce_mean_square Perf: " << gemm_reduce_time << " ms, "
<< tflops << " TFlops, " << gemm_gb_per_sec << " GB/s, " << std::endl;
}
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
bool do_verification = true; bool do_verification = true;
...@@ -224,19 +239,7 @@ int main(int argc, char* argv[]) ...@@ -224,19 +239,7 @@ int main(int argc, char* argv[])
// if time_kernel == true, kernel will run multiple times. This kernel use atomic-add so result // if time_kernel == true, kernel will run multiple times. This kernel use atomic-add so result
// will not be correct. need to set time_kernel = false for correctness test // will not be correct. need to set time_kernel = false for correctness test
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); invoker.Run(argument, StreamConfig{nullptr, false});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
bool pass = true; bool pass = true;
if(do_verification) if(do_verification)
...@@ -294,5 +297,12 @@ int main(int argc, char* argv[]) ...@@ -294,5 +297,12 @@ int main(int argc, char* argv[])
1e-5); 1e-5);
} }
if(time_kernel)
{
float ave_time = invoker.Run(argument, StreamConfig{nullptr, true});
DumpGemmLayerNormPerf<ADataType, BDataType, CDataType, DDataType>(ave_time, M, N, K);
}
return pass ? 0 : 1; return pass ? 0 : 1;
} }
...@@ -214,7 +214,7 @@ void DumpGemmLayerNormPerf(float gemm_reduce_time, float normalize_time, int M, ...@@ -214,7 +214,7 @@ void DumpGemmLayerNormPerf(float gemm_reduce_time, float normalize_time, int M,
std::cout << "gemm + reduce_mean + reduce_square_mean Perf: " << gemm_reduce_time << " ms, " std::cout << "gemm + reduce_mean + reduce_square_mean Perf: " << gemm_reduce_time << " ms, "
<< tflops << " TFlops, " << gemm_gb_per_sec << " GB/s, " << std::endl; << tflops << " TFlops, " << gemm_gb_per_sec << " GB/s, " << std::endl;
std::cout << "gemm + reduce_mean + reduce_square_mean Perf: " << normalize_time << " ms, " std::cout << "5-ary elementwise Perf: " << normalize_time << " ms, "
<< normalize_gb_per_sec << " GB/s, " << std::endl; << normalize_gb_per_sec << " GB/s, " << std::endl;
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment