"git@developer.sourcefind.cn:change/sglang.git" did not exist on "6d6e24bcc473a8f7ea9d3da178cbc53624bf9814"
Commit 9db34134 authored by Bartlomiej Kocot
Browse files

Fail when no kernel is applicable

parent 8f84a012
...@@ -126,6 +126,7 @@ bool profile_avg_pool3d_bwd_impl(int do_verification, ...@@ -126,6 +126,7 @@ bool profile_avg_pool3d_bwd_impl(int do_verification,
std::string best_instance_name; std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max(); float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
if(do_verification) if(do_verification)
{ {
...@@ -145,8 +146,6 @@ bool profile_avg_pool3d_bwd_impl(int do_verification, ...@@ -145,8 +146,6 @@ bool profile_avg_pool3d_bwd_impl(int do_verification,
ref_invoker.Run(ref_pooling_bwd_argument); ref_invoker.Run(ref_pooling_bwd_argument);
} }
int num_kernel = 0;
for(auto& inst_ptr : instance_ptrs) for(auto& inst_ptr : instance_ptrs)
{ {
auto argument_ptr = inst_ptr->MakeArgumentPointer( auto argument_ptr = inst_ptr->MakeArgumentPointer(
......
...@@ -263,6 +263,7 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification, ...@@ -263,6 +263,7 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification,
float best_ave_time = 0; float best_ave_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances // profile device op instances
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
...@@ -301,6 +302,7 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification, ...@@ -301,6 +302,7 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
std::string op_name = op_ptr->GetTypeString(); std::string op_name = op_ptr->GetTypeString();
float ave_time = float ave_time =
...@@ -350,6 +352,12 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification, ...@@ -350,6 +352,12 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl; << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
......
...@@ -273,6 +273,7 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification, ...@@ -273,6 +273,7 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
float best_ave_time = 0; float best_ave_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances // profile device op instances
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
...@@ -310,6 +311,7 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification, ...@@ -310,6 +311,7 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
std::string op_name = op_ptr->GetTypeString(); std::string op_name = op_ptr->GetTypeString();
float ave_time = float ave_time =
...@@ -385,6 +387,12 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification, ...@@ -385,6 +387,12 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl; << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
......
...@@ -223,6 +223,7 @@ bool profile_batched_gemm_gemm_impl(bool do_verification, ...@@ -223,6 +223,7 @@ bool profile_batched_gemm_gemm_impl(bool do_verification,
float best_ave_time = 0; float best_ave_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances // profile device op instances
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
...@@ -255,6 +256,7 @@ bool profile_batched_gemm_gemm_impl(bool do_verification, ...@@ -255,6 +256,7 @@ bool profile_batched_gemm_gemm_impl(bool do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
std::string op_name = op_ptr->GetTypeString(); std::string op_name = op_ptr->GetTypeString();
float ave_time = float ave_time =
...@@ -309,6 +311,12 @@ bool profile_batched_gemm_gemm_impl(bool do_verification, ...@@ -309,6 +311,12 @@ bool profile_batched_gemm_gemm_impl(bool do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl; << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
......
...@@ -136,6 +136,7 @@ bool profile_batched_gemm_impl(int do_verification, ...@@ -136,6 +136,7 @@ bool profile_batched_gemm_impl(int do_verification,
float best_ave_time = 0; float best_ave_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances // profile device op instances
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
...@@ -201,6 +202,7 @@ bool profile_batched_gemm_impl(int do_verification, ...@@ -201,6 +202,7 @@ bool profile_batched_gemm_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
// re-init C to zero before profiling next kernel // re-init C to zero before profiling next kernel
c_device_buf.SetZero(); c_device_buf.SetZero();
...@@ -254,6 +256,12 @@ bool profile_batched_gemm_impl(int do_verification, ...@@ -254,6 +256,12 @@ bool profile_batched_gemm_impl(int do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl; << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
......
...@@ -254,6 +254,7 @@ bool profile_batched_gemm_reduce_impl(int do_verification, ...@@ -254,6 +254,7 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
float best_ave_time = 0; float best_ave_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device GEMM instances // profile device GEMM instances
for(auto& gemm_ptr : gemm_ptrs) for(auto& gemm_ptr : gemm_ptrs)
...@@ -281,6 +282,7 @@ bool profile_batched_gemm_reduce_impl(int do_verification, ...@@ -281,6 +282,7 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
// init DO, D1 to 0 // init DO, D1 to 0
reduce0_device_buf.SetZero(); reduce0_device_buf.SetZero();
reduce1_device_buf.SetZero(); reduce1_device_buf.SetZero();
...@@ -352,6 +354,12 @@ bool profile_batched_gemm_reduce_impl(int do_verification, ...@@ -352,6 +354,12 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl; << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
......
...@@ -251,6 +251,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification, ...@@ -251,6 +251,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
float best_ave_time = 0; float best_ave_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances // profile device op instances
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
...@@ -283,6 +284,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification, ...@@ -283,6 +284,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
std::string op_name = op_ptr->GetTypeString(); std::string op_name = op_ptr->GetTypeString();
float ave_time = float ave_time =
...@@ -337,6 +339,12 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification, ...@@ -337,6 +339,12 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl; << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
......
...@@ -251,6 +251,7 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification, ...@@ -251,6 +251,7 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
float best_ave_time = 0; float best_ave_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances // profile device op instances
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
...@@ -284,6 +285,7 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification, ...@@ -284,6 +285,7 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
std::string op_name = op_ptr->GetTypeString(); std::string op_name = op_ptr->GetTypeString();
float ave_time = float ave_time =
...@@ -357,6 +359,12 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification, ...@@ -357,6 +359,12 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl; << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
......
...@@ -214,6 +214,7 @@ bool profile_batchnorm_backward_impl(bool do_verification, ...@@ -214,6 +214,7 @@ bool profile_batchnorm_backward_impl(bool do_verification,
std::string best_instance_name; std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max(); float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
if(do_verification) if(do_verification)
{ {
...@@ -264,8 +265,7 @@ bool profile_batchnorm_backward_impl(bool do_verification, ...@@ -264,8 +265,7 @@ bool profile_batchnorm_backward_impl(bool do_verification,
(void)invoker_ptr_ref->Run(argument_ptr_ref.get()); (void)invoker_ptr_ref->Run(argument_ptr_ref.get());
} }
int num_kernel = 0; bool pass = true;
bool pass = true;
for(auto& inst_ptr : instance_ptrs) for(auto& inst_ptr : instance_ptrs)
{ {
...@@ -371,18 +371,18 @@ bool profile_batchnorm_backward_impl(bool do_verification, ...@@ -371,18 +371,18 @@ bool profile_batchnorm_backward_impl(bool do_verification,
}; };
} }
if(time_kernel)
{
std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_instance_name << std::endl;
}
if(num_kernel == 0) if(num_kernel == 0)
{ {
std::cout << "Error: No kernel is applicable" << std::endl; std::cout << "Error: No kernel is applicable" << std::endl;
return false; return false;
} }
if(time_kernel)
{
std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_instance_name << std::endl;
}
return pass; return pass;
} }
......
...@@ -209,6 +209,7 @@ bool profile_batchnorm_forward_impl(int do_verification, ...@@ -209,6 +209,7 @@ bool profile_batchnorm_forward_impl(int do_verification,
std::string best_instance_name; std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max(); float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
if(do_verification) if(do_verification)
{ {
...@@ -258,8 +259,7 @@ bool profile_batchnorm_forward_impl(int do_verification, ...@@ -258,8 +259,7 @@ bool profile_batchnorm_forward_impl(int do_verification,
(void)invoker_ptr_ref->Run(argument_ptr_ref.get()); (void)invoker_ptr_ref->Run(argument_ptr_ref.get());
} }
int num_kernel = 0; bool pass = true;
bool pass = true;
for(auto& inst_ptr : instance_ptrs) for(auto& inst_ptr : instance_ptrs)
{ {
...@@ -393,18 +393,18 @@ bool profile_batchnorm_forward_impl(int do_verification, ...@@ -393,18 +393,18 @@ bool profile_batchnorm_forward_impl(int do_verification,
}; };
} }
if(time_kernel)
{
std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_instance_name << std::endl;
}
if(num_kernel == 0) if(num_kernel == 0)
{ {
std::cout << "Error: No kernel is applicable" << std::endl; std::cout << "Error: No kernel is applicable" << std::endl;
return false; return false;
} }
if(time_kernel)
{
std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_instance_name << std::endl;
}
return pass; return pass;
} }
......
...@@ -183,6 +183,7 @@ bool profile_batchnorm_infer_impl(int do_verification, ...@@ -183,6 +183,7 @@ bool profile_batchnorm_infer_impl(int do_verification,
std::string best_instance_name; std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max(); float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
if(do_verification) if(do_verification)
{ {
...@@ -230,8 +231,7 @@ bool profile_batchnorm_infer_impl(int do_verification, ...@@ -230,8 +231,7 @@ bool profile_batchnorm_infer_impl(int do_verification,
(void)invoker_ptr_ref->Run(argument_ptr_ref.get()); (void)invoker_ptr_ref->Run(argument_ptr_ref.get());
} }
int num_kernel = 0; bool pass = true;
bool pass = true;
for(auto& inst_ptr : instance_ptrs) for(auto& inst_ptr : instance_ptrs)
{ {
...@@ -316,18 +316,18 @@ bool profile_batchnorm_infer_impl(int do_verification, ...@@ -316,18 +316,18 @@ bool profile_batchnorm_infer_impl(int do_verification,
}; };
} }
if(time_kernel)
{
std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_instance_name << std::endl;
}
if(num_kernel == 0) if(num_kernel == 0)
{ {
std::cout << "Error: No kernel is applicable" << std::endl; std::cout << "Error: No kernel is applicable" << std::endl;
return false; return false;
} }
if(time_kernel)
{
std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_instance_name << std::endl;
}
return pass; return pass;
} }
......
...@@ -183,6 +183,7 @@ int profile_contraction_impl(ck::index_t do_verification, ...@@ -183,6 +183,7 @@ int profile_contraction_impl(ck::index_t do_verification,
float best_avg_time = 0; float best_avg_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances // profile device op instances
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
...@@ -239,6 +240,7 @@ int profile_contraction_impl(ck::index_t do_verification, ...@@ -239,6 +240,7 @@ int profile_contraction_impl(ck::index_t do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
// re-init C to zero before profiling next kernel // re-init C to zero before profiling next kernel
e_device_buf.SetZero(); e_device_buf.SetZero();
...@@ -333,6 +335,12 @@ int profile_contraction_impl(ck::index_t do_verification, ...@@ -333,6 +335,12 @@ int profile_contraction_impl(ck::index_t do_verification,
std::cout << " CDELayout = ColumnMajor"; std::cout << " CDELayout = ColumnMajor";
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << " M = " << M << " N = " << N << " K = " << K << " StridesA = " << StridesA std::cout << " M = " << M << " N = " << N << " K = " << K << " StridesA = " << StridesA
<< " StridesB = " << StridesB << " StridesE = " << StridesE << " : " << best_avg_time << " StridesB = " << StridesB << " StridesE = " << StridesE << " : " << best_avg_time
<< " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, " << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "
......
...@@ -151,6 +151,7 @@ bool profile_conv_bwd_data_impl(int do_verification, ...@@ -151,6 +151,7 @@ bool profile_conv_bwd_data_impl(int do_verification,
float best_avg_time = 0; float best_avg_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device Conv instances // profile device Conv instances
bool pass = true; bool pass = true;
...@@ -177,6 +178,7 @@ bool profile_conv_bwd_data_impl(int do_verification, ...@@ -177,6 +178,7 @@ bool profile_conv_bwd_data_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
// for conv bwd data, some input tensor element are zero, but not written by kernel, // for conv bwd data, some input tensor element are zero, but not written by kernel,
// need to set zero // need to set zero
in_device_buf.SetZero(); in_device_buf.SetZero();
...@@ -237,6 +239,12 @@ bool profile_conv_bwd_data_impl(int do_verification, ...@@ -237,6 +239,12 @@ bool profile_conv_bwd_data_impl(int do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best configuration parameters:" std::cout << "Best configuration parameters:"
<< "\nname: " << best_op_name << "\navg_time: " << best_avg_time << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
<< "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl; << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
......
...@@ -192,6 +192,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification, ...@@ -192,6 +192,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
float best_ave_time = 0; float best_ave_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device Conv instances // profile device Conv instances
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
...@@ -220,6 +221,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification, ...@@ -220,6 +221,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
std::string conv_name = op_ptr->GetTypeString(); std::string conv_name = op_ptr->GetTypeString();
float ave_time = float ave_time =
...@@ -270,6 +272,12 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification, ...@@ -270,6 +272,12 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_conv_name << std::endl; << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
} }
......
...@@ -182,6 +182,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification, ...@@ -182,6 +182,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
float best_ave_time = 0; float best_ave_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device Conv instances // profile device Conv instances
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
...@@ -209,6 +210,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification, ...@@ -209,6 +210,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
std::string conv_name = op_ptr->GetTypeString(); std::string conv_name = op_ptr->GetTypeString();
float ave_time = float ave_time =
...@@ -258,6 +260,12 @@ void profile_conv_fwd_bias_relu_impl(int do_verification, ...@@ -258,6 +260,12 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_conv_name << std::endl; << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
} }
......
...@@ -133,6 +133,7 @@ bool profile_conv_fwd_impl(int do_verification, ...@@ -133,6 +133,7 @@ bool profile_conv_fwd_impl(int do_verification,
float best_avg_time = 0; float best_avg_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances // profile device op instances
bool pass = true; bool pass = true;
...@@ -159,6 +160,7 @@ bool profile_conv_fwd_impl(int do_verification, ...@@ -159,6 +160,7 @@ bool profile_conv_fwd_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
// re-init output to zero before profiling next kernel // re-init output to zero before profiling next kernel
out_device_buf.SetZero(); out_device_buf.SetZero();
...@@ -210,6 +212,12 @@ bool profile_conv_fwd_impl(int do_verification, ...@@ -210,6 +212,12 @@ bool profile_conv_fwd_impl(int do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best configuration parameters:" std::cout << "Best configuration parameters:"
<< "\nname: " << best_op_name << "\navg_time: " << best_avg_time << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
<< "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl; << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
......
...@@ -137,6 +137,7 @@ bool profile_elementwise_layernorm_impl(int do_verification, ...@@ -137,6 +137,7 @@ bool profile_elementwise_layernorm_impl(int do_verification,
std::string best_instance_name; std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max(); float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
if(do_verification) if(do_verification)
{ {
...@@ -163,8 +164,6 @@ bool profile_elementwise_layernorm_impl(int do_verification, ...@@ -163,8 +164,6 @@ bool profile_elementwise_layernorm_impl(int do_verification,
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
} }
int num_kernel = 0;
for(auto& inst_ptr : instance_ptrs) for(auto& inst_ptr : instance_ptrs)
{ {
auto argument_ptr = inst_ptr->MakeArgumentPointer( auto argument_ptr = inst_ptr->MakeArgumentPointer(
...@@ -246,6 +245,12 @@ bool profile_elementwise_layernorm_impl(int do_verification, ...@@ -246,6 +245,12 @@ bool profile_elementwise_layernorm_impl(int do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
if(time_kernel) if(time_kernel)
{ {
LogRange(std::cout << "length = ", length, ",") << ", "; LogRange(std::cout << "length = ", length, ",") << ", ";
...@@ -253,12 +258,6 @@ bool profile_elementwise_layernorm_impl(int do_verification, ...@@ -253,12 +258,6 @@ bool profile_elementwise_layernorm_impl(int do_verification,
<< best_gb_per_sec << " GB/s, " << best_instance_name << std::endl; << best_gb_per_sec << " GB/s, " << best_instance_name << std::endl;
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is tested" << std::endl;
return false;
}
return true; return true;
} }
......
...@@ -164,6 +164,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification, ...@@ -164,6 +164,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
float best_ave_time = 0; float best_ave_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
bool pass = true; bool pass = true;
...@@ -193,6 +194,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification, ...@@ -193,6 +194,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
// re-init E to zero before profiling a kernel // re-init E to zero before profiling a kernel
e_device_buf.SetZero(); e_device_buf.SetZero();
...@@ -232,6 +234,12 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification, ...@@ -232,6 +234,12 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl; << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
......
...@@ -155,6 +155,7 @@ bool profile_gemm_add_fastgelu_impl(int do_verification, ...@@ -155,6 +155,7 @@ bool profile_gemm_add_fastgelu_impl(int do_verification,
float best_ave_time = 0; float best_ave_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
bool pass = true; bool pass = true;
...@@ -183,6 +184,7 @@ bool profile_gemm_add_fastgelu_impl(int do_verification, ...@@ -183,6 +184,7 @@ bool profile_gemm_add_fastgelu_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
// re-init E to zero before profiling a kernel // re-init E to zero before profiling a kernel
e_device_buf.SetZero(); e_device_buf.SetZero();
...@@ -222,6 +224,12 @@ bool profile_gemm_add_fastgelu_impl(int do_verification, ...@@ -222,6 +224,12 @@ bool profile_gemm_add_fastgelu_impl(int do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl; << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
......
...@@ -164,6 +164,7 @@ bool profile_gemm_add_multiply_impl(int do_verification, ...@@ -164,6 +164,7 @@ bool profile_gemm_add_multiply_impl(int do_verification,
float best_ave_time = 0; float best_ave_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
bool pass = true; bool pass = true;
...@@ -193,6 +194,7 @@ bool profile_gemm_add_multiply_impl(int do_verification, ...@@ -193,6 +194,7 @@ bool profile_gemm_add_multiply_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
// re-init E to zero before profiling a kernel // re-init E to zero before profiling a kernel
e_device_buf.SetZero(); e_device_buf.SetZero();
...@@ -232,6 +234,12 @@ bool profile_gemm_add_multiply_impl(int do_verification, ...@@ -232,6 +234,12 @@ bool profile_gemm_add_multiply_impl(int do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl; << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment