Commit 9db34134 authored by Bartlomiej Kocot
Browse files

Fail when no kernel is applicable

parent 8f84a012
...@@ -249,9 +249,9 @@ bool profile_gemm_add_relu_add_layernorm_impl(int do_verification, ...@@ -249,9 +249,9 @@ bool profile_gemm_add_relu_add_layernorm_impl(int do_verification,
std::string best_op_name; std::string best_op_name;
float best_ave_time = std::numeric_limits<float>::max(); float best_ave_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
bool pass = true; bool pass = true;
int num_kernel = 0;
// profile device operation instances // profile device operation instances
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
...@@ -283,7 +283,6 @@ bool profile_gemm_add_relu_add_layernorm_impl(int do_verification, ...@@ -283,7 +283,6 @@ bool profile_gemm_add_relu_add_layernorm_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
++num_kernel; ++num_kernel;
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get()); size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
DeviceMem workspace_dev(workspace_sz); DeviceMem workspace_dev(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer()); op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
...@@ -330,15 +329,14 @@ bool profile_gemm_add_relu_add_layernorm_impl(int do_verification, ...@@ -330,15 +329,14 @@ bool profile_gemm_add_relu_add_layernorm_impl(int do_verification,
if(num_kernel == 0) if(num_kernel == 0)
{ {
std::cout << "Error: No kernel is applicable" << std::endl; std::cout << "Error: No kernel is applicable" << std::endl;
pass = false; return false;
} }
else
{
if(time_kernel) if(time_kernel)
{
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_op_name << std::endl; << best_op_name << std::endl;
} }
return pass; return pass;
} }
......
...@@ -280,6 +280,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -280,6 +280,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
float best_ave_time = 0; float best_ave_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device GEMM instances // profile device GEMM instances
for(auto& gemm_ptr : gemm_ptrs) for(auto& gemm_ptr : gemm_ptrs)
...@@ -306,6 +307,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -306,6 +307,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
// init DO, D1 to 0 // init DO, D1 to 0
reduce0_device_buf.SetZero(); reduce0_device_buf.SetZero();
reduce1_device_buf.SetZero(); reduce1_device_buf.SetZero();
...@@ -376,6 +378,12 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -376,6 +378,12 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl; << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
} }
......
...@@ -157,6 +157,7 @@ bool profile_gemm_bilinear_impl(int do_verification, ...@@ -157,6 +157,7 @@ bool profile_gemm_bilinear_impl(int do_verification,
float best_ave_time = 0; float best_ave_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
bool pass = true; bool pass = true;
...@@ -185,6 +186,7 @@ bool profile_gemm_bilinear_impl(int do_verification, ...@@ -185,6 +186,7 @@ bool profile_gemm_bilinear_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
// re-init E to zero before profiling a kernel // re-init E to zero before profiling a kernel
e_device_buf.SetZero(); e_device_buf.SetZero();
...@@ -224,6 +226,12 @@ bool profile_gemm_bilinear_impl(int do_verification, ...@@ -224,6 +226,12 @@ bool profile_gemm_bilinear_impl(int do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl; << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
......
...@@ -146,6 +146,7 @@ bool profile_gemm_fastgelu_impl(int do_verification, ...@@ -146,6 +146,7 @@ bool profile_gemm_fastgelu_impl(int do_verification,
float best_ave_time = 0; float best_ave_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
bool pass = true; bool pass = true;
...@@ -173,6 +174,7 @@ bool profile_gemm_fastgelu_impl(int do_verification, ...@@ -173,6 +174,7 @@ bool profile_gemm_fastgelu_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
// re-init E to zero before profiling a kernel // re-init E to zero before profiling a kernel
e_device_buf.SetZero(); e_device_buf.SetZero();
...@@ -212,6 +214,12 @@ bool profile_gemm_fastgelu_impl(int do_verification, ...@@ -212,6 +214,12 @@ bool profile_gemm_fastgelu_impl(int do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl; << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
......
...@@ -134,6 +134,7 @@ int profile_gemm_impl(int do_verification, ...@@ -134,6 +134,7 @@ int profile_gemm_impl(int do_verification,
float best_avg_time = 0; float best_avg_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances // profile device op instances
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
...@@ -156,6 +157,7 @@ int profile_gemm_impl(int do_verification, ...@@ -156,6 +157,7 @@ int profile_gemm_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
// re-init C to zero before profiling next kernel // re-init C to zero before profiling next kernel
c_device_buf.SetZero(); c_device_buf.SetZero();
...@@ -242,6 +244,12 @@ int profile_gemm_impl(int do_verification, ...@@ -242,6 +244,12 @@ int profile_gemm_impl(int do_verification,
std::cout << " BLayout = ColumnMajor"; std::cout << " BLayout = ColumnMajor";
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA
<< " StrideB = " << StrideB << " StrideC = " << StrideC << " : " << best_avg_time << " StrideB = " << StrideB << " StrideC = " << StrideC << " : " << best_avg_time
<< " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, " << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "
......
...@@ -164,6 +164,7 @@ bool profile_gemm_multiply_add_impl(int do_verification, ...@@ -164,6 +164,7 @@ bool profile_gemm_multiply_add_impl(int do_verification,
float best_ave_time = 0; float best_ave_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
bool pass = true; bool pass = true;
...@@ -193,6 +194,7 @@ bool profile_gemm_multiply_add_impl(int do_verification, ...@@ -193,6 +194,7 @@ bool profile_gemm_multiply_add_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
// re-init E to zero before profiling a kernel // re-init E to zero before profiling a kernel
e_device_buf.SetZero(); e_device_buf.SetZero();
...@@ -232,6 +234,12 @@ bool profile_gemm_multiply_add_impl(int do_verification, ...@@ -232,6 +234,12 @@ bool profile_gemm_multiply_add_impl(int do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl; << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
......
...@@ -249,6 +249,7 @@ bool profile_gemm_reduce_impl(int do_verification, ...@@ -249,6 +249,7 @@ bool profile_gemm_reduce_impl(int do_verification,
float best_ave_time = 0; float best_ave_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device GEMM instances // profile device GEMM instances
for(auto& gemm_ptr : gemm_ptrs) for(auto& gemm_ptr : gemm_ptrs)
...@@ -275,6 +276,7 @@ bool profile_gemm_reduce_impl(int do_verification, ...@@ -275,6 +276,7 @@ bool profile_gemm_reduce_impl(int do_verification,
if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
// init DO, D1 to 0 // init DO, D1 to 0
reduce0_device_buf.SetZero(); reduce0_device_buf.SetZero();
reduce1_device_buf.SetZero(); reduce1_device_buf.SetZero();
...@@ -343,6 +345,12 @@ bool profile_gemm_reduce_impl(int do_verification, ...@@ -343,6 +345,12 @@ bool profile_gemm_reduce_impl(int do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl; << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
......
...@@ -136,6 +136,7 @@ bool profile_gemm_splitk_impl(int do_verification, ...@@ -136,6 +136,7 @@ bool profile_gemm_splitk_impl(int do_verification,
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
float best_kbatch = 0; float best_kbatch = 0;
int num_kernel = 0;
// profile device GEMM instances // profile device GEMM instances
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
...@@ -171,7 +172,7 @@ bool profile_gemm_splitk_impl(int do_verification, ...@@ -171,7 +172,7 @@ bool profile_gemm_splitk_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
// re-init C to zero before profiling next kernel // re-init C to zero before profiling next kernel
c_device_buf.SetZero(); c_device_buf.SetZero();
...@@ -281,6 +282,12 @@ bool profile_gemm_splitk_impl(int do_verification, ...@@ -281,6 +282,12 @@ bool profile_gemm_splitk_impl(int do_verification,
std::cout << " BLayout = ColumnMajor"; std::cout << " BLayout = ColumnMajor";
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA
<< " StrideB = " << StrideB << " StrideC = " << StrideC << " KBatch = " << best_kbatch << " StrideB = " << StrideB << " StrideC = " << StrideC << " KBatch = " << best_kbatch
<< " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once #pragma once
...@@ -137,6 +137,7 @@ bool profile_gemm_streamk_impl(int do_verification, ...@@ -137,6 +137,7 @@ bool profile_gemm_streamk_impl(int do_verification,
float best_ave_time = 0; float best_ave_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device GEMM instances // profile device GEMM instances
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
...@@ -167,6 +168,7 @@ bool profile_gemm_streamk_impl(int do_verification, ...@@ -167,6 +168,7 @@ bool profile_gemm_streamk_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
// re-init C to zero before profiling next kernel // re-init C to zero before profiling next kernel
c_device_buf.SetZero(); c_device_buf.SetZero();
...@@ -255,6 +257,12 @@ bool profile_gemm_streamk_impl(int do_verification, ...@@ -255,6 +257,12 @@ bool profile_gemm_streamk_impl(int do_verification,
std::cout << " BLayout = ColumnMajor"; std::cout << " BLayout = ColumnMajor";
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA
<< " StrideB = " << StrideB << " StrideC = " << StrideC << " : " << best_ave_time << " StrideB = " << StrideB << " StrideC = " << StrideC << " : " << best_ave_time
<< " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, " << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "
......
...@@ -120,6 +120,7 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, ...@@ -120,6 +120,7 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
float best_avg_time = 0; float best_avg_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances // profile device op instances
bool pass = true; bool pass = true;
...@@ -127,6 +128,7 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, ...@@ -127,6 +128,7 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
auto run_impl = [&](auto& op_ptr, auto& argument_ptr) { auto run_impl = [&](auto& op_ptr, auto& argument_ptr) {
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
// re-init output to zero before profiling next kernel // re-init output to zero before profiling next kernel
in_device_buf.SetZero(); in_device_buf.SetZero();
...@@ -246,6 +248,12 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, ...@@ -246,6 +248,12 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
run_impl(op_ptr, argument_ptr); run_impl(op_ptr, argument_ptr);
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best configuration parameters:" std::cout << "Best configuration parameters:"
<< "\nname: " << best_op_name << "\navg_time: " << best_avg_time << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
<< "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl; << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
......
...@@ -132,6 +132,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, ...@@ -132,6 +132,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
float best_avg_time = 0; float best_avg_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device Conv instances // profile device Conv instances
bool all_pass = true; bool all_pass = true;
...@@ -183,6 +184,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, ...@@ -183,6 +184,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
// using atomic add, so need to reset input // using atomic add, so need to reset input
wei_device_buf.SetZero(); wei_device_buf.SetZero();
...@@ -246,6 +248,12 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, ...@@ -246,6 +248,12 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best configuration parameters:" std::cout << "Best configuration parameters:"
<< "\nname: " << best_op_name << "\navg_time: " << best_avg_time << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
<< "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl; << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
......
...@@ -140,6 +140,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification, ...@@ -140,6 +140,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
float best_avg_time = 0; float best_avg_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances // profile device op instances
bool pass = true; bool pass = true;
...@@ -147,6 +148,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification, ...@@ -147,6 +148,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
auto run_impl = [&](auto& op_ptr, auto& argument_ptr) { auto run_impl = [&](auto& op_ptr, auto& argument_ptr) {
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
// re-init output to zero before profiling next kernel // re-init output to zero before profiling next kernel
out_device_buf.SetZero(); out_device_buf.SetZero();
...@@ -242,6 +244,12 @@ bool profile_grouped_conv_fwd_impl(int do_verification, ...@@ -242,6 +244,12 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
run_impl(op_ptr, argument_ptr); run_impl(op_ptr, argument_ptr);
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best configuration parameters:" std::cout << "Best configuration parameters:"
<< "\nname: " << best_op_name << "\navg_time: " << best_avg_time << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
<< "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl; << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
......
...@@ -166,6 +166,7 @@ bool profile_grouped_gemm_fastgelu_impl(int do_verification, ...@@ -166,6 +166,7 @@ bool profile_grouped_gemm_fastgelu_impl(int do_verification,
float best_ave_time = 0; float best_ave_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
auto p_ds = std::vector<std::array<const void*, 0>>{}; auto p_ds = std::vector<std::array<const void*, 0>>{};
...@@ -181,6 +182,7 @@ bool profile_grouped_gemm_fastgelu_impl(int do_verification, ...@@ -181,6 +182,7 @@ bool profile_grouped_gemm_fastgelu_impl(int do_verification,
if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
std::string gemm_name = gemm_ptr->GetTypeString(); std::string gemm_name = gemm_ptr->GetTypeString();
float ave_time = float ave_time =
...@@ -270,6 +272,12 @@ bool profile_grouped_gemm_fastgelu_impl(int do_verification, ...@@ -270,6 +272,12 @@ bool profile_grouped_gemm_fastgelu_impl(int do_verification,
std::cout << "Verification: " << (pass ? "SUCCESS" : "FAILURE") << std::endl; std::cout << "Verification: " << (pass ? "SUCCESS" : "FAILURE") << std::endl;
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl; << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
......
...@@ -174,6 +174,7 @@ bool profile_grouped_gemm_impl(int do_verification, ...@@ -174,6 +174,7 @@ bool profile_grouped_gemm_impl(int do_verification,
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
float best_kbatch = 0; float best_kbatch = 0;
int num_kernel = 0;
auto p_ds = std::vector<std::array<const void*, 0>>{}; auto p_ds = std::vector<std::array<const void*, 0>>{};
...@@ -258,6 +259,7 @@ bool profile_grouped_gemm_impl(int do_verification, ...@@ -258,6 +259,7 @@ bool profile_grouped_gemm_impl(int do_verification,
if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
num_kernel++;
for(std::size_t i = 0; i < gemm_descs.size(); i++) for(std::size_t i = 0; i < gemm_descs.size(); i++)
c_device_buf[i]->SetZero(); c_device_buf[i]->SetZero();
...@@ -347,6 +349,12 @@ bool profile_grouped_gemm_impl(int do_verification, ...@@ -347,6 +349,12 @@ bool profile_grouped_gemm_impl(int do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
if(time_kernel) if(time_kernel)
{ {
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
......
...@@ -94,6 +94,7 @@ bool profile_groupnorm_impl(int do_verification, ...@@ -94,6 +94,7 @@ bool profile_groupnorm_impl(int do_verification,
std::string best_instance_name; std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max(); float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
if(do_verification) if(do_verification)
{ {
...@@ -110,8 +111,6 @@ bool profile_groupnorm_impl(int do_verification, ...@@ -110,8 +111,6 @@ bool profile_groupnorm_impl(int do_verification,
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
} }
int num_kernel = 0;
for(auto& inst_ptr : instance_ptrs) for(auto& inst_ptr : instance_ptrs)
{ {
auto argument_ptr = inst_ptr->MakeArgumentPointer( auto argument_ptr = inst_ptr->MakeArgumentPointer(
...@@ -192,6 +191,12 @@ bool profile_groupnorm_impl(int do_verification, ...@@ -192,6 +191,12 @@ bool profile_groupnorm_impl(int do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
if(time_kernel) if(time_kernel)
{ {
LogRange(std::cout << "length = ", length, ",") << std::endl; LogRange(std::cout << "length = ", length, ",") << std::endl;
...@@ -199,12 +204,6 @@ bool profile_groupnorm_impl(int do_verification, ...@@ -199,12 +204,6 @@ bool profile_groupnorm_impl(int do_verification,
<< best_instance_name << std::endl; << best_instance_name << std::endl;
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
return true; return true;
} }
......
...@@ -124,10 +124,10 @@ bool profile_image_to_column_impl(int do_verification, ...@@ -124,10 +124,10 @@ bool profile_image_to_column_impl(int do_verification,
std::string best_op_name; std::string best_op_name;
float best_avg_time = std::numeric_limits<float>::max(); float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances // profile device op instances
bool pass = true; bool pass = true;
bool is_supporting_instance = false;
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
{ {
...@@ -148,7 +148,7 @@ bool profile_image_to_column_impl(int do_verification, ...@@ -148,7 +148,7 @@ bool profile_image_to_column_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
is_supporting_instance = true; num_kernel++;
// re-init output to zero before profiling next kernel // re-init output to zero before profiling next kernel
out_device_buf.SetZero(); out_device_buf.SetZero();
std::string op_name = op_ptr->GetTypeString(); std::string op_name = op_ptr->GetTypeString();
...@@ -189,11 +189,17 @@ bool profile_image_to_column_impl(int do_verification, ...@@ -189,11 +189,17 @@ bool profile_image_to_column_impl(int do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best configuration parameters:" std::cout << "Best configuration parameters:"
<< "\nname: " << best_op_name << "\navg_time: " << best_avg_time << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
<< "\nGB/s: " << best_gb_per_sec << std::endl; << "\nGB/s: " << best_gb_per_sec << std::endl;
return is_supporting_instance && pass; return pass;
} }
} // namespace profiler } // namespace profiler
......
...@@ -102,6 +102,7 @@ bool profile_layernorm_impl(int do_verification, ...@@ -102,6 +102,7 @@ bool profile_layernorm_impl(int do_verification,
std::string best_instance_name; std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max(); float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
if(do_verification) if(do_verification)
{ {
...@@ -121,8 +122,6 @@ bool profile_layernorm_impl(int do_verification, ...@@ -121,8 +122,6 @@ bool profile_layernorm_impl(int do_verification,
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
} }
int num_kernel = 0;
for(auto& inst_ptr : instance_ptrs) for(auto& inst_ptr : instance_ptrs)
{ {
auto argument_ptr = inst_ptr->MakeArgumentPointer(length, auto argument_ptr = inst_ptr->MakeArgumentPointer(length,
...@@ -209,6 +208,12 @@ bool profile_layernorm_impl(int do_verification, ...@@ -209,6 +208,12 @@ bool profile_layernorm_impl(int do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
if(time_kernel) if(time_kernel)
{ {
LogRange(std::cout << "length = ", length, ",") << ", "; LogRange(std::cout << "length = ", length, ",") << ", ";
...@@ -218,12 +223,6 @@ bool profile_layernorm_impl(int do_verification, ...@@ -218,12 +223,6 @@ bool profile_layernorm_impl(int do_verification,
<< best_instance_name << std::endl; << best_instance_name << std::endl;
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
return true; return true;
} }
......
...@@ -158,6 +158,7 @@ bool profile_max_pool3d_bwd_impl(int do_verification, ...@@ -158,6 +158,7 @@ bool profile_max_pool3d_bwd_impl(int do_verification,
std::string best_instance_name; std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max(); float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
if(do_verification) if(do_verification)
{ {
...@@ -175,8 +176,6 @@ bool profile_max_pool3d_bwd_impl(int do_verification, ...@@ -175,8 +176,6 @@ bool profile_max_pool3d_bwd_impl(int do_verification,
ref_invoker.Run(ref_pooling_bwd_argument); ref_invoker.Run(ref_pooling_bwd_argument);
} }
int num_kernel = 0;
for(auto& inst_ptr : instance_ptrs) for(auto& inst_ptr : instance_ptrs)
{ {
auto argument_ptr = inst_ptr->MakeArgumentPointer( auto argument_ptr = inst_ptr->MakeArgumentPointer(
...@@ -268,6 +267,12 @@ bool profile_max_pool3d_bwd_impl(int do_verification, ...@@ -268,6 +267,12 @@ bool profile_max_pool3d_bwd_impl(int do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
if(time_kernel) if(time_kernel)
{ {
LogRange(std::cout << "length = ", out_length, ",") << std::endl; LogRange(std::cout << "length = ", out_length, ",") << std::endl;
...@@ -275,12 +280,6 @@ bool profile_max_pool3d_bwd_impl(int do_verification, ...@@ -275,12 +280,6 @@ bool profile_max_pool3d_bwd_impl(int do_verification,
<< best_instance_name << std::endl; << best_instance_name << std::endl;
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
return true; return true;
} }
......
...@@ -124,6 +124,7 @@ bool profile_pool3d_fwd_impl(int do_verification, ...@@ -124,6 +124,7 @@ bool profile_pool3d_fwd_impl(int do_verification,
std::string best_instance_name; std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max(); float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
int num_kernel = 0;
if(do_verification) if(do_verification)
{ {
...@@ -150,8 +151,6 @@ bool profile_pool3d_fwd_impl(int do_verification, ...@@ -150,8 +151,6 @@ bool profile_pool3d_fwd_impl(int do_verification,
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
} }
int num_kernel = 0;
for(auto& inst_ptr : instance_ptrs) for(auto& inst_ptr : instance_ptrs)
{ {
auto argument_ptr = inst_ptr->MakeArgumentPointer( auto argument_ptr = inst_ptr->MakeArgumentPointer(
...@@ -260,6 +259,12 @@ bool profile_pool3d_fwd_impl(int do_verification, ...@@ -260,6 +259,12 @@ bool profile_pool3d_fwd_impl(int do_verification,
} }
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
if(time_kernel) if(time_kernel)
{ {
LogRange(std::cout << "length = ", in_length, ",") << std::endl; LogRange(std::cout << "length = ", in_length, ",") << std::endl;
...@@ -267,12 +272,6 @@ bool profile_pool3d_fwd_impl(int do_verification, ...@@ -267,12 +272,6 @@ bool profile_pool3d_fwd_impl(int do_verification,
<< best_instance_name << std::endl; << best_instance_name << std::endl;
} }
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
return true; return true;
} }
......
...@@ -195,8 +195,8 @@ bool profile_reduce_impl_impl(bool do_verification, ...@@ -195,8 +195,8 @@ bool profile_reduce_impl_impl(bool do_verification,
constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3 || constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3 ||
invalid_reduce_4 || invalid_reduce_5 || invalid_reduce_6); invalid_reduce_4 || invalid_reduce_5 || invalid_reduce_6);
int num_kernel = 0;
bool pass = true; bool pass = true;
int num_kernel = 0;
if constexpr(!invalid_reduce) if constexpr(!invalid_reduce)
{ {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment