Commit 9db34134 authored by Bartlomiej Kocot

Fail when no kernel is applicable

parent 8f84a012
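
Every file touched below gets the same guard: count the device op instances that accept the argument, and fail instead of silently reporting "best" numbers when none do. A minimal, self-contained sketch of that guard is shown here; ToyInstance and profile_toy are illustrative stand-ins, not the actual composable_kernel device-op API.

#include <iostream>
#include <memory>
#include <string>
#include <vector>

// Toy stand-in for a device op instance; the real CK instances expose
// IsSupportedArgument()/GetTypeString() with richer signatures.
struct ToyInstance
{
    bool supported;
    const char* name;
    bool IsSupportedArgument() const { return supported; }
};

// Returns false when no instance can handle the problem, mirroring the
// "Error: No kernel is applicable" guard added throughout this commit.
bool profile_toy(const std::vector<ToyInstance>& op_ptrs)
{
    int num_kernel = 0;
    float best_ave_time = 0;
    std::string best_op_name;

    for(const auto& op : op_ptrs)
    {
        if(op.IsSupportedArgument())
        {
            ++num_kernel;
            // ... run, verify and time the kernel here; keep the best result ...
            best_op_name = op.name;
        }
    }

    if(num_kernel == 0)
    {
        std::cout << "Error: No kernel is applicable" << std::endl;
        return false;
    }

    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_op_name << std::endl;
    return true;
}

int main()
{
    std::cout << profile_toy({}) << std::endl;                        // prints the error, returns 0
    std::cout << profile_toy({{true, "toy_kernel_v1"}}) << std::endl; // reports a "best" kernel
}

In the real profiler implementations the counter is incremented right after IsSupportedArgument(argument_ptr.get()) succeeds, and the zero check sits just before the "Best Perf" printout, as the hunks below show.
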
......@@ -249,9 +249,9 @@ bool profile_gemm_add_relu_add_layernorm_impl(int do_verification,
std::string best_op_name;
float best_ave_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
int num_kernel = 0;
bool pass = true;
int num_kernel = 0;
// profile device operation instances
for(auto& op_ptr : op_ptrs)
......@@ -283,7 +283,6 @@ bool profile_gemm_add_relu_add_layernorm_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
DeviceMem workspace_dev(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
......@@ -330,15 +329,14 @@ bool profile_gemm_add_relu_add_layernorm_impl(int do_verification,
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
pass = false;
return false;
}
else
{
if(time_kernel)
{
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_op_name << std::endl;
}
return pass;
}
......
......@@ -280,6 +280,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device GEMM instances
for(auto& gemm_ptr : gemm_ptrs)
......@@ -306,6 +307,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
{
num_kernel++;
// init DO, D1 to 0
reduce0_device_buf.SetZero();
reduce1_device_buf.SetZero();
......@@ -376,6 +378,12 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
}
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
}
......
......@@ -157,6 +157,7 @@ bool profile_gemm_bilinear_impl(int do_verification,
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
bool pass = true;
......@@ -185,6 +186,7 @@ bool profile_gemm_bilinear_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
num_kernel++;
// re-init E to zero before profiling a kernel
e_device_buf.SetZero();
......@@ -224,6 +226,12 @@ bool profile_gemm_bilinear_impl(int do_verification,
}
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
......
......@@ -146,6 +146,7 @@ bool profile_gemm_fastgelu_impl(int do_verification,
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
bool pass = true;
......@@ -173,6 +174,7 @@ bool profile_gemm_fastgelu_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
num_kernel++;
// re-init E to zero before profiling a kernel
e_device_buf.SetZero();
......@@ -212,6 +214,12 @@ bool profile_gemm_fastgelu_impl(int do_verification,
}
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
......
......@@ -134,6 +134,7 @@ int profile_gemm_impl(int do_verification,
float best_avg_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances
for(auto& op_ptr : op_ptrs)
......@@ -156,6 +157,7 @@ int profile_gemm_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
num_kernel++;
// re-init C to zero before profiling next kernel
c_device_buf.SetZero();
......@@ -242,6 +244,12 @@ int profile_gemm_impl(int do_verification,
std::cout << " BLayout = ColumnMajor";
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA
<< " StrideB = " << StrideB << " StrideC = " << StrideC << " : " << best_avg_time
<< " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "
......
......@@ -164,6 +164,7 @@ bool profile_gemm_multiply_add_impl(int do_verification,
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
bool pass = true;
......@@ -193,6 +194,7 @@ bool profile_gemm_multiply_add_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
num_kernel++;
// re-init E to zero before profiling a kernel
e_device_buf.SetZero();
......@@ -232,6 +234,12 @@ bool profile_gemm_multiply_add_impl(int do_verification,
}
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
......
......@@ -249,6 +249,7 @@ bool profile_gemm_reduce_impl(int do_verification,
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device GEMM instances
for(auto& gemm_ptr : gemm_ptrs)
......@@ -275,6 +276,7 @@ bool profile_gemm_reduce_impl(int do_verification,
if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
{
num_kernel++;
// init DO, D1 to 0
reduce0_device_buf.SetZero();
reduce1_device_buf.SetZero();
......@@ -343,6 +345,12 @@ bool profile_gemm_reduce_impl(int do_verification,
}
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
......
......@@ -136,6 +136,7 @@ bool profile_gemm_splitk_impl(int do_verification,
float best_tflops = 0;
float best_gb_per_sec = 0;
float best_kbatch = 0;
int num_kernel = 0;
// profile device GEMM instances
for(auto& op_ptr : op_ptrs)
......@@ -171,7 +172,7 @@ bool profile_gemm_splitk_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
num_kernel++;
// re-init C to zero before profiling next kernel
c_device_buf.SetZero();
......@@ -281,6 +282,12 @@ bool profile_gemm_splitk_impl(int do_verification,
std::cout << " BLayout = ColumnMajor";
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA
<< " StrideB = " << StrideB << " StrideC = " << StrideC << " KBatch = " << best_kbatch
<< " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -137,6 +137,7 @@ bool profile_gemm_streamk_impl(int do_verification,
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device GEMM instances
for(auto& op_ptr : op_ptrs)
......@@ -167,6 +168,7 @@ bool profile_gemm_streamk_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
num_kernel++;
// re-init C to zero before profiling next kernel
c_device_buf.SetZero();
......@@ -255,6 +257,12 @@ bool profile_gemm_streamk_impl(int do_verification,
std::cout << " BLayout = ColumnMajor";
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA
<< " StrideB = " << StrideB << " StrideC = " << StrideC << " : " << best_ave_time
<< " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "
......
......@@ -120,6 +120,7 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
float best_avg_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances
bool pass = true;
......@@ -127,6 +128,7 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
auto run_impl = [&](auto& op_ptr, auto& argument_ptr) {
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
num_kernel++;
// re-init output to zero before profiling next kernel
in_device_buf.SetZero();
......@@ -246,6 +248,12 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
run_impl(op_ptr, argument_ptr);
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best configuration parameters:"
<< "\nname: " << best_op_name << "\navg_time: " << best_avg_time
<< "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
......
......@@ -132,6 +132,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
float best_avg_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device Conv instances
bool all_pass = true;
......@@ -183,6 +184,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
num_kernel++;
// using atomic add, so need to reset input
wei_device_buf.SetZero();
......@@ -246,6 +248,12 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
}
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best configuration parameters:"
<< "\nname: " << best_op_name << "\navg_time: " << best_avg_time
<< "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
......
......@@ -140,6 +140,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
float best_avg_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances
bool pass = true;
......@@ -147,6 +148,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
auto run_impl = [&](auto& op_ptr, auto& argument_ptr) {
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
num_kernel++;
// re-init output to zero before profiling next kernel
out_device_buf.SetZero();
......@@ -242,6 +244,12 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
run_impl(op_ptr, argument_ptr);
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best configuration parameters:"
<< "\nname: " << best_op_name << "\navg_time: " << best_avg_time
<< "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
......
......@@ -166,6 +166,7 @@ bool profile_grouped_gemm_fastgelu_impl(int do_verification,
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
int num_kernel = 0;
auto p_ds = std::vector<std::array<const void*, 0>>{};
......@@ -181,6 +182,7 @@ bool profile_grouped_gemm_fastgelu_impl(int do_verification,
if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
{
num_kernel++;
std::string gemm_name = gemm_ptr->GetTypeString();
float ave_time =
......@@ -270,6 +272,12 @@ bool profile_grouped_gemm_fastgelu_impl(int do_verification,
std::cout << "Verification: " << (pass ? "SUCCESS" : "FAILURE") << std::endl;
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
......
......@@ -174,6 +174,7 @@ bool profile_grouped_gemm_impl(int do_verification,
float best_tflops = 0;
float best_gb_per_sec = 0;
float best_kbatch = 0;
int num_kernel = 0;
auto p_ds = std::vector<std::array<const void*, 0>>{};
......@@ -258,6 +259,7 @@ bool profile_grouped_gemm_impl(int do_verification,
if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
{
num_kernel++;
for(std::size_t i = 0; i < gemm_descs.size(); i++)
c_device_buf[i]->SetZero();
......@@ -347,6 +349,12 @@ bool profile_grouped_gemm_impl(int do_verification,
}
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
if(time_kernel)
{
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
......
......@@ -94,6 +94,7 @@ bool profile_groupnorm_impl(int do_verification,
std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
int num_kernel = 0;
if(do_verification)
{
......@@ -110,8 +111,6 @@ bool profile_groupnorm_impl(int do_verification,
ref_invoker.Run(ref_argument);
}
int num_kernel = 0;
for(auto& inst_ptr : instance_ptrs)
{
auto argument_ptr = inst_ptr->MakeArgumentPointer(
......@@ -192,6 +191,12 @@ bool profile_groupnorm_impl(int do_verification,
}
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
if(time_kernel)
{
LogRange(std::cout << "length = ", length, ",") << std::endl;
......@@ -199,12 +204,6 @@ bool profile_groupnorm_impl(int do_verification,
<< best_instance_name << std::endl;
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
return true;
}
......
......@@ -124,10 +124,10 @@ bool profile_image_to_column_impl(int do_verification,
std::string best_op_name;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
int num_kernel = 0;
// profile device op instances
bool pass = true;
bool is_supporting_instance = false;
for(auto& op_ptr : op_ptrs)
{
......@@ -148,7 +148,7 @@ bool profile_image_to_column_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
is_supporting_instance = true;
num_kernel++;
// re-init output to zero before profiling next kernel
out_device_buf.SetZero();
std::string op_name = op_ptr->GetTypeString();
......@@ -189,11 +189,17 @@ bool profile_image_to_column_impl(int do_verification,
}
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
std::cout << "Best configuration parameters:"
<< "\nname: " << best_op_name << "\navg_time: " << best_avg_time
<< "\nGB/s: " << best_gb_per_sec << std::endl;
return is_supporting_instance && pass;
return pass;
}
} // namespace profiler
......
......@@ -102,6 +102,7 @@ bool profile_layernorm_impl(int do_verification,
std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
int num_kernel = 0;
if(do_verification)
{
......@@ -121,8 +122,6 @@ bool profile_layernorm_impl(int do_verification,
ref_invoker.Run(ref_argument);
}
int num_kernel = 0;
for(auto& inst_ptr : instance_ptrs)
{
auto argument_ptr = inst_ptr->MakeArgumentPointer(length,
......@@ -209,6 +208,12 @@ bool profile_layernorm_impl(int do_verification,
}
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
if(time_kernel)
{
LogRange(std::cout << "length = ", length, ",") << ", ";
......@@ -218,12 +223,6 @@ bool profile_layernorm_impl(int do_verification,
<< best_instance_name << std::endl;
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
return true;
}
......
......@@ -158,6 +158,7 @@ bool profile_max_pool3d_bwd_impl(int do_verification,
std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
int num_kernel = 0;
if(do_verification)
{
......@@ -175,8 +176,6 @@ bool profile_max_pool3d_bwd_impl(int do_verification,
ref_invoker.Run(ref_pooling_bwd_argument);
}
int num_kernel = 0;
for(auto& inst_ptr : instance_ptrs)
{
auto argument_ptr = inst_ptr->MakeArgumentPointer(
......@@ -268,6 +267,12 @@ bool profile_max_pool3d_bwd_impl(int do_verification,
}
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
if(time_kernel)
{
LogRange(std::cout << "length = ", out_length, ",") << std::endl;
......@@ -275,12 +280,6 @@ bool profile_max_pool3d_bwd_impl(int do_verification,
<< best_instance_name << std::endl;
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
return true;
}
......
......@@ -124,6 +124,7 @@ bool profile_pool3d_fwd_impl(int do_verification,
std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
int num_kernel = 0;
if(do_verification)
{
......@@ -150,8 +151,6 @@ bool profile_pool3d_fwd_impl(int do_verification,
ref_invoker.Run(ref_argument);
}
int num_kernel = 0;
for(auto& inst_ptr : instance_ptrs)
{
auto argument_ptr = inst_ptr->MakeArgumentPointer(
......@@ -260,6 +259,12 @@ bool profile_pool3d_fwd_impl(int do_verification,
}
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
if(time_kernel)
{
LogRange(std::cout << "length = ", in_length, ",") << std::endl;
......@@ -267,12 +272,6 @@ bool profile_pool3d_fwd_impl(int do_verification,
<< best_instance_name << std::endl;
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
return true;
}
......
......@@ -195,8 +195,8 @@ bool profile_reduce_impl_impl(bool do_verification,
constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3 ||
invalid_reduce_4 || invalid_reduce_5 || invalid_reduce_6);
int num_kernel = 0;
bool pass = true;
int num_kernel = 0;
if constexpr(!invalid_reduce)
{
......