Fail when no kernel is applicable

9db34134 · Bartlomiej Kocot · 8f84a012 · 9db34134 · 9db34134 · 9db34134
Commit 9db34134 authored Sep 11, 2023 by Bartlomiej Kocot
20 changed files
--- a/profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp
+++ b/profiler/include/profiler/profile_avg_pool3d_bwd_impl.hpp
@@ -126,6 +126,7 @@ bool profile_avg_pool3d_bwd_impl(int do_verification,
    std::string best_instance_name;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;
+    int num_kernel        = 0;
    if(do_verification)
    {
@@ -145,8 +146,6 @@ bool profile_avg_pool3d_bwd_impl(int do_verification,
        ref_invoker.Run(ref_pooling_bwd_argument);
    }
-    int num_kernel = 0;
    for(auto& inst_ptr : instance_ptrs)
    {
        auto argument_ptr = inst_ptr->MakeArgumentPointer(

--- a/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp
@@ -263,6 +263,7 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification,
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;
+    int num_kernel        = 0;
    // profile device op instances
    for(auto& op_ptr : op_ptrs)
@@ -301,6 +302,7 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification,
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            num_kernel++;
            std::string op_name = op_ptr->GetTypeString();
            float ave_time =
@@ -350,6 +352,12 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification,
        }
    }
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return false;
+    }
    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

--- a/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_bias_softmax_gemm_permute_impl.hpp
@@ -273,6 +273,7 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;
+    int num_kernel        = 0;
    // profile device op instances
    for(auto& op_ptr : op_ptrs)
@@ -310,6 +311,7 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            num_kernel++;
            std::string op_name = op_ptr->GetTypeString();
            float ave_time =
@@ -385,6 +387,12 @@ bool profile_batched_gemm_bias_softmax_gemm_permute_impl(bool do_verification,
        }
    }
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return false;
+    }
    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

--- a/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp
@@ -223,6 +223,7 @@ bool profile_batched_gemm_gemm_impl(bool do_verification,
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;
+    int num_kernel        = 0;
    // profile device op instances
    for(auto& op_ptr : op_ptrs)
@@ -255,6 +256,7 @@ bool profile_batched_gemm_gemm_impl(bool do_verification,
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            num_kernel++;
            std::string op_name = op_ptr->GetTypeString();
            float ave_time =
@@ -309,6 +311,12 @@ bool profile_batched_gemm_gemm_impl(bool do_verification,
        }
    }
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return false;
+    }
    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

--- a/profiler/include/profiler/profile_batched_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_impl.hpp
@@ -136,6 +136,7 @@ bool profile_batched_gemm_impl(int do_verification,
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;
+    int num_kernel        = 0;
    // profile device op instances
    for(auto& op_ptr : op_ptrs)
@@ -201,6 +202,7 @@ bool profile_batched_gemm_impl(int do_verification,
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            num_kernel++;
            // re-init C to zero before profiling next kernel
            c_device_buf.SetZero();
@@ -254,6 +256,12 @@ bool profile_batched_gemm_impl(int do_verification,
        }
    }
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return false;
+    }
    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

--- a/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp
@@ -254,6 +254,7 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;
+    int num_kernel        = 0;
    // profile device GEMM instances
    for(auto& gemm_ptr : gemm_ptrs)
@@ -281,6 +282,7 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
        if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            num_kernel++;
            // init DO, D1 to 0
            reduce0_device_buf.SetZero();
            reduce1_device_buf.SetZero();
@@ -352,6 +354,12 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
        }
    }
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return false;
+    }
    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
              << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;

--- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp
@@ -251,6 +251,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;
+    int num_kernel        = 0;
    // profile device op instances
    for(auto& op_ptr : op_ptrs)
@@ -283,6 +284,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            num_kernel++;
            std::string op_name = op_ptr->GetTypeString();
            float ave_time =
@@ -337,6 +339,12 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
        }
    }
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return false;
+    }
    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

--- a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
+++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp
@@ -251,6 +251,7 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;
+    int num_kernel        = 0;
    // profile device op instances
    for(auto& op_ptr : op_ptrs)
@@ -284,6 +285,7 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            num_kernel++;
            std::string op_name = op_ptr->GetTypeString();
            float ave_time =
@@ -357,6 +359,12 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
        }
    }
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return false;
+    }
    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

--- a/profiler/include/profiler/profile_batchnorm_backward_impl.hpp
+++ b/profiler/include/profiler/profile_batchnorm_backward_impl.hpp
@@ -214,6 +214,7 @@ bool profile_batchnorm_backward_impl(bool do_verification,
    std::string best_instance_name;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;
+    int num_kernel        = 0;
    if(do_verification)
    {
@@ -264,8 +265,7 @@ bool profile_batchnorm_backward_impl(bool do_verification,
        (void)invoker_ptr_ref->Run(argument_ptr_ref.get());
    }
-    int num_kernel = 0;
+    bool pass = true;
-    bool pass      = true;
    for(auto& inst_ptr : instance_ptrs)
    {
@@ -371,18 +371,18 @@ bool profile_batchnorm_backward_impl(bool do_verification,
        };
    }
-    if(time_kernel)
-    {
-        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
-                  << best_instance_name << std::endl;
-    }
    if(num_kernel == 0)
    {
        std::cout << "Error: No kernel is applicable" << std::endl;
        return false;
    }
+    if(time_kernel)
+    {
+        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_instance_name << std::endl;
+    }
    return pass;
 }

--- a/profiler/include/profiler/profile_batchnorm_forward_impl.hpp
+++ b/profiler/include/profiler/profile_batchnorm_forward_impl.hpp
@@ -209,6 +209,7 @@ bool profile_batchnorm_forward_impl(int do_verification,
    std::string best_instance_name;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;
+    int num_kernel        = 0;
    if(do_verification)
    {
@@ -258,8 +259,7 @@ bool profile_batchnorm_forward_impl(int do_verification,
        (void)invoker_ptr_ref->Run(argument_ptr_ref.get());
    }
-    int num_kernel = 0;
+    bool pass = true;
-    bool pass      = true;
    for(auto& inst_ptr : instance_ptrs)
    {
@@ -393,18 +393,18 @@ bool profile_batchnorm_forward_impl(int do_verification,
        };
    }
-    if(time_kernel)
-    {
-        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
-                  << best_instance_name << std::endl;
-    }
    if(num_kernel == 0)
    {
        std::cout << "Error: No kernel is applicable" << std::endl;
        return false;
    }
+    if(time_kernel)
+    {
+        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_instance_name << std::endl;
+    }
    return pass;
 }

--- a/profiler/include/profiler/profile_batchnorm_infer_impl.hpp
+++ b/profiler/include/profiler/profile_batchnorm_infer_impl.hpp
@@ -183,6 +183,7 @@ bool profile_batchnorm_infer_impl(int do_verification,
    std::string best_instance_name;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;
+    int num_kernel        = 0;
    if(do_verification)
    {
@@ -230,8 +231,7 @@ bool profile_batchnorm_infer_impl(int do_verification,
        (void)invoker_ptr_ref->Run(argument_ptr_ref.get());
    }
-    int num_kernel = 0;
+    bool pass = true;
-    bool pass      = true;
    for(auto& inst_ptr : instance_ptrs)
    {
@@ -316,18 +316,18 @@ bool profile_batchnorm_infer_impl(int do_verification,
        };
    }
-    if(time_kernel)
-    {
-        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
-                  << best_instance_name << std::endl;
-    }
    if(num_kernel == 0)
    {
        std::cout << "Error: No kernel is applicable" << std::endl;
        return false;
    }
+    if(time_kernel)
+    {
+        std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
+                  << best_instance_name << std::endl;
+    }
    return pass;
 }

--- a/profiler/include/profiler/profile_contraction_impl.hpp
+++ b/profiler/include/profiler/profile_contraction_impl.hpp
@@ -183,6 +183,7 @@ int profile_contraction_impl(ck::index_t do_verification,
    float best_avg_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;
+    int num_kernel        = 0;
    // profile device op instances
    for(auto& op_ptr : op_ptrs)
@@ -239,6 +240,7 @@ int profile_contraction_impl(ck::index_t do_verification,
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            num_kernel++;
            // re-init C to zero before profiling next kernel
            e_device_buf.SetZero();
@@ -333,6 +335,12 @@ int profile_contraction_impl(ck::index_t do_verification,
        std::cout << " CDELayout =  ColumnMajor";
    }
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return false;
+    }
    std::cout << " M = " << M << " N = " << N << " K = " << K << " StridesA = " << StridesA
              << " StridesB = " << StridesB << " StridesE = " << StridesE << " : " << best_avg_time
              << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "

--- a/profiler/include/profiler/profile_conv_bwd_data_impl.hpp
+++ b/profiler/include/profiler/profile_conv_bwd_data_impl.hpp
@@ -151,6 +151,7 @@ bool profile_conv_bwd_data_impl(int do_verification,
    float best_avg_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;
+    int num_kernel        = 0;
    // profile device Conv instances
    bool pass = true;
@@ -177,6 +178,7 @@ bool profile_conv_bwd_data_impl(int do_verification,
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            num_kernel++;
            // for conv bwd data, some input tensor element are zero, but not written by kernel,
            // need to set zero
            in_device_buf.SetZero();
@@ -237,6 +239,12 @@ bool profile_conv_bwd_data_impl(int do_verification,
        }
    }
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return false;
+    }
    std::cout << "Best configuration parameters:"
              << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
              << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;

--- a/profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp
+++ b/profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp
@@ -192,6 +192,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;
+    int num_kernel        = 0;
    // profile device Conv instances
    for(auto& op_ptr : op_ptrs)
@@ -220,6 +221,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            num_kernel++;
            std::string conv_name = op_ptr->GetTypeString();
            float ave_time =
@@ -270,6 +272,12 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
        }
    }
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return; // enclosing function is declared void, so a valued return is ill-formed
+    }
    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
              << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
 }

--- a/profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp
+++ b/profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp
@@ -182,6 +182,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;
+    int num_kernel        = 0;
    // profile device Conv instances
    for(auto& op_ptr : op_ptrs)
@@ -209,6 +210,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            num_kernel++;
            std::string conv_name = op_ptr->GetTypeString();
            float ave_time =
@@ -258,6 +260,12 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
        }
    }
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return; // enclosing function is declared void, so a valued return is ill-formed
+    }
    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
              << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
 }

--- a/profiler/include/profiler/profile_conv_fwd_impl.hpp
+++ b/profiler/include/profiler/profile_conv_fwd_impl.hpp
@@ -133,6 +133,7 @@ bool profile_conv_fwd_impl(int do_verification,
    float best_avg_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;
+    int num_kernel        = 0;
    // profile device op instances
    bool pass = true;
@@ -159,6 +160,7 @@ bool profile_conv_fwd_impl(int do_verification,
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            num_kernel++;
            // re-init output to zero before profiling next kernel
            out_device_buf.SetZero();
@@ -210,6 +212,12 @@ bool profile_conv_fwd_impl(int do_verification,
        }
    }
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return false;
+    }
    std::cout << "Best configuration parameters:"
              << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
              << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;

--- a/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp
+++ b/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp
@@ -137,6 +137,7 @@ bool profile_elementwise_layernorm_impl(int do_verification,
    std::string best_instance_name;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;
+    int num_kernel        = 0;
    if(do_verification)
    {
@@ -163,8 +164,6 @@ bool profile_elementwise_layernorm_impl(int do_verification,
        ref_invoker.Run(ref_argument);
    }
-    int num_kernel = 0;
    for(auto& inst_ptr : instance_ptrs)
    {
        auto argument_ptr = inst_ptr->MakeArgumentPointer(
@@ -246,6 +245,12 @@ bool profile_elementwise_layernorm_impl(int do_verification,
        }
    }
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return false;
+    }
    if(time_kernel)
    {
        LogRange(std::cout << "length = ", length, ",") << ", ";
@@ -253,12 +258,6 @@ bool profile_elementwise_layernorm_impl(int do_verification,
                  << best_gb_per_sec << " GB/s, " << best_instance_name << std::endl;
    }
-    if(num_kernel == 0)
-    {
-        std::cout << "Error: No kernel is tested" << std::endl;
-        return false;
-    }
    return true;
 }

--- a/profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp
@@ -164,6 +164,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;
+    int num_kernel        = 0;
    bool pass = true;
@@ -193,6 +194,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            num_kernel++;
            // re-init E to zero before profiling a kernel
            e_device_buf.SetZero();
@@ -232,6 +234,12 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
        }
    }
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return false;
+    }
    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

--- a/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp
@@ -155,6 +155,7 @@ bool profile_gemm_add_fastgelu_impl(int do_verification,
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;
+    int num_kernel        = 0;
    bool pass = true;
@@ -183,6 +184,7 @@ bool profile_gemm_add_fastgelu_impl(int do_verification,
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            num_kernel++;
            // re-init E to zero before profiling a kernel
            e_device_buf.SetZero();
@@ -222,6 +224,12 @@ bool profile_gemm_add_fastgelu_impl(int do_verification,
        }
    }
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return false;
+    }
    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

--- a/profiler/include/profiler/profile_gemm_add_multiply_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_add_multiply_impl.hpp
@@ -164,6 +164,7 @@ bool profile_gemm_add_multiply_impl(int do_verification,
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;
+    int num_kernel        = 0;
    bool pass = true;
@@ -193,6 +194,7 @@ bool profile_gemm_add_multiply_impl(int do_verification,
        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            num_kernel++;
            // re-init E to zero before profiling a kernel
            e_device_buf.SetZero();
@@ -232,6 +234,12 @@ bool profile_gemm_add_multiply_impl(int do_verification,
        }
    }
+    if(num_kernel == 0)
+    {
+        std::cout << "Error: No kernel is applicable" << std::endl;
+        return false;
+    }
    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;