add maxpool host for validation

a69937d3 · Jing Zhang · ec381569 · a69937d3 · a69937d3
Commit a69937d3 authored Oct 14, 2021 by Jing Zhang
2 changed files
--- a/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2_add.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2_add.hpp
@@ -976,7 +976,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
        }
 #endif

-#if 0
+#if 1
        // Resize_Add
        if constexpr(add_type == 0)
        {
@@ -1137,11 +1137,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
                                make_tuple(ki, 0, hi * 2 + 1, wi * 2 + 1));

                        d_thread_buf(Number<d_offset>{}) = c_thread_buf[Number<c_offset_0>{}];
-                        d_thread_buf(Number<d_offset>{}) = max(c_thread_buf[Number<c_offset_1>{}],
+                        d_thread_buf(Number<d_offset>{}) = fmaxf(c_thread_buf[Number<c_offset_1>{}],
                                                                 d_thread_buf(Number<d_offset>{}));
-                        d_thread_buf(Number<d_offset>{}) = max(c_thread_buf[Number<c_offset_2>{}],
+                        d_thread_buf(Number<d_offset>{}) = fmaxf(c_thread_buf[Number<c_offset_2>{}],
                                                                 d_thread_buf(Number<d_offset>{}));
-                        d_thread_buf(Number<d_offset>{}) = max(c_thread_buf[Number<c_offset_3>{}],
+                        d_thread_buf(Number<d_offset>{}) = fmax(c_thread_buf[Number<c_offset_3>{}],
                                                                d_thread_buf(Number<d_offset>{}));
                    });
                });

--- a/host/host_tensor/include/host_conv.hpp
+++ b/host/host_tensor/include/host_conv.hpp
@@ -284,6 +284,25 @@ void host_direct_convolution_maxpool_nchwc(const Tensor<TIn>& in,
                               out_host.mDesc.GetLengths()[2],
                               out_host.mDesc.GetLengths()[3],
                               out_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency());
+
+    auto maxpool_nchw = [&](auto n, auto k0, auto ho, auto wo, auto k1) { // 2x2 max over out_host -> one max_host element
+        auto hx = ho * 2; // first of the two out_host indices read along dim 2
+        auto wx = wo * 2; // first of the two out_host indices read along dim 3
+        // Read the four out_host values at (hx..hx+1, wx..wx+1); n, k0 and k1 are passed through unchanged.
+        auto v0 = out_host(n, k0, hx, wx, k1);
+        auto v1 = out_host(n, k0, hx, wx + 1, k1);
+        auto v2 = out_host(n, k0, hx + 1, wx, k1);
+        auto v3 = out_host(n, k0, hx + 1, wx + 1, k1);
+        // NOTE(review): no bounds check here; assumes out_host dims 2 and 3 are at least 2x max_host's - confirm at caller.
+        max_host(n, k0, ho, wo, k1) = std::max({v0, v1, v2, v3}); // keep the largest of the four
+    };
+
+    make_ParallelTensorFunctor(maxpool_nchw, // presumably invokes maxpool_nchw once per index tuple - confirm in helper
+                               max_host.mDesc.GetLengths()[0], // the five extents below are taken from max_host
+                               max_host.mDesc.GetLengths()[1],
+                               max_host.mDesc.GetLengths()[2],
+                               max_host.mDesc.GetLengths()[3],
+                               max_host.mDesc.GetLengths()[4])(std::thread::hardware_concurrency()); // NOTE(review): hardware_concurrency() may return 0; handling not visible here
 }

 template <typename TIn, typename TWei, typename TOut, typename InLeftPads, typename InRightPads>