Run some cpu ops in parallel

246c4236 · Paul · 426e28ac · 246c4236 · 246c4236 · 246c4236
Commit 246c4236 authored Jan 18, 2019 by Paul
4 changed files
--- a/src/include/migraphx/functional.hpp
+++ b/src/include/migraphx/functional.hpp
@@ -94,6 +94,14 @@ constexpr void each_args(F)
 {
 }
+template<class F, class T>
+auto unpack(F f, T& x)
+{
+    return sequence_c<std::tuple_size<T>{}>([&](auto... is) {
+        f(std::get<is>(x)...);
+    });
+}
 /// Implements a fix-point combinator
 template <class R, class F>
 detail::fix_f<R, F> fix(F f)

--- a/src/include/migraphx/par_dfor.hpp
+++ b/src/include/migraphx/par_dfor.hpp
+#ifndef GUARD_RTGLIB_PAR_DFOR_HPP
+#define GUARD_RTGLIB_PAR_DFOR_HPP
+#include <migraphx/par_for.hpp>
+#include <migraphx/functional.hpp>
+#include <array>
+#include <numeric>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+template<class... Ts>
+auto par_dfor(Ts... xs)
+{
+    return [=](auto f) {
+        using array_type = std::array<std::size_t, sizeof...(Ts)>;
+        array_type lens  = {{static_cast<std::size_t>(xs)...}};
+        auto n = std::accumulate(lens.begin(), lens.end(), 1, std::multiplies<std::size_t>{});
+        const std::size_t min_grain = 8;
+        if (n > 2*min_grain) {
+            array_type strides;
+            strides.fill(1);
+            std::partial_sum(
+                lens.rbegin(), lens.rend() - 1, strides.rbegin() + 1, std::multiplies<std::size_t>());
+            auto size = std::accumulate(lens.begin(), lens.end(), 1, std::multiplies<std::size_t>());
+            par_for(size, min_grain, [&](std::size_t i) {
+                array_type indices;
+                std::transform(strides.begin(),
+                               strides.end(),
+                               lens.begin(),
+                               indices.begin(),
+                               [&](size_t stride, size_t len) { return (i / stride) % len; });
+                migraphx::unpack(f, indices);
+            });
+        } else {
+            dfor(xs...)(f);
+        }
+    };
+}
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/include/migraphx/par_for.hpp
+++ b/src/include/migraphx/par_for.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_PAR_FOR_HPP
+#define MIGRAPHX_GUARD_RTGLIB_PAR_FOR_HPP
+#include <thread>
+#include <cmath>
+#include <algorithm>
+#include <vector>
+#include <cassert>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+struct joinable_thread : std::thread
+{
+    template <class... Xs>
+    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...) // NOLINT
+    {
+    }
+    joinable_thread& operator=(joinable_thread&& other) = default;
+    joinable_thread(joinable_thread&& other)            = default;
+    ~joinable_thread()
+    {
+        if(this->joinable())
+            this->join();
+    }
+};
+template <class F>
+void par_for_impl(std::size_t n, std::size_t threadsize, F f)
+{
+    if(threadsize <= 1)
+    {
+        for(std::size_t i = 0; i < n; i++)
+            f(i);
+    }
+    else
+    {
+        std::vector<joinable_thread> threads(threadsize);
+        const std::size_t grainsize = std::ceil(static_cast<double>(n) / threads.size());
+        std::size_t work = 0;
+        std::generate(threads.begin(),
+                      threads.end(),
+                      [=, &work] {
+                            auto result = joinable_thread([=] {
+                            std::size_t start = work;
+                            std::size_t last  = std::min(n, work + grainsize);
+                            for(std::size_t i = start; i < last; i++)
+                            {
+                                f(i);
+                            }
+                        });
+                        work += grainsize;
+                        return result;
+                      });
+        assert(work >= n);
+    }
+}
+template <class F>
+void par_for(std::size_t n, std::size_t min_grain, F f)
+{
+    const auto threadsize =
+        std::min<std::size_t>(std::thread::hardware_concurrency(), n / min_grain);
+    par_for_impl(n, threadsize, f);
+}
+template <class F>
+void par_for(std::size_t n, F f)
+{
+    const int min_grain = 8;
+    par_for(n, min_grain, f);
+}
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/targets/cpu/lowering.cpp
+++ b/src/targets/cpu/lowering.cpp
@@ -5,6 +5,7 @@
 #include <migraphx/operators.hpp>
 #include <migraphx/shape_for_each.hpp>
 #include <migraphx/iterator_for.hpp>
+#include <migraphx/par_dfor.hpp>
 #include <migraphx/cpu/gemm.hpp>
 #include <unordered_map>
 #include <utility>
@@ -72,7 +73,7 @@ struct cpu_batch_norm_inference
            visit_all(output, input, mini_batch_mean, mini_batch_variance, arg_gamma, arg_bias)(
                [&](auto result, auto buffer, auto mean, auto variance, auto gamma, auto bias) {
-                    dfor(num_batch, num_channels, image_height, image_width)(
+                    par_dfor(num_batch, num_channels, image_height, image_width)(
                        [&](std::size_t n, std::size_t c, std::size_t h, std::size_t w) {
                            assert((variance(c) + epsilon) > 0);
                            result(n, c, h, w) = gamma(c) * (buffer(n, c, h, w) - mean(c)) /
@@ -87,7 +88,7 @@ struct cpu_batch_norm_inference
            visit_all(output, input, mini_batch_mean, mini_batch_mean, arg_gamma, arg_bias)(
                [&](auto result, auto buffer, auto mean, auto variance, auto gamma, auto bias) {
-                    dfor(num_batch, num_channels, image_height, image_width)(
+                    par_dfor(num_batch, num_channels, image_height, image_width)(
                        [&](std::size_t n, std::size_t c, std::size_t h, std::size_t w) {
                            assert((variance(c, h, w) + epsilon) > 0);
                            result(n, c, h, w) = gamma(c, h, w) *
@@ -122,7 +123,7 @@ struct cpu_convolution
            auto wei_h = wei[2];
            auto wei_w = wei[3];
-            dfor(output_shape.lens()[0],
+            par_dfor(output_shape.lens()[0],
                 output_shape.lens()[1],
                 output_shape.lens()[2],
                 output_shape.lens()[3])(
@@ -245,7 +246,7 @@ struct cpu_pooling
            auto in_h  = input.get_shape().lens()[2];
            auto in_w  = input.get_shape().lens()[3];
-            dfor(output_shape.lens()[0],
+            par_dfor(output_shape.lens()[0],
                 output_shape.lens()[1],
                 output_shape.lens()[2],
                 output_shape.lens()[3])(