Merge

07857fb4 · Paul · 1689d2d8 · 62e8ec20 · 07857fb4 · 07857fb4
Commit 07857fb4 authored Jul 01, 2022 by Paul
3 changed files
--- a/src/targets/gpu/kernels/include/migraphx/kernels/functional.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/functional.hpp
@@ -118,6 +118,12 @@ constexpr auto sequence_c_impl(F&& f, seq<Ns...>)
    return f(index_constant<Ns>{}...);
 }
+// Invokes f once for every entry of the pack Ns, left to right, passing each
+// entry as an integral_constant<index_int, ...> value. Whatever f returns is
+// discarded.
+template <class F, index_int... Ns>
+constexpr void repeat_c_impl(F f, seq<Ns...>)
+{
+    (static_cast<void>(f(integral_constant<index_int, Ns>{})), ...);
+}
 template <index_int... N>
 constexpr auto args_at(seq<N...>)
 {
@@ -144,6 +150,18 @@ constexpr auto sequence(IntegerConstant ic, F&& f)
    return sequence_c<ic>(f);
 }
+// Calls f once per element of detail::gens<N> (presumably the indices
+// 0 .. N-1 -- confirm against gens), passing each index as an
+// integral_constant so it can be used in constant expressions inside f.
+//
+// N is an index_int, the same type carried by seq<index_int...> and used by
+// repeat_c_impl, so the count needs no conversion and this header does not
+// depend on std::size_t being declared.
+template <index_int N, class F>
+constexpr void repeat_c(F f)
+{
+    detail::repeat_c_impl(f, detail::gens<N>{});
+}
+// Variant of repeat_c that takes the repetition count as a value `ic` instead
+// of an explicit template argument. `ic` is used as a template argument, so it
+// must be usable in a constant expression (e.g. an integral-constant object).
+// NOTE(review): f is accepted as a forwarding reference but passed on as an
+// lvalue, so repeat_c (which takes F by value) works on a copy of f.
+template <class IntegerConstant, class F>
+constexpr auto repeat(IntegerConstant ic, F&& f)
+{
+    return repeat_c<ic>(f);
+}
 template <class F, class G>
 constexpr auto by(F f, G g)
 {

--- a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
@@ -27,6 +27,8 @@
 #include <migraphx/kernels/hip.hpp>
 #include <migraphx/kernels/types.hpp>
 #include <migraphx/kernels/integral_constant.hpp>
+#include <migraphx/kernels/functional.hpp>
+#include <migraphx/kernels/type_traits.hpp>
 namespace migraphx {
@@ -64,29 +66,61 @@ struct index
    {
        return _c<1> + n / nlocal();
    }
+    // Number of steps of size `stride` needed to cover `n` elements, computed
+    // as (n - 1) / stride + 1, i.e. a ceiling division for n >= 1.
+    // NOTE(review): the subtraction happens first, so with unsigned operands
+    // n == 0 would wrap instead of yielding 0 -- confirm callers never pass 0.
+    template <class N, class Stride>
+    static constexpr auto max_stride_iterations(N n, Stride stride)
+    {
+        return (n - _c<1>) / stride + _c<1>;
+    }
-    template <class F>
+    // Calls f(i) for i = start, start + stride, start + 2 * stride, ... while
+    // i < n.
+    //
+    // When neither N nor Stride satisfies is_integral (presumably because both
+    // are compile-time integral-constant types -- confirm against
+    // type_traits.hpp), the iteration count is evaluated at compile time and
+    // the calls are expanded through repeat(); otherwise a plain runtime loop
+    // is used.
+    template <class F, class N, class Stride>
-    __device__ void global_stride(index_int n, F f) const
+    static constexpr void for_stride(index_int start, N n, Stride stride, F f)
+    {
+        if constexpr(not is_integral<N>{} and not is_integral<Stride>{})
+        {
+            // At most one step can land below n: no loop or expansion needed.
+            if constexpr(max_stride_iterations(n, stride) == 1)
+            {
+                if constexpr(stride > n)
                {
-        const auto stride = nglobal();
+                    if(start < n)
-        for(index_int i = global; i < n; i += stride)
+                        f(start);
+                }
+                else
+                {
+                    // No bounds check on this path.
+                    // NOTE(review): relies on start < n here, presumably
+                    // because start < stride -- confirm with callers.
+                    f(start);
+                }
+            }
+            else
            {
+                // One expanded call per possible step k; steps that would
+                // reach or pass n are skipped by the check below.
+                repeat(max_stride_iterations(n, stride), [&](auto k) {
+                    auto i = start + stride * k;
+                    if(i < n)
                        f(i);
+                });
            }
        }
+        else
-    template <class F>
-    __device__ void local_stride(index_int n, F f) const
        {
-        const auto stride = nlocal();
+            // Runtime fallback: ordinary strided loop.
+            for(index_int i = start; i < n; i += stride)
-        for(index_int i = local; i < n; i += stride)
            {
                f(i);
            }
        }
+    }
+    // Applies f to every index produced by for_stride when starting at this
+    // index's `global` member and stepping by nglobal(), stopping below n.
+    template <class F, class N>
+    __device__ void global_stride(N n, F f) const
+    {
+        const auto first = global;
+        const auto step  = nglobal();
+        for_stride(first, n, step, f);
+    }
+    // Applies f to every index produced by for_stride when starting at this
+    // index's `local` member and stepping by nlocal(), stopping below n.
+    template <class F, class N>
+    __device__ void local_stride(N n, F f) const
+    {
+        const auto first = local;
+        const auto step  = nlocal();
+        for_stride(first, n, step, f);
+    }
 };
-inline __device__ index make_index()
+// Builds the index for the calling thread from the launch built-ins: the flat
+// id blockIdx.x * blockDim.x + threadIdx.x, then threadIdx.x, then blockIdx.x.
+// NOTE(review): __attribute__((const)) allows the compiler to merge repeated
+// calls; this assumes the result never changes for a given thread -- confirm.
+inline __device__ __attribute__((const)) index make_index()
 {
    return index{blockIdx.x * blockDim.x + threadIdx.x, threadIdx.x, blockIdx.x}; // NOLINT
 }

--- a/src/targets/gpu/kernels/include/migraphx/kernels/preload.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/preload.hpp
@@ -186,6 +186,7 @@ __device__ auto auto_preload(index idx)
 {
    return make_transform([=](auto f, auto... xs) {
        auto invoke = [=](auto... ys) {
+            if constexpr((Bs or ...))
                __syncthreads();
            f(ys...);
        };