Merge branch 'develop' into doc2

3a848f0d · Paul · 64e8e30a · d1e945da · 3a848f0d · 3a848f0d
Commit 3a848f0d authored Mar 19, 2020 by Paul
20 changed files
--- a/src/targets/gpu/device/include/migraphx/gpu/device/launch.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/launch.hpp
@@ -78,8 +78,9 @@ inline auto gs_launch(hipStream_t stream, index_int n, index_int local = 1024)
    index_int nglobal = std::min<index_int>(256, groups) * local;
    return [=](auto f) {
-        launch(stream, nglobal, local)(
+        launch(stream, nglobal, local)([=](auto idx) __device__ {
-            [=](auto idx) { idx.global_stride(n, [&](auto i) { gs_invoke(f, i, idx); }); });
+            idx.global_stride(n, [&](auto i) { gs_invoke(f, i, idx); });
+        });
    };
 }

--- a/src/targets/gpu/device/include/migraphx/gpu/device/multi_index.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/multi_index.hpp
@@ -95,7 +95,7 @@ inline auto mi_launch(hipStream_t stream, const hip_shape<N>& global, index_int
    auto nglobal       = global.index(nglobal_multi);
    return [=](auto f) {
-        launch(stream, nglobal, nlocal)([=](auto idx) {
+        launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
            auto midx = make_multi_index(global, idx.global, nglobal_multi);
            f(idx, midx.for_stride(global.lens));
        });

--- a/src/targets/gpu/device/include/migraphx/gpu/device/nary.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/nary.hpp
@@ -36,7 +36,8 @@ auto nary_nonstandard_nonpacked_impl(hipStream_t stream, F f, argument result, A
    MIGRAPHX_TRACE_NARY_FUNCTION
    shape s{result.get_shape().type(), result.get_shape().lens()};
    hip_visit_all(s, result, args...)([&](auto standard_shape, auto output, auto... inputs) {
-        mi_gs_launch(stream, standard_shape)([=](auto idx) { output[idx] = f(inputs[idx]...); });
+        mi_gs_launch(stream,
+                     standard_shape)([=](auto idx) __device__ { output[idx] = f(inputs[idx]...); });
    });
 }
@@ -45,7 +46,7 @@ inline auto create_broadcast_index(index_int len, index_int stride)
    auto next_stride   = stride * len;
    auto e_next_stride = encode_divisor(next_stride);
    auto e_stride      = encode_divisor(stride);
-    return [=](auto i) {
+    return [=](auto i) __device__ {
        // ( i % next_stride) / stride
        return fast_div(i, e_stride) - len * fast_div(i, e_next_stride);
    };
@@ -61,11 +62,11 @@ auto nary_nonstandard_packed_impl(hipStream_t stream,
    auto arg_shape = make_array(args...).front().get_shape();
    auto perm      = find_permutation(arg_shape);
    auto s         = reorder_shape(arg_shape, perm);
-    hip_visit_all(s,
+    hip_visit_all(s, result.reshape(reorder_shape(result.get_shape(), perm)), args.reshape(s)...)(
-                  result.reshape(reorder_shape(result.get_shape(), perm)),
+        [&](auto standard_shape, auto output, auto... inputs) {
-                  args.reshape(s)...)([&](auto standard_shape, auto output, auto... inputs) {
+            mi_gs_launch(stream, standard_shape)(
-        mi_gs_launch(stream, standard_shape)([=](auto idx) { output[idx] = f(inputs[idx]...); });
+                [=](auto idx) __device__ { output[idx] = f(inputs[idx]...); });
-    });
+        });
 }
 template <class F, class... Arguments>
@@ -93,7 +94,6 @@ void nary_broadcast_vec_impl(
            using type                = typename decltype(output)::value_type;
            const index_int nelements = output.size() / vec_size;
            launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
                MIGRAPHX_DEVICE_SHARED type buffer[2048 / vec_size];
                // Load bias into LDS
                for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
@@ -185,7 +185,6 @@ void nary_double_broadcast_vec_impl(
            using type                = typename decltype(output)::value_type;
            const index_int nelements = output.size() / vec_size;
            launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
                MIGRAPHX_DEVICE_SHARED type buffer[2048 / vec_size];
                // Load bias into LDS
                for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
@@ -274,7 +273,7 @@ void nary_standard_vec_impl(hipStream_t stream, F f, argument result, Arguments.
        const index_int vec_size = 4;
        auto data                = pack_vec<4>(device_cast(inputs.data())...);
        auto* outp               = as_vec<4>(device_cast(output.data()));
-        gs_launch(stream, output_shape.elements() / vec_size)([=](auto i) {
+        gs_launch(stream, output_shape.elements() / vec_size)([=](auto i) __device__ {
            vec<type, 4> out = outp[i];
            data(
                [&](auto... xs) {
@@ -295,7 +294,7 @@ void nary_standard_impl(hipStream_t stream, F f, argument result, Arguments... a
    MIGRAPHX_TRACE_NARY_FUNCTION
    index_int nelements = result.get_shape().elements();
    hip_pointer_visit_all(result, args...)([&](auto output, auto... inputs) {
-        gs_launch(stream, nelements)([=](auto i) { output[i] = f(inputs[i]...); });
+        gs_launch(stream, nelements)([=](auto i) __device__ { output[i] = f(inputs[i]...); });
    });
 }

--- a/src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
@@ -20,6 +20,15 @@ struct sum
    }
 };
+struct product
+{
+    template <class T, class U>
+    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x, U y) const
+    {
+        return x * y;
+    }
+};
 struct id
 {
    template <class T>

--- a/src/targets/gpu/device/include/migraphx/gpu/device/visit.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/visit.hpp
@@ -39,7 +39,12 @@ constexpr void visit_tensor_size(index_int n, F f)
        f(std::integral_constant<index_int, 5>{});
        break;
    }
-    default: throw std::runtime_error("Unknown tensor size");
+    case 6:
+    {
+        f(std::integral_constant<index_int, 6>{});
+        break;
+    }
+    default: throw std::runtime_error("Tensor size dim out of range");
    }
 }

--- a/src/targets/gpu/device/int8_gemm_pack.cpp
+++ b/src/targets/gpu/device/int8_gemm_pack.cpp
@@ -24,7 +24,7 @@ void int8_gemm_pack_a(hipStream_t stream, const argument& result, const argument
        auto* in_ptr          = device_cast(input.data());
        visit_tensor_size(out_lens.size(), [&](auto out_dim) {
            hip_tensor_descriptor<out_dim> desc(comp_shape);
-            gs_launch(stream, nelements, 256)([=](auto ii) {
+            gs_launch(stream, nelements, 256)([=](auto ii) __device__ {
                const size_t nb    = 4;
                auto idx           = desc.multi(ii);
                std::size_t i_m    = idx[dim_1];
@@ -55,7 +55,7 @@ void int8_gemm_pack_b(hipStream_t stream, const argument& result, const argument
        auto* in_ptr          = device_cast(input.data());
        visit_tensor_size(out_lens.size(), [&](auto out_dim) {
            hip_tensor_descriptor<out_dim> desc(comp_shape);
-            gs_launch(stream, nelements, 256)([=](auto ii) {
+            gs_launch(stream, nelements, 256)([=](auto ii) __device__ {
                const size_t nb    = 4;
                auto idx           = desc.multi(ii);
                std::size_t i_n    = idx[dim_1];

--- a/src/targets/gpu/device/log.cpp
+++ b/src/targets/gpu/device/log.cpp
@@ -9,7 +9,7 @@ namespace device {
 void log(hipStream_t stream, const argument& result, const argument& arg)
 {
-    nary(stream, result, arg)([](auto x) { return ::log(to_hip_type(x)); });
+    nary(stream, result, arg)([](auto x) __device__ { return ::log(to_hip_type(x)); });
 }
 } // namespace device

--- a/src/targets/gpu/device/logsoftmax.cpp
+++ b/src/targets/gpu/device/logsoftmax.cpp
@@ -11,11 +11,10 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
-void logsoftmax(hipStream_t stream, const argument& result, const argument& arg, int axis)
+void logsoftmax(hipStream_t stream, const argument& result, const argument& arg, int64_t axis)
 {
-    auto lens                = result.get_shape().lens();
+    auto batch_lens          = result.get_shape().lens();
-    auto batch_lens          = lens;
+    index_int batch_item_num = batch_lens[axis];
-    index_int batch_item_num = lens[axis];
    batch_lens[axis]         = 1;
    migraphx::shape batch_shape{result.get_shape().type(), batch_lens};
@@ -44,7 +43,7 @@ void logsoftmax(hipStream_t stream, const argument& result, const argument& arg,
            auto log_batch_sum = ::log(to_hip_type(batch_sum)) + batch_max;
-            idx.local_stride(batch_item_num, [&](auto j) {
+            idx.local_stride(batch_item_num, [&](auto j) __device__ {
                data_idx[axis]   = j;
                output[data_idx] = input[data_idx] - log_batch_sum;
            });

--- a/src/targets/gpu/device/max.cpp
+++ b/src/targets/gpu/device/max.cpp
@@ -10,7 +10,7 @@ namespace device {
 void max(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2)
 {
    nary(stream, result, arg1, arg2)(
-        [](auto x, auto y) { return std::max(to_hip_type(x), to_hip_type(y)); });
+        [](auto x, auto y) __device__ { return ::max(to_hip_type(x), to_hip_type(y)); });
 }
 } // namespace device

--- a/src/targets/gpu/device/min.cpp
+++ b/src/targets/gpu/device/min.cpp
@@ -10,7 +10,7 @@ namespace device {
 void min(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2)
 {
    nary(stream, result, arg1, arg2)(
-        [](auto x, auto y) { return std::min(to_hip_type(x), to_hip_type(y)); });
+        [](auto x, auto y) __device__ { return ::min(to_hip_type(x), to_hip_type(y)); });
 }
 } // namespace device

--- a/src/targets/gpu/device/mul.cpp
+++ b/src/targets/gpu/device/mul.cpp
@@ -8,7 +8,7 @@ namespace device {
 void mul(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2)
 {
-    nary(stream, result, arg1, arg2)([](auto x, auto y) { return x * y; });
+    nary(stream, result, arg1, arg2)([](auto x, auto y) __device__ { return x * y; });
 }
 void mul(hipStream_t stream,
@@ -17,7 +17,8 @@ void mul(hipStream_t stream,
         const argument& arg2,
         const argument& arg3)
 {
-    nary(stream, result, arg1, arg2, arg3)([](auto x, auto y, auto z) { return x * y * z; });
+    nary(stream, result, arg1, arg2, arg3)([](auto x, auto y, auto z)
+                                               __device__ { return x * y * z; });
 }
 } // namespace device

--- a/src/targets/gpu/device/mul_add.cpp
+++ b/src/targets/gpu/device/mul_add.cpp
@@ -12,7 +12,8 @@ void mul_add(hipStream_t stream,
             const argument& arg2,
             const argument& arg3)
 {
-    nary(stream, result, arg1, arg2, arg3)([](auto x, auto a, auto b) { return a * x + b; });
+    nary(stream, result, arg1, arg2, arg3)([](auto x, auto a, auto b)
+                                               __device__ { return a * x + b; });
 }
 } // namespace device

--- a/src/targets/gpu/device/mul_add_relu.cpp
+++ b/src/targets/gpu/device/mul_add_relu.cpp
@@ -13,7 +13,7 @@ void mul_add_relu(hipStream_t stream,
                  const argument& arg3)
 {
    nary(stream, result, arg1, arg2, arg3)(
-        [](auto x, auto a, auto b) { return std::max<decltype(a * x + b)>(0, a * x + b); });
+        [](auto x, auto a, auto b) __device__ { return ::max<decltype(a * x + b)>(0, a * x + b); });
 }
 } // namespace device

--- a/src/targets/gpu/device/pad.cpp
+++ b/src/targets/gpu/device/pad.cpp
@@ -23,12 +23,12 @@ pad(hipStream_t stream, argument result, argument arg1, float value, std::vector
        {
            device_val = device_cast(std::numeric_limits<type>::lowest());
        }
-        gs_launch(stream,
+        gs_launch(stream, result.get_shape().elements())(
-                  result.get_shape().elements())([=](auto i) { output.data()[i] = device_val; });
+            [=](auto i) __device__ { output.data()[i] = device_val; });
        hip_index offsets;
        std::copy(pads.begin(), pads.begin() + offsets.size(), offsets.begin());
-        gs_launch(stream, nelements)([=](auto i) {
+        gs_launch(stream, nelements)([=](auto i) __device__ {
            auto idx = input.get_shape().multi(i);
            for(std::size_t j = 0; j < offsets.size(); j++)
            {

--- a/src/targets/gpu/device/pow.cpp
+++ b/src/targets/gpu/device/pow.cpp
@@ -9,7 +9,7 @@ namespace device {
 void pow(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2)
 {
    nary(stream, result, arg1, arg2)(
-        [](auto b, auto e) { return ::pow(to_hip_type(b), to_hip_type(e)); });
+        [](auto b, auto e) __device__ { return ::pow(to_hip_type(b), to_hip_type(e)); });
 }
 } // namespace device

--- a/src/targets/gpu/device/prelu.cpp
+++ b/src/targets/gpu/device/prelu.cpp
+#include <migraphx/gpu/device/prelu.hpp>
+#include <migraphx/gpu/device/nary.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+void prelu(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2)
+{
+    nary(stream, result, arg1, arg2)([](auto x, auto slope)
+                                         __device__ { return ((x < 0) ? (x * slope) : x); });
+}
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/reduce_prod.cpp
+++ b/src/targets/gpu/device/reduce_prod.cpp
+#include <migraphx/gpu/device/reduce_prod.hpp>
+#include <migraphx/gpu/device/reduce.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+void reduce_prod(hipStream_t stream, const argument& result, const argument& arg)
+{
+    reduce(stream, result, arg, product{}, 1, id{}, id{});
+}
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/relu.cpp
+++ b/src/targets/gpu/device/relu.cpp
@@ -8,7 +8,7 @@ namespace device {
 void relu(hipStream_t stream, const argument& result, const argument& arg)
 {
-    nary(stream, result, arg)([](auto x) { return std::max<decltype(x)>(0, x); });
+    nary(stream, result, arg)([](auto x) __device__ { return ::max<decltype(x)>(0, x); });
 }
 } // namespace device

--- a/src/targets/gpu/device/round.cpp
+++ b/src/targets/gpu/device/round.cpp
@@ -9,7 +9,7 @@ namespace device {
 void round(hipStream_t stream, const argument& result, const argument& arg)
 {
-    nary(stream, result, arg)([](auto x) { return ::round(to_hip_type(x)); });
+    nary(stream, result, arg)([](auto x) __device__ { return ::round(to_hip_type(x)); });
 }
 } // namespace device

--- a/src/targets/gpu/device/sigmoid.cpp
+++ b/src/targets/gpu/device/sigmoid.cpp
@@ -9,7 +9,8 @@ namespace device {
 void sigmoid(hipStream_t stream, const argument& result, const argument& arg)
 {
-    nary(stream, result, arg)([](auto x) { return 1.f / (1.f + ::exp(to_hip_type(-x))); });
+    nary(stream, result, arg)([](auto x)
+                                  __device__ { return 1.f / (1.f + ::exp(to_hip_type(-x))); });
 }
 } // namespace device