Fix bug in bert accuracy (#385)

* Fix bug in bert accuracy * Formatting * add another test * Fix add and overflow * Formatting * Fix bug in shape_for_each * Use front instead of iterator * Use result.front() * Split add_unary files * Formatting * Fix incorrect last index * Remove comment * Inline function * Fix carry check * Fix metadata errors * Formatting * Reflow * Reflow

Fix bug in bert accuracy (#385)
* Fix bug in bert accuracy * Formatting * add another test * Fix add and overflow * Formatting * Fix bug in shape_for_each * Use front instead of iterator * Use result.front() * Split add_unary files * Formatting * Fix incorrect last index * Remove comment * Inline function * Fix carry check * Fix metadata errors * Formatting * Reflow * Reflow
a797f890 · Paul Fultz II · mvermeulen · a625f7b4 · a797f890 · a797f890
Commit a797f890 authored Oct 09, 2019 by Paul Fultz II Committed by mvermeulen Oct 09, 2019
20 changed files
--- a/src/include/migraphx/shape_for_each.hpp
+++ b/src/include/migraphx/shape_for_each.hpp
@@ -14,11 +14,12 @@ void shape_for_each(const migraphx::shape& s, F f)
    // Ensure calls to f use const ref to vector
    auto call = [&f](const std::vector<std::size_t>& i) { f(i); };
    std::vector<std::size_t> indices(s.lens().size());
-    for(std::size_t i = 0; i < s.elements(); i++)
+    shape ss{s.type(), s.lens()};
+    for(std::size_t i = 0; i < ss.elements(); i++)
    {
-        std::transform(s.strides().begin(),
-                       s.strides().end(),
-                       s.lens().begin(),
+        std::transform(ss.strides().begin(),
+                       ss.strides().end(),
+                       ss.lens().begin(),
                       indices.begin(),
                       [&](std::size_t stride, std::size_t len) {
                           assert(len > 0 and stride > 0);

--- a/src/onnx/onnx.cpp
+++ b/src/onnx/onnx.cpp
@@ -945,7 +945,7 @@ struct onnx_parser
            l_val.visit([&](auto val) {
                using val_type = std::remove_cv_t<typename decltype(val)::value_type>;
                // l_val contains only one element
-                std::vector<val_type> out_vec(s.elements(), *val.begin());
+                std::vector<val_type> out_vec(s.elements(), val.front());
                l_out = literal(s, out_vec);
            });


--- a/src/shape.cpp
+++ b/src/shape.cpp
@@ -32,8 +32,10 @@ struct shape_impl
        assert(m_lens.size() == m_strides.size());
        // assert(std::any_of(m_strides.begin(), m_strides.end(), [](auto x) { return x > 0; }) and
        //        "At least one stride must be non-zero");
-        m_standard = this->elements() == this->element_space() and
-                     std::is_sorted(m_strides.rbegin(), m_strides.rend());
+        m_standard =
+            this->elements() == this->element_space() and
+            std::is_sorted(m_strides.rbegin(), m_strides.rend()) and
+            std::none_of(m_strides.begin(), m_strides.end(), [](auto x) { return x == 0; });
    }
    shape::type_t m_type;
    std::vector<std::size_t> m_lens;
@@ -160,7 +162,21 @@ bool shape::packed() const { return this->elements() == this->element_space(); }

 bool shape::transposed() const
 {
+    if(this->broadcasted())
+    {
+        // TODO: Use a filter_iterator instead
+        std::vector<std::size_t> s;
+        s.reserve(this->strides().size());
+        std::copy_if(this->strides().begin(),
+                     this->strides().end(),
+                     std::back_inserter(s),
+                     [](std::size_t x) { return x != 0; });
+        return not std::is_sorted(s.rbegin(), s.rend());
+    }
+    else
+    {
        return not std::is_sorted(this->strides().rbegin(), this->strides().rend());
+    }
 }

 bool shape::broadcasted() const

--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -11,51 +11,55 @@ if(NOT TARGET MIOpen)
 endif()

 add_library(migraphx_device
+    device/acos.cpp
    device/add.cpp
+    device/add_clip.cpp
+    device/add_relu.cpp
+    device/add_sigmoid.cpp
+    device/add_tanh.cpp
    device/argmax.cpp
    device/argmin.cpp
-    device/max.cpp
-    device/min.cpp
-    device/mul_add.cpp
-    device/exp.cpp
-    device/erf.cpp
-    device/log.cpp
-    device/sin.cpp
-    device/cos.cpp
-    device/tan.cpp
-    device/sinh.cpp
-    device/cosh.cpp
-    device/tanh.cpp
    device/asin.cpp
-    device/acos.cpp
    device/atan.cpp
-    device/relu.cpp
-    device/add_unary.cpp
+    device/ceil.cpp
+    device/clip.cpp
+    device/concat.cpp
    device/contiguous.cpp
-    device/logsoftmax.cpp
-    device/softmax.cpp
-    device/sigmoid.cpp
    device/convert.cpp
-    device/mul.cpp
-    device/concat.cpp
-    device/pad.cpp
+    device/cos.cpp
+    device/cosh.cpp
+    device/div.cpp
+    device/erf.cpp
+    device/exp.cpp
+    device/floor.cpp
    device/gather.cpp
-    device/sub.cpp
    device/int8_gemm_pack.cpp
-    device/div.cpp
-    device/clip.cpp
-    device/reduce_sum.cpp
-    device/rsqrt.cpp
-    device/round.cpp
-    device/sqrt.cpp
+    device/log.cpp
+    device/logsoftmax.cpp
+    device/max.cpp
+    device/min.cpp
+    device/mul.cpp
+    device/mul_add.cpp
+    device/mul_add_relu.cpp
+    device/pad.cpp
+    device/pow.cpp
+    device/reduce_max.cpp
    device/reduce_mean.cpp
    device/reduce_min.cpp
-    device/reduce_max.cpp
-    device/pow.cpp
-    device/sqdiff.cpp
+    device/reduce_sum.cpp
+    device/relu.cpp
+    device/round.cpp
+    device/rsqrt.cpp
+    device/sigmoid.cpp
    device/sign.cpp
-    device/ceil.cpp
-    device/floor.cpp
+    device/sin.cpp
+    device/sinh.cpp
+    device/softmax.cpp
+    device/sqdiff.cpp
+    device/sqrt.cpp
+    device/sub.cpp
+    device/tan.cpp
+    device/tanh.cpp
 )
 set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
 rocm_set_soversion(migraphx_device ${PROJECT_VERSION})

--- a/src/targets/gpu/device/add_unary.cpp
+++ b/src/targets/gpu/device/add_unary.cpp
-#include <migraphx/gpu/device/add_unary.hpp>
+#include <migraphx/gpu/device/add_clip.hpp>
 #include <migraphx/gpu/device/nary.hpp>

 namespace migraphx {
@@ -6,16 +6,6 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-void mul_add_relu(hipStream_t stream,
-                  const argument& result,
-                  const argument& arg1,
-                  const argument& arg2,
-                  const argument& arg3)
-{
-    nary(stream, result, arg1, arg2, arg3)(
-        [](auto x, auto a, auto b) { return std::max<decltype(a * x + b)>(0, a * x + b); });
-}
-
 void add_clip(hipStream_t stream,
              const argument& result,
              const argument& arg1,
@@ -28,32 +18,6 @@ void add_clip(hipStream_t stream,
    });
 }

-void add_relu(hipStream_t stream,
-              const argument& result,
-              const argument& arg1,
-              const argument& arg2)
-{
-    nary(stream, result, arg1, arg2)(
-        [](auto x, auto y) { return std::max<decltype(x + y)>(0, x + y); });
-}
-
-void add_sigmoid(hipStream_t stream,
-                 const argument& result,
-                 const argument& arg1,
-                 const argument& arg2)
-{
-    nary(stream, result, arg1, arg2)(
-        [](auto x, auto y) { return 1.f / (1.f + ::exp(to_hip_type(-(x + y)))); });
-}
-
-void add_tanh(hipStream_t stream,
-              const argument& result,
-              const argument& arg1,
-              const argument& arg2)
-{
-    nary(stream, result, arg1, arg2)([](auto x, auto y) { return ::tanh(to_hip_type(x + y)); });
-}
-
 void add_clip(hipStream_t stream,
              const argument& result,
              const argument& arg1,
@@ -67,36 +31,6 @@ void add_clip(hipStream_t stream,
    });
 }

-void add_relu(hipStream_t stream,
-              const argument& result,
-              const argument& arg1,
-              const argument& arg2,
-              const argument& arg3)
-{
-    nary(stream, result, arg1, arg2, arg3)(
-        [](auto x, auto y, auto z) { return std::max<decltype(x + y + z)>(0, x + y + z); });
-}
-
-void add_sigmoid(hipStream_t stream,
-                 const argument& result,
-                 const argument& arg1,
-                 const argument& arg2,
-                 const argument& arg3)
-{
-    nary(stream, result, arg1, arg2, arg3)(
-        [](auto x, auto y, auto z) { return 1.f / (1.f + ::exp(to_hip_type(-(x + y + z)))); });
-}
-
-void add_tanh(hipStream_t stream,
-              const argument& result,
-              const argument& arg1,
-              const argument& arg2,
-              const argument& arg3)
-{
-    nary(stream, result, arg1, arg2, arg3)(
-        [](auto x, auto y, auto z) { return ::tanh(to_hip_type(x + y + z)); });
-}
-
 } // namespace device
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/gpu/device/add_relu.cpp
+++ b/src/targets/gpu/device/add_relu.cpp
+#include <migraphx/gpu/device/add_relu.hpp>
+#include <migraphx/gpu/device/nary.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+
+void add_relu(hipStream_t stream,
+              const argument& result,
+              const argument& arg1,
+              const argument& arg2)
+{
+    nary(stream, result, arg1, arg2)(
+        [](auto x, auto y) { return std::max<decltype(x + y)>(0, x + y); });
+}
+
+void add_relu(hipStream_t stream,
+              const argument& result,
+              const argument& arg1,
+              const argument& arg2,
+              const argument& arg3)
+{
+    nary(stream, result, arg1, arg2, arg3)(
+        [](auto x, auto y, auto z) { return std::max<decltype(x + y + z)>(0, x + y + z); });
+}
+
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/add_sigmoid.cpp
+++ b/src/targets/gpu/device/add_sigmoid.cpp
+#include <migraphx/gpu/device/add_sigmoid.hpp>
+#include <migraphx/gpu/device/nary.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+
+void add_sigmoid(hipStream_t stream,
+                 const argument& result,
+                 const argument& arg1,
+                 const argument& arg2)
+{
+    nary(stream, result, arg1, arg2)(
+        [](auto x, auto y) { return 1.f / (1.f + ::exp(to_hip_type(-(x + y)))); });
+}
+
+void add_sigmoid(hipStream_t stream,
+                 const argument& result,
+                 const argument& arg1,
+                 const argument& arg2,
+                 const argument& arg3)
+{
+    nary(stream, result, arg1, arg2, arg3)(
+        [](auto x, auto y, auto z) { return 1.f / (1.f + ::exp(to_hip_type(-(x + y + z)))); });
+}
+
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/add_tanh.cpp
+++ b/src/targets/gpu/device/add_tanh.cpp
+#include <migraphx/gpu/device/add_tanh.hpp>
+#include <migraphx/gpu/device/nary.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+
+void add_tanh(hipStream_t stream,
+              const argument& result,
+              const argument& arg1,
+              const argument& arg2)
+{
+    nary(stream, result, arg1, arg2)([](auto x, auto y) { return ::tanh(to_hip_type(x + y)); });
+}
+
+void add_tanh(hipStream_t stream,
+              const argument& result,
+              const argument& arg1,
+              const argument& arg2,
+              const argument& arg3)
+{
+    nary(stream, result, arg1, arg2, arg3)(
+        [](auto x, auto y, auto z) { return ::tanh(to_hip_type(x + y + z)); });
+}
+
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/include/migraphx/gpu/device/array.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/array.hpp
@@ -135,20 +135,21 @@ struct hip_array

    MIGRAPHX_DEVICE_CONSTEXPR hip_array carry(hip_array result) const
    {
-        std::ptrdiff_t rem = 0;
-        for(std::ptrdiff_t i = result.size() - 1; i >= 0; i--)
+        uint32_t overflow = 0;
+        for(std::ptrdiff_t i = result.size() - 1; i > 0; i--)
        {
-            auto z = result[i] + rem;
-            rem    = z - std::ptrdiff_t(d[i]) + 1;
-            if(rem > 0)
-                z -= rem;
-            else
-                rem = 0;
+            auto z = result[i] + overflow;
+            // Reset overflow
+            overflow = 0;
+            // Compute overflow using while loop instead of mod
+            while(z >= d[i])
+            {
+                z -= d[i];
+                overflow += 1;
+            }
            result[i] = z;
        }
-        // Add overflows to the back
-        if(rem > 0)
-            result.back() += rem;
+        result[0] += overflow;
        return result;
    }
 };

--- a/src/targets/gpu/device/include/migraphx/gpu/device/fast_div.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/fast_div.hpp
@@ -9,7 +9,7 @@ namespace gpu {
 namespace device {

 constexpr const std::size_t fast_div_shift = 42;
-MIGRAPHX_DEVICE_CONSTEXPR std::size_t encode_divisor(std::size_t divisor)
+inline std::size_t encode_divisor(std::size_t divisor)
 {
    if(divisor == 0)
        return 0;
@@ -19,7 +19,7 @@ MIGRAPHX_DEVICE_CONSTEXPR std::size_t encode_divisor(std::size_t divisor)

 inline constexpr bool is_divisor_encodable(std::size_t i)
 {
-    return i < std::size_t{1} << (fast_div_shift / 2);
+    return i < (std::size_t{1} << (fast_div_shift / 2));
 }

 MIGRAPHX_DEVICE_CONSTEXPR std::size_t fast_div(std::size_t dividend, std::size_t encoded_divisor)

--- a/src/targets/gpu/device/include/migraphx/gpu/device/multi_index.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/multi_index.hpp
@@ -15,18 +15,12 @@ struct multi_index
 {
    using hip_index = hip_array<std::size_t, N>;
    hip_index id{};
-    std::size_t stride = 0;
-
-    MIGRAPHX_DEVICE_CONSTEXPR hip_index add_stride(hip_index i) const
-    {
-        i.back() += stride;
-        return i;
-    }
+    hip_index stride{};

    template <class F>
    MIGRAPHX_DEVICE_CONSTEXPR void for_stride(hip_index n, F f) const
    {
-        for(hip_index i = id; i < n; i = n.carry(add_stride(i)))
+        for(hip_index i = id; i < n; i = n.carry(i + stride))
        {
            f(i);
        }
@@ -37,7 +31,7 @@ template <std::size_t N>
 MIGRAPHX_DEVICE_CONSTEXPR multi_index<N>
 make_multi_index(const hip_shape<N>& s, std::size_t i, std::size_t n)
 {
-    return {s.multi(i), n};
+    return {s.multi(i), s.multi(n)};
 }

 template <std::size_t N>
@@ -51,13 +45,22 @@ template <std::size_t N>
 inline auto mi_launch(hipStream_t stream, const hip_shape<N>& s, std::size_t local = 1024)
 {
    assert(s.standard);
+    assert(s.elements() > 0);
    std::size_t n       = s.elements();
    std::size_t groups  = (n + local - 1) / local;
    std::size_t nglobal = std::min<std::size_t>(128, groups) * local;

+    assert(groups > 0);
+    assert(nglobal > 0);
+    auto nglobal_multi = s.multi(nglobal);
+    // Skip checking this, since this will cause metadata to not be generated
+    // for some unknown reason.
+    //
+    // assert(std::any_of(nglobal_multi.begin(), nglobal_multi.end(), [](auto x){return x>0;}));
+
    return [=](auto f) {
        launch(stream, nglobal, local)([=](auto idx) {
-            auto midx = make_multi_index(s, idx.global, nglobal);
+            auto midx = make_multi_index(s, idx.global, nglobal_multi);
            midx.for_stride(s.lens, [&](auto i) { f(i); });
        });
    };

--- a/src/targets/gpu/device/include/migraphx/gpu/device/nary.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/nary.hpp
@@ -304,7 +304,8 @@ void nary_impl(hipStream_t stream, F f, argument result, Arguments... args)
    MIGRAPHX_TRACE_NARY_FUNCTION
    const auto shapes   = make_array(args.get_shape()...);
    const bool standard = all_of(shapes, [](const shape& s) { return s.standard(); });
-    const bool packed   = all_of(shapes, [](const shape& s) { return s.packed(); });
+    const bool packed =
+        all_of(shapes, [](const shape& s) { return s.packed() and not s.broadcasted(); });
    const bool same_shapes =
        all_of(shapes, [&](const shape& s) { return s == result.get_shape(); });
    const bool same_input_shapes = all_of(shapes, [&](const shape& s) { return s == shapes[0]; });

--- a/src/targets/gpu/device/include/migraphx/gpu/device/shape.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/shape.hpp
@@ -70,14 +70,15 @@ struct hip_shape
    {
        hip_index result;
        std::size_t tidx = idx;
-        for(std::ptrdiff_t is = result.size() - 1; is >= 0; is--)
+        for(std::ptrdiff_t is = result.size() - 1; is > 0; is--)
        {
            // result[is] = tidx % lens[is];
-            // tidx = tdix / lens[is];
+            // tidx = tidx / lens[is];
            auto q     = fast_div(tidx, divs[is]);
            result[is] = remainder(q, tidx, lens[is]);
            tidx       = q;
        }
+        result[0] = tidx;
        return result;
    }
 };

--- a/src/targets/gpu/device/mul_add.cpp
+++ b/src/targets/gpu/device/mul_add.cpp
-#include <migraphx/gpu/device/add_unary.hpp>
+#include <migraphx/gpu/device/mul_add.hpp>
 #include <migraphx/gpu/device/nary.hpp>

 namespace migraphx {

--- a/src/targets/gpu/device/mul_add_relu.cpp
+++ b/src/targets/gpu/device/mul_add_relu.cpp
+#include <migraphx/gpu/device/mul_add_relu.hpp>
+#include <migraphx/gpu/device/nary.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+
+void mul_add_relu(hipStream_t stream,
+                  const argument& result,
+                  const argument& arg1,
+                  const argument& arg2,
+                  const argument& arg3)
+{
+    nary(stream, result, arg1, arg2, arg3)(
+        [](auto x, auto a, auto b) { return std::max<decltype(a * x + b)>(0, a * x + b); });
+}
+
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/fuse_ops.cpp
+++ b/src/targets/gpu/fuse_ops.cpp
@@ -5,7 +5,11 @@
 #include <migraphx/gpu/convolution.hpp>
 #include <migraphx/gpu/oper.hpp>
 #include <migraphx/gpu/device/mul_add.hpp>
-#include <migraphx/gpu/device/add_unary.hpp>
+#include <migraphx/gpu/device/add_clip.hpp>
+#include <migraphx/gpu/device/add_relu.hpp>
+#include <migraphx/gpu/device/add_sigmoid.hpp>
+#include <migraphx/gpu/device/add_tanh.hpp>
+#include <migraphx/gpu/device/mul_add_relu.hpp>
 #include <migraphx/gpu/device/add.hpp>
 #include <migraphx/instruction.hpp>
 #include <migraphx/array.hpp>

--- a/src/targets/gpu/include/migraphx/gpu/device/add_unary.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/add_unary.hpp

-#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ADD_UNARY_HPP
-#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ADD_UNARY_HPP
+#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ADD_CLIP_HPP
+#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ADD_CLIP_HPP

 #include <migraphx/argument.hpp>
 #include <migraphx/config.hpp>
@@ -11,12 +11,6 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-void mul_add_relu(hipStream_t stream,
-                  const argument& result,
-                  const argument& arg1,
-                  const argument& arg2,
-                  const argument& arg3);
-
 void add_clip(hipStream_t stream,
              const argument& result,
              const argument& arg1,
@@ -24,21 +18,6 @@ void add_clip(hipStream_t stream,
              float max,
              float min);

-void add_relu(hipStream_t stream,
-              const argument& result,
-              const argument& arg1,
-              const argument& arg2);
-
-void add_sigmoid(hipStream_t stream,
-                 const argument& result,
-                 const argument& arg1,
-                 const argument& arg2);
-
-void add_tanh(hipStream_t stream,
-              const argument& result,
-              const argument& arg1,
-              const argument& arg2);
-
 void add_clip(hipStream_t stream,
              const argument& result,
              const argument& arg1,
@@ -47,24 +26,6 @@ void add_clip(hipStream_t stream,
              float max,
              float min);

-void add_relu(hipStream_t stream,
-              const argument& result,
-              const argument& arg1,
-              const argument& arg2,
-              const argument& arg3);
-
-void add_sigmoid(hipStream_t stream,
-                 const argument& result,
-                 const argument& arg1,
-                 const argument& arg2,
-                 const argument& arg3);
-
-void add_tanh(hipStream_t stream,
-              const argument& result,
-              const argument& arg1,
-              const argument& arg2,
-              const argument& arg3);
-
 } // namespace device
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/gpu/include/migraphx/gpu/device/add_relu.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/add_relu.hpp
+
+#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ADD_RELU_HPP
+#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ADD_RELU_HPP
+
+#include <migraphx/argument.hpp>
+#include <migraphx/config.hpp>
+#include <hip/hip_runtime_api.h>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+
+void add_relu(hipStream_t stream,
+              const argument& result,
+              const argument& arg1,
+              const argument& arg2);
+
+void add_relu(hipStream_t stream,
+              const argument& result,
+              const argument& arg1,
+              const argument& arg2,
+              const argument& arg3);
+
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/device/add_sigmoid.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/add_sigmoid.hpp
+
+#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ADD_SIGMOID_HPP
+#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ADD_SIGMOID_HPP
+
+#include <migraphx/argument.hpp>
+#include <migraphx/config.hpp>
+#include <hip/hip_runtime_api.h>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+
+void add_sigmoid(hipStream_t stream,
+                 const argument& result,
+                 const argument& arg1,
+                 const argument& arg2);
+
+void add_sigmoid(hipStream_t stream,
+                 const argument& result,
+                 const argument& arg1,
+                 const argument& arg2,
+                 const argument& arg3);
+
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/device/add_tanh.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/device/add_tanh.hpp
+
+#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_ADD_TANH_HPP
+#define MIGRAPHX_GUARD_RTGLIB_DEVICE_ADD_TANH_HPP
+
+#include <migraphx/argument.hpp>
+#include <migraphx/config.hpp>
+#include <hip/hip_runtime_api.h>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+
+void add_tanh(hipStream_t stream,
+              const argument& result,
+              const argument& arg1,
+              const argument& arg2);
+
+void add_tanh(hipStream_t stream,
+              const argument& result,
+              const argument& arg1,
+              const argument& arg2,
+              const argument& arg3);
+
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif