gaoqiong / MIGraphX / Commits / 7f65a88e

Commit 7f65a88e, authored Feb 04, 2022 by Paul

    Merge branch 'develop' into mlir-c

Parents: 79bfe69f, b20e3d4d
Changes: 66 files in total; showing 20 changed files with 940 additions and 122 deletions (+940 -122).
Changed files:

  src/program.cpp (+72, -2)
  src/targets/gpu/CMakeLists.txt (+6, -2)
  src/targets/gpu/argmax.cpp (+1, -1)
  src/targets/gpu/argmin.cpp (+1, -1)
  src/targets/gpu/compile_hip_code_object.cpp (+1, -0)
  src/targets/gpu/compile_pointwise.cpp (+10, -2)
  src/targets/gpu/fuse_ops.cpp (+178, -5)
  src/targets/gpu/include/migraphx/gpu/device/arg_op.hpp (+2, -1)
  src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp (+57, -6)
  src/targets/gpu/kernels/include/migraphx/kernels/functional.hpp (+38, -2)
  src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp (+1, -0)
  src/targets/gpu/kernels/include/migraphx/kernels/math.hpp (+56, -14)
  src/targets/gpu/kernels/include/migraphx/kernels/pointwise.hpp (+27, -2)
  src/targets/gpu/kernels/include/migraphx/kernels/preload.hpp (+23, -7)
  src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp (+53, -0)
  src/targets/gpu/kernels/include/migraphx/kernels/vectorize.hpp (+137, -77)
  test/onnx/gen_onnx.py (+246, -0)
  test/onnx/gen_onnx.pyc (+0, -0)
  test/onnx/greaterorequal_test.onnx (+16, -0)
  test/onnx/hardsigmoid_default_test.onnx (+15, -0)
src/program.cpp

@@ -180,6 +180,63 @@ void program::finalize()
    mm->finalize(this->impl->ctx);
}

template <class T>
std::string classify(T x)
{
    switch(std::fpclassify(x))
    {
    case FP_INFINITE: return "inf";
    case FP_NAN: return "nan";
    case FP_NORMAL: return "normal";
    case FP_SUBNORMAL: return "subnormal";
    case FP_ZERO: return "zero";
    default: return "unknown";
    }
}

std::unordered_set<std::string> classify_argument(const argument& a)
{
    std::unordered_set<std::string> result;
    a.visit(
        [&](auto t) {
            for(const auto& x : t)
                result.insert(classify(x));
        },
        [&](const auto& xs) {
            for(const auto& x : xs)
            {
                auto r = classify_argument(x);
                result.insert(r.begin(), r.end());
            }
        });
    return result;
}

void preview_argument(std::ostream& os, const argument& a)
{
    a.visit(
        [&](auto t) {
            if(t.size() <= 10)
            {
                os << t;
            }
            else
            {
                os << to_string_range(t.begin(), t.begin() + 5);
                os << ", ..., ";
                os << to_string_range(t.end() - 5, t.end());
            }
        },
        [&](const auto& xs) {
            for(const auto& x : xs)
            {
                os << '{';
                preview_argument(os, x);
                os << '}';
            }
        });
}

template <class F>
std::vector<argument> generic_eval(const module* mod,
                                   context& ctx,
                                   ...

@@ -312,8 +369,21 @@ std::vector<argument> program::eval(parameter_map params) const
    if(trace_level > 1 and ins->name().front() != '@' and ins->name() != "load" and
       not result.empty())
    {
-       target tgt = make_target(this->impl->target_name);
-       std::cout << "Output: " << tgt.copy_from(result) << std::endl;
        target tgt  = make_target(this->impl->target_name);
        auto buffer = tgt.copy_from(result);
        if(trace_level == 2)
        {
            std::cout << "Output has " << to_string_range(classify_argument(buffer))
                      << std::endl;
            std::cout << "Output: ";
            preview_argument(std::cout, buffer);
            std::cout << std::endl;
        }
        else
        {
            std::cout << "Output: " << buffer << std::endl;
        }
    }
    return result;
}));
...
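For context, here is a minimal host-only sketch (illustrative code, not part of the diff) of what the new level-2 trace reports; in MIGraphX the trace level is typically driven by a trace environment variable such as MIGRAPHX_TRACE_EVAL. classify() buckets every element with std::fpclassify, so the trace can print the set of value categories before previewing the first and last few elements.

#include <cmath>
#include <iostream>
#include <limits>
#include <set>
#include <string>
#include <vector>

std::string classify(float x)
{
    switch(std::fpclassify(x))
    {
    case FP_INFINITE: return "inf";
    case FP_NAN: return "nan";
    case FP_NORMAL: return "normal";
    case FP_SUBNORMAL: return "subnormal";
    case FP_ZERO: return "zero";
    default: return "unknown";
    }
}

int main()
{
    std::vector<float> buffer = {1.5f, 0.0f, std::numeric_limits<float>::infinity()};
    std::set<std::string> categories;
    for(float x : buffer)
        categories.insert(classify(x));
    // Prints the distinct categories: inf normal zero
    for(const auto& c : categories)
        std::cout << c << ' ';
    std::cout << '\n';
}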
src/targets/gpu/CMakeLists.txt

@@ -327,8 +327,12 @@ target_flags(HIP_COMPILER_FLAGS hip::device)
# Remove cuda arch flags
string(REGEX REPLACE --cuda-gpu-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
string(REGEX REPLACE --offload-arch=[a-z0-9:+-]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
string(REPLACE "$<LINK_LANGUAGE:CXX>" "1" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
string(REPLACE "SHELL:" "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
# Skip library paths since hip will incorrectly treat it as a source file
string(APPEND HIP_COMPILER_FLAGS " ")
foreach(_unused RANGE 2)
    string(REGEX REPLACE " /[^ ]+\\.(a|so) " " " HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
endforeach()
message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}")

target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_HIP_COMPILER=${CMAKE_CXX_COMPILER}"
...
src/targets/gpu/argmax.cpp

@@ -9,7 +9,7 @@ namespace gpu {
shape hip_argmax::compute_shape(const std::vector<shape>& inputs) const
{
-   check_shapes{inputs, *this}.has(2).standard();
    check_shapes{inputs, *this}.has(2);
    return op.normalize_compute_shape({inputs.at(0)});
}
...
src/targets/gpu/argmin.cpp

@@ -9,7 +9,7 @@ namespace gpu {
shape hip_argmin::compute_shape(const std::vector<shape>& inputs) const
{
-   check_shapes{inputs, *this}.has(2).standard();
    check_shapes{inputs, *this}.has(2);
    return op.normalize_compute_shape({inputs.at(0)});
}
...
src/targets/gpu/compile_hip_code_object.cpp

@@ -114,6 +114,7 @@ operation compile_hip_code_object(const std::string& content, hip_compile_option
    options.params += " -DMIGRAPHX_NGLOBAL=" + std::to_string(options.global);
    options.params += " -DMIGRAPHX_NLOCAL=" + std::to_string(options.local);
    options.params += " " + join_strings(compiler_warnings(), " ");
    options.params += " -ftemplate-backtrace-limit=0";
    options.params += " -Werror";
    auto cos = compile_hip_src(srcs, std::move(options.params), get_device_name());
    if(cos.size() != 1)
...
src/targets/gpu/compile_pointwise.cpp

@@ -63,8 +63,16 @@ operation compile_pointwise(context& ctx, const std::vector<shape>& inputs, modu
    run_passes(m, {eliminate_common_subexpression{}, dead_code_elimination{}});
    cpp_generator g;
    g.fmap([](const std::string& fname) { return "migraphx::" + fname; });
-   auto name = g.create_function(g.generate_module(m).set_attributes({"__device__"}));
-   return compile_pointwise((ctx), inputs, "&" + name, g.str());
    g.add_point_op("where", "${function:where}(${0}, ${1}, ${2})");
    g.add_point_op("prelu", "${function:where}(${0} < 0, ${0} * ${1}, ${0})");
    g.add_point_op("sign", "${function:where}(${0} > 0, 1, ${function:where}(${0} < 0, -1, 0))");
    g.add_point_op("equal", "migraphx::abs(${0} == ${1})");
    g.add_point_op("less", "migraphx::abs(${0} < ${1})");
    g.add_point_op("greater", "migraphx::abs(${0} > ${1})");
    g.add_point_op("not", "migraphx::abs(not ${0})");
    auto name = g.create_function(
        g.generate_module(m).set_attributes({"__device__"}).set_generic_types(m));
    return compile_pointwise((ctx), inputs, "MIGRAPHX_LIFT(" + name + ")", g.str());
}

} // namespace gpu
...
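A small standalone sketch of why the comparison templates above likely wrap their result in migraphx::abs (this is an inference from the diff, not a statement from the source): the kernel vec<T, N> types are built on compiler vector extensions, where an elementwise comparison yields -1 (all bits set) in the "true" lanes rather than 1, and taking the absolute value normalizes that mask to 1/0. The sketch below uses the GCC-style vector_size attribute, which both g++ and clang++ accept.

#include <cassert>

// 4 lanes of 32-bit int.
typedef int int4 __attribute__((vector_size(16)));

int main()
{
    int4 a = {1, 2, 3, 4};
    int4 b = {1, 0, 3, 9};

    int4 mask = (a == b); // lanes are -1 where equal, 0 where not
    assert(mask[0] == -1 && mask[1] == 0 && mask[2] == -1 && mask[3] == 0);

    // What abs() of the mask amounts to here: 1 where equal, 0 where not.
    int4 ones = -mask;
    assert(ones[0] == 1 && ones[1] == 0 && ones[2] == 1 && ones[3] == 0);
    return 0;
}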
src/targets/gpu/fuse_ops.cpp

@@ -62,6 +62,8 @@ struct fusion
        keep_alive(std::move(t));
    }

    bool empty() const { return fp == nullptr; }

    op_t operator[](std::size_t i) const
    {
        assert(fp);
...

@@ -125,12 +127,11 @@ struct fusion
        return shape{shape::int8_type, {ws_size}};
    }

-   void compile(context& ctx)
    bool compile(context& ctx)
    {
        assert(fp);
-       auto status = miopenCompileFusionPlan(ctx.get_stream().get_miopen(), fp.get());
-       if(status != miopenStatusSuccess)
-           MIGRAPHX_THROW("Compiling fusion plan failed");
        return miopenCompileFusionPlan(ctx.get_stream().get_miopen(), fp.get()) ==
               miopenStatusSuccess;
    }

    argument execute(context& ctx,
...

@@ -561,6 +562,117 @@ struct find_mul_add_relu
    }
};

struct miopen_fusion
{
    struct fuse_op_data
    {
        operation op;
        float alpha = 1;
        float beta  = 0;
    };
    struct fuse_op : fuse_op_data, reflect_equality<fuse_op>, reflect_stream<fuse_op>
    {
        template <class Self, class F>
        static auto reflect(Self& self, F f)
        {
            return pack(f(self.op, "op"), f(self.alpha, "alpha"), f(self.beta, "beta"));
        }
    };
    std::vector<fuse_op> ops = {};
    fusion f                 = {};
    std::function<void(context&, const fusion&, const std::vector<argument>&)> execute;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack(f(self.ops, "ops"));
    }
    value compile(context& ctx, const shape&, std::vector<shape> inputs)
    {
        // Compensate for allocation
        inputs.pop_back();
        std::size_t i = 0;
        f             = fusion(inputs[i]);
        i++;
        std::vector<std::function<void(const fused_operator_args&, const std::vector<argument>&)>>
            invokers;
        for(auto&& fop : ops)
        {
            if(i > inputs.size())
            {
                f = {};
                return {};
            }
            if(fop.op.name() == "convolution")
            {
                auto* mop = f.create_conv(any_cast<op::convolution>(fop.op), inputs[i]);
                invokers.push_back(
                    [=](const fused_operator_args& fargs, const std::vector<argument>& args) {
                        miopenSetOpArgsConvForward(
                            fargs.get(), mop, &fop.alpha, &fop.beta, args[i].implicit());
                    });
                i++;
            }
            else if(fop.op.name() == "add")
            {
                auto* mop = f.create_bias(inputs[i]);
                invokers.push_back(
                    [=](const fused_operator_args& fargs, const std::vector<argument>& args) {
                        miopenSetOpArgsBiasForward(
                            fargs.get(), mop, &fop.alpha, &fop.beta, args[i].implicit());
                    });
                i++;
            }
            else if(fop.op.name() == "relu")
            {
                auto* mop = f.create_relu();
                invokers.push_back([=](const fused_operator_args& fargs,
                                       const std::vector<argument>&) {
                    miopenSetOpArgsActivForward(fargs.get(), mop, &fop.alpha, &fop.beta, 0, 0, 0);
                });
            }
            else
            {
                f = {};
                return {};
            }
        }
        if(not f.compile(ctx))
        {
            f = {};
            return {};
        }
        execute = [invokers](context& c, const fusion& ff, const std::vector<argument>& args) {
            auto fargs = make_fused_args();
            for(auto&& invoker : invokers)
                invoker(fargs, args);
            ff.execute(c, fargs, args.front(), args.back());
        };
        return {{"workspace", f.get_workspace(ctx).bytes()}};
    }
    void finalize(context& ctx, const shape& output_shape, const std::vector<shape>& inputs)
    {
        if(not f.empty())
            return;
        auto v = compile(ctx, output_shape, inputs);
        if(not v.is_object())
            MIGRAPHX_THROW("Failed to compile fusion plan");
    }
    std::string name() const { return "gpu::miopen_fusion"; }
    shape compute_shape(const std::vector<shape>& inputs) const
    {
        if(ops.empty())
            return {};
        // TODO: Check number of arguments
        return ops.front().op.compute_shape({inputs[0], inputs[1]});
    }
    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
    {
        execute(ctx, f, args);
        return args.back();
    }
};

struct miopen_conv_bias
{
    op::convolution op;
...

@@ -596,7 +708,8 @@ struct miopen_conv_bias
        f    = fusion(inputs[0]);
        conv = f.create_conv(op, inputs[1]);
        bias = f.create_bias(inputs[3]);
-       f.compile(ctx);
        if(not f.compile(ctx))
            MIGRAPHX_THROW("Failed to compile fusion plan");
    }

    shape get_workspace(context& ctx) { return f.get_workspace(ctx); }
...

@@ -683,6 +796,25 @@ void apply_conv_bias(context& ctx, module& p, match::matcher_result r)
    p.replace_instruction(ins, cb, input_ins, weights_ins, old_ws_ins, bias_ins, alloc_ins);
}

inline auto precompile_name(std::string s) // NOLINT
{
    return match::make_basic_pred_matcher([=](instruction_ref ins) {
        if(ins->name() != "gpu::precompile_op")
            return false;
        auto op = from_value<operation>(ins->get_operator().to_value().at("op"));
        return (op.name() == s);
    });
}

template <class... Ms>
auto conv_bias_pointwise(Ms... ms)
{
    return precompile_name("pointwise")(
        match::either_arg(0, 1)(bias_shape(match::used_once()).bind("bias"),
                                fusable_conv(match::used_once()).bind("conv")),
        ms...);
}

struct find_conv_bias
{
    context* ctx = nullptr;
...

@@ -709,6 +841,46 @@ struct find_conv_bias_relu
    }
};

struct find_conv_pointwise
{
    context* ctx = nullptr;
    auto matcher() const
    {
        return precompile_name("pointwise")(
            match::nargs(3),
            match::either_arg(0, 1)(bias_shape(match::used_once()).bind("bias"),
                                    fusable_conv(match::used_once()).bind("conv")));
    }

    void apply(module& m, match::matcher_result r) const
    {
        auto conv_ins    = r.instructions["conv"];
        auto bias_ins    = r.instructions["bias"];
        auto ins         = r.result;
        auto input_ins   = conv_ins->inputs().at(0);
        auto weights_ins = conv_ins->inputs().at(1);
        auto conv_op     = any_cast<miopen_convolution>(conv_ins->get_operator()).op;
        auto alloc_ins   = ins->inputs().back();
        module_ref pm    = ins->module_inputs().front();
        miopen_fusion op{};
        op.ops.push_back({{conv_op}});
        for(auto&& i : *pm)
        {
            if(i.name()[0] == '@')
                continue;
            auto inputs = to_shapes(i.inputs());
            op.ops.push_back({{i.get_operator()}});
        }
        std::vector<instruction_ref> inputs = {input_ins, weights_ins, bias_ins, alloc_ins};
        auto v = op.compile(*ctx, ins->get_shape(), to_shapes(inputs));
        if(not v.is_object())
            return;
        m.replace_instruction(ins, op, inputs);
    }
};

struct find_gemm_add
{
    auto matcher() const
...

@@ -778,6 +950,7 @@ void fuse_ops::apply(module& p) const
    match::find_matches(p, find_triadd{});
    match::find_matches(p,
                        find_layernorm{},
                        find_conv_pointwise{ctx},
                        find_conv_bias_relu{ctx},
                        find_conv_bias{ctx},
                        find_add_gelu{},
...
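The new miopen_fusion operator exposes its members through the usual reflect/pack pattern (each member is paired with its name and handed to a visitor). A standalone C++17 sketch of that pattern, using plain std types instead of the MIGraphX reflect helpers, shows how a consumer unpacks the pairs:

#include <iostream>
#include <string>
#include <utility>

template <class... Ts>
auto pack(Ts... xs)
{
    // Capture the arguments and replay them into any callable.
    return [=](auto f) { return f(xs...); };
}

struct fuse_op_data
{
    std::string op = "convolution";
    float alpha    = 1;
    float beta     = 0;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack(f(self.op, "op"), f(self.alpha, "alpha"), f(self.beta, "beta"));
    }
};

int main()
{
    fuse_op_data d;
    // Pair each member with its name, then unpack and print "name=value" for every member.
    auto members = fuse_op_data::reflect(
        d, [](auto& x, const char* name) { return std::make_pair(name, x); });
    members([](auto... ps) { ((std::cout << ps.first << "=" << ps.second << "\n"), ...); });
}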
src/targets/gpu/include/migraphx/gpu/device/arg_op.hpp

@@ -76,8 +76,9 @@ void arg_op(Op op, hipStream_t stream, const argument& result, const argument& a
    size_t batch_item_num = batch_lens[axis];
    batch_lens[axis]      = 1;
    migraphx::shape batch_shape{arg_shape.type(), batch_lens};
    migraphx::shape std_arg_shape{arg_shape.type(), arg_shape.lens()};

-   hip_visit_all(arg, arg_shape, batch_shape)([&](auto input, auto arg_s, auto batch_s) {
    hip_visit_all(arg, std_arg_shape, batch_shape)([&](auto input, auto arg_s, auto batch_s) {
        auto* output = device_cast(result.get<int64_t>().data());
        using type   = device_type<std::remove_cv_t<typename decltype(input)::value_type>>;
        // use one block for items in one batch.
...
src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp (file mode changed from 100755 to 100644)

@@ -5,6 +5,9 @@
namespace migraphx {

#define MIGRAPHX_STRINGIZE_1(...) #__VA_ARGS__
#define MIGRAPHX_STRINGIZE(...) MIGRAPHX_STRINGIZE_1(__VA_ARGS__)

// Workaround hip's broken abort on device code
#ifdef __HIP_DEVICE_COMPILE__
// NOLINTNEXTLINE
...

@@ -14,19 +17,67 @@ namespace migraphx {
#define MIGRAPHX_HIP_NORETURN [[noreturn]]
#endif

namespace debug {

struct swallow
{
    template <class... Ts>
    constexpr swallow(Ts&&...)
    {
    }
};

template <size_t N>
struct print_buffer
{
    char buffer[N + 1] = {0};
    char* pos          = buffer;

    constexpr void append(char c)
    {
        if(c == 0)
            return;
        if(pos < buffer + N)
        {
            *pos = c;
            pos++;
        }
    }

    template <size_t M>
    constexpr void append(const char (&array)[M])
    {
        for(int i = 0; i < M; i++)
            append(array[i]);
    }
};

template <class... Ts>
__host__ __device__ void print(const Ts&... xs)
{
    const auto size = (sizeof(xs) + ...);
    print_buffer<size> buffer;
    swallow{(buffer.append(xs), 0)...};
    printf("%s", buffer.buffer);
}

} // namespace debug

// noreturn cannot be used on this function because abort in hip is broken
template <class T1, class T2, class T3, class T4>
MIGRAPHX_HIP_NORETURN inline __host__ __device__ void
-assert_fail(const char* assertion, const char* file, unsigned int line, const char* function)
assert_fail(const T1& assertion, const T2& file, const T3& line, const T4& function)
{
-   printf("%s:%u: %s: assertion '%s' failed.\n", file, line, function, assertion);
    // printf is broken on hip with more than one argument, so use a simple print functions instead
    debug::print(file, ":", line, ": ", function, ": assertion '", assertion, "' failed.\n");
    // printf("%s:%s: %s: assertion '%s' failed.\n", file, line, function, assertion);
    abort();
}

#ifdef MIGRAPHX_DEBUG
-#define MIGRAPHX_ASSERT(cond) \
-    ((cond) ? void(0) : [](auto... xs) { \
-        assert_fail(xs...); \
-    }(#cond, __FILE__, __LINE__, __PRETTY_FUNCTION__))
#define MIGRAPHX_ASSERT(cond)                                 \
    ((cond) ? void(0) : [](auto&&... private_migraphx_xs) {   \
        assert_fail(private_migraphx_xs...);                  \
    }(#cond, __FILE__, MIGRAPHX_STRINGIZE(__LINE__), __PRETTY_FUNCTION__))
#else
#define MIGRAPHX_ASSERT(cond)
#endif
...
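The rewritten assert_fail builds the whole message into one buffer and emits a single printf("%s", ...), working around hip's broken multi-argument printf. A host-only analogue of the same trick (an illustrative sketch, not the device header): the buffer size is summed from the sizeof of each string literal, and the pieces are appended by expanding a pack into a swallow initializer.

#include <cstddef>
#include <cstdio>

struct swallow
{
    template <class... Ts>
    swallow(Ts&&...)
    {
    }
};

template <std::size_t N>
struct print_buffer
{
    char buffer[N + 1] = {0};
    char* pos          = buffer;

    void append(char c)
    {
        if(c == 0)
            return;
        if(pos < buffer + N)
            *pos++ = c;
    }

    template <std::size_t M>
    void append(const char (&array)[M])
    {
        for(std::size_t i = 0; i < M; i++)
            append(array[i]);
    }
};

template <class... Ts>
void print(const Ts&... xs)
{
    // sizeof a string literal includes its trailing '\0', so the sum is a safe upper bound
    // on the concatenated length.
    constexpr auto size = (sizeof(Ts) + ...);
    print_buffer<size> buffer;
    // Braced-init-lists evaluate left to right, so the pieces are appended in order.
    swallow{(buffer.append(xs), 0)...};
    std::printf("%s", buffer.buffer);
}

int main()
{
    print("file.cpp", ":", "42", ": ", "my_function", ": assertion '", "x > 0", "' failed.\n");
}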
src/targets/gpu/kernels/include/migraphx/kernels/functional.hpp

@@ -137,12 +137,48 @@ constexpr void each_args(F)
{
}

template <class F, class T>
constexpr auto fold_impl(F&&, T&& x)
{
    return static_cast<T&&>(x);
}

template <class F, class T, class U, class... Ts>
constexpr auto fold_impl(F&& f, T&& x, U&& y, Ts&&... xs)
{
    return fold_impl(f, f(static_cast<T&&>(x), static_cast<U&&>(y)), static_cast<Ts&&>(xs)...);
}

template <class F>
constexpr auto fold(F f)
{
    return [=](auto&&... xs) { return fold_impl(f, static_cast<decltype(xs)&&>(xs)...); };
}

template <class... Ts>
-auto pack(Ts... xs)
constexpr auto pack(Ts... xs)
{
    return [=](auto f) { return f(xs...); };
}

template <class Compare, class P1, class P2>
constexpr auto pack_compare(Compare compare, P1 p1, P2 p2)
{
    return p1([&](auto... xs) {
        return p2([&](auto... ys) {
            auto c = [&](auto x, auto y) -> int {
                if(compare(x, y))
                    return 1;
                else if(compare(y, x))
                    return -1;
                else
                    return 0;
            };
            return fold([](auto x, auto y) { return x ? x : y; })(c(xs, ys)..., 0);
        });
    });
}

template <index_int N>
constexpr auto arg_c()
{
...

@@ -187,7 +223,7 @@ constexpr auto transform_args(F f, Fs... fs)
// NOLINTNEXTLINE
#define MIGRAPHX_LIFT(...) \
    [](auto&&... xs) MIGRAPHX_RETURNS((__VA_ARGS__)(static_cast<decltype(xs)>(xs)...))

} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_FUNCTIONAL_HPP
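fold() and pack_compare() are the building blocks the vectorizer below uses to pick the fastest non-broadcast axis: pack_compare compares two packs lexicographically and returns the first nonzero per-element comparison. A standalone C++17 sketch of that behaviour (plain functions, no kernel headers):

#include <cassert>

template <class F, class T>
auto fold_impl(F&&, T&& x)
{
    return static_cast<T&&>(x);
}

template <class F, class T, class U, class... Ts>
auto fold_impl(F&& f, T&& x, U&& y, Ts&&... xs)
{
    return fold_impl(f, f(static_cast<T&&>(x), static_cast<U&&>(y)), static_cast<Ts&&>(xs)...);
}

template <class F>
auto fold(F f)
{
    return [=](auto&&... xs) { return fold_impl(f, static_cast<decltype(xs)&&>(xs)...); };
}

template <class... Ts>
auto pack(Ts... xs)
{
    return [=](auto f) { return f(xs...); };
}

template <class Compare, class P1, class P2>
auto pack_compare(Compare compare, P1 p1, P2 p2)
{
    return p1([&](auto... xs) {
        return p2([&](auto... ys) {
            auto c = [&](auto x, auto y) -> int {
                if(compare(x, y))
                    return 1;
                else if(compare(y, x))
                    return -1;
                else
                    return 0;
            };
            // First nonzero element-wise comparison wins; 0 means the packs compare equal.
            return fold([](auto x, auto y) { return x ? x : y; })(c(xs, ys)..., 0);
        });
    });
}

int main()
{
    auto less = [](auto x, auto y) { return x < y; };
    assert(fold([](auto x, auto y) { return x + y; })(1, 2, 3, 4) == 10);
    // (1, 8) vs (1, 9): first elements tie, the second decides, so "less" holds.
    assert(pack_compare(less, pack(1, 8), pack(1, 9)) == 1);
    assert(pack_compare(less, pack(2, 8), pack(1, 9)) == -1);
    assert(pack_compare(less, pack(1, 8), pack(1, 8)) == 0);
    return 0;
}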
src/targets/gpu/kernels/include/migraphx/kernels/integral_constant.hpp

@@ -13,6 +13,7 @@ struct integral_constant
    using type = integral_constant;
    constexpr operator value_type() const noexcept { return value; }
    constexpr value_type operator()() const noexcept { return value; }
    static constexpr type to() { return {}; }
};

// NOLINTNEXTLINE
...
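A minimal sketch of why the new static to() helper is handy (illustrative code; the real header also has the _c<...> machinery): values such as the vectorization axis are carried around as integral_constant types, and Type::to() rebuilds the constant from the type alone, so generic lambdas in vectorize.hpp can read it without relying on a captured object being a constant expression.

#include <cstddef>

template <class T, T V>
struct integral_constant
{
    static constexpr T value = V;
    using value_type         = T;
    using type               = integral_constant;
    constexpr operator value_type() const noexcept { return value; }
    constexpr value_type operator()() const noexcept { return value; }
    static constexpr type to() { return {}; }
};

template <class Axis>
constexpr std::size_t divide_axis_len(std::size_t len, std::size_t n, Axis)
{
    // The parameter is unnamed; the constant is recovered from the type alone.
    constexpr auto axis = Axis::to();
    return axis == 1 ? len / n : len;
}

int main()
{
    using axis1 = integral_constant<std::size_t, 1>;
    static_assert(divide_axis_len(8, 4, axis1{}) == 2, "axis 1 is divided by the vector size");
    return 0;
}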
src/targets/gpu/kernels/include/migraphx/kernels/math.hpp

@@ -4,6 +4,7 @@
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/vec.hpp>
#include <migraphx/kernels/functional.hpp>
#include <migraphx/kernels/type_traits.hpp>
#include <hip/hip_fp16.h>
#include <hip/math_functions.h>
...

@@ -19,19 +20,30 @@ constexpr T as_float(T x)
} // namespace math

// NOLINTNEXTLINE
-#define MIGRAPHX_DEVICE_MATH(name, fname) \
-    template <class... Ts>                \
-    auto __device__ name(Ts... xs) MIGRAPHX_RETURNS(fname(xs...))
#define MIGRAPHX_DEVICE_MATH(name, fname)                               \
    template <class... Ts, MIGRAPHX_REQUIRES(not is_any_vec<Ts...>())>  \
    auto __device__ name(Ts... xs) MIGRAPHX_RETURNS(fname(xs...))

// NOLINTNEXTLINE
#define MIGRAPHX_DEVICE_MATH_VEC(name)                                        \
    template <class... Ts, MIGRAPHX_REQUIRES(is_any_vec<Ts...>())>            \
    auto __device__ name(Ts... xs)                                            \
    {                                                                         \
        return vec_transform(xs...)([](auto... ys) { return name(ys...); }); \
    }

// NOLINTNEXTLINE
-#define MIGRAPHX_DEVICE_MATH_FOR(type, name, fname) \
-    template <class... Ts>                          \
-    auto __device__ name(type x, Ts... xs) MIGRAPHX_RETURNS(fname(x, xs...))
#define MIGRAPHX_DEVICE_MATH_FOR(type, name, fname)                     \
    template <class... Ts, MIGRAPHX_REQUIRES(not is_any_vec<Ts...>())>  \
    auto __device__ name(type x, Ts... xs)->type                        \
    {                                                                   \
        return fname(x, xs...);                                         \
    }

// NOLINTNEXTLINE
-#define MIGRAPHX_DEVICE_MATH_HALF(name, fname) \
-    template <class... Ts>                     \
-    auto __device__ name(migraphx::half x, Ts... xs) \
#define MIGRAPHX_DEVICE_MATH_HALF(name, fname)                           \
    template <class... Ts, MIGRAPHX_REQUIRES(not is_any_vec<Ts...>())>   \
    auto __device__ name(migraphx::half x, Ts... xs)                     \
        MIGRAPHX_RETURNS(fname(math::as_float(x), math::as_float(xs)...))

MIGRAPHX_DEVICE_MATH(abs, ::abs)
...

@@ -99,21 +111,51 @@ MIGRAPHX_DEVICE_MATH_HALF(tan, ::tan)
MIGRAPHX_DEVICE_MATH_HALF(tanh, ::tanh)

template <class T, class U>
-constexpr auto& max(const T& a, const U& b)
constexpr auto where(bool cond, const T& a, const U& b)
{
-   return (a < b) ? b : a;
    return cond ? a : b;
}

MIGRAPHX_DEVICE_MATH_VEC(abs)
MIGRAPHX_DEVICE_MATH_VEC(acos)
MIGRAPHX_DEVICE_MATH_VEC(acosh)
MIGRAPHX_DEVICE_MATH_VEC(asin)
MIGRAPHX_DEVICE_MATH_VEC(asinh)
MIGRAPHX_DEVICE_MATH_VEC(atan)
MIGRAPHX_DEVICE_MATH_VEC(atanh)
MIGRAPHX_DEVICE_MATH_VEC(ceil)
MIGRAPHX_DEVICE_MATH_VEC(cos)
MIGRAPHX_DEVICE_MATH_VEC(cosh)
MIGRAPHX_DEVICE_MATH_VEC(erf)
MIGRAPHX_DEVICE_MATH_VEC(exp)
MIGRAPHX_DEVICE_MATH_VEC(floor)
MIGRAPHX_DEVICE_MATH_VEC(log)
MIGRAPHX_DEVICE_MATH_VEC(pow)
MIGRAPHX_DEVICE_MATH_VEC(round)
MIGRAPHX_DEVICE_MATH_VEC(rsqrt)
MIGRAPHX_DEVICE_MATH_VEC(sin)
MIGRAPHX_DEVICE_MATH_VEC(sinh)
MIGRAPHX_DEVICE_MATH_VEC(sqrt)
MIGRAPHX_DEVICE_MATH_VEC(tan)
MIGRAPHX_DEVICE_MATH_VEC(tanh)
MIGRAPHX_DEVICE_MATH_VEC(where)

template <class T, class U>
-constexpr auto& min(const T& a, const U& b)
constexpr auto max(const T& a, const U& b)
{
-   return (a > b) ? b : a;
    return where(a < b, b, a);
}

template <class T, class U>
-constexpr T convert(U x)
constexpr auto min(const T& a, const U& b)
{
-   return x;
    return where(a > b, b, a);
}

template <class T, class U>
constexpr auto convert(U v)
{
    return vec_transform(v)([](auto x) -> T { return x; });
}

} // namespace migraphx
...
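A small host-side sketch of the reworked helpers above (illustrative code, not the kernel header): where(cond, a, b) becomes the single branching primitive, and max/min are rewritten on top of it, which is what lets the MIGRAPHX_DEVICE_MATH_VEC wrapper lift them elementwise over vector arguments.

#include <cassert>

template <class T, class U>
constexpr auto where(bool cond, const T& a, const U& b)
{
    return cond ? a : b;
}

template <class T, class U>
constexpr auto max(const T& a, const U& b)
{
    return where(a < b, b, a);
}

template <class T, class U>
constexpr auto min(const T& a, const U& b)
{
    return where(a > b, b, a);
}

int main()
{
    static_assert(max(3, 7) == 7 and min(3, 7) == 3, "scalar behaviour is unchanged");
    static_assert(where(true, 1.5, 2.5) == 1.5, "where picks the first argument when true");
    return 0;
}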
src/targets/gpu/kernels/include/migraphx/kernels/pointwise.hpp (file mode changed from 100755 to 100644)

@@ -10,13 +10,38 @@
namespace migraphx {

template <class T>
struct implicit_conversion_op
{
    T x;

    template <index_int N, class U>
    constexpr operator vec<U, N>() const
    {
        static_assert(vec_size<T>() == N, "Vector mismatch size");
        return __builtin_convertvector(x, vec<U, N>);
    }

    template <class U>
    constexpr operator U() const
    {
        return x;
    }
};

template <class T>
constexpr implicit_conversion_op<T> implicit_conversion(T x)
{
    return {x};
}

template <class F, class T, class... Ts>
__device__ void pointwise_tensor(index idx, F f, T out, Ts... xs)
{
    preload<typename T::type>(idx, xs...)([&](auto... ps) {
        idx.global_stride(out.get_shape().elements(), [&](auto i) {
            auto multi_idx = out.get_shape().multi(i);
-           out[multi_idx] = f(ps[multi_idx]...);
            out[multi_idx] = implicit_conversion(f(ps[multi_idx]...));
        });
    });
}
...

@@ -24,7 +49,7 @@ __device__ void pointwise_tensor(index idx, F f, T out, Ts... xs)
template <class F, class... Ts>
__device__ void pointwise(F f, Ts*... ps)
{
-   auto t = transform_args(make_tensors(), rotate_last());
    auto t = transform_args(make_tensors(), rotate_last(), auto_vectorize());
    t(ps...)([&](auto... xs) {
        auto idx = make_index();
        pointwise_tensor(idx, f, xs...);
...
src/targets/gpu/kernels/include/migraphx/kernels/preload.hpp (file mode changed from 100755 to 100644)

@@ -29,7 +29,7 @@ constexpr auto traverse_preload(Shapes... ss)
}

template <class T, class... Shapes>
-constexpr index_int compute_preload_size(Shapes...)
constexpr index_int compute_preload_size_c(Shapes...)
{
    index_int size = 0;
    traverse_preload<T>(Shapes{}...)(
...

@@ -37,6 +37,12 @@ constexpr index_int compute_preload_size(Shapes...)
    return size;
}

template <class T, class... Shapes>
constexpr auto compute_preload_size(Shapes...)
{
    return _c<compute_preload_size_c<T>(Shapes{}...)>;
}

template <class F, class T, class... Ts>
__device__ auto preload_copy(index idx, F f, __shared__ T* buffer, Ts... xs)
{
...

@@ -48,11 +54,21 @@ __device__ auto preload_copy(index idx, F f, __shared__ T* buffer, Ts... xs)
        [&](auto x, auto offset, auto copy) {
            if constexpr(copy)
            {
-               auto v = vectorize(x);
-               auto b = as_vec(tensor_vec_size(v), buffer + offset);
-               idx.local_stride(v.get_shape().element_space(),
-                                [&](auto i) { b[i] = v.data()[i]; });
-               return x.with(buffer + offset);
                if constexpr(decltype(tensor_vec_size(x)){} == 0)
                {
                    auto v = vectorize(x);
                    auto b = as_vec(tensor_vec_size(v), buffer + offset);
                    idx.local_stride(v.get_shape().element_space(),
                                     [&](auto i) { b[i] = v.data()[i]; });
                    return x.with(buffer + offset);
                }
                else
                {
                    auto b = as_vec(tensor_vec_size(x), buffer + offset);
                    idx.local_stride(x.get_shape().element_space(),
                                     [&](auto i) { b[i] = x.data()[i]; });
                    return x.with(b);
                }
            }
            else
            {
...

@@ -78,7 +94,7 @@ template <class T, class... Ts>
__device__ auto preload(index idx, Ts... xs)
{
    using type = typename remove_vec<T>::type;
-   constexpr auto size = compute_preload_size<type>(xs.get_shape()...);
    constexpr auto size      = decltype(compute_preload_size<type>(xs.get_shape()...)){};
    const index_int max_size = 512 * sizeof(type);
    return [=](auto f) {
        if constexpr(size > 0 and size < max_size)
...
src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp (file mode changed from 100755 to 100644)

@@ -3,6 +3,7 @@
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/integral_constant.hpp>
#include <migraphx/kernels/functional.hpp>

namespace migraphx {
...

@@ -24,6 +25,38 @@ constexpr auto vec_size()
    return decltype(vec_size(T{})){};
}

template <class... Ts>
constexpr auto is_any_vec()
{
    if constexpr(sizeof...(Ts) == 0)
        return false_type{};
    else
        return bool_constant<((vec_size<Ts>() + ...) > 0)>{};
}

template <class T, class I>
constexpr auto vec_at(T x, I i)
{
    if constexpr(vec_size<T>() == 0)
        return x;
    else
    {
        MIGRAPHX_ASSERT(i < vec_size<T>());
        return x[i];
    }
}

template <class... Ts>
constexpr auto common_vec_size()
{
    return fold([](auto x, auto y) {
        if constexpr(x > y)
            return x;
        else
            return y;
    })(vec_size<Ts>()...);
}

template <index_int N, class T>
__device__ __host__ auto as_vec(T* x)
{
...

@@ -33,5 +66,25 @@ __device__ __host__ auto as_vec(T* x)
    return reinterpret_cast<vec<T, N>*>(x);
}

template <class... Ts>
constexpr auto vec_transform(Ts... xs)
{
    return [=](auto f) {
        if constexpr(is_any_vec<Ts...>())
        {
            using type             = decltype(f(vec_at(xs, 0)...));
            constexpr auto size    = common_vec_size<Ts...>();
            vec<type, size> result = {0};
            for(int i = 0; i < size; i++)
                result[i] = f(vec_at(xs, i)...);
            return result;
        }
        else
        {
            return f(xs...);
        }
    };
}

} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_VEC_HPP
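A standalone analogue of the new vec_transform (fixed arity, using a GCC-style vector extension in place of the kernel vec<T, N> alias; illustrative only): scalars are broadcast because vec_at ignores the lane index for them, while vector arguments are indexed lane by lane, so a scalar function is applied elementwise across the widest argument.

#include <cassert>

// 4 lanes of float.
typedef float float4 __attribute__((vector_size(16)));

// vec_at: scalars ignore the lane index, vectors are indexed.
float vec_at(float x, int) { return x; }
float vec_at(float4 x, int i) { return x[i]; }

template <class F>
float4 vec_transform(float4 x, float y, F f)
{
    float4 result = {0, 0, 0, 0};
    for(int i = 0; i < 4; i++)
        result[i] = f(vec_at(x, i), vec_at(y, i));
    return result;
}

int main()
{
    float4 x = {1, 2, 3, 4};
    // The scalar 10.0f is broadcast against every lane of x.
    float4 r = vec_transform(x, 10.0f, [](float a, float b) { return a * b; });
    assert(r[0] == 10 && r[1] == 20 && r[2] == 30 && r[3] == 40);
    return 0;
}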
src/targets/gpu/kernels/include/migraphx/kernels/vectorize.hpp

@@ -7,40 +7,70 @@
namespace migraphx {

template <class T>
-constexpr auto tensor_vec_size(T)
constexpr auto tensor_vec_size()
{
    return vec_size<typename T::type>();
}

-template <index_int N, class Shape>
-constexpr auto as_vec_shape(Shape s)
-{
-    auto lens = transform(s.lens, s.strides, [](auto len, auto stride) {
-        if(stride == 1)
-            return len / N;
-        else
-            return len;
-    });
-    auto strides = transform(s.strides, [](auto stride) {
-        if(stride == 1)
-            return stride;
-        return stride / N;
-    });
-    MIGRAPHX_ASSERT(make_shape(lens, strides).element_space() * N == s.element_space());
-    return make_shape(lens, strides);
-}

template <class T>
constexpr auto tensor_vec_size(T)
{
    return tensor_vec_size<T>();
}

template <index_int N, class Shape, class Axis>
constexpr auto shape_step(Shape s, Axis)
{
    static_assert(N > 0, "Vector size must be non-zero");
    return sequence(s.lens.size(), [&](auto... is) {
        auto lens = transform(s.lens, index_ints<is...>{}, [&](auto i, auto j) {
            constexpr auto axis = Axis::to();
            MIGRAPHX_ASSERT(i != 0);
            MIGRAPHX_ASSERT(j != axis or i % N == 0);
            if(j == axis)
                return i / N;
            else
                return i;
        });
        auto strides = transform(s.strides, index_ints<is...>{}, [&](auto i, auto j) {
            constexpr auto axis = Axis::to();
            // If stride of the axis is zero then we dont need to adjust the other strides
            if(Shape{}.strides[axis] == 0)
                return i;
            MIGRAPHX_ASSERT(j == axis or i % N == 0);
            if(j == axis)
                return i;
            else
                return i / N;
        });
        MIGRAPHX_ASSERT(make_shape(lens, strides).elements() * N == s.elements());
        MIGRAPHX_ASSERT(strides[Axis{}] == 0 or
                        make_shape(lens, strides).element_space() * N == s.element_space());
        return make_shape(lens, strides);
    });
}

// Bools can not be used as a vector type so convert it to int8
template <class T>
__device__ __host__ T* remove_bool(T* x)
{
    return x;
}

inline __device__ __host__ int8_t* remove_bool(bool* x) { return reinterpret_cast<int8_t*>(x); }

-template <index_int N, class T>
-__device__ __host__ auto as_vec(T x)
template <index_int N, class T, class Axis>
__device__ __host__ auto as_vec(T x, Axis axis)
{
    if constexpr(N == 0)
        return x;
    else
-       return make_tensor_view(as_vec<N>(x.data()), as_vec_shape<N>(x.get_shape()));
        return make_tensor_view(as_vec<N>(remove_bool(x.data())),
                                shape_step<N>(x.get_shape(), axis));
}

template <index_int N, class T, class Axis>
-constexpr auto tensor_step(T x, Axis)
constexpr auto tensor_step(T x, Axis axis)
{
    if constexpr(N == 0)
    {
...

@@ -49,17 +79,8 @@ constexpr auto tensor_step(T x, Axis)
    else
    {
        constexpr auto s = decltype(x.get_shape()){};
-       MIGRAPHX_ASSERT(s.strides[Axis{}] == 0);
-       return sequence(x.get_shape().lens.size(), [&](auto... is) {
-           auto lens = transform(s.lens, index_ints<is...>{}, [&](auto i, auto j) {
-               constexpr auto axis = Axis{};
-               if(j == axis)
-                   return i / N;
-               else
-                   return i;
-           });
-           return make_tensor_view(x.data(), make_shape(lens, s.strides));
-       });
        MIGRAPHX_ASSERT(s.strides[axis] == 0);
        return make_tensor_view(x.data(), shape_step<N>(s, axis));
    }
}
...

@@ -69,45 +90,71 @@ __device__ __host__ auto as_vec(IntegralConstant ic, T&& x)
    return as_vec<ic>(x);
}

-template <class... Shapes>
-constexpr index_int find_vector_axis(Shapes... ss)
template <class Shape>
constexpr index_int find_vector_axis_c(Shape s)
{
    // Find the fastest axis that is not broadcasted
    index_int axis = 0;
    for(index_int i = 1; i < s.lens.size(); i++)
    {
        if(s.strides[i] == 0)
            continue;
        if(s.strides[axis] == 0 or
           pack_compare(less{},
                        pack(s.strides[i], s.lens[i]),
                        pack(s.strides[axis], s.lens[axis])))
            axis = i;
    }
    return axis;
}

template <class... Shapes>
constexpr index_int find_vector_axis_c(Shapes... ss)
{
    const bool all_broadcasted = (ss.broadcasted() and ...);
    index_int axis = 0;
    bool b         = false;
    by([&](auto s) {
        if(b)
            return;
-       auto it = find(s.strides.begin(), s.strides.end(), 1);
-       if(it == s.strides.end())
-           return;
-       axis = it - s.strides.begin();
-       b    = true;
        // Skip broadcasted shapes if there are shapes not broadcasted
        if(not all_broadcasted and s.broadcasted())
            return;
        axis = find_vector_axis_c(s);
        if(s.strides[axis] == 1)
            b = true;
    })(ss...);
    if(not b)
        return -1;
    return axis;
}

template <class... Shapes>
constexpr auto find_vector_axis(Shapes...)
{
    return _c<find_vector_axis_c(Shapes{}...)>;
}

template <index_int N, class Axis, class... Shapes>
-constexpr auto is_vectorizable(Axis axis, Shapes... ss)
constexpr auto is_vectorizable_c(Axis axis, Shapes... ss)
{
-   return (((ss.lens[axis] % N) == 0 and ss.strides[axis] == 1) and ...);
    return ((axis < ss.lens.size() and ss.lens[axis] % N == 0 and
             // Only vectorize broadcasted types with stride 0, since this causes issues in the
             // preloader
             ((not ss.broadcasted() and ss.strides[axis] == 1) or ss.strides[axis] == 0)) and
            ...);
}

-template <index_int N, class Shape>
-constexpr bool is_vectorizable(Shape s)
-{
-    auto it = find(s.strides.begin(), s.strides.end(), 1);
-    if(it == s.strides.end())
-        return false;
-    auto axis = it - s.strides.begin();
-    return (s.lens[axis] % N) == 0 and s.strides[axis] == 1;
-}
template <index_int N, class Axis, class... Shapes>
constexpr auto is_vectorizable(Axis, Shapes...)
{
    return _c<is_vectorizable_c<N>(Axis::to(), Shapes{}...)>;
}

template <class P>
constexpr auto find_vectorize_size(P pred)
{
-   if constexpr(pred(_c<4>))
    if constexpr(decltype(pred(_c<4>)){})
        return _c<4>;
-   else if constexpr(pred(_c<2>))
    else if constexpr(decltype(pred(_c<2>)){})
        return _c<2>;
    else
        return _c<0>;
...

@@ -116,11 +163,12 @@ constexpr auto find_vectorize_size(P pred)
template <class T>
__host__ __device__ auto vectorize(T x)
{
-   if constexpr(vec_size<T>() == 0)
    if constexpr(tensor_vec_size<T>() == 0)
    {
-       constexpr auto n = find_vectorize_size(
-           [&](auto i) { return _c<is_vectorizable<i>(x.get_shape())>; });
-       return as_vec<n>(x);
        constexpr auto axis = find_vector_axis(x.get_shape());
        constexpr auto n =
            find_vectorize_size([&](auto i) { return is_vectorizable<i>(axis, x.get_shape()); });
        return as_vec<n>(x, axis);
    }
    else
    {
...

@@ -128,34 +176,46 @@ __host__ __device__ auto vectorize(T x)
    }
}

template <class F, class... Ts>
inline __device__ __host__ auto auto_vectorize_impl(F f, Ts... xs)
{
    // TODO: Just check there a single axis of 1
    constexpr bool packed_or_broadcasted =
        ((xs.get_shape().packed() or xs.get_shape().broadcasted()) and ...);
    if constexpr(packed_or_broadcasted)
    {
        constexpr auto axis = decltype(find_vector_axis(xs.get_shape()...)){};
        constexpr auto n    = find_vectorize_size(
            [&](auto i) { return is_vectorizable<i>(axis, xs.get_shape()...); });
        by(
            [&](auto x) {
                constexpr auto s = decltype(x.get_shape()){};
                if constexpr(axis < s.strides.size())
                {
                    MIGRAPHX_ASSERT(s.strides[axis] == 0 or s.strides[axis] == 1);
                    MIGRAPHX_ASSERT(s.lens[axis] > 0);
                    MIGRAPHX_ASSERT(n == 0 or s.lens[axis] % n == 0);
                    if constexpr(s.strides[axis] == 0)
                        return tensor_step<n>(x, axis);
                    else
                        return as_vec<n>(x, axis);
                }
                else
                {
                    return x;
                }
            },
            f)(xs...);
    }
    else
    {
        f(xs...);
    }
}

inline __device__ __host__ auto auto_vectorize()
{
-   return [](auto... xs) {
-       return [=](auto f) {
-           // TODO: Just check there a single axis of 1
-           constexpr bool packed_or_broadcasted =
-               ((xs.get_shape().packed() or xs.get_shape().broadcasted()) and ...);
-           if constexpr(packed_or_broadcasted)
-           {
-               constexpr auto axis = find_vector_axis(xs.get_shape()...);
-               constexpr auto n    = find_vectorize_size(
-                   [&](auto i) { return _c<is_vectorizable<i>(axis, xs.get_shape()...)>; });
-               by(
-                   [&](auto x) {
-                       constexpr auto s = x.get_shape();
-                       if constexpr(s.strides[axis] == 0)
-                           return tensor_step<n>(x, axis);
-                       else
-                           return as_vec<n>(x);
-                   },
-                   f)(xs...);
-           }
-           else
-           {
-               f(xs...);
-           }
-       };
-   };
    return [](auto... xs) { return [=](auto f) { auto_vectorize_impl(f, xs...); }; };
}

} // namespace migraphx
...
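A worked host-side sketch of the shape_step<N> arithmetic above, using plain std::vector in place of the constexpr kernel shape types (illustrative only): vectorizing by N on the fastest axis divides that axis's length by N and divides the other strides by N, so the element count shrinks by exactly N while the vectorized axis keeps stride 1; a broadcast axis (stride 0) leaves the other strides untouched.

#include <cassert>
#include <cstddef>
#include <vector>

struct simple_shape
{
    std::vector<std::size_t> lens;
    std::vector<std::size_t> strides;
};

simple_shape shape_step(const simple_shape& s, std::size_t axis, std::size_t n)
{
    simple_shape r = s;
    r.lens[axis] /= n;
    // When the vectorized axis is not broadcast (stride != 0), every other stride shrinks by n.
    if(s.strides[axis] != 0)
    {
        for(std::size_t j = 0; j < r.strides.size(); j++)
            if(j != axis)
                r.strides[j] /= n;
    }
    return r;
}

int main()
{
    // A packed {2, 3, 8} tensor with strides {24, 8, 1}, vectorized by 4 on the last axis.
    simple_shape s{{2, 3, 8}, {24, 8, 1}};
    auto v = shape_step(s, 2, 4);
    assert((v.lens == std::vector<std::size_t>{2, 3, 2}));
    assert((v.strides == std::vector<std::size_t>{6, 2, 1}));
    // 2*3*2 vector elements of width 4 cover the original 2*3*8 scalars.
    return 0;
}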
test/onnx/gen_onnx.py

@@ -1618,6 +1618,22 @@ def greater_bool_test():
    return ([node1, node2], [x1, x2], [y])


@onnx_test
def greaterorequal_test():
    x1 = helper.make_tensor_value_info('x1', TensorProto.FLOAT, [3])
    x2 = helper.make_tensor_value_info('x2', TensorProto.FLOAT, [3])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [3])

    node = onnx.helper.make_node(
        'GreaterOrEqual',
        inputs=['x1', 'x2'],
        outputs=['y'],
    )

    return ([node], [x1, x2], [y])


@onnx_test
def group_conv_test():
    x = helper.make_tensor_value_info('0', TensorProto.FLOAT, [1, 4, 16, 16])
...

@@ -1634,6 +1650,60 @@ def group_conv_test():
    return ([node], [x, y], [z])


@onnx_test
def hardsigmoid_default_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [1, 3, 4, 5])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [1, 3, 4, 5])

    node = onnx.helper.make_node('HardSigmoid', inputs=['x'], outputs=['y'])

    return ([node], [x], [y])


@onnx_test
def hardsigmoid_double_test():
    x = helper.make_tensor_value_info('x', TensorProto.DOUBLE, [1, 3, 4, 5])
    y = helper.make_tensor_value_info('y', TensorProto.DOUBLE, [1, 3, 4, 5])

    node = onnx.helper.make_node('HardSigmoid',
                                 inputs=['x'],
                                 outputs=['y'],
                                 alpha=0.3,
                                 beta=0.7)

    return ([node], [x], [y])


@onnx_test
def hardsigmoid_half_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT16, [1, 3, 4, 5])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT16, [1, 3, 4, 5])

    node = onnx.helper.make_node('HardSigmoid', inputs=['x'], outputs=['y'])

    return ([node], [x], [y])


@onnx_test
def hardsigmoid_verify_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [2, 5])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 5])

    node = onnx.helper.make_node('HardSigmoid', inputs=['x'], outputs=['y'])

    return ([node], [x], [y])


@onnx_test
def hardswish_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [2, 5])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 5])

    node = onnx.helper.make_node('HardSwish', inputs=['x'], outputs=['y'])

    return ([node], [x], [y])


@onnx_test
def if_else_test():
    x = onnx.helper.make_tensor_value_info('x', onnx.TensorProto.FLOAT, [2, 3])
...

@@ -2692,6 +2762,80 @@ def maxpool_same_upper_test():
    return ([node], [x], [y])


@onnx_test
def mean_broadcast_test():
    data_0 = helper.make_tensor_value_info('0', TensorProto.FLOAT, [1, 3, 4])
    data_1 = helper.make_tensor_value_info('1', TensorProto.FLOAT, [1, 2, 3, 4])
    data_2 = helper.make_tensor_value_info('2', TensorProto.FLOAT, [4])
    data_3 = helper.make_tensor_value_info('3', TensorProto.FLOAT, [1])
    data_4 = helper.make_tensor_value_info('4', TensorProto.FLOAT, [2, 3, 1])
    mean = helper.make_tensor_value_info('mean', TensorProto.FLOAT, [1, 2, 3, 4])

    node = onnx.helper.make_node("Mean",
                                 inputs=["0", "1", "2", "3", "4"],
                                 outputs=["mean"])

    return ([node], [data_0, data_1, data_2, data_3, data_4], [mean])


@onnx_test
def mean_fp16_test():
    data_0 = helper.make_tensor_value_info('0', TensorProto.FLOAT16, [1, 2, 3])
    data_1 = helper.make_tensor_value_info('1', TensorProto.FLOAT16, [1, 2, 3])
    data_2 = helper.make_tensor_value_info('2', TensorProto.FLOAT16, [1, 2, 3])
    mean = helper.make_tensor_value_info('mean', TensorProto.FLOAT16, [1, 2, 3])

    node = onnx.helper.make_node("Mean", inputs=["0", "1", "2"], outputs=["mean"])

    return ([node], [data_0, data_1, data_2], [mean])


@onnx_test
def mean_invalid_broadcast_test():
    data_0 = helper.make_tensor_value_info('0', TensorProto.FLOAT, [1, 2, 3])
    data_1 = helper.make_tensor_value_info('1', TensorProto.FLOAT, [1, 2, 3])
    data_2 = helper.make_tensor_value_info('2', TensorProto.FLOAT, [1, 2, 4])
    mean = helper.make_tensor_value_info('mean', TensorProto.FLOAT, [1, 2, 3])

    node = onnx.helper.make_node("Mean", inputs=["0", "1", "2"], outputs=["mean"])

    return ([node], [data_0, data_1, data_2], [mean])


@onnx_test
def mean_single_input_test():
    data_0 = helper.make_tensor_value_info('0', TensorProto.FLOAT, [1, 2, 3])
    mean = helper.make_tensor_value_info('mean', TensorProto.FLOAT, [1, 2, 3])

    node = onnx.helper.make_node("Mean", inputs=["0"], outputs=["mean"])

    return ([node], [data_0], [mean])


@onnx_test
def mean_test():
    data = [
        helper.make_tensor_value_info(str(i), TensorProto.DOUBLE, [2, 2, 2])
        for i in range(10)
    ]
    data_names = [str(i) for i in range(10)]
    mean = helper.make_tensor_value_info('mean', TensorProto.DOUBLE, [2, 2, 2])

    node = onnx.helper.make_node("Mean", inputs=data_names, outputs=["mean"])

    return ([node], data, [mean])


@onnx_test
def min_test():
    a = helper.make_tensor_value_info('0', TensorProto.FLOAT, [3])
...

@@ -2725,6 +2869,21 @@ def multinomial_test():
    return ([node], [input], [output])


@onnx_test
def multinomial_generated_seed_test():
    sample_size = 10
    input = helper.make_tensor_value_info("input", TensorProto.FLOAT, [1, 10])
    output = helper.make_tensor_value_info("output", TensorProto.INT32, [1, 10])

    node = onnx.helper.make_node('Multinomial',
                                 inputs=['input'],
                                 sample_size=sample_size,
                                 outputs=['output'])

    return ([node], [input], [output])


@onnx_test
def multinomial_dtype_error_test():
    sample_size = 10
...

@@ -3176,6 +3335,21 @@ def randomnormal_dtype_error_test():
    return ([node], [], [output])


@onnx_test
def randomnormal_generated_seed_test():
    sample_size = 10
    input = helper.make_tensor_value_info("input", TensorProto.FLOAT, [1, 10])
    output = helper.make_tensor_value_info("output", TensorProto.INT32, [1, 10])

    node = onnx.helper.make_node('RandomNormal',
                                 inputs=['input'],
                                 sample_size=sample_size,
                                 outputs=['output'])

    return ([node], [input], [output])


@onnx_test
def randomnormal_shape_error_test():
    dtype = 1
...

@@ -3266,6 +3440,21 @@ def randomuniform_dtype_error_test():
    return ([node], [], [output])


@onnx_test
def randomuniform_generated_seed_test():
    sample_size = 10
    input = helper.make_tensor_value_info("input", TensorProto.FLOAT, [1, 10])
    output = helper.make_tensor_value_info("output", TensorProto.INT32, [1, 10])

    node = onnx.helper.make_node('RandomUniform',
                                 inputs=['input'],
                                 sample_size=sample_size,
                                 outputs=['output'])

    return ([node], [input], [output])


@onnx_test
def randomuniform_shape_error_test():
    dtype = 1
...

@@ -4290,6 +4479,44 @@ def softmax_nonstd_input_test():
    return ([node0, node1], [x], [y])


@onnx_test
def softsign_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [5])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [5])

    node = onnx.helper.make_node('Softsign', inputs=['x'], outputs=['y'])

    return ([node], [x], [y])


def softplus_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [5])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [5])

    node = onnx.helper.make_node('Softplus', inputs=['x'], outputs=['y'])

    return ([node], [x], [y])


@onnx_test
def softsign_nd_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT16, [3, 4, 5])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT16, [3, 4, 5])

    node = onnx.helper.make_node('Softsign', inputs=['x'], outputs=['y'])

    return ([node], [x], [y])


def softplus_nd_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT16, [3, 4, 5])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT16, [3, 4, 5])

    node = onnx.helper.make_node('Softplus', inputs=['x'], outputs=['y'])

    return ([node], [x], [y])


@onnx_test
def split_minus_axis_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [10, 15])
...

@@ -4847,6 +5074,25 @@ def unknown_aten_test():
    return ([node], [x, y], [a])


@onnx_test
def upsample_linear_test():
    scales = np.array([1.0, 1.0, 2.0, 2.0], dtype=np.float32)
    scales_tensor = helper.make_tensor(name='scales',
                                       data_type=TensorProto.FLOAT,
                                       dims=scales.shape,
                                       vals=scales.flatten().astype(np.float32))
    X = helper.make_tensor_value_info('X', TensorProto.FLOAT, [1, 1, 2, 2])
    Y = helper.make_tensor_value_info('Y', TensorProto.FLOAT, [])

    node = onnx.helper.make_node('Upsample',
                                 inputs=['X', '', 'scales'],
                                 outputs=['Y'],
                                 mode='linear')

    return ([node], [X], [Y], [scales_tensor])


@onnx_test
def upsample_test():
    scales = np.array([1.0, 1.0, 2.0, 3.0], dtype=np.float32)
...
test/onnx/gen_onnx.pyc

(binary file; no preview for this file type)
test/onnx/greaterorequal_test.onnx (new file, mode 100644)

(binary ONNX protobuf; the text preview only shows fragments of the graph "greaterorequal_test" with inputs x1, x2, a GreaterOrEqual node, and output y)
test/onnx/hardsigmoid_default_test.onnx (new file, mode 100644)

(binary ONNX protobuf; the text preview only shows fragments of the graph "hardsigmoid_default_test" with input x, a HardSigmoid node, and output y)