Merge remote-tracking branch 'origin/develop' into ck-poc3

99604c1c · Alan Turner · 78a300ff · f7d987ba · 99604c1c · 99604c1c
Commit 99604c1c authored Oct 07, 2022 by Alan Turner
20 changed files
--- a/src/onnx/parse_batchnorm.cpp
+++ b/src/onnx/parse_batchnorm.cpp
@@ -24,7 +24,7 @@
 #include <migraphx/onnx/op_parser.hpp>
 #include <migraphx/ranges.hpp>
 #include <migraphx/make_op.hpp>
-#include <migraphx/op/batch_norm_inference.hpp>
+#include <migraphx/instruction.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -36,28 +36,63 @@ struct parse_batchnorm : op_parser<parse_batchnorm>
    instruction_ref parse(const op_desc& /*opd*/,
                          const onnx_parser& parser,
-                          onnx_parser::node_info info,
+                          const onnx_parser::node_info& info,
-                          const std::vector<instruction_ref>& args) const
+                          std::vector<instruction_ref> args) const
    {
-        float epsilon                                     = 1e-5f;
+        float epsilon = 1e-5f;
-        float momentum                                    = 0.9f;
-        op::batch_norm_inference::bn_infer_mode_t bn_mode = op::batch_norm_inference::spatial;
        if(contains(info.attributes, "epsilon"))
        {
            epsilon = parser.parse_value(info.attributes.at("epsilon")).at<float>();
        }
-        if(contains(info.attributes, "momentum"))
+        auto x_lens = args[0]->get_shape().lens();
+        auto x_type = args[0]->get_shape().type();
+        if(std::any_of(args.cbegin() + 1, args.cend(), [](auto a) {
+               return a->get_shape().lens().size() != 1;
+           }))
+        {
+            MIGRAPHX_THROW("PARSE_BATCHNORM: argument scale, bias, mean, or var rank != 1");
+        }
+        if(x_lens.size() == 1)
+        {
+            auto rt   = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {0.5}});
+            auto eps  = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {epsilon}});
+            auto n0   = info.add_broadcastable_binary_op("sub", args[0], args[3]);
+            auto d0   = info.add_broadcastable_binary_op("add", args[4], eps);
+            auto d1   = info.add_broadcastable_binary_op("pow", d0, rt);
+            auto div0 = info.add_broadcastable_binary_op("div", n0, d1);
+            auto r0   = info.add_broadcastable_binary_op("mul", div0, args[1]);
+            return info.add_broadcastable_binary_op("add", r0, args[2]);
+        }
+        else if(x_lens.size() > 2)
        {
-            momentum = parser.parse_value(info.attributes.at("momentum")).at<float>();
+            // unsqueeze tensors of shape (C) to broadcast correctly
+            std::vector<int64_t> unsqueeze_axes(x_lens.size() - 2);
+            std::iota(unsqueeze_axes.begin(), unsqueeze_axes.end(), 1);
+            auto rt  = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {0.5}});
+            auto eps = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {epsilon}});
+            auto scale_unsqueeze = info.add_instruction(
+                migraphx::make_op("unsqueeze", {{"axes", unsqueeze_axes}}), args[1]);
+            auto bias_unsqueeze = info.add_instruction(
+                migraphx::make_op("unsqueeze", {{"axes", unsqueeze_axes}}), args[2]);
+            auto mean_unsqueeze = info.add_instruction(
+                migraphx::make_op("unsqueeze", {{"axes", unsqueeze_axes}}), args[3]);
+            auto var_unsqueeze = info.add_instruction(
+                migraphx::make_op("unsqueeze", {{"axes", unsqueeze_axes}}), args[4]);
+            auto numer   = info.add_broadcastable_binary_op("sub", args[0], mean_unsqueeze);
+            auto var_eps = info.add_broadcastable_binary_op("add", var_unsqueeze, eps);
+            auto denom   = info.add_broadcastable_binary_op("pow", var_eps, rt);
+            auto div0    = info.add_broadcastable_binary_op("div", numer, denom);
+            auto r0      = info.add_broadcastable_binary_op("mul", div0, scale_unsqueeze);
+            return info.add_broadcastable_binary_op("add", r0, bias_unsqueeze);
        }
-        if(contains(info.attributes, "spatial"))
+        else
        {
-            bn_mode = (parser.parse_value(info.attributes.at("spatial")).at<uint64_t>() > 0)
+            // num dims either 0 or 2
-                          ? op::batch_norm_inference::spatial
+            MIGRAPHX_THROW("PARSE_BATCHNORM: rank " + std::to_string(x_lens.size()) +
-                          : op::batch_norm_inference::per_activation;
+                           " input tensor, unhandled data format");
        }
-        op::batch_norm_inference op{epsilon, momentum, bn_mode};
-        return info.add_instruction(op, args);
    }
 };

--- a/src/opt/memory_coloring_impl.cpp
+++ b/src/opt/memory_coloring_impl.cpp
@@ -72,7 +72,7 @@ bool memory_coloring_impl::allocate(interval_ptr interval)
    if(conflict_table.find(vn) != conflict_table.end())
    {
-        std::set<int>& vn_set = conflict_table[vn];
+        const std::set<int>& vn_set = conflict_table[vn];
        for(const auto& iter : vn_set)
        {
            live_range* range = live_ranges[iter];
@@ -267,8 +267,8 @@ void memory_coloring_impl::verify()
    {
        for(int i = 0; i < num_of_lives; ++i)
        {
-            live_interval& interval = live_intervals[i];
+            const live_interval& interval = live_intervals[i];
-            live_range& segment     = interval.segment;
+            const live_range& segment     = interval.segment;
            if(segment.begin == invalid_offset)
            {
@@ -284,7 +284,7 @@ void memory_coloring_impl::verify()
            int vn = segment.vn;
            if(conflict_table.find(vn) != conflict_table.end())
            {
-                std::set<int>& vn_set = conflict_table[vn];
+                const std::set<int>& vn_set = conflict_table[vn];
                for(const auto& iter : vn_set)
                {
                    live_range* range = live_ranges[iter];
@@ -319,8 +319,8 @@ void memory_coloring_impl::dump_intervals()
        {
            std::cout << " segment:" << i;
            std::cout << " =>";
-            std::set<int>& table = conflict_table[i];
+            const std::set<int>& table = conflict_table[i];
-            for(auto& iter : table)
+            for(const auto& iter : table)
            {
                std::cout << (iter) << ",";
            }
@@ -357,7 +357,7 @@ void live_interval::dump()
    std::cout << "id:" << id;
    segment.dump();
    std::cout << " uses:";
-    for(auto& iter : use_points)
+    for(const auto& iter : use_points)
    {
        std::cout << " " << get_ins_enum(iter) << ",";
    }

--- a/src/program.cpp
+++ b/src/program.cpp
@@ -398,7 +398,7 @@ std::vector<argument> generic_eval(const program& p,
    return generic_eval(mm, ctx, params, {}, make_trace);
 }
-std::vector<argument> program::eval(parameter_map params) const
+std::vector<argument> program::eval(parameter_map params, execution_environment exec_env) const
 {
    auto& ctx = this->impl->ctx;
 #ifndef NDEBUG
@@ -423,6 +423,12 @@ std::vector<argument> program::eval(parameter_map params) const
 #endif
    auto trace_level = value_of(MIGRAPHX_TRACE_EVAL{});
+    std::vector<argument> ret;
+    if(exec_env.async)
+    {
+        ctx.wait_for(exec_env.queue);
+    }
    if(trace_level > 0)
    {
@@ -434,49 +440,56 @@ std::vector<argument> program::eval(parameter_map params) const
            ins_out[x] = ss.str();
        });
-        return generic_eval(*this,
+        ret = generic_eval(*this,
-                            ctx,
+                           ctx,
-                            std::move(params),
+                           std::move(params),
-                            with_check_context([&](auto& ins, auto f, auto&& check_context) {
+                           with_check_context([&](auto& ins, auto f, auto&& check_context) {
-                                ctx.finish();
+                               ctx.finish();
-                                std::cout << "Run instruction: " << ins_out.at(ins) << std::endl;
+                               std::cout << "Run instruction: " << ins_out.at(ins) << std::endl;
-                                timer t{};
+                               timer t{};
-                                auto result = check_context(f);
+                               auto result = check_context(f);
-                                double t1   = t.record<milliseconds>();
+                               double t1   = t.record<milliseconds>();
-                                ctx.finish();
+                               ctx.finish();
-                                double t2 = t.record<milliseconds>();
+                               double t2 = t.record<milliseconds>();
-                                std::cout << "Time: " << t1 << "ms, " << t2 << "ms" << std::endl;
+                               std::cout << "Time: " << t1 << "ms, " << t2 << "ms" << std::endl;
-                                if(trace_level > 1 and ins->name().front() != '@' and
+                               if(trace_level > 1 and ins->name().front() != '@' and
-                                   ins->name() != "load" and not result.empty())
+                                  ins->name() != "load" and not result.empty())
-                                {
+                               {
-                                    target tgt  = make_target(this->impl->target_name);
+                                   target tgt  = make_target(this->impl->target_name);
-                                    auto buffer = tgt.copy_from(result);
+                                   auto buffer = tgt.copy_from(result);
-                                    if(trace_level == 2)
+                                   if(trace_level == 2)
-                                    {
+                                   {
-                                        std::cout << "Output has "
+                                       std::cout << "Output has "
-                                                  << to_string_range(classify_argument(buffer))
+                                                 << to_string_range(classify_argument(buffer))
-                                                  << std::endl;
+                                                 << std::endl;
-                                        std::cout << "Output: ";
+                                       std::cout << "Output: ";
-                                        preview_argument(std::cout, buffer);
+                                       preview_argument(std::cout, buffer);
-                                        std::cout << std::endl;
+                                       std::cout << std::endl;
-                                    }
+                                   }
-                                    else
+                                   else
-                                    {
+                                   {
-                                        std::cout << "Output: " << buffer << std::endl;
+                                       std::cout << "Output: " << buffer << std::endl;
-                                    }
+                                   }
-                                }
+                               }
-                                return result;
+                               return result;
-                            }));
+                           }));
    }
    else
    {
-        return generic_eval(*this,
+        ret = generic_eval(*this,
-                            ctx,
+                           ctx,
-                            std::move(params),
+                           std::move(params),
-                            with_check_context([&](auto&, auto f, auto&& check_context) {
+                           with_check_context([&](auto&, auto f, auto&& check_context) {
-                                return check_context(f);
+                               return check_context(f);
-                            }));
+                           }));
    }
+    if(exec_env.async)
+    {
+        ctx.finish_on(exec_env.queue);
+    }
+    return ret;
 }
 const int program_file_version = 5;

--- a/src/py/migraphx_py.cpp
+++ b/src/py/migraphx_py.cpp
@@ -355,6 +355,23 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
                 }
                 return p.eval(pm);
             })
+        .def("run_async",
+             [](migraphx::program& p,
+                py::dict params,
+                std::uintptr_t stream,
+                std::string stream_name) {
+                 migraphx::parameter_map pm;
+                 for(auto x : params)
+                 {
+                     std::string key      = x.first.cast<std::string>();
+                     py::buffer b         = x.second.cast<py::buffer>();
+                     py::buffer_info info = b.request();
+                     pm[key]              = migraphx::argument(to_shape(info), info.ptr);
+                 }
+                 migraphx::execution_environment exec_env{
+                     migraphx::any_ptr(reinterpret_cast<void*>(stream), stream_name), true};
+                 return p.eval(pm, exec_env);
+             })
        .def("sort", &migraphx::program::sort)
        .def("print", [](const migraphx::program& p) { std::cout << p << std::endl; })
        .def("__eq__", std::equal_to<migraphx::program>{})

--- a/src/replace_allocate.cpp
+++ b/src/replace_allocate.cpp
@@ -73,7 +73,7 @@ void insert_submod_allocations(instruction_ref ins, module& mod, const allocatio
        name_shapes.insert(ps.begin(), ps.end());
    }
-    for(auto& pn : name_shapes)
+    for(const auto& pn : name_shapes)
    {
        const auto& s = pn.second;
        instruction_ref output{};

--- a/src/simplify_algebra.cpp
+++ b/src/simplify_algebra.cpp
@@ -435,6 +435,24 @@ struct find_concat_op
    }
 };
+void move_instructions_back(module& m, instruction_ref pos, std::vector<instruction_ref> inss)
+{
+    auto start = range(m.begin(), pos);
+    for(auto ins : iterator_for(start))
+    {
+        auto it = std::find(inss.begin(), inss.end(), ins);
+        if(it != inss.end())
+            inss.erase(it);
+    }
+    for(auto ins : inss)
+    {
+        if(not m.has_instruction(ins))
+            continue;
+        move_instructions_back(m, pos, ins->inputs());
+        m.move_instruction(ins, pos);
+    }
+}
 std::vector<instruction_ref> get_splits(instruction_ref ins)
 {
    std::vector<instruction_ref> result;
@@ -610,8 +628,7 @@ struct find_splits
                   }))
                    return;
-                for(auto data : data_args)
+                move_instructions_back(m, ins, data_args);
-                    m.move_instructions(data, ins);
                auto slice_op = any_cast<op::slice>(splits.front()->get_operator());
                assert(not slice_op.axes.empty());
@@ -864,8 +881,7 @@ struct find_conv_dot_horiz_fusion
                concat_axis = axis;
            }
-            for(auto arg : args)
+            move_instructions_back(m, input, args);
-                m.move_instructions(arg, input);
            // TODO: Check if axes match
            auto concat =
                m.insert_instruction(input, make_op("concat", {{"axis", concat_axis}}), args);

--- a/src/targets/cpu/CMakeLists.txt
+++ b/src/targets/cpu/CMakeLists.txt
@@ -35,6 +35,7 @@ add_library(migraphx_cpu
    dnnl.cpp
    eltwise.cpp
    erf.cpp
+    fmod.cpp
    fuse_ops.cpp
    gather.cpp
    gemm.cpp
@@ -42,6 +43,7 @@ add_library(migraphx_cpu
    logsoftmax.cpp
    lowering.cpp
    lrn.cpp
+    mod.cpp
    preallocate.cpp
    pooling.cpp
    reduction.cpp

--- a/src/targets/gpu/include/migraphx/gpu/cos.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/cos.hpp
@@ -21,22 +21,16 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_COS_HPP
+#include <migraphx/config.hpp>
-#define MIGRAPHX_GUARD_RTGLIB_COS_HPP
+#include <migraphx/cpu/pointwise.hpp>
+#include <migraphx/op/fmod.hpp>
-#include <migraphx/gpu/oper.hpp>
-#include <migraphx/gpu/device/cos.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
+namespace cpu {
-struct hip_cos : unary_device<hip_cos, device::cos>
+template struct cpu_binary<op::fmod>;
-{
-};
-} // namespace gpu
+} // namespace cpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
-#endif
--- a/src/targets/cpu/lowering.cpp
+++ b/src/targets/cpu/lowering.cpp
@@ -43,6 +43,8 @@
 #include <migraphx/op/argmax.hpp>
 #include <migraphx/op/argmin.hpp>
 #include <migraphx/op/rnn_var_sl_last_output.hpp>
+#include <migraphx/op/mod.hpp>
+#include <migraphx/op/fmod.hpp>
 #include <migraphx/shape_for_each.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/par_dfor.hpp>

--- a/src/targets/gpu/include/migraphx/gpu/exp.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/exp.hpp
@@ -21,22 +21,16 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_EXP_HPP
+#include <migraphx/config.hpp>
-#define MIGRAPHX_GUARD_RTGLIB_EXP_HPP
+#include <migraphx/cpu/pointwise.hpp>
+#include <migraphx/op/mod.hpp>
-#include <migraphx/gpu/oper.hpp>
-#include <migraphx/gpu/device/exp.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
+namespace cpu {
-struct hip_exp : unary_device<hip_exp, device::exp>
+template struct cpu_binary<op::mod>;
-{
-};
-} // namespace gpu
+} // namespace cpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
-#endif
--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -41,81 +41,9 @@ file(GLOB KERNEL_FILES ${CONFIGURE_DEPENDS}
 message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
 add_embed_library(migraphx_kernels ${KERNEL_FILES})
-add_library(migraphx_device
+file(GLOB DEVICE_GPU_SRCS ${CONFIGURE_DEPENDS} ${CMAKE_CURRENT_SOURCE_DIR}/device/*.cpp)
-    device/acos.cpp
+add_library(migraphx_device ${DEVICE_GPU_SRCS})
-    device/acosh.cpp
-    device/add.cpp
-    device/add_clip.cpp
-    device/add_relu.cpp
-    device/add_sigmoid.cpp
-    device/add_tanh.cpp
-    device/argmax.cpp
-    device/argmin.cpp
-    device/asin.cpp
-    device/asinh.cpp
-    device/atan.cpp
-    device/atanh.cpp
-    device/ceil.cpp
-    device/clip.cpp
-    device/concat.cpp
-    device/contiguous.cpp
-    device/convert.cpp
-    device/cos.cpp
-    device/cosh.cpp
-    device/div.cpp
-    device/equal.cpp
-    device/erf.cpp
-    device/exp.cpp
-    device/fill.cpp
-    device/floor.cpp
-    device/gather.cpp
-    device/gelu.cpp
-    device/greater.cpp
-    device/int8_gemm_pack.cpp
-    device/layernorm.cpp
-    device/less.cpp
-    device/log.cpp
-    device/logical_and.cpp
-    device/logical_or.cpp
-    device/logical_xor.cpp
-    device/logsoftmax.cpp
-    device/max.cpp
-    device/min.cpp
-    device/mul.cpp
-    device/mul_add.cpp
-    device/mul_add_relu.cpp
-    device/multinomial.cpp
-    device/nonzero.cpp
-    device/pad.cpp
-    device/pow.cpp
-    device/prelu.cpp
-    device/prefix_scan_sum.cpp
-    device/recip.cpp
-    device/reduce_max.cpp
-    device/reduce_mean.cpp
-    device/reduce_min.cpp
-    device/reduce_sum.cpp
-    device/reduce_prod.cpp
-    device/relu.cpp
-    device/reverse.cpp
-    device/rnn_variable_seq_lens.cpp
-    device/round.cpp
-    device/rsqrt.cpp
-    device/scatter.cpp
-    device/sigmoid.cpp
-    device/sign.cpp
-    device/sin.cpp
-    device/sinh.cpp
-    device/softmax.cpp
-    device/sqdiff.cpp
-    device/sqrt.cpp
-    device/sub.cpp
-    device/tan.cpp
-    device/tanh.cpp
-    device/topk.cpp
-    device/unary_not.cpp
-    device/where.cpp
-)
 add_library(compile_for_gpu INTERFACE)
 target_compile_options(compile_for_gpu INTERFACE -std=c++17 -fno-gpu-rdc -Wno-cuda-compat -Wno-unused-command-line-argument -Xclang -fallow-half-arguments-and-returns)
 target_link_libraries(compile_for_gpu INTERFACE hip::device -fno-gpu-rdc -Wno-invalid-command-line-argument -Wno-unused-command-line-argument -Wno-option-ignored)
@@ -153,15 +81,12 @@ add_library(migraphx_gpu
    argmax.cpp
    argmin.cpp
    batch_norm_inference.cpp
-    clip.cpp
    code_object_op.cpp
    compile_ops.cpp
    compile_gen.cpp
    compile_hip.cpp
    compile_hip_code_object.cpp
    compiler.cpp
-    concat.cpp
-    convert.cpp
    convolution.cpp
    deconvolution.cpp
    device_name.cpp
@@ -194,7 +119,6 @@ add_library(migraphx_gpu
    rocblas.cpp
    scatter.cpp
    schedule_model.cpp
-    softmax.cpp
    sync_device.cpp
    target.cpp
    topk.cpp
@@ -209,68 +133,18 @@ function(register_migraphx_gpu_ops PREFIX)
    endforeach()
 endfunction()
 register_migraphx_gpu_ops(hip_
-    acosh
-    acos
-    add
    argmax
    argmin
-    asinh
-    asin
-    atanh
-    atan
-    ceil
-    clip
-    concat
-    convert
-    cosh
-    cos
-    div
-    equal
-    erf
-    exp
-    floor
    gather
-    greater
-    less
-    log
    logsoftmax
-    logical_and
-    logical_or
-    logical_xor
    loop
-    max
-    min
-    mul
    multinomial
    nonzero
    pad
-    pow
-    prelu
    prefix_scan_sum
-    recip
-    reduce_max
-    reduce_mean
-    reduce_min
-    reduce_prod
-    reduce_sum
-    relu
    reverse
-    round
-    rsqrt
    scatter
-    sigmoid
-    sign
-    sinh
-    sin
-    softmax
-    sqdiff
-    sqrt
-    sub
-    tanh
-    tan
    topk
-    unary_not
-    where
 )
 register_migraphx_gpu_ops(miopen_
    abs
@@ -372,9 +246,18 @@ endif()
 include(CheckLibraryExists)
 get_target_property(MIOPEN_LOCATION MIOpen LOCATION)
 check_library_exists(MIOpen "miopenHiddenSetConvolutionFindMode" "${MIOPEN_LOCATION}" HAS_FIND_MODE_API)
+check_library_exists(MIOpen "miopenFindSolutions" "${MIOPEN_LOCATION}" HAS_FIND_2_API)
+if(HAS_FIND_2_API) 
+    target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API)
+    message(STATUS "MIGraphx is using Find-2.0 API of MIOpen")
+else()
+    message(STATUS "MIOpen does not have Find-2.0 API")
+endif()
 if(HAS_FIND_MODE_API)
    target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_MODE_API)
-    message(STATUS "MIOpen has find mode api")
+    message(STATUS "MIGraphx is using Find Mode API of MIOpen")
 else()
    message(STATUS "MIOpen does not have find mode api")
 endif()

--- a/src/targets/gpu/clip.cpp
+++ b/src/targets/gpu/clip.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/gpu/clip.hpp>
-#include <migraphx/gpu/context.hpp>
-#include <migraphx/gpu/device/clip.hpp>
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-shape hip_clip::compute_shape(std::vector<shape> inputs) const
-{
-    inputs.pop_back();
-    return op.compute_shape(inputs);
-}
-argument hip_clip::compute(context& ctx, const shape&, const std::vector<argument>& args) const
-{
-    device::clip(ctx.get_stream().get(), args.back(), args.front(), args.at(1), args.at(2));
-    return args.back();
-}
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/targets/gpu/compile_gen.cpp
+++ b/src/targets/gpu/compile_gen.cpp
@@ -22,6 +22,7 @@
 * THE SOFTWARE.
 */
 #include <migraphx/gpu/compile_gen.hpp>
+#include <migraphx/gpu/context.hpp>
 #include <migraphx/shape.hpp>
 #include <migraphx/permutation.hpp>
 #include <migraphx/stringutils.hpp>
@@ -48,12 +49,13 @@ static std::vector<std::size_t> vector_sizes(const std::vector<shape>& inputs)
    return {4, 2};
 }
-vectorize vectorize::elements(std::size_t axis, const std::vector<shape>& inputs)
+vectorize vectorize::elements(std::size_t axis,
+                              const std::vector<shape>& inputs,
+                              const std::vector<std::size_t>& sizes)
 {
    if(std::all_of(
           inputs.begin(), inputs.end(), [&](const auto& s) { return s.lens()[axis] == 1; }))
        return {1, axis};
-    auto sizes = vector_sizes(inputs);
    std::vector<std::size_t> max_vec_size;
    std::transform(inputs.begin(),
                   inputs.end(),
@@ -81,6 +83,33 @@ vectorize vectorize::elements(std::size_t axis, const std::vector<shape>& inputs
    return {*std::min_element(max_vec_size.begin(), max_vec_size.end()), axis};
 }
+vectorize vectorize::elements(context& ctx, std::size_t axis, const std::vector<shape>& inputs)
+{
+    if(inputs.empty())
+        return {1, axis};
+    std::size_t n = std::max_element(inputs.begin(),
+                                     inputs.end(),
+                                     by(std::less<>{}, [](const auto& s) { return s.elements(); }))
+                        ->elements();
+    std::size_t max_global = ctx.get_current_device().get_cu_count() *
+                             ctx.get_current_device().get_max_workitems_per_cu();
+    std::size_t over = n / max_global;
+    bool broadcasted =
+        std::any_of(inputs.begin(), inputs.end(), [](const auto& s) { return s.broadcasted(); });
+    std::vector<std::size_t> sizes;
+    if(broadcasted and over > 8)
+        sizes.push_back(8);
+    if(over > 4)
+        sizes.push_back(4);
+    sizes.push_back(2);
+    return elements(axis, inputs, sizes);
+}
+vectorize vectorize::elements(std::size_t axis, const std::vector<shape>& inputs)
+{
+    return elements(axis, inputs, vector_sizes(inputs));
+}
 std::string vectorize::str() const
 {
    return "vectorize<" + to_string(size) + ", " + to_string(axis) + ">()";
@@ -102,7 +131,7 @@ preload preload::broadcasts(std::size_t axis, const std::vector<shape>& inputs)
    std::size_t bytes = 0;
    for(auto i : preloaded)
    {
-        auto input = inputs[i];
+        const auto& input = inputs[i];
        bytes += input.bytes();
        if(bytes > max_lds_bytes)
            break;

--- a/src/targets/gpu/concat.cpp
+++ b/src/targets/gpu/concat.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/gpu/concat.hpp>
-#include <migraphx/gpu/context.hpp>
-#include <migraphx/gpu/device/concat.hpp>
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-shape hip_concat::compute_shape(std::vector<shape> inputs) const
-{
-    inputs.pop_back();
-    return op.normalize_compute_shape(inputs);
-}
-argument hip_concat::compute(context& ctx,
-                             const shape& output_shape,
-                             const std::vector<argument>& args) const
-{
-    std::vector<std::size_t> offsets = op.compute_offsets(output_shape, args);
-    return device::concat(ctx.get_stream().get(), output_shape, args, offsets);
-}
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/targets/gpu/convert.cpp
+++ b/src/targets/gpu/convert.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/gpu/convert.hpp>
-#include <migraphx/gpu/context.hpp>
-#include <migraphx/gpu/device/convert.hpp>
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-shape hip_convert::compute_shape(std::vector<shape> inputs) const
-{
-    inputs.pop_back();
-    check_shapes{inputs, *this}.packed();
-    return op.compute_shape(inputs);
-}
-argument hip_convert::compute(context& ctx, const shape&, const std::vector<argument>& args) const
-{
-    device::convert(ctx.get_stream().get(), args[1], args[0]);
-    return args[1];
-}
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/targets/gpu/convolution.cpp
+++ b/src/targets/gpu/convolution.cpp
@@ -24,6 +24,7 @@
 #include <migraphx/gpu/convolution.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/generate.hpp>
+#include <miopen/miopen.h>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -55,14 +56,40 @@ argument miopen_convolution::compute(context& ctx,
                                     const shape& output_shape,
                                     const std::vector<argument>& args) const
 {
-    auto x_desc = make_tensor(reshape_if_1d(args[0].get_shape()));
+    auto x_desc                = make_tensor(reshape_if_1d(args[0].get_shape()));
-    auto w_desc = make_tensor(reshape_if_1d(args[1].get_shape()));
+    auto w_desc                = make_tensor(reshape_if_1d(args[1].get_shape()));
-    auto y_desc = make_tensor(reshape_if_1d(output_shape));
+    auto y_desc                = make_tensor(reshape_if_1d(output_shape));
+    auto* miopen_stream_handle = ctx.get_stream().get_miopen();
+    auto workspace_size        = args[2].get_shape().bytes();
+#ifdef MIGRAPHX_HAS_FIND_2_API
+    {
+        const miopenTensorArgument_t tensor_args[3] = {
+            {miopenTensorConvolutionX, nullptr, args[0].implicit()},
+            {miopenTensorConvolutionW, nullptr, args[1].implicit()},
+            {miopenTensorConvolutionY, nullptr, args[3].implicit()},
+        };
+        if(solution_ptr.get() == nullptr)
+            MIGRAPHX_THROW("MIOpen Convolution : Load MIOpen Solution before running it");
+        auto status = miopenRunSolution(miopen_stream_handle,
+                                        solution_ptr.get(),
+                                        3,
+                                        tensor_args,
+                                        args[2].implicit(),
+                                        workspace_size);
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen Convolution: running convolution using find_2.0 failed");
+        return args[3];
+    }
+#else
+    // else use immediate mode
    if(solution_id == 0)
        MIGRAPHX_THROW("MIOpen Convolution: invalid solution ID");
-    auto status = miopenConvolutionForwardImmediate(ctx.get_stream().get_miopen(),
+    auto status = miopenConvolutionForwardImmediate(miopen_stream_handle,
                                                    w_desc.get(),
                                                    args[1].implicit(),
                                                    x_desc.get(),
@@ -71,29 +98,66 @@ argument miopen_convolution::compute(context& ctx,
                                                    y_desc.get(),
                                                    args[3].implicit(),
                                                    args[2].implicit(),
-                                                    args[2].get_shape().bytes(),
+                                                    workspace_size,
                                                    solution_id);
    if(status != miopenStatusSuccess)
        MIGRAPHX_THROW("MIOpen Convolution: running convolution failed");
    return args[3];
+#endif
 }
 shape miopen_convolution::find(context& ctx, const shape& output_shape, std::vector<shape> inputs)
 {
    shape workspace_shape{};
+    auto x_desc                = make_tensor(reshape_if_1d(inputs[0]));
+    auto w_desc                = make_tensor(reshape_if_1d(inputs[1]));
+    auto y_desc                = make_tensor(reshape_if_1d(output_shape));
+    std::size_t workspace_size = 0;
-    auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
+#ifdef MIGRAPHX_HAS_FIND_2_API
-    auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
+    {
-    auto y_desc = make_tensor(reshape_if_1d(output_shape));
+        auto conv_problem = make_obj<miopen_problem>(
+            &miopenCreateConvProblem, cd.get(), miopenProblemDirectionForward);
+        set_tensor_descriptor(miopenTensorConvolutionX, x_desc, conv_problem);
+        set_tensor_descriptor(miopenTensorConvolutionW, w_desc, conv_problem);
+        set_tensor_descriptor(miopenTensorConvolutionY, y_desc, conv_problem);
+        auto* miopen_stream_handle = ctx.get_stream().get_miopen();
+        solution_ptr = find_solution(miopen_stream_handle, conv_problem.get());
+        auto status = miopenGetSolutionWorkspaceSize(solution_ptr.get(), &workspace_size);
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen Convolution : failed to get solution's workspace size");
+        std::size_t solution_size;
+        status = miopenGetSolutionSize(solution_ptr.get(), &solution_size);
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen Convolution: Failed to fetch solution size");
+        auto solution_binary = std::vector<char>{};
+        solution_binary.resize(solution_size);
+        status = miopenSaveSolution(solution_ptr.get(), solution_binary.data());
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen Convolution: Saving solution failed");
+        solution_object = value::binary{solution_binary.data(), solution_size};
+        return shape{shape::int8_type, {workspace_size}};
+    }
+#else
+    // else use immediate find mode
+    auto status = miopenConvolutionForwardGetWorkSpaceSize(ctx.get_stream().get_miopen(),
+                                                           w_desc.get(),
+                                                           x_desc.get(),
+                                                           cd.get(),
+                                                           y_desc.get(),
+                                                           &workspace_size);
+    if(status != miopenStatusSuccess)
+        MIGRAPHX_THROW("MIOpen Convolution: Failed to get forward workspace size");
-    std::size_t workspace_size = 0;
-    miopenConvolutionForwardGetWorkSpaceSize(ctx.get_stream().get_miopen(),
-                                             w_desc.get(),
-                                             x_desc.get(),
-                                             cd.get(),
-                                             y_desc.get(),
-                                             &workspace_size);
    workspace_shape = shape{shape::int8_type, {workspace_size}};
    auto x         = to_gpu(generate_argument(inputs[0]));
@@ -103,20 +167,20 @@ shape miopen_convolution::find(context& ctx, const shape& output_shape, std::vec
    int algo_count = 1;
    miopenConvAlgoPerf_t perf;
-    auto status = miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
+    status = miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
-                                                        x_desc.get(),
+                                                   x_desc.get(),
-                                                        x.implicit(),
+                                                   x.implicit(),
-                                                        w_desc.get(),
+                                                   w_desc.get(),
-                                                        w.implicit(),
+                                                   w.implicit(),
-                                                        cd.get(),
+                                                   cd.get(),
-                                                        y_desc.get(),
+                                                   y_desc.get(),
-                                                        y.implicit(),
+                                                   y.implicit(),
-                                                        1,
+                                                   1,
-                                                        &algo_count,
+                                                   &algo_count,
-                                                        &perf,
+                                                   &perf,
-                                                        workspace.implicit(),
+                                                   workspace.implicit(),
-                                                        workspace_size,
+                                                   workspace_size,
-                                                        false);
+                                                   false);
    if(status != miopenStatusSuccess)
        MIGRAPHX_THROW("MIOpen Convolution: find convolution failed");
    algo = perf.fwd_algo;
@@ -148,35 +212,58 @@ shape miopen_convolution::find(context& ctx, const shape& output_shape, std::vec
    solution_id = solutions.front().solution_id;
    return shape{shape::int8_type, {perf.memory}};
+#endif
 }
 void miopen_convolution::finalize(context& ctx,
                                  const shape& output_shape,
-                                  std::vector<shape> inputs)
+                                  const std::vector<shape>& inputs)
 {
-    if(cd == nullptr)
+#ifdef MIGRAPHX_HAS_FIND_2_API
-        cd = make_conv(op);
-    if(solution_id == 0)
    {
-        // Check that workspace hasn't changed
+        (void)(ctx); // avoid warnings
-        auto size = inputs.at(2).bytes();
+        (void)(output_shape);
-        auto ws   = find(ctx, output_shape, inputs);
+        (void)(inputs);
-        if(ws.bytes() > size)
+        // load solution
-            MIGRAPHX_THROW("MIOpen Convolution: workspace has changed during finalization.");
+        if(solution_ptr == nullptr)
+        {
+            miopenSolution_t ptr;
+            auto status  = miopenLoadSolution(&ptr,
+                                             reinterpret_cast<const char*>(solution_object.data()),
+                                             solution_object.size());
+            solution_ptr = miopen_solution{ptr};
+            if(status != miopenStatusSuccess)
+                MIGRAPHX_THROW("MIOpen Convolution: loading convolution solution failed");
+        }
    }
+#else
+    // Use immediate mode API
+    {
+        if(cd == nullptr)
+            cd = make_conv(op);
+        if(solution_id == 0)
+        {
+            // Check that workspace hasn't changed
+            auto size = inputs.at(2).bytes();
+            auto ws   = find(ctx, output_shape, inputs);
+            if(ws.bytes() > size)
+                MIGRAPHX_THROW("MIOpen Convolution: workspace has changed during finalization.");
+        }
-    auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
+        auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
-    auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
+        auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
-    auto y_desc = make_tensor(reshape_if_1d(output_shape));
+        auto y_desc = make_tensor(reshape_if_1d(output_shape));
-    auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
+        auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
-                                                          w_desc.get(),
+                                                              w_desc.get(),
-                                                          x_desc.get(),
+                                                              x_desc.get(),
-                                                          cd.get(),
+                                                              cd.get(),
-                                                          y_desc.get(),
+                                                              y_desc.get(),
-                                                          solution_id);
+                                                              solution_id);
-    if(status != miopenStatusSuccess)
+        if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen Convolution: compile solution failed");
+            MIGRAPHX_THROW("MIOpen Convolution: compile solution failed");
+    }
+#endif
 }
 } // namespace gpu

--- a/src/targets/gpu/device/acos.cpp
+++ b/src/targets/gpu/device/acos.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/gpu/device/acos.hpp>
-#include <migraphx/gpu/device/nary.hpp>
-#include <migraphx/gpu/device/types.hpp>
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-namespace device {
-void acos(hipStream_t stream, const argument& result, const argument& arg)
-{
-    nary(stream, result, arg)([](auto x) __device__ { return ::acos(to_hip_type(x)); });
-}
-} // namespace device
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/targets/gpu/device/acosh.cpp
+++ b/src/targets/gpu/device/acosh.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/gpu/device/acosh.hpp>
-#include <migraphx/gpu/device/nary.hpp>
-#include <migraphx/gpu/device/types.hpp>
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-namespace device {
-void acosh(hipStream_t stream, const argument& result, const argument& arg)
-{
-    nary(stream, result, arg)([](auto x) { return ::acosh(to_hip_type(x)); });
-}
-} // namespace device
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/targets/gpu/device/add.cpp
+++ b/src/targets/gpu/device/add.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/gpu/device/add.hpp>
-#include <migraphx/gpu/device/nary.hpp>
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-namespace device {
-void add(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2)
-{
-    nary(stream, result, arg1, arg2)([](auto x, auto y) __device__ { return x + y; });
-}
-void add(hipStream_t stream,
-         const argument& result,
-         const argument& arg1,
-         const argument& arg2,
-         const argument& arg3)
-{
-    nary(stream, result, arg1, arg2, arg3)([](auto x, auto y, auto z)
-                                               __device__ { return x + y + z; });
-}
-} // namespace device
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/targets/gpu/device/add_clip.cpp
+++ b/src/targets/gpu/device/add_clip.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/gpu/device/add_clip.hpp>
-#include <migraphx/gpu/device/nary.hpp>
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-namespace device {
-void add_clip(hipStream_t stream,
-              const argument& result,
-              const argument& arg1,
-              const argument& arg2,
-              const argument& min_arg,
-              const argument& max_arg)
-{
-    nary(stream, result, arg1, arg2, min_arg, max_arg)(
-        [](auto x, auto y, auto min, auto max)
-            __device__ { return ::min<decltype(x + y)>(::max<decltype(x)>(min, x + y), max); });
-}
-void add_clip(hipStream_t stream,
-              const argument& result,
-              const argument& arg1,
-              const argument& arg2,
-              const argument& arg3,
-              const argument& min_arg,
-              const argument& max_arg)
-{
-    nary(stream, result, arg1, arg2, arg3, min_arg, max_arg)(
-        [](auto x, auto y, auto z, auto min, auto max) __device__ {
-            return ::min<decltype(x + y + z)>(::max<decltype(x)>(min, x + y + z), max);
-        });
-}
-} // namespace device
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx