"examples/vscode:/vscode.git/clone" did not exist on "6c5f0de7139ee538536478146ff6b3ac5c4ed055"
Commit 4f63c691 authored by charlie

Merge branch 'develop' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into dyn_ref_broadcast

parents f02f5d98 e19f78ae
@@ -268,7 +268,9 @@ jobs:
       lcov --directory . --capture --output-file $(pwd)/coverage.info
       lcov --remove $(pwd)/coverage.info '/usr/*' --output-file $(pwd)/coverage.info
       lcov --list $(pwd)/coverage.info
-      curl -s https://codecov.io/bash | bash
+      curl -Os https://uploader.codecov.io/latest/linux/codecov
+      chmod +x codecov
+      ./codecov -t ${CODECOV_TOKEN}
       echo "Uploaded"
   linux-fpga:
@@ -364,5 +366,7 @@ jobs:
       # lcov --directory . --capture --output-file $(pwd)/coverage.info
       # lcov --remove $(pwd)/coverage.info '/usr/*' --output-file $(pwd)/coverage.info
       # lcov --list $(pwd)/coverage.info
-      # curl -s https://codecov.io/bash | bash
-      # echo "Uploaded"
\ No newline at end of file
+      # curl -Os https://uploader.codecov.io/latest/linux/codecov
+      # chmod +x codecov
+      # ./codecov -t ${CODECOV_TOKEN}
+      # echo "Uploaded"
@@ -26,6 +26,8 @@ on:
         required: true
         default: '-s'
+concurrency: "perftest-${{ github.head_ref || github.base_ref || 'schedule' }}"
 jobs:
   release:
     uses: rocmsoftwareplatform/migraphx-benchmark/.github/workflows/perf-test.yml@main
...
@@ -212,6 +212,7 @@ rocm_enable_cppcheck(
         ConfigurationNotChecked
         unmatchedSuppression
         unusedFunction
+        ctuPointerArith
         noExplicitConstructor
         passedByValue
         unusedStructMember
...
@@ -33,7 +33,7 @@ def rocmtestnode(Map conf) {
         }
     }
     node(name) {
-        withEnv(['HSA_ENABLE_SDMA=0', 'MIOPEN_DEBUG_GCN_ASM_KERNELS=0']) {
+        withEnv(['HSA_ENABLE_SDMA=0']) {
             stage("checkout ${variant}") {
                 checkout scm
             }
...
@@ -25,6 +25,6 @@ pfultz2/rocm-recipes
 facebook/zstd@v1.4.5 -X subdir -DCMAKE_DIR=build/cmake
 ccache@v4.1
 pcre,pfultz2/pcre@8.45 -H sha256:d6f7182602a775a7d500a0cedca6449af0400c6493951513046d17615ed0bf11
-danmar/cppcheck@2.8 -DHAVE_RULES=1
+danmar/cppcheck@2.9 -DHAVE_RULES=1
 RadeonOpenCompute/rocm-cmake@1ebf7e7bc61bb5e949c171562b421264065230a7 --build
 -f requirements.txt
@@ -50,8 +50,8 @@ struct layernorm_matcher
     {
         return f("div")(arg(0)(x_minus_mean()),
-                        arg(1)(skip_broadcasts(f("sqrt")(
-                            arg(0)(f("add")(either_arg(0, 1)(variance(), has_value(1e-12f))))))));
+                        arg(1)(skip_broadcasts(f("sqrt")(arg(0)(
+                            f("add")(either_arg(0, 1)(variance(), is_constant().bind("eps"))))))));
     }
     auto matcher() const { return layernorm_onnx(); }
...
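For reference, the expression this matcher fuses is the layer-norm tail y = (x - mean(x)) / sqrt(var(x) + eps); the change binds the epsilon constant as "eps" instead of requiring the literal 1e-12. A standalone sketch of that computation with illustrative values (plain C++, not MIGraphX code):

    // Reference for the matched expression: y = (x - mean) / sqrt(var + eps).
    #include <cmath>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main()
    {
        std::vector<float> x = {1.0f, 2.0f, 3.0f, 4.0f};
        float eps  = 1e-5f; // now matched as any bound constant, not a fixed 1e-12
        float mean = std::accumulate(x.begin(), x.end(), 0.0f) / x.size();
        float var  = 0.0f;
        for(float v : x)
            var += (v - mean) * (v - mean);
        var /= x.size();
        for(float v : x)
            std::printf("%f\n", (v - mean) / std::sqrt(var + eps));
        return 0;
    }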
@@ -40,7 +40,6 @@ struct fmod : binary<fmod>
         a["commutative"] = false;
         return a;
     }
-    std::string point_function() const { return "fmod"; }
     auto apply() const
     {
         return [](auto x, auto y) { return std::fmod(x, y); };
...
@@ -38,9 +38,9 @@ struct mod : binary<mod>
     {
         auto a = base_attributes();
         a["commutative"] = false;
+        a["point_op"] = "${function:fmod}((${function:remainder}(${0}, ${1})) + ${1}, ${1})";
         return a;
     }
-    std::string point_function() const { return "mod"; }
     auto apply() const
     {
         return [](auto x, auto y) { return std::fmod((std::remainder(x, y)) + y, y); };
...
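The two operators differ only in whose sign the result follows: fmod keeps the sign of the dividend (as std::fmod does), while mod keeps the sign of the divisor, which is what the fmod(remainder(x, y) + y, y) expression in apply(), and the new point_op string, computes. A small standalone check:

    // fmod vs mod: -7 fmod 3 == -1 (sign of dividend), -7 mod 3 == 2 (sign of divisor).
    #include <cmath>
    #include <cstdio>

    int main()
    {
        double x = -7.0, y = 3.0;
        double fmod_result = std::fmod(x, y);
        double mod_result  = std::fmod(std::remainder(x, y) + y, y); // same formula as op::mod::apply()
        std::printf("fmod(%g, %g) = %g\n", x, y, fmod_result); // prints -1
        std::printf("mod(%g, %g)  = %g\n", x, y, mod_result);  // prints 2
        return 0;
    }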
@@ -28,6 +28,7 @@
 #include <algorithm>
 #include <migraphx/rank.hpp>
 #include <migraphx/config.hpp>
+#include <vector>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -59,28 +60,35 @@ inline stream_range_container<Range> stream_range(const Range& r)
 namespace detail {
-inline void stream_write_value_impl(rank<2>, std::ostream& os, const std::string& x) { os << x; }
+template <class T>
+auto stream_write_value_impl(rank<1>, std::ostream& os, const T& x) -> decltype(os << x, void())
+{
+    os << x;
+}
-template <class Range>
-auto stream_write_value_impl(rank<1>, std::ostream& os, const Range& r)
-    -> decltype(r.begin(), r.end(), void())
+template <class T>
+void stream_write_value_impl(rank<1>, std::ostream& os, const std::vector<T>& r)
 {
     os << "{";
     os << stream_range(r);
     os << "}";
 }
-template <class T>
-void stream_write_value_impl(rank<0>, std::ostream& os, const T& x)
+template <class Range>
+auto stream_write_value_impl(rank<0>, std::ostream& os, const Range& r)
+    -> decltype(r.begin(), r.end(), void())
 {
-    os << x;
+    os << "{";
+    os << stream_range(r);
+    os << "}";
 }
 } // namespace detail
 template <class T>
 void stream_write_value(std::ostream& os, const T& x)
 {
-    detail::stream_write_value_impl(rank<2>{}, os, x);
+    detail::stream_write_value_impl(rank<1>{}, os, x);
 }
 } // namespace MIGRAPHX_INLINE_NS
...
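rank<N> here is a tag-dispatch helper: rank<N> derives from rank<N-1>, so an overload taking a higher rank wins overload resolution, and the trailing decltype removes candidates that would not compile. A minimal standalone sketch of the same idiom (the rank template is re-declared locally for illustration, it is not the header's definition):

    // Tag dispatch by rank: the rank<1> overload wins when its expression compiles,
    // otherwise resolution falls back to the rank<0> overload.
    #include <iostream>
    #include <vector>

    template <int N>
    struct rank : rank<N - 1> {};
    template <>
    struct rank<0> {};

    template <class T>
    auto print_impl(rank<1>, std::ostream& os, const T& x) -> decltype(os << x, void())
    {
        os << x;
    }

    template <class Range>
    void print_impl(rank<0>, std::ostream& os, const Range& r)
    {
        os << "{ ";
        for(const auto& e : r)
            os << e << " ";
        os << "}";
    }

    template <class T>
    void print(std::ostream& os, const T& x)
    {
        print_impl(rank<1>{}, os, x); // start dispatch at the highest rank
    }

    int main()
    {
        print(std::cout, 42);                        // streamable -> rank<1> overload
        std::cout << "\n";
        print(std::cout, std::vector<int>{1, 2, 3}); // not streamable -> rank<0> overload
        std::cout << "\n";
        return 0;
    }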
@@ -184,6 +184,12 @@ struct value
         {
         }
         explicit binary(std::size_t s) : base(s) {}
+        friend std::ostream& operator<<(std::ostream& os, const binary& obj)
+        {
+            os << "{binary_object: " << obj.size() << "}";
+            return os;
+        }
     };
     value() = default;
...
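The new hidden-friend operator<< gives value::binary a compact printable form ("{binary_object: <size>}") instead of streaming raw bytes. The same idiom sketched on a stand-in type, purely as an illustration (not the MIGraphX class):

    // Hidden-friend stream insertion: found via ADL, prints a summary rather than raw bytes.
    #include <cstddef>
    #include <iostream>
    #include <vector>

    struct binary_blob
    {
        std::vector<unsigned char> data;
        std::size_t size() const { return data.size(); }

        friend std::ostream& operator<<(std::ostream& os, const binary_blob& obj)
        {
            os << "{binary_object: " << obj.size() << "}";
            return os;
        }
    };

    int main()
    {
        binary_blob b{{0xde, 0xad, 0xbe, 0xef}};
        std::cout << b << "\n"; // prints {binary_object: 4}
        return 0;
    }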
@@ -385,9 +385,13 @@ instruction_ref module::move_instruction(instruction_ref src, instruction_ref ds
 instruction_ref module::move_instructions(instruction_ref src, instruction_ref dst)
 {
-    this->move_instruction(src, dst);
     for(auto ins : src->inputs())
-        this->move_instruction(ins, src);
+    {
+        if(not contains(this->impl->instructions, ins))
+            continue;
+        this->move_instructions(ins, dst);
+    }
+    this->move_instruction(src, dst);
     return src;
 }
...
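move_instructions now recurses through src's inputs first, skipping anything not owned by this module, and moves src itself last, so the whole dependency chain lands before dst in topological order; the old version only hoisted the direct inputs. A toy sketch of that ordering idea, using a plain map instead of the real module and instruction types:

    // Recursively emit a node's dependencies before the node itself (topological order),
    // mirroring the "move inputs first, then the instruction" structure above.
    #include <iostream>
    #include <map>
    #include <set>
    #include <vector>

    using graph = std::map<char, std::vector<char>>; // node -> inputs

    void emit(const graph& g, char node, std::set<char>& done, std::vector<char>& order)
    {
        if(done.count(node) != 0)
            return;
        for(char in : g.at(node)) // dependencies first...
            emit(g, in, done, order);
        done.insert(node);
        order.push_back(node); // ...then the node itself
    }

    int main()
    {
        graph g = {{'a', {}}, {'b', {'a'}}, {'c', {'a', 'b'}}};
        std::set<char> done;
        std::vector<char> order;
        emit(g, 'c', done, order);
        for(char n : order)
            std::cout << n << ' '; // prints: a b c
        std::cout << "\n";
        return 0;
    }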
@@ -24,7 +24,7 @@
 #include <migraphx/onnx/op_parser.hpp>
 #include <migraphx/ranges.hpp>
 #include <migraphx/make_op.hpp>
-#include <migraphx/op/batch_norm_inference.hpp>
+#include <migraphx/instruction.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -36,28 +36,63 @@ struct parse_batchnorm : op_parser<parse_batchnorm>
     instruction_ref parse(const op_desc& /*opd*/,
                           const onnx_parser& parser,
-                          onnx_parser::node_info info,
-                          const std::vector<instruction_ref>& args) const
+                          const onnx_parser::node_info& info,
+                          std::vector<instruction_ref> args) const
     {
         float epsilon = 1e-5f;
-        float momentum = 0.9f;
-        op::batch_norm_inference::bn_infer_mode_t bn_mode = op::batch_norm_inference::spatial;
         if(contains(info.attributes, "epsilon"))
         {
             epsilon = parser.parse_value(info.attributes.at("epsilon")).at<float>();
         }
-        if(contains(info.attributes, "momentum"))
-        {
-            momentum = parser.parse_value(info.attributes.at("momentum")).at<float>();
-        }
-        if(contains(info.attributes, "spatial"))
-        {
-            bn_mode = (parser.parse_value(info.attributes.at("spatial")).at<uint64_t>() > 0)
-                          ? op::batch_norm_inference::spatial
-                          : op::batch_norm_inference::per_activation;
-        }
-        op::batch_norm_inference op{epsilon, momentum, bn_mode};
-        return info.add_instruction(op, args);
+        auto x_lens = args[0]->get_shape().lens();
+        auto x_type = args[0]->get_shape().type();
+        if(std::any_of(args.cbegin() + 1, args.cend(), [](auto a) {
+               return a->get_shape().lens().size() != 1;
+           }))
+        {
+            MIGRAPHX_THROW("PARSE_BATCHNORM: argument scale, bias, mean, or var rank != 1");
+        }
+        if(x_lens.size() == 1)
+        {
+            auto rt = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {0.5}});
+            auto eps = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {epsilon}});
+            auto n0 = info.add_broadcastable_binary_op("sub", args[0], args[3]);
+            auto d0 = info.add_broadcastable_binary_op("add", args[4], eps);
+            auto d1 = info.add_broadcastable_binary_op("pow", d0, rt);
+            auto div0 = info.add_broadcastable_binary_op("div", n0, d1);
+            auto r0 = info.add_broadcastable_binary_op("mul", div0, args[1]);
+            return info.add_broadcastable_binary_op("add", r0, args[2]);
+        }
+        else if(x_lens.size() > 2)
+        {
+            // unsqueeze tensors of shape (C) to broadcast correctly
+            std::vector<int64_t> unsqueeze_axes(x_lens.size() - 2);
+            std::iota(unsqueeze_axes.begin(), unsqueeze_axes.end(), 1);
+            auto rt = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {0.5}});
+            auto eps = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {epsilon}});
+            auto scale_unsqueeze = info.add_instruction(
+                migraphx::make_op("unsqueeze", {{"axes", unsqueeze_axes}}), args[1]);
+            auto bias_unsqueeze = info.add_instruction(
+                migraphx::make_op("unsqueeze", {{"axes", unsqueeze_axes}}), args[2]);
+            auto mean_unsqueeze = info.add_instruction(
+                migraphx::make_op("unsqueeze", {{"axes", unsqueeze_axes}}), args[3]);
+            auto var_unsqueeze = info.add_instruction(
+                migraphx::make_op("unsqueeze", {{"axes", unsqueeze_axes}}), args[4]);
+            auto numer = info.add_broadcastable_binary_op("sub", args[0], mean_unsqueeze);
+            auto var_eps = info.add_broadcastable_binary_op("add", var_unsqueeze, eps);
+            auto denom = info.add_broadcastable_binary_op("pow", var_eps, rt);
+            auto div0 = info.add_broadcastable_binary_op("div", numer, denom);
+            auto r0 = info.add_broadcastable_binary_op("mul", div0, scale_unsqueeze);
+            return info.add_broadcastable_binary_op("add", r0, bias_unsqueeze);
+        }
+        else
+        {
+            // num dims either 0 or 2
+            MIGRAPHX_THROW("PARSE_BATCHNORM: rank " + std::to_string(x_lens.size()) +
+                           " input tensor, unhandled data format");
+        }
     }
 };
...
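With the dedicated batch_norm_inference operator removed, the parser now emits the inference formula directly as elementwise ops: y = scale * (x - mean) / sqrt(var + eps) + bias, with the rank-1 scale/bias/mean/var tensors unsqueezed so they broadcast over the trailing spatial axes. A standalone reference computation for a single NCHW element (illustrative numbers only):

    // Reference batch-norm inference for one element:
    // y[n][c][h][w] = scale[c] * (x[n][c][h][w] - mean[c]) / sqrt(var[c] + eps) + bias[c]
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main()
    {
        float eps = 1e-5f;
        std::vector<float> scale = {2.0f}, bias = {1.0f}, mean = {0.5f}, var = {4.0f}; // one channel
        float x = 3.0f;    // x[n][0][h][w]
        std::size_t c = 0; // channel index
        float y = scale[c] * (x - mean[c]) / std::sqrt(var[c] + eps) + bias[c];
        std::printf("%f\n", y); // approximately 2 * 2.5 / 2 + 1 = 3.5
        return 0;
    }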
@@ -72,7 +72,7 @@ bool memory_coloring_impl::allocate(interval_ptr interval)
     if(conflict_table.find(vn) != conflict_table.end())
     {
-        std::set<int>& vn_set = conflict_table[vn];
+        const std::set<int>& vn_set = conflict_table[vn];
         for(const auto& iter : vn_set)
         {
             live_range* range = live_ranges[iter];
@@ -267,8 +267,8 @@ void memory_coloring_impl::verify()
 {
     for(int i = 0; i < num_of_lives; ++i)
     {
-        live_interval& interval = live_intervals[i];
-        live_range& segment = interval.segment;
+        const live_interval& interval = live_intervals[i];
+        const live_range& segment = interval.segment;
         if(segment.begin == invalid_offset)
         {
@@ -284,7 +284,7 @@ void memory_coloring_impl::verify()
         int vn = segment.vn;
         if(conflict_table.find(vn) != conflict_table.end())
         {
-            std::set<int>& vn_set = conflict_table[vn];
+            const std::set<int>& vn_set = conflict_table[vn];
             for(const auto& iter : vn_set)
             {
                 live_range* range = live_ranges[iter];
@@ -319,8 +319,8 @@ void memory_coloring_impl::dump_intervals()
     {
         std::cout << " segment:" << i;
         std::cout << " =>";
-        std::set<int>& table = conflict_table[i];
-        for(auto& iter : table)
+        const std::set<int>& table = conflict_table[i];
+        for(const auto& iter : table)
         {
             std::cout << (iter) << ",";
         }
@@ -357,7 +357,7 @@ void live_interval::dump()
     std::cout << "id:" << id;
     segment.dump();
     std::cout << " uses:";
-    for(auto& iter : use_points)
+    for(const auto& iter : use_points)
     {
         std::cout << " " << get_ins_enum(iter) << ",";
     }
...
@@ -73,7 +73,7 @@ void insert_submod_allocations(instruction_ref ins, module& mod, const allocatio
         name_shapes.insert(ps.begin(), ps.end());
     }
-    for(auto& pn : name_shapes)
+    for(const auto& pn : name_shapes)
     {
         const auto& s = pn.second;
         instruction_ref output{};
...
@@ -57,12 +57,14 @@ auto conv_const_weights()
 auto reduction() { return match::name_contains("reduce"); }
+// conv(x, w) * a => conv(x, a * w)
 struct find_mul_conv
 {
     auto matcher() const
     {
-        return match::name("mul")(match::either_arg(0, 1)(conv_const_weights().bind("conv"),
-                                                          match::name("broadcast").bind("a")));
+        return match::name("mul")(
+            match::either_arg(0, 1)(conv_const_weights().bind("conv"),
+                                    match::name("broadcast", "multibroadcast").bind("a")));
     }
     void apply(module& m, const match::matcher_result& r) const
@@ -72,14 +74,35 @@ struct find_mul_conv
         auto a_ins = r.instructions["a"];
         auto w_ins = r.instructions["w"];
-        auto broadcast_op = any_cast<op::broadcast>(a_ins->get_operator());
-        if(broadcast_op.axis != 1)
+        const auto& a_input_lens = a_ins->inputs().front()->get_shape().lens();
+        std::size_t num_not_one_dims = std::count_if(
+            a_input_lens.cbegin(), a_input_lens.cend(), [](auto dim) { return dim != 1; });
+        if(num_not_one_dims > 1)
+            return;
+        // check broadcasted along channels
+        const auto& a_lens = a_ins->get_shape().lens();
+        const auto& a_strides = a_ins->get_shape().strides();
+        auto is_broadcasted_axis = [](auto len, auto stride) { return len == 1 or stride == 0; };
+        if(a_strides.at(1) != 1)
+            return;
+        if(not is_broadcasted_axis(a_lens.front(), a_strides.front()))
+            return;
+        if(not std::equal(a_lens.begin() + 2,
+                          a_lens.end(),
+                          a_strides.begin() + 2,
+                          a_strides.end(),
+                          is_broadcasted_axis))
             return;
+        auto sq = m.insert_instruction(ins, make_op("squeeze"), a_ins->inputs().front());
         auto new_a = m.insert_instruction(
-            ins,
-            make_op("broadcast", {{"axis", 0}, {"out_lens", w_ins->get_shape().lens()}}),
-            a_ins->inputs().front());
+            ins, make_op("broadcast", {{"axis", 0}, {"out_lens", w_ins->get_shape().lens()}}), sq);
         auto new_mul = m.insert_instruction(ins, make_op("mul"), new_a, w_ins);
         auto new_conv = m.insert_instruction(
             ins, conv_ins->get_operator(), conv_ins->inputs().front(), new_mul);
@@ -412,6 +435,24 @@ struct find_concat_op
     }
 };
+void move_instructions_back(module& m, instruction_ref pos, std::vector<instruction_ref> inss)
+{
+    auto start = range(m.begin(), pos);
+    for(auto ins : iterator_for(start))
+    {
+        auto it = std::find(inss.begin(), inss.end(), ins);
+        if(it != inss.end())
+            inss.erase(it);
+    }
+    for(auto ins : inss)
+    {
+        if(not m.has_instruction(ins))
+            continue;
+        move_instructions_back(m, pos, ins->inputs());
+        m.move_instruction(ins, pos);
+    }
+}
 std::vector<instruction_ref> get_splits(instruction_ref ins)
 {
     std::vector<instruction_ref> result;
@@ -587,8 +628,7 @@ struct find_splits
         }))
             return;
-        for(auto data : data_args)
-            m.move_instructions(data, ins);
+        move_instructions_back(m, ins, data_args);
         auto slice_op = any_cast<op::slice>(splits.front()->get_operator());
         assert(not slice_op.axes.empty());
@@ -841,8 +881,7 @@ struct find_conv_dot_horiz_fusion
             concat_axis = axis;
         }
-        for(auto arg : args)
-            m.move_instructions(arg, input);
+        move_instructions_back(m, input, args);
         // TODO: Check if axes match
         auto concat =
             m.insert_instruction(input, make_op("concat", {{"axis", concat_axis}}), args);
...
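The find_mul_conv rewrite relies on convolution being linear in the weights: multiplying the output by a per-output-channel constant a is the same as scaling each filter by a, hence the comment conv(x, w) * a => conv(x, a * w). The new stride test checks that a really is broadcast along the channel axis only: every axis except axis 1 must have length 1 or stride 0. A standalone sketch of that check with hypothetical lens/strides values:

    // Broadcast-along-channels check: every axis except axis 1 must have len == 1 or stride == 0.
    #include <cstddef>
    #include <iostream>
    #include <vector>

    bool broadcast_along_channels(const std::vector<std::size_t>& lens,
                                  const std::vector<std::size_t>& strides)
    {
        auto is_broadcasted_axis = [](std::size_t len, std::size_t stride) {
            return len == 1 or stride == 0;
        };
        if(strides.at(1) != 1)
            return false; // the channel axis must be the one real (densely stored) axis
        if(not is_broadcasted_axis(lens.front(), strides.front()))
            return false; // batch axis
        for(std::size_t i = 2; i < lens.size(); ++i)
            if(not is_broadcasted_axis(lens[i], strides[i])) // spatial axes
                return false;
        return true;
    }

    int main()
    {
        // e.g. a shape-(C) tensor broadcast to NCHW: lens {1, 3, 4, 4}, strides {0, 1, 0, 0}
        std::cout << broadcast_along_channels({1, 3, 4, 4}, {0, 1, 0, 0}) << "\n";   // 1
        std::cout << broadcast_along_channels({2, 3, 4, 4}, {48, 16, 4, 1}) << "\n"; // 0
        return 0;
    }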
@@ -35,6 +35,7 @@ add_library(migraphx_cpu
     dnnl.cpp
     eltwise.cpp
     erf.cpp
+    fmod.cpp
     fuse_ops.cpp
     gather.cpp
     gemm.cpp
@@ -42,6 +43,7 @@ add_library(migraphx_cpu
     logsoftmax.cpp
     lowering.cpp
     lrn.cpp
+    mod.cpp
     preallocate.cpp
     pooling.cpp
     reduction.cpp
...
@@ -21,22 +21,16 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-#ifndef MIGRAPHX_GUARD_RTGLIB_COS_HPP
-#define MIGRAPHX_GUARD_RTGLIB_COS_HPP
-#include <migraphx/gpu/oper.hpp>
-#include <migraphx/gpu/device/cos.hpp>
+#include <migraphx/config.hpp>
+#include <migraphx/cpu/pointwise.hpp>
+#include <migraphx/op/fmod.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-struct hip_cos : unary_device<hip_cos, device::cos>
-{
-};
-} // namespace gpu
+namespace cpu {
+template struct cpu_binary<op::fmod>;
+} // namespace cpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
-#endif
@@ -43,6 +43,8 @@
 #include <migraphx/op/argmax.hpp>
 #include <migraphx/op/argmin.hpp>
 #include <migraphx/op/rnn_var_sl_last_output.hpp>
+#include <migraphx/op/mod.hpp>
+#include <migraphx/op/fmod.hpp>
 #include <migraphx/shape_for_each.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/par_dfor.hpp>
...
@@ -21,22 +21,16 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-#ifndef MIGRAPHX_GUARD_RTGLIB_EXP_HPP
-#define MIGRAPHX_GUARD_RTGLIB_EXP_HPP
-#include <migraphx/gpu/oper.hpp>
-#include <migraphx/gpu/device/exp.hpp>
+#include <migraphx/config.hpp>
+#include <migraphx/cpu/pointwise.hpp>
+#include <migraphx/op/mod.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-struct hip_exp : unary_device<hip_exp, device::exp>
-{
-};
-} // namespace gpu
+namespace cpu {
+template struct cpu_binary<op::mod>;
+} // namespace cpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
-#endif
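The new fmod.cpp and mod.cpp translation units explicitly instantiate cpu_binary for op::fmod and op::mod, which presumably gives the CPU target an elementwise kernel built around each op's apply() lambda. Roughly what such an elementwise binary kernel does, sketched with std::transform (a plain analogue, not the cpu_binary implementation):

    // Elementwise application of a binary op's lambda over two buffers,
    // analogous in spirit to a CPU pointwise kernel for op::mod.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main()
    {
        auto mod_op = [](auto x, auto y) { return std::fmod((std::remainder(x, y)) + y, y); };
        std::vector<double> a = {-7.0, 7.0, -7.0};
        std::vector<double> b = {3.0, 3.0, -3.0};
        std::vector<double> out(a.size());
        std::transform(a.begin(), a.end(), b.begin(), out.begin(), mod_op);
        for(double v : out)
            std::printf("%g ", v); // prints: 2 1 -1
        std::printf("\n");
        return 0;
    }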
@@ -39,81 +39,9 @@ file(GLOB KERNEL_FILES ${CONFIGURE_DEPENDS}
 message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
 add_embed_library(migraphx_kernels ${KERNEL_FILES})
-add_library(migraphx_device
-    device/acos.cpp
-    device/acosh.cpp
-    device/add.cpp
-    device/add_clip.cpp
-    device/add_relu.cpp
-    device/add_sigmoid.cpp
-    device/add_tanh.cpp
-    device/argmax.cpp
-    device/argmin.cpp
-    device/asin.cpp
-    device/asinh.cpp
-    device/atan.cpp
-    device/atanh.cpp
-    device/ceil.cpp
-    device/clip.cpp
-    device/concat.cpp
-    device/contiguous.cpp
-    device/convert.cpp
-    device/cos.cpp
-    device/cosh.cpp
-    device/div.cpp
-    device/equal.cpp
-    device/erf.cpp
-    device/exp.cpp
-    device/fill.cpp
-    device/floor.cpp
-    device/gather.cpp
-    device/gelu.cpp
-    device/greater.cpp
-    device/int8_gemm_pack.cpp
-    device/layernorm.cpp
-    device/less.cpp
-    device/log.cpp
-    device/logical_and.cpp
-    device/logical_or.cpp
-    device/logical_xor.cpp
-    device/logsoftmax.cpp
-    device/max.cpp
-    device/min.cpp
-    device/mul.cpp
-    device/mul_add.cpp
-    device/mul_add_relu.cpp
-    device/multinomial.cpp
-    device/nonzero.cpp
-    device/pad.cpp
-    device/pow.cpp
-    device/prelu.cpp
-    device/prefix_scan_sum.cpp
-    device/recip.cpp
-    device/reduce_max.cpp
-    device/reduce_mean.cpp
-    device/reduce_min.cpp
-    device/reduce_sum.cpp
-    device/reduce_prod.cpp
-    device/relu.cpp
-    device/reverse.cpp
-    device/rnn_variable_seq_lens.cpp
-    device/round.cpp
-    device/rsqrt.cpp
-    device/scatter.cpp
-    device/sigmoid.cpp
-    device/sign.cpp
-    device/sin.cpp
-    device/sinh.cpp
-    device/softmax.cpp
-    device/sqdiff.cpp
-    device/sqrt.cpp
-    device/sub.cpp
-    device/tan.cpp
-    device/tanh.cpp
-    device/topk.cpp
-    device/unary_not.cpp
-    device/where.cpp
-)
+file(GLOB DEVICE_GPU_SRCS ${CONFIGURE_DEPENDS} ${CMAKE_CURRENT_SOURCE_DIR}/device/*.cpp)
+add_library(migraphx_device ${DEVICE_GPU_SRCS})
 add_library(compile_for_gpu INTERFACE)
 target_compile_options(compile_for_gpu INTERFACE -std=c++17 -fno-gpu-rdc -Wno-cuda-compat -Wno-unused-command-line-argument -Xclang -fallow-half-arguments-and-returns)
 target_link_libraries(compile_for_gpu INTERFACE hip::device -fno-gpu-rdc -Wno-invalid-command-line-argument -Wno-unused-command-line-argument -Wno-option-ignored)
@@ -151,15 +79,12 @@ add_library(migraphx_gpu
     argmax.cpp
     argmin.cpp
     batch_norm_inference.cpp
-    clip.cpp
     code_object_op.cpp
     compile_ops.cpp
     compile_gen.cpp
     compile_hip.cpp
     compile_hip_code_object.cpp
     compiler.cpp
-    concat.cpp
-    convert.cpp
     convolution.cpp
     deconvolution.cpp
     device_name.cpp
@@ -192,7 +117,6 @@ add_library(migraphx_gpu
     rocblas.cpp
     scatter.cpp
     schedule_model.cpp
-    softmax.cpp
     sync_device.cpp
     target.cpp
     topk.cpp
@@ -207,68 +131,18 @@ function(register_migraphx_gpu_ops PREFIX)
     endforeach()
 endfunction()
 register_migraphx_gpu_ops(hip_
-    acosh
-    acos
-    add
     argmax
     argmin
-    asinh
-    asin
-    atanh
-    atan
-    ceil
-    clip
-    concat
-    convert
-    cosh
-    cos
-    div
-    equal
-    erf
-    exp
-    floor
     gather
-    greater
-    less
-    log
     logsoftmax
-    logical_and
-    logical_or
-    logical_xor
     loop
-    max
-    min
-    mul
     multinomial
     nonzero
     pad
-    pow
-    prelu
     prefix_scan_sum
-    recip
-    reduce_max
-    reduce_mean
-    reduce_min
-    reduce_prod
-    reduce_sum
-    relu
     reverse
-    round
-    rsqrt
     scatter
-    sigmoid
-    sign
-    sinh
-    sin
-    softmax
-    sqdiff
-    sqrt
-    sub
-    tanh
-    tan
     topk
-    unary_not
-    where
 )
 register_migraphx_gpu_ops(miopen_
     abs
@@ -365,9 +239,18 @@ endif()
 include(CheckLibraryExists)
 get_target_property(MIOPEN_LOCATION MIOpen LOCATION)
 check_library_exists(MIOpen "miopenHiddenSetConvolutionFindMode" "${MIOPEN_LOCATION}" HAS_FIND_MODE_API)
+check_library_exists(MIOpen "miopenFindSolutions" "${MIOPEN_LOCATION}" HAS_FIND_2_API)
+if(HAS_FIND_2_API)
+    target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API)
+    message(STATUS "MIGraphx is using Find-2.0 API of MIOpen")
+else()
+    message(STATUS "MIOpen does not have Find-2.0 API")
+endif()
 if(HAS_FIND_MODE_API)
     target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_MODE_API)
-    message(STATUS "MIOpen has find mode api")
+    message(STATUS "MIGraphx is using Find Mode API of MIOpen")
 else()
     message(STATUS "MIOpen does not have find mode api")
 endif()
...