Unverified Commit 66483df6 authored by Chris Austen, committed by GitHub

Merge branch 'develop' into simplify_1_mul_div_ops

parents 9310bff0 40118191
......@@ -268,7 +268,9 @@ jobs:
lcov --directory . --capture --output-file $(pwd)/coverage.info
lcov --remove $(pwd)/coverage.info '/usr/*' --output-file $(pwd)/coverage.info
lcov --list $(pwd)/coverage.info
curl -s https://codecov.io/bash | bash
curl -Os https://uploader.codecov.io/latest/linux/codecov
chmod +x codecov
./codecov -t ${CODECOV_TOKEN}
echo "Uploaded"
linux-fpga:
......@@ -364,5 +366,7 @@ jobs:
# lcov --directory . --capture --output-file $(pwd)/coverage.info
# lcov --remove $(pwd)/coverage.info '/usr/*' --output-file $(pwd)/coverage.info
# lcov --list $(pwd)/coverage.info
# curl -s https://codecov.io/bash | bash
# echo "Uploaded"
\ No newline at end of file
# curl -Os https://uploader.codecov.io/latest/linux/codecov
# chmod +x codecov
# ./codecov -t ${CODECOV_TOKEN}
# echo "Uploaded"
......@@ -26,7 +26,7 @@ on:
required: true
default: '-s'
concurrency: benchmark
concurrency: "perftest-${{ github.head_ref || github.base_ref || 'schedule' }}"
jobs:
release:
......
......@@ -212,6 +212,7 @@ rocm_enable_cppcheck(
ConfigurationNotChecked
unmatchedSuppression
unusedFunction
ctuPointerArith
noExplicitConstructor
passedByValue
unusedStructMember
......
......@@ -86,7 +86,7 @@ RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXR
ADD tools/build_and_test_onnxrt.sh /onnxruntime/build_and_test_onnxrt.sh
RUN cget -p /usr/local install ROCmSoftwarePlatform/llvm-project-mlir@d2cb9e580550e92ab75a0a417e7a4abd02a24edf -DBUILD_MIXR_TARGET=On
RUN cget -p /usr/local install ROCmSoftwarePlatform/llvm-project-mlir@e8e77eb16be413d301ea8509726d47f265d9011f -DBUILD_MIXR_TARGET=On
ENV MIOPEN_FIND_DB_PATH=/tmp/miopen/find-db
ENV MIOPEN_USER_DB_PATH=/tmp/miopen/user-db
......
......@@ -33,7 +33,7 @@ def rocmtestnode(Map conf) {
}
}
node(name) {
withEnv(['HSA_ENABLE_SDMA=0', 'MIOPEN_DEBUG_GCN_ASM_KERNELS=0']) {
withEnv(['HSA_ENABLE_SDMA=0']) {
stage("checkout ${variant}") {
checkout scm
}
......
......@@ -25,6 +25,6 @@ pfultz2/rocm-recipes
facebook/zstd@v1.4.5 -X subdir -DCMAKE_DIR=build/cmake
ccache@v4.1
pcre,pfultz2/pcre@8.45 -H sha256:d6f7182602a775a7d500a0cedca6449af0400c6493951513046d17615ed0bf11
danmar/cppcheck@2.8 -DHAVE_RULES=1
danmar/cppcheck@2.9 -DHAVE_RULES=1
RadeonOpenCompute/rocm-cmake@1ebf7e7bc61bb5e949c171562b421264065230a7 --build
-f requirements.txt
......@@ -84,6 +84,12 @@ argument
Construct an argument from a python buffer. This can include numpy arrays.
.. py:method:: data_ptr()
Returns the address of the underlying argument data.
:rtype: int
.. py:method:: get_shape()
Returns the shape of the argument.
......@@ -113,7 +119,16 @@ argument
:param shape s: Shape of argument to fill.
:param int value: Value to fill in the argument.
:rtype argument
:rtype: argument
.. py:function:: argument_from_pointer(shape, address)
Create an argument from data stored at the given address, without copying.
:param shape shape: Shape of the data stored at the address.
:param long address: Memory address of the data from another source.
:rtype: argument
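A minimal usage sketch (the exact shape constructor keywords and the numpy pointer access below are assumptions for illustration, not taken from this documentation)::

    import numpy as np
    import migraphx

    arr = np.ones((2, 3), dtype=np.float32)
    s = migraphx.shape(lens=[2, 3], type="float_type")      # assumed constructor kwargs
    a = migraphx.argument_from_pointer(s, arr.ctypes.data)  # no copy; arr must stay alive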
target
------
......
......@@ -50,8 +50,8 @@ struct layernorm_matcher
{
return f("div")(arg(0)(x_minus_mean()),
arg(1)(skip_broadcasts(f("sqrt")(
arg(0)(f("add")(either_arg(0, 1)(variance(), has_value(1e-12f))))))));
arg(1)(skip_broadcasts(f("sqrt")(arg(0)(
f("add")(either_arg(0, 1)(variance(), is_constant().bind("eps"))))))));
}
auto matcher() const { return layernorm_onnx(); }
......
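For context, the pattern matched above corresponds to the layernorm normalization step

    (x - mean) / sqrt(variance + eps)

where the epsilon constant, previously required to equal the literal 1e-12, is now bound as "eps" whatever its constant value.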
......@@ -40,7 +40,6 @@ struct fmod : binary<fmod>
a["commutative"] = false;
return a;
}
std::string point_function() const { return "fmod"; }
auto apply() const
{
return [](auto x, auto y) { return std::fmod(x, y); };
......
......@@ -38,9 +38,9 @@ struct mod : binary<mod>
{
auto a = base_attributes();
a["commutative"] = false;
a["point_op"] = "${function:fmod}((${function:remainder}(${0}, ${1})) + ${1}, ${1})";
return a;
}
std::string point_function() const { return "mod"; }
auto apply() const
{
return [](auto x, auto y) { return std::fmod((std::remainder(x, y)) + y, y); };
......
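For reference, the identity used above, fmod(remainder(x, y) + y, y), yields a divisor-signed modulo (Python/numpy semantics), whereas fmod alone keeps the sign of the dividend. A small self-contained check (a sketch added here, not part of the commit):

    import math

    x, y = -3.0, 2.0
    # mod(x, y) = fmod(remainder(x, y) + y, y)
    assert math.fmod(math.remainder(x, y) + y, y) == 1.0  # matches -3 % 2 == 1 in Python
    assert math.fmod(x, y) == -1.0                        # plain fmod keeps the dividend's sign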
......@@ -24,7 +24,7 @@
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/op/batch_norm_inference.hpp>
#include <migraphx/instruction.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -36,28 +36,63 @@ struct parse_batchnorm : op_parser<parse_batchnorm>
instruction_ref parse(const op_desc& /*opd*/,
const onnx_parser& parser,
onnx_parser::node_info info,
const std::vector<instruction_ref>& args) const
const onnx_parser::node_info& info,
std::vector<instruction_ref> args) const
{
float epsilon = 1e-5f;
float momentum = 0.9f;
op::batch_norm_inference::bn_infer_mode_t bn_mode = op::batch_norm_inference::spatial;
float epsilon = 1e-5f;
if(contains(info.attributes, "epsilon"))
{
epsilon = parser.parse_value(info.attributes.at("epsilon")).at<float>();
}
if(contains(info.attributes, "momentum"))
auto x_lens = args[0]->get_shape().lens();
auto x_type = args[0]->get_shape().type();
if(std::any_of(args.cbegin() + 1, args.cend(), [](auto a) {
return a->get_shape().lens().size() != 1;
}))
{
MIGRAPHX_THROW("PARSE_BATCHNORM: argument scale, bias, mean, or var rank != 1");
}
if(x_lens.size() == 1)
{
auto rt = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {0.5}});
auto eps = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {epsilon}});
auto n0 = info.add_broadcastable_binary_op("sub", args[0], args[3]);
auto d0 = info.add_broadcastable_binary_op("add", args[4], eps);
auto d1 = info.add_broadcastable_binary_op("pow", d0, rt);
auto div0 = info.add_broadcastable_binary_op("div", n0, d1);
auto r0 = info.add_broadcastable_binary_op("mul", div0, args[1]);
return info.add_broadcastable_binary_op("add", r0, args[2]);
}
else if(x_lens.size() > 2)
{
momentum = parser.parse_value(info.attributes.at("momentum")).at<float>();
// unsqueeze tensors of shape (C) to broadcast correctly
std::vector<int64_t> unsqueeze_axes(x_lens.size() - 2);
std::iota(unsqueeze_axes.begin(), unsqueeze_axes.end(), 1);
auto rt = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {0.5}});
auto eps = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {epsilon}});
auto scale_unsqueeze = info.add_instruction(
migraphx::make_op("unsqueeze", {{"axes", unsqueeze_axes}}), args[1]);
auto bias_unsqueeze = info.add_instruction(
migraphx::make_op("unsqueeze", {{"axes", unsqueeze_axes}}), args[2]);
auto mean_unsqueeze = info.add_instruction(
migraphx::make_op("unsqueeze", {{"axes", unsqueeze_axes}}), args[3]);
auto var_unsqueeze = info.add_instruction(
migraphx::make_op("unsqueeze", {{"axes", unsqueeze_axes}}), args[4]);
auto numer = info.add_broadcastable_binary_op("sub", args[0], mean_unsqueeze);
auto var_eps = info.add_broadcastable_binary_op("add", var_unsqueeze, eps);
auto denom = info.add_broadcastable_binary_op("pow", var_eps, rt);
auto div0 = info.add_broadcastable_binary_op("div", numer, denom);
auto r0 = info.add_broadcastable_binary_op("mul", div0, scale_unsqueeze);
return info.add_broadcastable_binary_op("add", r0, bias_unsqueeze);
}
if(contains(info.attributes, "spatial"))
else
{
bn_mode = (parser.parse_value(info.attributes.at("spatial")).at<uint64_t>() > 0)
? op::batch_norm_inference::spatial
: op::batch_norm_inference::per_activation;
// here the rank is either 0 or 2, which is not a handled data format
MIGRAPHX_THROW("PARSE_BATCHNORM: rank " + std::to_string(x_lens.size()) +
" input tensor, unhandled data format");
}
op::batch_norm_inference op{epsilon, momentum, bn_mode};
return info.add_instruction(op, args);
}
};
......
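The parser above lowers batch normalization inference into elementwise ops computing scale * (x - mean) / sqrt(var + eps) + bias, unsqueezing the per-channel tensors so they broadcast over the spatial axes. A numpy sketch of the same decomposition for the rank > 2 (e.g. NCHW) case, written here as a hedged reference only:

    import numpy as np

    n, c, h, w = 2, 3, 4, 4
    x = np.random.rand(n, c, h, w).astype(np.float32)
    scale, bias, mean, var = (np.random.rand(c).astype(np.float32) for _ in range(4))
    eps = 1e-5

    # unsqueeze (C) -> (C, 1, 1) so the per-channel tensors broadcast over H and W
    scale_u, bias_u, mean_u, var_u = (t.reshape(c, 1, 1) for t in (scale, bias, mean, var))
    y = (x - mean_u) / np.power(var_u + eps, 0.5) * scale_u + bias_u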
......@@ -72,7 +72,7 @@ bool memory_coloring_impl::allocate(interval_ptr interval)
if(conflict_table.find(vn) != conflict_table.end())
{
std::set<int>& vn_set = conflict_table[vn];
const std::set<int>& vn_set = conflict_table[vn];
for(const auto& iter : vn_set)
{
live_range* range = live_ranges[iter];
......@@ -267,8 +267,8 @@ void memory_coloring_impl::verify()
{
for(int i = 0; i < num_of_lives; ++i)
{
live_interval& interval = live_intervals[i];
live_range& segment = interval.segment;
const live_interval& interval = live_intervals[i];
const live_range& segment = interval.segment;
if(segment.begin == invalid_offset)
{
......@@ -284,7 +284,7 @@ void memory_coloring_impl::verify()
int vn = segment.vn;
if(conflict_table.find(vn) != conflict_table.end())
{
std::set<int>& vn_set = conflict_table[vn];
const std::set<int>& vn_set = conflict_table[vn];
for(const auto& iter : vn_set)
{
live_range* range = live_ranges[iter];
......@@ -319,8 +319,8 @@ void memory_coloring_impl::dump_intervals()
{
std::cout << " segment:" << i;
std::cout << " =>";
std::set<int>& table = conflict_table[i];
for(auto& iter : table)
const std::set<int>& table = conflict_table[i];
for(const auto& iter : table)
{
std::cout << (iter) << ",";
}
......@@ -357,7 +357,7 @@ void live_interval::dump()
std::cout << "id:" << id;
segment.dump();
std::cout << " uses:";
for(auto& iter : use_points)
for(const auto& iter : use_points)
{
std::cout << " " << get_ins_enum(iter) << ",";
}
......
......@@ -264,12 +264,13 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
py::class_<migraphx::argument>(m, "argument", py::buffer_protocol())
.def_buffer([](migraphx::argument& x) -> py::buffer_info { return to_buffer_info(x); })
.def("__init__",
[](migraphx::argument& x, py::buffer b) {
py::buffer_info info = b.request();
new(&x) migraphx::argument(to_shape(info), info.ptr);
})
.def(py::init([](py::buffer b) {
py::buffer_info info = b.request();
return migraphx::argument(to_shape(info), info.ptr);
}))
.def("get_shape", &migraphx::argument::get_shape)
.def("data_ptr",
[](migraphx::argument& x) { return reinterpret_cast<std::uintptr_t>(x.data()); })
.def("tolist",
[](migraphx::argument& x) {
py::list l{x.get_shape().elements()};
......
......@@ -73,7 +73,7 @@ void insert_submod_allocations(instruction_ref ins, module& mod, const allocatio
name_shapes.insert(ps.begin(), ps.end());
}
for(auto& pn : name_shapes)
for(const auto& pn : name_shapes)
{
const auto& s = pn.second;
instruction_ref output{};
......
......@@ -57,12 +57,14 @@ auto conv_const_weights()
auto reduction() { return match::name_contains("reduce"); }
// conv(x, w) * a => conv(x, a * w)
struct find_mul_conv
{
auto matcher() const
{
return match::name("mul")(match::either_arg(0, 1)(conv_const_weights().bind("conv"),
match::name("broadcast").bind("a")));
return match::name("mul")(
match::either_arg(0, 1)(conv_const_weights().bind("conv"),
match::name("broadcast", "multibroadcast").bind("a")));
}
void apply(module& m, const match::matcher_result& r) const
......@@ -72,14 +74,35 @@ struct find_mul_conv
auto a_ins = r.instructions["a"];
auto w_ins = r.instructions["w"];
auto broadcast_op = any_cast<op::broadcast>(a_ins->get_operator());
if(broadcast_op.axis != 1)
const auto& a_input_lens = a_ins->inputs().front()->get_shape().lens();
std::size_t num_not_one_dims = std::count_if(
a_input_lens.cbegin(), a_input_lens.cend(), [](auto dim) { return dim != 1; });
if(num_not_one_dims > 1)
return;
// check broadcasted along channels
const auto& a_lens = a_ins->get_shape().lens();
const auto& a_strides = a_ins->get_shape().strides();
auto is_broadcasted_axis = [](auto len, auto stride) { return len == 1 or stride == 0; };
if(a_strides.at(1) != 1)
return;
if(not is_broadcasted_axis(a_lens.front(), a_strides.front()))
return;
if(not std::equal(a_lens.begin() + 2,
a_lens.end(),
a_strides.begin() + 2,
a_strides.end(),
is_broadcasted_axis))
return;
auto sq = m.insert_instruction(ins, make_op("squeeze"), a_ins->inputs().front());
auto new_a = m.insert_instruction(
ins,
make_op("broadcast", {{"axis", 0}, {"out_lens", w_ins->get_shape().lens()}}),
a_ins->inputs().front());
ins, make_op("broadcast", {{"axis", 0}, {"out_lens", w_ins->get_shape().lens()}}), sq);
auto new_mul = m.insert_instruction(ins, make_op("mul"), new_a, w_ins);
auto new_conv = m.insert_instruction(
ins, conv_ins->get_operator(), conv_ins->inputs().front(), new_mul);
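A hedged shape walkthrough of the rewrite above (the shapes are illustrative, not taken from a test): if a is a per-output-channel constant of shape (1, K, 1, 1) broadcast to the conv output (N, K, H, W), its strides are (0, 1, 0, 0), so only the channel axis is non-broadcast; the squeeze reduces it to (K,), which is then broadcast along axis 0 of the weight shape (K, C, kh, kw) and folded into the weights, giving conv(x, a * w).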
......@@ -1052,20 +1075,35 @@ struct find_split_reshape
auto rsp_lens = rsp->get_shape().lens();
auto rsp_strides = rsp->get_shape().strides();
rsp_strides.insert(rsp_strides.begin(), rsp_strides[0] * rsp_lens[0]);
auto ait = std::find(rsp_strides.begin(), rsp_strides.end(), slc_dim_size);
auto ait = std::find(rsp_strides.begin(), rsp_strides.end(), slc_dim_size);
int rsp_axis = -1;
if(ait == rsp_strides.end())
{
return;
}
int rsp_axis = std::distance(rsp_strides.begin(), ait);
else if(ait == rsp_strides.end() - 1)
{
// Edge case: when slc_dim_size == 1 it can match the trailing stride of 1.
// Lengths should then accumulate from the last dimension, so step back one
// position to avoid going out of bounds.
assert(slc_dim_size == 1);
rsp_axis = std::distance(rsp_strides.begin(), ait) - 1;
}
else
{
rsp_axis = std::distance(rsp_strides.begin(), ait);
}
// calculate reshape output shape
std::vector<int64_t> vec_dims(vec_rsp.size());
std::transform(vec_rsp.begin(), vec_rsp.end(), vec_dims.begin(), [&](auto is) {
return is->get_shape().lens()[rsp_axis];
});
std::vector<int64_t> rsp_out_lens(rsp_lens.begin(), rsp_lens.end());
rsp_out_lens[rsp_axis] = std::accumulate(vec_dims.begin(), vec_dims.end(), std::int64_t{0});
// insert the reshape instruction and add contiguous if needed
......
......@@ -271,6 +271,44 @@ struct find_nested_slice
}
};
struct find_concat_multibroadcasts
{
auto matcher() const
{
return match::name("concat")(match::all_of[match::inputs()](match::name("multibroadcast")));
}
void apply(module& m, const match::matcher_result& mr) const
{
auto ins = mr.result;
auto op = any_cast<op::concat>(ins->get_operator());
auto out_lens = ins->get_shape().lens();
auto inputs = ins->inputs();
auto in_strides = inputs.front()->get_shape().strides();
// Only apply when concat axis is not a broadcasted dimension
if(std::any_of(inputs.begin(), inputs.end(), [&](auto i) {
return i->get_shape().strides()[op.axis] == 0;
}))
{
return;
}
// Use inputs of multibroadcast ops as inputs to new concat op
std::transform(inputs.begin(), inputs.end(), inputs.begin(), [](auto i) {
return i->inputs().front();
});
// Reduce axis by number of leading broadcasted dimensions
if(inputs.front()->get_shape().lens().size() < out_lens.size())
op.axis -= std::count(in_strides.begin(), in_strides.begin() + op.axis, 0);
auto concat = m.insert_instruction(ins, op, inputs);
m.replace_instruction(
ins, migraphx::make_op("multibroadcast", {{"out_lens", out_lens}}), concat);
}
};
struct find_concat_transpose
{
auto matcher() const
......@@ -764,6 +802,7 @@ void simplify_reshapes::apply(module& m) const
find_reshaper{},
find_transpose{},
find_concat_transpose{},
find_concat_multibroadcasts{},
find_nested_convert{},
find_nested_slice{},
find_nested_concat{},
......
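A hedged shape walkthrough of find_concat_multibroadcasts above (the shapes are illustrative, not from a test): two inputs of shape (4,) multibroadcast to (2, 3, 4) and concatenated along axis 2 have strides (0, 0, 1), so the concat axis is not broadcast; the matcher instead concatenates the original (4,) tensors, lowering the axis by the two leading zero-stride dimensions (axis 2 -> 0) to produce (8,), and then applies a single multibroadcast to the original output shape (2, 3, 8).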
......@@ -35,6 +35,7 @@ add_library(migraphx_cpu
dnnl.cpp
eltwise.cpp
erf.cpp
fmod.cpp
fuse_ops.cpp
gather.cpp
gemm.cpp
......@@ -42,6 +43,7 @@ add_library(migraphx_cpu
logsoftmax.cpp
lowering.cpp
lrn.cpp
mod.cpp
preallocate.cpp
pooling.cpp
reduction.cpp
......
......@@ -21,22 +21,16 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_COS_HPP
#define MIGRAPHX_GUARD_RTGLIB_COS_HPP
#include <migraphx/gpu/oper.hpp>
#include <migraphx/gpu/device/cos.hpp>
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
#include <migraphx/op/fmod.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace cpu {
struct hip_cos : unary_device<hip_cos, device::cos>
{
};
template struct cpu_binary<op::fmod>;
} // namespace gpu
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
......@@ -43,6 +43,8 @@
#include <migraphx/op/argmax.hpp>
#include <migraphx/op/argmin.hpp>
#include <migraphx/op/rnn_var_sl_last_output.hpp>
#include <migraphx/op/mod.hpp>
#include <migraphx/op/fmod.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/par_dfor.hpp>
......
......@@ -21,22 +21,16 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_EXP_HPP
#define MIGRAPHX_GUARD_RTGLIB_EXP_HPP
#include <migraphx/gpu/oper.hpp>
#include <migraphx/gpu/device/exp.hpp>
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
#include <migraphx/op/mod.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace cpu {
struct hip_exp : unary_device<hip_exp, device::exp>
{
};
template struct cpu_binary<op::mod>;
} // namespace gpu
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif