Merge branch 'develop' into fuse-dot-weights

db1a954c · Paul · f92195d0 · 333860ce · db1a954c · db1a954c
Commit db1a954c authored Sep 15, 2022 by Paul
20 changed files
--- a/src/onnx/parse_resize.cpp
+++ b/src/onnx/parse_resize.cpp
@@ -56,7 +56,7 @@ const auto& get_nearest_op(const std::string& mode)
             return static_cast<std::size_t>(std::ceil((val)));
         }}};
-    if(!contains(nearest_ops, mode))
+    if(not contains(nearest_ops, mode))
    {
        MIGRAPHX_THROW("PARSE_RESIZE: nearest_mode " + mode + " not supported!");
    }
@@ -86,7 +86,7 @@ const auto& get_original_idx_op(const std::string& mode)
             return (idx + 0.5) / scale;
         }}};
-    if(!contains(idx_ops, mode))
+    if(not contains(idx_ops, mode))
    {
        MIGRAPHX_THROW("PARSE_RESIZE: coordinate_transformation_mode " + mode + " not supported!");
    }

--- a/src/onnx/parse_reversesequence.cpp
+++ b/src/onnx/parse_reversesequence.cpp
@@ -31,7 +31,7 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace onnx {
-//!  Parser for ReverseSequence ONNX operator.
+// Parser for ReverseSequence ONNX operator.
 /*!
  Reverses the data along the time axis for the batches along the batch axis.
  The sequence lengths can be given to reverse up to the given length for each batch, keeping the

--- a/src/opt/memory_coloring.cpp
+++ b/src/opt/memory_coloring.cpp
@@ -29,7 +29,7 @@ inline namespace MIGRAPHX_INLINE_NS {
 void memory_coloring::apply(module& m) const
 {
-    if(!enabled(MIGRAPHX_DISABLE_MEMORY_COLORING{}))
+    if(not enabled(MIGRAPHX_DISABLE_MEMORY_COLORING{}))
    {
        memory_coloring_impl opt(&m, allocation_op, verify);
        opt.run();

--- a/src/opt/memory_coloring_impl.cpp
+++ b/src/opt/memory_coloring_impl.cpp
@@ -42,7 +42,7 @@ void memory_coloring_impl::run()
    {
        MIGRAPHX_DEBUG(dump_intervals());
        // Coloring
-        while(!alloc_queue.empty())
+        while(not alloc_queue.empty())
        {
            interval_ptr interval = alloc_queue.top();
            allocate(interval);
@@ -96,7 +96,7 @@ bool memory_coloring_impl::allocate(interval_ptr interval)
    }
    std::size_t offset = 0;
-    while(!conflict_queue.empty())
+    while(not conflict_queue.empty())
    {
        live_range* range       = conflict_queue.top();
        std::size_t iter_offset = range->offset;
@@ -149,7 +149,7 @@ void memory_coloring_impl::build()
        {
            def_interval = instr2_live[p_iter];
            bool is_lit  = is_literal(iter);
-            if(is_allocate(iter) || is_lit)
+            if(is_allocate(iter) or is_lit)
            {
                live_range& range        = def_interval->segment;
                def_interval->result     = iter->get_shape();
@@ -157,12 +157,12 @@ void memory_coloring_impl::build()
                range.begin              = cur_points;
                def_interval->def_point  = cur_points;
                range.size               = (iter->get_shape()).bytes();
-                if(!is_lit || unify_literals)
+                if(not is_lit or unify_literals)
                    alloc_queue.push(def_interval);
                live_set.erase(range.vn);
            }
        }
-        else if(!is_param(iter) && !is_outline(iter) && !is_check_context(iter))
+        else if(not is_param(iter) && not is_outline(iter) && not is_check_context(iter))
        {
            is_dead = true;
        }
@@ -179,7 +179,7 @@ void memory_coloring_impl::build()
            if(not p_mod->has_instruction(arg))
                continue;
-            if(is_param(arg) || is_outline(arg))
+            if(is_param(arg) or is_outline(arg))
            {
                if(is_output_param(arg))
                    is_dead = false;
@@ -235,7 +235,7 @@ void memory_coloring_impl::rewrite()
            if(interval->get_begin() == invalid_offset)
                continue;
-            if(!unify_literals && interval->is_literal)
+            if(not unify_literals && interval->is_literal)
                continue;
            std::size_t offset = 0;
@@ -272,7 +272,7 @@ void memory_coloring_impl::verify()
            if(segment.begin == invalid_offset)
            {
-                // if(!interval.is_live_on_entry)
+                // if(not interval.is_live_on_entry)
                // MIGRAPHX_THROW("interval is not live on entry");
                continue;
            }
@@ -290,7 +290,7 @@ void memory_coloring_impl::verify()
                    live_range* range = live_ranges[iter];
                    if(range->offset == invalid_offset)
                        continue;
-                    if(!is_disjoin(*range, segment))
+                    if(not is_disjoin(*range, segment))
                        MIGRAPHX_THROW("range and segment is not disjoined");
                }
            }

--- a/src/opt/memory_coloring_impl.hpp
+++ b/src/opt/memory_coloring_impl.hpp
@@ -125,11 +125,11 @@ struct memory_coloring_impl
    static bool is_disjoin(const live_range& range1, const live_range& range2) // Overlap test for two live ranges.
    {
-        if((range1.size == 0) || (range2.size == 0))
+        if((range1.size == 0) or (range2.size == 0)) // An empty range is never reported as disjoint.
            return false;
        auto end1 = range1.offset + range1.size - 1; // Inclusive last offset covered by range1.
        auto end2 = range2.offset + range2.size - 1; // Inclusive last offset covered by range2.
-        return ((end1 < range2.offset) || (end2 < range1.offset));
+        return ((end1 < range2.offset) or (end2 < range1.offset)); // Disjoint iff one range ends before the other begins.
    }
    void verify();
 #ifdef MIGRAPHX_DEBUG_OPT

--- a/src/process.cpp
+++ b/src/process.cpp
@@ -50,7 +50,7 @@ int exec(const std::string& cmd, const std::function<void(const char*)>& std_out
    {
        // TODO: Use execve instead of popen
        std::unique_ptr<FILE, decltype(closer)> pipe(popen(cmd.c_str(), "r"), closer); // NOLINT
-        if(!pipe)
+        if(not pipe)
            MIGRAPHX_THROW("popen() failed: " + cmd);
        std::array<char, 128> buffer;
        while(fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr)

--- a/src/program.cpp
+++ b/src/program.cpp
@@ -78,11 +78,11 @@ program& program::operator=(program p)
 void program::assign(const program& p)
 {
-    if(!impl)
+    if(not impl)
    {
        impl = std::make_unique<program_impl>();
    }
-    else if(!impl->modules.empty())
+    else if(not impl->modules.empty())
    {
        impl->modules.clear();
    }

--- a/src/py/migraphx_py.cpp
+++ b/src/py/migraphx_py.cpp
@@ -83,7 +83,7 @@ void visit_py(T x, F f)
    {
        f(x.template cast<bool>());
    }
-    else if(py::isinstance<py::int_>(x) || py::hasattr(x, "__index__"))
+    else if(py::isinstance<py::int_>(x) or py::hasattr(x, "__index__"))
    {
        f(x.template cast<int>());
    }
@@ -270,6 +270,8 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
                 new(&x) migraphx::argument(to_shape(info), info.ptr);
             })
        .def("get_shape", &migraphx::argument::get_shape)
+        .def("data_ptr",
+             [](migraphx::argument& x) { return reinterpret_cast<std::uintptr_t>(x.data()); })
        .def("tolist",
             [](migraphx::argument& x) {
                 py::list l{x.get_shape().elements()};

--- a/src/quantization.cpp
+++ b/src/quantization.cpp
@@ -70,7 +70,7 @@ void quantize_int8(program& prog,
 {
    std::set<std::string> op_names = {"convolution", "dot"};
    std::set<std::string> input_ins_names(ins_names.begin(), ins_names.end());
-    if(!std::includes(
+    if(not std::includes(
           op_names.begin(), op_names.end(), input_ins_names.begin(), input_ins_names.end()))
    {
        MIGRAPHX_THROW("QUANTIZE_INT8: only support DOT and CONVOLUTION operation");

--- a/src/rewrite_gelu.cpp
+++ b/src/rewrite_gelu.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/rewrite_gelu.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/matcher.hpp>
+#include <migraphx/match/gelu_erf.hpp>
+#include <migraphx/common.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+struct find_gelu_erf // Replaces matches of match::gelu_erf() with x / (1 + exp(-1.702 * x)); half-type x only.
+{
+    auto matcher() const { return match::gelu_erf(); } // The pattern itself is defined in match/gelu_erf.hpp.
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins = r.result; // Matched instruction that gets replaced at the end.
+        auto x   = r.instructions["x"]; // Instruction the matcher bound as "x".
+        if(x->get_shape().type() != migraphx::shape::half_type) // Leave every non-half match untouched.
+            return;
+        auto lit = m.add_literal(literal{shape{x->get_shape().type()}, {1.702f}}); // Scale constant in x's type.
+        auto mul = insert_common_op(m, ins, make_op("mul"), {x, lit}); // 1.702 * x
+        auto sig = m.insert_instruction(ins, make_op("neg"), mul); // -(1.702 * x)
+        sig      = m.insert_instruction(ins, make_op("exp"), sig); // exp(-1.702 * x)
+        auto one = m.add_literal(literal{shape{x->get_shape().type()}, {1.0f}}); // Constant 1 in x's type.
+        sig      = insert_common_op(m, ins, make_op("add"), {sig, one}); // 1 + exp(-1.702 * x)
+        sig      = m.insert_instruction(ins, make_op("div"), x, sig); // x / (1 + exp(-1.702 * x))
+        m.replace_instruction(ins, sig); // Swap the matched instruction for the new expression.
+    }
+};
+void rewrite_gelu::apply(module& m) const { match::find_matches(m, find_gelu_erf{}); } // Pass entry point: runs the find_gelu_erf rewrite over module m.
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/rewrite_pooling.cpp
+++ b/src/rewrite_pooling.cpp
@@ -47,12 +47,12 @@ void rewrite_pooling::apply(module& m) const
        if(not s.standard())
            continue;
        auto&& op = any_cast<op::pooling>(ins->get_operator());
-        if(!std::all_of(op.padding.begin(), op.padding.end(), [](auto i) { return i == 0; }))
+        if(not std::all_of(op.padding.begin(), op.padding.end(), [](auto i) { return i == 0; }))
            continue;
-        if(!std::all_of(op.stride.begin(), op.stride.end(), [](auto i) { return i == 1; }))
+        if(not std::all_of(op.stride.begin(), op.stride.end(), [](auto i) { return i == 1; }))
            continue;
        auto lens = s.lens();
-        if(!std::equal(lens.begin() + 2, lens.end(), op.lengths.begin(), op.lengths.end()))
+        if(not std::equal(lens.begin() + 2, lens.end(), op.lengths.begin(), op.lengths.end()))
            continue;
        std::int64_t n = s.lens()[0];
        std::int64_t c = s.lens()[1];

--- a/src/rewrite_rnn.cpp
+++ b/src/rewrite_rnn.cpp
@@ -214,7 +214,7 @@ void rewrite_rnn::apply_vanilla_rnn(module& m, instruction_ref ins) const
            ih = m.add_literal(migraphx::literal{ih_shape, data});
        }
-        if(!is_forward and variable_seq_len)
+        if(not is_forward and variable_seq_len)
        {
            args[0] =
                m.insert_instruction(ins, make_op("rnn_var_sl_shift_sequence"), args[0], seq_lens);
@@ -520,7 +520,7 @@ void rewrite_rnn::apply_gru(module& m, instruction_ref ins) const
            ih = m.add_literal(migraphx::literal{ih_shape, data});
        }
-        if(!is_forward and variable_seq_len)
+        if(not is_forward and variable_seq_len)
        {
            args[0] =
                m.insert_instruction(ins, make_op("rnn_var_sl_shift_sequence"), args[0], seq_lens);
@@ -977,7 +977,7 @@ void rewrite_rnn::apply_lstm(module& m, instruction_ref ins) const
            pph = args[7];
        }
-        if(!is_forward and variable_seq_len)
+        if(not is_forward and variable_seq_len)
        {
            args[0] =
                m.insert_instruction(ins, make_op("rnn_var_sl_shift_sequence"), args[0], seq_lens);
@@ -1294,11 +1294,11 @@ bool rewrite_rnn::is_variable_seq_lens(const module& m, instruction_ref seq_lens
            std::vector<int64_t> vec_lens;
            arg_lens.visit([&](auto l) { vec_lens.assign(l.begin(), l.end()); });
            int64_t l = 0;
-            if(!vec_lens.empty())
+            if(not vec_lens.empty())
            {
                l = vec_lens[0];
            }
-            if(!std::all_of(vec_lens.begin(), vec_lens.end(), [&](auto v) { return v == l; }))
+            if(not std::all_of(vec_lens.begin(), vec_lens.end(), [&](auto v) { return v == l; }))
            {
                is_var_lens = true;
            }
@@ -1318,7 +1318,7 @@ rewrite_rnn::get_seq_len(const module& m, instruction_ref input, instruction_ref
    bool is_var_lens = is_variable_seq_lens(m, seq_lens);
    auto input_shape = input->get_shape();
    auto length      = input_shape.lens()[0];
-    if(!is_var_lens and seq_lens != m.end())
+    if(not is_var_lens and seq_lens != m.end())
    {
        auto arg_len = seq_lens->eval();
        std::vector<std::size_t> vec_lens;
@@ -1387,7 +1387,7 @@ void rewrite_rnn::replace_last_cell_output(module& m,
    if(variable_seq_len)
    {
-        if(!ins_outputs.empty())
+        if(not ins_outputs.empty())
        {
            cell_outputs = m.insert_instruction(
                std::next(ins),

--- a/src/shape.cpp
+++ b/src/shape.cpp
@@ -477,7 +477,7 @@ bool operator==(const shape::dynamic_dimension& x, const shape::dynamic_dimensio
 bool operator!=(const shape::dynamic_dimension& x, const shape::dynamic_dimension& y)
 {
-    return !(x == y);
+    return not(x == y); // Inequality is defined as the negation of operator==.
 }
 std::ostream& operator<<(std::ostream& os, const shape::dynamic_dimension& x)
 {
@@ -497,7 +497,7 @@ bool operator==(const shape& x, const shape& y)
            x.strides() == y.strides() and x.sub_shapes() == y.sub_shapes());
 }
-bool operator!=(const shape& x, const shape& y) { return !(x == y); }
+bool operator!=(const shape& x, const shape& y) { return not(x == y); } // Negation of the shape operator== defined just above.
 std::ostream& operator<<(std::ostream& os, const shape& x)
 {

--- a/src/simplify_algebra.cpp
+++ b/src/simplify_algebra.cpp
@@ -208,6 +208,42 @@ struct find_mul_add
    }
 };
+struct find_dot_add // Distributes a dot with a constant operand over an add: dot(a, x + b) -> dot(a, x) + dot(a, b).
+{
+    auto matcher() const
+    {
+        return match::name("dot")(match::either_arg(0, 1)( // The add and the constant may be in either operand slot.
+            match::name("add")(
+                match::either_arg(0, 1)(match::any().bind("x"), // "x": the other add input.
+                                        match::any_of(match::is_constant()).bind("b")), // "b": a constant add input.
+                match::none_of(match::args(match::is_constant(), match::is_constant())), // Skip adds whose inputs are both constant.
+                match::used_once()), // Presumably: the add has exactly one user - confirm against matcher.hpp.
+            match::is_constant().bind("a"))); // "a": the constant operand of the dot.
+    }
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins   = r.result; // The matched dot.
+        auto a_ins = r.instructions["a"];
+        auto b_ins = r.instructions["b"];
+        auto x_ins = r.instructions["x"];
+        assert(x_ins != b_ins); // The two add inputs must be distinct instructions.
+        const bool flipped = a_ins == ins->inputs().back(); // True when `a` is the dot's last input, i.e. dot(add, a).
+        auto insert_dot = [&](auto x, auto y) { // Emits a dot that keeps the original operand order.
+            if(flipped)
+                return m.insert_instruction(ins, make_op("dot"), y, x);
+            else
+                return m.insert_instruction(ins, make_op("dot"), x, y);
+        };
+        auto ax_ins = insert_dot(a_ins, x_ins); // dot of `a` with the "x" input.
+        auto ab_ins = insert_dot(a_ins, b_ins); // dot of two constants.
+        m.replace_instruction(ins, make_op("add"), ax_ins, ab_ins); // The original dot becomes the sum of both dots.
+    }
+};
 struct find_add_lit_broadcast
 {
    auto matcher() const
@@ -267,28 +303,26 @@ struct find_double_add_lit_broadcast
 struct find_inner_broadcast
 {
-    auto matcher() const
+    auto matcher() const { return pointwise(match::all_of[match::inputs()](match::broadcast())); } // Every input must satisfy match::broadcast().
-    {
-        return pointwise(
-            match::nargs(2),
-            match::args(match::name("broadcast").bind("x"), match::name("broadcast").bind("y")));
-    }
    void apply(module& m, const match::matcher_result& r) const
    {
-        auto ins   = r.result;
+        auto ins        = r.result; // The matched pointwise instruction.
-        auto x_ins = r.instructions["x"];
+        auto broadcasts = ins->inputs(); // All inputs; the matcher requires each to be a broadcast.
-        auto y_ins = r.instructions["y"];
+        if(broadcasts.empty()) // Guards the front() calls below.
+            return;
-        auto xbroadcast = any_cast<op::broadcast>(x_ins->get_operator());
+        std::vector<instruction_ref> inputs; // Filled with the first input of each broadcast.
-        auto ybroadcast = any_cast<op::broadcast>(y_ins->get_operator());
+        std::transform(broadcasts.begin(),
+                       broadcasts.end(),
-        if(xbroadcast.axis != ybroadcast.axis)
+                       std::back_inserter(inputs),
+                       [](auto i) { return i->inputs().front(); });
+        if(std::any_of(inputs.begin(), inputs.end(), [&](auto i) { // Bail out unless all pre-broadcast inputs share one shape.
+               return i->get_shape() != inputs.front()->get_shape();
+           }))
            return;
-        auto op = m.insert_instruction(
+        auto op = m.insert_instruction(ins, ins->get_operator(), inputs); // Same operator, applied to the pre-broadcast inputs.
-            ins, ins->get_operator(), x_ins->inputs().front(), y_ins->inputs().front());
+        m.replace_instruction(ins, broadcasts.front()->get_operator(), op); // Re-apply the first input's broadcast operator to the result.
-        m.replace_instruction(ins, xbroadcast, op);
    }
 };
@@ -416,8 +450,9 @@ struct find_splits
 {
    auto matcher() const
    {
-        return match::any(match::any_of[match::outputs()](match::name("slice")(
+        return match::any(
-            match::any_of[match::outputs()](match::pointwise(), reduction()))));
+            match::any_of[match::outputs()](match::name("slice")(match::any_of[match::outputs()](
+                match::pointwise(match::any_of(match::nargs(1), match::nargs(2))), reduction()))));
    }
    static bool is_dependent(const module& m, instruction_ref ins1, instruction_ref ins2)
@@ -580,10 +615,9 @@ struct find_splits
                    auto outputs = i->outputs();
                    for(auto output : outputs)
                    {
-                        if(not contains({"reshape", "squeeze", "unsqueeze"}, output->name()))
+                        if(output->name() != "reshape")
                            continue;
-                        auto x =
+                        auto x = m.insert_instruction(output, make_op("contiguous"), i);
-                            m.insert_instruction(output, make_op("contiguous"), output->inputs());
                        m.replace_instruction(output, output->get_operator(), x);
                    }
@@ -753,7 +787,7 @@ MIGRAPHX_PRED_MATCHER(horiz_conv_dot, instruction_ref ins)
    };
    auto dots  = std::count_if(ins->outputs().begin(), ins->outputs().end(), pred("dot"));
    auto convs = std::count_if(ins->outputs().begin(), ins->outputs().end(), pred("convolution"));
-    return !(dots < 2 and convs < 2);
+    return not(dots < 2 and convs < 2);
 }
 struct find_conv_dot_horiz_fusion
@@ -773,7 +807,7 @@ struct find_conv_dot_horiz_fusion
            auto y = j->inputs()[1]->get_shape().lens();
            if(x.size() != y.size())
                return false;
-            // Check that non-axises match
+            // Check that non-axes match
            int axis = 1;
            if(i->name() == "dot")
            {
@@ -809,13 +843,22 @@ struct find_conv_dot_horiz_fusion
            for(auto arg : args)
                m.move_instructions(arg, input);
-            // TODO: Check if axises match
+            // TODO: Check if axes match
            auto concat =
                m.insert_instruction(input, make_op("concat", {{"axis", concat_axis}}), args);
            auto fused     = m.insert_instruction(std::next(input), op, input, concat);
            int64_t offset = 0;
            for(auto arg : range(start, last))
            {
+                auto outputs = arg->outputs();
+                for(auto output : outputs)
+                {
+                    if(output->name() != "reshape")
+                        continue;
+                    auto x = m.insert_instruction(output, make_op("contiguous"), arg);
+                    m.replace_instruction(output, output->get_operator(), x);
+                }
                int64_t len = arg->get_shape().lens()[axis];
                m.replace_instruction(
                    arg,
@@ -981,7 +1024,7 @@ struct find_split_reshape
        // all outputs are reshape and of the same shape
        auto dims = any_cast<op::reshape>(rsp->get_operator()).dims;
-        if(!same_ops(vec_rsp))
+        if(not same_ops(vec_rsp))
        {
            return;
        }
@@ -997,23 +1040,42 @@ struct find_split_reshape
        auto rsp_lens    = rsp->get_shape().lens();
        auto rsp_strides = rsp->get_shape().strides();
        rsp_strides.insert(rsp_strides.begin(), rsp_strides[0] * rsp_lens[0]);
-        auto ait = std::find(rsp_strides.begin(), rsp_strides.end(), slc_dim_size);
+        auto ait     = std::find(rsp_strides.begin(), rsp_strides.end(), slc_dim_size);
+        int rsp_axis = -1;
        if(ait == rsp_strides.end())
        {
            return;
        }
-        int rsp_axis = std::distance(rsp_strides.begin(), ait);
+        else if(ait == rsp_strides.end() - 1)
+        {
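+            // Edge case: when slc_dim_size == 1 the match can be the trailing stride of 1.
+            // rsp_strides has one more entry than rsp_lens (see the insert above), so that
+            // index would be out of bounds for the lens. Step back by one so the lengths are
+            // accumulated along the last dimension instead.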
+            assert(slc_dim_size == 1);
+            rsp_axis = std::distance(rsp_strides.begin(), ait) - 1;
+        }
+        else
+        {
+            rsp_axis = std::distance(rsp_strides.begin(), ait);
+        }
        // calculate reshape output shape
        std::vector<int64_t> vec_dims(vec_rsp.size());
        std::transform(vec_rsp.begin(), vec_rsp.end(), vec_dims.begin(), [&](auto is) {
            return is->get_shape().lens()[rsp_axis];
        });
        std::vector<int64_t> rsp_out_lens(rsp_lens.begin(), rsp_lens.end());
        rsp_out_lens[rsp_axis] = std::accumulate(vec_dims.begin(), vec_dims.end(), std::int64_t{0});
-        // insert the reshape instruction
+        // insert the reshape instruction and add contiguous if needed
+        if(not input->get_shape().standard())
+        {
+            input = m.insert_instruction(std::next(input), make_op("contiguous"), input);
+        }
        auto rsp_ins = m.insert_instruction(
            std::next(input), make_op("reshape", {{"dims", rsp_out_lens}}), input);
@@ -1060,7 +1122,7 @@ struct find_split_transpose
        // all transpose are the same
        auto perm = any_cast<op::transpose>(trans->get_operator()).dims;
-        if(!same_ops(vec_trans))
+        if(not same_ops(vec_trans))
        {
            return;
        }
@@ -1104,6 +1166,7 @@ void simplify_algebra::apply(module& m) const
                            find_mul_conv{},
                            find_mul_slice_conv{},
                            find_mul_add{},
+                            find_dot_add{},
                            find_div_const{},
                            find_sub_const{},
                            find_rsqrt{},

--- a/src/simplify_reshapes.cpp
+++ b/src/simplify_reshapes.cpp
@@ -99,7 +99,7 @@ struct find_reshaper
        std::vector<instruction_ref> reshapes{ins};
        while(is_reshaper(reshapes.back()))
        {
-            assert(!reshapes.back()->inputs().empty());
+            assert(not reshapes.back()->inputs().empty());
            assert(m.has_instruction(reshapes.back()->inputs().front()));
            auto input = reshapes.back()->inputs().front();
            reshapes.push_back(input);
@@ -271,6 +271,44 @@ struct find_nested_slice
    }
 };
+struct find_concat_multibroadcasts // Rewrites concat(multibroadcast(a), multibroadcast(b), ...) as multibroadcast(concat(a, b, ...)).
+{
+    auto matcher() const
+    {
+        return match::name("concat")(match::all_of[match::inputs()](match::name("multibroadcast"))); // Every concat input must be a multibroadcast.
+    }
+    void apply(module& m, const match::matcher_result& mr) const
+    {
+        auto ins        = mr.result; // The matched concat.
+        auto op         = any_cast<op::concat>(ins->get_operator()); // Copy of the concat op; its axis may be adjusted below.
+        auto out_lens   = ins->get_shape().lens(); // Final lens, reused for the new multibroadcast.
+        auto inputs     = ins->inputs(); // The multibroadcast instructions (overwritten further down).
+        auto in_strides = inputs.front()->get_shape().strides(); // Strides of the first multibroadcast's output.
+        // Only apply when concat axis is not a broadcasted dimension
+        if(std::any_of(inputs.begin(), inputs.end(), [&](auto i) {
+               return i->get_shape().strides()[op.axis] == 0; // NOTE(review): op.axis is used as a raw index; assumes it is non-negative - confirm.
+           }))
+        {
+            return;
+        }
+        // Use inputs of multibroadcast ops as inputs to new concat op
+        std::transform(inputs.begin(), inputs.end(), inputs.begin(), [](auto i) {
+            return i->inputs().front();
+        });
+        // Reduce axis by number of leading broadcasted dimensions
+        if(inputs.front()->get_shape().lens().size() < out_lens.size()) // Only when the first input has lower rank than the output.
+            op.axis -= std::count(in_strides.begin(), in_strides.begin() + op.axis, 0); // NOTE(review): counts every zero stride before the axis, not only prepended dims - confirm.
+        auto concat = m.insert_instruction(ins, op, inputs); // Concat on the pre-broadcast inputs.
+        m.replace_instruction(
+            ins, migraphx::make_op("multibroadcast", {{"out_lens", out_lens}}), concat); // Broadcast the result to the original output lens.
+    }
+};
 struct find_concat_transpose
 {
    auto matcher() const
@@ -288,7 +326,7 @@ struct find_concat_transpose
        auto permutation = find_permutation(s);
        // permutation should be the same for all inputs
-        if(!std::all_of(trans_inputs.begin(), trans_inputs.end(), [&](auto in) {
+        if(not std::all_of(trans_inputs.begin(), trans_inputs.end(), [&](auto in) {
               return (find_permutation(in->get_shape()) == permutation);
           }))
        {
@@ -764,6 +802,7 @@ void simplify_reshapes::apply(module& m) const
                            find_reshaper{},
                            find_transpose{},
                            find_concat_transpose{},
+                            find_concat_multibroadcasts{},
                            find_nested_convert{},
                            find_nested_slice{},
                            find_nested_concat{},

--- a/src/targets/cpu/binary.cpp
+++ b/src/targets/cpu/binary.cpp
@@ -49,7 +49,7 @@ struct dnnl_binary : dnnl_op<dnnl_binary, dnnl::binary>
        auto s0 = inputs.at(0);
        auto s1 = inputs.at(1);
        auto r  = s0;
-        if(s0 != s1 or !s0.packed())
+        if(s0 != s1 or not s0.packed())
        {
            r = shape{s0.type(), s0.lens()};
        }

--- a/src/targets/fpga/subgraph.cpp
+++ b/src/targets/fpga/subgraph.cpp
@@ -95,7 +95,7 @@ void subgraph::apply(module_pass_manager& mpm) const
    for(auto it : iterator_for(mod))
    {
        // assuming we want all the params/literals as inputs to the FPGA submodule
-        if(migraphx::starts_with(it->name(), "@param") ||
+        if(migraphx::starts_with(it->name(), "@param") or
           migraphx::starts_with(it->name(), "@literal"))
        {
            literal_inputs.push_back(it);

--- a/src/targets/gpu/code_object_op.cpp
+++ b/src/targets/gpu/code_object_op.cpp
@@ -51,7 +51,8 @@ code_object_op::compute(context& ctx, const shape&, const std::vector<argument>&
    std::vector<void*> kargs(args.size());
    std::transform(
        args.begin(), args.end(), kargs.begin(), [](const argument& a) { return a.data(); });
-    k.launch(ctx.get_stream().get(), global, local, std::move(kargs));
+    auto [start, stop] = ctx.get_perf_events();
+    k.launch(ctx.get_stream().get(), global, local, std::move(kargs), start, stop);
    return args[get_output_arg(args.size())];
 }
 void code_object_op::finalize(context&, const shape&, const std::vector<shape>&)

--- a/src/targets/gpu/compile_gen.cpp
+++ b/src/targets/gpu/compile_gen.cpp
@@ -61,12 +61,19 @@ vectorize vectorize::elements(std::size_t axis, const std::vector<shape>& inputs
                   [&](const auto& input) -> std::size_t {
                       auto stride = input.strides()[axis];
                       auto len    = input.lens()[axis];
-                       if(stride != 0 and stride != 1)
+                       if(not contains({0, 1}, stride))
                           return 1;
                       if(len == 1 and input.elements() > sizes.front())
                           return sizes.front();
-                       auto it = std::find_if(
+                       auto it = std::find_if(sizes.begin(), sizes.end(), [&](auto vsize) {
-                           sizes.begin(), sizes.end(), [&](auto i) { return (len % i) == 0; });
+                           // The len is divisible by the size and all the strides are divisible by
+                           // the size
+                           return (len % vsize) == 0 and
+                                  std::all_of(
+                                      input.strides().begin(), input.strides().end(), [&](auto i) {
+                                          return contains({0, 1}, i) or i % vsize == 0;
+                                      });
+                       });
                       if(it != sizes.end())
                           return *it;
                       return 1;

--- a/src/targets/gpu/device/include/migraphx/gpu/device/array.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/array.hpp
@@ -131,7 +131,7 @@ struct hip_array
    friend MIGRAPHX_DEVICE_CONSTEXPR bool operator!=(const hip_array& x, const hip_array& y)
    {
-        return !(x == y);
+        return not(x == y); // Inequality is the negation of hip_array's operator==.
    }
    // This uses the product order rather than lexical order
    friend MIGRAPHX_DEVICE_CONSTEXPR bool operator<(const hip_array& x, const hip_array& y)