gaoqiong / MIGraphX · Commits

Commit faefeef9 (unverified)
Authored May 25, 2022 by Charlie Lin; committed by GitHub on May 25, 2022

    Merge branch 'develop' into dyn_shape_update

Parents: 97a40ac3, bf0a4713
Changes: 94. Showing 20 changed files with 685 additions and 583 deletions (+685, -583).
src/reduce_dims.cpp  +18 -4
src/rewrite_batchnorm.cpp  +9 -9
src/rewrite_pooling.cpp  +6 -7
src/rewrite_rnn.cpp  +303 -310
src/schedule.cpp  +37 -37
src/simplify_algebra.cpp  +78 -78
src/simplify_qdq.cpp  +1 -1
src/simplify_reshapes.cpp  +37 -37
src/targets/cpu/lowering.cpp  +1 -1
src/targets/gpu/CMakeLists.txt  +1 -0
src/targets/gpu/analyze_streams.cpp  +10 -10
src/targets/gpu/compile_hip.cpp  +11 -0
src/targets/gpu/eliminate_workspace.cpp  +5 -5
src/targets/gpu/fuse_ops.cpp  +96 -50
src/targets/gpu/gemm_impl.cpp  +38 -6
src/targets/gpu/include/migraphx/gpu/analyze_streams.hpp  +1 -1
src/targets/gpu/include/migraphx/gpu/eliminate_workspace.hpp  +1 -1
src/targets/gpu/include/migraphx/gpu/fuse_ops.hpp  +1 -1
src/targets/gpu/include/migraphx/gpu/gemm.hpp  +7 -25
src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp  +24 -0
src/reduce_dims.cpp

@@ -16,10 +16,8 @@ bool reduce_dim(std::vector<shape>& shapes, std::size_t n)
         auto bstride = s.strides()[n + 1];
         auto blen    = s.lens()[n + 1];
-        if(astride == bstride * blen)
+        if(astride == bstride * blen or alen == 1)
         {
             new_lens.push_back(alen * blen);
         }
     }
     if(new_lens.size() != shapes.size())
         return false;
@@ -37,10 +35,25 @@ bool reduce_dim(std::vector<shape>& shapes, std::size_t n)
     return true;
 }

+void reduce_dim1(std::vector<shape>& shapes)
+{
+    if(std::any_of(shapes.begin(), shapes.end(), [&](const auto& s) {
+           return s.lens().size() < 2 or s.lens().back() != 1;
+       }))
+        return;
+    for(auto& s : shapes)
+    {
+        auto lens    = s.lens();
+        auto strides = s.strides();
+        lens.pop_back();
+        strides.pop_back();
+        s = shape{s.type(), lens, strides};
+    }
+}
+
 std::size_t reduce_dim_all(std::vector<shape>& shapes, std::size_t n)
 {
     while(reduce_dim(shapes, n) and n < shapes.size())
     {}
     return n + 1;
 }

 void reduce_dim_all(std::vector<shape>& shapes)
@@ -48,6 +61,7 @@ void reduce_dim_all(std::vector<shape>& shapes)
     std::size_t n = 0;
     while(n < shapes.front().lens().size() - 1)
         n = reduce_dim_all(shapes, n);
+    reduce_dim1(shapes);
 }

 std::vector<std::size_t> base_lens(const std::vector<shape>& shapes)
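The new reduce_dim1 step drops a trailing axis of length 1 from every shape once the other dimensions have been merged. A minimal standalone sketch of the effect on a lens/strides pair (plain vectors here, not migraphx::shape):

#include <cstddef>
#include <vector>

int main()
{
    std::vector<std::size_t> lens    = {4, 8, 1};
    std::vector<std::size_t> strides = {8, 1, 1};
    // Strip the trailing length-1 axis, as reduce_dim1 does per shape.
    if(lens.size() >= 2 and lens.back() == 1)
    {
        lens.pop_back();
        strides.pop_back();
    }
    // lens == {4, 8}, strides == {8, 1}: same elements, one fewer axis,
    // which lets later passes treat the buffer as a plain 2-D tensor.
}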
src/rewrite_batchnorm.cpp

@@ -14,9 +14,9 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

-void rewrite_batchnorm::apply(module& p) const
+void rewrite_batchnorm::apply(module& m) const
 {
-    for(auto ins : iterator_for(p))
+    for(auto ins : iterator_for(m))
     {
         if(ins->name() != "batch_norm_inference")
             continue;
@@ -46,13 +46,13 @@ void rewrite_batchnorm::apply(module& p) const
         });

         auto broadcast   = op::broadcast{1, ins->get_shape().lens()};
-        auto a_ins       = p.add_literal({a.get_shape(), a.data()});
-        auto a_broadcast = p.insert_instruction(ins, broadcast, a_ins);
-        auto mul = p.insert_instruction(ins, make_op("mul"), ins->inputs().front(), a_broadcast);
-        auto b_ins       = p.add_literal({b.get_shape(), b.data()});
-        auto b_broadcast = p.insert_instruction(ins, broadcast, b_ins);
-        auto add         = p.insert_instruction(ins, make_op("add"), mul, b_broadcast);
-        p.replace_instruction(ins, add);
+        auto a_ins       = m.add_literal({a.get_shape(), a.data()});
+        auto a_broadcast = m.insert_instruction(ins, broadcast, a_ins);
+        auto mul = m.insert_instruction(ins, make_op("mul"), ins->inputs().front(), a_broadcast);
+        auto b_ins       = m.add_literal({b.get_shape(), b.data()});
+        auto b_broadcast = m.insert_instruction(ins, broadcast, b_ins);
+        auto add         = m.insert_instruction(ins, make_op("add"), mul, b_broadcast);
+        m.replace_instruction(ins, add);
     }
 }
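For context on the pass itself (this hunk only renames the module parameter from p to m): at inference time the batch-norm statistics are constants, so batch_norm_inference folds into one multiply and one add. A sketch of the per-channel coefficients under the usual definition y = gamma * (x - mean) / sqrt(var + eps) + beta; in the pass above, the literals a and b play this role:

#include <cmath>

struct folded_bn
{
    float a; // multiplied into x
    float b; // added afterwards
};

// Fold to y = a * x + b (a sketch of the algebra, not MIGraphX code).
folded_bn fold(float gamma, float beta, float mean, float var, float eps)
{
    float a = gamma / std::sqrt(var + eps);
    return {a, beta - a * mean};
}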
src/rewrite_pooling.cpp

@@ -12,9 +12,9 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

-void rewrite_pooling::apply(module& prog) const
+void rewrite_pooling::apply(module& m) const
 {
-    for(auto ins : iterator_for(prog))
+    for(auto ins : iterator_for(m))
     {
         if(ins->name() != "pooling")
             continue;
@@ -33,26 +33,25 @@ void rewrite_pooling::apply(module& prog) const
             continue;
         std::int64_t n = s.lens()[0];
         std::int64_t c = s.lens()[1];
-        auto reshape   = prog.insert_instruction(
+        auto reshape   = m.insert_instruction(
             ins, make_op("reshape", {{"dims", {n * c, -1}}}), ins->inputs().front());
         instruction_ref pooling{};

         // average pooling
         if(op.mode == op::pooling_mode::average)
         {
-            pooling =
-                prog.insert_instruction(ins, make_op("reduce_mean", {{"axes", {1}}}), reshape);
+            pooling = m.insert_instruction(ins, make_op("reduce_mean", {{"axes", {1}}}), reshape);
         }
         // max pooling
         else
         {
-            pooling = prog.insert_instruction(ins, make_op("reduce_max", {{"axes", {1}}}), reshape);
+            pooling = m.insert_instruction(ins, make_op("reduce_max", {{"axes", {1}}}), reshape);
         }

         std::vector<int64_t> rsp_lens(lens.size(), 1);
         rsp_lens[0] = n;
         rsp_lens[1] = c;
-        prog.replace_instruction(ins, make_op("reshape", {{"dims", rsp_lens}}), pooling);
+        m.replace_instruction(ins, make_op("reshape", {{"dims", rsp_lens}}), pooling);
     }
 }
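The rewrite this file implements turns a pooling that spans the whole spatial extent of an NCHW tensor into reshape(N*C, -1), a reduction over axis 1, and a reshape back, as the hunk above shows. A flat-buffer sketch of the average case:

#include <vector>

// Global average pooling as a row-wise mean over the flattened spatial
// dims (a sketch of the rewrite's semantics, not MIGraphX instructions).
std::vector<float> global_avg_pool(const std::vector<float>& x, int n, int c, int hw)
{
    std::vector<float> out(n * c, 0.0f); // logical shape (N, C, 1, 1, ...)
    for(int row = 0; row < n * c; row++)
    {
        for(int k = 0; k < hw; k++)
            out[row] += x[row * hw + k];
        out[row] /= hw;
    }
    return out;
}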
src/rewrite_rnn.cpp

This diff is collapsed.
src/schedule.cpp

@@ -42,7 +42,7 @@ struct stream_info
     std::unordered_map<instruction_ref, std::size_t> iweights;
     ins_dep_map mod_implicit_deps;

-    void calc_implicit_deps(const module& p) { mod_implicit_deps = p.calc_implicit_deps(); }
+    void calc_implicit_deps(const module& m) { mod_implicit_deps = m.calc_implicit_deps(); }

     void accumulate_weights(instruction_ref last, const schedule_model& model)
     {
@@ -116,15 +116,15 @@ struct stream_info
         }
     };

-    std::size_t assign_streams(module& p, std::size_t n)
+    std::size_t assign_streams(module& m, std::size_t n)
     {
         assert(n > 0);
         partition critical;
         std::unordered_map<instruction_ref, std::deque<partition>> partitions;
         partitions.reserve(weights.size());
         fix([&](auto self, auto ins, auto& part) {
-            assert(not is_end(ins, p.end()));
-            if(not p.has_instruction(ins))
+            assert(not is_end(ins, m.end()));
+            if(not m.has_instruction(ins))
                 return;
             if(contains(partitions, ins))
                 return;
@@ -151,8 +151,8 @@ struct stream_info
             }
         }
             // Sort instructions
-            p.move_instruction(ins, p.end());
-        })(std::prev(p.end()), critical);
+            m.move_instruction(ins, m.end());
+        })(std::prev(m.end()), critical);

         // Set the critical partition to stream 0
         set_stream(critical, 0);
@@ -197,13 +197,13 @@ struct stream_info
         }
     };

-    void sort(module& p, std::size_t)
+    void sort(module& m, std::size_t)
     {
         std::set<weight_ins, compare_weight_ins> children;
         std::unordered_map<instruction_ref, std::size_t> visited;
-        auto last = std::prev(p.end());
+        auto last = std::prev(m.end());
         auto mw   = this->weights.at(last);
-        auto nw   = mw / (p.size() + 1);
+        auto nw   = mw / (m.size() + 1);
         auto add_child = [&](auto ins) {
             auto x = 1 + (mw - this->weights.at(ins)) / (nw + 1);
             auto w = x * this->iweights.at(ins);
@@ -222,10 +222,10 @@ struct stream_info
             // Pop the first element
             auto top = children.begin()->second;
             children.erase(children.begin());
-            p.move_instruction(top, p.begin());
+            m.move_instruction(top, m.begin());
             for(auto ins : top->inputs())
             {
-                if(not p.has_instruction(ins))
+                if(not m.has_instruction(ins))
                     continue;
                 add_child(ins);
             }
@@ -234,7 +234,7 @@ struct stream_info
             {
                 for(auto ins : mod_implicit_deps.at(top))
                 {
-                    assert(p.has_instruction(ins));
+                    assert(m.has_instruction(ins));
                     add_child(ins);
                 }
             }
@@ -242,12 +242,12 @@ struct stream_info
         // move dangling parameter to the front so as not be removed
         auto ins = std::next(last);
-        while(ins != p.end())
+        while(ins != m.end())
         {
             auto next = std::next(ins);
             if(ins->name() == "@param")
             {
-                p.move_instruction(ins, p.begin());
+                m.move_instruction(ins, m.begin());
             }
             ins = next;
         }
@@ -364,18 +364,18 @@ struct stream_info
     }

     std::unordered_map<instruction_ref, std::vector<std::vector<instruction_ref>>>
-    find_concurrent_instructions(module& p) const
+    find_concurrent_instructions(module& m) const
     {
         std::unordered_map<instruction_ref, std::vector<std::vector<instruction_ref>>> result;
         std::unordered_map<instruction_ref, std::unordered_set<instruction_ref>> merge_from;
-        dominator_info di = compute_dominator(p);
-        result.reserve(p.size());
-        merge_from.reserve(p.size());
-        for(auto ins : reverse_iterator_for(p))
+        dominator_info di = compute_dominator(m);
+        result.reserve(m.size());
+        merge_from.reserve(m.size());
+        for(auto ins : reverse_iterator_for(m))
         {
             for(auto&& arg : ins->outputs())
             {
-                if(not p.has_instruction(arg))
+                if(not m.has_instruction(arg))
                     continue;
                 if(is_merge_point(arg))
                     merge_from[ins].insert(arg);
@@ -415,18 +415,18 @@ struct stream_info
     }

     std::unordered_map<instruction_ref, std::unordered_set<instruction_ref>>
-    get_conflicts(module& p)
+    get_conflicts(module& m)
     {
         using conflict_table_type =
             std::unordered_map<instruction_ref, std::unordered_set<instruction_ref>>;
         conflict_table_type conflict_table;
-        auto concur_ins = this->find_concurrent_instructions(p);
+        auto concur_ins = this->find_concurrent_instructions(m);

         // Compute an index for each instruction
         std::unordered_map<instruction_ref, std::size_t> ins2index;
         std::size_t index_total = 0;
-        for(auto ins : iterator_for(p))
+        for(auto ins : iterator_for(m))
             ins2index[ins] = index_total++;

         std::vector<conflict_table_type> thread_conflict_tables(
@@ -507,21 +507,21 @@ struct stream_info
     }
 };

-void schedule::apply(module& p) const
+void schedule::apply(module& m) const
 {
     if(not enable)
         return;

     stream_info si;
-    si.calc_implicit_deps(p);
-    auto last = std::prev(p.end());
+    si.calc_implicit_deps(m);
+    auto last = std::prev(m.end());
     si.accumulate_weights(last, model);
-    auto nstreams = si.assign_streams(p, model.concurrency());
-    si.sort(p, model.concurrency());
+    auto nstreams = si.assign_streams(m, model.concurrency());
+    si.sort(m, model.concurrency());

     if(enabled(MIGRAPHX_TRACE_COMPILE{}) or enabled(MIGRAPHX_TRACE_SCHEDULE{}))
     {
-        p.annotate(std::cout, [&](auto ins) {
+        m.annotate(std::cout, [&](auto ins) {
             if(ins->name() == "@param" and not contains(si.weights, ins))
                 return;
@@ -548,9 +548,9 @@ void schedule::apply(module& p) const
     std::unordered_map<instruction_ref, std::size_t> ins2wait;
     std::unordered_map<std::size_t, std::unordered_set<std::size_t>> waited_for;
     std::unordered_map<instruction_ref, std::unordered_set<std::size_t>> ins2waited;
-    ins2wait.reserve(p.size());
-    ins2waited.reserve(p.size());
-    for(auto ins : iterator_for(p))
+    ins2wait.reserve(m.size());
+    ins2waited.reserve(m.size());
+    for(auto ins : iterator_for(m))
     {
         // Only schedule instructions that have a stream
         if(not si.has_stream(ins))
@@ -559,7 +559,7 @@ void schedule::apply(module& p) const
         // Schedule instruction on the stream
         auto stream = si.get_stream(ins);
         assert(stream < model.concurrency());
-        model.sched(p, ins, stream);
+        model.sched(m, ins, stream);
         // Insert wait instructions
         if(si.is_merge_point(ins, stream))
         {
@@ -572,14 +572,14 @@ void schedule::apply(module& p) const
                 if(not contains(ins2wait, i))
                 {
                     ins2wait[i] = wait_id;
-                    model.record(p, i, wait_id);
+                    model.record(m, i, wait_id);
                     wait_id++;
                 }
                 auto w = ins2wait.at(i);
                 // If we already waited for the event on this stream then dont
                 // insert another wait event
                 if(not contains(waited_for[stream], w))
-                    model.wait(p, ins, w);
+                    model.wait(m, ins, w);
                 // Store the event as waited
                 waited_for[stream].insert(w);
                 // Store all wait events that have been waited on prior to the recorded instruction
@@ -594,7 +594,7 @@ void schedule::apply(module& p) const
     }

     // Add memory conflicts
-    auto conflict_table = si.get_conflicts(p);
+    auto conflict_table = si.get_conflicts(m);
     for(auto&& ip : conflict_table)
     {
         if(ip.second.empty())
@@ -602,7 +602,7 @@ void schedule::apply(module& p) const
         std::vector<instruction_ref> args;
         args.push_back(ip.first);
         args.insert(args.end(), ip.second.begin(), ip.second.end());
-        p.insert_instruction(std::next(ip.first), make_op("identity"), args);
+        m.insert_instruction(std::next(ip.first), make_op("identity"), args);
     }
 }
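assign_streams walks the instruction graph with the fix(...) helper seen above, which lets a lambda recurse by receiving itself as its first argument. A sketch of such a fixpoint combinator (the shape is assumed from the call sites here, not taken from the MIGraphX source):

#include <iostream>
#include <utility>

template <class F>
struct fix_t
{
    F f;
    template <class... Ts>
    auto operator()(Ts&&... xs) const
    {
        // Hand the callable itself back in as `self` so the body can recurse.
        return f(*this, std::forward<Ts>(xs)...);
    }
};

template <class F>
fix_t<F> fix(F f)
{
    return {std::move(f)};
}

int main()
{
    // self is callable directly, mirroring fix([&](auto self, auto ins, auto& part) {...}).
    auto factorial = fix([](auto self, int n) -> int { return n <= 1 ? 1 : n * self(n - 1); });
    std::cout << factorial(5) << "\n"; // prints 120
}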
src/simplify_algebra.cpp

This diff is collapsed.
src/simplify_qdq.cpp

@@ -53,7 +53,7 @@ struct match_find_quantizable_ops
                          match::arg(1)(dequantizelinear_op("x2", "scale2")));
     }

-    void apply(module& m, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto qop = r.result;
         auto q1  = r.instructions["x1"];
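This one-line change, repeated across most passes in this commit, switches matcher results from pass-by-value to pass-by-const-reference. Plausibly the motivation is that the result owns a map of bound instructions, so by-value copies that map on every successful match; a stand-in illustration (simplified type, not the real match::matcher_result):

#include <cstddef>
#include <string>
#include <unordered_map>

struct result_like
{
    std::unordered_map<std::string, int> instructions; // instruction_ref in MIGraphX
};

std::size_t by_value(result_like r) { return r.instructions.size(); }       // copies the map
std::size_t by_cref(const result_like& r) { return r.instructions.size(); } // no copy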
src/simplify_reshapes.cpp

@@ -70,19 +70,19 @@ struct find_reshaper
                    match::any_of[match::outputs()](match::name(reshaper_names())));
     }

-    void apply(module& p, const match::matcher_result& mr) const
+    void apply(module& m, const match::matcher_result& mr) const
     {
         auto ins = mr.result;
         std::vector<instruction_ref> reshapes{ins};
         while(is_reshaper(reshapes.back()))
         {
             assert(!reshapes.back()->inputs().empty());
-            assert(p.has_instruction(reshapes.back()->inputs().front()));
+            assert(m.has_instruction(reshapes.back()->inputs().front()));
             auto input = reshapes.back()->inputs().front();
             reshapes.push_back(input);
         }

-        std::pair<instruction_ref, instruction_ref> r{p.end(), p.end()};
+        std::pair<instruction_ref, instruction_ref> r{m.end(), m.end()};
         for(auto start : iterator_for(reshapes))
         {
             auto last = std::find_if(reshapes.rbegin(), reshapes.rend(), [&](auto&& i) {
@@ -96,7 +96,7 @@ struct find_reshaper
         }
         if(r.first != r.second)
         {
-            p.replace_instruction(r.first, r.second);
+            m.replace_instruction(r.first, r.second);
         }
     }
 };
@@ -117,10 +117,10 @@ struct find_nop_reshapes
         return match::name(reshapes)(match::same_shape(match::arg(0)));
     }

-    void apply(module& p, const match::matcher_result& mr) const
+    void apply(module& m, const match::matcher_result& mr) const
     {
         auto ins = mr.result;
-        p.replace_instruction(ins, ins->inputs().front());
+        m.replace_instruction(ins, ins->inputs().front());
     }
 };
@@ -132,7 +132,7 @@ struct find_transpose
             match::skip_output(match::name("contiguous"))(match::name("transpose"))));
     }

-    void apply(module& p, const match::matcher_result& mr) const
+    void apply(module& m, const match::matcher_result& mr) const
     {
         auto ins = mr.result;
         auto x   = ins;
@@ -149,11 +149,11 @@ struct find_transpose
             return;
         if(is_no_transpose(dims))
         {
-            p.replace_instruction(ins, t->inputs().front());
+            m.replace_instruction(ins, t->inputs().front());
         }
         else
         {
-            p.replace_instruction(
+            m.replace_instruction(
                 ins, make_op("transpose", {{"permutation", dims}}), t->inputs().front());
         }
     }
@@ -223,7 +223,7 @@ struct find_nested_slice
         return result;
     }

-    void apply(module& p, const match::matcher_result& mr) const
+    void apply(module& m, const match::matcher_result& mr) const
     {
         auto ins   = mr.result;
         auto slice = ins->inputs().front();
@@ -241,7 +241,7 @@ struct find_nested_slice
             op.starts.push_back(pp.second.first);
             op.ends.push_back(pp.second.second);
         }
-        p.replace_instruction(ins, op, input);
+        m.replace_instruction(ins, op, input);
     }
 };
@@ -252,7 +252,7 @@ struct find_concat_transpose
         return match::name("concat")(match::all_of[match::inputs()](match::transpose_shape()));
     }

-    void apply(module& p, const match::matcher_result& mr) const
+    void apply(module& m, const match::matcher_result& mr) const
     {
         auto ins          = mr.result;
         auto trans_inputs = ins->inputs();
@@ -279,14 +279,14 @@ struct find_concat_transpose
         std::vector<instruction_ref> inputs;
         std::transform(
             ins->inputs().begin(), ins->inputs().end(), std::back_inserter(inputs), [&](auto i) {
-                return p.insert_instruction(
+                return m.insert_instruction(
                     ins, make_op("transpose", {{"permutation", permutation}}), i);
             });
-        auto concat = p.insert_instruction(ins, op, inputs);
-        auto t      = p.insert_instruction(
+        auto concat = m.insert_instruction(ins, op, inputs);
+        auto t      = m.insert_instruction(
             ins, make_op("transpose", {{"permutation", ipermutation}}), concat);
         assert(ins->get_shape().lens() == t->get_shape().lens());
-        p.replace_instruction(ins, t);
+        m.replace_instruction(ins, t);
     }
 };
@@ -303,7 +303,7 @@ struct find_nested_concat
         return op.axis;
     }

-    void apply(module& p, const match::matcher_result& mr) const
+    void apply(module& m, const match::matcher_result& mr) const
     {
         auto ins  = mr.result;
         auto axis = get_axis(ins);
@@ -317,7 +317,7 @@ struct find_nested_concat
                 args.push_back(i);
             }
         })(ins->inputs());
-        p.replace_instruction(ins, ins->get_operator(), args);
+        m.replace_instruction(ins, ins->get_operator(), args);
     }
 };
@@ -329,7 +329,7 @@ struct find_resize
             match::args(match::name("reshape").bind("data"), match::is_constant().bind("ind")));
     }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto ins     = r.result;
         auto ins_rsp = r.instructions["data"];
@@ -417,13 +417,13 @@ struct find_resize
         }
         auto in_rsp   = ins_rsp->inputs().front();
-        auto rsp_data = p.insert_instruction(
+        auto rsp_data = m.insert_instruction(
             ins_rsp, migraphx::make_op("reshape", {{"dims", in_dims}}), in_rsp);
-        auto mb_rsp = p.insert_instruction(
+        auto mb_rsp = m.insert_instruction(
             ins_rsp, migraphx::make_op("multibroadcast", {{"out_lens", out_dims}}), rsp_data);
-        auto std_mb = p.insert_instruction(ins, migraphx::make_op("contiguous"), mb_rsp);
+        auto std_mb = m.insert_instruction(ins, migraphx::make_op("contiguous"), mb_rsp);
         std::vector<int64_t> rsp_dims(out_lens.begin(), out_lens.end());
-        p.replace_instruction(ins, migraphx::make_op("reshape", {{"dims", rsp_dims}}), std_mb);
+        m.replace_instruction(ins, migraphx::make_op("reshape", {{"dims", rsp_dims}}), std_mb);
     }
 };
@@ -436,7 +436,7 @@ struct find_where_op
                         match::is_constant().bind("ind")));
     }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto ins    = r.result;
         auto concat = r.instructions["data"];
@@ -475,11 +475,11 @@ struct find_where_op
         if(val)
         {
-            p.replace_instruction(ins, inputs.at(0));
+            m.replace_instruction(ins, inputs.at(0));
         }
         else
         {
-            p.replace_instruction(ins, inputs.at(1));
+            m.replace_instruction(ins, inputs.at(1));
         }
     }
 };
@@ -496,7 +496,7 @@ struct find_reshape_cont
             match::any()));
     }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto ins      = r.result;
         auto ins_cont = r.instructions["cont"];
@@ -530,11 +530,11 @@ struct find_reshape_cont
             else
             {
                 inputs.push_back(
-                    p.insert_instruction(ins, make_op("reshape", {{"dims", dims}}), in));
+                    m.insert_instruction(ins, make_op("reshape", {{"dims", dims}}), in));
             }
         }
-        auto out = p.insert_instruction(ins, ins->get_operator(), inputs);
-        p.replace_instruction(ins, make_op("reshape", {{"dims", out_dims}}), out);
+        auto out = m.insert_instruction(ins, ins->get_operator(), inputs);
+        m.replace_instruction(ins, make_op("reshape", {{"dims", out_dims}}), out);
     }
 };
@@ -564,25 +564,25 @@ struct find_transpose_contiguous_reshaper_unary
                                match::args(match_transpose_contiguous_reshaper()));
     }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto ins          = r.result;
         auto reshaper_ins = r.instructions["reshaper_ins"];
         auto trans_ins    = r.instructions["trans_ins"];
         auto cont_ins     = r.instructions["cont_ins"];
         auto unary_op_name = ins->get_operator().name();
-        auto unary_ins     = p.insert_instruction(cont_ins, make_op(unary_op_name), trans_ins);
-        auto new_cont_ins  = p.insert_instruction(cont_ins, make_op("contiguous"), unary_ins);
+        auto unary_ins     = m.insert_instruction(cont_ins, make_op(unary_op_name), trans_ins);
+        auto new_cont_ins  = m.insert_instruction(cont_ins, make_op("contiguous"), unary_ins);
         // older cont and reshape are removed by deadcode elimination
-        p.replace_instruction(ins, reshaper_ins->get_operator(), new_cont_ins);
+        m.replace_instruction(ins, reshaper_ins->get_operator(), new_cont_ins);
     }
 };

-void simplify_reshapes::apply(module& p) const
+void simplify_reshapes::apply(module& m) const
 {
     for(int i = 0; i < 2; i++)
     {
-        match::find_matches(p,
+        match::find_matches(m,
                             find_where_op{},
                             find_resize{},
                             find_reshape_cont{},
@@ -594,7 +594,7 @@ void simplify_reshapes::apply(module& p) const
                             find_nested_slice{},
                             find_nested_concat{},
                             find_transpose_contiguous_reshaper_unary{});
-        dead_code_elimination{}.apply(p);
+        dead_code_elimination{}.apply(m);
     }
 }
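Among the matchers renamed here, find_transpose folds a chain of transposes into a single one by composing their permutations. The index algebra, as a standalone sketch (not the pass's code): applying permutation p1 and then p2 is the single permutation whose axis i reads input axis p1[p2[i]].

#include <cstddef>
#include <vector>

std::vector<int> compose(const std::vector<int>& p1, const std::vector<int>& p2)
{
    std::vector<int> out(p2.size());
    for(std::size_t i = 0; i < p2.size(); i++)
        out[i] = p1[p2[i]]; // axis i of the final output reads this input axis
    return out;
}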
src/targets/cpu/lowering.cpp

@@ -352,7 +352,7 @@ struct cpu_apply
             std::transform(bind_inputs.begin(),
                            bind_inputs.end(),
                            std::back_inserter(inputs),
-                           [&](const auto& s) { return r.instructions.at(s); });
+                           [&](const auto& s) { return r.instructions[s]; });
             inputs.push_back(this->insert_allocation(ins, ins->get_shape()));
             modl->replace_instruction(ins, op, inputs);
         });
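The behavior difference behind the .at(s) -> [s] change is ordinary map semantics: at() throws std::out_of_range on a missing key, while operator[] default-constructs and inserts the value, and requires a non-const map. The change therefore assumes every bound name is present in the match result.

#include <cassert>
#include <string>
#include <unordered_map>

int main()
{
    std::unordered_map<std::string, int> insmap{{"x", 1}};
    assert(insmap.at("x") == 1); // would throw if "x" were absent
    int v = insmap["y"];         // inserts {"y", 0} and returns it
    assert(v == 0 and insmap.size() == 2);
}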
src/targets/gpu/CMakeLists.txt

@@ -158,6 +158,7 @@ add_library(migraphx_gpu
     nonzero.cpp
     pack_args.cpp
     pack_int8_args.cpp
+    prefuse_ops.cpp
     pad.cpp
     pooling.cpp
     quant_convolution.cpp
src/targets/gpu/analyze_streams.cpp

@@ -28,30 +28,30 @@ struct hip_stream_model
     bool is_wait(migraphx::instruction_ref ins) const { return ins->name() == "gpu::wait_event"; }
 };

-stream_model make_stream_model(const module& p)
+stream_model make_stream_model(const module& m)
 {
-    hip_stream_model m;
+    hip_stream_model hsm;
     std::size_t stream = 0;
-    for(auto ins : iterator_for(p))
+    for(auto ins : iterator_for(m))
     {
         if(ins->name() == "gpu::set_stream")
         {
             auto v = ins->get_operator().to_value();
             stream = v["stream"].to<std::size_t>();
-            m.max_stream = std::max(stream, m.max_stream);
+            hsm.max_stream = std::max(stream, hsm.max_stream);
         }
         if(ins->get_operator().is_context_free())
             continue;
         if(contains({"hip::hip_allocate_memory", "hip::hip_copy_literal", "@param"}, ins->name()))
             continue;
-        m.ins2stream[ins] = stream;
+        hsm.ins2stream[ins] = stream;
     }
-    return m;
+    return hsm;
 }

-std::vector<stream_race> analyze_streams(const module& p)
+std::vector<stream_race> analyze_streams(const module& m)
 {
-    return migraphx::analyze_streams(p, make_stream_model(p));
+    return migraphx::analyze_streams(m, make_stream_model(m));
 }

 } // namespace gpu
src/targets/gpu/compile_hip.cpp

@@ -22,6 +22,7 @@ namespace gpu {
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DEBUG);
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_OPTIMIZE);
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_ASM);
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_SRC);

 #if MIGRAPHX_USE_HIPRTC
@@ -247,6 +248,16 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
         MIGRAPHX_THROW("Missing hsaco");
     };

+    if(enabled(MIGRAPHX_GPU_DUMP_SRC{}))
+    {
+        for(const auto& src : srcs)
+        {
+            if(src.path.extension() != ".cpp")
+                continue;
+            std::cout << std::string(src.content.first, src.len()) << std::endl;
+        }
+    }
+
     if(enabled(MIGRAPHX_GPU_DUMP_ASM{}))
     {
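The new MIGRAPHX_GPU_DUMP_SRC variable gates printing of each .cpp source handed to the HIP compiler. A sketch of the kind of check enabled(MIGRAPHX_GPU_DUMP_SRC{}) performs, assuming the common convention that a set, non-"0" value means on (the macro-generated helper in MIGraphX may differ):

#include <cstdlib>
#include <string>

bool env_flag_enabled(const char* name)
{
    const char* v = std::getenv(name);
    return v != nullptr and *v != '\0' and std::string{v} != "0";
}

// Usage sketch: set MIGRAPHX_GPU_DUMP_SRC=1 in the environment before
// compiling to print the generated kernel sources.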
src/targets/gpu/eliminate_workspace.cpp

@@ -11,11 +11,11 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

-void eliminate_workspace::apply(module& p) const
+void eliminate_workspace::apply(module& m) const
 {
     std::size_t n = 0;
     std::vector<instruction_ref> allocs;
-    for(auto ins : iterator_for(p))
+    for(auto ins : iterator_for(m))
     {
         if(ins->outputs().size() != 1)
             continue;
@@ -30,11 +30,11 @@ void eliminate_workspace::apply(module& p) const
     }
     if(n > 0)
     {
-        auto ws = p.add_parameter("workspace", shape{shape::int8_type, {n}});
+        auto ws = m.add_parameter("workspace", shape{shape::int8_type, {n}});
         for(auto&& a : allocs)
         {
-            p.replace_instruction(a, ws);
-            p.remove_instruction(a);
+            m.replace_instruction(a, ws);
+            m.remove_instruction(a);
         }
     }
 }
src/targets/gpu/fuse_ops.cpp

@@ -316,7 +316,7 @@ struct find_layernorm
 {
     auto matcher() const { return match::layernorm(&gpu_name); }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto ins   = r.result;
         auto x_ins = r.instructions["x"];
@@ -331,7 +331,7 @@ struct find_layernorm
         if(relements > 1024 or (relements % 4 != 0 and relements > 256))
             return;

-        p.replace_instruction(ins, hip_layernorm{}, x_ins, args.back());
+        m.replace_instruction(ins, hip_layernorm{}, x_ins, args.back());
     }
 };
@@ -343,11 +343,11 @@ struct find_triadd_layernorm
             match::used_once(), match::all_of[match::inputs()](match::standard_shape()))));
     }

-    void apply(module& p, const match::matcher_result& r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto ins    = r.result;
         auto triadd = ins->inputs().front();
-        p.replace_instruction(ins, hip_triadd_layernorm{}, triadd->inputs());
+        m.replace_instruction(ins, hip_triadd_layernorm{}, triadd->inputs());
     }
 };
@@ -355,13 +355,13 @@ struct find_gelu
 {
     auto matcher() const { return match::gelu_erf(&gpu_name); }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto ins   = r.result;
         auto x_ins = r.instructions["x"];
         auto args  = ins->inputs();

-        p.replace_instruction(ins, hip_gelu{}, x_ins, args.back());
+        m.replace_instruction(ins, hip_gelu{}, x_ins, args.back());
     }
 };
@@ -372,7 +372,7 @@ struct find_add_gelu
         return match::name("gpu::gelu")(match::arg(0)(match::name("gpu::add").bind("add")));
     }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto add_ins = r.instructions["add"];
         auto ins     = r.result;
@@ -381,7 +381,7 @@ struct find_add_gelu
         move_broadcasted_back(args);
         args.back() = ins->inputs().back();

-        p.replace_instruction(ins, hip_add_gelu{}, args);
+        m.replace_instruction(ins, hip_add_gelu{}, args);
     }
 };
@@ -391,16 +391,16 @@ struct find_gelu_new

     auto matcher() const { return match::gelu_tanh(&gpu_name); }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto ins   = r.result;
         auto x_ins = r.instructions["x"];
         auto args  = ins->inputs();

         if(fast_math)
-            p.replace_instruction(ins, hip_gelu{}, x_ins, args.back());
+            m.replace_instruction(ins, hip_gelu{}, x_ins, args.back());
         else
-            p.replace_instruction(ins, hip_gelu_new{}, x_ins, args.back());
+            m.replace_instruction(ins, hip_gelu_new{}, x_ins, args.back());
     }
 };
@@ -411,7 +411,7 @@ struct find_add_gelu_new
         return match::name("gpu::gelu_new")(match::arg(0)(match::name("gpu::add").bind("add")));
     }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto add_ins = r.instructions["add"];
         auto ins     = r.result;
@@ -420,7 +420,7 @@ struct find_add_gelu_new
         move_broadcasted_back(args);
         args.back() = ins->inputs().back();

-        p.replace_instruction(ins, hip_add_gelu_new{}, args);
+        m.replace_instruction(ins, hip_add_gelu_new{}, args);
     }
 };
@@ -435,7 +435,7 @@ struct find_add_clip
                 .bind("add")));
     }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto add_ins = r.instructions["add"];
         auto ins     = r.result;
@@ -448,9 +448,9 @@ struct find_add_clip
         add_args.pop_back();
         add_args.insert(add_args.end(), std::next(ins_args.begin()), ins_args.end());
         if(add_ins->name() == "gpu::add")
-            p.replace_instruction(ins, hip_add_clip{}, add_args);
+            m.replace_instruction(ins, hip_add_clip{}, add_args);
         else if(add_ins->name() == "gpu::triadd")
-            p.replace_instruction(ins, hip_triadd_clip{}, add_args);
+            m.replace_instruction(ins, hip_triadd_clip{}, add_args);
     }
 };
@@ -470,7 +470,7 @@ struct find_add_unary
                 .bind("add")));
     }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto add_ins = r.instructions["add"];
         auto ins     = r.result;
@@ -481,9 +481,9 @@ struct find_add_unary
         // Use the allocation from the relu operator
         args.back() = ins->inputs().back();
         if(add_ins->name() == "gpu::add")
-            p.replace_instruction(ins, binary_add_op, args);
+            m.replace_instruction(ins, binary_add_op, args);
         else if(add_ins->name() == "gpu::triadd")
-            p.replace_instruction(ins, ternary_add_op, args);
+            m.replace_instruction(ins, ternary_add_op, args);
     }
 };
@@ -498,7 +498,7 @@ struct find_triadd
                 .bind("input")));
     }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto add_ins   = r.instructions["add"];
         auto input_ins = r.instructions["input"];
@@ -513,7 +513,7 @@ struct find_triadd
         move_broadcasted_back(args);
         args.back() = ins->inputs().back();
-        p.replace_instruction(ins, hip_triadd{}, args);
+        m.replace_instruction(ins, hip_triadd{}, args);
     }
 };
@@ -525,7 +525,7 @@ struct find_mul_add
             match::name("gpu::mul")(match::used_once()).bind("mul"), match::any().bind("b")));
     }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto mul_ins = r.instructions["mul"];
         auto b_ins   = r.instructions["b"];
@@ -538,7 +538,7 @@ struct find_mul_add
         args.insert(std::prev(args.end()), b_ins);
         args.back() = ins->inputs().back();

-        p.replace_instruction(ins, hip_mul_add{}, args);
+        m.replace_instruction(ins, hip_mul_add{}, args);
     }
 };
@@ -550,7 +550,7 @@ struct find_mul_add_relu
             match::arg(0)(match::name("gpu::mul_add")(match::used_once()).bind("mul_add")));
     }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto mul_add_ins = r.instructions["mul_add"];
         auto ins         = r.result;
@@ -558,7 +558,7 @@ struct find_mul_add_relu
         // Use the allocation from the relu operator
         args.back() = ins->inputs().back();

-        p.replace_instruction(ins, hip_mul_add_relu{}, args);
+        m.replace_instruction(ins, hip_mul_add_relu{}, args);
     }
 };
@@ -783,7 +783,7 @@ auto conv_bias(Ms... ms)
 }

 template <class Op>
-void apply_conv_bias(context& ctx, module& p, match::matcher_result r)
+void apply_conv_bias(context& ctx, module& m, const match::matcher_result& r)
 {
     auto conv_ins = r.instructions["conv"];
     auto bias_ins = r.instructions["bias"];
@@ -798,7 +798,7 @@ void apply_conv_bias(context& ctx, module& p, match::matcher_result r)
     // TODO: Insert ws allocation
     auto ws = cb.get_workspace(ctx);
     (void)ws;
-    p.replace_instruction(ins, cb, input_ins, weights_ins, old_ws_ins, bias_ins, alloc_ins);
+    m.replace_instruction(ins, cb, input_ins, weights_ins, old_ws_ins, bias_ins, alloc_ins);
 }

 inline auto precompile_name(std::string s) // NOLINT
@@ -829,9 +829,9 @@ struct find_conv_bias
             match::output(match::name(std::unordered_set<std::string>{"gpu::relu"}))));
     }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
-        apply_conv_bias<miopen_conv_bias>(*ctx, p, std::move(r));
+        apply_conv_bias<miopen_conv_bias>(*ctx, m, r);
     }
 };
@@ -840,9 +840,9 @@ struct find_conv_bias_relu
     context* ctx = nullptr;
     auto matcher() const { return match::name("gpu::relu")(match::arg(0)(conv_bias())); }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
-        apply_conv_bias<miopen_conv_bias_relu>(*ctx, p, std::move(r));
+        apply_conv_bias<miopen_conv_bias_relu>(*ctx, m, r);
     }
 };
@@ -857,7 +857,7 @@ struct find_conv_pointwise
             fusable_conv(match::used_once()).bind("conv")));
     }

-    void apply(module& m, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto conv_ins = r.instructions["conv"];
         auto bias_ins = r.instructions["bias"];
@@ -896,7 +896,7 @@ struct find_gemm_add
             match::name("gpu::gemm")(match::nargs(3)).bind("gemm")));
     }

-    void apply(module& p, match::matcher_result r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto ins      = r.result;
         auto gemm_ins = r.instructions["gemm"];
@@ -908,26 +908,68 @@ struct find_gemm_add
         if(not float_equal(gemm.beta, 0))
             return;

+        if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [](auto i) {
+               return not i->get_shape().standard();
+           }))
+            return;
+
         auto inputs = gemm_ins->inputs();
         inputs.pop_back();

         auto copy_ins = c_ins;

         // Insert copy
-        if(ins == p.end() or c_ins->outputs().size() > 1 or c_ins->inputs().empty())
+        if(ins == m.end() or c_ins->outputs().size() > 1 or c_ins->inputs().empty())
         {
-            copy_ins = p.insert_instruction(ins, hip_copy{}, c_ins, ins->inputs().back());
+            copy_ins = m.insert_instruction(ins, hip_copy{}, c_ins, ins->inputs().back());
         }

         inputs.push_back(copy_ins);
         inputs.push_back(copy_ins);

         gemm.beta = 1;
-        p.replace_instruction(ins, gemm, inputs);
+        m.replace_instruction(ins, gemm, inputs);
     }
 };

+auto pointwise_name(const std::string& s)
+{
+    return precompile_name("pointwise")(match::make_basic_pred_matcher([=](auto ins) {
+        module_ref pm = ins->module_inputs().front();
+        auto n = std::count_if(pm->begin(), pm->end(), [&](auto& i) { return i.name() == s; });
+        if(n != 1)
+            return false;
+        return std::all_of(pm->begin(), pm->end(), [&](auto& i) {
+            return starts_with(i.name(), "@") or i.name() == s;
+        });
+    }));
+}
+
+struct find_gemm_pointwise
+{
+    auto matcher() const
+    {
+        return pointwise_name("add")(
+            match::nargs(3),
+            match::all_of[match::inputs()](match::standard_shape()),
+            match::either_arg(0, 1)(match::used_once().bind("c"),
+                                    match::name("gpu::gemm")(match::nargs(3)).bind("gemm")));
+    }
+
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins      = r.result;
+        auto gemm_ins = r.instructions["gemm"];
+        auto c_ins    = r.instructions["c"];
+        auto gemm     = any_cast<rocblas_gemm<op::dot>>(gemm_ins->get_operator());
+        // Already fused gemm
+        if(not float_equal(gemm.beta, 0))
+            return;
+        auto inputs = gemm_ins->inputs();
+        inputs.pop_back();
+        inputs.push_back(c_ins);
+        inputs.push_back(gemm_ins->inputs().back());
+        gemm.beta = 1;
+        m.replace_instruction(ins, gemm, inputs);
+    }
+};
@@ -938,22 +980,22 @@ struct find_commutative_broadcast
         return match::name("gpu::add", "gpu::mul")(match::arg(1)(match::broadcast_shape()));
     }

-    void apply(module& p, const match::matcher_result& r) const
+    void apply(module& m, const match::matcher_result& r) const
     {
         auto ins  = r.result;
         auto args = ins->inputs();
         move_broadcasted_back(args);

-        p.replace_instruction(ins, ins->get_operator(), args);
+        m.replace_instruction(ins, ins->get_operator(), args);
     }
 };

-void fuse_ops::apply(module& p) const
+void fuse_ops::apply(module& m) const
 {
-    match::find_matches(p, find_gelu{}, find_gelu_new{fast_math});
-    run_passes(p, {dead_code_elimination{}});
-    match::find_matches(p, find_triadd{});
-    match::find_matches(p,
+    match::find_matches(m, find_gelu{}, find_gelu_new{fast_math});
+    run_passes(m, {dead_code_elimination{}});
+    match::find_matches(m, find_triadd{});
+    match::find_matches(m,
                         find_layernorm{},
                         find_conv_pointwise{ctx},
                         find_conv_bias_relu{ctx},
@@ -966,8 +1008,12 @@ void fuse_ops::apply(module& p) const
                         find_add_unary{"gpu::sigmoid", hip_add_sigmoid{}, hip_triadd_sigmoid{}},
                         find_add_unary{"gpu::tanh", hip_add_tanh{}, hip_triadd_tanh{}},
                         find_add_clip{});
-    run_passes(p, {dead_code_elimination{}});
-    match::find_matches(p, find_triadd_layernorm{}, find_gemm_add{}, find_commutative_broadcast{});
+    run_passes(m, {dead_code_elimination{}});
+    match::find_matches(
+        m, find_triadd_layernorm{}, find_gemm_add{}, find_gemm_pointwise{}, find_commutative_broadcast{});
 }

 } // namespace gpu
src/targets/gpu/gemm_impl.cpp
View file @ faefeef9
#include <rocblas.h>
#include <migraphx/gpu/gemm_impl.hpp>
+#include <migraphx/reduce_dims.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
...
@@ -27,6 +28,22 @@ rocblas_datatype get_type(shape::type_t type)
    MIGRAPHX_THROW("ROCBLAS_GEMM: data type not supported!");
}

+void blas_shape(const shape& s)
+{
+    if(s.lens().size() < 2)
+        return;
+    if(std::none_of(s.strides().end() - 2, s.strides().end(), [&](auto i) { return i == 1; }))
+        MIGRAPHX_THROW("GPU_GEMM: needs to have one matrix stride as 1");
+    if(s.lens().size() < 3)
+        return;
+    shape batch_shape{s.type(),
+                      {s.lens().begin(), s.lens().end() - 2},
+                      {s.strides().begin(), s.strides().end() - 2}};
+    auto batch_shapes = reduce_dims({batch_shape});
+    if(batch_shapes.front().lens().size() != 1)
+        MIGRAPHX_THROW("GPU_GEMM: Batch dimension is not collapsible");
+}
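The batch check leans on reduce_dims (from the include added above) to merge adjacent batch dimensions whose strides are contiguous. A worked example with hypothetical shapes:

    // lens {2, 3, 4, 5}, packed strides {60, 20, 5, 1}: the batch part is
    // lens {2, 3} / strides {60, 20}. Since 60 == 20 * 3, reduce_dims merges
    // it into lens {6} / stride {20}, a single dimension, so blas_shape
    // accepts it as one strided-batched GEMM.
    // With batch strides {120, 20} instead, 120 != 20 * 3, the dimensions
    // cannot merge and blas_shape throws
    // "GPU_GEMM: Batch dimension is not collapsible".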
template <class R, class... Ts, class... Us>
R rocblas_invoke(R (*f)(Ts...), Us... xs)
{
...
@@ -36,6 +53,18 @@ R rocblas_invoke(R (*f)(Ts...), Us... xs)
    return f(xs..., nullptr, nullptr);
}

+static bool is_transposed(const shape& s)
+{
+    if(not s.transposed())
+        return false;
+    return s.strides().back() != 1;
+}
+
+static rocblas_int get_batch_stride(const argument& a)
+{
+    return a.get_shape().strides()[a.get_shape().strides().size() - 3];
+}
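Two small helpers: is_transposed treats a shape as transposed for rocBLAS purposes only when its innermost stride is not 1 (a shape flagged as transposed whose last stride is still 1 can be fed to rocBLAS as a normal matrix), and get_batch_stride reads the third-from-last stride, i.e. the element distance between consecutive matrices in the batch.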
template <class T>
void gemm_impl(context& ctx,
               const shape& output_shape,
...
@@ -45,8 +74,8 @@ void gemm_impl(context& ctx,
               bool int8_x4_format,
               bool compute_fp32)
{
-    bool transa = args[0].get_shape().transposed();
+    bool transa = is_transposed(args[0].get_shape());
-    bool transb = args[1].get_shape().transposed();
+    bool transb = is_transposed(args[1].get_shape());
    auto n_dim  = output_shape.lens().size();
    auto dim_1  = n_dim - 1;
    auto dim_0  = n_dim - 2;
...
@@ -142,6 +171,9 @@ void gemm_impl(context& ctx,
    }
    else
    {
+        auto a_stride = get_batch_stride(args[0]);
+        auto b_stride = get_batch_stride(args[1]);
+        auto c_stride = get_batch_stride(args[2]);
        rocblas_invoke(&rocblas_gemm_strided_batched_ex,
                       ctx.get_stream().get_rocblas(),
                       transb ? rocblas_operation_transpose : rocblas_operation_none,
...
@@ -153,20 +185,20 @@ void gemm_impl(context& ctx,
                       to_pointer(args.at(1)),
                       arg_type,
                       ldb,
-                      k * n,
+                      b_stride,
                       to_pointer(args.at(0)),
                       arg_type,
                       lda,
-                      m * k,
+                      a_stride,
                       beta_v,
                       to_pointer(args[2]),
                       output_type,
                       ldc,
-                      m * n,
+                      c_stride,
                       is_3inputs ? to_pointer(args[3]) : to_pointer(args[2]),
                       output_type,
                       ldc,
-                      m * n,
+                      c_stride,
                       num_matrices,
                       compute_type,
                       rocblas_gemm_algo_standard,
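Passing the actual batch strides (a_stride, b_stride, c_stride) instead of the packed products k * n, m * k, and m * n lets the strided-batched call work on inputs whose matrices are not densely packed in memory. A hypothetical example:

    // A with lens {4, 8, 16} and strides {256, 16, 1} (each 8x16 matrix
    // padded to 256 elements): the packed guess m * k = 8 * 16 = 128 would
    // step to the wrong matrices, while get_batch_stride(args[0]) correctly
    // returns 256.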
...
src/targets/gpu/include/migraphx/gpu/analyze_streams.hpp
View file @ faefeef9
...
@@ -11,7 +11,7 @@ struct module;
namespace gpu {

-std::vector<stream_race> analyze_streams(const module& p);
+std::vector<stream_race> analyze_streams(const module& m);

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
...
src/targets/gpu/include/migraphx/gpu/eliminate_workspace.hpp
View file @ faefeef9
...
@@ -14,7 +14,7 @@ namespace gpu {
struct eliminate_workspace
{
    std::string name() const { return "eliminate_workspace"; }
-    void apply(module& p) const;
+    void apply(module& m) const;
};

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
...
src/targets/gpu/include/migraphx/gpu/fuse_ops.hpp
View file @ faefeef9
...
@@ -16,7 +16,7 @@ struct fuse_ops
    context* ctx   = nullptr;
    bool fast_math = true;
    std::string name() const { return "gpu::fuse_ops"; }
-    void apply(module& p) const;
+    void apply(module& m) const;
};

} // namespace gpu
...
src/targets/gpu/include/migraphx/gpu/gemm.hpp
View file @ faefeef9
...
@@ -18,6 +18,8 @@ namespace gpu {
struct context;

+void blas_shape(const shape& s);
+
template <class Op>
struct rocblas_gemm
{
...
@@ -50,13 +52,14 @@ struct rocblas_gemm
        std::vector<shape> in_shapes(inputs);
        in_shapes.pop_back();
        check_shapes{in_shapes, *this}.not_broadcasted();
-        batch_not_transposed(inputs[0].strides());
+        blas_shape(inputs[0]);
-        batch_not_transposed(inputs[1].strides());
+        blas_shape(inputs[1]);

        // if gemm and add are fused
-        if(not float_equal(beta, 0))
+        if(in_shapes.size() > 2)
        {
            auto cmat_shape = in_shapes.back();
            in_shapes.pop_back();
+            blas_shape(cmat_shape);
            auto op_out_shape = op.compute_shape(in_shapes);
            if(cmat_shape.lens() != op_out_shape.lens())
            {
...
@@ -71,6 +74,7 @@ struct rocblas_gemm
                               to_string(cmat_shape.type()) +
                               ", it must be: " + to_string(op_out_shape.type()));
            }
            return op_out_shape;
        }

        return op.compute_shape(in_shapes);
...
@@ -96,28 +100,6 @@ struct rocblas_gemm
        return args.back();
    }
-    void batch_not_transposed(const std::vector<std::size_t>& strides) const
-    {
-        if(strides.size() <= 2)
-            return;
-        auto dim_0       = strides.size() - 2;
-        auto matrix_size = std::max(strides[dim_0], strides[dim_0 + 1]);
-        std::vector<std::size_t> batch(strides.begin(), strides.begin() + dim_0);
-        if(std::all_of(batch.begin(), batch.end(), [&](auto i) { return (i < matrix_size); }))
-        {
-            MIGRAPHX_THROW("GPU_GEMM: matrix size and batch size {" + to_string_range(strides) +
-                           "} are transposed!");
-        }
-
-        if(std::adjacent_find(batch.begin(), batch.end(), [&](auto i, auto j) {
-               return (i < j or i < matrix_size or j < matrix_size);
-           }) != batch.end())
-        {
-            MIGRAPHX_THROW("GPU_GEMM: batch size {" + to_string_range(strides) + "} is transposed!");
-        }
-    }
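The removed batch_not_transposed heuristic compared raw stride values to guess whether the batch dimensions were transposed. It is superseded by blas_shape (declared above and defined in gemm_impl.cpp), which instead verifies that one matrix stride is 1 and that the batch dimensions collapse to a single strided-batched axis via reduce_dims, a check that is both stricter and easier to reason about.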
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
...
src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp
0 → 100644
View file @ faefeef9
#ifndef MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP
#define MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP

#include <migraphx/config.hpp>
#include <migraphx/gpu/context.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {

struct module;

namespace gpu {

struct prefuse_ops
{
    std::string name() const { return "gpu::prefuse_ops"; }
    void apply(module& m) const;
};

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif // MIGRAPHX_GUARD_GPU_PREFUSE_OPS_HPP
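This new header declares only the pass interface: gpu::prefuse_ops exposes name() and apply(module&), matching the other passes in this directory, and is presumably scheduled by the gpu target ahead of gpu::fuse_ops, as the name suggests. A hypothetical invocation, mirroring how passes are run elsewhere in this commit:

    // run_passes(m, {gpu::prefuse_ops{}, dead_code_elimination{}});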