Merge branch 'develop' into dot-add

fd3252dc · Umang Yadav · GitHub · 56615a84 · 8192f37f · fd3252dc
Unverified Commit fd3252dc authored Jul 08, 2022 by Umang Yadav Committed by GitHub Jul 08, 2022
20 changed files
--- a/src/program.cpp
+++ b/src/program.cpp
@@ -159,6 +159,25 @@ instruction_ref program::validate() const
    return mm->validate();
 }

+/// Assign every instruction of the main module to one of the given targets.
+///
+/// For each instruction the target with the largest `is_supported(ins, metric)`
+/// value is chosen; on a tie the target that appears first in `targets` wins,
+/// because std::max_element returns the first maximum.
+///
+/// @param targets Candidate targets. When empty there is nothing to choose from
+///                and an empty assignment table is returned.
+/// @param options Supplies the metric forwarded to `target::is_supported`.
+/// @return The instruction -> target-name assignments.
+target_assignments program::get_target_assignments(const std::vector<target>& targets,
+                                                   assignment_options options)
+{
+    target_assignments p;
+
+    // std::max_element returns end() for an empty range; dereferencing it below
+    // would be undefined behaviour, so bail out before walking the module.
+    if(targets.empty())
+        return p;
+
+    const auto m = options.metric;
+
+    const auto* mod = get_main_module();
+    for(auto it : iterator_for(*mod))
+    {
+        auto t = std::max_element(
+            targets.begin(), targets.end(), [it, m](const target& lhs, const target& rhs) {
+                return lhs.is_supported(it, m) < rhs.is_supported(it, m);
+            });
+        p.add_assignment(it, t->name());
+    }
+    return p;
+}
+
 bool program::is_compiled() const { return not this->impl->target_name.empty(); }

 void program::compile(const target& t, compile_options options)
@@ -504,12 +523,14 @@ static void mod_from_val(module_ref mod,

        if(name == "@param")
        {
-            output = mod->add_parameter(fields["parameter"].to<std::string>(),
-                                        migraphx::from_value<shape>(node.at("shape")));
+            output = mod->insert_parameter(mod->end(),
+                                           fields["parameter"].to<std::string>(),
+                                           migraphx::from_value<shape>(node.at("shape")));
        }
        else if(name == "@literal")
        {
-            output = mod->add_literal(migraphx::from_value<literal>(node.at("literal")));
+            output =
+                mod->insert_literal(mod->end(), migraphx::from_value<literal>(node.at("literal")));
        }
        else
        {
@@ -544,11 +565,11 @@ static void mod_from_val(module_ref mod,
            }
            else if(module_inputs.empty())
            {
-                output = mod->add_instruction(op, inputs);
+                output = mod->insert_instruction(mod->end(), op, inputs);
            }
            else
            {
-                output = mod->add_instruction(op, inputs, module_inputs);
+                output = mod->insert_instruction(mod->end(), op, inputs, module_inputs);
            }
        }
        output->set_normalized(normalized);

--- a/src/serialize.cpp
+++ b/src/serialize.cpp
@@ -36,7 +36,7 @@ void raw_data_to_value(value& v, const RawData& rd)
    result["shape"] = migraphx::to_value(rd.get_shape());
    if(rd.get_shape().type() == shape::tuple_type)
        result["sub"] = migraphx::to_value(rd.get_sub_objects());
-    else
+    else if(not rd.empty())
        result["data"] = migraphx::value::binary(rd.data(), rd.get_shape().bytes());
    v = result;
 }
@@ -56,7 +56,7 @@ void migraphx_from_value(const value& v, argument& a)
        literal l = migraphx::from_value<literal>(v);
        a         = l.get_argument();
    }
-    else
+    else if(v.contains("sub"))
    {
        a = migraphx::from_value<std::vector<argument>>(v.at("sub"));
    }

--- a/src/shape.cpp
+++ b/src/shape.cpp
@@ -26,6 +26,7 @@
 #include <migraphx/stringutils.hpp>
 #include <migraphx/serialize.hpp>
 #include <migraphx/permutation.hpp>
+#include <migraphx/ranges.hpp>
 #include <numeric>
 #include <algorithm>
 #include <functional>
@@ -65,13 +66,21 @@ struct shape_impl
                     std::is_sorted(m_strides.rbegin(), m_strides.rend());
    }

+    // Construct the implementation of a dynamic shape: only the element type and
+    // the per-dimension entries are stored; m_lens, m_strides and m_shapes keep
+    // their empty defaults and m_standard stays false.
+    shape_impl(shape::type_t t, std::vector<shape::dynamic_dimension> dims)
+        : m_type(t), m_dyn_dims(std::move(dims))
+    {
+    }
+
    shape_impl(const std::vector<shape>& subs) : m_type(shape::tuple_type), m_shapes(subs) {}
+
    shape::type_t m_type;
    std::vector<std::size_t> m_lens    = {};
    std::vector<std::size_t> m_strides = {};
    std::vector<shape> m_shapes        = {};
    bool m_standard                    = false;

+    std::vector<shape::dynamic_dimension> m_dyn_dims = {};
+
    void calculate_strides()
    {
        m_strides.clear();
@@ -87,6 +96,12 @@ struct shape_impl

    std::size_t element_space() const
    {
+        if(not m_dyn_dims.empty())
+        {
+            auto maxes = max_lens();
+            return std::accumulate(maxes.begin(), maxes.end(), std::size_t{1}, std::multiplies<>());
+        }
+
        assert(m_lens.size() == m_strides.size());
        if(m_lens.empty())
            return 0;
@@ -101,6 +116,11 @@ struct shape_impl

    std::size_t elements() const
    {
+        if(not m_dyn_dims.empty())
+        {
+            MIGRAPHX_THROW("SHAPE: elements() called on dynamic shape");
+        }
+
        assert(m_lens.size() == m_strides.size());
        if(m_lens.empty())
            return 0;
@@ -108,6 +128,35 @@ struct shape_impl
            m_lens.begin(), m_lens.end(), std::size_t{1}, std::multiplies<std::size_t>());
    }

+    // Collect the `min` field of every dynamic dimension, in dimension order.
+    std::vector<std::size_t> min_lens() const
+    {
+        std::vector<std::size_t> lens(m_dyn_dims.size());
+        std::transform(m_dyn_dims.begin(),
+                       m_dyn_dims.end(),
+                       lens.begin(),
+                       std::mem_fn(&shape::dynamic_dimension::min));
+        return lens;
+    }
+
+    // Collect the `max` field of every dynamic dimension, in dimension order.
+    std::vector<std::size_t> max_lens() const
+    {
+        std::vector<std::size_t> lens(m_dyn_dims.size());
+        std::transform(m_dyn_dims.begin(),
+                       m_dyn_dims.end(),
+                       lens.begin(),
+                       std::mem_fn(&shape::dynamic_dimension::max));
+        return lens;
+    }
+
+    // Collect the `opt` field of every dynamic dimension, in dimension order.
+    std::vector<std::size_t> opt_lens() const
+    {
+        std::vector<std::size_t> lens(m_dyn_dims.size());
+        std::transform(m_dyn_dims.begin(),
+                       m_dyn_dims.end(),
+                       lens.begin(),
+                       std::mem_fn(&shape::dynamic_dimension::opt));
+        return lens;
+    }
    // Does the shape skip over elements?
    bool skips() const
    {
@@ -165,6 +214,16 @@ shape::shape(type_t t, std::vector<std::size_t> l, std::vector<std::size_t> s)
 {
 }

+// Convenience overload: copies a braced list of lengths into a vector and
+// delegates to the (type, std::vector<std::size_t>) constructor.
+shape::shape(type_t t, std::initializer_list<std::size_t> d)
+    : shape::shape(t, std::vector<std::size_t>{d.begin(), d.end()})
+{
+}
+
+// Build a shape described by dynamic_dimension entries instead of fixed lens
+// and strides; a non-empty `dims` makes shape::dynamic() return true.
+shape::shape(type_t t, std::vector<shape::dynamic_dimension> dims)
+    : impl(std::make_shared<shape_impl>(t, std::move(dims)))
+{
+}
+
 shape::shape(const std::vector<shape>& subs) : impl(std::make_shared<shape_impl>(subs)) {}

 shape::shape(std::shared_ptr<shape_impl> pimpl) : impl(std::move(pimpl)) {}
@@ -180,9 +239,13 @@ shape shape::from_permutation(type_t t,
 }

 shape::type_t shape::type() const { return impl->m_type; }
+
 const std::vector<std::size_t>& shape::lens() const { return impl->m_lens; }
+
 const std::vector<std::size_t>& shape::strides() const { return impl->m_strides; }
+
 std::size_t shape::elements() const { return impl->elements(); }
+
 std::size_t shape::bytes() const
 {
    if(this->sub_shapes().empty())
@@ -199,6 +262,7 @@ std::size_t shape::bytes() const
                               [&](auto x, auto y) { return x + y.bytes(); });
    }
 }
+
 std::size_t shape::type_size() const
 {
    std::size_t n = 0;
@@ -206,20 +270,35 @@ std::size_t shape::type_size() const
        this->visit_type([&](auto as) { n = as.size(); });
    return n;
 }
+
 std::size_t shape::index(std::initializer_list<std::size_t> l) const
 {
+    if(this->dynamic())
+    {
+        MIGRAPHX_THROW("SHAPE: index() called on dynamic shape");
+    }
    assert(l.size() <= this->lens().size());
    assert(this->lens().size() == this->strides().size());
    return std::inner_product(l.begin(), l.end(), this->strides().begin(), std::size_t{0});
 }
+
 std::size_t shape::index(const std::vector<std::size_t>& l) const
 {
+    if(this->dynamic())
+    {
+        MIGRAPHX_THROW("SHAPE: index() called on dynamic shape");
+    }
    assert(l.size() <= this->lens().size());
    assert(this->lens().size() == this->strides().size());
    return std::inner_product(l.begin(), l.end(), this->strides().begin(), std::size_t{0});
 }
+
 std::size_t shape::index(std::size_t i) const
 {
+    if(this->dynamic())
+    {
+        MIGRAPHX_THROW("SHAPE: index() called on dynamic shape");
+    }
    assert(this->lens().size() == this->strides().size());
    if(this->standard())
        return i;
@@ -267,12 +346,20 @@ void shape::multi_copy(std::size_t i, std::size_t* start, const std::size_t* end

 bool shape::packed() const
 {
+    if(this->dynamic())
+    {
+        return false;
+    }
    return this->sub_shapes().empty() and not impl->skips() and
           this->elements() == this->element_space();
 }

 bool shape::transposed() const
 {
+    if(this->dynamic())
+    {
+        return false;
+    }
    if(this->broadcasted())
    {
        // TODO: Use a filter_iterator instead
@@ -292,6 +379,10 @@ bool shape::transposed() const

 bool shape::broadcasted() const
 {
+    if(this->dynamic())
+    {
+        return false;
+    }
    assert(this->lens().size() == this->strides().size());
    return std::any_of(
        this->strides().begin(), this->strides().end(), [](auto x) { return x == 0; });
@@ -299,6 +390,10 @@ bool shape::broadcasted() const

 bool shape::scalar() const
 {
+    if(this->dynamic())
+    {
+        return false;
+    }
    assert(this->lens().size() == this->strides().size());
    // if any stride > 0, then accumulate will return false
    return this->sub_shapes().empty() and
@@ -317,6 +412,10 @@ shape shape::normalize_standard() const

 shape shape::with_lens(type_t t, const std::vector<std::size_t>& l) const
 {
+    if(this->dynamic())
+    {
+        MIGRAPHX_THROW("SHAPE: with_lens() called on dynamic shape");
+    }
    assert(l.size() == this->lens().size());
    auto perm = find_permutation(*this);
    return shape::from_permutation(t, l, perm);
@@ -324,6 +423,10 @@ shape shape::with_lens(type_t t, const std::vector<std::size_t>& l) const

 shape shape::with_lens(const std::vector<std::size_t>& l) const
 {
+    if(this->dynamic())
+    {
+        MIGRAPHX_THROW("SHAPE: with_lens() called on dynamic shape");
+    }
    return this->with_lens(this->type(), l);
 }

@@ -338,20 +441,80 @@ std::size_t shape::element_space() const { return impl->element_space(); }

 std::string shape::type_string() const { return name(this->type()); }

+// A shape is dynamic exactly when it carries at least one dynamic_dimension.
+bool shape::dynamic() const { return not impl->m_dyn_dims.empty(); }
+
+// Read-only access to the stored dynamic dimensions (empty for a static shape).
+const std::vector<shape::dynamic_dimension>& shape::dyn_dims() const { return impl->m_dyn_dims; }
+
+// Per-dimension `min` values for a dynamic shape; a static shape simply
+// reports its lens.
+std::vector<std::size_t> shape::min_lens() const
+{
+    if(not this->dynamic())
+        return this->lens();
+    return impl->min_lens();
+}
+
+// Per-dimension `max` values for a dynamic shape; a static shape simply
+// reports its lens.
+std::vector<std::size_t> shape::max_lens() const
+{
+    if(not this->dynamic())
+        return this->lens();
+    return impl->max_lens();
+}
+
+// Per-dimension `opt` values for a dynamic shape; a static shape simply
+// reports its lens.
+std::vector<std::size_t> shape::opt_lens() const
+{
+    if(not this->dynamic())
+        return this->lens();
+    return impl->opt_lens();
+}
+
+// A dimension is fixed when its min and max coincide.
+bool shape::dynamic_dimension::is_fixed() const { return this->min == this->max; }
+
+// An `opt` of 0 is treated as "no optimal value set".
+bool shape::dynamic_dimension::has_optimal() const { return opt != 0; }
+
+// Reflection hook: passes each field together with its name ("min", "max",
+// "opt") to `f` and packs the results.
+// NOTE(review): this is a member template with a deduced return type defined in
+// a .cpp file, so it presumably can only be used from this translation unit -
+// confirm no other file needs to reflect dynamic_dimension.
+template <class Self, class F>
+auto shape::dynamic_dimension::reflect(Self& self, F f)
+{
+    return pack(f(self.min, "min"), f(self.max, "max"), f(self.opt, "opt"));
+}
+
+// Two dynamic dimensions are equal when min, max and opt all match.
+bool operator==(const shape::dynamic_dimension& x, const shape::dynamic_dimension& y)
+{
+    if(x.min != y.min)
+        return false;
+    if(x.max != y.max)
+        return false;
+    return x.opt == y.opt;
+}
+
+// Negation of operator== for dynamic dimensions.
+bool operator!=(const shape::dynamic_dimension& x, const shape::dynamic_dimension& y)
+{
+    return !(x == y);
+}
+// Print a dynamic dimension as "[min, max, opt]".
+std::ostream& operator<<(std::ostream& os, const shape::dynamic_dimension& x)
+{
+    return os << "[" << x.min << ", " << x.max << ", " << x.opt << "]";
+}
+
 bool operator==(const shape& x, const shape& y)
 {
-    return x.impl == y.impl or (x.type() == y.type() and x.lens() == y.lens() and
-                                x.strides() == y.strides() and x.sub_shapes() == y.sub_shapes());
+    // Shapes sharing one impl are trivially equal.
+    if(x.impl == y.impl)
+        return true;
+    // A dynamic shape never equals a static one, and the element types must agree.
+    if(x.dynamic() != y.dynamic() or x.type() != y.type())
+        return false;
+    // Dynamic shapes are compared by their dynamic dimensions, static shapes by
+    // lens and strides; sub-shapes must match in both cases.
+    const bool same_dims = x.dynamic() ? x.dyn_dims() == y.dyn_dims()
+                                       : (x.lens() == y.lens() and x.strides() == y.strides());
+    return same_dims and x.sub_shapes() == y.sub_shapes();
 }
+
 bool operator!=(const shape& x, const shape& y) { return !(x == y); }

 std::ostream& operator<<(std::ostream& os, const shape& x)
 {
    if(x.sub_shapes().empty())
    {
-        os << x.type_string() << ", ";
-        os << "{" << to_string_range(x.lens()) << "}, ";
-        os << "{" << to_string_range(x.strides()) << "}";
+        if(x.dynamic())
+        {
+            os << "dynamic, ";
+            os << x.type_string() << ", ";
+            os << "{" << to_string_range(x.dyn_dims()) << "}";
+        }
+        else
+        {
+            os << x.type_string() << ", ";
+            os << "{" << to_string_range(x.lens()) << "}, ";
+            os << "{" << to_string_range(x.strides()) << "}";
+        }
    }
    else
    {
@@ -375,12 +538,14 @@ const std::vector<shape>& shape::sub_shapes() const { return impl->m_shapes; }
 void migraphx_to_value(value& v, const shape& s)
 {
    value result;
-    result["type"]       = migraphx::to_value(s.type_string());
-    result["lens"]       = migraphx::to_value(s.lens());
-    result["strides"]    = migraphx::to_value(s.strides());
-    result["sub_shapes"] = migraphx::to_value(s.sub_shapes());
-    v                    = result;
+    // "dynamic_dimensions" is written for every shape, static or dynamic;
+    // migraphx_from_value tests whether it is empty to decide which shape
+    // constructor to call.
+    result["type"]               = migraphx::to_value(s.type_string());
+    result["lens"]               = migraphx::to_value(s.lens());
+    result["strides"]            = migraphx::to_value(s.strides());
+    result["sub_shapes"]         = migraphx::to_value(s.sub_shapes());
+    result["dynamic_dimensions"] = migraphx::to_value(s.dyn_dims());
+    v                            = result;
 }
+
 void migraphx_from_value(const value& v, shape& s)
 {
    auto t = v.at("type").get_string();
@@ -390,9 +555,25 @@ void migraphx_from_value(const value& v, shape& s)
    }
    else
    {
-        s = shape{shape::parse_type(t),
-                  v.at("lens").to_vector<std::size_t>(),
-                  v.at("strides").to_vector<std::size_t>()};
+        if(v.at("dynamic_dimensions").empty())
+        {
+            s = shape{shape::parse_type(t),
+                      v.at("lens").to_vector<std::size_t>(),
+                      v.at("strides").to_vector<std::size_t>()};
+        }
+        else
+        {
+            auto v_dd = v.at("dynamic_dimensions");
+            std::vector<shape::dynamic_dimension> dyn_dims(v.at("dynamic_dimensions").size());
+            std::transform(v_dd.begin(), v_dd.end(), dyn_dims.begin(), [](migraphx::value x) {
+                auto x_min = x.at("min").template to<size_t>();
+                auto x_max = x.at("max").template to<size_t>();
+                auto x_opt = x.at("opt").template to<size_t>();
+                return shape::dynamic_dimension{x_min, x_max, x_opt};
+            });
+
+            s = shape{shape::parse_type(t), dyn_dims};
+        }
    }
 }


--- a/src/simplify_reshapes.cpp
+++ b/src/simplify_reshapes.cpp
@@ -272,7 +272,7 @@ struct find_concat_transpose
 {
    auto matcher() const
    {
-        return match::name("concat")(match::all_of[match::inputs()](match::transpose_shape()));
+        return match::name("concat")(match::all_of[match::inputs()](match::name("transpose")));
    }

    void apply(module& m, const match::matcher_result& mr) const
@@ -601,6 +601,69 @@ struct find_transpose_contiguous_reshaper_unary
    }
 };

+// Rewrites "slice -> transpose" chains that hang off one instruction so that a
+// single transpose is applied to the common input first and the slices are
+// taken from that transposed value.
+struct find_slice_transpose
+{
+    // Matches any instruction that has at least one "slice" output which is in
+    // turn matched by match::output(name("transpose")) (presumably: the slice
+    // feeds a transpose - see the matcher library for the exact semantics).
+    auto matcher() const
+    {
+        return match::any(match::any_of[match::outputs()](
+            match::name("slice")(match::output(match::name("transpose")))));
+    }
+
+    // Returns the "permutation" attribute that occurs most often among the given
+    // transposes. Counts are kept in a std::map keyed by permutation, and
+    // std::max_element returns the first maximum, so on a tie the smallest
+    // permutation in map order is chosen. `transposes` must not be empty.
+    static std::vector<int64_t> find_common_perm(const std::vector<instruction_ref>& transposes)
+    {
+        std::map<std::vector<int64_t>, int64_t> count;
+        for(auto t : transposes)
+        {
+            auto perm = t->get_operator().to_value()["permutation"].to_vector<int64_t>();
+            count[perm]++;
+        }
+        return std::max_element(
+                   count.begin(), count.end(), by(std::less<>{}, [](auto&& p) { return p.second; }))
+            ->first;
+    }
+
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins = r.result;
+        // Keep only the slice outputs whose single consumer is a transpose.
+        std::vector<instruction_ref> splits;
+        std::copy_if(ins->outputs().begin(),
+                     ins->outputs().end(),
+                     std::back_inserter(splits),
+                     [&](instruction_ref out) {
+                         return out->name() == "slice" and out->outputs().size() == 1 and
+                                out->outputs().front()->name() == "transpose";
+                     });
+        // Nothing is rewritten unless at least two such slices exist.
+        if(splits.size() < 2)
+            return;
+        // transposes[i] is the transpose consuming splits[i].
+        std::vector<instruction_ref> transposes;
+        std::transform(splits.begin(),
+                       splits.end(),
+                       std::back_inserter(transposes),
+                       [](auto split) { return split->outputs().front(); });
+        auto perm  = find_common_perm(transposes);
+        auto iperm = invert_permutation(perm);
+        // Transpose the shared input once, directly after the instruction that
+        // produces it.
+        auto pre   = m.insert_instruction(
+            std::next(ins), make_op("transpose", {{"permutation", perm}}), ins);
+        for(auto i : range(transposes.size()))
+        {
+            auto split = splits[i];
+            auto t     = transposes[i];
+            // Copy the slice operator and remap its axes through the inverse
+            // permutation so it can be applied to the transposed input.
+            auto op    = any_cast<op::slice>(split->get_operator());
+            std::transform(op.axes.begin(), op.axes.end(), op.axes.begin(), [&](auto axis) {
+                return iperm[axis];
+            });
+            auto new_ins = m.insert_instruction(t, op, pre);
+            // A transpose whose operator differs from the common one still needs
+            // an extra transpose on top of the new slice.
+            if(t->get_operator() != pre->get_operator())
+            {
+                auto curr = t->get_operator().to_value()["permutation"].to_vector<int64_t>();
+                new_ins   = m.insert_instruction(
+                    t, make_op("transpose", {{"permutation", reorder_dims(iperm, curr)}}), new_ins);
+            }
+            m.replace_instruction(t, new_ins);
+        }
+    }
+};
+
 void simplify_reshapes::apply(module& m) const
 {
    for(int i = 0; i < 2; i++)
@@ -616,6 +679,7 @@ void simplify_reshapes::apply(module& m) const
                            find_nested_convert{},
                            find_nested_slice{},
                            find_nested_concat{},
+                            find_slice_transpose{},
                            find_transpose_contiguous_reshaper_unary{});
        dead_code_elimination{}.apply(m);
    }

--- a/src/target_assignments.cpp
+++ b/src/target_assignments.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <migraphx/target_assignments.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+// Record that `ins` is assigned to the target named `target`.
+// NOTE(review): this uses emplace(), which for the standard associative
+// containers keeps an existing entry for the same key unchanged - confirm the
+// container type of `assignments` and whether re-assignment should overwrite.
+void target_assignments::add_assignment(instruction_ref ins, const std::string& target)
+{
+    assignments.emplace(ins, target);
+}
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/cpu/write_literals.cpp
+++ b/src/targets/cpu/write_literals.cpp
@@ -25,6 +25,7 @@
 #include <migraphx/module.hpp>
 #include <migraphx/instruction.hpp>
 #include <migraphx/iterator_for.hpp>
+#include <migraphx/register_op.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -52,6 +53,7 @@ struct cpu_literal
        return os;
    }
 };
+MIGRAPHX_REGISTER_OP(cpu_literal);

 void write_literals::apply(module& m) const
 {

--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -164,6 +164,7 @@ add_library(migraphx_gpu
    deconvolution.cpp
    device_name.cpp
    elu.cpp
+    fuse_mlir.cpp
    fuse_ops.cpp
    gather.cpp
    gemm_impl.cpp
@@ -176,7 +177,7 @@ add_library(migraphx_gpu
    loop.cpp
    lrn.cpp
    leaky_relu.cpp
-    mlir_conv.cpp
+    mlir.cpp
    multinomial.cpp
    nonzero.cpp
    pack_args.cpp
@@ -320,16 +321,26 @@ message(STATUS "extractkernel: ${MIGRAPHX_EXTRACT_KERNEL}")

 set(MIGRAPHX_ENABLE_MLIR OFF CACHE BOOL "")
 if(MIGRAPHX_ENABLE_MLIR)
-    find_library(LIBMLIRMIOPEN MLIRMIOpenThin REQUIRED)
+    find_library(MLIRAPI_LIBRARY MLIRMIOpen 
+        PATH_SUFFIXES
+        # Workaround for a broken MLIR install
+        lib/ lib/lib)
    # REQUIRED is not supported before cmake 3.18
-    if(NOT LIBMLIRMIOPEN)
-        message(FATAL_ERROR "libMLIRMIOpenThin not found")
+    if(NOT MLIRAPI_LIBRARY)
+        message(FATAL_ERROR "libMLIRMIOpen not found")
    else()
-        message(STATUS "Build with libMLIRMIOpenThin: " ${LIBMLIRMIOPEN})
+        message(STATUS "Build with libMLIRMIOpen: " ${MLIRAPI_LIBRARY})
    endif()

-    target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_MLIR_MIOPEN_SUPPORT")
-    target_link_libraries(migraphx_gpu PUBLIC ${LIBMLIRMIOPEN})
+    find_path(MLIRAPI_HEADERS NAMES mlir-c/Dialect/MIGraphX.h)
+    # Workaround MLIR broken installation
+    find_path(MLIRAPI_HEADERS2 NAMES mlir-c/Registration.h
+        PATH_SUFFIXES 
+        include/external/include external/include)
+
+    target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_MLIR")
+    target_include_directories(migraphx_gpu SYSTEM PRIVATE ${MLIRAPI_HEADERS} ${MLIRAPI_HEADERS2})
+    target_link_libraries(migraphx_gpu PUBLIC ${MLIRAPI_LIBRARY})
 endif()

 set(MIGRAPHX_USE_HIPRTC OFF CACHE BOOL "")

--- a/src/targets/gpu/code_object_op.cpp
+++ b/src/targets/gpu/code_object_op.cpp
@@ -52,7 +52,7 @@ code_object_op::compute(context& ctx, const shape&, const std::vector<argument>&
    std::transform(
        args.begin(), args.end(), kargs.begin(), [](const argument& a) { return a.data(); });
    k.launch(ctx.get_stream().get(), global, local, std::move(kargs));
-    return args.back();
+    return args[get_output_arg(args.size())];
 }
 void code_object_op::finalize(context&, const shape&, const std::vector<shape>&)
 {

--- a/src/targets/gpu/compile_gen.cpp
+++ b/src/targets/gpu/compile_gen.cpp
@@ -43,6 +43,9 @@ static std::vector<std::size_t> vector_sizes(const std::vector<shape>& inputs)

 vectorize vectorize::elements(std::size_t axis, const std::vector<shape>& inputs)
 {
+    if(std::all_of(
+           inputs.begin(), inputs.end(), [&](const auto& s) { return s.lens()[axis] == 1; }))
+        return {1, axis};
    auto sizes = vector_sizes(inputs);
    std::vector<std::size_t> max_vec_size;
    std::transform(inputs.begin(),

--- a/src/targets/gpu/deconvolution.cpp
+++ b/src/targets/gpu/deconvolution.cpp
@@ -59,31 +59,30 @@ argument miopen_deconvolution::compute(context& ctx,
    auto w_desc = make_tensor(reshape_if_1d(args[1].get_shape()));
    auto y_desc = make_tensor(reshape_if_1d(output_shape));

-    float alpha = 1;
-    float beta  = 0;
-    auto status = miopenConvolutionForward(ctx.get_stream().get_miopen(),
-                                           &alpha,
-                                           x_desc.get(),
-                                           args[0].implicit(),
-                                           w_desc.get(),
-                                           args[1].implicit(),
-                                           cd.get(),
-                                           algo,
-                                           &beta,
-                                           y_desc.get(),
-                                           args[3].implicit(),
-                                           args[2].implicit(),
-                                           args[2].get_shape().bytes());
+    if(solution_id == 0)
+        MIGRAPHX_THROW("MIOpen Deconvolution: invalid solution ID");
+
+    auto status = miopenConvolutionForwardImmediate(ctx.get_stream().get_miopen(),
+                                                    w_desc.get(),
+                                                    args[1].implicit(),
+                                                    x_desc.get(),
+                                                    args[0].implicit(),
+                                                    cd.get(),
+                                                    y_desc.get(),
+                                                    args[3].implicit(),
+                                                    args[2].implicit(),
+                                                    args[2].get_shape().bytes(),
+                                                    solution_id);
+
    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("Running deconvolution failed");
+        MIGRAPHX_THROW("MIOpen Deconvolution: running convolution failed");
    return args[3];
 }

-shape miopen_deconvolution::compile(context& ctx,
-                                    const shape& output_shape,
-                                    std::vector<shape> inputs)
+shape miopen_deconvolution::find(context& ctx, const shape& output_shape, std::vector<shape> inputs)
 {
    shape workspace_shape{};
+
    auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
    auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
    auto y_desc = make_tensor(reshape_if_1d(output_shape));
@@ -119,9 +118,35 @@ shape miopen_deconvolution::compile(context& ctx,
                                                        workspace_size,
                                                        false);
    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("Find deconvolution failed");
-    handle = ctx.get_stream().get_miopen();
-    algo   = perf.fwd_algo;
+        MIGRAPHX_THROW("MIOpen Deconvolution: find convolution failed");
+    algo = perf.fwd_algo;
+
+    size_t solution_count;
+
+    status = miopenConvolutionForwardGetSolutionCount(ctx.get_stream().get_miopen(),
+                                                      w_desc.get(),
+                                                      x_desc.get(),
+                                                      cd.get(),
+                                                      y_desc.get(),
+                                                      &solution_count);
+    if(status != miopenStatusSuccess)
+        MIGRAPHX_THROW("MIOpen Deconvolution: get solution count failed");
+
+    std::vector<miopenConvSolution_t> solutions(solution_count);
+
+    status = miopenConvolutionForwardGetSolution(ctx.get_stream().get_miopen(),
+                                                 w_desc.get(),
+                                                 x_desc.get(),
+                                                 cd.get(),
+                                                 y_desc.get(),
+                                                 solution_count,
+                                                 &solution_count,
+                                                 solutions.data());
+    if(status != miopenStatusSuccess)
+        MIGRAPHX_THROW("MIOpen Deconvolution: get solution failed");
+
+    solution_id = solutions.front().solution_id;
+
    return shape{shape::int8_type, {perf.memory}};
 }

@@ -129,13 +154,29 @@ void miopen_deconvolution::finalize(context& ctx,
                                    const shape& output_shape,
                                    std::vector<shape> inputs)
 {
-    if(handle == ctx.get_stream().get_miopen())
-        return;
-    // Check that workspace hasn't changed
-    auto size = inputs.at(2).bytes();
-    auto ws   = compile(ctx, output_shape, std::move(inputs));
-    if(ws.bytes() > size)
-        MIGRAPHX_THROW("Workspace has changed during finalization.");
+    if(cd == nullptr)
+        cd = make_deconv(op);
+    if(solution_id == 0)
+    {
+        // Check that workspace hasn't changed
+        auto size = inputs.at(2).bytes();
+        auto ws   = find(ctx, output_shape, inputs);
+        if(ws.bytes() > size)
+            MIGRAPHX_THROW("MIOpen Deconvolution: workspace has changed during finalization.");
+    }
+
+    auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
+    auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
+    auto y_desc = make_tensor(reshape_if_1d(output_shape));
+
+    auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
+                                                          w_desc.get(),
+                                                          x_desc.get(),
+                                                          cd.get(),
+                                                          y_desc.get(),
+                                                          solution_id);
+    if(status != miopenStatusSuccess)
+        MIGRAPHX_THROW("MIOpen Deconvolution: compile solution failed");
 }

 } // namespace gpu

--- a/src/targets/gpu/fuse_mlir.cpp
+++ b/src/targets/gpu/fuse_mlir.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/gpu/fuse_mlir.hpp>
+#include <migraphx/gpu/mlir.hpp>
+#include <migraphx/matcher.hpp>
+#include <migraphx/pass_manager.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/register_op.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+struct module;
+
+namespace gpu {
+
+#ifdef MIGRAPHX_MLIR
+// Operator that carries a convolution together with exactly one submodule; it
+// is what find_conv_pointwise substitutes for a pointwise instruction that
+// consumes a convolution.
+struct mlir_conv
+{
+    // Wrapped operator, used to compute the output shape.
+    operation op = make_op("convolution");
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.op, "op"));
+    }
+
+    std::string name() const { return "gpu::mlir_conv"; }
+
+    // Requires standard-layout inputs, exactly one submodule and at least two
+    // inputs; the result is the wrapped operator's shape for the trailing two
+    // inputs.
+    shape compute_shape(std::vector<shape> inputs, const std::vector<module_ref>& mods) const
+    {
+        check_shapes{inputs, *this}.standard();
+        if(mods.size() != 1)
+            MIGRAPHX_THROW("should have one submodule.");
+        if(inputs.size() < 2)
+            MIGRAPHX_THROW("should have at least two inputs.");
+        const auto& second_last = inputs[inputs.size() - 2];
+        const auto& last        = inputs.back();
+        return op.compute_shape({second_last, last});
+    }
+};
+MIGRAPHX_REGISTER_OP(mlir_conv);
+
+namespace {
+// Fuses a convolution with the pointwise instruction that consumes it into one
+// mlir_conv instruction backed by a freshly created "mlir_*" module.
+struct find_conv_pointwise
+{
+    // Find a convolution followed by a pointwise operation.
+    // "convolution" is bound to the convolution itself and "x" to the matched
+    // pointwise input (reached through match::skip over "contiguous").
+    auto matcher() const
+    {
+        auto convolution =
+            match::skip(match::name("contiguous"))(match::name("convolution").bind("convolution"));
+        return match::name("pointwise")(match::any_of[match::inputs()](convolution.bind("x")));
+    }
+
+    void apply(module_pass_manager& mpm, const match::matcher_result& r) const
+    {
+        auto ins      = r.result;
+        auto conv_ins = r.instructions["convolution"];
+        auto x_ins    = r.instructions["x"]; // input after contiguous
+        auto* pm      = ins->module_inputs().front();
+        auto names    = pm->get_parameter_names();
+        // Whitelist pointwise operators
+        if(std::any_of(pm->begin(), pm->end(), [](const auto& i) {
+               return not contains({"@literal", "@param", "@return", "convolution", "add", "relu"},
+                                   i.name());
+           }))
+            return;
+        // Only fuse with fp32 for now
+        if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [&](auto i) {
+               return i->get_shape().type() != shape::type_t::float_type;
+           }))
+            return;
+        // The sorted parameter names are paired position-by-position with
+        // ins->inputs() below.
+        // NOTE(review): assumes the sorted names line up with the input order - confirm.
+        std::sort(names.begin(), names.end());
+        module_ref mm = mpm.create_module("mlir_" + pm->name());
+        mm->set_bypass();
+        std::unordered_map<instruction_ref, instruction_ref> param_map;
+        // The convolution operands become two extra parameters named
+        // "x<N>" and "x<N+1>", where N is the number of pointwise parameters.
+        auto x    = mm->add_parameter("x" + std::to_string(names.size()),
+                                   conv_ins->inputs().at(0)->get_shape());
+        auto w    = mm->add_parameter("x" + std::to_string(names.size() + 1),
+                                   conv_ins->inputs().at(1)->get_shape());
+        auto conv = mm->add_instruction(conv_ins->get_operator(), {x, w});
+        // Map each pointwise parameter either to the new in-module convolution
+        // (for the input bound to "x") or to a same-named parameter of the new module.
+        std::transform(names.begin(),
+                       names.end(),
+                       ins->inputs().begin(),
+                       std::inserter(param_map, param_map.end()),
+                       [&](auto name, auto input) {
+                           if(input == x_ins)
+                               return std::make_pair(pm->get_parameter(name), conv);
+                           return std::make_pair(pm->get_parameter(name),
+                                                 mm->add_parameter(name, input->get_shape()));
+                       });
+        // Copy the pointwise module's instructions into the new module and
+        // return what insert_instructions yields.
+        mm->add_return(mm->insert_instructions(mm->end(), pm, param_map));
+
+        // Inputs of the fused instruction: the pointwise inputs other than the
+        // convolution, followed by the convolution's own inputs.
+        // NOTE(review): the filter compares against conv_ins while the mapping
+        // above compares against x_ins; these differ when a contiguous sits
+        // between them - confirm that case is handled as intended.
+        std::vector<instruction_ref> inputs;
+        std::copy_if(ins->inputs().begin(),
+                     ins->inputs().end(),
+                     std::back_inserter(inputs),
+                     [&](auto input) { return input != conv_ins; });
+        inputs.insert(inputs.end(), conv_ins->inputs().begin(), conv_ins->inputs().end());
+        mpm.get_module().replace_instruction(
+            ins, mlir_conv{conv_ins->get_operator()}, inputs, {mm});
+    }
+};
+} // namespace
+
+#endif
+
+// Runs the convolution + pointwise fusion when built with MIGRAPHX_MLIR;
+// without it the pass leaves the module untouched.
+void fuse_mlir::apply(module_pass_manager& mpm) const
+{
+#ifndef MIGRAPHX_MLIR
+    (void)mpm;
+#else
+    match::find_matches(mpm, find_conv_pointwise{});
+#endif
+}
+
+} // namespace gpu
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/fuse_ops.cpp
+++ b/src/targets/gpu/fuse_ops.cpp
@@ -343,6 +343,7 @@ void move_standard_front(std::vector<instruction_ref>& args)

 auto gpu_name(const std::string& s) { return match::name("gpu::" + s); }

+namespace {
 struct find_layernorm
 {
    auto matcher() const { return match::layernorm(&gpu_name); }
@@ -843,15 +844,6 @@ inline auto precompile_name(std::string s) // NOLINT
    });
 }

-template <class... Ms>
-auto conv_bias_pointwise(Ms... ms)
-{
-    return precompile_name("pointwise")(
-        match::either_arg(0, 1)(bias_shape(match::used_once()).bind("bias"),
-                                fusable_conv(match::used_once()).bind("conv")),
-        ms...);
-}
-
 struct find_conv_bias
 {
    context* ctx = nullptr;
@@ -1145,6 +1137,7 @@ struct find_commutative_broadcast
        m.replace_instruction(ins, ins->get_operator(), args);
    }
 };
+} // namespace

 struct find_contiguous
 {

--- a/src/targets/gpu/include/migraphx/gpu/code_object_op.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/code_object_op.hpp
@@ -38,12 +38,13 @@ struct context;

 struct code_object_op
 {
-    value::binary code_object;
-    std::string symbol_name;
-    std::size_t global;
-    std::size_t local;
-    std::vector<shape> expected_inputs;
-    shape output;
+    value::binary code_object{};
+    std::string symbol_name = "";
+    std::size_t global      = 0;
+    std::size_t local       = 0;
+    std::vector<shape> expected_inputs{};
+    shape output{};
+    std::int64_t output_arg = -1;
    kernel k{};

    template <class Self, class F>
@@ -66,9 +67,13 @@ struct code_object_op
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
    void finalize(context&, const shape&, const std::vector<shape>&);
+    // Resolves `output_arg` to a concrete argument index for a call with `n` arguments.
+    // A negative `output_arg` counts back from the end (the default of -1 selects the
+    // last argument); a non-negative value is returned unchanged.
+    std::int64_t get_output_arg(std::size_t n) const
+    {
+        // Convert `n` to signed first so the sum is not computed through unsigned
+        // wraparound followed by a narrowing conversion back to int64_t.
+        return output_arg < 0 ? static_cast<std::int64_t>(n) + output_arg : output_arg;
+    }
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
-        return shapes.size() - 1;
+        return get_output_arg(shapes.size());
    }

    friend std::ostream& operator<<(std::ostream& os, const code_object_op& op)

--- a/src/targets/gpu/include/migraphx/gpu/deconvolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/deconvolution.hpp
@@ -39,20 +39,20 @@ struct miopen_deconvolution
    op::deconvolution op;
    shared<convolution_descriptor> cd;
    miopenConvFwdAlgorithm_t algo{};
-    miopenHandle_t handle = nullptr;
+    uint64_t solution_id = 0;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
-        // TODO: Add algo
-        return op::convolution::reflect(self.op, f);
+        return pack_join(op::deconvolution::reflect(self.op, f),
+                         pack(f(self.solution_id, "solution_id")));
    }

    std::string name() const { return "gpu::deconv"; }
    shape compute_shape(const std::vector<shape>& inputs) const;
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    shape compile(context& ctx, const shape& output_shape, std::vector<shape> inputs);
+    shape find(context& ctx, const shape& output_shape, std::vector<shape> inputs);
    void finalize(context& ctx, const shape& output_shape, std::vector<shape> inputs);
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {

--- a/src/targets/gpu/include/migraphx/gpu/mlir_conv.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/mlir_conv.hpp
@@ -21,8 +21,8 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_MIOPEN_MLIR_CONV_HPP
-#define MIGRAPHX_GUARD_RTGLIB_MIOPEN_MLIR_CONV_HPP
+#ifndef MIGRAPHX_GUARD_GPU_FUSE_MLIR_HPP
+#define MIGRAPHX_GUARD_GPU_FUSE_MLIR_HPP

 #include <migraphx/config.hpp>
 #include <migraphx/gpu/context.hpp>
@@ -30,18 +30,19 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

-struct module;
+struct module_pass_manager;

 namespace gpu {
-struct mlir_conv
+
+struct fuse_mlir
 {
-    context* ctx;
-    std::string name() const { return "mlir::convolution"; }
-    void apply(module& m) const;
+    context* ctx = nullptr;
+    std::string name() const { return "gpu::fuse_mlir"; }
+    void apply(module_pass_manager& mpm) const;
 };

 } // namespace gpu
+
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
-
-#endif
+#endif // MIGRAPHX_GUARD_GPU_FUSE_MLIR_HPP
--- a/src/targets/gpu/include/migraphx/gpu/mlir.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/mlir.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_MLIR_HPP
+#define MIGRAPHX_GUARD_RTGLIB_GPU_MLIR_HPP
+
+#include <string>
+#include <vector>
+#include <migraphx/config.hpp>
+#include <migraphx/gpu/code_object_op.hpp>
+#include <migraphx/instruction_ref.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+struct module;
+namespace gpu {
+
+// Returns a string produced from module `m`.
+// NOTE(review): presumably the textual MLIR generated for `m` -- confirm against the definition.
+std::string dump_mlir(const module& m);
+
+// Compiles module `m` and returns the result as a code_object_op.
+// NOTE(review): how `ctx` influences compilation is not visible from this header.
+code_object_op compile_mlir(const context& ctx, const module& m);
+
+// Inserts an instruction for the code object `co` into module `m`, positioned relative to `ins`
+// and fed by `inputs`, and returns a reference to the new instruction.
+// NOTE(review): exact insertion point and any extra arguments added are defined in the
+// implementation -- confirm there.
+instruction_ref insert_mlir(module& m,
+                            instruction_ref ins,
+                            code_object_op co,
+                            const std::vector<instruction_ref>& inputs);
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/quant_convolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/quant_convolution.hpp
@@ -41,7 +41,7 @@ struct miopen_quant_convolution
    bool int8_x4_format = false;
    shared<convolution_descriptor> cd;
    miopenConvFwdAlgorithm_t algo{};
-    miopenHandle_t handle = nullptr;
+    uint64_t solution_id = 0;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
@@ -55,7 +55,7 @@ struct miopen_quant_convolution
    shape compute_shape(const std::vector<shape>& inputs) const;
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    shape compile(context& ctx, const shape& output_shape, std::vector<shape> inputs);
+    shape find(context& ctx, const shape& output_shape, std::vector<shape> inputs);
    void finalize(context& ctx, const shape& output_shape, std::vector<shape> inputs);
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {

--- a/src/targets/gpu/jit/mlir.cpp
+++ b/src/targets/gpu/jit/mlir.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/gpu/compiler.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/gpu/context.hpp>
+
+#include <migraphx/gpu/mlir.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+// JIT compiler registered for "gpu::mlir_conv" instructions. The instruction's first
+// module input is compiled with compile_mlir and the instruction is then replaced by
+// the instruction that insert_mlir creates for the resulting code object.
+struct mlir_compiler : compiler<mlir_compiler>
+{
+    std::vector<std::string> names() const { return {"gpu::mlir_conv"}; }
+
+    // Compiling from shapes alone is not implemented here; an empty operation is returned.
+    operation compile_op(context&, const std::vector<shape>&, const value&) const
+    {
+        return {};
+    }
+
+    // Compiles the submodule attached to `ins` and returns the replacement callback.
+    compiler_replace compile(context& ctx, instruction_ref ins, const operation&) const
+    {
+        auto* submodule = ins->module_inputs().front();
+        // The instruction is expected to carry exactly one more input than the
+        // submodule has parameters.
+        assert(submodule->get_parameter_names().size() == ins->inputs().size() - 1);
+        auto code_object = compile_mlir(ctx, *submodule);
+        return insert(std::move(code_object));
+    }
+
+    // Builds a callback that swaps `ins` for the instruction created from `co`.
+    compiler_replace insert(code_object_op co) const
+    {
+        return [co = std::move(co)](module& m, instruction_ref ins) {
+            m.replace_instruction(ins, insert_mlir(m, ins, co, ins->inputs()));
+        };
+    }
+};
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/jit/softmax.cpp
+++ b/src/targets/gpu/jit/softmax.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/gpu/compiler.hpp>
+#include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/compile_hip_code_object.hpp>
+#include <migraphx/gpu/compile_hip.hpp>
+#include <migraphx/gpu/compile_gen.hpp>
+
+#include <migraphx/cpp_generator.hpp>
+#include <migraphx/ranges.hpp>
+#include <migraphx/reduce_dims.hpp>
+#include <migraphx/stringutils.hpp>
+#include <migraphx/dead_code_elimination.hpp>
+#include <migraphx/eliminate_common_subexpression.hpp>
+#include <migraphx/module.hpp>
+#include <migraphx/pass_manager.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+using namespace migraphx::gpu::gen; // NOLINT
+
+// Source template for the softmax kernel. The ${transformers} and ${axis} placeholders
+// are substituted via interpolate_string in softmax_compiler::compile_op before the
+// source is handed to compile_hip_code_object.
+static const char* const softmax_kernel = R"__migraphx__(
+#include <migraphx/kernels/index.hpp>
+#include <migraphx/kernels/softmax.hpp>
+#include <migraphx/kernels/vectorize.hpp>
+#include <args.hpp>
+
+namespace migraphx {
+
+extern "C" {
+__global__ void softmax_kernel(void* input_p, void* output_p) 
+{
+    transform_args(make_tensors(), ${transformers})(input_p, output_p)([](auto input, auto output) {
+        softmax<${axis}>(input, output);
+    });
+}
+    
+}
+
+} // namespace migraphx
+
+)__migraphx__";
+
+// JIT compiler for the "softmax" operator: fills in the softmax_kernel source template
+// and compiles it into a code object.
+struct softmax_compiler : compiler<softmax_compiler>
+{
+    std::vector<std::string> names() const { return {"softmax"}; }
+
+    // Builds the kernel for the given input shapes; `v` supplies the "axis" attribute
+    // and is also forwarded to set_launch_params.
+    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
+    {
+        // TODO: Use reduce_dims
+        const auto axis = v.at("axis").to<int64_t>();
+        auto fast_axis  = find_fast_axis({inputs.front()});
+
+        // Vectorization is only requested when the softmax axis is the fast axis.
+        vectorize vec{};
+        if(fast_axis == axis)
+            vec = vectorize::elements(fast_axis, inputs);
+
+        const auto axis_len = inputs[0].lens()[axis];
+        // Elements handled per reduction after vectorization, and number of reductions.
+        auto reduce_elements = axis_len / vec.size;
+        auto num_reductions  = inputs.back().elements() / axis_len;
+        auto block_size      = compute_block_size(reduce_elements, 256);
+
+        hip_compile_options options;
+        options.set_launch_params(
+            v, compute_global_for(ctx, num_reductions * block_size, 256), block_size);
+        options.kernel_name = "softmax_kernel";
+        options.inputs      = inputs;
+        options.output      = inputs.back();
+
+        auto kernel_src = interpolate_string(
+            softmax_kernel,
+            {{"transformers", make_transformer_args(vec)}, {"axis", to_string(axis)}});
+
+        return compile_hip_code_object(kernel_src, options);
+    }
+
+    // Compiles the kernel from the shapes of the instruction's inputs and the operator's
+    // serialized value, and returns the replacement for `ins`.
+    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
+    {
+        auto input_shapes = to_shapes(ins->inputs());
+        return replace(compile_op(ctx, input_shapes, op.to_value()));
+    }
+};
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
@@ -27,6 +27,7 @@
 #include <migraphx/kernels/types.hpp>
 #include <migraphx/kernels/type_traits.hpp>
 #include <migraphx/kernels/integral_constant.hpp>
+#include <migraphx/kernels/functional.hpp>
 #include <migraphx/kernels/debug.hpp>

 namespace migraphx {
@@ -213,6 +214,13 @@ constexpr auto transform(integral_const_array<T, Xs...>, F f)
    return integral_const_array<T, f(Xs)...>{};
 }

+// Variant of transform for integral_const_array where `f` receives two arguments per
+// element: the element value X and the corresponding entry of the pack `is...` supplied
+// by sequence_c<sizeof...(Xs)>. The results form a new integral_const_array<T, ...>.
+// NOTE(review): `is...` is presumably the element positions 0..N-1 as compile-time
+// constants -- confirm against sequence_c in functional.hpp.
+template <class T, T... Xs, class F>
+constexpr auto transform_i(integral_const_array<T, Xs...>, F f)
+{
+    return sequence_c<sizeof...(Xs)>(
+        [=](auto... is) { return integral_const_array<T, f(Xs, is)...>{}; });
+}
+
 template <class T, T... Xs, class U, U... Ys, class F>
 constexpr auto transform(integral_const_array<T, Xs...>, integral_const_array<U, Ys...>, F f)
 {