MIGraphX commit 95b5efeb

Authored Oct 28, 2022 by Khalique Ahmed

Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX into gemm_to_conv

Parents: 5e22d800, 25a0e433
Showing 20 of 54 changed files, with 245 additions and 172 deletions (+245 −172).
src/include/migraphx/op/unary.hpp                              +5    −4
src/include/migraphx/operation.hpp                             +37   −15
src/load_save.cpp                                              +0    −1
src/module.cpp                                                 +0    −1
src/onnx/conv.cpp                                              +1    −1
src/rewrite_rnn.cpp                                            +0    −3
src/simplify_algebra.cpp                                       +1    −1
src/targets/cpu/lowering.cpp                                   +0    −49
src/targets/cpu/target.cpp                                     +0    −1
src/targets/gpu/CMakeLists.txt                                 +0    −4
src/targets/gpu/compile_hip_code_object.cpp                    +2    −3
src/targets/gpu/hip.cpp                                        +2    −2
src/targets/gpu/jit/pad.cpp                                    +100  −0
src/targets/gpu/jit/scatternd.cpp                              +4    −3
src/targets/gpu/kernels/include/migraphx/kernels/pad.hpp       +63   −0
src/targets/gpu/kernels/include/migraphx/kernels/ranges.hpp    +22   −13
src/targets/gpu/leaky_relu.cpp                                 +0    −65
src/targets/gpu/lowering.cpp                                   +0    −3
src/targets/gpu/mlir.cpp                                       +6    −1
src/targets/gpu/target.cpp                                     +2    −2
src/include/migraphx/op/unary.hpp

@@ -30,6 +30,7 @@
 #include <migraphx/argument.hpp>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/value.hpp>
+#include <migraphx/dyn_output.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -62,9 +63,9 @@ struct unary : op_name<Derived>
     value attributes() const { return base_attributes(); }

     shape compute_shape(std::vector<shape> inputs) const
     {
-        check_shapes{inputs, static_cast<const Derived&>(*this)}.has(1);
+        check_shapes{inputs, static_cast<const Derived&>(*this), true}.has(1);
         auto s = inputs.at(0);
-        if(s.scalar())
+        if(s.dynamic() or s.scalar())
         {
             return s;
         }
@@ -78,9 +79,9 @@ struct unary : op_name<Derived>
         }
     }

-    argument compute(const shape& output_shape, std::vector<argument> args) const
+    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
     {
-        argument result{output_shape};
+        argument result{dyn_out.computed_shape};
         result.visit([&](auto output) {
             args[0].visit([&](auto input) {
                 std::transform(input.begin(),
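Note: a concrete unary op only supplies the elementwise function; the dynamic-shape handling above lives entirely in the base class, so every derived op picks it up for free. A minimal sketch, assuming `unary`'s `apply()` customization point as used by MIGraphX's other elementwise ops (`abs_example` is illustrative, not part of this commit):

#include <migraphx/op/unary.hpp>

namespace migraphx {

// With the change above, this op also accepts dynamic input shapes:
// compute_shape() now passes a dynamic shape through (the `true` flag
// presumably opts in to dynamic shapes in check_shapes), and compute()
// reads the run-time shape from dyn_out.computed_shape.
struct abs_example : op::unary<abs_example>
{
    auto apply() const
    {
        return [](auto x) { return x < 0 ? -x : x; };
    }
};

} // namespace migraphx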
src/include/migraphx/operation.hpp

@@ -32,6 +32,8 @@
 #include <utility>
 #include <unordered_map>
 #include <migraphx/reflect.hpp>
+#include <migraphx/dyn_output.hpp>
+#include <migraphx/functional.hpp>
 #include <migraphx/streamutils.hpp>
 #include <migraphx/normalize_attributes.hpp>
 #include <migraphx/argument.hpp>
@@ -199,9 +201,12 @@ auto compute_op(rank<1>,
                 context& ctx,
                 const shape& output_shape,
                 const std::vector<argument>& input)
-    -> decltype(x.compute(auto_any_cast(ctx), output_shape, input))
+    -> decltype(x.compute(
+        auto_any_cast(ctx), make_compute_output_shape(pack(x, output_shape, input)), input))
 {
-    return x.compute(auto_any_cast(ctx), output_shape, input);
+    return x.compute(
+        auto_any_cast(ctx), make_compute_output_shape(pack(x, output_shape, input)), input);
 }

 template <class T>
@@ -220,9 +225,9 @@ compute_op(const T& x, context& ctx, const shape& output_shape, const std::vecto
 template <class T>
 auto compute_op(rank<1>, const T& x, const shape& output_shape, const std::vector<argument>& input)
-    -> decltype(x.compute(output_shape, input))
+    -> decltype(x.compute(make_compute_output_shape(pack(x, output_shape, input)), input))
 {
-    return x.compute(output_shape, input);
+    return x.compute(make_compute_output_shape(pack(x, output_shape, input)), input);
 }

 template <class T>
@@ -244,9 +249,11 @@ auto compute_op(rank<1>,
                 const shape& output,
                 const std::vector<argument>& inputs,
                 const std::vector<module_ref>& module_args,
                 F f)
-    -> decltype(x.compute(output, inputs, module_args, f))
+    -> decltype(x.compute(
+        make_compute_output_shape(pack(x, output, inputs)), inputs, module_args, f))
 {
-    return x.compute(output, inputs, module_args, f);
+    return x.compute(make_compute_output_shape(pack(x, output, inputs)), inputs, module_args, f);
 }

 template <class T, class F>
@@ -278,9 +285,17 @@ auto compute_op(rank<4>,
                 const shape& output,
                 const std::vector<argument>& inputs,
                 const std::vector<module_ref>& module_args,
                 F f)
-    -> decltype(x.compute(auto_any_cast(ctx), output, inputs, module_args, f))
+    -> decltype(x.compute(auto_any_cast(ctx),
+                          make_compute_output_shape(pack(x, output, inputs)),
+                          inputs,
+                          module_args,
+                          f))
 {
-    return x.compute(auto_any_cast(ctx), output, inputs, module_args, f);
+    return x.compute(auto_any_cast(ctx),
+                     make_compute_output_shape(pack(x, output, inputs)),
+                     inputs,
+                     module_args,
+                     f);
 }

 template <class T, class F>
@@ -290,9 +305,11 @@ auto compute_op(rank<3>,
                 const shape& output,
                 const std::vector<argument>& inputs,
                 const std::vector<module_ref>& module_args,
                 F f)
-    -> decltype(x.compute(output, inputs, module_args, f))
+    -> decltype(x.compute(
+        make_compute_output_shape(pack(x, output, inputs)), inputs, module_args, f))
 {
-    return x.compute(output, inputs, module_args, f);
+    return x.compute(make_compute_output_shape(pack(x, output, inputs)), inputs, module_args, f);
 }

 template <class T, class F>
@@ -302,9 +319,10 @@ auto compute_op(rank<2>,
                 const shape& output,
                 const std::vector<argument>& inputs,
                 const std::vector<module_ref>&,
                 F)
-    -> decltype(x.compute(output, inputs))
+    -> decltype(x.compute(make_compute_output_shape(pack(x, output, inputs)), inputs))
 {
-    return x.compute(output, inputs);
+    return x.compute(make_compute_output_shape(pack(x, output, inputs)), inputs);
 }

 template <class T, class F>
@@ -314,9 +332,12 @@ auto compute_op(rank<1>,
                 const shape& output,
                 const std::vector<argument>& inputs,
                 const std::vector<module_ref>&,
                 F)
-    -> decltype(x.compute(auto_any_cast(ctx), output, inputs))
+    -> decltype(x.compute(auto_any_cast(ctx),
+                          make_compute_output_shape(pack(x, output, inputs)),
+                          inputs))
 {
-    return x.compute(auto_any_cast(ctx), output, inputs);
+    return x.compute(
+        auto_any_cast(ctx), make_compute_output_shape(pack(x, output, inputs)), inputs);
 }

 template <class T, class F>
@@ -348,7 +369,8 @@ auto is_context_free_op(rank<1>,
                         const T& x,
                         const shape& output_shape,
                         const std::vector<argument>& input)
-    -> decltype(x.compute(output_shape, input), std::true_type{});
+    -> decltype(x.compute(make_compute_output_shape(pack(x, output_shape, input)), input),
+                std::true_type{});

 template <class T>
 auto is_context_free_op(rank<0>, const T&, const shape&, const std::vector<argument>&)
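Note: these overloads use MIGraphX's rank<N> tag-dispatch idiom. rank<N> derives from rank<N - 1>, so passing rank<4>{} prefers the highest-ranked overload whose decltype SFINAE check succeeds, falling back overload by overload down to rank<0>. A self-contained sketch of the idiom with simplified definitions (not MIGraphX's exact ones):

#include <iostream>

// rank<N> inherits from rank<N-1>, so rank<1>{} converts to rank<0>
// only when the more specific overload is rejected by SFINAE.
template <int N>
struct rank : rank<N - 1>
{
};
template <>
struct rank<0>
{
};

// Chosen when T has a print() member; the decltype in the trailing
// return type removes this overload from consideration otherwise.
template <class T>
auto print_op(rank<1>, const T& x) -> decltype(x.print(), void())
{
    x.print();
}

// Fallback overload.
template <class T>
void print_op(rank<0>, const T&)
{
    std::cout << "no print() available\n";
}

template <class T>
void print_op(const T& x)
{
    print_op(rank<1>{}, x); // start dispatch at the highest rank
}

struct widget
{
    void print() const { std::cout << "widget\n"; }
};

int main()
{
    print_op(widget{}); // prints "widget"
    print_op(42);       // prints "no print() available"
}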
src/load_save.cpp

@@ -25,7 +25,6 @@
 #include <migraphx/file_buffer.hpp>
 #include <migraphx/json.hpp>
 #include <migraphx/msgpack.hpp>
-#include <migraphx/file_buffer.hpp>
 #include <fstream>

 namespace migraphx {
src/module.cpp

@@ -34,7 +34,6 @@
 #include <migraphx/pass_manager.hpp>
 #include <migraphx/make_op.hpp>
 #include <migraphx/register_target.hpp>
-#include <migraphx/make_op.hpp>
 #include <migraphx/json.hpp>
 #include <iostream>
 #include <sstream>
src/onnx/conv.cpp

@@ -30,7 +30,7 @@ namespace onnx {
 void recalc_conv_attributes(value& v, size_t kdims)
 {
-    if(not(v["padding"].size() == kdims or v["padding"].size() == kdims * 2))
+    if(v["padding"].size() != kdims and v["padding"].size() != kdims * 2)
     {
         v["padding"].resize(kdims);
         std::fill_n(v["padding"].begin(), kdims, 0);
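Note: the rewritten condition is the De Morgan dual of the old one, so behavior is unchanged; the same pattern appears in src/simplify_algebra.cpp below. A throwaway compile-time check, illustrative values only:

#include <cstddef>

// For any padding size s and kernel-dim count k:
//   not(s == k or s == 2k)  <=>  (s != k and s != 2k)
constexpr bool equivalent(std::size_t s, std::size_t k)
{
    return (!(s == k || s == k * 2)) == (s != k && s != k * 2);
}
static_assert(equivalent(3, 3) && equivalent(6, 3) && equivalent(5, 3),
              "De Morgan rewrite is equivalent");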
src/rewrite_rnn.cpp

@@ -46,9 +46,6 @@
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/dfor.hpp>
 #include <migraphx/ranges.hpp>
-#include <migraphx/op/common.hpp>
-#include <migraphx/op/rnn_var_sl_last_output.hpp>
-#include <migraphx/op/rnn_variable_seq_lens.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
src/simplify_algebra.cpp

@@ -827,7 +827,7 @@ MIGRAPHX_PRED_MATCHER(horiz_conv_dot, instruction_ref ins)
     };
     auto dots  = std::count_if(ins->outputs().begin(), ins->outputs().end(), pred("dot"));
     auto convs = std::count_if(ins->outputs().begin(), ins->outputs().end(), pred("convolution"));
-    return not(dots < 2 and convs < 2);
+    return (dots >= 2 or convs >= 2);
 }

 struct find_conv_dot_horiz_fusion
src/targets/cpu/lowering.cpp

@@ -215,55 +215,6 @@ struct cpu_pad
 };
 MIGRAPHX_REGISTER_OP(cpu_pad)

-struct leaky_relu_op
-{
-    op::leaky_relu op;
-    std::string name() const { return "cpu::leaky_relu"; }
-    auto fcn() const
-    {
-        auto a = op.alpha;
-        return [a](auto x) { return x > 0 ? x : x * a; };
-    }
-};
-
-template <typename Op>
-struct cpu_unary2 : auto_register_op<cpu_unary2<Op>>
-{
-    cpu_unary2() = default;
-    template <class T>
-    cpu_unary2(T pop) : op(Op{std::move(pop)})
-    {
-    }
-    Op op;
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op.op, f);
-    }
-    std::string name() const { return op.name(); }
-    shape compute_shape(const std::vector<shape>& inputs) const
-    {
-        check_shapes{inputs, *this}.has(1);
-        const auto& s = inputs.at(0);
-        return {s.type(), s.lens()};
-    }
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
-    {
-        argument result{output_shape};
-        visit_all(result, args[0])([&](auto output, auto input) {
-            assert(input.get_shape().standard());
-            std::transform(input.begin(), input.end(), output.begin(), op.fcn());
-        });
-        return result;
-    }
-};
-template struct cpu_unary2<leaky_relu_op>;
-
 struct cpu_rnn_var_sl_last_output
 {
     op::rnn_var_sl_last_output op;
src/targets/cpu/target.cpp

@@ -41,7 +41,6 @@
 #include <migraphx/rewrite_quantization.hpp>
 #include <migraphx/rewrite_rnn.hpp>
 #include <migraphx/schedule.hpp>
-#include <migraphx/memory_coloring.hpp>
 #include <migraphx/simplify_algebra.hpp>
 #include <migraphx/simplify_qdq.hpp>
 #include <migraphx/simplify_reshapes.hpp>
src/targets/gpu/CMakeLists.txt

@@ -85,7 +85,6 @@ add_library(migraphx_gpu
     compile_hip_code_object.cpp
     compiler.cpp
     device_name.cpp
-    elu.cpp
     fuse_mlir.cpp
     fuse_ops.cpp
     gather.cpp
@@ -98,7 +97,6 @@ add_library(migraphx_gpu
     logsoftmax.cpp
     loop.cpp
    lrn.cpp
-    leaky_relu.cpp
     mlir.cpp
     multinomial.cpp
     nonzero.cpp
@@ -143,9 +141,7 @@ register_migraphx_gpu_ops(hip_
 register_migraphx_gpu_ops(miopen_
     abs
     contiguous
-    elu
     int8_conv_pack
-    leaky_relu
     lrn
     pooling
 )
src/targets/gpu/compile_hip_code_object.cpp

@@ -145,8 +145,7 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over)
 std::size_t compute_block_size(std::size_t n, std::size_t max_block_size)
 {
     const std::size_t min_block_size = 64;
-    auto block_size = (((n - 1) / min_block_size + 1)) * min_block_size;
+    const std::size_t base_block_size = 32;
+    auto block_size = (((n - 1) / base_block_size + 1)) * base_block_size;
     return std::min(std::max(min_block_size, block_size), max_block_size);
 }
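Note: the expression ((n - 1) / b + 1) * b rounds n up to the next multiple of b. A standalone restatement of the block-size logic with the constants from this hunk (names local to this sketch, not MIGraphX code):

#include <algorithm>
#include <cstddef>

// Round n up to a multiple of 32, then clamp into [64, max_block_size].
// e.g. n = 100 -> ((99 / 32) + 1) * 32 = 128
//      n = 20  -> 32, clamped up to the minimum of 64
std::size_t block_size_sketch(std::size_t n, std::size_t max_block_size)
{
    const std::size_t min_block_size  = 64;
    const std::size_t base_block_size = 32;
    auto block_size = ((n - 1) / base_block_size + 1) * base_block_size;
    return std::min(std::max(min_block_size, block_size), max_block_size);
}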
src/targets/gpu/hip.cpp

@@ -183,8 +183,8 @@ argument register_on_gpu(const argument& arg)
 {
     auto arg_shared = arg.share();
     auto p          = register_on_gpu(arg_shared.data(), arg_shared.get_shape().bytes());
-    return {arg_shared.get_shape(),
-            [p, a = std::move(arg_shared)]() mutable { return get_device_ptr(p.get()); }};
+    auto s          = arg_shared.get_shape();
+    return {s, [p, a = std::move(arg_shared)]() mutable { return get_device_ptr(p.get()); }};
 }

 argument to_gpu(const argument& arg, bool host)
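Note on the register_on_gpu change: hoisting get_shape() into a local before the lambda capture moves arg_shared makes the read-before-move ordering explicit and keeps use-after-move linters quiet. A braced-init-list does evaluate left to right, but the old pattern would break silently if the return were ever changed to a parenthesized constructor call, where argument evaluation order is unspecified. A small illustration with hypothetical types:

#include <cstddef>
#include <string>
#include <utility>

struct packet
{
    std::size_t size;
    std::string payload;
};

packet make_packet(std::string s)
{
    // Fragile form: `return {s.size(), std::move(s)};` is well-defined only
    // because braced-init-list elements evaluate left to right; rewritten as
    // a parenthesized call, the order of s.size() and the move would be
    // unspecified.
    auto n = s.size(); // read everything needed before the move, as hip.cpp now does
    return {n, std::move(s)};
}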
src/targets/gpu/elu.cpp → src/targets/gpu/jit/pad.cpp

@@ -21,44 +21,80 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#include <migraphx/gpu/elu.hpp>
+#include <migraphx/gpu/compiler.hpp>
 #include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/compile_hip_code_object.hpp>
+#include <migraphx/gpu/compile_hip.hpp>
+#include <migraphx/gpu/compile_gen.hpp>
+#include <migraphx/reduce_dims.hpp>
+#include <migraphx/float_equal.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

-shape miopen_elu::compute_shape(const std::vector<shape>& inputs) const
-{
-    check_shapes{inputs, *this}.has(2).not_broadcasted();
-    return inputs.at(1);
-}
-
-argument miopen_elu::compute(context& ctx,
-                             const shape& output_shape,
-                             const std::vector<argument>& args) const
-{
-    float alpha = 1;
-    float beta  = 0;
-    auto x_desc = make_tensor(args[0].get_shape());
-    auto y_desc = make_tensor(output_shape);
-    miopenActivationForward(ctx.get_stream().get_miopen(),
-                            ad.get(),
-                            &alpha,
-                            x_desc.get(),
-                            args[0].implicit(),
-                            &beta,
-                            y_desc.get(),
-                            args[1].implicit());
-    return args[1];
-}
-
-void miopen_elu::finalize(context&, const shape&, const std::vector<shape>&)
-{
-    ad = make_elu(op.alpha);
-}
+using namespace migraphx::gpu::gen; // NOLINT
+
+static const char* const pointwise_kernel = R"__migraphx__(
+#include <migraphx/kernels/pad.hpp>
+#include <migraphx/kernels/index.hpp>
+#include <migraphx/kernels/ops.hpp>
+#include <args.hpp>
+
+namespace migraphx {
+
+extern "C" {
+__global__ void pad_kernel(void* input_p, void* output_p)
+{
+    auto offsets = index_ints<${offsets}>{};
+    auto idx     = make_index();
+    make_tensors()(input_p, output_p)([&](auto input, auto output) {
+        pad(idx, offsets, input, output, ${pad_val});
+    });
+}
+}
+
+} // namespace migraphx
+
+)__migraphx__";
+
+struct pad_compiler : compiler<pad_compiler>
+{
+    std::vector<std::string> names() const { return {"pad"}; }
+
+    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
+    {
+        hip_compile_options options;
+        options.inputs         = inputs;
+        options.output         = inputs.back();
+        options.virtual_inputs = reduce_dims(inputs);
+        options.kernel_name    = "pad_kernel";
+        options.set_launch_params(v, compute_global_for(ctx, inputs.at(1).elements()));
+
+        auto pad_val        = v.get("value", 0.f);
+        auto pad_val_string = to_string(pad_val);
+        if(float_equal(pad_val, std::numeric_limits<float>::lowest()))
+            pad_val_string = "lowest{}";
+        if(float_equal(pad_val, std::numeric_limits<float>::max()))
+            pad_val_string = "highest{}";
+
+        auto padding    = v.at("pads").to_vector<int64_t>();
+        auto input_lens = inputs.front().lens();
+        std::vector<size_t> offsets(input_lens.size());
+        std::copy(padding.begin(), padding.begin() + offsets.size(), offsets.begin());
+
+        auto src = interpolate_string(pointwise_kernel,
+                                      {{"pad_val", to_string(pad_val_string)},
+                                       {"offsets", to_string_range(offsets)}});
+        return compile_hip_code_object(src, options);
+    }
+
+    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
+    {
+        return replace(compile_op(ctx, to_shapes(ins->inputs()), op.to_value()));
+    }
+};

 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
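Note: the compiler stamps ${offsets} and ${pad_val} into the kernel source string before HIP compilation. interpolate_string is MIGraphX's own helper; a naive stand-in showing the substitution idea (not the real implementation):

#include <map>
#include <string>

// Replace each "${key}" in src with its value: a minimal sketch of
// source templating for JIT-compiled kernels.
std::string interpolate_sketch(std::string src, const std::map<std::string, std::string>& vars)
{
    for(const auto& [key, val] : vars)
    {
        const std::string token = "${" + key + "}";
        // Resume searching after the inserted value to avoid rescanning it.
        for(auto pos = src.find(token); pos != std::string::npos;
            pos      = src.find(token, pos + val.size()))
            src.replace(pos, token.size(), val);
    }
    return src;
}

// interpolate_sketch("index_ints<${offsets}>{}", {{"offsets", "0, 0, 1, 1"}})
// yields "index_ints<0, 0, 1, 1>{}".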
src/targets/gpu/jit/scatternd.cpp

@@ -79,8 +79,9 @@ struct scatternd_compiler : compiler<scatternd_compiler>
 {
     assert(starts_with(op.name(), "scatternd_"));
     auto reduction = op.name().substr(10);
-    return insert(compile_op(ctx,
-                             to_shapes({ins->inputs().begin() + 1, ins->inputs().end()}),
-                             {{"reduction", reduction}}));
+    return insert(compile_op(
+        ctx,
+        to_shapes(std::vector<instruction_ref>{ins->inputs().begin() + 1, ins->inputs().end()}),
+        {{"reduction", reduction}}));
 }
src/targets/gpu/include/migraphx/gpu/leaky_relu.hpp → src/targets/gpu/kernels/include/migraphx/kernels/pad.hpp

@@ -21,44 +21,43 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_LEAKY_RELU_HPP
-#define MIGRAPHX_GUARD_RTGLIB_LEAKY_RELU_HPP
+#ifndef MIGRAPHX_GUARD_KERNELS_PAD_HPP
+#define MIGRAPHX_GUARD_KERNELS_PAD_HPP

-#include <migraphx/op/leaky_relu.hpp>
-#include <migraphx/shape.hpp>
-#include <migraphx/reflect.hpp>
-#include <migraphx/gpu/miopen.hpp>
+#include <migraphx/kernels/shape.hpp>
+#include <migraphx/kernels/index.hpp>
+#include <migraphx/kernels/algorithm.hpp>
+#include <migraphx/kernels/ranges.hpp>

 namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-struct context;
-
-struct miopen_leaky_relu
-{
-    op::leaky_relu op;
-    shared<activation_descriptor> ad;
-
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op, f);
-    }
-
-    std::string name() const { return "gpu::leaky_relu"; }
-    shape compute_shape(const std::vector<shape>& inputs) const;
-    argument compute(context& ctx,
-                     const shape& output_shape,
-                     const std::vector<argument>& args) const;
-    void finalize(context&, const shape&, const std::vector<shape>&);
-    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
-    {
-        return shapes.size() - 1;
-    }
-};
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
+
+template <class Offsets, class Input, class Output, class PadVal>
+__device__ void pad(const index& idx,
+                    const Offsets& offsets,
+                    const Input& input,
+                    Output& output,
+                    const PadVal& pad_val)
+{
+    auto output_shape = output.get_shape();
+    idx.global_stride(output_shape.elements(), [&](auto i) {
+        // 1. get current multi-index for output
+        // 2. get the size of the input to determine input boundaries
+        // 3. compute the corresponding multi-index for input by accounting for offsets
+        // 4. if current multi-index is within offsets or input's new multi-index is out of
+        //    bounds, use pad value instead of input's value
+        auto multi        = output_shape.multi(i);
+        auto input_bounds = input.get_shape().lens;
+        auto input_idx    = multi - offsets;
+        auto range_multi  = range(multi.size());
+        if(any_of(range_multi.begin(), range_multi.end(), [&](auto j) {
+               return multi[j] < offsets[j] or input_idx[j] >= input_bounds[j];
+           }))
+            output[multi] = pad_val;
+        else
+            output[multi] = input[input_idx];
+    });
+}
+
 } // namespace migraphx
 #endif
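Note: a host-side model of the per-coordinate boundary test the kernel performs may help; this mirrors steps 1-4 in pad() above (plain C++, hypothetical helper name):

#include <cstddef>
#include <vector>

// For an output multi-index `multi`, decide whether it maps back into the
// input or falls in the padded border.
bool in_padding(const std::vector<std::size_t>& multi,
                const std::vector<std::size_t>& offsets,
                const std::vector<std::size_t>& input_lens)
{
    for(std::size_t j = 0; j < multi.size(); ++j)
    {
        if(multi[j] < offsets[j]) // before the leading pad
            return true;
        if(multi[j] - offsets[j] >= input_lens[j]) // past the input's extent
            return true;
    }
    return false;
}

// Example: input lens {4}, pads {2 before, 1 after} => offsets {2}, output lens {7}.
// Output indices 0, 1, and 6 are padding; indices 2..5 read input[0..3].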
test/verify/test_elu.cpp → src/targets/gpu/kernels/include/migraphx/kernels/ranges.hpp

@@ -21,20 +21,29 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
+#ifndef MIGRAPHX_GUARD_KERNELS_RANGES_HPP
+#define MIGRAPHX_GUARD_KERNELS_RANGES_HPP

-#include "verify_program.hpp"
-#include <migraphx/program.hpp>
-#include <migraphx/generate.hpp>
-#include <migraphx/make_op.hpp>
+#include <migraphx/kernels/iota_iterator.hpp>

-struct test_elu : verify_program<test_elu>
+namespace migraphx {
+
+template <class Iterator>
+struct iterator_range
 {
-    migraphx::program create_program() const
-    {
-        migraphx::program p;
-        auto* mm = p.get_main_module();
-        auto x   = mm->add_parameter(
-            "x", migraphx::shape{migraphx::shape::float_type, {4, 3, 3, 3}});
-        mm->add_instruction(migraphx::make_op("leaky_relu", {{"alpha", 1.0}}), x);
-        return p;
-    }
+    Iterator start;
+    Iterator last;
+
+    constexpr Iterator begin() const { return start; }
+
+    constexpr Iterator end() const { return last; }
 };
+
+constexpr iterator_range<iota_iterator> range(diff_int start, diff_int last)
+{
+    return {{start, {}}, {last, {}}};
+}
+
+constexpr iterator_range<iota_iterator> range(diff_int last) { return range(0, last); }
+
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_KERNELS_RANGES_HPP
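Note: the new header gives device code a lightweight integer range (used by pad.hpp's any_of test above) without materializing an index array. The same idiom in a self-contained form; iota_it here is a simplified stand-in for migraphx/kernels/iota_iterator.hpp, not its actual definition:

#include <cstdint>

// Dereferencing yields the current integer; incrementing advances it.
struct iota_it
{
    std::int64_t i;
    constexpr std::int64_t operator*() const { return i; }
    constexpr iota_it& operator++()
    {
        ++i;
        return *this;
    }
    constexpr bool operator!=(const iota_it& other) const { return i != other.i; }
};

// A begin()/end() pair over [0, n), so range-based for loops work.
struct iota_range
{
    std::int64_t n;
    constexpr iota_it begin() const { return {0}; }
    constexpr iota_it end() const { return {n}; }
};

constexpr std::int64_t sum_upto(std::int64_t n)
{
    std::int64_t total = 0;
    for(auto v : iota_range{n})
        total += v;
    return total;
}
static_assert(sum_upto(5) == 10, "0+1+2+3+4");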
src/targets/gpu/leaky_relu.cpp (deleted, 100644 → 0)

/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/leaky_relu.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/miopen.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

shape miopen_leaky_relu::compute_shape(const std::vector<shape>& inputs) const
{
    check_shapes{inputs, *this}.has(2).not_broadcasted();
    return inputs.at(1);
}

argument miopen_leaky_relu::compute(context& ctx,
                                    const shape& output_shape,
                                    const std::vector<argument>& args) const
{
    float alpha = 1;
    float beta  = 0;
    auto x_desc = make_tensor(args[0].get_shape());
    auto y_desc = make_tensor(output_shape);
    miopenActivationForward(ctx.get_stream().get_miopen(),
                            ad.get(),
                            &alpha,
                            x_desc.get(),
                            args[0].implicit(),
                            &beta,
                            y_desc.get(),
                            args[1].implicit());
    return args[1];
}

void miopen_leaky_relu::finalize(context&, const shape&, const std::vector<shape>&)
{
    ad = make_leaky_relu(op.alpha);
}

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
src/targets/gpu/lowering.cpp

@@ -95,14 +95,11 @@ struct miopen_apply
         add_extend_op("argmax");
         add_extend_op("argmin");
-        add_extend_op("elu");
         add_extend_op("gather");
-        add_extend_op("leaky_relu");
         add_extend_op("logsoftmax");
         add_extend_op("lrn");
         add_extend_op("multinomial");
         add_extend_op("nonzero");
-        add_extend_op("pad");
         add_extend_op("pooling");
         add_extend_op("prefix_scan_sum");
         add_extend_op("reverse");
...
src/targets/gpu/mlir.cpp
View file @
95b5efeb
...
@@ -196,6 +196,7 @@ struct mlir_program
...
@@ -196,6 +196,7 @@ struct mlir_program
MlirType
make_tensor
(
const
shape
&
s
)
const
MlirType
make_tensor
(
const
shape
&
s
)
const
{
{
assert
(
s
.
standard
());
std
::
vector
<
int64_t
>
lens
(
s
.
lens
().
begin
(),
s
.
lens
().
end
());
std
::
vector
<
int64_t
>
lens
(
s
.
lens
().
begin
(),
s
.
lens
().
end
());
return
mlirRankedTensorTypeGet
(
return
mlirRankedTensorTypeGet
(
lens
.
size
(),
lens
.
data
(),
make_type
(
s
.
type
()),
mlirAttributeGetNull
());
lens
.
size
(),
lens
.
data
(),
make_type
(
s
.
type
()),
mlirAttributeGetNull
());
...
@@ -371,7 +372,11 @@ struct mlir_program
...
@@ -371,7 +372,11 @@ struct mlir_program
mlir_operation_state
&
add_results
(
const
std
::
vector
<
shape
>&
outputs
)
mlir_operation_state
&
add_results
(
const
std
::
vector
<
shape
>&
outputs
)
{
{
auto
x
=
prog
->
make_tensors
(
outputs
);
std
::
vector
<
shape
>
reshaped
(
outputs
.
size
());
std
::
transform
(
outputs
.
begin
(),
outputs
.
end
(),
reshaped
.
begin
(),
[](
const
shape
&
r
)
{
return
shape
{
r
.
type
(),
r
.
lens
()};
});
auto
x
=
prog
->
make_tensors
(
reshaped
);
mlirOperationStateAddResults
(
&
op_state
,
x
.
size
(),
x
.
data
());
mlirOperationStateAddResults
(
&
op_state
,
x
.
size
(),
x
.
data
());
return
*
this
;
return
*
this
;
}
}
...
...
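Note: add_results() rebuilds each output shape from its type and lens before calling make_tensors(), which satisfies the new assert(s.standard()) in make_tensor(). A hedged illustration of the assumed semantics: constructing a shape from type and lens only yields the standard packed layout, discarding any custom strides.

#include <migraphx/shape.hpp>

// e.g. a transposed layout with lens {2, 3} and strides {1, 2} becomes a
// standard shape with strides {3, 1}; std_r.standard() then holds.
migraphx::shape standardize(const migraphx::shape& r)
{
    return migraphx::shape{r.type(), r.lens()};
}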
src/targets/gpu/target.cpp

@@ -141,12 +141,12 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
         dead_code_elimination{},
         pack_int8_args{},
         dead_code_elimination{},
-        adjust_allocation{gpu_allocation_model{}},
-        dead_code_elimination{},
         fuse_ops{&ctx, options.fast_math},
         dead_code_elimination{},
         replace_allocate{gpu_allocation_model{}, options.offload_copy},
         dead_code_elimination{},
+        adjust_allocation{gpu_allocation_model{}},
+        dead_code_elimination{},
         compile_ops{&ctx},
         dead_code_elimination{},
         write_literals{&ctx},