Merge branch 'develop' into fix_parse_if

881a4bd4 · Umang Yadav · GitHub · a2d710e3 · 4b1c1c41 · 881a4bd4
Unverified Commit 881a4bd4 authored Oct 28, 2022 by Umang Yadav Committed by GitHub Oct 28, 2022
20 changed files
--- a/src/targets/gpu/include/migraphx/gpu/mlir.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/mlir.hpp
@@ -36,7 +36,8 @@ struct module;
 namespace gpu {

 std::string dump_mlir(const module& m);
-code_object_op compile_mlir(const context& ctx, const module& m);
+code_object_op
+compile_mlir(const context& ctx, module m, const std::vector<instruction_ref>& inputs);

 instruction_ref insert_mlir(module& m,
                            instruction_ref ins,

--- a/src/targets/gpu/include/migraphx/gpu/perfdb.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/perfdb.hpp
@@ -41,7 +41,7 @@ struct problem_params
    shape output;
 };

-std::string get_mlir_perf_for_conv(const problem_params& pp);
+std::string get_mlir_perf_for_conv(const problem_params& pp, bool xdlops);

 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/gpu/jit/mlir.cpp
+++ b/src/targets/gpu/jit/mlir.cpp
@@ -41,7 +41,7 @@ struct mlir_compiler : compiler<mlir_compiler>
    {
        auto* smod = ins->module_inputs().front();
        assert(smod->get_parameter_names().size() == ins->inputs().size() - 1);
-        return insert(compile_mlir(ctx, *smod));
+        return insert(compile_mlir(ctx, *smod, ins->inputs()));
    }

    compiler_replace insert(code_object_op co) const

--- a/src/targets/gpu/leaky_relu.cpp
+++ b/src/targets/gpu/leaky_relu.cpp
@@ -21,45 +21,80 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#include <migraphx/gpu/leaky_relu.hpp>
+#include <migraphx/gpu/compiler.hpp>
 #include <migraphx/gpu/context.hpp>
-#include <migraphx/gpu/miopen.hpp>
+#include <migraphx/gpu/compile_hip_code_object.hpp>
+#include <migraphx/gpu/compile_hip.hpp>
+#include <migraphx/gpu/compile_gen.hpp>
+#include <migraphx/reduce_dims.hpp>
+#include <migraphx/float_equal.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

-shape miopen_leaky_relu::compute_shape(const std::vector<shape>& inputs) const
+using namespace migraphx::gpu::gen; // NOLINT
+
+static const char* const pointwise_kernel = R"__migraphx__(
+#include <migraphx/kernels/pad.hpp>
+#include <migraphx/kernels/index.hpp>
+#include <migraphx/kernels/ops.hpp>
+#include <args.hpp>
+
+namespace migraphx {
+
+extern "C" {
+__global__ void pad_kernel(void* input_p, void* output_p) 
 {
-    check_shapes{inputs, *this}.has(2).not_broadcasted();
-    return inputs.at(1);
+    auto offsets = index_ints<${offsets}>{};
+    auto idx     = make_index();
+    make_tensors()(input_p, output_p)([&](auto input, auto output) {
+        pad(idx, offsets, input, output, ${pad_val});
+    });
+}
+    
 }

-argument miopen_leaky_relu::compute(context& ctx,
-                                    const shape& output_shape,
-                                    const std::vector<argument>& args) const
-{
-    float alpha = 1;
-    float beta  = 0;
-    auto x_desc = make_tensor(args[0].get_shape());
-    auto y_desc = make_tensor(output_shape);
-    miopenActivationForward(ctx.get_stream().get_miopen(),
-                            ad.get(),
-                            &alpha,
-                            x_desc.get(),
-                            args[0].implicit(),
-                            &beta,
-                            y_desc.get(),
-                            args[1].implicit());
+} // namespace migraphx

-    return args[1];
-}
+)__migraphx__";

-void miopen_leaky_relu::finalize(context&, const shape&, const std::vector<shape>&)
+struct pad_compiler : compiler<pad_compiler>
 {
-    ad = make_leaky_relu(op.alpha);
-}
+    std::vector<std::string> names() const { return {"pad"}; }
+
+    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
+    {
+        hip_compile_options options;
+        options.inputs         = inputs;
+        options.output         = inputs.back();
+        options.virtual_inputs = reduce_dims(inputs);
+        options.kernel_name    = "pad_kernel";
+        options.set_launch_params(v, compute_global_for(ctx, inputs.at(1).elements()));
+
+        auto pad_val        = v.get("value", 0.f);
+        auto pad_val_string = to_string(pad_val);
+        if(float_equal(pad_val, std::numeric_limits<float>::lowest()))
+            pad_val_string = "lowest{}";
+        if(float_equal(pad_val, std::numeric_limits<float>::max()))
+            pad_val_string = "highest{}";
+
+        auto padding    = v.at("pads").to_vector<int64_t>();
+        auto input_lens = inputs.front().lens();
+        std::vector<size_t> offsets(input_lens.size());
+        std::copy(padding.begin(), padding.begin() + offsets.size(), offsets.begin());
+
+        auto src = interpolate_string(
+            pointwise_kernel,
+            {{"pad_val", to_string(pad_val_string)}, {"offsets", to_string_range(offsets)}});
+        return compile_hip_code_object(src, options);
+    }

+    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
+    {
+        return replace(compile_op(ctx, to_shapes(ins->inputs()), op.to_value()));
+    }
+};
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
--- a/src/targets/gpu/jit/scatternd.cpp
+++ b/src/targets/gpu/jit/scatternd.cpp
@@ -79,9 +79,10 @@ struct scatternd_compiler : compiler<scatternd_compiler>
    {
        assert(starts_with(op.name(), "scatternd_"));
        auto reduction = op.name().substr(10);
-        return insert(compile_op(ctx,
-                                 to_shapes({ins->inputs().begin() + 1, ins->inputs().end()}),
-                                 {{"reduction", reduction}}));
+        return insert(compile_op(
+            ctx,
+            to_shapes(std::vector<instruction_ref>{ins->inputs().begin() + 1, ins->inputs().end()}),
+            {{"reduction", reduction}}));
    }

    compiler_replace insert(const operation& op) const

--- a/src/targets/gpu/include/migraphx/gpu/quant_convolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/quant_convolution.hpp
@@ -21,53 +21,43 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_QUANT_CONVOLUTION_HPP
-#define MIGRAPHX_GUARD_RTGLIB_QUANT_CONVOLUTION_HPP
+#ifndef MIGRAPHX_GUARD_KERNELS_PAD_HPP
+#define MIGRAPHX_GUARD_KERNELS_PAD_HPP

-#include <migraphx/shape.hpp>
-#include <migraphx/reflect.hpp>
-#include <migraphx/op/quant_convolution.hpp>
-#include <migraphx/gpu/miopen.hpp>
+#include <migraphx/kernels/shape.hpp>
+#include <migraphx/kernels/index.hpp>
+#include <migraphx/kernels/algorithm.hpp>
+#include <migraphx/kernels/ranges.hpp>

 namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {

-struct context;
-
-struct miopen_quant_convolution
+template <class Offsets, class Input, class Output, class PadVal>
+__device__ void pad(const index& idx,
+                    const Offsets& offsets,
+                    const Input& input,
+                    Output& output,
+                    const PadVal& pad_val)
 {
-    op::quant_convolution op;
-    bool int8_x4_format = false;
-    shared<convolution_descriptor> cd;
-    miopenConvFwdAlgorithm_t algo{};
-    uint64_t solution_id = 0;
-
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        // TODO: Add algo
-        return pack_join(migraphx::reflect(self.op, f),
-                         pack(f(self.int8_x4_format, "int8_x4_format")));
-    }
-
-    std::string name() const { return "gpu::quant_convolution"; }
-    shape compute_shape(const std::vector<shape>& inputs) const;
-    argument
-    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    shape find(context& ctx, const shape& output_shape, std::vector<shape> inputs);
-    void finalize(context& ctx, const shape& output_shape, std::vector<shape> inputs);
-    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
-    {
-        return shapes.size() - 1;
-    }
+    auto output_shape = output.get_shape();
+    idx.global_stride(output_shape.elements(), [&](auto i) {
+        // 1. get current multi-index for output
+        // 2. get the size of the input to determine input boundaries
+        // 3. compute the corresponding multi-index for input by accounting for offsets
+        // 4. if current multi-index is within offsets or input's new multi-index is out of bounds,
+        //    use pad value instead of input's value
+        auto multi        = output_shape.multi(i);
+        auto input_bounds = input.get_shape().lens;
+        auto input_idx    = multi - offsets;
+        auto range_multi  = range(multi.size());
+
+        if(any_of(range_multi.begin(), range_multi.end(), [&](auto j) {
+               return multi[j] < offsets[j] or input_idx[j] >= input_bounds[j];
+           }))
+            output[multi] = pad_val;
+        else
+            output[multi] = input[input_idx];
+    });
+}

-    private:
-    shape pack_int8_shape(const shape& s) const;
-};
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
-
 #endif
--- a/src/targets/gpu/include/migraphx/gpu/elu.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/elu.hpp
@@ -21,44 +21,29 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_ELU_HPP
-#define MIGRAPHX_GUARD_RTGLIB_ELU_HPP
+#ifndef MIGRAPHX_GUARD_KERNELS_RANGES_HPP
+#define MIGRAPHX_GUARD_KERNELS_RANGES_HPP

-#include <migraphx/op/elu.hpp>
-#include <migraphx/shape.hpp>
-#include <migraphx/reflect.hpp>
-#include <migraphx/gpu/miopen.hpp>
+#include <migraphx/kernels/iota_iterator.hpp>

 namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {

-struct context;
-
-struct miopen_elu
+template <class Iterator>
+struct iterator_range
 {
-    op::elu op;
-    shared<activation_descriptor> ad;
+    Iterator start;
+    Iterator last;

-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op, f);
-    }
+    constexpr Iterator begin() const { return start; }

-    std::string name() const { return "gpu::elu"; }
-    shape compute_shape(const std::vector<shape>& inputs) const;
-    argument
-    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    void finalize(context&, const shape&, const std::vector<shape>&);
-    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
-    {
-        return shapes.size() - 1;
-    }
+    constexpr Iterator end() const { return last; }
 };

-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
+constexpr iterator_range<iota_iterator> range(diff_int start, diff_int last)
+{
+    return {{start, {}}, {last, {}}};
+}
+constexpr iterator_range<iota_iterator> range(diff_int last) { return range(0, last); }

-#endif
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_KERNELS_RANGES_HPP
--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -39,12 +39,10 @@

 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/convolution.hpp>
-#include <migraphx/gpu/deconvolution.hpp>
 #include <migraphx/gpu/device_name.hpp>
 #include <migraphx/gpu/gemm.hpp>
 #include <migraphx/gpu/int8_conv_pack.hpp>
 #include <migraphx/gpu/miopen.hpp>
-#include <migraphx/gpu/quant_convolution.hpp>
 #include <migraphx/gpu/rocblas.hpp>
 #include <migraphx/gpu/compiler.hpp>
 #include <migraphx/iterator_for.hpp>
@@ -97,14 +95,11 @@ struct miopen_apply

        add_extend_op("argmax");
        add_extend_op("argmin");
-        add_extend_op("elu");
        add_extend_op("gather");
-        add_extend_op("leaky_relu");
        add_extend_op("logsoftmax");
        add_extend_op("lrn");
        add_extend_op("multinomial");
        add_extend_op("nonzero");
-        add_extend_op("pad");
        add_extend_op("pooling");
        add_extend_op("prefix_scan_sum");
        add_extend_op("reverse");
@@ -114,15 +109,15 @@ struct miopen_apply
        add_extend_op("scatter_none");
        add_extend_op("topk");

-        add_convolution_op();
-        add_deconvolution_op();
+        add_convolution_op<op::convolution>("convolution");
+        add_convolution_op<op::deconvolution>("deconvolution");
+        add_convolution_op<op::quant_convolution>("quant_convolution");
        add_gemm_op<op::dot>("dot");
        add_gemm_op<op::quant_dot>("quant_dot");
        add_if_op();
        add_loop_op();
        add_neg_op();
        add_nms_op();
-        add_quant_convolution_op();
    }

    void copy_params() const
@@ -230,38 +225,6 @@ struct miopen_apply
        return mod->insert_instruction(ins, make_op("allocate", {{"shape", to_value(s)}}));
    }

-    void add_convolution_op()
-    {
-        apply_map.emplace("convolution", [=](instruction_ref ins) {
-            auto&& op = any_cast<op::convolution>(ins->get_operator());
-
-            auto conv = miopen_convolution{op, make_conv(op)};
-            auto ws   = conv.find(get_context(), ins->get_shape(), to_shapes(ins->inputs()));
-
-            auto workspace = insert_allocation(ins, ws);
-            auto output    = insert_allocation(ins, ins->get_shape());
-
-            return mod->replace_instruction(
-                ins, conv, ins->inputs().at(0), ins->inputs().at(1), workspace, output);
-        });
-    }
-
-    void add_deconvolution_op()
-    {
-        apply_map.emplace("deconvolution", [=](instruction_ref ins) {
-            auto&& op = any_cast<op::deconvolution>(ins->get_operator());
-
-            auto conv = miopen_deconvolution{op, make_deconv(op)};
-            auto ws   = conv.find(get_context(), ins->get_shape(), to_shapes(ins->inputs()));
-
-            auto workspace = insert_allocation(ins, ws);
-            auto output    = insert_allocation(ins, ins->get_shape());
-
-            return mod->replace_instruction(
-                ins, conv, ins->inputs().at(0), ins->inputs().at(1), workspace, output);
-        });
-    }
-
    template <typename Op>
    void add_gemm_op(const std::string& name)
    {
@@ -275,31 +238,33 @@ struct miopen_apply
        });
    }

-    void add_quant_convolution_op()
+    template <typename Op>
+    void add_convolution_op(const std::string& name)
    {
-        apply_map.emplace("quant_convolution", [=](instruction_ref ins) {
-            auto&& op = any_cast<op::quant_convolution>(ins->get_operator());
-            shape ws;
-            miopen_quant_convolution conv;
-            auto compile_quant_conv_with_format = [&](bool format) {
-                conv = miopen_quant_convolution{op, format, make_conv(op)};
-                ws   = conv.find(get_context(), ins->get_shape(), to_shapes(ins->inputs()));
+        apply_map.emplace(name, [=](instruction_ref ins) {
+            operation conv =
+                miopen_convolution<Op>{any_cast<Op>(ins->get_operator()), int8_x4_format};
+            migraphx::context ctx         = get_context();
+            size_t ws_bytes               = 0;
+            auto compile_conv_with_format = [&](bool format) {
+                conv     = miopen_convolution<Op>{any_cast<Op>(ins->get_operator()), format};
+                auto ws  = conv.compile(ctx, ins->get_shape(), to_shapes(ins->inputs()));
+                ws_bytes = ws.get("workspace", 0);
            };

            try
-            {
-                compile_quant_conv_with_format(int8_x4_format);
+            { // for the regular convolution and deconvolution, this try would always succeed
+                compile_conv_with_format(int8_x4_format);
            }
            catch(migraphx::exception&)
            {
                // In case no solver supports the default format, retry using the other format.
-                compile_quant_conv_with_format(not int8_x4_format);
+                compile_conv_with_format(not int8_x4_format);
            }

            auto args      = ins->inputs();
-            auto workspace = insert_allocation(ins, ws);
            auto output    = insert_allocation(ins, ins->get_shape());
-
+            auto workspace = insert_allocation(ins, shape{shape::int8_type, {ws_bytes}});
            return mod->replace_instruction(ins, conv, args[0], args[1], workspace, output);
        });
    }

--- a/src/targets/gpu/mlir.cpp
+++ b/src/targets/gpu/mlir.cpp
@@ -21,6 +21,7 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
+#include "migraphx/make_op.hpp"
 #include <migraphx/gpu/mlir.hpp>

 #ifdef MIGRAPHX_MLIR
@@ -43,8 +44,9 @@
 #include <migraphx/gpu/code_object_op.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/device_name.hpp>
-#include <migraphx/iterator_for.hpp>
 #include <migraphx/gpu/perfdb.hpp>
+#include <migraphx/iterator_for.hpp>
+#include <migraphx/permutation.hpp>
 #include <deque>
 #include <variant>

@@ -370,7 +372,11 @@ struct mlir_program

        mlir_operation_state& add_results(const std::vector<shape>& outputs)
        {
-            auto x = prog->make_tensors(outputs);
+            std::vector<shape> reshaped(outputs.size());
+            std::transform(outputs.begin(), outputs.end(), reshaped.begin(), [](const shape& r) {
+                return shape{r.type(), r.lens()};
+            });
+            auto x = prog->make_tensors(reshaped);
            mlirOperationStateAddResults(&op_state, x.size(), x.data());
            return *this;
        }
@@ -502,11 +508,12 @@ struct mlir_program
            {
                pp =
                    problem_params{ins->get_operator(), to_shapes(ins->inputs()), ins->get_shape()};
-                std::string tuned = get_tune_params();
+                // check if HW supports xdlops
+                bool xdlops       = contains(get_xdlops_archs(), target_name);
+                std::string tuned = get_tune_params(xdlops);
                if(not tuned.empty())
                    ops.add_attributes({{"perf_config", tuned}});
-                // check if HW supports xdlops
-                if(contains(get_xdlops_archs(), target_name))
+                if(xdlops)
                    ops.add_attributes({{"xdlopsV2", true}});
            }

@@ -571,7 +578,7 @@ struct mlir_program
        MIGRAPHX_THROW("Failed to compile mlir program");
    }

-    std::string get_tune_params() { return get_mlir_perf_for_conv(pp); }
+    std::string get_tune_params(bool xdlops) { return get_mlir_perf_for_conv(pp, xdlops); }

    mlir_context ctx;
    MlirLocation location;
@@ -589,8 +596,54 @@ std::string dump_mlir(const module& m)
    return mlir_print(&mlirOperationPrint, mod_op);
 }

-code_object_op compile_mlir(const context&, const module& m)
+void adjust_param_shapes(module& m, const std::vector<instruction_ref>& inputs)
 {
+    auto names = m.get_parameter_names();
+    std::sort(names.begin(), names.end());
+    for(auto i : range(names.size()))
+    {
+        const auto& name  = names[i];
+        const auto& input = inputs[i]->get_shape();
+        auto param        = m.get_parameter(name);
+        if(input.standard())
+            continue;
+        auto lens    = input.lens();
+        auto strides = input.strides();
+        std::vector<operation> ops;
+        if(input.transposed())
+        {
+            auto perm  = find_permutation(input);
+            auto iperm = invert_permutation(perm);
+            lens       = reorder_dims(lens, iperm);
+            strides    = reorder_dims(strides, iperm);
+            ops.push_back(make_op("transpose", {{"permutation", perm}}));
+        }
+        if(input.broadcasted())
+        {
+            std::transform(lens.begin(),
+                           lens.end(),
+                           strides.begin(),
+                           lens.begin(),
+                           [](auto len, auto stride) -> std::size_t {
+                               if(stride == 0)
+                                   return 1;
+                               return len;
+                           });
+            ops.push_back(make_op("multibroadcast", {{"out_lens", input.lens()}}));
+        }
+        auto new_param =
+            std::accumulate(ops.begin(),
+                            ops.end(),
+                            m.add_parameter(name + ".0", shape{input.type(), lens}),
+                            [&](auto x, auto op) { return m.insert_instruction(param, op, x); });
+        m.replace_instruction(param, new_param);
+        m.remove_instruction(param);
+    }
+}
+
+code_object_op compile_mlir(const context&, module m, const std::vector<instruction_ref>& inputs)
+{
+    adjust_param_shapes(m, inputs);
    const bool trace = enabled(MIGRAPHX_TRACE_MLIR{});
    if(trace)
        std::cout << m << std::endl;
@@ -662,13 +715,19 @@ instruction_ref insert_mlir(module& m,

 std::string dump_mlir(const module&) { return {}; }

-code_object_op compile_mlir(const context&, const module&) { return {}; }
-
 template <class T>
 void use(T&)
 {
 }

+// Disabling clang-tidy warning on non-real usage.
+// NOLINTBEGIN(performance-unnecessary-value-param)
+code_object_op compile_mlir(const context&, module, const std::vector<instruction_ref>&)
+{
+    return {};
+}
+// NOLINTEND(performance-unnecessary-value-param)
+
 instruction_ref
 // cppcheck-suppress funcArgNamesDifferent
 insert_mlir(module& m, instruction_ref, code_object_op co, const std::vector<instruction_ref>&)

--- a/src/targets/gpu/perfdb.cpp
+++ b/src/targets/gpu/perfdb.cpp
@@ -108,16 +108,17 @@ auto query_miopen_db(const std::string& query)

 } // namespace

-std::string get_mlir_perf_for_conv(const problem_params& pp)
+std::string get_mlir_perf_for_conv(const problem_params& pp, bool xdlops)
 {
-    std::string query = "select P.* \
+    std::string solver = xdlops ? "ConvMlirIgemmFwdXdlops" : "ConvMlirIgemmFwd";
+    std::string query  = "select P.* \
                             from perf_db P, config C \
                             where P.config = C.id AND \
-                             P.solver = 'ConvMlirIgemmFwdXdlops' AND \
+                             P.solver = '${solver}' AND \
                             ${config}";

-    auto results =
-        query_miopen_db(interpolate_string(query, {{"config", generate_miopen_config(pp)}}));
+    auto results = query_miopen_db(
+        interpolate_string(query, {{"config", generate_miopen_config(pp)}, {"solver", solver}}));
    if(results.empty())
        return "";
    return results.front().at("params");

--- a/src/targets/gpu/quant_convolution.cpp
+++ b/src/targets/gpu/quant_convolution.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/gpu/quant_convolution.hpp>
-#include <migraphx/gpu/context.hpp>
-#include <migraphx/generate.hpp>
-
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-shape miopen_quant_convolution::compute_shape(const std::vector<shape>& inputs) const
-{
-    check_shapes{inputs, *this}.has(4).standard();
-    return op.normalize_compute_shape({inputs.at(0), inputs.at(1)});
-}
-argument miopen_quant_convolution::compute(context& ctx,
-                                           const shape& output_shape,
-                                           const std::vector<argument>& args) const
-{
-    auto x_desc = make_tensor(args[0].get_shape(), int8_x4_format);
-    auto w_desc = make_tensor(args[1].get_shape(), int8_x4_format);
-    auto y_desc = make_tensor(output_shape);
-
-    float alpha = 1;
-    float beta  = 0;
-
-    auto status = miopenConvolutionForward(ctx.get_stream().get_miopen(),
-                                           &alpha,
-                                           x_desc.get(),
-                                           args[0].implicit(),
-                                           w_desc.get(),
-                                           args[1].implicit(),
-                                           cd.get(),
-                                           algo,
-                                           &beta,
-                                           y_desc.get(),
-                                           args[3].implicit(),
-                                           args[2].implicit(),
-                                           args[2].get_shape().bytes());
-    if(status != miopenStatusSuccess)
-    {
-        MIGRAPHX_THROW("QUANT_CONVOLUTION: run convolution forward failed");
-    }
-
-    return args[3];
-}
-
-shape miopen_quant_convolution::find(context& ctx,
-                                     const shape& output_shape,
-                                     std::vector<shape> inputs)
-{
-    shape workspace_shape{};
-    auto x_desc = make_tensor(inputs[0], int8_x4_format);
-    auto w_desc = make_tensor(inputs[1], int8_x4_format);
-    auto y_desc = make_tensor(output_shape);
-
-    std::size_t workspace_size = 0;
-    miopenConvolutionForwardGetWorkSpaceSize(ctx.get_stream().get_miopen(),
-                                             w_desc.get(),
-                                             x_desc.get(),
-                                             cd.get(),
-                                             y_desc.get(),
-                                             &workspace_size);
-    workspace_shape = shape{shape::int8_type, {workspace_size}};
-
-    auto x_shape = inputs[0];
-    auto w_shape = inputs[1];
-    if(int8_x4_format)
-    {
-        x_shape = pack_int8_shape(x_shape);
-        w_shape = pack_int8_shape(w_shape);
-    }
-    auto x         = to_gpu(generate_argument(x_shape));
-    auto w         = to_gpu(generate_argument(w_shape));
-    auto y         = allocate_gpu(output_shape);
-    auto workspace = allocate_gpu(workspace_shape);
-
-    int algo_count = 1;
-    miopenConvAlgoPerf_t perf;
-    auto status = miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
-                                                        x_desc.get(),
-                                                        x.implicit(),
-                                                        w_desc.get(),
-                                                        w.implicit(),
-                                                        cd.get(),
-                                                        y_desc.get(),
-                                                        y.implicit(),
-                                                        1,
-                                                        &algo_count,
-                                                        &perf,
-                                                        workspace.implicit(),
-                                                        workspace_size,
-                                                        false);
-    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen Quant Convolution: find convolution failed");
-    algo = perf.fwd_algo;
-
-    size_t solution_count;
-
-    status = miopenConvolutionForwardGetSolutionCount(ctx.get_stream().get_miopen(),
-                                                      w_desc.get(),
-                                                      x_desc.get(),
-                                                      cd.get(),
-                                                      y_desc.get(),
-                                                      &solution_count);
-    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen Quant Convolution: get solution count failed");
-
-    std::vector<miopenConvSolution_t> solutions(solution_count);
-
-    status = miopenConvolutionForwardGetSolution(ctx.get_stream().get_miopen(),
-                                                 w_desc.get(),
-                                                 x_desc.get(),
-                                                 cd.get(),
-                                                 y_desc.get(),
-                                                 solution_count,
-                                                 &solution_count,
-                                                 solutions.data());
-    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen Quant Convolution: get solution failed");
-
-    solution_id = solutions.front().solution_id;
-
-    return shape{shape::int8_type, {perf.memory}};
-}
-
-void miopen_quant_convolution::finalize(context& ctx,
-                                        const shape& output_shape,
-                                        std::vector<shape> inputs)
-{
-    if(cd == nullptr)
-        cd = make_conv(op);
-    if(solution_id == 0)
-    {
-        // Check that workspace hasn't changed
-        auto size = inputs.at(2).bytes();
-        auto ws   = find(ctx, output_shape, inputs);
-        if(ws.bytes() > size)
-            MIGRAPHX_THROW("MIOpen Quant Convolution: workspace has changed during finalization.");
-    }
-
-    auto x_desc = make_tensor(inputs[0], int8_x4_format);
-    auto w_desc = make_tensor(inputs[1], int8_x4_format);
-    auto y_desc = make_tensor(output_shape);
-
-    auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
-                                                          w_desc.get(),
-                                                          x_desc.get(),
-                                                          cd.get(),
-                                                          y_desc.get(),
-                                                          solution_id);
-    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen Quant Convolution: compile solution failed");
-}
-
-shape miopen_quant_convolution::pack_int8_shape(const shape& s) const
-{
-    if(s.type() != shape::int8_type)
-    {
-        MIGRAPHX_THROW("PACK_INT8_SHAPE: only process int8_type");
-    }
-
-    auto lens    = s.lens();
-    auto strides = s.strides();
-    lens[1]      = (lens[1] + 3) / 4 * 4;
-    strides[0]   = strides[1] * lens[1];
-
-    return {s.type(), lens, strides};
-}
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/targets/gpu/target.cpp
+++ b/src/targets/gpu/target.cpp
@@ -138,12 +138,12 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
        dead_code_elimination{},
        pack_int8_args{},
        dead_code_elimination{},
-        adjust_allocation{gpu_allocation_model{}},
-        dead_code_elimination{},
        fuse_ops{&ctx, options.fast_math},
        dead_code_elimination{},
        replace_allocate{gpu_allocation_model{}, options.offload_copy},
        dead_code_elimination{},
+        adjust_allocation{gpu_allocation_model{}},
+        dead_code_elimination{},
        compile_ops{&ctx},
        dead_code_elimination{},
        write_literals{&ctx},

--- a/src/targets/ref/lowering.cpp
+++ b/src/targets/ref/lowering.cpp
@@ -31,9 +31,7 @@
 #include <migraphx/op/quant_convolution.hpp>
 #include <migraphx/op/dot.hpp>
 #include <migraphx/op/quant_dot.hpp>
-#include <migraphx/op/elu.hpp>
 #include <migraphx/op/im2col.hpp>
-#include <migraphx/op/leaky_relu.hpp>
 #include <migraphx/op/logsoftmax.hpp>
 #include <migraphx/op/loop.hpp>
 #include <migraphx/op/lrn.hpp>
@@ -431,65 +429,6 @@ struct ref_quant_gemm
 };
 MIGRAPHX_REGISTER_OP(ref_gemm)

-struct leaky_relu_op
-{
-    op::leaky_relu op;
-    std::string name() const { return "ref::leaky_relu"; }
-    auto fcn() const
-    {
-        auto a = op.alpha;
-        return [a](auto x) { return x > 0 ? x : x * a; };
-    }
-};
-
-struct elu_op
-{
-    op::elu op;
-    std::string name() const { return "ref::elu"; }
-    auto fcn() const
-    {
-        auto a = op.alpha;
-        return [a](auto x) { return x > 0 ? x : a * std::expm1(x); };
-    }
-};
-
-template <typename Op>
-struct ref_unary : auto_register_op<ref_unary<Op>>
-{
-    ref_unary() = default;
-
-    template <class T>
-    ref_unary(T pop) : op(Op{std::move(pop)})
-    {
-    }
-
-    Op op;
-
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op.op, f);
-    }
-    std::string name() const { return op.name(); }
-    shape compute_shape(const std::vector<shape>& inputs) const
-    {
-        check_shapes{inputs, *this}.has(1);
-        const auto& s = inputs.at(0);
-        return {s.type(), s.lens()};
-    }
-
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
-    {
-        argument result{output_shape};
-        visit_all(result, args[0])([&](auto output, auto input) {
-            assert(input.get_shape().standard());
-            std::transform(input.begin(), input.end(), output.begin(), op.fcn());
-        });
-
-        return result;
-    }
-};
-
 template <class Op>
 struct ref_softmax : auto_register_op<ref_softmax<Op>>
 {
@@ -630,9 +569,7 @@ struct ref_apply
        apply_map["quant_dot"]   = extend_op<ref_quant_gemm, op::quant_dot>();
        apply_map["quant_convolution"] =
            extend_op<ref_convolution<op::quant_convolution>, op::quant_convolution>();
-        apply_map["elu"]        = extend_op<ref_unary<elu_op>, op::elu>();
        apply_map["im2col"]     = extend_op<ref_im2col, op::im2col>();
-        apply_map["leaky_relu"] = extend_op<ref_unary<leaky_relu_op>, op::leaky_relu>();
        apply_map["logsoftmax"] = extend_op<ref_softmax<op::logsoftmax>, op::logsoftmax>();
        apply_map["lrn"]        = extend_op<ref_lrn, op::lrn>();
        apply_map["pad"]        = extend_op<ref_pad, op::pad>();

--- a/test/api/test_custom_op_gpu.cpp
+++ b/test/api/test_custom_op_gpu.cpp
@@ -55,7 +55,8 @@ struct half_copy_host final : migraphx::experimental_custom_op_base
                                           hipMemcpyHostToHost,
                                           ctx.get_queue<hipStream_t>()));
        MIGRAPHX_HIP_ASSERT(hipDeviceSynchronize());
-        MIGRAPHX_HIP_ASSERT(hipMemset(output_buffer_ptr, 0, copy_bytes));
+        MIGRAPHX_HIP_ASSERT(
+            hipMemsetAsync(output_buffer_ptr, 0, copy_bytes, ctx.get_queue<hipStream_t>()));
        MIGRAPHX_HIP_ASSERT(hipDeviceSynchronize());
        return inputs[1];
    }
@@ -97,7 +98,8 @@ struct half_copy_device final : migraphx::experimental_custom_op_base
                                           hipMemcpyDeviceToDevice,
                                           ctx.get_queue<hipStream_t>()));
        MIGRAPHX_HIP_ASSERT(hipDeviceSynchronize());
-        MIGRAPHX_HIP_ASSERT(hipMemset(output_buffer_ptr, 0, copy_bytes));
+        MIGRAPHX_HIP_ASSERT(
+            hipMemsetAsync(output_buffer_ptr, 0, copy_bytes, ctx.get_queue<hipStream_t>()));
        MIGRAPHX_HIP_ASSERT(hipDeviceSynchronize());
        return inputs[1];
    }
@@ -124,7 +126,7 @@ struct half_copy_device_same_buffer final : migraphx::experimental_custom_op_bas
    virtual bool runs_on_offload_target() const override { return true; }

    virtual migraphx::argument
-    compute(migraphx::context, migraphx::shape, migraphx::arguments inputs) const override
+    compute(migraphx::context ctx, migraphx::shape, migraphx::arguments inputs) const override
    {
        // This custom op simply sets first half size_bytes of the input 0, and rest of the half
        // bytes are copied. for this custom_op, it does its computation on the "device". Therefore,
@@ -133,7 +135,8 @@ struct half_copy_device_same_buffer final : migraphx::experimental_custom_op_bas
        auto input_bytes = inputs[0].get_shape().bytes();
        auto copy_bytes  = input_bytes / 2;
        MIGRAPHX_HIP_ASSERT(hipSetDevice(0));
-        MIGRAPHX_HIP_ASSERT(hipMemset(buffer_ptr, 0, copy_bytes));
+        MIGRAPHX_HIP_ASSERT(
+            hipMemsetAsync(buffer_ptr, 0, copy_bytes, ctx.get_queue<hipStream_t>()));
        MIGRAPHX_HIP_ASSERT(hipDeviceSynchronize());
        return inputs[0];
    }

--- a/test/gpu/literal.cpp
+++ b/test/gpu/literal.cpp
@@ -48,4 +48,4 @@ void gpu_literal_test()
    }
 }

-int main() { gpu_literal_test(); }
+int main() { gpu_literal_test(); } // NOLINT(bugprone-exception-escape)
--- a/test/gpu/mlir.cpp
+++ b/test/gpu/mlir.cpp
@@ -84,7 +84,7 @@ migraphx::program create_program_from_mlir(const migraphx::module& mmlir)
    inputs.push_back(mm->add_parameter("output", mmlir.get_output_shapes().front()));

    migraphx::gpu::context ctx;
-    migraphx::gpu::insert_mlir(*mm, mm->end(), compile_mlir(ctx, mmlir), inputs);
+    migraphx::gpu::insert_mlir(*mm, mm->end(), compile_mlir(ctx, mmlir, inputs), inputs);
    return p;
 }


--- a/test/gpu/quantization.cpp
+++ b/test/gpu/quantization.cpp
@@ -30,7 +30,6 @@
 #include <migraphx/ref/target.hpp>
 #include <migraphx/gpu/target.hpp>
 #include <migraphx/verify.hpp>
-#include <migraphx/quantization.hpp>
 #include <migraphx/dead_code_elimination.hpp>
 #include <migraphx/propagate_constant.hpp>
 #include <migraphx/pass_manager.hpp>

--- a/test/onnx/gen_onnx.py
+++ b/test/onnx/gen_onnx.py
@@ -4590,6 +4590,16 @@ def neg_test():
    return ([node], [x], [y])


+@onnx_test
+def neg_dynamic_test():
+    x = helper.make_tensor_value_info('0', TensorProto.INT64, [None, 3])
+    y = helper.make_tensor_value_info('1', TensorProto.INT64, [None, 3])
+
+    node = onnx.helper.make_node('Neg', inputs=['0'], outputs=['1'])
+
+    return ([node], [x], [y])
+
+
 @onnx_test
 def nms_test():
    b = helper.make_tensor_value_info('boxes', TensorProto.FLOAT, [1, 6, 4])
@@ -6223,6 +6233,20 @@ def sinh_test():
    return ([node], [x], [y])


+@onnx_test
+def sinh_dynamic_test():
+    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [None])
+    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [None])
+
+    node = onnx.helper.make_node(
+        'Sinh',
+        inputs=['x'],
+        outputs=['y'],
+    )
+
+    return ([node], [x], [y])
+
+
 @onnx_test
 def size_float_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [2, 3, 4])

--- a/test/onnx/neg_dynamic_test.onnx
+++ b/test/onnx/neg_dynamic_test.onnx
--- a/test/onnx/onnx_test.cpp
+++ b/test/onnx/onnx_test.cpp
@@ -42,7 +42,6 @@
 #include <migraphx/op/lrn.hpp>
 #include <migraphx/op/reshape.hpp>
 #include <migraphx/op/unknown.hpp>
-#include <random>

 #include <migraphx/serialize.hpp>

@@ -3944,6 +3943,21 @@ TEST_CASE(neg_test)
    EXPECT(p == prog);
 }

+TEST_CASE(neg_dynamic_test)
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    migraphx::shape s{migraphx::shape::int64_type, {{1, 10, 0}, {3, 3, 0}}};
+    auto input = mm->add_parameter("0", s);
+    auto ret   = mm->add_instruction(migraphx::make_op("neg"), input);
+    mm->add_return({ret});
+
+    migraphx::onnx_options options;
+    options.default_dyn_dim_value = {1, 10, 0};
+    auto prog                     = migraphx::parse_onnx("neg_dynamic_test.onnx", options);
+    EXPECT(p == prog);
+}
+
 TEST_CASE(nms_test)
 {
    migraphx::program p;
@@ -5667,6 +5681,24 @@ TEST_CASE(sinh_test)
    EXPECT(p == prog);
 }

+TEST_CASE(sinh_dynamic_test)
+{
+    // Hand-built expected program: sinh over a 1-D float parameter with one dynamic dimension.
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    migraphx::shape::dynamic_dimension dd{1, 10, 0};
+    std::vector<migraphx::shape::dynamic_dimension> dyn_dims{dd};
+    auto input = mm->add_parameter("x", migraphx::shape{migraphx::shape::float_type, dyn_dims});
+    auto ret   = mm->add_instruction(migraphx::make_op("sinh"), input);
+    mm->add_return({ret});
+
+    // Parse the model with dd as default_dyn_dim_value; the result must equal p.
+    migraphx::onnx_options options;
+    options.default_dyn_dim_value = dd;
+    auto prog                     = migraphx::parse_onnx("sinh_dynamic_test.onnx", options);
+    EXPECT(p == prog);
+}
+
 TEST_CASE(size_float_test)
 {
    migraphx::program p;