Commit dd033c75 authored by Paul

Merge branch 'develop' into mlir-c

parents 50f87a87 8829d6ab
#ifndef MIGRAPHX_GUARD_OPERATORS_NONZERO_HPP
#define MIGRAPHX_GUARD_OPERATORS_NONZERO_HPP

#include <migraphx/shape_for_each.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/config.hpp>
#include <migraphx/float_equal.hpp>
#include <migraphx/par_for.hpp>
#include <cmath>
#include <utility>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {

struct nonzero
{
    std::string name() const { return "nonzero"; }

    shape compute_shape(std::vector<shape> inputs) const
    {
        check_shapes{inputs, *this}.has(1).standard();
        auto elem_num = inputs[0].elements();
        auto dim_num  = inputs[0].lens().size();
        // The nonzero count is unknown until runtime, so size the output for the
        // worst case: one column per input element, one row per input dimension.
        std::vector<std::size_t> out_lens = {dim_num, elem_num};

        return {shape::int64_type, out_lens};
    }

    argument compute(const shape& output_shape, std::vector<argument> args) const
    {
        // Collect the multi-index of every element that is not (ULP-)equal to 0
        std::vector<std::vector<std::size_t>> vec_idx;
        auto s = args.front().get_shape();
        args.front().visit([&](auto v) {
            shape_for_each(s, [&](auto idx) {
                if(not float_equal(v[s.index(idx)], 0))
                {
                    vec_idx.push_back(idx);
                }
            });
        });

        // Zero-fill the result, then write one coordinate column per nonzero element
        argument result{output_shape};
        result.visit([&](auto output) {
            std::fill(output.begin(), output.end(), 0);
            par_for(vec_idx.size(), [&](auto i) {
                for(std::size_t j = 0; j < vec_idx.front().size(); ++j)
                {
                    output[output_shape.index({j, i})] = vec_idx[i][j];
                }
            });
        });

        return result;
    }
};

} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
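A minimal usage sketch of the operator above (illustrative only; it assumes the reference target is registered as "ref" and uses the program/module API seen elsewhere in this repo):

#include <migraphx/program.hpp>
#include <migraphx/literal.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/register_target.hpp>
#include <vector>

int main()
{
    migraphx::program p;
    auto* mm = p.get_main_module();
    migraphx::shape s{migraphx::shape::float_type, {2, 2}};
    std::vector<float> data{1.0f, 0.0f, 1.0f, 1.0f};
    auto input = mm->add_literal(migraphx::literal{s, data});
    mm->add_instruction(migraphx::make_op("nonzero"), input);
    p.compile(migraphx::make_target("ref"));
    auto result = p.eval({}).back();
    // result has shape {2, 4}: row 0 holds the first coordinate and row 1 the
    // second coordinate of the nonzero elements (0,0), (1,0), (1,1); the unused
    // fourth column stays zero-filled.
    return 0;
}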
...@@ -18,21 +18,12 @@ namespace op {
struct quant_dot
{
-    int32_t alpha = 1;
-    int32_t beta  = 1;
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return pack(f(self.alpha, "alpha"), f(self.beta, "beta"));
-    }
    value attributes() const { return {{"general_data_type", "dot"}}; }
    std::string name() const { return "quant_dot"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
-        check_shapes{{inputs.at(0), inputs.at(1)}, *this}.same_type();
+        check_shapes{{inputs.at(0), inputs.at(1)}, *this}.same_type().has(2);
        const shape& a = inputs.at(0);
        const shape& b = inputs.at(1);
        auto t         = a.type();
...@@ -64,18 +55,6 @@ struct quant_dot
        auto out_lens   = a.lens();
        out_lens[dim_1] = b.lens()[dim_1];
-        if(inputs.size() == 3 && out_lens != inputs.at(2).lens())
-        {
-            MIGRAPHX_THROW("QUANT_DOT: dimension mismatch, operand C: {" +
-                           to_string_range(inputs.at(2).lens()) +
-                           "}, cannot add to operand A * B: {" + to_string_range(out_lens) + "}");
-        }
-        if(inputs.size() == 3 && inputs.at(2).type() != shape::int32_type)
-        {
-            MIGRAPHX_THROW("QUANT_DOT: operand C type must be int32");
-        }
        return {shape::int32_type, out_lens};
    }
};
...
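With alpha, beta, and the optional C operand gone, a quantized accumulate now has to be spelled as an explicit add; a hedged sketch (mm, a_int8, b_int8, and c_int32 are hypothetical names for illustration):

// int8 x int8 -> int32 product; exactly two arguments after this change
auto ab  = mm->add_instruction(migraphx::make_op("quant_dot"), a_int8, b_int8);
// the former beta * C accumulation becomes a separate elementwise add
auto out = mm->add_instruction(migraphx::make_op("add"), ab, c_int32);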
...@@ -57,6 +57,7 @@
#include <migraphx/op/mul.hpp>
#include <migraphx/op/multibroadcast.hpp>
#include <migraphx/op/neg.hpp>
+#include <migraphx/op/nonzero.hpp>
#include <migraphx/op/outline.hpp>
#include <migraphx/op/pad.hpp>
#include <migraphx/op/pooling.hpp>
...
#ifndef MIGRAPHX_GUARD_RTGLIB_REMAP_HPP
#define MIGRAPHX_GUARD_RTGLIB_REMAP_HPP

#include <string>
#include <migraphx/instruction_ref.hpp>
#include <migraphx/config.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {

struct module;

/**
 * Remap operators, e.g. fuse an add that follows a dot into a single
 * three-argument dot (see remap.cpp below).
 */
struct remap
{
    std::string name() const { return "remap"; }
    void apply(module& p) const;
};

} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
#include <cmath>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {

struct parse_depthtospace : op_parser<parse_depthtospace>
{
    std::vector<op_desc> operators() const { return {{"DepthToSpace"}}; }

    instruction_ref parse(const op_desc& /*opd*/,
                          const onnx_parser& /*parser*/,
                          const onnx_parser::node_info& info,
                          std::vector<instruction_ref> args) const
    {
        auto s = args[0]->get_shape();

        // mode attribute of DepthToSpace: "DCR" (default) or "CRD"
        auto mode = std::string("DCR");
        if(contains(info.attributes, "mode"))
        {
            mode = info.attributes.at("mode").s();
        }

        // blocksize attribute of DepthToSpace
        int blocksize = 0;
        if(contains(info.attributes, "blocksize"))
        {
            blocksize = info.attributes.at("blocksize").i();
        }
        if(blocksize < 1)
        {
            MIGRAPHX_THROW("DepthToSpace: blocksize is less than 1");
        }

        // calculate dimensions
        auto lens1 = s.lens();
        auto lens2 = s.lens();

        unsigned long divisor = std::pow(blocksize, 2);
        if((lens2[1] % divisor) == 0)
            lens2[1] = lens2[1] / divisor;
        else
            MIGRAPHX_THROW(
                "DepthToSpace: channel dimension must be divisible by blocksize squared");
        lens1.push_back(lens1[2]);
        lens1.push_back(lens1[3]);
        lens2[2] = lens2[2] * blocksize;
        lens2[3] = lens2[3] * blocksize;
        lens1[2] = blocksize;
        std::vector<int64_t> perm;

        if(mode == "DCR")
        {
            lens1[3] = lens1[1] / divisor;
            lens1[1] = blocksize;
            perm     = {0, 3, 4, 1, 5, 2};
        }
        else if(mode == "CRD")
        {
            lens1[1] = lens1[1] / divisor;
            lens1[3] = blocksize;
            perm     = {0, 1, 4, 2, 5, 3};
        }
        else
            MIGRAPHX_THROW("DepthToSpace: mode must be either DCR or CRD");

        auto temp1 = info.add_instruction(make_op("reshape", {{"dims", lens1}}), args[0]);
        auto temp2 = info.add_instruction(make_op("transpose", {{"permutation", perm}}), temp1);
        return info.add_instruction(make_op("reshape", {{"dims", lens2}}),
                                    info.make_contiguous(temp2));
    }
};

} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
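To make the index gymnastics above concrete, a worked example with illustrative values: input NCHW shape (1, 8, 3, 2) and blocksize b = 2 in DCR mode.

// reshape:   (1, 8, 3, 2) -> lens1 = (1, 2, 2, 2, 3, 2)    split C into (b, b, C/b^2)
// transpose: perm {0, 3, 4, 1, 5, 2} -> (1, 2, 3, 2, 2, 2) interleave the block axes with H and W
// reshape:   -> lens2 = (1, 2, 6, 4)                        i.e. (N, C/b^2, H*b, W*b)
// CRD mode splits C as (C/b^2, b, b) instead, and perm {0, 1, 4, 2, 5, 3}
// produces the same (N, C/b^2, H, b, W, b) ordering before the final reshape.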
...@@ -61,7 +61,7 @@ struct parse_gemm : op_parser<parse_gemm>
                 ? info.add_instruction(make_op("transpose", {{"permutation", perm}}), args[1])
                 : args[1];
-        auto ret = info.add_instruction(make_op("dot", {{"alpha", 1.0f}, {"beta", 0.0f}}), l1, l2);
+        auto ret = info.add_instruction(make_op("dot"), l1, l2);
        if(args.size() == 3)
        {
...
...@@ -66,9 +66,7 @@ struct parse_matmul : op_parser<parse_matmul>
                    make_op("multibroadcast", {{"out_lens", l1_broadcasted_lens}}), l1);
            }
        }
-        auto dot_res =
-            info.add_instruction(make_op(opd.op_name, {{"alpha", 1}, {"beta", 0}}), bl0, bl1);
+        instruction_ref dot_res = info.add_instruction(make_op(opd.op_name), bl0, bl1);
        int64_t num_axis = static_cast<int64_t>(dot_res->get_shape().lens().size());
        if(is_a_prepended)
        {
...
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/onnx/checks.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
#include <algorithm>
#include <chrono>
#include <random>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {

struct parse_multinomial : op_parser<parse_multinomial>
{
    std::vector<op_desc> operators() const { return {{"Multinomial"}}; }

    instruction_ref parse(const op_desc& /*opd*/,
                          const onnx_parser& /*parser*/,
                          const onnx_parser::node_info& info,
                          std::vector<instruction_ref> args) const
    {
        int dtype = 6;
        if(contains(info.attributes, "dtype"))
            dtype = info.attributes.at("dtype").i();
        shape::type_t output_type = get_type(dtype);

        size_t sample_size = 1;
        if(contains(info.attributes, "sample_size"))
            sample_size = info.attributes.at("sample_size").i();

        // Use a time-based seed unless one is given explicitly
        float seed = static_cast<float>(
            std::chrono::high_resolution_clock::now().time_since_epoch().count());
        if(contains(info.attributes, "seed"))
            seed = info.attributes.at("seed").f();

        // Subtract the per-batch maximum log-probability, making the per-batch max 0
        auto maxes =
            info.add_instruction(migraphx::make_op("reduce_max", {{"axes", {1}}}), args[0]);
        auto mb_maxes = info.add_instruction(
            migraphx::make_op("multibroadcast", {{"out_lens", args[0]->get_shape().lens()}}),
            maxes);
        auto cdf = info.add_instruction(migraphx::make_op("sub"), args[0], mb_maxes);
        // Take the element-wise exponent to get probabilities in the range (0, 1]
        cdf = info.add_instruction(migraphx::make_op("exp"), cdf);
        // Compute the cumulative distribution function
        cdf = info.add_instruction(
            migraphx::make_op("prefix_scan_sum", {{"axis", 1}, {"exclusive", false}}), cdf);

        // Pre-compute random distribution
        std::mt19937 gen(seed);
        std::uniform_real_distribution<> dis(0.0, 1.0);
        size_t batch_size = args[0]->get_shape().lens().front();
        migraphx::shape dist_shape{migraphx::shape::float_type, {batch_size, sample_size}};

        std::vector<float> random_dist(batch_size * sample_size);
        std::generate(random_dist.begin(), random_dist.end(), [&]() { return dis(gen); });
        auto dist_lit = info.add_literal(migraphx::literal{dist_shape, random_dist});

        return info.add_instruction(
            migraphx::make_op("multinomial", {{"dtype", output_type}}), cdf, dist_lit);
    }
};

} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
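The parser hands the op an unnormalized CDF plus uniform samples; a minimal host-side sketch of the per-row selection the "multinomial" op is presumably doing (an assumption for illustration, not this repo's kernel):

#include <algorithm>
#include <cstddef>
#include <vector>

// Pick the first class whose cumulative weight exceeds r scaled by the row
// total, so the CDF does not need to be normalized to 1.
std::size_t sample_from_cdf(const std::vector<float>& cdf, float r)
{
    float target = r * cdf.back();
    return std::upper_bound(cdf.begin(), cdf.end(), target) - cdf.begin();
}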
...@@ -9,7 +9,7 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {

template <class T>
-std::vector<std::size_t> nonzero_indices(const std::vector<T>& data)
+static std::vector<std::size_t> nonzero_indices(const std::vector<T>& data)
{
    std::vector<std::size_t> indices;
    for(std::size_t i = 0; i < data.size(); ++i)
...@@ -31,8 +31,12 @@ struct parse_nonzero : op_parser<parse_nonzero>
                          std::vector<instruction_ref> args) const
    {
        migraphx::argument data_arg = args.back()->eval();
-        check_arg_empty(data_arg, "PARSE_NONZERO: cannot support non-constant input!");
+        if(data_arg.empty())
+        {
+            return info.add_instruction(make_op("nonzero"), args);
+        }
+        else
+        {
            std::vector<std::size_t> indices;
            data_arg.visit([&](auto val) {
                using val_type = std::remove_cv_t<typename decltype(val)::value_type>;
...@@ -56,6 +60,7 @@ struct parse_nonzero : op_parser<parse_nonzero>
            return info.add_literal(literal(out_s, out_data));
        }
+    }
};

} // namespace onnx
...
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/onnx/checks.hpp>
#include <algorithm>
#include <chrono>
#include <random>
#include <set>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {

struct parse_randomnormal_ops : op_parser<parse_randomnormal_ops>
{
    const std::set<shape::type_t> valid_types = {
        shape::float_type, shape::half_type, shape::double_type};

    std::vector<op_desc> operators() const { return {{"RandomNormal"}, {"RandomNormalLike"}}; }

    instruction_ref parse(const op_desc& opd,
                          const onnx_parser& parser,
                          const onnx_parser::node_info& info,
                          std::vector<instruction_ref> args) const
    {
        int dtype      = 1;
        bool use_dtype = false;
        if(contains(info.attributes, "dtype"))
        {
            dtype     = info.attributes.at("dtype").i();
            use_dtype = true;
        }
        shape::type_t out_type = get_type(dtype);
        if(not contains(valid_types, out_type))
            MIGRAPHX_THROW(opd.op_name + ": invalid output type: " + std::to_string(dtype) +
                           ". Valid types are 1 (float), 10 (half), and 11 (double).");

        float mean = 0.0;
        if(contains(info.attributes, "mean"))
            mean = info.attributes.at("mean").f();

        float scale = 1.0;
        if(contains(info.attributes, "scale"))
            scale = info.attributes.at("scale").f();

        float seed = static_cast<float>(
            std::chrono::high_resolution_clock::now().time_since_epoch().count());
        if(contains(info.attributes, "seed"))
            seed = info.attributes.at("seed").f();

        shape out_shape;
        if(contains(info.attributes, "shape"))
        {
            // RandomNormal:
            // output type and shape must come from attributes
            std::vector<int> out_lens;
            literal ls = parser.parse_value(info.attributes.at("shape"));
            ls.visit([&](auto s) { out_lens.assign(s.begin(), s.end()); });
            out_shape = shape{out_type, out_lens};
        }
        else if(args.size() == 1)
        {
            // RandomNormalLike:
            // output type and shape are the same as the input's by default;
            // dtype is used instead when the attribute is set
            if(not contains(valid_types, args[0]->get_shape().type()))
                MIGRAPHX_THROW(opd.op_name + ": invalid output type: " +
                               std::to_string(args[0]->get_shape().type()) +
                               ". Valid types are float, half, and double.");
            out_shape =
                use_dtype ? shape{out_type, args[0]->get_shape().lens()} : args[0]->get_shape();
        }
        else
        {
            MIGRAPHX_THROW(opd.op_name +
                           ": cannot deduce shape without shape attribute or argument.");
        }

        std::mt19937 gen(seed);
        std::normal_distribution<> d(mean, scale);
        std::vector<double> rand_vals(out_shape.elements());
        std::generate(rand_vals.begin(), rand_vals.end(), [&]() { return d(gen); });

        return info.add_literal(literal{out_shape, rand_vals});
    }
};

} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/onnx/checks.hpp>
#include <algorithm>
#include <chrono>
#include <random>
#include <set>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {

struct parse_randomuniform_ops : op_parser<parse_randomuniform_ops>
{
    const std::set<shape::type_t> valid_types = {
        shape::float_type, shape::half_type, shape::double_type};

    std::vector<op_desc> operators() const { return {{"RandomUniform"}, {"RandomUniformLike"}}; }

    instruction_ref parse(const op_desc& opd,
                          const onnx_parser& parser,
                          const onnx_parser::node_info& info,
                          std::vector<instruction_ref> args) const
    {
        int dtype      = 1;
        bool use_dtype = false;
        if(contains(info.attributes, "dtype"))
        {
            dtype     = info.attributes.at("dtype").i();
            use_dtype = true;
        }
        shape::type_t out_type = get_type(dtype);
        if(not contains(valid_types, out_type))
            MIGRAPHX_THROW(opd.op_name + ": invalid output type: " + std::to_string(dtype) +
                           ". Valid types are 1 (float), 10 (half), and 11 (double).");

        float high = 1.0;
        if(contains(info.attributes, "high"))
            high = info.attributes.at("high").f();

        float low = 0.0;
        if(contains(info.attributes, "low"))
            low = info.attributes.at("low").f();

        float seed = static_cast<float>(
            std::chrono::high_resolution_clock::now().time_since_epoch().count());
        if(contains(info.attributes, "seed"))
            seed = info.attributes.at("seed").f();

        shape out_shape;
        if(contains(info.attributes, "shape"))
        {
            // RandomUniform:
            // output type and shape must come from attributes
            std::vector<int> out_lens;
            literal ls = parser.parse_value(info.attributes.at("shape"));
            ls.visit([&](auto s) { out_lens.assign(s.begin(), s.end()); });
            out_shape = shape{out_type, out_lens};
        }
        else if(args.size() == 1)
        {
            // RandomUniformLike:
            // output type and shape are the same as the input's by default;
            // dtype is used instead when the attribute is set
            if(not contains(valid_types, args[0]->get_shape().type()))
                MIGRAPHX_THROW(opd.op_name + ": invalid output type: " +
                               std::to_string(args[0]->get_shape().type()) +
                               ". Valid types are float, half, and double.");
            out_shape =
                use_dtype ? shape{out_type, args[0]->get_shape().lens()} : args[0]->get_shape();
        }
        else
        {
            MIGRAPHX_THROW(opd.op_name +
                           ": cannot deduce shape without shape attribute or argument.");
        }

        std::mt19937 gen(seed);
        // std::uniform_real_distribution's parameters are (min, max)
        std::uniform_real_distribution<> d(low, high);
        std::vector<double> rand_vals(out_shape.elements());
        std::generate(rand_vals.begin(), rand_vals.end(), [&]() { return d(gen); });

        return info.add_literal(literal{out_shape, rand_vals});
    }
};

} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
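Note that both random parsers above materialize the randomness as a literal at parse time: the values are drawn once with std::mt19937 while the graph is built, so every evaluation of the compiled program returns the same tensor. Setting the seed attribute makes that tensor reproducible across builds; otherwise the high-resolution-clock seed makes it differ from parse to parse.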
...@@ -309,8 +309,11 @@ std::vector<argument> program::eval(parameter_map params) const
                       double t2 = t.record<milliseconds>();
                       std::cout << "Time: " << t1 << "ms, " << t2 << "ms" << std::endl;
                       if(trace_level > 1 and ins->name().front() != '@' and
-                          ins->name() != "load")
-                           std::cout << "Output: " << result << std::endl;
+                          ins->name() != "load" and not result.empty())
+                       {
+                           target tgt = make_target(this->impl->target_name);
+                           std::cout << "Output: " << tgt.copy_from(result) << std::endl;
+                       }
                       return result;
                   }));
}
...
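The guard and copy here matter because with an offloading target the instruction's result may live in device memory: result.empty() skips instructions that produce no data, and tgt.copy_from(result) brings the buffer back to the host so it can be streamed to std::cout.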
#include <migraphx/remap.hpp>
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/functional.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/float_equal.hpp>
#include <migraphx/matcher.hpp>
#include <migraphx/op/dot.hpp>
#include <migraphx/op/add.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {

namespace {
struct find_dot_add
{
    auto matcher() const
    {
        return match::name("add")(match::any_of(
            match::args(match::name("dot")(match::nargs(2)).bind("dot"), match::any().bind("a")),
            match::args(match::used_once().bind("a"),
                        match::name("dot")(match::nargs(2)).bind("dot"))));
    }

    void apply(module& p, match::matcher_result r) const
    {
        auto ins     = r.result;
        auto dot_ins = r.instructions["dot"];
        auto a_ins   = r.instructions["a"];

        auto dot = any_cast<op::dot>(dot_ins->get_operator());
        dot.beta = 1;
        p.replace_instruction(ins, dot, dot_ins->inputs()[0], dot_ins->inputs()[1], a_ins);
    }
};
} // namespace

void remap::apply(module& p) const { match::find_matches(p, find_dot_add{}); }

} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
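In effect, find_dot_add turns add(dot(x, w), c) — or the mirrored add(c, dot(x, w)) — into one three-argument dot, roughly:

// before: d   = dot(x, w); out = add(d, c)
// after:  out = dot(x, w, c)   // with beta = 1, folding the addition into the dot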
...@@ -84,13 +84,7 @@ struct match_find_quantizable_ops
        }
        else if(qop->name() == "dot")
        {
-            auto dot_op = any_cast<op::dot>(qop->get_operator());
-            if(!(float_equal(dot_op.alpha, 1.0f) and float_equal(dot_op.beta, 0.0f)))
-                return;
-            if(qop_args.size() == 3)
-                qop_args.pop_back();
-            dq = m.insert_instruction(
-                qop, migraphx::make_op("quant_dot", {{"alpha", 1}, {"beta", 0}}), qop_args);
+            dq = m.insert_instruction(qop, migraphx::make_op("quant_dot"), qop_args);
        }
        auto ins_type = qop->get_shape().type();
        dq_scale      = m.add_literal(literal({ins_type}, {scale}));
...
...@@ -3,7 +3,6 @@
#include <migraphx/check_context.hpp>
#include <migraphx/adjust_allocation.hpp>
#include <migraphx/dead_code_elimination.hpp>
-#include <migraphx/decompose.hpp>
#include <migraphx/eliminate_allocation.hpp>
#include <migraphx/eliminate_common_subexpression.hpp>
#include <migraphx/eliminate_concat.hpp>
...@@ -14,7 +13,6 @@
#include <migraphx/memory_coloring.hpp>
#include <migraphx/propagate_constant.hpp>
#include <migraphx/register_target.hpp>
-#include <migraphx/remap.hpp>
#include <migraphx/rewrite_batchnorm.hpp>
#include <migraphx/rewrite_pooling.hpp>
#include <migraphx/rewrite_quantization.hpp>
...@@ -52,8 +50,6 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
        dead_code_elimination{},
        eliminate_data_type{unsupported_types, shape::type_t::float_type},
        dead_code_elimination{},
-        decompose{},
-        dead_code_elimination{},
        simplify_reshapes{},
        eliminate_identity{},
        eliminate_pad{},
...
...@@ -59,6 +59,8 @@ add_library(migraphx_device
    device/mul.cpp
    device/mul_add.cpp
    device/mul_add_relu.cpp
+   device/multinomial.cpp
+   device/nonzero.cpp
    device/pad.cpp
    device/pow.cpp
    device/prelu.cpp
...@@ -143,6 +145,8 @@ add_library(migraphx_gpu
    lrn.cpp
    leaky_relu.cpp
    mlir_conv.cpp
+   multinomial.cpp
+   nonzero.cpp
    pack_args.cpp
    pack_int8_args.cpp
    pad.cpp
...@@ -199,6 +203,8 @@ register_migraphx_gpu_ops(hip_
    max
    min
    mul
+   multinomial
+   nonzero
    pad
    pow
    prelu
...
#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_FLOAT_EQUAL_HPP
#define MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_FLOAT_EQUAL_HPP

#include <migraphx/requires.hpp>
#include <migraphx/config.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <cmath>
#include <limits>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

template <class... Ts>
using common_type = typename std::common_type<Ts...>::type;
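
// Floating-point overload: x and y compare equal when y falls within one ULP
// of x in either direction; any NaN or infinity compares unequal. This mirrors
// the host-side migraphx::float_equal semantics.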
template <class T, MIGRAPHX_REQUIRES(is_floating_point<T>{})>
__device__ bool float_equal_device(T x, T y)
{
    return std::isfinite(x) and std::isfinite(y) and
           std::nextafter(x, std::numeric_limits<T>::lowest()) <= y and
           std::nextafter(x, std::numeric_limits<T>::max()) >= y;
}

template <class T, MIGRAPHX_REQUIRES(not is_floating_point<T>{})>
__device__ bool float_equal_device(T x, T y)
{
    return x == y;
}

template <class T, class U>
__device__ bool float_equal(T x, U y)
{
    return float_equal_device<common_type<T, U>>(x, y);
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
...@@ -12,10 +12,6 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

-#if __AMDGCN_WAVEFRONT_SIZE == 32
-#define MIGRAPHX_NO_DPP
-#endif
-
#ifdef MIGRAPHX_NO_DPP
template <index_int N,
          class Op,
...@@ -98,10 +94,12 @@ __device__ void dpp_reduce(T& in, Op op)
    in  = op(in, out);
    out = dpp_mov<dpp_row_shr(8), 0xf, 0xc>(in);
    in  = op(in, out);
+#if __AMDGCN_WAVEFRONT_SIZE == 64
    out = dpp_mov<dpp_row_bcast(15), 0xa>(in);
    in  = op(in, out);
    out = dpp_mov<dpp_row_bcast(31), 0xc>(in);
    in  = op(in, out);
+#endif
}

__device__ inline void dpp_reduce(float& x, sum)
...@@ -118,9 +116,11 @@ __device__ inline void dpp_reduce(float& x, sum)
        "s_nop 1\n"
        "v_add_f32 %0 %0 %0 row_shr:8 bank_mask:0xc\n"
        "s_nop 1\n"
+#if __AMDGCN_WAVEFRONT_SIZE == 64
        "v_add_f32 %0 %0 %0 row_bcast:15 row_mask:0xa\n"
        "s_nop 1\n"
        "v_add_f32 %0 %0 %0 row_bcast:31 row_mask:0xc\n"
+#endif
        "s_nop 1\n"
        : "=v"(x)
        : "0"(x));
...@@ -135,21 +135,27 @@ template <index_int N,
          MIGRAPHX_REQUIRES(not std::is_integral<ForStride>{})>
__device__ auto block_reduce(index idx, Op op, T init, ForStride fs, F f)
{
+#if __AMDGCN_WAVEFRONT_SIZE == 32
+    constexpr index_int nthreads = 16;
+#else
+    constexpr index_int nthreads = 64;
+#endif
    using type = decltype(f(deduce_for_stride(fs)));
-    MIGRAPHX_DEVICE_SHARED type buffer[N / 64];
+    MIGRAPHX_DEVICE_SHARED type buffer[N / nthreads];
    type x = init;
    fs([&](auto i) { x = op(x, f(i)); });
    dpp_reduce(x, op);
-    const auto ldsidx = idx.local / 64;
-    if((idx.local % 64) == 63)
+    const auto ldsidx = idx.local / nthreads;
+    if((idx.local % nthreads) == nthreads - 1)
    {
        buffer[ldsidx] = x;
    }
    __syncthreads();

    type y = init;
-    for(index_int i = 0; i < idx.nlocal() / 64; i++)
+    for(index_int i = 0; i < idx.nlocal() / nthreads; i++)
    {
        y = op(y, buffer[i]);
    }
...
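Background for the wavefront-size split: DPP row operations (row_shr and friends) act within 16-lane rows, while the row_bcast patterns that combine rows into a full 64-lane result exist only on wavefront-64 hardware. On wavefront-32 (Navi) targets, dpp_reduce therefore leaves one partial result per 16-lane row, which is why block_reduce switches nthreads from 64 to 16 there and collects one partial per row through LDS instead of disabling DPP entirely.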
...@@ -129,6 +129,21 @@ __device__ __host__ T to_hip_type(T x)
// Hip doesn't support __fp16
inline __device__ __host__ float to_hip_type(gpu_half x) { return x; }

+#define MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(trait, T) \
+    template <class X>                             \
+    struct trait : std::trait<X>                   \
+    {                                              \
+    };                                             \
+                                                   \
+    template <>                                    \
+    struct trait<T> : std::true_type               \
+    {                                              \
+    };
+
+MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_floating_point, __fp16)
+MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_signed, __fp16)
+MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_arithmetic, __fp16)
+
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
...
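These trait extensions are what let the is_floating_point-gated overloads in the float_equal.hpp header above dispatch __fp16 through the ULP-based comparison, which std::is_floating_point alone would reject. A hypothetical compile-time check of the effect (device compilation assumed):

static_assert(migraphx::gpu::device::is_floating_point<__fp16>{},
              "__fp16 now routes to the ULP-based float_equal_device overload");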
...@@ -8,6 +8,14 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

+#ifndef MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC
+#if __AMDGCN_WAVEFRONT_SIZE == 32
+#define MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC 1
+#else
+#define MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC 0
+#endif
+#endif
+
template <class T>
struct vector_type
{
...@@ -86,10 +94,13 @@ __device__ void layernorm(index_int i,
    const bool in_range = idx.local < relements_v;

    auto mean = [&](auto z) {
-        return auto_block_reduce<MaxBlockSize>(
-                   idx, sum{}, value_type(0), relements_v, [=](auto) { return z; }) /
-               value_type(relements);
+        auto m = auto_block_reduce<MaxBlockSize>(
+                     idx, sum{}, value_type(0), relements_v, [=](auto) { return z; }) /
+                 value_type(relements);
+#if MIGRAPHX_WORKAROUND_NAVI_DPP_SYNC
+        __builtin_amdgcn_s_barrier();
+#endif
+        return m;
    };

    // m = x - mean(x)
...
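The explicit __builtin_amdgcn_s_barrier() after the mean reduction is enabled only on wavefront-32 (Navi) targets; judging by the macro name, it works around a missing cross-lane synchronization after the DPP-based block reduction on those GPUs. That reading is an inference from this hunk, not documented hardware behavior.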