Commit b076d0f4 authored by Paul

Merge branch 'jit-layernorm-merge' into bert-opt-layernorm

parents 03c6967e d705e483
......@@ -90,7 +90,6 @@ add_library(migraphx
shape.cpp
simplify_algebra.cpp
simplify_reshapes.cpp
target_assignments.cpp
tmp_dir.cpp
value.cpp
verify_args.cpp
......
......@@ -35,17 +35,13 @@ struct onnx_options
{
/// Old way to set default fixed dimension size
std::size_t default_dim_value = 0;
/*!
* Default dynamic dimension size (if both default_dim_value and default_dyn_dim_value
* set parser throws)
*/
/// Default dynamic dimension size (if both default_dim_value and default_dyn_dim_value are
/// set, the parser throws)
shape::dynamic_dimension default_dyn_dim_value = {1, 1, 0};
/// Explicitly specify the dims of an input
std::unordered_map<std::string, std::vector<std::size_t>> map_input_dims = {};
/*!
* Explicitly specify dynamic dims of an input (if both map_input_dims and
* map_dyn_input_dims set parser throws)
*/
/// Explicitly specify dynamic dims of an input (if both map_input_dims and map_dyn_input_dims
/// are set, the parser throws)
std::unordered_map<std::string, std::vector<shape::dynamic_dimension>> map_dyn_input_dims = {};
/// Continue parsing onnx file if an unknown operator is found
bool skip_unknown_operators = false;
......@@ -53,6 +49,8 @@ struct onnx_options
bool print_program_on_error = false;
/// Max iter num for the loop operator
int64_t max_loop_iterations = 10;
/// Use dynamic output for operators when available
bool use_dyn_output = false;
};
/// Create a program from an onnx file
......
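For context, a minimal usage sketch of the new option; parse_onnx is the public entry point declared in migraphx/onnx.hpp, and the model path here is illustrative:

```cpp
#include <migraphx/onnx.hpp>

int main()
{
    migraphx::onnx_options options;
    options.use_dyn_output        = true;      // NMS may now return {num_selected, 3}
    options.default_dyn_dim_value = {1, 4, 0}; // unspecified dims range over [1, 4]
    // Note: setting both default_dim_value and default_dyn_dim_value throws.
    auto prog = migraphx::parse_onnx("model.onnx", options);
    (void)prog;
}
```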
......@@ -45,7 +45,15 @@ struct convert : unary<convert>
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(1);
return {target_type, inputs.at(0).lens(), inputs.at(0).strides()};
auto input = inputs.at(0);
if(input.dynamic())
{
return {target_type, input.dyn_dims()};
}
else
{
return {target_type, input.lens(), input.strides()};
}
}
std::string point_op() const
......
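The two branches correspond to the two shape constructors used throughout this change; a small sketch of what each produces (shapes illustrative):

```cpp
// Static input: only the element type changes; lens and strides carry over.
migraphx::shape fixed{migraphx::shape::float_type, {2, 3}};
// -> convert yields {int32_type, lens {2, 3}, strides {3, 1}}

// Dynamic input: the dynamic_dimension list carries over unchanged.
migraphx::shape dyn{migraphx::shape::float_type, {{1, 4, 0}, {3, 3, 0}}};
// -> convert yields {int32_type, dyn_dims {{1, 4, 0}, {3, 3, 0}}}
```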
......@@ -45,11 +45,13 @@ namespace op {
struct nonmaxsuppression
{
bool center_point_box = false;
bool use_dyn_output = false;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.center_point_box, "center_point_box"));
return pack(f(self.center_point_box, "center_point_box"),
f(self.use_dyn_output, "use_dyn_output"));
}
std::string name() const { return "nonmaxsuppression"; }
......@@ -57,27 +59,81 @@ struct nonmaxsuppression
shape compute_shape(std::vector<shape> inputs) const
{
// requires at least 2 inputs
check_shapes{{inputs.at(0), inputs.at(1)}, *this}.only_dims(3);
auto lens = inputs.front().lens();
check_shapes{{inputs.at(0), inputs.at(1)}, *this, true}.only_dims(3).same_ndims();
auto boxes_max_lens = inputs.at(0).max_lens();
// num batches * num boxes
const auto max_num_boxes = boxes_max_lens.at(0) * boxes_max_lens.at(1);
// check input shape
if(lens[1] != inputs.at(1).lens()[2])
auto fixed_shape_error_check = [&]() {
auto lens = inputs.front().lens();
if(lens[1] != inputs.at(1).lens()[2])
{
MIGRAPHX_THROW(
"NonMaxSuppression: spatial dimension mismatch between boxes and scores input");
}
if(lens[0] != inputs.at(1).lens()[0])
{
MIGRAPHX_THROW(
"NonMaxSuppression: number of batches mismatch between boxes and scores input");
}
};
if(use_dyn_output)
{
MIGRAPHX_THROW(
"NonMaxSuppression: spatial dimension mismatch between boxes and scores input");
if(inputs.at(0).dynamic())
{
// both boxes and scores should be dynamic
// check dynamic dimensions are consistent
const auto boxes_dims = inputs.at(0).dyn_dims();
const auto scores_dims = inputs.at(1).dyn_dims();
if(boxes_dims.at(1) != scores_dims.at(2))
{
MIGRAPHX_THROW("NonMaxSuppression: dynamic spatial dimension mismatch between "
"boxes and scores input");
}
if(boxes_dims.at(0) != scores_dims.at(0))
{
MIGRAPHX_THROW("NonMaxSuppression: dynamic number of batches mismatch between "
"boxes and scores input");
}
}
else if(inputs.at(1).dynamic())
{
// scores has dynamic shape, boxes fixed shape
// check that it is only a dynamic number of classes
const auto scores_dims = inputs.at(1).dyn_dims();
const auto boxes_lens = inputs.at(0).lens();
if(not scores_dims.at(0).is_fixed() or scores_dims.at(0).max != boxes_lens.at(0))
{
MIGRAPHX_THROW("NonMaxSuppression: scores dynamic num_classes; num_batches not "
"fixed or mismatched");
}
if(not scores_dims.at(2).is_fixed() or scores_dims.at(2).max != boxes_lens.at(1))
{
MIGRAPHX_THROW("NonMaxSuppression: scores dynamic num_classes; "
"spatial_dimension not fixed or mismatches");
}
}
else
{
fixed_shape_error_check();
}
std::vector<shape::dynamic_dimension> out_lens = {};
out_lens.push_back({0, max_num_boxes, 0});
out_lens.push_back({3, 3, 0});
return {shape::int64_type, out_lens};
}
// check batch sizes
if(lens[0] != inputs.at(1).lens()[0])
else
{
MIGRAPHX_THROW(
"NonMaxSuppression: number of batches mismatch between boxes and scores input");
if(inputs.at(0).dynamic() or inputs.at(1).dynamic())
{
MIGRAPHX_THROW(
"NonMaxSuppression: dynamic input shape with use_dyn_output set to false");
}
fixed_shape_error_check();
std::vector<std::size_t> out_lens = {max_num_boxes, 3};
return {shape::int64_type, out_lens};
}
std::vector<int64_t> out_lens(2);
out_lens.at(0) = lens.at(1);
out_lens.at(1) = 3;
return {shape::int64_type, out_lens};
}
struct box
......@@ -181,13 +237,13 @@ struct nonmaxsuppression
}
template <class Output, class Boxes, class Scores>
void compute_nms(Output output,
Boxes boxes,
Scores scores,
const shape& output_shape,
std::size_t max_output_boxes_per_class,
double iou_threshold,
double score_threshold) const
std::size_t compute_nms(Output output,
Boxes boxes,
Scores scores,
const shape& max_output_shape,
std::size_t max_output_boxes_per_class,
double iou_threshold,
double score_threshold) const
{
std::fill(output.begin(), output.end(), 0);
const auto& lens = scores.get_shape().lens();
......@@ -197,7 +253,7 @@ struct nonmaxsuppression
// boxes of a class with NMS applied [score, index]
std::vector<std::pair<double, int64_t>> selected_boxes_inside_class;
std::vector<int64_t> selected_indices;
selected_boxes_inside_class.reserve(output_shape.elements());
selected_boxes_inside_class.reserve(max_output_shape.elements());
// iterate over batches and classes
shape comp_s{shape::double_type, {num_batches, num_classes}};
shape_for_each(comp_s, [&](auto idx) {
......@@ -237,11 +293,14 @@ struct nonmaxsuppression
}
});
std::copy(selected_indices.begin(), selected_indices.end(), output.begin());
return selected_indices.size() / 3;
}
argument compute(const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
// make buffer of maximum size
shape max_output_shape = {output_shape.type(), output_shape.max_lens()};
argument result{max_output_shape};
std::size_t max_output_boxes_per_class =
(args.size() > 2) ? (args.at(2).at<std::size_t>()) : 0;
......@@ -249,22 +308,29 @@ struct nonmaxsuppression
{
return result;
}
double iou_threshold = (args.size() > 3) ? (args.at(3).at<double>()) : 0.0f;
double score_threshold = (args.size() > 4) ? (args.at(4).at<double>()) : 0.0f;
double iou_threshold = (args.size() > 3) ? (args.at(3).at<double>()) : 0.0f;
double score_threshold = (args.size() > 4) ? (args.at(4).at<double>()) : 0.0f;
std::size_t num_selected = 0;
result.visit([&](auto output) {
visit_all(args[0], args[1])([&](auto boxes, auto scores) {
compute_nms(output,
boxes,
scores,
output_shape,
max_output_boxes_per_class,
iou_threshold,
score_threshold);
num_selected = compute_nms(output,
boxes,
scores,
max_output_shape,
max_output_boxes_per_class,
iou_threshold,
score_threshold);
});
});
return result;
if(use_dyn_output)
{
return result.reshape({output_shape.type(), {num_selected, 3}});
}
else
{
return result;
}
}
};
......
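A worked example of the new shape contract (numbers illustrative): compute_shape folds the maximum batch and box counts into one dynamic bound, and compute shrinks the max-size buffer down to the actual selection count.

```cpp
#include <migraphx/op/nonmaxsuppression.hpp>

migraphx::op::nonmaxsuppression nms;
nms.use_dyn_output = true;

// boxes: batch in [1, 2], 6 boxes, 4 coordinates; scores shaped to match.
migraphx::shape boxes{migraphx::shape::float_type, {{1, 2, 0}, {6, 6, 0}, {4, 4, 0}}};
migraphx::shape scores{migraphx::shape::float_type, {{1, 2, 0}, {1, 1, 0}, {6, 6, 0}}};

auto out = nms.compute_shape({boxes, scores});
// max_num_boxes = 2 * 6 = 12, so out = {int64_type, {{0, 12, 0}, {3, 3, 0}}}.
// At run time compute() fills a {12, 3} buffer and, because use_dyn_output
// is set, reshapes the result to {num_selected, 3} (e.g. {5, 3}).
```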
......@@ -21,16 +21,24 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_MIGRAPHX_SUPPORTED_SEGMENTS_HPP
#define MIGRAPHX_GUARD_MIGRAPHX_SUPPORTED_SEGMENTS_HPP
#include <migraphx/target_assignments.hpp>
#include <unordered_set>
#include <migraphx/instruction_ref.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
void target_assignments::add_assignment(instruction_ref ins, const std::string& target)
struct supported_segment
{
assignments.emplace(ins, target);
}
std::unordered_set<instruction_ref> instructions;
float metric;
};
using supported_segments = std::vector<supported_segment>;
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_MIGRAPHX_SUPPORTED_SEGMENTS_HPP
......@@ -37,8 +37,10 @@
#include <migraphx/compile_options.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/rank.hpp>
#include <migraphx/module_ref.hpp>
#include <migraphx/support_metric.hpp>
#include <migraphx/instruction_ref.hpp>
#include <migraphx/supported_segments.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -64,12 +66,12 @@ struct target
*/
context get_context() const;
/**
* @brief Check how well an instruction is supported on a target with the given metric
* @param ins Instruction to check if it's supported
* @param metric Used to define how the return value should be interpreted
* @return The value based on the chosen metric. Negative numbers mean unsupported
* @brief Get the ranges of instructions that are supported on a target
* @param module Module to check for supported instructions
* @param metric Used to define how the quality of the support should be measured
* @return the supported segments of the graph
*/
float is_supported(T&, instruction_ref ins, support_metric m) const;
supported_segments target_is_supported(T&, const_module_ref mod, support_metric metric) const;
/**
* @brief copy an argument to the current target.
*
......@@ -115,9 +117,9 @@ argument copy_from_target(T&, const argument& arg)
}
template <class T>
float target_is_supported(T&, instruction_ref, support_metric)
supported_segments target_find_supported(T&, const_module_ref, support_metric)
{
return 0;
return {};
}
#ifdef TYPE_ERASED_DECLARATION
......@@ -132,7 +134,7 @@ struct target
//
context get_context() const;
// (optional)
float is_supported(instruction_ref ins, support_metric m) const;
supported_segments find_supported(const_module_ref mod, support_metric m) const;
// (optional)
argument copy_to(const argument& input) const;
// (optional)
......@@ -224,10 +226,10 @@ struct target
return (*this).private_detail_te_get_handle().get_context();
}
float is_supported(instruction_ref ins, support_metric m) const
supported_segments find_supported(const_module_ref mod, support_metric m) const
{
assert((*this).private_detail_te_handle_mem_var);
return (*this).private_detail_te_get_handle().is_supported(ins, m);
return (*this).private_detail_te_get_handle().find_supported(mod, m);
}
argument copy_to(const argument& input) const
......@@ -261,33 +263,33 @@ struct target
virtual std::shared_ptr<private_detail_te_handle_base_type> clone() const = 0;
virtual const std::type_info& type() const = 0;
virtual std::string name() const = 0;
virtual std::string name() const = 0;
virtual std::vector<pass> get_passes(context& ctx,
const compile_options& options) const = 0;
virtual context get_context() const = 0;
virtual float is_supported(instruction_ref ins, support_metric m) const = 0;
virtual argument copy_to(const argument& input) const = 0;
virtual argument copy_from(const argument& input) const = 0;
virtual argument allocate(const shape& s) const = 0;
const compile_options& options) const = 0;
virtual context get_context() const = 0;
virtual supported_segments find_supported(const_module_ref mod, support_metric m) const = 0;
virtual argument copy_to(const argument& input) const = 0;
virtual argument copy_from(const argument& input) const = 0;
virtual argument allocate(const shape& s) const = 0;
};
template <class T>
static auto private_detail_te_default_is_supported(char,
T&& private_detail_te_self,
instruction_ref ins,
support_metric m)
-> decltype(private_detail_te_self.is_supported(ins, m))
static auto private_detail_te_default_find_supported(char,
T&& private_detail_te_self,
const_module_ref mod,
support_metric m)
-> decltype(private_detail_te_self.find_supported(mod, m))
{
return private_detail_te_self.is_supported(ins, m);
return private_detail_te_self.find_supported(mod, m);
}
template <class T>
static float private_detail_te_default_is_supported(float,
T&& private_detail_te_self,
instruction_ref ins,
support_metric m)
static supported_segments private_detail_te_default_find_supported(float,
T&& private_detail_te_self,
const_module_ref mod,
support_metric m)
{
return target_is_supported(private_detail_te_self, ins, m);
return target_find_supported(private_detail_te_self, mod, m);
}
template <class T>
......@@ -372,10 +374,11 @@ struct target
context get_context() const override { return private_detail_te_value.get_context(); }
float is_supported(instruction_ref ins, support_metric m) const override
supported_segments find_supported(const_module_ref mod, support_metric m) const override
{
return private_detail_te_default_is_supported(char(0), private_detail_te_value, ins, m);
return private_detail_te_default_find_supported(
char(0), private_detail_te_value, mod, m);
}
argument copy_to(const argument& input) const override
......
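The char/float overload pair above is the usual two-level dispatch idiom: passing char(0) prefers the SFINAE-constrained overload when T provides its own find_supported, and otherwise falls through the implicit char-to-float conversion to the free-function default. A stripped-down, self-contained illustration of the pattern (names hypothetical, independent of migraphx):

```cpp
#include <iostream>

// Overload taken when T has a member find_supported(); exact match on char.
template <class T>
auto call_find(char, T& t) -> decltype(t.find_supported())
{
    return t.find_supported();
}

// Fallback taken otherwise, reached via the char -> float conversion.
template <class T>
int call_find(float, T&)
{
    return 0;
}

struct with_member { int find_supported() { return 42; } };
struct without_member {};

int main()
{
    with_member a;
    without_member b;
    std::cout << call_find(char(0), a) << '\n'; // 42: member overload wins
    std::cout << call_find(char(0), b) << '\n'; // 0: falls back to default
}
```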
......@@ -33,10 +33,20 @@ inline namespace MIGRAPHX_INLINE_NS {
struct target_assignments
{
void add_assignment(instruction_ref ins, const std::string& target);
using iterator = std::unordered_map<instruction_ref, std::string>::const_iterator;
using value_type = std::pair<instruction_ref, std::string>;
auto begin() const { return assignments.cbegin(); }
auto end() const { return assignments.cend(); }
auto size() const { return assignments.size(); }
auto& at(instruction_ref ins) const { return assignments.at(ins); }
auto insert(iterator it, const std::pair<instruction_ref, std::string>& assignment)
{
return assignments.insert(it, assignment);
}
auto find(instruction_ref ins) const { return assignments.find(ins); }
auto begin() const { return assignments.begin(); }
auto end() const { return assignments.end(); }
private:
std::unordered_map<instruction_ref, std::string> assignments;
......
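The added members give target_assignments just enough of the unordered_map interface for standard algorithms to compose with it; a short sketch (p, some_ins, and instrs are placeholders, not names from this change):

```cpp
// Iteration and lookup (sketch; `p` is a target_assignments,
// `some_ins` an instruction_ref -- both hypothetical here).
for(const auto& [ins, target_name] : p)
    std::cout << target_name << '\n';

if(auto it = p.find(some_ins); it != p.end())
    std::cout << "assigned to " << it->second << '\n';

// std::inserter(p, p.end()) works because insert(iterator, value_type)
// is exposed, which is what the updated program.cpp code relies on.
```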
......@@ -97,6 +97,7 @@ struct onnx_parser
shape::dynamic_dimension default_dyn_dim_value = {1, 1, 0};
std::unordered_map<std::string, std::vector<std::size_t>> map_input_dims;
std::unordered_map<std::string, std::vector<shape::dynamic_dimension>> map_dyn_input_dims;
bool use_dyn_output = false;
bool skip_unknown_operators = false;
int64_t max_loop_iterations = 10;
int64_t opset_version = 13;
......
......@@ -60,8 +60,14 @@ program parse_onnx_from(const onnx_options& options, Ts&&... xs)
{
parser.default_dyn_dim_value = options.default_dyn_dim_value;
}
if(not options.map_input_dims.empty() and not options.map_dyn_input_dims.empty())
{
MIGRAPHX_THROW("PARSE_ONNX_FROM: both map_input_dims and map_dyn_input_dims non-empty, only"
"one should be used");
}
parser.skip_unknown_operators = options.skip_unknown_operators;
parser.max_loop_iterations = options.max_loop_iterations;
parser.use_dyn_output = options.use_dyn_output;
if(options.print_program_on_error)
{
......@@ -80,6 +86,7 @@ program parse_onnx_from(const onnx_options& options, Ts&&... xs)
{
parser.parse_from(std::forward<Ts>(xs)...);
}
return std::move(parser.prog);
}
......
......@@ -256,11 +256,6 @@ int64_t onnx_parser::get_opset_version(const onnx::ModelProto& model)
void onnx_parser::parse_graph(module* mod, const onnx::GraphProto& graph)
{
if(not map_input_dims.empty() and not map_dyn_input_dims.empty())
{
MIGRAPHX_THROW("PARSE_GRAPH: both map_input_dims and map_dyn_input_dims non-empty, only"
"one should be used");
}
std::unordered_map<std::string, instruction_ref> mod_insts;
for(auto&& f : graph.initializer())
{
......
......@@ -58,7 +58,6 @@ struct parse_generic_op : op_parser<parse_generic_op>
{"Log", "log"},
{"LRN", "lrn"},
{"Neg", "neg"},
{"NonMaxSuppression", "nonmaxsuppression"},
{"Reciprocal", "recip"},
{"Relu", "relu"},
{"Round", "round"},
......@@ -75,7 +74,7 @@ struct parse_generic_op : op_parser<parse_generic_op>
bool needs_contiguous(const std::string& op_name) const
{
return contains({"flatten", "gather", "nonmaxsuppression", "scatter"}, op_name);
return contains({"flatten", "gather", "scatter"}, op_name);
}
instruction_ref parse(const op_desc& opd,
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/make_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {
struct parse_nonmaxsuppression : op_parser<parse_nonmaxsuppression>
{
std::vector<op_desc> operators() const { return {{"NonMaxSuppression", "nonmaxsuppression"}}; }
instruction_ref parse(const op_desc& opd,
const onnx_parser& parser,
const onnx_parser::node_info& info,
const std::vector<instruction_ref>& args) const
{
auto op = parser.load(opd.op_name, info);
op.from_value({{"use_dyn_output", parser.use_dyn_output}});
return info.add_instruction(op, args);
}
};
} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
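For reference, the same operator can also be built directly through the operator registry; a one-line sketch using make_op (attribute values illustrative):

```cpp
auto op = migraphx::make_op("nonmaxsuppression",
                            {{"center_point_box", true}, {"use_dyn_output", true}});
```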
......@@ -37,6 +37,7 @@
#include <migraphx/output_iterator.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/marker.hpp>
#include <migraphx/supported_segments.hpp>
#include <iostream>
#include <sstream>
#include <algorithm>
......@@ -167,13 +168,37 @@ target_assignments program::get_target_assignments(const std::vector<target>& ta
target_assignments p;
const auto* mod = get_main_module();
for(auto it : iterator_for(*mod))
std::vector<std::pair<target, supported_segments>> target_subgraphs;
target_subgraphs.reserve(targets.size());
std::transform(targets.begin(),
targets.end(),
std::back_inserter(target_subgraphs),
[&](const auto& t) { return std::make_pair(t, t.find_supported(mod, m)); });
for(const auto ins : iterator_for(*mod))
{
auto t = std::max_element(
targets.begin(), targets.end(), [it, m](const target& lhs, const target& rhs) {
return lhs.is_supported(it, m) < rhs.is_supported(it, m);
});
p.add_assignment(it, t->name());
if(contains(p, ins))
{
continue;
}
for(const auto& [target, subgraph] : target_subgraphs)
{
// can't pass a structured binding into a lambda in C++17, so create a variable for it
const auto& t = target;
for(const auto& segment : subgraph)
{
const auto& instructions = segment.instructions;
if(not contains(instructions, ins))
{
continue;
}
std::transform(instructions.begin(),
instructions.end(),
std::inserter(p, p.end()),
[&](auto instr) { return std::make_pair(instr, t.name()); });
}
}
}
return p;
}
......
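One subtlety in the loop above: earlier entries in targets win, because contains(p, ins) skips already-claimed instructions and unordered_map::insert ignores keys that are already present. A self-contained illustration of the latter:

```cpp
#include <iostream>
#include <string>
#include <unordered_map>

int main()
{
    std::unordered_map<int, std::string> m;
    m.insert(m.end(), {1, "fpga"});
    m.insert(m.end(), {1, "ref"}); // no-op: key 1 is already present
    std::cout << m.at(1) << '\n';  // prints "fpga"
}
```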
......@@ -30,6 +30,7 @@
#include <migraphx/compile_options.hpp>
#include <migraphx/fpga/context.hpp>
#include <migraphx/config.hpp>
#include <migraphx/supported_segments.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -41,7 +42,7 @@ struct target
std::string name() const;
std::vector<pass> get_passes(migraphx::context& ctx, const compile_options&) const;
migraphx::context get_context() const { return context{}; }
float is_supported(instruction_ref ins, support_metric m);
supported_segments find_supported(const_module_ref mod, support_metric m) const;
argument copy_to(const argument& arg) const { return arg; }
argument copy_from(const argument& arg) const { return arg; }
......
......@@ -34,6 +34,7 @@
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/normalize_ops.hpp>
#include <migraphx/iterator_for.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -62,12 +63,17 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
argument target::allocate(const shape& s) const { return fill_argument(s, 0); }
float is_supported(instruction_ref ins, support_metric m)
supported_segments target::find_supported(const_module_ref mod, support_metric m) const
{
// for now, not using the ins and metric to return a value
(void)ins;
(void)m;
return 1.0;
supported_segment instrs;
for(const auto ins : iterator_for(*mod))
{
instrs.instructions.insert(ins);
}
instrs.metric = 1; // arbitrary value
return {instrs};
}
MIGRAPHX_REGISTER_TARGET(target);
......
......@@ -33,41 +33,43 @@
namespace migraphx {
// NOLINTNEXTLINE
#define MIGRAPHX_DEVICE_ARRAY_OP(op, binary_op) \
template <class U> \
constexpr array& operator op(const array<U, N>& x) \
{ \
array_for_each(*this, x)([](auto& sy, auto sx) { sy op sx; }); \
return *this; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
constexpr array& operator op(const U& x) \
{ \
array_for_each (*this)([&](auto& sy) { sy op x; }); \
return *this; \
} \
template <class U> \
friend constexpr auto operator binary_op(const array& x, const array<U, N>& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
array_for_each(z, x, y)([&](auto& sz, auto sx, auto sy) { sz = sx binary_op sy; }); \
return z; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
friend constexpr auto operator binary_op(const array& x, const U& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
array_for_each(z, x)([&](auto& sz, auto sx) { sz = sx binary_op y; }); \
return z; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
friend constexpr auto operator binary_op(const U& x, const array& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
array_for_each(z, y)([&](auto& sz, auto sy) { sz = x binary_op sy; }); \
return z; \
#define MIGRAPHX_DEVICE_ARRAY_OP(op, binary_op) \
template <class U> \
constexpr array& operator op(const array<U, N>& x) \
{ \
array_detail::array_for_each(*this, x)([](auto& sy, auto sx) { sy op sx; }); \
return *this; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
constexpr array& operator op(const U& x) \
{ \
array_detail::array_for_each (*this)([&](auto& sy) { sy op x; }); \
return *this; \
} \
template <class U> \
friend constexpr auto operator binary_op(const array& x, const array<U, N>& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
array_detail::array_for_each(z, x, y)( \
[&](auto& sz, auto sx, auto sy) { sz = sx binary_op sy; }); \
return z; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
friend constexpr auto operator binary_op(const array& x, const U& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
array_detail::array_for_each(z, x)([&](auto& sz, auto sx) { sz = sx binary_op y; }); \
return z; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
friend constexpr auto operator binary_op(const U& x, const array& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
array_detail::array_for_each(z, y)([&](auto& sz, auto sy) { sz = x binary_op sy; }); \
return z; \
}
namespace array_detail {
template <class T>
constexpr auto is_vectorizable()
{
......@@ -75,20 +77,15 @@ constexpr auto is_vectorizable()
}
template <class T>
constexpr auto array2vec(T x)
__device__ auto& array2vec(T& x)
{
using value_type = typename T::value_type;
constexpr auto size = decltype(x.size()){};
using type = vec<value_type, size>;
static_assert(size != 3, "Wrong size");
return __builtin_bit_cast(type, x);
}
template <class T, class U, index_int N>
constexpr void vec2array(T& x, vec<U, N> v)
{
if constexpr(not is_const<T>{})
x = __builtin_bit_cast(T, v);
if constexpr(is_const<T>{})
return reinterpret_cast<const type&>(x);
else
return reinterpret_cast<type&>(x);
}
template <class T, class... Ts>
......@@ -101,11 +98,16 @@ constexpr auto array_for_each(T& x, Ts&... xs)
(is_vectorizable<typename Ts::value_type>() or ...)) and
size <= 8 and size > 1 and (size % 2 == 0))
{
[&](auto v, auto... vs) {
f(v, vs...);
vec2array(x, v);
swallow{(vec2array(xs, vs), 0)...};
}(array2vec(x), array2vec(xs)...);
if(__builtin_is_constant_evaluated())
{
for(index_int i = 0; i < size; i++)
f(x[i], xs[i]...);
}
else
{
using vec_type = std::remove_reference_t<decltype(array2vec(x))>;
f(array2vec(x), __builtin_convertvector(array2vec(xs), vec_type)...);
}
}
else
{
......@@ -114,6 +116,7 @@ constexpr auto array_for_each(T& x, Ts&... xs)
}
};
}
} // namespace array_detail
template <class T, index_int N>
struct array
......@@ -151,18 +154,13 @@ struct array
constexpr T dot(const array& x) const
{
T result = 0;
for(index_int i = 0; i < N; i++)
result += x[i] * d[i];
return result;
auto r = x * (*this);
return r.reduce([](auto a, auto b) { return a + b; }, 0);
}
constexpr T product() const
{
T result = 1;
for(index_int i = 0; i < N; i++)
result *= d[i];
return result;
return reduce([](auto x, auto y) { return x * y; }, 1);
}
constexpr T single(index_int width = 100) const
......@@ -186,6 +184,15 @@ struct array
return result;
}
template <class F>
constexpr auto reduce(F f, T init) const
{
T result = init;
for(index_int i = 0; i < N; i++)
result = f(result, d[i]);
return result;
}
MIGRAPHX_DEVICE_ARRAY_OP(+=, +)
MIGRAPHX_DEVICE_ARRAY_OP(-=, -)
MIGRAPHX_DEVICE_ARRAY_OP(*=, *)
......
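dot and product now route through the new reduce, a left fold over the N elements; a quick sketch of the semantics (this is device-side kernel code, values illustrative):

```cpp
// reduce(f, init) == f(... f(f(init, d[0]), d[1]) ..., d[N-1])
migraphx::array<int, 4> a = {1, 2, 3, 4};
auto sum  = a.reduce([](auto x, auto y) { return x + y; }, 0); // 10
auto prod = a.product(); // 24: the same fold with init 1 and operator*
auto dot  = a.dot(a);    // 30 = 1*1 + 2*2 + 3*3 + 4*4
```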
......@@ -55,14 +55,15 @@ __device__ void generic_binary_layernorm(
return make_array(x, x * x) * vec_type<value_type>{1.0 / relements};
})(input1, input2);
auto mean_x = means[0];
auto mean_x2 = means[1];
auto mean_x = means[0];
auto mean_x2 = means[1];
auto variance = mean_x2 - (mean_x * mean_x);
r.inner([&](auto& y, auto x1, auto x2, auto... xs) {
auto x = op(x1, x2);
auto m = x - mean_x;
// m * rsqrt(mean(m ^ 2) + 1e-12)
y = compute(m * rsqrt(mean_x2 - mean_x + value_type{1e-12}), xs...);
y = compute(m * rsqrt(variance + value_type{1e-12}), xs...);
})(output, input1, input2, inputs...);
});
}
......
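The substantive change in this hunk is a numerical fix: the old expression rsqrt(mean_x2 - mean_x + 1e-12) dropped the square on the mean. The new variance variable applies the standard identity

$$\operatorname{Var}(x) = \mathbb{E}[x^2] - \mathbb{E}[x]^2 = \texttt{mean\_x2} - \texttt{mean\_x}^2,$$

so the kernel now computes $y = (x - \mathbb{E}[x]) \cdot \operatorname{rsqrt}(\operatorname{Var}(x) + 10^{-12})$, matching the layer-norm formula the inline comment describes.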
......@@ -94,8 +94,8 @@ MIGRAPHX_DPP_REDUCE(op::max, v_max)
MIGRAPHX_DPP_REDUCE(op::min, v_min)
MIGRAPHX_DPP_REDUCE(op::product, v_mul)
template <class Op, class T, class F>
__device__ auto block_reduce(index idx, Op op, T init, index_int n, F f)
template <class Op, class T, class Index, class F>
__device__ auto block_reduce(index idx, Op op, T init, Index n, F f)
{
#if __AMDGCN_WAVEFRONT_SIZE == 32
constexpr index_int lanes_per_thread = 16;
......@@ -123,8 +123,8 @@ __device__ auto block_reduce(index idx, Op op, T init, index_int n, F f)
return y;
}
#else
template <class Op, class T, class F>
__device__ auto block_reduce(index idx, Op op, T init, index_int n, F f)
template <class Op, class T, class Index, class F>
__device__ auto block_reduce(index idx, Op op, T init, Index n, F f)
{
using type = decltype(f(0));
......
......@@ -26,8 +26,9 @@
#include <migraphx/make_op.hpp>
#include <migraphx/program.hpp>
#include <migraphx/register_target.hpp>
#include <migraphx/ref/target.hpp>
#include <migraphx/fpga/target.hpp>
#include <migraphx/target_assignments.hpp>
#include <migraphx/iterator_for.hpp>
migraphx::program create_program()
{
......@@ -37,8 +38,8 @@ migraphx::program create_program()
auto x = mm->add_parameter("x", s);
auto y = mm->add_parameter("y", s);
auto z = mm->add_parameter("z", s);
auto diff = mm->add_instruction(migraphx::make_op("div"), x, y);
mm->add_instruction(migraphx::make_op("div"), diff, z);
auto diff = mm->add_instruction(migraphx::make_op("add"), x, y);
mm->add_instruction(migraphx::make_op("add"), diff, z);
return p;
}
......@@ -47,14 +48,16 @@ TEST_CASE(is_supported)
auto p = create_program();
auto targets = migraphx::get_targets();
EXPECT(!targets.empty());
auto first_target = targets[0];
auto t = migraphx::make_target(first_target);
auto t = migraphx::make_target("fpga");
const auto assignments = p.get_target_assignments({t});
for(const auto& [ins, target] : assignments)
const auto* mod = p.get_main_module();
EXPECT(mod->size() == assignments.size());
for(const auto ins : iterator_for(*mod))
{
(void)ins;
EXPECT(target == first_target);
const auto& target = assignments.at(ins);
EXPECT(target == "fpga");
}
}
......
......@@ -3589,7 +3589,7 @@ def nms_test():
st = helper.make_tensor_value_info('score_threshold', TensorProto.FLOAT,
[1])
out = helper.make_tensor_value_info('selected_indices', TensorProto.INT64,
[6, 3])
[None, 3])
node = onnx.helper.make_node('NonMaxSuppression',
inputs=[
......@@ -3603,6 +3603,108 @@ def nms_test():
return ([node], [b, s, mo, iou, st], [out])
@onnx_test
def nms_use_dyn_output_false_test():
b = helper.make_tensor_value_info('boxes', TensorProto.FLOAT, [1, 6, 4])
s = helper.make_tensor_value_info('scores', TensorProto.FLOAT, [1, 1, 6])
mo = helper.make_tensor_value_info('max_output_boxes_per_class',
TensorProto.INT64, [1])
iou = helper.make_tensor_value_info('iou_threshold', TensorProto.FLOAT,
[1])
st = helper.make_tensor_value_info('score_threshold', TensorProto.FLOAT,
[1])
out = helper.make_tensor_value_info('selected_indices', TensorProto.INT64,
[None, 3])
node = onnx.helper.make_node('NonMaxSuppression',
inputs=[
'boxes', 'scores',
'max_output_boxes_per_class',
'iou_threshold', 'score_threshold'
],
outputs=['selected_indices'],
use_dyn_output=0)
return ([node], [b, s, mo, iou, st], [out])
@onnx_test
def nms_dynamic_batch_test():
b = helper.make_tensor_value_info('boxes', TensorProto.FLOAT, [None, 6, 4])
s = helper.make_tensor_value_info('scores', TensorProto.FLOAT,
[None, 1, 6])
mo = helper.make_tensor_value_info('max_output_boxes_per_class',
TensorProto.INT64, [1])
iou = helper.make_tensor_value_info('iou_threshold', TensorProto.FLOAT,
[1])
st = helper.make_tensor_value_info('score_threshold', TensorProto.FLOAT,
[1])
out = helper.make_tensor_value_info('selected_indices', TensorProto.INT64,
[None, 3])
node = onnx.helper.make_node('NonMaxSuppression',
inputs=[
'boxes', 'scores',
'max_output_boxes_per_class',
'iou_threshold', 'score_threshold'
],
outputs=['selected_indices'],
center_point_box=1,
use_dyn_output=1)
return ([node], [b, s, mo, iou, st], [out])
@onnx_test
def nms_dynamic_boxes_test():
b = helper.make_tensor_value_info('boxes', TensorProto.FLOAT, [1, None, 4])
s = helper.make_tensor_value_info('scores', TensorProto.FLOAT,
[1, 1, None])
mo = helper.make_tensor_value_info('max_output_boxes_per_class',
TensorProto.INT64, [1])
iou = helper.make_tensor_value_info('iou_threshold', TensorProto.FLOAT,
[1])
st = helper.make_tensor_value_info('score_threshold', TensorProto.FLOAT,
[1])
out = helper.make_tensor_value_info('selected_indices', TensorProto.INT64,
[None, 3])
node = onnx.helper.make_node('NonMaxSuppression',
inputs=[
'boxes', 'scores',
'max_output_boxes_per_class',
'iou_threshold', 'score_threshold'
],
outputs=['selected_indices'])
return ([node], [b, s, mo, iou, st], [out])
@onnx_test
def nms_dynamic_classes_test():
b = helper.make_tensor_value_info('boxes', TensorProto.FLOAT, [1, 6, 4])
s = helper.make_tensor_value_info('scores', TensorProto.FLOAT,
[1, None, 6])
mo = helper.make_tensor_value_info('max_output_boxes_per_class',
TensorProto.INT64, [1])
iou = helper.make_tensor_value_info('iou_threshold', TensorProto.FLOAT,
[1])
st = helper.make_tensor_value_info('score_threshold', TensorProto.FLOAT,
[1])
out = helper.make_tensor_value_info('selected_indices', TensorProto.INT64,
[None, 3])
node = onnx.helper.make_node('NonMaxSuppression',
inputs=[
'boxes', 'scores',
'max_output_boxes_per_class',
'iou_threshold', 'score_threshold'
],
outputs=['selected_indices'])
return ([node], [b, s, mo, iou, st], [out])
@onnx_test
def not_test():
x = helper.make_tensor_value_info('0', TensorProto.INT32, [4])
......