Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
MIGraphX
Commits
7f97b8ef
Unverified
Commit
7f97b8ef
authored
Oct 07, 2022
by
Ted Themistokleous
Committed by
GitHub
Oct 07, 2022
Browse files
Merge branch 'simplify_1_mul_div_ops' into divide_by_zero_check
parents
2ba401f0
d1fed367
Changes
448
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
561 additions
and
356 deletions
+561
-356
src/rewrite_pooling.cpp
src/rewrite_pooling.cpp
+3
-3
src/rewrite_rnn.cpp
src/rewrite_rnn.cpp
+7
-7
src/shape.cpp
src/shape.cpp
+2
-2
src/simplify_algebra.cpp
src/simplify_algebra.cpp
+144
-42
src/simplify_reshapes.cpp
src/simplify_reshapes.cpp
+133
-5
src/sqlite.cpp
src/sqlite.cpp
+111
-0
src/targets/cpu/CMakeLists.txt
src/targets/cpu/CMakeLists.txt
+2
-0
src/targets/cpu/binary.cpp
src/targets/cpu/binary.cpp
+1
-1
src/targets/cpu/fmod.cpp
src/targets/cpu/fmod.cpp
+6
-6
src/targets/cpu/lowering.cpp
src/targets/cpu/lowering.cpp
+2
-0
src/targets/cpu/mod.cpp
src/targets/cpu/mod.cpp
+6
-12
src/targets/fpga/include/migraphx/fpga/target.hpp
src/targets/fpga/include/migraphx/fpga/target.hpp
+2
-1
src/targets/fpga/subgraph.cpp
src/targets/fpga/subgraph.cpp
+1
-1
src/targets/fpga/target.cpp
src/targets/fpga/target.cpp
+10
-4
src/targets/gpu/CMakeLists.txt
src/targets/gpu/CMakeLists.txt
+18
-149
src/targets/gpu/clip.cpp
src/targets/gpu/clip.cpp
+0
-46
src/targets/gpu/code_object_op.cpp
src/targets/gpu/code_object_op.cpp
+2
-1
src/targets/gpu/compile_gen.cpp
src/targets/gpu/compile_gen.cpp
+106
-24
src/targets/gpu/compile_hip_code_object.cpp
src/targets/gpu/compile_hip_code_object.cpp
+5
-5
src/targets/gpu/convert.cpp
src/targets/gpu/convert.cpp
+0
-47
No files found.
src/rewrite_pooling.cpp
View file @
7f97b8ef
...
...
@@ -47,12 +47,12 @@ void rewrite_pooling::apply(module& m) const
if
(
not
s
.
standard
())
continue
;
auto
&&
op
=
any_cast
<
op
::
pooling
>
(
ins
->
get_operator
());
if
(
!
std
::
all_of
(
op
.
padding
.
begin
(),
op
.
padding
.
end
(),
[](
auto
i
)
{
return
i
==
0
;
}))
if
(
not
std
::
all_of
(
op
.
padding
.
begin
(),
op
.
padding
.
end
(),
[](
auto
i
)
{
return
i
==
0
;
}))
continue
;
if
(
!
std
::
all_of
(
op
.
stride
.
begin
(),
op
.
stride
.
end
(),
[](
auto
i
)
{
return
i
==
1
;
}))
if
(
not
std
::
all_of
(
op
.
stride
.
begin
(),
op
.
stride
.
end
(),
[](
auto
i
)
{
return
i
==
1
;
}))
continue
;
auto
lens
=
s
.
lens
();
if
(
!
std
::
equal
(
lens
.
begin
()
+
2
,
lens
.
end
(),
op
.
lengths
.
begin
(),
op
.
lengths
.
end
()))
if
(
not
std
::
equal
(
lens
.
begin
()
+
2
,
lens
.
end
(),
op
.
lengths
.
begin
(),
op
.
lengths
.
end
()))
continue
;
std
::
int64_t
n
=
s
.
lens
()[
0
];
std
::
int64_t
c
=
s
.
lens
()[
1
];
...
...
src/rewrite_rnn.cpp
View file @
7f97b8ef
...
...
@@ -214,7 +214,7 @@ void rewrite_rnn::apply_vanilla_rnn(module& m, instruction_ref ins) const
ih
=
m
.
add_literal
(
migraphx
::
literal
{
ih_shape
,
data
});
}
if
(
!
is_forward
and
variable_seq_len
)
if
(
not
is_forward
and
variable_seq_len
)
{
args
[
0
]
=
m
.
insert_instruction
(
ins
,
make_op
(
"rnn_var_sl_shift_sequence"
),
args
[
0
],
seq_lens
);
...
...
@@ -520,7 +520,7 @@ void rewrite_rnn::apply_gru(module& m, instruction_ref ins) const
ih
=
m
.
add_literal
(
migraphx
::
literal
{
ih_shape
,
data
});
}
if
(
!
is_forward
and
variable_seq_len
)
if
(
not
is_forward
and
variable_seq_len
)
{
args
[
0
]
=
m
.
insert_instruction
(
ins
,
make_op
(
"rnn_var_sl_shift_sequence"
),
args
[
0
],
seq_lens
);
...
...
@@ -977,7 +977,7 @@ void rewrite_rnn::apply_lstm(module& m, instruction_ref ins) const
pph
=
args
[
7
];
}
if
(
!
is_forward
and
variable_seq_len
)
if
(
not
is_forward
and
variable_seq_len
)
{
args
[
0
]
=
m
.
insert_instruction
(
ins
,
make_op
(
"rnn_var_sl_shift_sequence"
),
args
[
0
],
seq_lens
);
...
...
@@ -1294,11 +1294,11 @@ bool rewrite_rnn::is_variable_seq_lens(const module& m, instruction_ref seq_lens
std
::
vector
<
int64_t
>
vec_lens
;
arg_lens
.
visit
([
&
](
auto
l
)
{
vec_lens
.
assign
(
l
.
begin
(),
l
.
end
());
});
int64_t
l
=
0
;
if
(
!
vec_lens
.
empty
())
if
(
not
vec_lens
.
empty
())
{
l
=
vec_lens
[
0
];
}
if
(
!
std
::
all_of
(
vec_lens
.
begin
(),
vec_lens
.
end
(),
[
&
](
auto
v
)
{
return
v
==
l
;
}))
if
(
not
std
::
all_of
(
vec_lens
.
begin
(),
vec_lens
.
end
(),
[
&
](
auto
v
)
{
return
v
==
l
;
}))
{
is_var_lens
=
true
;
}
...
...
@@ -1318,7 +1318,7 @@ rewrite_rnn::get_seq_len(const module& m, instruction_ref input, instruction_ref
bool
is_var_lens
=
is_variable_seq_lens
(
m
,
seq_lens
);
auto
input_shape
=
input
->
get_shape
();
auto
length
=
input_shape
.
lens
()[
0
];
if
(
!
is_var_lens
and
seq_lens
!=
m
.
end
())
if
(
not
is_var_lens
and
seq_lens
!=
m
.
end
())
{
auto
arg_len
=
seq_lens
->
eval
();
std
::
vector
<
std
::
size_t
>
vec_lens
;
...
...
@@ -1387,7 +1387,7 @@ void rewrite_rnn::replace_last_cell_output(module& m,
if
(
variable_seq_len
)
{
if
(
!
ins_outputs
.
empty
())
if
(
not
ins_outputs
.
empty
())
{
cell_outputs
=
m
.
insert_instruction
(
std
::
next
(
ins
),
...
...
src/shape.cpp
View file @
7f97b8ef
...
...
@@ -477,7 +477,7 @@ bool operator==(const shape::dynamic_dimension& x, const shape::dynamic_dimensio
bool
operator
!=
(
const
shape
::
dynamic_dimension
&
x
,
const
shape
::
dynamic_dimension
&
y
)
{
return
!
(
x
==
y
);
return
not
(
x
==
y
);
}
std
::
ostream
&
operator
<<
(
std
::
ostream
&
os
,
const
shape
::
dynamic_dimension
&
x
)
{
...
...
@@ -497,7 +497,7 @@ bool operator==(const shape& x, const shape& y)
x
.
strides
()
==
y
.
strides
()
and
x
.
sub_shapes
()
==
y
.
sub_shapes
());
}
bool
operator
!=
(
const
shape
&
x
,
const
shape
&
y
)
{
return
!
(
x
==
y
);
}
bool
operator
!=
(
const
shape
&
x
,
const
shape
&
y
)
{
return
not
(
x
==
y
);
}
std
::
ostream
&
operator
<<
(
std
::
ostream
&
os
,
const
shape
&
x
)
{
...
...
src/simplify_algebra.cpp
View file @
7f97b8ef
...
...
@@ -57,12 +57,14 @@ auto conv_const_weights()
auto
reduction
()
{
return
match
::
name_contains
(
"reduce"
);
}
// conv(x, w) * a => conv(x, a * w)
struct
find_mul_conv
{
auto
matcher
()
const
{
return
match
::
name
(
"mul"
)(
match
::
either_arg
(
0
,
1
)(
conv_const_weights
().
bind
(
"conv"
),
match
::
name
(
"broadcast"
).
bind
(
"a"
)));
return
match
::
name
(
"mul"
)(
match
::
either_arg
(
0
,
1
)(
conv_const_weights
().
bind
(
"conv"
),
match
::
name
(
"broadcast"
,
"multibroadcast"
).
bind
(
"a"
)));
}
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
...
...
@@ -72,14 +74,35 @@ struct find_mul_conv
auto
a_ins
=
r
.
instructions
[
"a"
];
auto
w_ins
=
r
.
instructions
[
"w"
];
auto
broadcast_op
=
any_cast
<
op
::
broadcast
>
(
a_ins
->
get_operator
());
if
(
broadcast_op
.
axis
!=
1
)
const
auto
&
a_input_lens
=
a_ins
->
inputs
().
front
()
->
get_shape
().
lens
();
std
::
size_t
num_not_one_dims
=
std
::
count_if
(
a_input_lens
.
cbegin
(),
a_input_lens
.
cend
(),
[](
auto
dim
)
{
return
dim
!=
1
;
});
if
(
num_not_one_dims
>
1
)
return
;
// check broadcasted along channels
const
auto
&
a_lens
=
a_ins
->
get_shape
().
lens
();
const
auto
&
a_strides
=
a_ins
->
get_shape
().
strides
();
auto
is_broadcasted_axis
=
[](
auto
len
,
auto
stride
)
{
return
len
==
1
or
stride
==
0
;
};
if
(
a_strides
.
at
(
1
)
!=
1
)
return
;
if
(
not
is_broadcasted_axis
(
a_lens
.
front
(),
a_strides
.
front
()))
return
;
if
(
not
std
::
equal
(
a_lens
.
begin
()
+
2
,
a_lens
.
end
(),
a_strides
.
begin
()
+
2
,
a_strides
.
end
(),
is_broadcasted_axis
))
return
;
auto
sq
=
m
.
insert_instruction
(
ins
,
make_op
(
"squeeze"
),
a_ins
->
inputs
().
front
());
auto
new_a
=
m
.
insert_instruction
(
ins
,
make_op
(
"broadcast"
,
{{
"axis"
,
0
},
{
"out_lens"
,
w_ins
->
get_shape
().
lens
()}}),
a_ins
->
inputs
().
front
());
ins
,
make_op
(
"broadcast"
,
{{
"axis"
,
0
},
{
"out_lens"
,
w_ins
->
get_shape
().
lens
()}}),
sq
);
auto
new_mul
=
m
.
insert_instruction
(
ins
,
make_op
(
"mul"
),
new_a
,
w_ins
);
auto
new_conv
=
m
.
insert_instruction
(
ins
,
conv_ins
->
get_operator
(),
conv_ins
->
inputs
().
front
(),
new_mul
);
...
...
@@ -208,6 +231,42 @@ struct find_mul_add
}
};
struct
find_dot_add
{
auto
matcher
()
const
{
return
match
::
name
(
"dot"
)(
match
::
either_arg
(
0
,
1
)(
match
::
name
(
"add"
)(
match
::
either_arg
(
0
,
1
)(
match
::
any
().
bind
(
"x"
),
match
::
any_of
(
match
::
is_constant
()).
bind
(
"b"
)),
match
::
none_of
(
match
::
args
(
match
::
is_constant
(),
match
::
is_constant
())),
match
::
used_once
()),
match
::
is_constant
().
bind
(
"a"
)));
}
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
ins
=
r
.
result
;
auto
a_ins
=
r
.
instructions
[
"a"
];
auto
b_ins
=
r
.
instructions
[
"b"
];
auto
x_ins
=
r
.
instructions
[
"x"
];
assert
(
x_ins
!=
b_ins
);
const
bool
flipped
=
a_ins
==
ins
->
inputs
().
back
();
auto
insert_dot
=
[
&
](
auto
x
,
auto
y
)
{
if
(
flipped
)
return
m
.
insert_instruction
(
ins
,
make_op
(
"dot"
),
y
,
x
);
else
return
m
.
insert_instruction
(
ins
,
make_op
(
"dot"
),
x
,
y
);
};
auto
ax_ins
=
insert_dot
(
a_ins
,
x_ins
);
auto
ab_ins
=
insert_dot
(
a_ins
,
b_ins
);
m
.
replace_instruction
(
ins
,
make_op
(
"add"
),
ax_ins
,
ab_ins
);
}
};
struct
find_add_lit_broadcast
{
auto
matcher
()
const
...
...
@@ -267,28 +326,26 @@ struct find_double_add_lit_broadcast
struct
find_inner_broadcast
{
auto
matcher
()
const
{
return
pointwise
(
match
::
nargs
(
2
),
match
::
args
(
match
::
name
(
"broadcast"
).
bind
(
"x"
),
match
::
name
(
"broadcast"
).
bind
(
"y"
)));
}
auto
matcher
()
const
{
return
pointwise
(
match
::
all_of
[
match
::
inputs
()](
match
::
broadcast
()));
}
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
ins
=
r
.
result
;
auto
x_ins
=
r
.
instructions
[
"x"
];
auto
y_ins
=
r
.
instructions
[
"y"
];
auto
xbroadcast
=
any_cast
<
op
::
broadcast
>
(
x_ins
->
get_operator
());
auto
ybroadcast
=
any_cast
<
op
::
broadcast
>
(
y_ins
->
get_operator
());
if
(
xbroadcast
.
axis
!=
ybroadcast
.
axis
)
auto
ins
=
r
.
result
;
auto
broadcasts
=
ins
->
inputs
();
if
(
broadcasts
.
empty
())
return
;
std
::
vector
<
instruction_ref
>
inputs
;
std
::
transform
(
broadcasts
.
begin
(),
broadcasts
.
end
(),
std
::
back_inserter
(
inputs
),
[](
auto
i
)
{
return
i
->
inputs
().
front
();
});
if
(
std
::
any_of
(
inputs
.
begin
(),
inputs
.
end
(),
[
&
](
auto
i
)
{
return
i
->
get_shape
()
!=
inputs
.
front
()
->
get_shape
();
}))
return
;
auto
op
=
m
.
insert_instruction
(
ins
,
ins
->
get_operator
(),
x_ins
->
inputs
().
front
(),
y_ins
->
inputs
().
front
());
m
.
replace_instruction
(
ins
,
xbroadcast
,
op
);
auto
op
=
m
.
insert_instruction
(
ins
,
ins
->
get_operator
(),
inputs
);
m
.
replace_instruction
(
ins
,
broadcasts
.
front
()
->
get_operator
(),
op
);
}
};
...
...
@@ -378,6 +435,24 @@ struct find_concat_op
}
};
void
move_instructions_back
(
module
&
m
,
instruction_ref
pos
,
std
::
vector
<
instruction_ref
>
inss
)
{
auto
start
=
range
(
m
.
begin
(),
pos
);
for
(
auto
ins
:
iterator_for
(
start
))
{
auto
it
=
std
::
find
(
inss
.
begin
(),
inss
.
end
(),
ins
);
if
(
it
!=
inss
.
end
())
inss
.
erase
(
it
);
}
for
(
auto
ins
:
inss
)
{
if
(
not
m
.
has_instruction
(
ins
))
continue
;
move_instructions_back
(
m
,
pos
,
ins
->
inputs
());
m
.
move_instruction
(
ins
,
pos
);
}
}
std
::
vector
<
instruction_ref
>
get_splits
(
instruction_ref
ins
)
{
std
::
vector
<
instruction_ref
>
result
;
...
...
@@ -416,8 +491,9 @@ struct find_splits
{
auto
matcher
()
const
{
return
match
::
any
(
match
::
any_of
[
match
::
outputs
()](
match
::
name
(
"slice"
)(
match
::
any_of
[
match
::
outputs
()](
match
::
pointwise
(),
reduction
()))));
return
match
::
any
(
match
::
any_of
[
match
::
outputs
()](
match
::
name
(
"slice"
)(
match
::
any_of
[
match
::
outputs
()](
match
::
pointwise
(
match
::
any_of
(
match
::
nargs
(
1
),
match
::
nargs
(
2
))),
reduction
()))));
}
static
bool
is_dependent
(
const
module
&
m
,
instruction_ref
ins1
,
instruction_ref
ins2
)
...
...
@@ -552,8 +628,7 @@ struct find_splits
}))
return
;
for
(
auto
data
:
data_args
)
m
.
move_instructions
(
data
,
ins
);
move_instructions_back
(
m
,
ins
,
data_args
);
auto
slice_op
=
any_cast
<
op
::
slice
>
(
splits
.
front
()
->
get_operator
());
assert
(
not
slice_op
.
axes
.
empty
());
...
...
@@ -580,10 +655,9 @@ struct find_splits
auto
outputs
=
i
->
outputs
();
for
(
auto
output
:
outputs
)
{
if
(
not
contains
({
"reshape"
,
"squeeze"
,
"unsqueeze"
},
output
->
name
()
)
)
if
(
output
->
name
()
!=
"reshape"
)
continue
;
auto
x
=
m
.
insert_instruction
(
output
,
make_op
(
"contiguous"
),
output
->
inputs
());
auto
x
=
m
.
insert_instruction
(
output
,
make_op
(
"contiguous"
),
i
);
m
.
replace_instruction
(
output
,
output
->
get_operator
(),
x
);
}
...
...
@@ -753,7 +827,7 @@ MIGRAPHX_PRED_MATCHER(horiz_conv_dot, instruction_ref ins)
};
auto
dots
=
std
::
count_if
(
ins
->
outputs
().
begin
(),
ins
->
outputs
().
end
(),
pred
(
"dot"
));
auto
convs
=
std
::
count_if
(
ins
->
outputs
().
begin
(),
ins
->
outputs
().
end
(),
pred
(
"convolution"
));
return
!
(
dots
<
2
and
convs
<
2
);
return
not
(
dots
<
2
and
convs
<
2
);
}
struct
find_conv_dot_horiz_fusion
...
...
@@ -773,7 +847,7 @@ struct find_conv_dot_horiz_fusion
auto
y
=
j
->
inputs
()[
1
]
->
get_shape
().
lens
();
if
(
x
.
size
()
!=
y
.
size
())
return
false
;
// Check that non-ax
is
es match
// Check that non-axes match
int
axis
=
1
;
if
(
i
->
name
()
==
"dot"
)
{
...
...
@@ -807,15 +881,23 @@ struct find_conv_dot_horiz_fusion
concat_axis
=
axis
;
}
for
(
auto
arg
:
args
)
m
.
move_instructions
(
arg
,
input
);
// TODO: Check if axises match
move_instructions_back
(
m
,
input
,
args
);
// TODO: Check if axes match
auto
concat
=
m
.
insert_instruction
(
input
,
make_op
(
"concat"
,
{{
"axis"
,
concat_axis
}}),
args
);
auto
fused
=
m
.
insert_instruction
(
std
::
next
(
input
),
op
,
input
,
concat
);
int64_t
offset
=
0
;
for
(
auto
arg
:
range
(
start
,
last
))
{
auto
outputs
=
arg
->
outputs
();
for
(
auto
output
:
outputs
)
{
if
(
output
->
name
()
!=
"reshape"
)
continue
;
auto
x
=
m
.
insert_instruction
(
output
,
make_op
(
"contiguous"
),
arg
);
m
.
replace_instruction
(
output
,
output
->
get_operator
(),
x
);
}
int64_t
len
=
arg
->
get_shape
().
lens
()[
axis
];
m
.
replace_instruction
(
arg
,
...
...
@@ -1010,7 +1092,7 @@ struct find_split_reshape
// all outputs are reshape and of the same shape
auto
dims
=
any_cast
<
op
::
reshape
>
(
rsp
->
get_operator
()).
dims
;
if
(
!
same_ops
(
vec_rsp
))
if
(
not
same_ops
(
vec_rsp
))
{
return
;
}
...
...
@@ -1026,23 +1108,42 @@ struct find_split_reshape
auto
rsp_lens
=
rsp
->
get_shape
().
lens
();
auto
rsp_strides
=
rsp
->
get_shape
().
strides
();
rsp_strides
.
insert
(
rsp_strides
.
begin
(),
rsp_strides
[
0
]
*
rsp_lens
[
0
]);
auto
ait
=
std
::
find
(
rsp_strides
.
begin
(),
rsp_strides
.
end
(),
slc_dim_size
);
auto
ait
=
std
::
find
(
rsp_strides
.
begin
(),
rsp_strides
.
end
(),
slc_dim_size
);
int
rsp_axis
=
-
1
;
if
(
ait
==
rsp_strides
.
end
())
{
return
;
}
int
rsp_axis
=
std
::
distance
(
rsp_strides
.
begin
(),
ait
);
else
if
(
ait
==
rsp_strides
.
end
()
-
1
)
{
// edge case
// slice_dim == 1, in that case it could match with last stride of 1.
// it should accumulate lengths from last dim in that case. discount 1 to avoid going
// out of bounds.
assert
(
slc_dim_size
==
1
);
rsp_axis
=
std
::
distance
(
rsp_strides
.
begin
(),
ait
)
-
1
;
}
else
{
rsp_axis
=
std
::
distance
(
rsp_strides
.
begin
(),
ait
);
}
// calculate reshape output shape
std
::
vector
<
int64_t
>
vec_dims
(
vec_rsp
.
size
());
std
::
transform
(
vec_rsp
.
begin
(),
vec_rsp
.
end
(),
vec_dims
.
begin
(),
[
&
](
auto
is
)
{
return
is
->
get_shape
().
lens
()[
rsp_axis
];
});
std
::
vector
<
int64_t
>
rsp_out_lens
(
rsp_lens
.
begin
(),
rsp_lens
.
end
());
rsp_out_lens
[
rsp_axis
]
=
std
::
accumulate
(
vec_dims
.
begin
(),
vec_dims
.
end
(),
std
::
int64_t
{
0
});
// insert the reshape instruction
// insert the reshape instruction and add contiguous if needed
if
(
not
input
->
get_shape
().
standard
())
{
input
=
m
.
insert_instruction
(
std
::
next
(
input
),
make_op
(
"contiguous"
),
input
);
}
auto
rsp_ins
=
m
.
insert_instruction
(
std
::
next
(
input
),
make_op
(
"reshape"
,
{{
"dims"
,
rsp_out_lens
}}),
input
);
...
...
@@ -1089,7 +1190,7 @@ struct find_split_transpose
// all transpose are the same
auto
perm
=
any_cast
<
op
::
transpose
>
(
trans
->
get_operator
()).
dims
;
if
(
!
same_ops
(
vec_trans
))
if
(
not
same_ops
(
vec_trans
))
{
return
;
}
...
...
@@ -1136,6 +1237,7 @@ void simplify_algebra::apply(module& m) const
find_neg_unit_ops
{},
find_zero_ops
{},
find_zero_div_const
{},
find_dot_add
{},
find_div_const
{},
find_sub_const
{},
find_rsqrt
{},
...
...
src/simplify_reshapes.cpp
View file @
7f97b8ef
...
...
@@ -99,7 +99,7 @@ struct find_reshaper
std
::
vector
<
instruction_ref
>
reshapes
{
ins
};
while
(
is_reshaper
(
reshapes
.
back
()))
{
assert
(
!
reshapes
.
back
()
->
inputs
().
empty
());
assert
(
not
reshapes
.
back
()
->
inputs
().
empty
());
assert
(
m
.
has_instruction
(
reshapes
.
back
()
->
inputs
().
front
()));
auto
input
=
reshapes
.
back
()
->
inputs
().
front
();
reshapes
.
push_back
(
input
);
...
...
@@ -151,8 +151,11 @@ struct find_transpose
{
auto
matcher
()
const
{
return
match
::
name
(
"transpose"
)(
match
::
none_of
(
match
::
skip_output
(
match
::
name
(
"contiguous"
))(
match
::
name
(
"transpose"
))));
auto
output_not_transpose
=
match
::
none_of
(
match
::
skip_output
(
match
::
name
(
"contiguous"
))(
match
::
name
(
"transpose"
)));
auto
input_has_transpose
=
match
::
args
(
match
::
skip
(
match
::
name
(
"contiguous"
))(
match
::
name
(
"transpose"
)));
return
match
::
name
(
"transpose"
)(
output_not_transpose
,
input_has_transpose
);
}
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
mr
)
const
...
...
@@ -268,6 +271,44 @@ struct find_nested_slice
}
};
struct
find_concat_multibroadcasts
{
auto
matcher
()
const
{
return
match
::
name
(
"concat"
)(
match
::
all_of
[
match
::
inputs
()](
match
::
name
(
"multibroadcast"
)));
}
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
mr
)
const
{
auto
ins
=
mr
.
result
;
auto
op
=
any_cast
<
op
::
concat
>
(
ins
->
get_operator
());
auto
out_lens
=
ins
->
get_shape
().
lens
();
auto
inputs
=
ins
->
inputs
();
auto
in_strides
=
inputs
.
front
()
->
get_shape
().
strides
();
// Only apply when concat axis is not a broadcasted dimension
if
(
std
::
any_of
(
inputs
.
begin
(),
inputs
.
end
(),
[
&
](
auto
i
)
{
return
i
->
get_shape
().
strides
()[
op
.
axis
]
==
0
;
}))
{
return
;
}
// Use inputs of multibroadcast ops as inputs to new concat op
std
::
transform
(
inputs
.
begin
(),
inputs
.
end
(),
inputs
.
begin
(),
[](
auto
i
)
{
return
i
->
inputs
().
front
();
});
// Reduce axis by number of leading broadcasted dimensions
if
(
inputs
.
front
()
->
get_shape
().
lens
().
size
()
<
out_lens
.
size
())
op
.
axis
-=
std
::
count
(
in_strides
.
begin
(),
in_strides
.
begin
()
+
op
.
axis
,
0
);
auto
concat
=
m
.
insert_instruction
(
ins
,
op
,
inputs
);
m
.
replace_instruction
(
ins
,
migraphx
::
make_op
(
"multibroadcast"
,
{{
"out_lens"
,
out_lens
}}),
concat
);
}
};
struct
find_concat_transpose
{
auto
matcher
()
const
...
...
@@ -285,7 +326,7 @@ struct find_concat_transpose
auto
permutation
=
find_permutation
(
s
);
// permutation should be the same for all inputs
if
(
!
std
::
all_of
(
trans_inputs
.
begin
(),
trans_inputs
.
end
(),
[
&
](
auto
in
)
{
if
(
not
std
::
all_of
(
trans_inputs
.
begin
(),
trans_inputs
.
end
(),
[
&
](
auto
in
)
{
return
(
find_permutation
(
in
->
get_shape
())
==
permutation
);
}))
{
...
...
@@ -664,9 +705,94 @@ struct find_slice_transpose
}
};
struct
find_transpose_slice
{
auto
matcher
()
const
{
return
match
::
name
(
"transpose"
)(
match
::
all_of
[
match
::
outputs
()](
match
::
name
(
"slice"
)));
}
static
std
::
vector
<
int64_t
>
slice_distance
(
const
op
::
slice
&
op
)
{
assert
(
op
.
starts
.
size
()
==
op
.
ends
.
size
());
std
::
vector
<
int64_t
>
result
(
op
.
starts
.
size
());
std
::
transform
(
op
.
ends
.
begin
(),
op
.
ends
.
end
(),
op
.
starts
.
begin
(),
result
.
begin
(),
std
::
minus
<>
{});
return
result
;
}
void
apply
(
module
&
m
,
const
match
::
matcher_result
&
r
)
const
{
auto
ins
=
r
.
result
;
auto
slices
=
ins
->
outputs
();
if
(
slices
.
empty
())
return
;
auto
slice
=
any_cast
<
op
::
slice
>
(
slices
.
front
()
->
get_operator
());
auto
sdistance
=
slice_distance
(
slice
);
// Check all distances and axes are the same
if
(
std
::
any_of
(
slices
.
begin
(),
slices
.
end
(),
[
&
](
auto
sins
)
{
auto
s
=
any_cast
<
op
::
slice
>
(
sins
->
get_operator
());
return
s
.
axes
!=
slice
.
axes
or
slice_distance
(
s
)
!=
sdistance
;
}))
return
;
// Check distances are divisible by lens of corresponding axes
auto
mod_by_distance
=
[
&
](
const
auto
&
v
,
auto
f
)
{
return
std
::
inner_product
(
v
.
begin
(),
v
.
end
(),
sdistance
.
begin
(),
0
,
std
::
plus
<>
{},
[
&
](
auto
x
,
auto
d
)
->
uint64_t
{
if
(
d
==
0
)
return
1
;
return
f
(
x
)
%
d
;
});
};
if
(
mod_by_distance
(
slice
.
axes
,
[
&
](
auto
x
)
{
return
ins
->
get_shape
().
lens
()[
x
];
})
!=
0
or
mod_by_distance
(
slice
.
starts
,
id
{})
!=
0
or
mod_by_distance
(
slice
.
ends
,
id
{})
!=
0
)
return
;
// TODO: Handle multiple axes
if
(
sdistance
.
size
()
!=
1
)
return
;
auto
axis
=
slice
.
axes
.
front
();
// Skip if axis would be packed
if
(
std
::
all_of
(
ins
->
get_shape
().
lens
().
begin
(),
ins
->
get_shape
().
lens
().
begin
()
+
axis
,
[](
auto
x
)
{
return
x
==
1
;
}))
return
;
// Compute axis before transpose to use for unsqueeze
auto
perm
=
ins
->
get_operator
().
to_value
()[
"permutation"
].
to_vector
<
int64_t
>
();
auto
preaxis
=
std
::
find
(
perm
.
begin
(),
perm
.
end
(),
axis
)
-
perm
.
begin
();
// Make unsqeeze
auto
unsqueeze
=
m
.
insert_instruction
(
ins
,
make_op
(
"unsqueeze"
,
{{
"axes"
,
{
preaxis
}},
{
"steps"
,
sdistance
}}),
ins
->
inputs
());
// Make transpose
std
::
transform
(
perm
.
begin
(),
perm
.
end
(),
perm
.
begin
(),
[
&
](
auto
i
)
{
if
(
i
>
preaxis
)
return
i
+
1
;
return
i
;
});
perm
.
insert
(
perm
.
begin
(),
preaxis
+
1
);
auto
transpose
=
m
.
insert_instruction
(
ins
,
make_op
(
"transpose"
,
{{
"permutation"
,
perm
}}),
unsqueeze
);
// Slice and squeeze
for
(
auto
s
:
slices
)
{
auto
op
=
any_cast
<
op
::
slice
>
(
s
->
get_operator
());
op
.
axes
=
{
0
};
op
.
starts
=
{
op
.
starts
.
front
()
/
sdistance
.
front
()};
op
.
ends
=
{
op
.
ends
.
front
()
/
sdistance
.
front
()};
auto
slice_ins
=
m
.
insert_instruction
(
ins
,
op
,
transpose
);
auto
squeeze
=
m
.
insert_instruction
(
ins
,
make_op
(
"squeeze"
,
{{
"axes"
,
{
0
}}}),
slice_ins
);
m
.
replace_instruction
(
s
,
squeeze
);
}
}
};
void
simplify_reshapes
::
apply
(
module
&
m
)
const
{
for
(
int
i
=
0
;
i
<
2
;
i
++
)
for
(
int
i
=
0
;
i
<
4
;
i
++
)
{
match
::
find_matches
(
m
,
find_where_op
{},
...
...
@@ -676,9 +802,11 @@ void simplify_reshapes::apply(module& m) const
find_reshaper
{},
find_transpose
{},
find_concat_transpose
{},
find_concat_multibroadcasts
{},
find_nested_convert
{},
find_nested_slice
{},
find_nested_concat
{},
find_transpose_slice
{},
find_slice_transpose
{},
find_transpose_contiguous_reshaper_unary
{});
dead_code_elimination
{}.
apply
(
m
);
...
...
src/
targets/gpu/device/wher
e.cpp
→
src/
sqlit
e.cpp
View file @
7f97b8ef
...
...
@@ -21,42 +21,91 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/device/where.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/sqlite.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/errors.hpp>
#include <sqlite3.h>
#include <algorithm>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
namespace
device
{
template
<
class
Shape
>
constexpr
auto
get_rank
(
const
Shape
&
)
{
return
decltype
(
typename
Shape
::
hip_index
{}.
size
()){};
}
using
sqlite3_ptr
=
MIGRAPHX_MANAGE_PTR
(
sqlite3
*
,
sqlite3_close
);
void
where
(
hipStream_t
stream
,
const
argument
&
result
,
const
argument
&
arg0
,
const
argument
&
arg1
,
const
argument
&
arg2
)
struct
sqlite_impl
{
hip_visit_all
(
result
,
arg1
,
arg2
)([
&
](
auto
output
,
auto
x
,
auto
y
)
{
hip_visit_all
(
arg0
)([
&
](
auto
cond
)
{
if
constexpr
(
get_rank
(
cond
.
get_shape
())
==
get_rank
(
output
.
get_shape
()))
sqlite3
*
get
()
const
{
return
ptr
.
get
();
}
void
open
(
const
fs
::
path
&
p
,
int
flags
)
{
sqlite3
*
ptr_tmp
=
nullptr
;
int
rc
=
sqlite3_open_v2
(
p
.
string
().
c_str
(),
&
ptr_tmp
,
flags
,
nullptr
);
ptr
=
sqlite3_ptr
{
ptr_tmp
};
if
(
rc
!=
0
)
MIGRAPHX_THROW
(
"error opening "
+
p
.
string
()
+
": "
+
error_message
());
}
template
<
class
F
>
void
exec
(
const
char
*
sql
,
F
f
)
{
auto
callback
=
[](
void
*
obj
,
auto
...
xs
)
->
int
{
try
{
gs_launch
(
stream
,
arg1
.
get_shape
().
elements
())([
=
](
auto
idx
)
__device__
{
auto
i
=
output
.
get_shape
().
multi
(
idx
);
output
[
i
]
=
cond
[
i
]
?
x
[
i
]
:
y
[
i
];
});
const
auto
*
g
=
static_cast
<
const
F
*>
(
obj
);
(
*
g
)(
xs
...);
return
0
;
}
});
catch
(...)
{
return
-
1
;
}
};
int
rc
=
sqlite3_exec
(
get
(),
sql
,
callback
,
&
f
,
nullptr
);
if
(
rc
!=
0
)
MIGRAPHX_THROW
(
error_message
());
}
std
::
string
error_message
()
const
{
std
::
string
msg
=
"sqlite3: "
;
return
msg
+
sqlite3_errmsg
(
get
());
}
sqlite3_ptr
ptr
;
};
sqlite
sqlite
::
read
(
const
fs
::
path
&
p
)
{
sqlite
r
;
r
.
impl
=
std
::
make_shared
<
sqlite_impl
>
();
r
.
impl
->
open
(
p
,
SQLITE_OPEN_READONLY
);
return
r
;
}
sqlite
sqlite
::
write
(
const
fs
::
path
&
p
)
{
sqlite
r
;
r
.
impl
=
std
::
make_shared
<
sqlite_impl
>
();
// Using '+' instead of bitwise '|' to avoid compilation warning
r
.
impl
->
open
(
p
,
SQLITE_OPEN_READWRITE
+
SQLITE_OPEN_CREATE
);
return
r
;
}
std
::
vector
<
std
::
unordered_map
<
std
::
string
,
std
::
string
>>
sqlite
::
execute
(
const
std
::
string
&
s
)
{
std
::
vector
<
std
::
unordered_map
<
std
::
string
,
std
::
string
>>
result
;
impl
->
exec
(
s
.
c_str
(),
[
&
](
int
n
,
char
**
texts
,
char
**
names
)
{
std
::
unordered_map
<
std
::
string
,
std
::
string
>
row
;
row
.
reserve
(
n
);
std
::
transform
(
names
,
names
+
n
,
texts
,
std
::
inserter
(
row
,
row
.
begin
()),
[
&
](
const
char
*
name
,
const
char
*
text
)
{
return
std
::
make_pair
(
name
,
text
);
});
result
.
push_back
(
row
);
});
return
result
;
}
}
// namespace device
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
src/targets/cpu/CMakeLists.txt
View file @
7f97b8ef
...
...
@@ -35,6 +35,7 @@ add_library(migraphx_cpu
dnnl.cpp
eltwise.cpp
erf.cpp
fmod.cpp
fuse_ops.cpp
gather.cpp
gemm.cpp
...
...
@@ -42,6 +43,7 @@ add_library(migraphx_cpu
logsoftmax.cpp
lowering.cpp
lrn.cpp
mod.cpp
preallocate.cpp
pooling.cpp
reduction.cpp
...
...
src/targets/cpu/binary.cpp
View file @
7f97b8ef
...
...
@@ -49,7 +49,7 @@ struct dnnl_binary : dnnl_op<dnnl_binary, dnnl::binary>
auto
s0
=
inputs
.
at
(
0
);
auto
s1
=
inputs
.
at
(
1
);
auto
r
=
s0
;
if
(
s0
!=
s1
or
!
s0
.
packed
())
if
(
s0
!=
s1
or
not
s0
.
packed
())
{
r
=
shape
{
s0
.
type
(),
s0
.
lens
()};
}
...
...
src/target
_assignments
.cpp
→
src/target
s/cpu/fmod
.cpp
View file @
7f97b8ef
...
...
@@ -21,16 +21,16 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/target_assignments.hpp>
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
#include <migraphx/op/fmod.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
cpu
{
void
target_assignments
::
add_assignment
(
instruction_ref
ins
,
const
std
::
string
&
target
)
{
assignments
.
emplace
(
ins
,
target
);
}
template
struct
cpu_binary
<
op
::
fmod
>;
}
// namespace cpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
src/targets/cpu/lowering.cpp
View file @
7f97b8ef
...
...
@@ -43,6 +43,8 @@
#include <migraphx/op/argmax.hpp>
#include <migraphx/op/argmin.hpp>
#include <migraphx/op/rnn_var_sl_last_output.hpp>
#include <migraphx/op/mod.hpp>
#include <migraphx/op/fmod.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/par_dfor.hpp>
...
...
src/targets/
gpu/include/migraphx/gpu/cos.h
pp
→
src/targets/
cpu/mod.c
pp
View file @
7f97b8ef
...
...
@@ -21,22 +21,16 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_COS_HPP
#define MIGRAPHX_GUARD_RTGLIB_COS_HPP
#include <migraphx/gpu/oper.hpp>
#include <migraphx/gpu/device/cos.hpp>
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
#include <migraphx/op/mod.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
g
pu
{
namespace
c
pu
{
struct
hip_cos
:
unary_device
<
hip_cos
,
device
::
cos
>
{
};
template
struct
cpu_binary
<
op
::
mod
>;
}
// namespace
g
pu
}
// namespace
c
pu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
#endif
src/targets/fpga/include/migraphx/fpga/target.hpp
View file @
7f97b8ef
...
...
@@ -30,6 +30,7 @@
#include <migraphx/compile_options.hpp>
#include <migraphx/fpga/context.hpp>
#include <migraphx/config.hpp>
#include <migraphx/supported_segments.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
...
...
@@ -41,7 +42,7 @@ struct target
std
::
string
name
()
const
;
std
::
vector
<
pass
>
get_passes
(
migraphx
::
context
&
ctx
,
const
compile_options
&
)
const
;
migraphx
::
context
get_context
()
const
{
return
context
{};
}
float
is
_supported
(
i
nst
ruction
_ref
ins
,
support_metric
m
);
supported_segments
find
_supported
(
co
nst
_module
_ref
mod
,
support_metric
m
)
const
;
argument
copy_to
(
const
argument
&
arg
)
const
{
return
arg
;
}
argument
copy_from
(
const
argument
&
arg
)
const
{
return
arg
;
}
...
...
src/targets/fpga/subgraph.cpp
View file @
7f97b8ef
...
...
@@ -95,7 +95,7 @@ void subgraph::apply(module_pass_manager& mpm) const
for
(
auto
it
:
iterator_for
(
mod
))
{
// assuming we want all the params/literals as inputs to the FPGA submodule
if
(
migraphx
::
starts_with
(
it
->
name
(),
"@param"
)
||
if
(
migraphx
::
starts_with
(
it
->
name
(),
"@param"
)
or
migraphx
::
starts_with
(
it
->
name
(),
"@literal"
))
{
literal_inputs
.
push_back
(
it
);
...
...
src/targets/fpga/target.cpp
View file @
7f97b8ef
...
...
@@ -34,6 +34,7 @@
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/normalize_ops.hpp>
#include <migraphx/iterator_for.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
...
...
@@ -62,12 +63,17 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
argument
target
::
allocate
(
const
shape
&
s
)
const
{
return
fill_argument
(
s
,
0
);
}
float
is
_supported
(
i
nst
ruction
_ref
ins
,
support_metric
m
)
supported_segments
target
::
find
_supported
(
co
nst
_module
_ref
mod
,
support_metric
m
)
const
{
// for now, not using the ins and metric to return a value
(
void
)
ins
;
(
void
)
m
;
return
1.0
;
supported_segment
instrs
;
for
(
const
auto
ins
:
iterator_for
(
*
mod
))
{
instrs
.
instructions
.
insert
(
ins
);
}
instrs
.
metric
=
1
;
// arbitrary value
return
{
instrs
};
}
MIGRAPHX_REGISTER_TARGET
(
target
);
...
...
src/targets/gpu/CMakeLists.txt
View file @
7f97b8ef
...
...
@@ -39,81 +39,9 @@ file(GLOB KERNEL_FILES ${CONFIGURE_DEPENDS}
message
(
STATUS
"KERNEL_FILES:
${
KERNEL_FILES
}
"
)
add_embed_library
(
migraphx_kernels
${
KERNEL_FILES
}
)
add_library
(
migraphx_device
device/acos.cpp
device/acosh.cpp
device/add.cpp
device/add_clip.cpp
device/add_relu.cpp
device/add_sigmoid.cpp
device/add_tanh.cpp
device/argmax.cpp
device/argmin.cpp
device/asin.cpp
device/asinh.cpp
device/atan.cpp
device/atanh.cpp
device/ceil.cpp
device/clip.cpp
device/concat.cpp
device/contiguous.cpp
device/convert.cpp
device/cos.cpp
device/cosh.cpp
device/div.cpp
device/equal.cpp
device/erf.cpp
device/exp.cpp
device/fill.cpp
device/floor.cpp
device/gather.cpp
device/gelu.cpp
device/greater.cpp
device/int8_gemm_pack.cpp
device/layernorm.cpp
device/less.cpp
device/log.cpp
device/logical_and.cpp
device/logical_or.cpp
device/logical_xor.cpp
device/logsoftmax.cpp
device/max.cpp
device/min.cpp
device/mul.cpp
device/mul_add.cpp
device/mul_add_relu.cpp
device/multinomial.cpp
device/nonzero.cpp
device/pad.cpp
device/pow.cpp
device/prelu.cpp
device/prefix_scan_sum.cpp
device/recip.cpp
device/reduce_max.cpp
device/reduce_mean.cpp
device/reduce_min.cpp
device/reduce_sum.cpp
device/reduce_prod.cpp
device/relu.cpp
device/reverse.cpp
device/rnn_variable_seq_lens.cpp
device/round.cpp
device/rsqrt.cpp
device/scatter.cpp
device/sigmoid.cpp
device/sign.cpp
device/sin.cpp
device/sinh.cpp
device/softmax.cpp
device/sqdiff.cpp
device/sqrt.cpp
device/sub.cpp
device/tan.cpp
device/tanh.cpp
device/topk.cpp
device/unary_not.cpp
device/where.cpp
)
file
(
GLOB DEVICE_GPU_SRCS
${
CONFIGURE_DEPENDS
}
${
CMAKE_CURRENT_SOURCE_DIR
}
/device/*.cpp
)
add_library
(
migraphx_device
${
DEVICE_GPU_SRCS
}
)
add_library
(
compile_for_gpu INTERFACE
)
target_compile_options
(
compile_for_gpu INTERFACE -std=c++17 -fno-gpu-rdc -Wno-cuda-compat -Wno-unused-command-line-argument -Xclang -fallow-half-arguments-and-returns
)
target_link_libraries
(
compile_for_gpu INTERFACE hip::device -fno-gpu-rdc -Wno-invalid-command-line-argument -Wno-unused-command-line-argument -Wno-option-ignored
)
...
...
@@ -151,15 +79,12 @@ add_library(migraphx_gpu
argmax.cpp
argmin.cpp
batch_norm_inference.cpp
clip.cpp
code_object_op.cpp
compile_ops.cpp
compile_gen.cpp
compile_hip.cpp
compile_hip_code_object.cpp
compiler.cpp
concat.cpp
convert.cpp
convolution.cpp
deconvolution.cpp
device_name.cpp
...
...
@@ -184,6 +109,7 @@ add_library(migraphx_gpu
pack_int8_args.cpp
prefuse_ops.cpp
pad.cpp
perfdb.cpp
pooling.cpp
quant_convolution.cpp
reverse.cpp
...
...
@@ -191,7 +117,6 @@ add_library(migraphx_gpu
rocblas.cpp
scatter.cpp
schedule_model.cpp
softmax.cpp
sync_device.cpp
target.cpp
topk.cpp
...
...
@@ -206,68 +131,18 @@ function(register_migraphx_gpu_ops PREFIX)
endforeach
()
endfunction
()
register_migraphx_gpu_ops
(
hip_
acosh
acos
add
argmax
argmin
asinh
asin
atanh
atan
ceil
clip
concat
convert
cosh
cos
div
equal
erf
exp
floor
gather
greater
less
log
logsoftmax
logical_and
logical_or
logical_xor
loop
max
min
mul
multinomial
nonzero
pad
pow
prelu
prefix_scan_sum
recip
reduce_max
reduce_mean
reduce_min
reduce_prod
reduce_sum
relu
reverse
round
rsqrt
scatter
sigmoid
sign
sinh
sin
softmax
sqdiff
sqrt
sub
tanh
tan
topk
unary_not
where
)
register_migraphx_gpu_ops
(
miopen_
abs
...
...
@@ -321,26 +196,11 @@ message(STATUS "extractkernel: ${MIGRAPHX_EXTRACT_KERNEL}")
set
(
MIGRAPHX_ENABLE_MLIR OFF CACHE BOOL
""
)
if
(
MIGRAPHX_ENABLE_MLIR
)
find_library
(
MLIRAPI_LIBRARY MLIRMIOpen
PATH_SUFFIXES
# Workaournd broken mlir install
lib/ lib/lib
)
# REQUIRED is not supported before cmake 3.18
if
(
NOT MLIRAPI_LIBRARY
)
message
(
FATAL_ERROR
"libMLIRMIOpen not found"
)
else
()
message
(
STATUS
"Build with libMLIRMIOpen: "
${
MLIRAPI_LIBRARY
}
)
endif
()
find_path
(
MLIRAPI_HEADERS NAMES mlir-c/Dialect/MIGraphX.h
)
# Workaround MLIR broken installation
find_path
(
MLIRAPI_HEADERS2 NAMES mlir-c/Registration.h
PATH_SUFFIXES
include/external/include external/include
)
# Find package rocMLIR
find_package
(
rocMLIR 1.0.0 CONFIG REQUIRED
)
message
(
STATUS
"Build with rocMLIR::rockCompiler
${
rocMLIR_VERSION
}
"
)
target_compile_definitions
(
migraphx_gpu PRIVATE
"-DMIGRAPHX_MLIR"
)
target_include_directories
(
migraphx_gpu SYSTEM PRIVATE
${
MLIRAPI_HEADERS
}
${
MLIRAPI_HEADERS2
}
)
target_link_libraries
(
migraphx_gpu PUBLIC
${
MLIRAPI_LIBRARY
}
)
target_link_libraries
(
migraphx_gpu PUBLIC rocMLIR::rockCompiler
)
endif
()
set
(
MIGRAPHX_USE_HIPRTC OFF CACHE BOOL
""
)
...
...
@@ -379,9 +239,18 @@ endif()
include
(
CheckLibraryExists
)
get_target_property
(
MIOPEN_LOCATION MIOpen LOCATION
)
check_library_exists
(
MIOpen
"miopenHiddenSetConvolutionFindMode"
"
${
MIOPEN_LOCATION
}
"
HAS_FIND_MODE_API
)
check_library_exists
(
MIOpen
"miopenFindSolutions"
"
${
MIOPEN_LOCATION
}
"
HAS_FIND_2_API
)
if
(
HAS_FIND_2_API
)
target_compile_definitions
(
migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API
)
message
(
STATUS
"MIGraphx is using Find-2.0 API of MIOpen"
)
else
()
message
(
STATUS
"MIOpen does not have Find-2.0 API"
)
endif
()
if
(
HAS_FIND_MODE_API
)
target_compile_definitions
(
migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_MODE_API
)
message
(
STATUS
"MI
Open has f
ind
m
ode
api
"
)
message
(
STATUS
"MI
Graphx is using F
ind
M
ode
API of MIOpen
"
)
else
()
message
(
STATUS
"MIOpen does not have find mode api"
)
endif
()
...
...
src/targets/gpu/clip.cpp
deleted
100644 → 0
View file @
2ba401f0
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/clip.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/clip.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
// Compute the output shape of the clip operation by delegating to the
// wrapped op. The trailing entry of `inputs` is the preallocated output
// buffer's shape, which the wrapped op does not expect, so it is excluded
// before delegation.
shape hip_clip::compute_shape(std::vector<shape> inputs) const
{
    std::vector<shape> op_inputs(inputs.begin(), inputs.end() - 1);
    return op.compute_shape(op_inputs);
}
// Run the clip kernel on the context's stream and return the output buffer.
// args layout: front() is the input tensor, at(1)/at(2) are the clip bounds
// (presumably min then max — confirm against device::clip), and back() is
// the preallocated output.
argument hip_clip::compute(context& ctx,
                           const shape&,
                           const std::vector<argument>& args) const
{
    const auto& result = args.back();
    const auto& input  = args.front();
    const auto& lo     = args.at(1);
    const auto& hi     = args.at(2);
    device::clip(ctx.get_stream().get(), result, input, lo, hi);
    return result;
}
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
src/targets/gpu/code_object_op.cpp
View file @
7f97b8ef
...
...
@@ -51,7 +51,8 @@ code_object_op::compute(context& ctx, const shape&, const std::vector<argument>&
std
::
vector
<
void
*>
kargs
(
args
.
size
());
std
::
transform
(
args
.
begin
(),
args
.
end
(),
kargs
.
begin
(),
[](
const
argument
&
a
)
{
return
a
.
data
();
});
k
.
launch
(
ctx
.
get_stream
().
get
(),
global
,
local
,
std
::
move
(
kargs
));
auto
[
start
,
stop
]
=
ctx
.
get_perf_events
();
k
.
launch
(
ctx
.
get_stream
().
get
(),
global
,
local
,
std
::
move
(
kargs
),
start
,
stop
);
return
args
[
get_output_arg
(
args
.
size
())];
}
void
code_object_op
::
finalize
(
context
&
,
const
shape
&
,
const
std
::
vector
<
shape
>&
)
...
...
src/targets/gpu/compile_gen.cpp
View file @
7f97b8ef
...
...
@@ -22,9 +22,17 @@
* THE SOFTWARE.
*/
#include <migraphx/gpu/compile_gen.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/shape.hpp>
#include <migraphx/permutation.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/module.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/eliminate_common_subexpression.hpp>
#include <migraphx/cpp_generator.hpp>
#include <migraphx/pass_manager.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/ranges.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
...
...
@@ -41,12 +49,13 @@ static std::vector<std::size_t> vector_sizes(const std::vector<shape>& inputs)
return
{
4
,
2
};
}
vectorize
vectorize
::
elements
(
std
::
size_t
axis
,
const
std
::
vector
<
shape
>&
inputs
)
vectorize
vectorize
::
elements
(
std
::
size_t
axis
,
const
std
::
vector
<
shape
>&
inputs
,
const
std
::
vector
<
std
::
size_t
>&
sizes
)
{
if
(
std
::
all_of
(
inputs
.
begin
(),
inputs
.
end
(),
[
&
](
const
auto
&
s
)
{
return
s
.
lens
()[
axis
]
==
1
;
}))
return
{
1
,
axis
};
auto
sizes
=
vector_sizes
(
inputs
);
std
::
vector
<
std
::
size_t
>
max_vec_size
;
std
::
transform
(
inputs
.
begin
(),
inputs
.
end
(),
...
...
@@ -54,12 +63,19 @@ vectorize vectorize::elements(std::size_t axis, const std::vector<shape>& inputs
[
&
](
const
auto
&
input
)
->
std
::
size_t
{
auto
stride
=
input
.
strides
()[
axis
];
auto
len
=
input
.
lens
()[
axis
];
if
(
stride
!=
0
and
stride
!=
1
)
if
(
not
contains
({
0
,
1
},
stride
)
)
return
1
;
if
(
len
==
1
and
input
.
elements
()
>
sizes
.
front
())
return
sizes
.
front
();
auto
it
=
std
::
find_if
(
sizes
.
begin
(),
sizes
.
end
(),
[
&
](
auto
i
)
{
return
(
len
%
i
)
==
0
;
});
auto
it
=
std
::
find_if
(
sizes
.
begin
(),
sizes
.
end
(),
[
&
](
auto
vsize
)
{
// The len is divisible by the size and all the strides are divisible by
// the size
return
(
len
%
vsize
)
==
0
and
std
::
all_of
(
input
.
strides
().
begin
(),
input
.
strides
().
end
(),
[
&
](
auto
i
)
{
return
contains
({
0
,
1
},
i
)
or
i
%
vsize
==
0
;
});
});
if
(
it
!=
sizes
.
end
())
return
*
it
;
return
1
;
...
...
@@ -67,6 +83,33 @@ vectorize vectorize::elements(std::size_t axis, const std::vector<shape>& inputs
return
{
*
std
::
min_element
(
max_vec_size
.
begin
(),
max_vec_size
.
end
()),
axis
};
}
vectorize
vectorize
::
elements
(
context
&
ctx
,
std
::
size_t
axis
,
const
std
::
vector
<
shape
>&
inputs
)
{
if
(
inputs
.
empty
())
return
{
1
,
axis
};
std
::
size_t
n
=
std
::
max_element
(
inputs
.
begin
(),
inputs
.
end
(),
by
(
std
::
less
<>
{},
[](
const
auto
&
s
)
{
return
s
.
elements
();
}))
->
elements
();
std
::
size_t
max_global
=
ctx
.
get_current_device
().
get_cu_count
()
*
ctx
.
get_current_device
().
get_max_workitems_per_cu
();
std
::
size_t
over
=
n
/
max_global
;
bool
broadcasted
=
std
::
any_of
(
inputs
.
begin
(),
inputs
.
end
(),
[](
const
auto
&
s
)
{
return
s
.
broadcasted
();
});
std
::
vector
<
std
::
size_t
>
sizes
;
if
(
broadcasted
and
over
>
8
)
sizes
.
push_back
(
8
);
if
(
over
>
4
)
sizes
.
push_back
(
4
);
sizes
.
push_back
(
2
);
return
elements
(
axis
,
inputs
,
sizes
);
}
vectorize
vectorize
::
elements
(
std
::
size_t
axis
,
const
std
::
vector
<
shape
>&
inputs
)
{
return
elements
(
axis
,
inputs
,
vector_sizes
(
inputs
));
}
std
::
string
vectorize
::
str
()
const
{
return
"vectorize<"
+
to_string
(
size
)
+
", "
+
to_string
(
axis
)
+
">()"
;
...
...
@@ -75,25 +118,25 @@ std::string vectorize::str() const
preload
preload
::
broadcasts
(
std
::
size_t
axis
,
const
std
::
vector
<
shape
>&
inputs
)
{
const
std
::
size_t
max_lds_bytes
=
4096
;
std
::
vector
<
bool
>
result
;
std
::
transform
(
inputs
.
begin
(),
inputs
.
end
(),
std
::
back_inserter
(
re
sult
),
[
&
](
const
shape
&
input
)
{
return
input
.
strides
()[
axis
]
==
0
;
});
auto
bytes
=
std
::
inner_product
(
inputs
.
begin
(),
inputs
.
end
(),
re
sult
.
begin
()
,
std
::
size_t
{
0
},
std
::
plus
<>
{},
[](
const
shape
&
s
,
bool
b
)
->
std
::
size_t
{
if
(
b
)
return
s
.
bytes
();
return
0
;
}
);
if
(
bytes
<
max_lds_bytes
)
return
{
result
}
;
// TODO: Try to partially preload items
std
::
fill
(
result
.
begin
(),
result
.
end
(),
false
);
std
::
vector
<
bool
>
result
(
inputs
.
size
())
;
std
::
vector
<
std
::
size_t
>
preloaded
;
auto
idxs
=
range
(
inputs
.
size
());
std
::
copy_if
(
idxs
.
begin
(),
idxs
.
end
(),
std
::
back_inserter
(
p
re
loaded
),
[
&
](
auto
i
)
{
return
input
s
[
i
]
.
strides
()[
axis
]
==
0
;
});
std
::
sort
(
preloaded
.
begin
(),
preloaded
.
end
(),
by
(
std
::
less
<>
{},
[
&
](
auto
i
)
{
re
turn
inputs
[
i
].
bytes
()
;
}));
std
::
size_t
bytes
=
0
;
for
(
auto
i
:
preloaded
)
{
const
auto
&
input
=
inputs
[
i
]
;
bytes
+=
input
.
bytes
(
);
if
(
bytes
>
max_lds_bytes
)
break
;
result
[
i
]
=
true
;
}
return
{
result
};
}
...
...
@@ -125,6 +168,45 @@ std::string make_transformer_args(std::vector<std::string> transformers)
return
join_strings
(
std
::
move
(
transformers
),
", "
);
}
std
::
string
generate_pointwise
(
const
module
&
pm
,
const
std
::
string
&
name
)
{
module
m
=
pm
;
run_passes
(
m
,
{
eliminate_common_subexpression
{},
dead_code_elimination
{}});
cpp_generator
g
;
g
.
fmap
([](
const
std
::
string
&
fname
)
{
return
"migraphx::"
+
fname
;
});
g
.
add_point_op
(
"where"
,
"${function:where}(${0}, ${1}, ${2})"
);
g
.
add_point_op
(
"prelu"
,
"${function:where}(${0} < 0, ${0} * ${1}, ${0})"
);
g
.
add_point_op
(
"sign"
,
"${function:where}(${0} > 0, 1, ${function:where}(${0} < 0, -1, 0))"
);
g
.
add_point_op
(
"equal"
,
"migraphx::abs(${0} == ${1})"
);
g
.
add_point_op
(
"less"
,
"migraphx::abs(${0} < ${1})"
);
g
.
add_point_op
(
"greater"
,
"migraphx::abs(${0} > ${1})"
);
g
.
add_point_op
(
"not"
,
"migraphx::abs(not ${0})"
);
// Add explict conversions
g
.
fresult
(
[](
const
shape
&
s
)
{
return
"migraphx::convert<"
+
shape
::
cpp_type
(
s
.
type
())
+
">"
;
});
g
.
create_function
(
g
.
generate_module
(
m
).
set_attributes
({
"__device__"
}).
set_generic_types
(
m
).
set_name
(
name
));
return
g
.
str
();
}
static
std
::
vector
<
std
::
string
>
get_op_names
(
const
module
&
m
)
{
std
::
vector
<
std
::
string
>
result
;
for
(
auto
&
ins
:
m
)
{
if
(
starts_with
(
ins
.
name
(),
"@"
))
continue
;
result
.
push_back
(
ins
.
name
());
}
return
result
;
}
std
::
string
generate_name_from_ops
(
const
module
&
m
)
{
auto
op_names
=
get_op_names
(
m
);
return
join_strings
(
op_names
,
"_"
);
}
}
// namespace gen
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
...
...
src/targets/gpu/compile_hip_code_object.cpp
View file @
7f97b8ef
...
...
@@ -138,16 +138,16 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over)
std
::
size_t
groups
=
(
n
+
local
-
1
)
/
local
;
std
::
size_t
max_blocks
=
max_global
/
local
;
std
::
size_t
nglobal
=
std
::
min
(
max_blocks
*
over
,
groups
)
*
local
;
return
nglobal
;
return
std
::
min
(
nglobal
,
n
)
;
};
}
std
::
size_t
compute_block_size
(
std
::
size_t
n
,
std
::
size_t
max_block_size
)
{
size_t
block_size
=
128
;
while
(
block_size
<=
max_block_size
and
block_size
<
=
n
)
block_size
*=
2
;
return
block_size
/
2
;
const
std
::
size_t
min_
block_size
=
64
;
const
std
::
size_t
base_
block_size
=
32
;
auto
block_size
=
(((
n
-
1
)
/
base_block_size
+
1
))
*
base_block_size
;
return
std
::
min
(
std
::
max
(
min_block_size
,
block_size
),
max_block_size
)
;
}
operation
compile_hip_code_object
(
const
std
::
string
&
content
,
hip_compile_options
options
)
...
...
src/targets/gpu/convert.cpp
deleted
100644 → 0
View file @
2ba401f0
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/convert.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/convert.hpp>
namespace
migraphx
{
inline
namespace
MIGRAPHX_INLINE_NS
{
namespace
gpu
{
// Compute the output shape of the convert operation. The last entry of
// `inputs` describes the preallocated output buffer and is excluded; the
// remaining input shapes are validated as packed before delegating to the
// wrapped op.
shape hip_convert::compute_shape(std::vector<shape> inputs) const
{
    std::vector<shape> op_inputs(inputs.begin(), inputs.end() - 1);
    check_shapes{op_inputs, *this}.packed();
    return op.compute_shape(op_inputs);
}
// Launch the device convert kernel on the context's stream.
// args[0] is the source tensor and args[1] is the preallocated destination,
// which is also the returned result.
argument hip_convert::compute(context& ctx,
                              const shape&,
                              const std::vector<argument>& args) const
{
    const auto& destination = args[1];
    const auto& source      = args[0];
    device::convert(ctx.get_stream().get(), destination, source);
    return destination;
}
}
// namespace gpu
}
// namespace MIGRAPHX_INLINE_NS
}
// namespace migraphx
Prev
1
…
3
4
5
6
7
8
9
10
11
…
23
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment