Commit 7e297b13 authored by Paul

Merge

parents 86ea5e91 aa7ff911
@@ -8,9 +8,9 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
void sync_device::apply(module& p) const
void sync_device::apply(module& m) const
{
auto last = std::prev(p.end());
auto last = std::prev(m.end());
if(last->name() == "@return")
{
auto inputs = last->inputs();
@@ -18,10 +18,10 @@ void sync_device::apply(module& p) const
return (i->name() == "hip::copy_from_gpu");
}))
{
auto sync_in = p.insert_instruction(last, make_op("hip::sync_stream"), inputs);
auto sync_in = m.insert_instruction(last, make_op("hip::sync_stream"), inputs);
if(not inputs.empty())
{
p.replace_instruction(inputs.front(), sync_in);
m.replace_instruction(inputs.front(), sync_in);
}
}
}
......
@@ -2,7 +2,6 @@
#include <migraphx/auto_contiguous.hpp>
#include <migraphx/check_context.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/decompose.hpp>
#include <migraphx/eliminate_allocation.hpp>
#include <migraphx/eliminate_common_subexpression.hpp>
#include <migraphx/eliminate_concat.hpp>
@@ -10,6 +9,7 @@
#include <migraphx/eliminate_data_type.hpp>
#include <migraphx/eliminate_identity.hpp>
#include <migraphx/eliminate_pad.hpp>
#include <migraphx/fuse_pointwise.hpp>
#include <migraphx/inline_module.hpp>
#include <migraphx/insert_pad.hpp>
#include <migraphx/memory_coloring.hpp>
@@ -17,19 +17,21 @@
#include <migraphx/preallocate_param.hpp>
#include <migraphx/propagate_constant.hpp>
#include <migraphx/register_target.hpp>
#include <migraphx/remap.hpp>
#include <migraphx/rewrite_batchnorm.hpp>
#include <migraphx/rewrite_pooling.hpp>
#include <migraphx/rewrite_quantization.hpp>
#include <migraphx/rewrite_rnn.hpp>
#include <migraphx/schedule.hpp>
#include <migraphx/simplify_algebra.hpp>
#include <migraphx/simplify_qdq.hpp>
#include <migraphx/simplify_reshapes.hpp>
#include <migraphx/gpu/allocation_model.hpp>
#include <migraphx/gpu/compile_ops.hpp>
#include <migraphx/gpu/concat_gpu_opt.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/eliminate_workspace.hpp>
#include <migraphx/gpu/fuse_ops.hpp>
#include <migraphx/gpu/prefuse_ops.hpp>
#include <migraphx/gpu/lowering.hpp>
#include <migraphx/gpu/mlir_conv.hpp>
#include <migraphx/gpu/pack_int8_args.hpp>
@@ -43,6 +45,20 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_SCHEDULE_PASS)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_POINTWISE_FUSION)
struct id_pass
{
std::string name() const { return "id"; }
void apply(const module&) const {}
};
pass enable_pass(bool enabled, pass p)
{
if(enabled)
return p;
return id_pass{};
}
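// A note on the helpers above: id_pass is a no-op pass, and enable_pass
// substitutes it for a real pass whenever a feature is switched off. This
// keeps get_passes() below a single flat initializer list instead of
// splicing the vector conditionally. It is used later in this commit as:
//   enable_pass(not enabled(MIGRAPHX_DISABLE_POINTWISE_FUSION{}), fuse_pointwise{}),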
std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_options& options) const
{
@@ -58,8 +74,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
return
{
normalize_ops{},
decompose{},
dead_code_elimination{},
simplify_qdq{},
rewrite_quantization{},
dead_code_elimination{},
eliminate_data_type{unsupported_types, shape::type_t::float_type},
@@ -81,22 +97,28 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
simplify_algebra{},
simplify_reshapes{},
simplify_algebra{},
prefuse_ops{},
dead_code_elimination{},
auto_contiguous{},
simplify_reshapes{},
propagate_constant{},
dead_code_elimination{},
enable_pass(not enabled(MIGRAPHX_DISABLE_POINTWISE_FUSION{}), fuse_pointwise{}),
dead_code_elimination{},
mlir_conv{&ctx},
lowering{&ctx, options.offload_copy},
eliminate_contiguous{"gpu::contiguous"},
dead_code_elimination{},
eliminate_concat{concat_gpu_optimization{}},
dead_code_elimination{},
adjust_allocation{gpu_allocation_model{}},
dead_code_elimination{},
pack_int8_args{},
dead_code_elimination{},
adjust_allocation{gpu_allocation_model{}},
dead_code_elimination{},
fuse_ops{&ctx, options.fast_math},
dead_code_elimination{},
compile_ops{&ctx},
dead_code_elimination{},
write_literals{&ctx},
schedule{gpu::schedule_model{ctx.get_current_device().nstreams()}, not enabled(MIGRAPHX_DISABLE_SCHEDULE_PASS{})},
memory_coloring{"hip::allocate"},
......
#include <migraphx/gpu/topk.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/topk.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_topk::compute_shape(std::vector<shape> inputs) const
{
return op.normalize_compute_shape({inputs.front()});
}
argument hip_topk::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
auto outputs = args.back().get_sub_objects();
return op.largest ? device::topk_largest(ctx.get_stream().get(),
outputs.front(),
outputs.back(),
args[0],
op.k,
op.axis)
: device::topk_smallest(ctx.get_stream().get(),
outputs.front(),
outputs.back(),
args[0],
op.k,
op.axis);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
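A note on the tuple plumbing in hip_topk::compute above (inferred from the code shown, not stated in the commit):

// topk produces two results, so its normalized output shape is a tuple and
// the preallocated output argument (args.back()) is unpacked with
// get_sub_objects(): outputs.front() receives the top-k values and
// outputs.back() the matching indices. op.largest picks between the
// largest/smallest device kernels, each filling both buffers in one call.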
@@ -11,25 +11,25 @@ namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_COPY_LITERALS)
void write_literals::apply(module& p) const
void write_literals::apply(module& m) const
{
assert(ctx != nullptr);
std::size_t n = 0;
for(auto ins : iterator_for(p))
for(auto ins : iterator_for(m))
{
if(ins->name() == "@literal")
{
if(enabled(MIGRAPHX_COPY_LITERALS{}))
{
literal l = ins->get_literal();
auto pre = p.add_literal(l);
auto alloc = p.insert_instruction(std::next(pre), hip_allocate{l.get_shape()});
p.replace_instruction(ins, hip_copy_to_gpu{}, pre, alloc);
auto pre = m.add_literal(l);
auto alloc = m.insert_instruction(std::next(pre), hip_allocate{l.get_shape()});
m.replace_instruction(ins, hip_copy_to_gpu{}, pre, alloc);
}
else
{
std::string id = p.name() + ":@literal:" + std::to_string(n);
p.replace_instruction(ins, hip_copy_literal{ins->get_literal(), id});
std::string id = m.name() + ":@literal:" + std::to_string(n);
m.replace_instruction(ins, hip_copy_literal{ins->get_literal(), id});
n++;
}
}
......
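What the two branches of write_literals above do (a summary of the code shown):

// With MIGRAPHX_COPY_LITERALS enabled, each @literal stays a host literal
// and gains an explicit hip_allocate + hip_copy_to_gpu pair, so the
// host-to-device copy runs as an ordinary instruction in the graph.
// Otherwise the literal becomes a hip_copy_literal tagged with the stable
// id "<module name>:@literal:<n>", presumably so the resulting device
// buffer can be identified and reused by name.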
@@ -15,8 +15,6 @@ target_link_libraries(migraphx_ref migraphx Threads::Threads)
target_include_directories(migraphx_ref PRIVATE ${BLAZE_INCLUDE})
target_compile_definitions(migraphx_ref PRIVATE -DBLAZE_USE_CPP_THREADS)
target_link_libraries(migraphx_all_targets INTERFACE migraphx_ref)
rocm_install_targets(
TARGETS migraphx_ref
INCLUDE
......
#include <migraphx/ref/gemm.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/requires.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/par_for.hpp>
#include <blaze/math/CustomMatrix.h>
namespace migraphx {
@@ -74,8 +74,10 @@ void migemm_impl(
assert(amat.get_shape().lens()[dim_1] == bmat.get_shape().lens()[dim_0]);
assert(cmat.get_shape().lens()[dim_0] == amat.get_shape().lens()[dim_0]);
assert(cmat.get_shape().lens()[dim_1] == bmat.get_shape().lens()[dim_1]);
auto cs = cmat.get_shape();
shape_for_each(cmat.get_shape(), [&](const auto& c_idx) {
par_for(cs.elements(), [&](auto i) {
auto c_idx = cs.multi(i);
auto a_idx = c_idx;
auto b_idx = c_idx;
double s = 0.0;
......
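The migemm change above replaces a sequential shape_for_each walk with a parallel loop over flat element indices. A minimal sketch of the pattern, assuming migraphx::par_for and shape::multi behave as used in the hunk (the function itself is illustrative, not part of the commit):

#include <migraphx/par_for.hpp>
#include <migraphx/shape.hpp>

void visit_in_parallel(const migraphx::shape& cs)
{
    // par_for splits [0, cs.elements()) across threads; multi(i) converts
    // the flat index back to the multi-dimensional coordinate that
    // shape_for_each used to hand to the callback, so each output element
    // is computed independently.
    migraphx::par_for(cs.elements(), [&](auto i) {
        auto c_idx = cs.multi(i); // e.g. for lens {2, 3}: i = 4 -> {1, 1}
        (void)c_idx;              // per-element work goes here
    });
}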
@@ -10,13 +10,12 @@
#include <migraphx/op/dot.hpp>
#include <migraphx/op/quant_dot.hpp>
#include <migraphx/op/elu.hpp>
#include <migraphx/op/if_op.hpp>
#include <migraphx/op/im2col.hpp>
#include <migraphx/op/leaky_relu.hpp>
#include <migraphx/op/logsoftmax.hpp>
#include <migraphx/op/loop.hpp>
#include <migraphx/op/lrn.hpp>
#include <migraphx/op/pad.hpp>
#include <migraphx/op/pooling.hpp>
#include <migraphx/op/softmax.hpp>
#include <migraphx/op/argmax.hpp>
#include <migraphx/op/argmin.hpp>
@@ -269,99 +268,6 @@ struct ref_convolution : auto_register_op<ref_convolution<Op>>
}
};
template <class Op>
struct ref_deconvolution : auto_register_op<ref_deconvolution<Op>>
{
ref_deconvolution() = default;
ref_deconvolution(Op pop) : op(std::move(pop)) {}
Op op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "ref::" + op.name(); }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0], args[1])([&](auto output, auto input, auto weights) {
using type = typename decltype(output)::value_type;
std::fill(output.begin(), output.end(), type{0});
auto in_lens = input.get_shape().lens();
auto in_n = in_lens[0];
auto in_c = in_lens[1];
auto wei = weights.get_shape().lens();
auto wei_n = wei[0];
auto wei_c = wei[1];
auto out_lens = output_shape.lens();
auto kdims = op.kdims();
std::vector<std::size_t> win_size{in_c};
std::copy(in_lens.begin() + 2, in_lens.end(), std::back_inserter(win_size));
std::copy(wei.begin() + 2, wei.end(), std::back_inserter(win_size));
shape win_shape{output_shape.type(), win_size};
par_dfor(in_n, wei_c)([&](int o, int k) {
shape_for_each(win_shape, [&](auto idx_win) {
const int w = idx_win[0];
auto input_dims_start = idx_win.begin() + 1;
auto wei_dims_start = idx_win.begin() + kdims + 1;
std::vector<std::ptrdiff_t> win_start;
for(std::size_t n = 0; n < kdims; ++n)
{
win_start.push_back(std::ptrdiff_t(*(input_dims_start + n) * op.stride[n]) -
std::ptrdiff_t(op.padding[n]));
}
const int group_id = w / (wei_n / op.group);
const int in_ch = group_id * wei_c + k;
std::vector<std::ptrdiff_t> idx_out{o, in_ch};
for(size_t n = 0; n < kdims; n++)
{
idx_out.push_back(win_start[n] + *(wei_dims_start + n) * op.dilation[n]);
}
std::vector<std::ptrdiff_t> idx_wei{w, k};
std::copy(wei_dims_start, idx_win.end(), std::back_inserter(idx_wei));
std::vector<std::ptrdiff_t> idx_in{o, w};
std::copy(input_dims_start, wei_dims_start, std::back_inserter(idx_in));
if(std::all_of(
idx_out.begin() + 2, idx_out.end(), [&](auto ii) { return ii >= 0; }) and
std::equal(idx_out.begin() + 2,
idx_out.end(),
out_lens.begin() + 2,
out_lens.end(),
std::less<std::ptrdiff_t>{}))
{
output(idx_out.begin(), idx_out.end()) +=
input(idx_in.begin(), idx_in.end()) *
weights(idx_wei.begin(), idx_wei.end());
}
});
});
});
return result;
}
};
struct ref_im2col
{
op::im2col op;
@@ -428,109 +334,6 @@ struct ref_im2col
};
MIGRAPHX_REGISTER_OP(ref_im2col)
struct max_pool
{
static std::string name() { return "max"; }
template <class T>
static T start()
{
return std::numeric_limits<T>::lowest();
}
static double apply(double x, double y)
{
double m = std::max(x, y);
return (m);
}
static double final(double x, std::size_t) { return (x); }
};
struct avg_pool
{
static std::string name() { return "average"; }
template <class T>
static double start()
{
return 0.0;
}
static double apply(double x, double y) { return x + y; }
static double final(double x, std::size_t y) { return (y == 0) ? 0.0 : (x / y); }
};
template <class Op>
struct ref_pooling : auto_register_op<ref_pooling<Op>>
{
ref_pooling() = default;
ref_pooling(op::pooling pop) : op(std::move(pop)) {}
op::pooling op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "ref::pooling_" + Op::name(); }
shape compute_shape(const std::vector<shape>& inputs) const
{
return op.normalize_compute_shape(inputs);
}
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0])([&](auto output, auto input) {
using type = typename decltype(output)::value_type;
auto in_s = input.get_shape();
auto in_lens = in_s.lens();
std::vector<std::size_t> vec_len(in_lens.begin() + 2, in_lens.end());
par_for(output_shape.elements(), [&](auto i) {
auto idx_o = output_shape.multi(i);
auto n_dim = idx_o.size();
std::vector<std::size_t> win_start;
std::vector<std::size_t> win_size;
for(std::size_t dim = 2; dim < n_dim; ++dim)
{
auto d_2 = dim - 2;
int start = static_cast<int>(idx_o[dim] * op.stride[d_2]) -
static_cast<int>(op.padding[d_2]);
int end = std::min(start + op.lengths[d_2], in_lens[dim]);
start = std::max(start, 0);
win_start.push_back(start);
win_size.push_back(end - start);
}
shape win_shape{output_shape.type(), win_size};
auto pool_size = win_shape.elements();
double acc = Op::template start<type>();
shape_for_each(win_shape, [&](auto idx_w) {
auto idx = idx_o;
std::transform(idx_w.begin(),
idx_w.end(),
win_start.begin(),
idx.begin() + 2,
[](auto ii, auto jj) { return ii + jj; });
if(std::all_of(idx.begin() + 2, idx.end(), [&](auto ii) { return ii >= 0; }) and
idx < in_lens)
{
acc = Op::apply(acc, input[in_s.index(idx)]);
}
});
output[i] = type(Op::final(acc, pool_size));
});
});
return result;
}
};
struct ref_op
{
operation op = op::identity{};
@@ -611,42 +414,12 @@ struct ref_gemm
return migraphx::reflect(self.op, f);
}
std::string name() const { return "ref::dot"; }
shape compute_shape(const std::vector<shape>& inputs) const
{
if(inputs.size() == 3)
{
auto c_shape = inputs.at(2);
check_shapes{{c_shape}, *this}.not_broadcasted();
}
return op.compute_shape(inputs);
}
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
// 3 inputs, it is alpha * A * B + beta * C, then
// A and B are matrices, and C is of the same shape as A * B
if(args.size() == 3)
{
// no need to consider the value of args[2]
if(op.beta == 0.0f)
{
result.visit([&](auto output) { std::fill(output.begin(), output.end(), 0); });
}
else
{
visit_all(result, args[2])([&](auto output, auto input) {
std::copy(input.begin(), input.end(), output.begin());
});
}
migemm(result, args[0], args[1], op.alpha, op.beta);
return result;
}
// 2 input arguments
migemm(result, args[0], args[1], op.alpha, 0.0f);
migemm(result, args[0], args[1], 1.0f, 0.0f);
return result;
}
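A hedged reading of the simplification above (the next hunk applies the same change to ref::quant_dot): ref::dot no longer special-cases a third C operand or alpha/beta scaling and now computes the plain product.

// Old: result = alpha * A * B, plus beta * C when a third argument was given
// New: result = A * B
// Scaled or accumulated GEMMs are therefore expected to reach the ref
// target already decomposed, e.g. a plain "dot" followed by separate
// pointwise mul/add instructions, rather than as dot attributes.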
@@ -664,22 +437,11 @@ struct ref_quant_gemm
}
std::string name() const { return "ref::quant_dot"; }
shape compute_shape(const std::vector<shape>& inputs) const
{
if(inputs.size() == 3)
{
auto c_shape = inputs.at(2);
check_shapes{{c_shape}, *this}.not_broadcasted();
}
return op.compute_shape(inputs);
}
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
// 3 inputs, it is alpha * A * B + beta * C, then
// A and B are matrices, and C is of the same shape to A * B
// first, convert the args[0] and args[1] from int8_t to int32_t
argument arg_0{{shape::int32_type, {args.at(0).get_shape().lens()}}};
argument arg_1{{shape::int32_type, {args.at(1).get_shape().lens()}}};
@@ -693,27 +455,7 @@ struct ref_quant_gemm
[&](auto input) { std::copy(input.begin(), input.end(), output.begin()); });
});
if(args.size() == 3)
{
// no need to consider the value of args[2]
if(op.beta == 0)
{
result.visit([&](auto output) { std::fill(output.begin(), output.end(), 0); });
}
else
{
visit_all(result, args[2])([&](auto output, auto input) {
std::copy(input.begin(), input.end(), output.begin());
});
}
migemm(result, arg_0, arg_1, op.alpha, op.beta);
return result;
}
// 2 input arguments
migemm(result, arg_0, arg_1, op.alpha, int32_t{0});
migemm(result, arg_0, arg_1, int32_t{1}, int32_t{0});
return result;
}
@@ -763,7 +505,7 @@ struct ref_unary : auto_register_op<ref_unary<Op>>
shape compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(1);
auto s = inputs.at(0);
const auto& s = inputs.at(0);
return {s.type(), s.lens()};
}
@@ -917,10 +659,8 @@ struct ref_apply
apply_map["batch_norm_inference"] =
extend_op<ref_batch_norm_inference, op::batch_norm_inference>();
apply_map["convolution"] = extend_op<ref_convolution<op::convolution>, op::convolution>();
apply_map["deconvolution"] =
extend_op<ref_deconvolution<op::deconvolution>, op::deconvolution>();
apply_map["dot"] = extend_op<ref_gemm, op::dot>();
apply_map["quant_dot"] = extend_op<ref_quant_gemm, op::quant_dot>();
apply_map["dot"] = extend_op<ref_gemm, op::dot>();
apply_map["quant_dot"] = extend_op<ref_quant_gemm, op::quant_dot>();
apply_map["quant_convolution"] =
extend_op<ref_convolution<op::quant_convolution>, op::quant_convolution>();
apply_map["elu"] = extend_op<ref_unary<elu_op>, op::elu>();
@@ -939,11 +679,7 @@ struct ref_apply
init();
for(auto it : iterator_for(*mod))
{
if(it->name() == "pooling")
{
apply_pooling(it);
}
else if(apply_map.count(it->name()) > 0)
if(apply_map.count(it->name()) > 0)
{
apply_map.at(it->name())(it);
}
@@ -971,15 +707,6 @@ struct ref_apply
auto&& op = any_cast<Op>(ins->get_operator());
mod->replace_instruction(ins, T{op}, ins->inputs());
}
void apply_pooling(instruction_ref ins) const
{
auto&& op = any_cast<op::pooling>(ins->get_operator());
if(op.mode == "max")
mod->replace_instruction(ins, ref_pooling<max_pool>{op}, ins->inputs());
else if(op.mode == "average")
mod->replace_instruction(ins, ref_pooling<avg_pool>{op}, ins->inputs());
}
};
void lowering::apply(module& m) const { ref_apply{&m}.apply(); }
......
@@ -19,7 +19,7 @@ target_compile_options(tf-proto PRIVATE -w)
target_link_libraries(tf-proto PRIVATE ${PROTOBUF_LIBRARY})
set_target_properties(tf-proto PROPERTIES POSITION_INDEPENDENT_CODE On)
file(GLOB TF_SRCS *.cpp)
file(GLOB TF_SRCS ${CONFIGURE_DEPENDS} *.cpp)
add_library(migraphx_tf ${TF_SRCS})
target_include_directories(migraphx_tf PRIVATE include)
set_target_properties(migraphx_tf PROPERTIES EXPORT_NAME tf)
......
@@ -20,7 +20,8 @@ struct parse_biasadd : op_parser<parse_biasadd>
uint64_t axis = 1; // assume output of previous layer is in NCHW (broadcast on channel)
auto l0 = info.add_instruction(
make_op("broadcast", {{"axis", axis}, {"dims", args[0]->get_shape().lens()}}), args[1]);
make_op("broadcast", {{"axis", axis}, {"out_lens", args[0]->get_shape().lens()}}),
args[1]);
return info.add_instruction(make_op("add"), args[0], l0);
}
};
......
@@ -46,10 +46,12 @@ struct parse_matmul : op_parser<parse_matmul>
// swap the last two elements
std::iter_swap(perm.end() - 1, perm.end() - 2);
auto l1 = (transa) ? info.add_instruction(make_op("transpose", {{"dims", perm}}), args[0])
: args[0];
auto l2 = (transb) ? info.add_instruction(make_op("transpose", {{"dims", perm}}), args[1])
: args[1];
auto l1 = (transa)
? info.add_instruction(make_op("transpose", {{"permutation", perm}}), args[0])
: args[0];
auto l2 = (transb)
? info.add_instruction(make_op("transpose", {{"permutation", perm}}), args[1])
: args[1];
return info.add_instruction(make_op("dot"), l1, l2);
}
......
@@ -19,7 +19,12 @@ struct parse_pooling : op_parser<parse_pooling>
tf_parser::node_info info,
std::vector<instruction_ref> args) const
{
op::pooling op{starts_with(opd.tf_name, "Max") ? "max" : "average"};
if(!starts_with(opd.tf_name, "Max") && !starts_with(opd.tf_name, "Av"))
{
MIGRAPHX_THROW("tf pooling mode must be Max or Average");
}
op::pooling op{starts_with(opd.tf_name, "Max") ? op::pooling_mode::max
: op::pooling_mode::average};
if(contains(info.attributes, "strides"))
{
......
@@ -23,9 +23,9 @@ struct parse_relu6 : op_parser<parse_relu6>
auto max_val = info.add_literal(6.0f);
min_val =
info.add_instruction(make_op("multibroadcast", {{"output_lens", input_lens}}), min_val);
info.add_instruction(make_op("multibroadcast", {{"out_lens", input_lens}}), min_val);
max_val =
info.add_instruction(make_op("multibroadcast", {{"output_lens", input_lens}}), max_val);
info.add_instruction(make_op("multibroadcast", {{"out_lens", input_lens}}), max_val);
return info.add_instruction(make_op("clip"), args.front(), min_val, max_val);
}
};
......
@@ -20,7 +20,7 @@ struct parse_transpose : op_parser<parse_transpose>
auto perm = args[1]->eval().get<int32_t>().to_vector();
std::vector<int64_t> dims(perm.begin(), perm.end());
return info.add_instruction(make_op("transpose", {{"dims", dims}}), args.front());
return info.add_instruction(make_op("transpose", {{"permutation", dims}}), args.front());
}
};
......
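Taken together, the parser hunks above and the tf_parser changes below apply one consistent set of operator-attribute renames. A consolidated sketch (the attribute names come straight from the hunks; info, lens, and x are stand-ins, and the migraphx namespace is assumed):

auto b  = info.add_instruction(
    make_op("broadcast", {{"axis", 1}, {"out_lens", lens}}), x);       // was "dims"
auto mb = info.add_instruction(
    make_op("multibroadcast", {{"out_lens", lens}}), x);               // was "output_lens"
auto t  = info.add_instruction(
    make_op("transpose", {{"permutation", {0, 2, 3, 1}}}), x);         // was "dims"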
@@ -35,20 +35,20 @@ bool tf_parser::should_transpose(instruction_ref ins) const
instruction_ref tf_parser::to_nhwc(instruction_ref ins) const
{
if(should_transpose(ins))
return mm->add_instruction(make_op("transpose", {{"dims", {0, 2, 3, 1}}}), ins);
return mm->add_instruction(make_op("transpose", {{"permutation", {0, 2, 3, 1}}}), ins);
return ins;
}
instruction_ref tf_parser::to_nchw(instruction_ref ins) const
{
if(should_transpose(ins))
return mm->add_instruction(make_op("transpose", {{"dims", {0, 3, 1, 2}}}), ins);
return mm->add_instruction(make_op("transpose", {{"permutation", {0, 3, 1, 2}}}), ins);
return ins;
}
instruction_ref tf_parser::to_kcxy(instruction_ref ins) const
{
return mm->add_instruction(make_op("transpose", {{"dims", {3, 2, 0, 1}}}), ins);
return mm->add_instruction(make_op("transpose", {{"permutation", {3, 2, 0, 1}}}), ins);
}
std::vector<instruction_ref> tf_parser::to_nchw(const std::vector<instruction_ref>& args) const
@@ -499,8 +499,7 @@ literal tf_parser::parse_tensor(const tensorflow::TensorProto& t) const
return create_literal(shape::int64_type, dims, get_data_vals(t.int64_val(), shape_size));
case tensorflow::DataType::DT_BOOL:
return create_literal(shape::int32_type, dims, get_data_vals(t.bool_val(), shape_size));
case tensorflow::DataType::DT_HALF:
{
case tensorflow::DataType::DT_HALF: {
std::vector<int> data_int32 = get_data_vals(t.half_val(), shape_size);
std::vector<uint16_t> data_uint16(data_int32.begin(), data_int32.end());
std::vector<half> data_half;
......
@@ -4,6 +4,7 @@
#include <migraphx/errors.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/value.hpp>
#include <migraphx/optional.hpp>
#include <unordered_map>
#include <utility>
@@ -138,6 +139,7 @@ value::value(const std::string& pkey, const value& rhs)
{
}
value::value(const std::string& pkey, const char* i) : value(pkey, std::string(i)) {}
value::value(const char* i) : value(std::string(i)) {}
#define MIGRAPHX_VALUE_GENERATE_DEFINE_METHODS(vt, cpp_type) \
@@ -161,6 +163,12 @@ value::value(const char* i) : value(std::string(i)) {}
const cpp_type* value::if_##vt() const { return x ? x->if_##vt() : nullptr; }
MIGRAPHX_VISIT_VALUE_TYPES(MIGRAPHX_VALUE_GENERATE_DEFINE_METHODS)
value& value::operator=(const char* c)
{
*this = std::string{c};
return *this;
}
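// Why the const char* overloads above matter (an inference, not from the
// commit message): without them a string literal binds through the built-in
// pointer-to-bool conversion, which outranks the user-defined conversion
// to std::string during overload resolution, e.g.:
//   migraphx::value v;
//   v = "key"; // with the overload: stores the string "key";
//              // without it: the bool assignment could be selected instead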
value& value::operator=(std::nullptr_t)
{
x = nullptr;
@@ -410,25 +418,12 @@ value value::with_key(const std::string& pkey) const
return result;
}
template <class F, class T, class U, class Common = typename std::common_type<T, U>::type>
auto compare_common_impl(
rank<1>, F f, const std::string& keyx, const T& x, const std::string& keyy, const U& y)
{
return f(std::forward_as_tuple(keyx, Common(x)), std::forward_as_tuple(keyy, Common(y)));
}
template <class F>
auto compare_common_impl(
rank<1>, F f, const std::string& keyx, std::nullptr_t, const std::string& keyy, std::nullptr_t)
{
return f(std::forward_as_tuple(keyx, 0), std::forward_as_tuple(keyy, 0));
}
template <class F, class T, class U>
auto compare_common_impl(rank<0>, F, const std::string&, const T&, const std::string&, const U&)
template <class T>
const T& compare_decay(const T& x)
{
return false;
return x;
}
int compare_decay(std::nullptr_t) { return 0; }
template <class F>
bool compare(const value& x, const value& y, F f)
@@ -436,7 +431,11 @@ bool compare(const value& x, const value& y, F f)
bool result = false;
x.visit_value([&](auto&& a) {
y.visit_value([&](auto&& b) {
result = compare_common_impl(rank<1>{}, f, x.get_key(), a, y.get_key(), b);
if constexpr(std::is_same<decltype(a), decltype(b)>{})
result = f(std::forward_as_tuple(x.get_key(), compare_decay(a)),
std::forward_as_tuple(y.get_key(), compare_decay(b)));
else
assert(false); // NOLINT
});
});
return result;
@@ -455,11 +454,16 @@ bool operator==(const value& x, const value& y)
return false;
return compare(x, y, std::equal_to<>{});
}
bool operator!=(const value& x, const value& y) { return !(x == y); }
bool operator<(const value& x, const value& y) { return compare(x, y, std::less<>{}); }
bool operator<=(const value& x, const value& y) { return x == y or x < y; }
bool operator!=(const value& x, const value& y) { return not(x == y); }
bool operator<(const value& x, const value& y)
{
if(x.get_type() != y.get_type())
return x.get_type() < y.get_type();
return compare(x, y, std::less<>{});
}
bool operator<=(const value& x, const value& y) { return not(x > y); }
bool operator>(const value& x, const value& y) { return y < x; }
bool operator>=(const value& x, const value& y) { return x == y or x > y; }
bool operator>=(const value& x, const value& y) { return not(x < y); }
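// Net effect of the comparison changes (a reading of the code above):
// - operator< now orders mixed-type values by type tag first and only calls
//   compare() when the types agree, which is why compare()'s visitor can
//   assert on mismatched alternatives in its if constexpr branch.
// - compare_decay maps nullptr to int{0} so std::less/std::equal_to have an
//   orderable stand-in for the null alternative.
// - <=, >, and >= are all derived from < (e.g. x <= y is not(x > y)), so the
//   six operators stay consistent with one total order: for any two unequal
//   values exactly one of x < y and y < x holds, making sorting well-defined.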
void print_value(std::ostream& os, std::nullptr_t) { os << "null"; }
......
@@ -90,7 +90,7 @@ function(add_test_executable TEST_NAME)
target_include_directories(${TEST_NAME} PUBLIC include)
endfunction(add_test_executable)
file(GLOB TESTS *.cpp)
file(GLOB TESTS ${CONFIGURE_DEPENDS} *.cpp)
foreach(TEST ${TESTS})
get_filename_component(BASE_NAME ${TEST} NAME_WE)
@@ -100,7 +100,7 @@ endforeach()
if(MIGRAPHX_ENABLE_GPU)
# gpu tests
file(GLOB GPU_TESTS gpu/*.cpp)
file(GLOB GPU_TESTS ${CONFIGURE_DEPENDS} gpu/*.cpp)
foreach(TEST ${GPU_TESTS})
get_filename_component(BASE_NAME ${TEST} NAME_WE)
@@ -120,7 +120,7 @@ file (GLOB ONNX_TESTS ${TEST_ONNX_DIR}/*.cpp)
foreach(ONNX_TEST ${ONNX_TESTS})
get_filename_component(BASE_NAME ${ONNX_TEST} NAME_WE)
set(TEST_NAME test_${BASE_NAME})
add_executable(${TEST_NAME} ${TES_ONNX_DIR}/${ONNX_TEST})
add_executable(${TEST_NAME} ${ONNX_TEST})
rocm_clang_tidy_check(${TEST_NAME})
target_link_libraries(${TEST_NAME} migraphx_onnx migraphx_ref)
target_include_directories(${TEST_NAME} PUBLIC include)
@@ -160,7 +160,7 @@ function(test_header NAME HEADER)
endfunction()
function(test_headers PREFIX)
file(GLOB HEADERS ${ARGN})
file(GLOB HEADERS ${CONFIGURE_DEPENDS} ${ARGN})
foreach(HEADER ${HEADERS})
file(RELATIVE_PATH HEADER_REL ${CMAKE_SOURCE_DIR} ${HEADER})
......
#include <migraphx/any_ptr.hpp>
#include <test.hpp>
TEST_CASE(test_int_id)
{
int i = 1;
migraphx::any_ptr p = &i;
EXPECT(p.get<int*>() == &i);
EXPECT(p.get(migraphx::get_type_name(i)) == &i);
EXPECT(p.unsafe_get() == &i);
EXPECT(test::throws([&] { p.get<float*>(); }));
EXPECT(test::throws([&] { p.get(migraphx::get_type_name(&i)); }));
}
TEST_CASE(test_int_name)
{
int i = 1;
void* vp = &i;
migraphx::any_ptr p{vp, migraphx::get_type_name(i)};
EXPECT(p.get<int*>() == &i);
EXPECT(p.get(migraphx::get_type_name(i)) == &i);
EXPECT(p.unsafe_get() == &i);
EXPECT(test::throws([&] { p.get<float*>(); }));
EXPECT(test::throws([&] { p.get(migraphx::get_type_name(&i)); }));
EXPECT(test::throws([&] { p.get(migraphx::get_type_name(float{})); }));
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }
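In summary, the new any_ptr tests above pin down the checking behavior (drawn directly from the assertions):

int i = 1;
migraphx::any_ptr p = &i;            // stores the void* plus int's type name
p.get<int*>();                       // checked typed access: must match exactly
p.get(migraphx::get_type_name(i));   // checked access by type-name string
p.unsafe_get();                      // raw void*, no checking
// a mismatched type (p.get<float*>()) or a wrong name string throws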
function(add_api_test TEST_NAME TEST_SRC TEST_DIR)
set(NAME test_api_${TEST_NAME})
add_executable(${NAME} EXCLUDE_FROM_ALL ${TEST_SRC})
rocm_clang_tidy_check(${NAME})
target_link_libraries(${NAME} migraphx_c)
target_link_libraries(${NAME} migraphx_c migraphx)
target_include_directories(${NAME} PUBLIC ../include)
add_test(NAME ${NAME} COMMAND $<TARGET_FILE:${NAME}> WORKING_DIRECTORY ${TEST_DIR})
add_dependencies(tests ${NAME})
add_dependencies(check ${NAME})
endfunction()
add_api_test(array_base test_array_base.cpp ${TEST_ONNX_DIR})
add_api_test(assign test_assign.cpp ${TEST_ONNX_DIR})
add_api_test(custom_op test_custom_op.cpp ${TEST_ONNX_DIR})
add_api_test(compile_options test_compile_options.cpp ${TEST_ONNX_DIR})
add_api_test(lookup test_lookup.cpp ${TEST_ONNX_DIR})
add_api_test(module_construct test_module_construct.cpp ${TEST_ONNX_DIR})
add_api_test(ref test_cpu.cpp ${TEST_ONNX_DIR})
add_api_test(save_load test_save_load.cpp ${TEST_ONNX_DIR})
add_api_test(op test_op_construct.cpp ${TEST_ONNX_DIR})
add_api_test(tf_parser test_tf_parser.cpp ${TEST_TF_DIR})
# GPU-based tests
if(MIGRAPHX_ENABLE_GPU)
add_api_test(gpu test_gpu.cpp ${TEST_ONNX_DIR})
# GPU-based tests
target_link_libraries(test_api_gpu migraphx_gpu)
endif()
#include <migraphx/migraphx.hpp>
#include "test.hpp"
struct array2 : migraphx::array_base<array2>
{
std::vector<int> v;
array2() = default;
array2(std::initializer_list<int> x) : v(x) {}
std::size_t size() const { return v.size(); }
int operator[](std::size_t i) const { return v[i]; }
};
TEST_CASE(iterators)
{
array2 a = {1, 2, 3};
EXPECT(bool{std::equal(a.begin(), a.end(), a.v.begin())});
}
TEST_CASE(front_back)
{
array2 a = {1, 2, 3};
EXPECT(a.front() == 1);
EXPECT(a.back() == 3);
}
TEST_CASE(empty)
{
array2 a = {1, 2, 3};
EXPECT(not a.empty());
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }
#include <migraphx/migraphx.h>
#include <migraphx/migraphx.hpp>
#include "test.hpp"
TEST_CASE(shape_assign)
{
auto s1_cpp = migraphx::shape{migraphx_shape_float_type, {1, 3}};
std::vector<size_t> lens{2, 3};
// the handle ptr is const; as a workaround, construct the shape using the C API
migraphx_shape_t s2;
migraphx_shape_create(&s2, migraphx_shape_float_type, lens.data(), lens.size());
auto s2_cpp = migraphx::shape(s2, migraphx::own{});
CHECK(bool{s1_cpp != s2_cpp});
// use C++ API for assignment
s1_cpp.assign_to_handle(s2);
CHECK(bool{s1_cpp == s2_cpp});
auto s3_cpp = migraphx::shape{migraphx_shape_float_type, lens};
// use C API for assignment
migraphx_shape_assign_to(s2, s3_cpp.get_handle_ptr());
CHECK(bool{s2_cpp == s3_cpp});
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }