Commit 11e155c2 authored by Paul's avatar Paul
Browse files

Merge

parents 8a9c5bce aa7ff911
......@@ -6,13 +6,15 @@
namespace migraphx {
using index_int = std::uint32_t;
using diff_int = std::int32_t;
#define MIGRAPHX_DEVICE_CONSTEXPR constexpr __device__ __host__ // NOLINT
template <class T, index_int N>
using vec = T __attribute__((ext_vector_type(N)));
using half = _Float16;
using half = _Float16;
using half2 = migraphx::vec<half, 2>;
} // namespace migraphx
......
......@@ -46,6 +46,9 @@ constexpr auto vec_at(T x, I i)
}
}
// Scalar lane type of T: for a vector this is the element type, for a
// scalar it is T itself (vec_at returns the value unchanged in that case).
template <class T>
using vec_type = decltype(vec_at(T{}, 0));
template <class... Ts>
constexpr auto common_vec_size()
{
......@@ -57,24 +60,36 @@ constexpr auto common_vec_size()
})(vec_size<Ts>()...);
}
// Bools can not be used as a vector type so convert it to uint8
// Pass-through overload for every non-bool element type.
template <class T>
__device__ __host__ T* remove_bool(T* x)
{
    return x;
}
// bool overload: reinterpret the storage as uint8_t so it can be vectorized.
// NOTE(review): assumes sizeof(bool) == 1 on the target -- confirm for the toolchain.
inline __device__ __host__ uint8_t* remove_bool(bool* x) { return reinterpret_cast<uint8_t*>(x); }
template <index_int N, class T>
__device__ __host__ auto as_vec(T* x)
{
if constexpr(N == 0)
if constexpr(N < 2)
return x;
else
return reinterpret_cast<vec<T, N>*>(x);
}
// Vector of N lanes whose element type is T, except bool is stored as
// uint8_t since bool is not a valid vector element type.
template <class T, index_int N>
using safe_vec = vec<conditional_t<is_same<T, bool>{}, uint8_t, T>, N>;
template <class... Ts>
constexpr auto vec_transform(Ts... xs)
{
return [=](auto f) {
if constexpr(is_any_vec<Ts...>())
{
using type = decltype(f(vec_at(xs, 0)...));
constexpr auto size = common_vec_size<Ts...>();
vec<type, size> result = {0};
using type = decltype(f(vec_at(xs, 0)...));
constexpr auto size = common_vec_size<Ts...>();
safe_vec<type, size> result = {0};
for(int i = 0; i < size; i++)
result[i] = f(vec_at(xs, i)...);
return result;
......@@ -86,5 +101,64 @@ constexpr auto vec_transform(Ts... xs)
};
}
// Return a vector type of N from index i in another larger vector
// N will be 2 for half2 packing
template <index_int N, class T, class I>
constexpr vec<vec_type<T>, N> vec_packed_at(T x, I i)
{
    if constexpr(vec_size<T>() == 0)
        // Scalar input: broadcast it into all N lanes
        return vec<T, N>{x};
    else
    {
        // The pack reads lanes [i, i + N), so i + N may legitimately equal
        // the vector size for the final pack (e.g. i == size - N when
        // vec_packed_transform iterates size/N packs). The previous strict
        // '<' bound incorrectly rejected that last valid pack.
        MIGRAPHX_ASSERT((i + N) <= vec_size<T>());
        vec<vec_type<T>, N> result = {0};
        // Copy N consecutive lanes starting at i into the packed result
        for(int j = 0; j < N; j++)
        {
            result[j] = x[i + j];
        }
        return result;
    }
}
// Apply f to the inputs N lanes at a time: each call to f receives packed
// sub-vectors of width N (N == 2 for half2 math) and its N-lane result is
// copied back into the full-width result vector.
template <index_int N, class... Ts>
constexpr auto vec_packed_transform(Ts... xs)
{
    return [=](auto f) {
        if constexpr(is_any_vec<Ts...>())
        {
            // Lane type of f's packed result; safe_vec maps bool -> uint8_t
            using type = vec_type<decltype(f(vec_packed_at<N>(xs, 0)...))>;
            constexpr auto size = common_vec_size<Ts...>();
            safe_vec<type, size> result = {0};
            // NOTE(review): assumes N divides size; any tail lanes beyond
            // (size / N) * N would stay zero-initialized -- confirm callers
            // guarantee divisibility.
            for(int i = 0; i < size / N; i++)
            {
                // Call the function with packed vectors
                safe_vec<type, N> r = f(vec_packed_at<N>(xs, i * N)...);
                // Copy the packed vectors to the result
                for(int j = 0; j < N; j++)
                    result[i * N + j] = r[j];
            }
            return result;
        }
        else
        {
            // No vector inputs: invoke f directly on the scalars
            return f(xs...);
        }
    };
}
template <class T, class Op>
constexpr auto vec_reduce(T x, Op op)
{
if constexpr(vec_size<T>() < 2)
return x;
else
{
vec_type<T> result = x[0];
for(int i = 1; i < vec_size<T>(); i++)
result = op(result, x[i]);
return result;
}
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_VEC_HPP
......@@ -50,19 +50,10 @@ constexpr auto shape_step(Shape s, Axis)
});
}
// Bools can not be used as a vector type so convert it to int8
template <class T>
__device__ __host__ T* remove_bool(T* x)
{
return x;
}
inline __device__ __host__ int8_t* remove_bool(bool* x) { return reinterpret_cast<int8_t*>(x); }
template <index_int N, class T, class Axis>
__device__ __host__ auto as_vec(T x, Axis axis)
{
if constexpr(N == 0)
if constexpr(N < 2)
return x;
else
return make_tensor_view(as_vec<N>(remove_bool(x.data())),
......@@ -72,7 +63,7 @@ __device__ __host__ auto as_vec(T x, Axis axis)
template <index_int N, class T, class Axis>
constexpr auto tensor_step(T x, Axis axis)
{
if constexpr(N == 0)
if constexpr(N < 2)
{
return x;
}
......@@ -157,11 +148,11 @@ constexpr auto find_vectorize_size(P pred)
else if constexpr(decltype(pred(_c<2>)){})
return _c<2>;
else
return _c<0>;
return _c<1>;
}
template <class T>
__host__ __device__ auto vectorize(T x)
__host__ __device__ auto auto_vectorize(T x)
{
if constexpr(tensor_vec_size<T>() == 0)
{
......@@ -194,7 +185,7 @@ inline __device__ __host__ auto auto_vectorize_impl(F f, Ts... xs)
{
MIGRAPHX_ASSERT(s.strides[axis] == 0 or s.strides[axis] == 1);
MIGRAPHX_ASSERT(s.lens[axis] > 0);
MIGRAPHX_ASSERT(n == 0 or s.lens[axis] % n == 0);
MIGRAPHX_ASSERT(n == 1 or s.lens[axis] % n == 0);
if constexpr(s.strides[axis] == 0)
return tensor_step<n>(x, axis);
else
......@@ -215,7 +206,34 @@ inline __device__ __host__ auto auto_vectorize_impl(F f, Ts... xs)
inline __device__ __host__ auto auto_vectorize()
{
return [](auto... xs) { return [=](auto f) { auto_vectorize_impl(f, xs...); }; };
return make_transform([](auto f, auto... xs) { auto_vectorize_impl(f, xs...); });
}
// Vectorize a single tensor view along Axis with vector width N.
// Length-1 axes are left untouched, broadcast axes (stride 0) go through
// tensor_step, and everything else is reinterpreted as a vector via as_vec.
template <index_int N, index_int Axis, class T>
__device__ __host__ auto vectorize_tensor(T x)
{
    constexpr auto shape = get_shape_c<T>{};
    if constexpr(shape.lens[Axis] == 1)
        return x;
    else if constexpr(shape.strides[Axis] == 0)
        return tensor_step<N>(x, _c<Axis>);
    else
        return as_vec<N>(x, _c<Axis>);
}
// Transform that vectorizes every tensor argument along Axis with width N
// before invoking the wrapped function f. N < 2 means no vectorization,
// so the arguments are forwarded unchanged.
template <index_int N, index_int Axis>
__device__ __host__ auto vectorize()
{
    return make_transform([](auto f, auto... xs) {
        if constexpr(N < 2)
        {
            f(xs...);
        }
        else
        {
            f(vectorize_tensor<N, Axis>(xs)...);
        }
    });
}
} // namespace migraphx
......
......@@ -20,10 +20,10 @@
#include <migraphx/gpu/abs.hpp>
#include <migraphx/gpu/batch_norm_inference.hpp>
#include <migraphx/gpu/compile_roialign.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/convolution.hpp>
#include <migraphx/gpu/deconvolution.hpp>
#include <migraphx/gpu/device_name.hpp>
#include <migraphx/gpu/elu.hpp>
#include <migraphx/gpu/equal.hpp>
#include <migraphx/gpu/gemm.hpp>
......@@ -40,6 +40,7 @@
#include <migraphx/gpu/rocblas.hpp>
#include <migraphx/gpu/unary_not.hpp>
#include <migraphx/gpu/where.hpp>
#include <migraphx/gpu/compiler.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/program.hpp>
#include <utility>
......@@ -60,6 +61,7 @@ struct miopen_apply
std::unordered_map<instruction_ref, std::string> prog_output_names{};
bool offload_copy = false;
bool int8_x4_format = true;
bool compute_fp32 = false;
context& get_context() const
{
......@@ -96,13 +98,22 @@ struct miopen_apply
}
}
// GPU architectures on which rocBLAS gemm should use fp32 compute.
// Built once on first call and shared thereafter.
const std::unordered_set<std::string>& get_rocblas_fp32_archs()
{
    static const std::unordered_set<std::string> archs = {"gfx908", "gfx90a"};
    return archs;
}
void init()
{
assert(mod != nullptr);
assert(pass != nullptr);
#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
auto& ctx = get_context();
auto& ctx = get_context();
const auto device_name = trim(split_string(get_device_name(), ':').front());
if(contains(get_rocblas_fp32_archs(), device_name))
compute_fp32 = true;
rocblas_gemm_flags flag;
rocblas_query_int8_layout_flag(ctx.get_stream().get_rocblas(), &flag);
int8_x4_format = (flag == rocblas_gemm_flags_pack_int8x4);
......@@ -170,21 +181,14 @@ struct miopen_apply
add_extend_op("pad");
add_extend_op("pooling");
add_extend_op("prefix_scan_sum");
add_extend_op("reduce_max");
add_extend_op("reduce_mean");
add_extend_op("reduce_min");
add_extend_op("reduce_prod");
add_extend_op("reduce_sum");
add_extend_op("reverse");
add_extend_op("rnn_var_sl_last_output");
add_extend_op("rnn_var_sl_shift_output");
add_extend_op("rnn_var_sl_shift_sequence");
add_extend_op("scatter");
add_extend_op("scatter_none");
add_extend_op("softmax");
add_extend_op("topk");
add_precompile_op("pointwise");
add_batch_norm_inference_op();
add_convolution_op();
add_deconvolution_op();
......@@ -195,7 +199,6 @@ struct miopen_apply
add_neg_op();
add_nms_op();
add_quant_convolution_op();
add_roialign();
}
void copy_params()
......@@ -249,11 +252,28 @@ struct miopen_apply
{
check_shape(s, apply_map.at(it->name())(it));
}
else if(has_compiler_for(it->name()))
{
check_shape(s, insert_precompile_op(it));
}
}
copy_params();
}
// Replace ins with a gpu::precompile_op wrapping the original operator.
// The output allocation is appended after the inputs, following the gpu
// convention that the final argument is the destination buffer.
instruction_ref insert_precompile_op(instruction_ref ins)
{
    auto output                       = insert_allocation(ins, ins->get_shape());
    std::vector<instruction_ref> refs = ins->inputs();
    refs.push_back(output);

    return mod->replace_instruction(
        ins,
        make_op("gpu::precompile_op", {{"op", to_value(ins->get_operator())}}),
        refs,
        ins->module_inputs());
}
instruction_ref insert_allocation(instruction_ref ins, const shape& s, std::string tag = "")
{
// Instruction's output is an input of the ret instruction
......@@ -337,7 +357,7 @@ struct miopen_apply
}
}
return mod->replace_instruction(
ins, rocblas_gemm<Op>{Op{}, 1, 0, int8_x4_format}, refs);
ins, rocblas_gemm<Op>{Op{}, 1, 0, int8_x4_format, compute_fp32}, refs);
});
}
......@@ -345,8 +365,22 @@ struct miopen_apply
{
apply_map.emplace("quant_convolution", [=](instruction_ref ins) {
auto&& op = any_cast<op::quant_convolution>(ins->get_operator());
auto conv = miopen_quant_convolution{op, make_conv(op)};
auto ws = conv.compile(get_context(), ins->get_shape(), to_shapes(ins->inputs()));
shape ws;
miopen_quant_convolution conv;
auto compile_quant_conv_with_format = [&](bool format) {
conv = miopen_quant_convolution{op, format, make_conv(op)};
ws = conv.compile(get_context(), ins->get_shape(), to_shapes(ins->inputs()));
};
try
{
compile_quant_conv_with_format(int8_x4_format);
}
catch(migraphx::exception&)
{
// In case no solver supports the default format, retry using the other format.
compile_quant_conv_with_format(!int8_x4_format);
}
auto args = ins->inputs();
auto workspace = insert_allocation(ins, ws, "workspace");
......@@ -356,6 +390,9 @@ struct miopen_apply
});
}
// add_generic_op just default-constructs the gpu operator (it has no
// fields), whereas add_extend_op copies the fields over from the original
// operator.
void add_generic_op(const std::string& name) { add_generic_op(name, "gpu::" + name); }
void add_generic_op(const std::string& op_name, const std::string& gpu_name)
......@@ -383,21 +420,6 @@ struct miopen_apply
});
}
// Map the named operator to a gpu::precompile_op that wraps the original
// operator (presumably compiled into a kernel by a later pass -- confirm).
// The output allocation is appended as the final argument.
void add_precompile_op(const std::string& name)
{
    apply_map.emplace(name, [=](instruction_ref ins) {
        auto output                       = insert_allocation(ins, ins->get_shape());
        std::vector<instruction_ref> refs = ins->inputs();
        refs.push_back(output);

        return mod->replace_instruction(
            ins,
            make_op("gpu::precompile_op", {{"op", to_value(ins->get_operator())}}),
            refs,
            ins->module_inputs());
    });
}
void add_batch_norm_inference_op()
{
apply_map.emplace("batch_norm_inference", [=](instruction_ref ins) {
......@@ -432,7 +454,6 @@ struct miopen_apply
reshapes[2],
reshapes[3],
output);
});
}
......@@ -489,22 +510,6 @@ struct miopen_apply
});
}
// Lower roialign: compile a code object for the instruction's io shapes via
// compile_roialign and replace the instruction with it; the output buffer
// is passed as the last argument.
void add_roialign()
{
    apply_map.emplace("roialign", [=](instruction_ref ins) {
        auto s      = ins->get_shape();
        auto op_val = ins->get_operator().to_value();
        auto output = insert_allocation(ins, s);
        auto args   = ins->inputs();
        args.push_back(output);
        auto io_shapes = to_shapes(args);
        auto co        = compile_roialign(get_context(), io_shapes, op_val);
        return mod->replace_instruction(ins, co, args);
    });
}
// replace the loop operator with gpu_loop operator
void add_loop_op()
{
......
......@@ -22,10 +22,10 @@ static instruction_ref pad_ins(module& m, instruction_ref ins, int offset)
auto pad_k = (k + 3) / 4 * 4;
auto pad_lens = lens;
pad_lens[lens.size() + offset] = pad_k;
std::vector<int64_t> pad_dims(lens.size() * 2, 0);
auto ret_ins = ins;
auto ret_ins = ins;
if(pad_k != k)
{
std::vector<int64_t> pad_dims(lens.size() * 2, 0);
pad_dims[lens.size() + offset] = pad_k - k;
shape ps{s.type(), pad_lens};
auto ins_out =
......@@ -118,7 +118,7 @@ void pack_int8_args::apply(module& m) const
assert(val.contains("int8_x4_format"));
if(not val.at("int8_x4_format").to<bool>())
{
return;
continue;
}
auto inputs = ins->inputs();
auto lens = inputs.at(0)->get_shape().lens();
......@@ -156,6 +156,12 @@ void pack_int8_args::apply(module& m) const
}
else if(ins->name() == "gpu::quant_convolution")
{
auto val = ins->get_operator().to_value();
if(not val.at("int8_x4_format").to<bool>())
{
continue;
}
auto inputs = ins->inputs();
auto packed_x = m.insert_instruction(
ins,
......
#include <migraphx/gpu/prefuse_ops.hpp>
#include <migraphx/match/layernorm.hpp>
#include <migraphx/make_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace {
// Matches a layernorm pattern and replaces it with the fused gpu::layernorm
// kernel, inserting an allocation for its output buffer.
struct find_layernorm
{
    auto matcher() const { return match::layernorm(); }

    void apply(module& m, const match::matcher_result& r) const
    {
        auto ins   = r.result;
        auto x_ins = r.instructions["x"];
        // The fused kernel needs a standard (packed) input layout
        if(not x_ins->get_shape().standard())
            x_ins = m.insert_instruction(ins, make_op("contiguous"), x_ins);

        // Size of the innermost (reduced) dimension
        auto relements = x_ins->get_shape().lens().back();
        // Skip shapes the kernel does not handle: reductions larger than
        // 1024, or over 256 when not a multiple of 4 (presumably so lanes
        // can be vectorized by 4 -- confirm against the kernel).
        if(relements > 1024 or (relements % 4 != 0 and relements > 256))
            return;

        auto a = m.insert_instruction(
            ins, make_op("hip::allocate", {{"shape", to_value(x_ins->get_shape())}}));
        m.replace_instruction(ins, make_op("gpu::layernorm"), x_ins, a);
    }
};
// Matches a layernorm whose input is the sum of three tensors and fuses the
// whole pattern into one gpu::triadd_layernorm kernel.
struct find_triaddlayernorm
{
    auto matcher() const
    {
        // add1: a non-constant add of two arbitrary inputs, bound as z1/z2
        auto add1 =
            match::name("add")(match::none_of(match::is_constant()),
                               match::args(match::any().bind("z1"), match::any().bind("z2")));
        // add2: adds a third input z3 to add1, in either argument order
        auto add2 = match::name("add")(match::either_arg(0, 1)(add1, match::any().bind("z3")));
        // The layernorm's "x" input must be the triple add
        return match::layernorm()(match::var("x")(add2));
    }

    void apply(module& m, const match::matcher_result& r) const
    {
        auto ins   = r.result;
        auto x_ins = r.instructions["z1"];
        auto y_ins = r.instructions["z2"];
        auto z_ins = r.instructions["z3"];
        // The fused kernel needs standard layouts for all three inputs;
        // insert contiguous where needed
        for(auto* pins : {&x_ins, &y_ins, &z_ins})
        {
            if(not(*pins)->get_shape().standard())
                *pins = m.insert_instruction(ins, make_op("contiguous"), *pins);
        }

        // Same reduction-size restrictions as the plain layernorm fusion
        auto relements = x_ins->get_shape().lens().back();
        if(relements > 1024 or (relements % 4 != 0 and relements > 256))
            return;

        auto a = m.insert_instruction(
            ins, make_op("hip::allocate", {{"shape", to_value(x_ins->get_shape())}}));
        m.replace_instruction(ins, make_op("gpu::triadd_layernorm"), x_ins, y_ins, z_ins, a);
    }
};
} // namespace
// Run the layernorm fusion matchers over the module. triadd is listed
// before the plain layernorm matcher, presumably so the more specific
// pattern is preferred -- confirm find_matches ordering semantics.
void prefuse_ops::apply(module& m) const
{
    match::find_matches(m, find_triaddlayernorm{}, find_layernorm{});
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -16,8 +16,8 @@ argument miopen_quant_convolution::compute(context& ctx,
const shape& output_shape,
const std::vector<argument>& args) const
{
auto x_desc = make_tensor(args[0].get_shape(), true);
auto w_desc = make_tensor(args[1].get_shape(), true);
auto x_desc = make_tensor(args[0].get_shape(), int8_x4_format);
auto w_desc = make_tensor(args[1].get_shape(), int8_x4_format);
auto y_desc = make_tensor(output_shape);
float alpha = 1;
......@@ -49,8 +49,8 @@ shape miopen_quant_convolution::compile(context& ctx,
std::vector<shape> inputs)
{
shape workspace_shape{};
auto x_desc = make_tensor(inputs[0], true);
auto w_desc = make_tensor(inputs[1], true);
auto x_desc = make_tensor(inputs[0], int8_x4_format);
auto w_desc = make_tensor(inputs[1], int8_x4_format);
auto y_desc = make_tensor(output_shape);
std::size_t workspace_size = 0;
......@@ -62,8 +62,15 @@ shape miopen_quant_convolution::compile(context& ctx,
&workspace_size);
workspace_shape = shape{shape::int8_type, {workspace_size}};
auto arg_vec4_x = to_gpu(generate_argument(pack_int8_shape(inputs[0])));
auto arg_vec4_w = to_gpu(generate_argument(pack_int8_shape(inputs[1])));
auto x_shape = inputs[0];
auto w_shape = inputs[1];
if(int8_x4_format)
{
x_shape = pack_int8_shape(x_shape);
w_shape = pack_int8_shape(w_shape);
}
auto arg_vec4_x = to_gpu(generate_argument(x_shape));
auto arg_vec4_w = to_gpu(generate_argument(w_shape));
auto y = allocate_gpu(output_shape);
auto workspace = allocate_gpu(workspace_shape);
......
......@@ -77,28 +77,28 @@ MIGRAPHX_REGISTER_OP(wait_event)
MIGRAPHX_REGISTER_OP(set_stream)
std::size_t schedule_model::concurrency() const { return streams; }
void schedule_model::sched(module& p, instruction_ref ins, std::size_t n) const
void schedule_model::sched(module& m, instruction_ref ins, std::size_t n) const
{
auto last_stream = std::find_if(std::make_reverse_iterator(ins),
std::make_reverse_iterator(p.begin()),
std::make_reverse_iterator(m.begin()),
[&](auto&& i) { return i.name() == "gpu::set_stream"; });
if(last_stream != std::make_reverse_iterator(p.begin()))
if(last_stream != std::make_reverse_iterator(m.begin()))
{
auto&& op = any_cast<set_stream>(last_stream->get_operator());
// If the same stream was set earlier then skip
if(op.stream == n)
return;
}
p.insert_instruction(ins, set_stream{n});
m.insert_instruction(ins, set_stream{n});
}
void schedule_model::wait(module& p, instruction_ref ins, std::size_t wait_id) const
void schedule_model::wait(module& m, instruction_ref ins, std::size_t wait_id) const
{
p.insert_instruction(ins, wait_event{wait_id});
m.insert_instruction(ins, wait_event{wait_id});
}
void schedule_model::record(module& p, instruction_ref ins, std::size_t wait_id) const
void schedule_model::record(module& m, instruction_ref ins, std::size_t wait_id) const
{
p.insert_instruction(std::next(ins), record_event{wait_id});
m.insert_instruction(std::next(ins), record_event{wait_id});
}
static std::unordered_map<std::string, std::size_t> create_weight_map()
......
......@@ -8,9 +8,9 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
void sync_device::apply(module& p) const
void sync_device::apply(module& m) const
{
auto last = std::prev(p.end());
auto last = std::prev(m.end());
if(last->name() == "@return")
{
auto inputs = last->inputs();
......@@ -18,10 +18,10 @@ void sync_device::apply(module& p) const
return (i->name() == "hip::copy_from_gpu");
}))
{
auto sync_in = p.insert_instruction(last, make_op("hip::sync_stream"), inputs);
auto sync_in = m.insert_instruction(last, make_op("hip::sync_stream"), inputs);
if(not inputs.empty())
{
p.replace_instruction(inputs.front(), sync_in);
m.replace_instruction(inputs.front(), sync_in);
}
}
}
......
......@@ -31,6 +31,7 @@
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/eliminate_workspace.hpp>
#include <migraphx/gpu/fuse_ops.hpp>
#include <migraphx/gpu/prefuse_ops.hpp>
#include <migraphx/gpu/lowering.hpp>
#include <migraphx/gpu/mlir_conv.hpp>
#include <migraphx/gpu/pack_int8_args.hpp>
......@@ -44,7 +45,7 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_SCHEDULE_PASS)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_POINTWISE_FUSION)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_POINTWISE_FUSION)
struct id_pass
{
......@@ -96,11 +97,13 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
simplify_algebra{},
simplify_reshapes{},
simplify_algebra{},
prefuse_ops{},
dead_code_elimination{},
auto_contiguous{},
simplify_reshapes{},
propagate_constant{},
dead_code_elimination{},
enable_pass(enabled(MIGRAPHX_ENABLE_POINTWISE_FUSION{}), fuse_pointwise{}),
enable_pass(not enabled(MIGRAPHX_DISABLE_POINTWISE_FUSION{}), fuse_pointwise{}),
dead_code_elimination{},
mlir_conv{&ctx},
lowering{&ctx, options.offload_copy},
......
......@@ -11,25 +11,25 @@ namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_COPY_LITERALS)
void write_literals::apply(module& p) const
void write_literals::apply(module& m) const
{
assert(ctx != nullptr);
std::size_t n = 0;
for(auto ins : iterator_for(p))
for(auto ins : iterator_for(m))
{
if(ins->name() == "@literal")
{
if(enabled(MIGRAPHX_COPY_LITERALS{}))
{
literal l = ins->get_literal();
auto pre = p.add_literal(l);
auto alloc = p.insert_instruction(std::next(pre), hip_allocate{l.get_shape()});
p.replace_instruction(ins, hip_copy_to_gpu{}, pre, alloc);
auto pre = m.add_literal(l);
auto alloc = m.insert_instruction(std::next(pre), hip_allocate{l.get_shape()});
m.replace_instruction(ins, hip_copy_to_gpu{}, pre, alloc);
}
else
{
std::string id = p.name() + ":@literal:" + std::to_string(n);
p.replace_instruction(ins, hip_copy_literal{ins->get_literal(), id});
std::string id = m.name() + ":@literal:" + std::to_string(n);
m.replace_instruction(ins, hip_copy_literal{ins->get_literal(), id});
n++;
}
}
......
......@@ -15,8 +15,6 @@ target_link_libraries(migraphx_ref migraphx Threads::Threads)
target_include_directories(migraphx_ref PRIVATE ${BLAZE_INCLUDE})
target_compile_definitions(migraphx_ref PRIVATE -DBLAZE_USE_CPP_THREADS)
target_link_libraries(migraphx_all_targets INTERFACE migraphx_ref)
rocm_install_targets(
TARGETS migraphx_ref
INCLUDE
......
#include <migraphx/ref/gemm.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/requires.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/par_for.hpp>
#include <blaze/math/CustomMatrix.h>
namespace migraphx {
......@@ -74,8 +74,10 @@ void migemm_impl(
assert(amat.get_shape().lens()[dim_1] == bmat.get_shape().lens()[dim_0]);
assert(cmat.get_shape().lens()[dim_0] == amat.get_shape().lens()[dim_0]);
assert(cmat.get_shape().lens()[dim_1] == bmat.get_shape().lens()[dim_1]);
auto cs = cmat.get_shape();
shape_for_each(cmat.get_shape(), [&](const auto& c_idx) {
par_for(cs.elements(), [&](auto i) {
auto c_idx = cs.multi(i);
auto a_idx = c_idx;
auto b_idx = c_idx;
double s = 0.0;
......
......@@ -16,7 +16,6 @@
#include <migraphx/op/loop.hpp>
#include <migraphx/op/lrn.hpp>
#include <migraphx/op/pad.hpp>
#include <migraphx/op/pooling.hpp>
#include <migraphx/op/softmax.hpp>
#include <migraphx/op/argmax.hpp>
#include <migraphx/op/argmin.hpp>
......@@ -335,109 +334,6 @@ struct ref_im2col
};
MIGRAPHX_REGISTER_OP(ref_im2col)
// Reduction policy for max pooling: keeps the running maximum.
struct max_pool
{
    // Name used to build the operator identifier
    static std::string name() { return "max"; }

    // Identity element for max: the smallest representable value of T
    template <class T>
    static T start()
    {
        return std::numeric_limits<T>::lowest();
    }

    // Combine two values by taking the larger one
    static double apply(double x, double y) { return std::max(x, y); }

    // Max pooling needs no normalization; the accumulator is the answer
    static double final(double x, std::size_t) { return x; }
};
// Reduction policy for average pooling: sums values, then divides by the
// window element count.
struct avg_pool
{
    // Name used to build the operator identifier
    static std::string name() { return "average"; }

    // Identity element for summation
    template <class T>
    static double start()
    {
        return 0.0;
    }

    // Combine by accumulating the sum
    static double apply(double x, double y) { return x + y; }

    // Normalize by the window size; an empty window averages to 0.0
    static double final(double x, std::size_t y)
    {
        if(y == 0)
            return 0.0;
        return x / y;
    }
};
// Reference (CPU) pooling operator parameterized by a reduction policy Op
// (max_pool or avg_pool above). Wraps op::pooling for shape computation and
// evaluates one output element per parallel task.
template <class Op>
struct ref_pooling : auto_register_op<ref_pooling<Op>>
{
    ref_pooling() = default;
    ref_pooling(op::pooling pop) : op(std::move(pop)) {}

    // The framework pooling op carrying stride/padding/lengths attributes
    op::pooling op;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

    std::string name() const { return "ref::pooling_" + Op::name(); }

    shape compute_shape(const std::vector<shape>& inputs) const
    {
        return op.normalize_compute_shape(inputs);
    }

    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        visit_all(result, args[0])([&](auto output, auto input) {
            using type   = typename decltype(output)::value_type;
            auto in_s    = input.get_shape();
            auto in_lens = in_s.lens();
            std::vector<std::size_t> vec_len(in_lens.begin() + 2, in_lens.end());
            // One output element per parallel task
            par_for(output_shape.elements(), [&](auto i) {
                auto idx_o = output_shape.multi(i);
                auto n_dim = idx_o.size();
                // Compute the padded window clipped to the input bounds for
                // each spatial dimension (dims 0/1 are batch and channel)
                std::vector<std::size_t> win_start;
                std::vector<std::size_t> win_size;
                for(std::size_t dim = 2; dim < n_dim; ++dim)
                {
                    auto d_2  = dim - 2;
                    int start = static_cast<int>(idx_o[dim] * op.stride[d_2]) -
                                static_cast<int>(op.padding[d_2]);
                    int end = std::min(start + op.lengths[d_2], in_lens[dim]);
                    start   = std::max(start, 0);
                    win_start.push_back(start);
                    win_size.push_back(end - start);
                }
                shape win_shape{output_shape.type(), win_size};
                // pool_size counts only in-bounds elements; used by avg final
                auto pool_size = win_shape.elements();
                double acc     = Op::template start<type>();
                // Accumulate every element of the clipped window
                shape_for_each(win_shape, [&](auto idx_w) {
                    auto idx = idx_o;
                    // Translate window-relative coordinates to input coords
                    std::transform(idx_w.begin(),
                                   idx_w.end(),
                                   win_start.begin(),
                                   idx.begin() + 2,
                                   [](auto ii, auto jj) { return ii + jj; });
                    // NOTE(review): ii is unsigned here, so ii >= 0 is always
                    // true; and idx < in_lens compares the index containers
                    // lexicographically rather than element-wise -- confirm
                    // this bounds check behaves as intended.
                    if(std::all_of(idx.begin() + 2, idx.end(), [&](auto ii) { return ii >= 0; }) and
                       idx < in_lens)
                    {
                        acc = Op::apply(acc, input[in_s.index(idx)]);
                    }
                });
                output[i] = type(Op::final(acc, pool_size));
            });
        });
        return result;
    }
};
struct ref_op
{
operation op = op::identity{};
......@@ -609,7 +505,7 @@ struct ref_unary : auto_register_op<ref_unary<Op>>
shape compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(1);
auto s = inputs.at(0);
const auto& s = inputs.at(0);
return {s.type(), s.lens()};
}
......@@ -783,11 +679,7 @@ struct ref_apply
init();
for(auto it : iterator_for(*mod))
{
if(it->name() == "pooling")
{
apply_pooling(it);
}
else if(apply_map.count(it->name()) > 0)
if(apply_map.count(it->name()) > 0)
{
apply_map.at(it->name())(it);
}
......@@ -815,15 +707,6 @@ struct ref_apply
auto&& op = any_cast<Op>(ins->get_operator());
mod->replace_instruction(ins, T{op}, ins->inputs());
}
// Lower a pooling instruction to the reference implementation matching its
// mode. NOTE(review): any mode other than "max"/"average" is silently left
// unlowered -- confirm that is intended.
void apply_pooling(instruction_ref ins) const
{
    auto&& op = any_cast<op::pooling>(ins->get_operator());
    if(op.mode == "max")
        mod->replace_instruction(ins, ref_pooling<max_pool>{op}, ins->inputs());
    else if(op.mode == "average")
        mod->replace_instruction(ins, ref_pooling<avg_pool>{op}, ins->inputs());
}
};
void lowering::apply(module& m) const { ref_apply{&m}.apply(); }
......
......@@ -19,7 +19,7 @@ target_compile_options(tf-proto PRIVATE -w)
target_link_libraries(tf-proto PRIVATE ${PROTOBUF_LIBRARY})
set_target_properties(tf-proto PROPERTIES POSITION_INDEPENDENT_CODE On)
file(GLOB TF_SRCS *.cpp)
file(GLOB TF_SRCS ${CONFIGURE_DEPENDS} *.cpp)
add_library(migraphx_tf ${TF_SRCS})
target_include_directories(migraphx_tf PRIVATE include)
set_target_properties(migraphx_tf PROPERTIES EXPORT_NAME tf)
......
......@@ -19,7 +19,12 @@ struct parse_pooling : op_parser<parse_pooling>
tf_parser::node_info info,
std::vector<instruction_ref> args) const
{
op::pooling op{starts_with(opd.tf_name, "Max") ? "max" : "average"};
if(!starts_with(opd.tf_name, "Max") && !starts_with(opd.tf_name, "Av"))
{
MIGRAPHX_THROW("tf pooling mode must be Max or Average");
}
op::pooling op{starts_with(opd.tf_name, "Max") ? op::pooling_mode::max
: op::pooling_mode::average};
if(contains(info.attributes, "strides"))
{
......
......@@ -499,8 +499,7 @@ literal tf_parser::parse_tensor(const tensorflow::TensorProto& t) const
return create_literal(shape::int64_type, dims, get_data_vals(t.int64_val(), shape_size));
case tensorflow::DataType::DT_BOOL:
return create_literal(shape::int32_type, dims, get_data_vals(t.bool_val(), shape_size));
case tensorflow::DataType::DT_HALF:
{
case tensorflow::DataType::DT_HALF: {
std::vector<int> data_int32 = get_data_vals(t.half_val(), shape_size);
std::vector<uint16_t> data_uint16(data_int32.begin(), data_int32.end());
std::vector<half> data_half;
......
......@@ -4,6 +4,7 @@
#include <migraphx/errors.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/value.hpp>
#include <migraphx/optional.hpp>
#include <unordered_map>
#include <utility>
......@@ -138,6 +139,7 @@ value::value(const std::string& pkey, const value& rhs)
{
}
value::value(const std::string& pkey, const char* i) : value(pkey, std::string(i)) {}
value::value(const char* i) : value(std::string(i)) {}
#define MIGRAPHX_VALUE_GENERATE_DEFINE_METHODS(vt, cpp_type) \
......@@ -161,6 +163,12 @@ value::value(const char* i) : value(std::string(i)) {}
const cpp_type* value::if_##vt() const { return x ? x->if_##vt() : nullptr; }
MIGRAPHX_VISIT_VALUE_TYPES(MIGRAPHX_VALUE_GENERATE_DEFINE_METHODS)
value& value::operator=(const char* c)
{
*this = std::string{c};
return *this;
}
value& value::operator=(std::nullptr_t)
{
x = nullptr;
......@@ -410,25 +418,12 @@ value value::with_key(const std::string& pkey) const
return result;
}
template <class F, class T, class U, class Common = typename std::common_type<T, U>::type>
auto compare_common_impl(
rank<1>, F f, const std::string& keyx, const T& x, const std::string& keyy, const U& y)
{
return f(std::forward_as_tuple(keyx, Common(x)), std::forward_as_tuple(keyy, Common(y)));
}
template <class F>
auto compare_common_impl(
rank<1>, F f, const std::string& keyx, std::nullptr_t, const std::string& keyy, std::nullptr_t)
{
return f(std::forward_as_tuple(keyx, 0), std::forward_as_tuple(keyy, 0));
}
template <class F, class T, class U>
auto compare_common_impl(rank<0>, F, const std::string&, const T&, const std::string&, const U&)
template <class T>
const T& compare_decay(const T& x)
{
return false;
return x;
}
int compare_decay(std::nullptr_t) { return 0; }
template <class F>
bool compare(const value& x, const value& y, F f)
......@@ -436,7 +431,11 @@ bool compare(const value& x, const value& y, F f)
bool result = false;
x.visit_value([&](auto&& a) {
y.visit_value([&](auto&& b) {
result = compare_common_impl(rank<1>{}, f, x.get_key(), a, y.get_key(), b);
if constexpr(std::is_same<decltype(a), decltype(b)>{})
result = f(std::forward_as_tuple(x.get_key(), compare_decay(a)),
std::forward_as_tuple(y.get_key(), compare_decay(b)));
else
assert(false); // NOLINT
});
});
return result;
......@@ -455,11 +454,16 @@ bool operator==(const value& x, const value& y)
return false;
return compare(x, y, std::equal_to<>{});
}
bool operator!=(const value& x, const value& y) { return !(x == y); }
bool operator<(const value& x, const value& y) { return compare(x, y, std::less<>{}); }
bool operator<=(const value& x, const value& y) { return x == y or x < y; }
bool operator!=(const value& x, const value& y) { return not(x == y); }
bool operator<(const value& x, const value& y)
{
if(x.get_type() != y.get_type())
return x.get_type() < y.get_type();
return compare(x, y, std::less<>{});
}
bool operator<=(const value& x, const value& y) { return not(x > y); }
bool operator>(const value& x, const value& y) { return y < x; }
bool operator>=(const value& x, const value& y) { return x == y or x > y; }
bool operator>=(const value& x, const value& y) { return not(x < y); }
void print_value(std::ostream& os, std::nullptr_t) { os << "null"; }
......
......@@ -90,7 +90,7 @@ function(add_test_executable TEST_NAME)
target_include_directories(${TEST_NAME} PUBLIC include)
endfunction(add_test_executable)
file(GLOB TESTS *.cpp)
file(GLOB TESTS ${CONFIGURE_DEPENDS} *.cpp)
foreach(TEST ${TESTS})
get_filename_component(BASE_NAME ${TEST} NAME_WE)
......@@ -100,7 +100,7 @@ endforeach()
if(MIGRAPHX_ENABLE_GPU)
# gpu tests
file(GLOB GPU_TESTS gpu/*.cpp)
file(GLOB GPU_TESTS ${CONFIGURE_DEPENDS} gpu/*.cpp)
foreach(TEST ${GPU_TESTS})
get_filename_component(BASE_NAME ${TEST} NAME_WE)
......@@ -120,7 +120,7 @@ file (GLOB ONNX_TESTS ${TEST_ONNX_DIR}/*.cpp)
foreach(ONNX_TEST ${ONNX_TESTS})
get_filename_component(BASE_NAME ${ONNX_TEST} NAME_WE)
set(TEST_NAME test_${BASE_NAME})
add_executable(${TEST_NAME} ${TES_ONNX_DIR}/${ONNX_TEST})
add_executable(${TEST_NAME} ${ONNX_TEST})
rocm_clang_tidy_check(${TEST_NAME})
target_link_libraries(${TEST_NAME} migraphx_onnx migraphx_ref)
target_include_directories(${TEST_NAME} PUBLIC include)
......@@ -160,7 +160,7 @@ function(test_header NAME HEADER)
endfunction()
function(test_headers PREFIX)
file(GLOB HEADERS ${ARGN})
file(GLOB HEADERS ${CONFIGURE_DEPENDS} ${ARGN})
foreach(HEADER ${HEADERS})
file(RELATIVE_PATH HEADER_REL ${CMAKE_SOURCE_DIR} ${HEADER})
......
#include <migraphx/any_ptr.hpp>
#include <test.hpp>
// any_ptr constructed directly from a typed pointer: retrieval succeeds for
// the exact pointer type or matching type name, and throws otherwise.
TEST_CASE(test_int_id)
{
    int i                = 1;
    migraphx::any_ptr p  = &i;
    EXPECT(p.get<int*>() == &i);
    EXPECT(p.get(migraphx::get_type_name(i)) == &i);
    EXPECT(p.unsafe_get() == &i);
    // Wrong pointee type must throw
    EXPECT(test::throws([&] { p.get<float*>(); }));
    // The type name of int* (rather than int) must not match either
    EXPECT(test::throws([&] { p.get(migraphx::get_type_name(&i)); }));
}
// any_ptr constructed from a void* plus an explicit type name string:
// lookups by matching type or name succeed, any mismatch throws.
TEST_CASE(test_int_name)
{
    int i    = 1;
    void* vp = &i;
    migraphx::any_ptr p{vp, migraphx::get_type_name(i)};
    EXPECT(p.get<int*>() == &i);
    EXPECT(p.get(migraphx::get_type_name(i)) == &i);
    EXPECT(p.unsafe_get() == &i);
    // Wrong pointer type throws
    EXPECT(test::throws([&] { p.get<float*>(); }));
    // Wrong type names (int* and float) throw as well
    EXPECT(test::throws([&] { p.get(migraphx::get_type_name(&i)); }));
    EXPECT(test::throws([&] { p.get(migraphx::get_type_name(float{})); }));
}
// Test driver entry point: runs all registered TEST_CASEs
int main(int argc, const char* argv[]) { test::run(argc, argv); }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment