"doc/git@developer.sourcefind.cn:wqshmzh/ktransformers.git" did not exist on "de3faaf55d2125b02660ce83e625ec0851c304f1"
Unverified commit 661046c6, authored by Paul Fultz II, committed by GitHub

Refactor runtime-compiled kernels to use the same compile_ops pipeline (#1125)

This adds the infrastructure to compile all runtime-compiled kernels in parallel; previously only pointwise kernels were compiled in parallel. It also integrates directly with lowering and the gpu-driver. The pointwise and roialign kernels are using this infrastructure. Scatternd is not, since it does require a standard shape.

This also makes it easier to add new runtime-compiled kernels in the future.
parent 024b4abc
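For orientation, adding a new runtime-compiled kernel now amounts to defining one self-registering compiler class under jit/. The sketch below is illustrative only: the operator name my_kernel, its file name, and its device source are hypothetical, while compiler, names(), compile(), compile_op(), replace(), set_launch_params, compute_global_for, and compile_hip_code_object are the pieces introduced or reused by this commit.

// Hypothetical src/targets/gpu/jit/my_kernel.cpp -- a sketch, not part of this commit.
#include <migraphx/gpu/compiler.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/context.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

// Made-up device source; a real kernel body would live here.
static const char* const my_kernel_src = R"__migraphx__(
#include <args.hpp>
extern "C" {
__global__ void my_kernel(void* x, void* y) {}
}
)__migraphx__";

struct my_kernel_compiler : compiler<my_kernel_compiler>
{
    // Operator names handled by this compiler; lowering and compile_ops dispatch on them.
    std::vector<std::string> names() const { return {"my_kernel"}; }

    // Value-driven entry point, also reachable from the gpu-driver via compile_op().
    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
    {
        hip_compile_options options;
        // local defaults to 1024; "local"/"global" keys in v override the computed values.
        options.set_launch_params(v, compute_global_for(ctx, inputs.front().elements()));
        options.inputs      = inputs;
        options.output      = inputs.back();
        options.kernel_name = "my_kernel";
        return compile_hip_code_object(my_kernel_src, options);
    }

    // Instruction-driven entry point used by the parallel compile_ops pass.
    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
    {
        return replace(compile_op(ctx, to_shapes(ins->inputs()), op.to_value()));
    }
};

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

Because compiler<Derived> derives from auto_register_compiler, dropping such a file under jit/ (picked up by the new JIT_GPU_SRCS glob in the CMake change below) registers the operator for both lowering and the gpu-driver.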
@@ -43,7 +43,7 @@ struct roialign
     shape compute_shape(std::vector<shape> inputs) const
     {
-        check_shapes{inputs, *this}.has(3).standard();
+        check_shapes{inputs, *this}.has(3);
         auto x_lens   = inputs.at(0).lens();
         auto roi_lens = inputs.at(1).lens();
         auto bi_lens  = inputs.at(2).lens();
......
@@ -119,6 +119,7 @@ target_link_libraries(kernel_file_check compile_for_gpu)
 rocm_clang_tidy_check(kernel_file_check)
 
+file(GLOB JIT_GPU_SRCS ${CONFIGURE_DEPENDS} ${CMAKE_CURRENT_SOURCE_DIR}/jit/*.cpp)
 add_library(migraphx_gpu
     abs.cpp
     analyze_streams.cpp
@@ -131,9 +132,7 @@ add_library(migraphx_gpu
     compile_ops.cpp
     compile_hip.cpp
     compile_hip_code_object.cpp
-    compile_pointwise.cpp
-    compile_roialign.cpp
-    compile_scatternd.cpp
+    compiler.cpp
     concat.cpp
     convert.cpp
     convolution.cpp
@@ -171,6 +170,7 @@ add_library(migraphx_gpu
     target.cpp
     topk.cpp
     write_literals.cpp
+    ${JIT_GPU_SRCS}
 )
 set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
......
@@ -248,14 +248,6 @@ std::string enum_params(std::size_t count, std::string param)
     return join_strings(items, ",");
 }
 
-std::size_t compute_global(std::size_t n, std::size_t local)
-{
-    std::size_t groups = (n + local - 1) / local;
-    // max possible number of blocks is set to 1B (1,073,741,824)
-    std::size_t nglobal = std::min<std::size_t>(1073741824, groups) * local;
-    return nglobal;
-}
-
 #endif // MIGRAPHX_USE_HIPRTC
 
 } // namespace gpu
......
@@ -93,6 +93,32 @@ const std::vector<std::string>& compiler_warnings()
     return warnings;
 }
 
+void hip_compile_options::set_launch_params(
+    const value& v,
+    const std::function<std::size_t(std::size_t local)>& compute_global,
+    std::size_t default_local)
+{
+    local = v.get("local", default_local);
+    if(v.contains("global"))
+        global = v.at("global").to<std::size_t>();
+    else
+        global = compute_global(local);
+}
+
+std::function<std::size_t(std::size_t local)>
+compute_global_for(context& ctx, std::size_t n, std::size_t over)
+{
+    assert(over > 0);
+    std::size_t max_global = ctx.get_current_device().get_cu_count() *
+                             ctx.get_current_device().get_max_workitems_per_cu();
+    return [n, over, max_global](std::size_t local) {
+        std::size_t groups     = (n + local - 1) / local;
+        std::size_t max_blocks = max_global / local;
+        std::size_t nglobal    = std::min(max_blocks * over, groups) * local;
+        return nglobal;
+    };
+}
+
 operation compile_hip_code_object(const std::string& content, hip_compile_options options)
 {
     std::vector<src_file> srcs;
......
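The compute_global_for added above replaces the old fixed cap of 1,073,741,824 blocks with a device-derived ceiling. As a rough worked example (the 120 CUs, 2048 work-items per CU, and the element count are made-up numbers, not from this commit), the returned lambda sizes a launch like this:

// Standalone sketch of the arithmetic only; not MIGraphX code.
#include <algorithm>
#include <cstddef>
#include <iostream>

int main()
{
    std::size_t n          = 10'000'000; // elements to cover (made up)
    std::size_t local      = 1024;       // default workgroup size
    std::size_t over       = 4;          // oversubscription factor
    std::size_t max_global = 120 * 2048; // cu_count * max_workitems_per_cu = 245,760

    std::size_t groups     = (n + local - 1) / local;                     // 9,766 groups needed
    std::size_t max_blocks = max_global / local;                          // 240 blocks fill the device once
    std::size_t nglobal    = std::min(max_blocks * over, groups) * local; // capped at 960 blocks

    std::cout << nglobal << "\n"; // 983,040 work-items instead of ~10,000,000
    return 0;
}

The point of the change is that the cap now tracks the actual device rather than a hard-coded constant.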
@@ -6,7 +6,7 @@
 #include <migraphx/par_for.hpp>
 #include <migraphx/register_op.hpp>
 #include <migraphx/op/identity.hpp>
-#include <migraphx/gpu/compile_pointwise.hpp>
+#include <migraphx/gpu/compiler.hpp>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -40,35 +40,9 @@ struct precompile_op
 MIGRAPHX_REGISTER_OP(precompile_op);
 
-struct pointwise_compiler
-{
-    std::string name() const { return "pointwise"; }
-    operation apply(context& ctx, instruction_ref ins, const operation&) const
-    {
-        assert(not ins->module_inputs().empty());
-        auto* pm = ins->module_inputs().front();
-        return compile_pointwise(ctx, to_shapes(ins->inputs()), *pm);
-    }
-};
-
-using compiler_function = std::function<operation(context&, instruction_ref, operation)>;
-
-template <class T>
-compiler_function make_compiler_function(T x)
-{
-    return {[=](auto&&... xs) { return x.apply(xs...); }};
-}
-
-template <class... Ts>
-std::unordered_map<std::string, compiler_function> make_compilers(Ts... xs)
-{
-    return {{xs.name(), make_compiler_function(xs)}...};
-}
-
 struct compiled_result
 {
-    operation op;
+    compiler_replace replace;
     instruction_ref ins;
 };
@@ -82,7 +56,6 @@ void par_compile(std::size_t n, F f)
 void compile_ops::apply(module& m) const
 {
-    auto compilers = make_compilers(pointwise_compiler{});
     std::vector<std::function<compiled_result()>> compiles;
     for(auto ins : iterator_for(m))
@@ -90,15 +63,15 @@ void compile_ops::apply(module& m) const
         if(ins->name() != "gpu::precompile_op")
             continue;
         operation preop = any_cast<precompile_op>(ins->get_operator()).op;
-        assert(contains(compilers, preop.name()));
-        auto c = compilers[preop.name()];
-        compiles.emplace_back([=]() -> compiled_result { return {c(*ctx, ins, preop), ins}; });
+        compiles.emplace_back([=]() -> compiled_result {
+            return {compile(*ctx, ins, preop), ins};
+        });
     }
     std::vector<compiled_result> results(compiles.size());
     par_compile(compiles.size(), [&](auto i) { results[i] = compiles[i](); });
     for(const auto& cr : results)
     {
-        m.replace_instruction(cr.ins, cr.op, cr.ins->inputs());
+        cr.replace(m, cr.ins);
     }
 }
......
#include <migraphx/gpu/compile_pointwise.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/cpp_generator.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/eliminate_common_subexpression.hpp>
#include <migraphx/module.hpp>
#include <migraphx/pass_manager.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
static const char* const pointwise_kernel = R"__migraphx__(
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/pointwise.hpp>
#include <args.hpp>
namespace migraphx {
${preamble}
extern "C" {
__global__ void kernel(${params})
{
pointwise(${lambda}, ${args});
}
}
} // namespace migraphx
int main() {}
)__migraphx__";
operation compile_pointwise(context&,
const std::vector<shape>& inputs,
const std::string& lambda,
const std::string& preamble)
{
hip_compile_options options;
options.global = compute_global(inputs.front().elements());
options.local = 1024;
options.inputs = inputs;
options.output = inputs.back();
options.virtual_inputs = reduce_dims(inputs);
options.params = "-Wno-float-equal";
auto src = interpolate_string(pointwise_kernel,
{{"params", enum_params(inputs.size(), "void * private_p")},
{"args", enum_params(inputs.size(), "private_p")},
{"lambda", lambda},
{"preamble", preamble}});
return compile_hip_code_object(src, options);
}
operation compile_pointwise(context& ctx, const std::vector<shape>& inputs, module m)
{
run_passes(m, {eliminate_common_subexpression{}, dead_code_elimination{}});
cpp_generator g;
g.fmap([](const std::string& fname) { return "migraphx::" + fname; });
g.add_point_op("where", "${function:where}(${0}, ${1}, ${2})");
g.add_point_op("prelu", "${function:where}(${0} < 0, ${0} * ${1}, ${0})");
g.add_point_op("sign", "${function:where}(${0} > 0, 1, ${function:where}(${0} < 0, -1, 0))");
g.add_point_op("equal", "migraphx::abs(${0} == ${1})");
g.add_point_op("less", "migraphx::abs(${0} < ${1})");
g.add_point_op("greater", "migraphx::abs(${0} > ${1})");
g.add_point_op("not", "migraphx::abs(not ${0})");
// Add explicit conversions
g.fresult(
[](const shape& s) { return "migraphx::convert<" + shape::cpp_type(s.type()) + ">"; });
auto name =
g.create_function(g.generate_module(m).set_attributes({"__device__"}).set_generic_types(m));
return compile_pointwise((ctx), inputs, "MIGRAPHX_LIFT(" + name + ")", g.str());
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/compiler.hpp>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
auto& compiler_map()
{
static std::unordered_map<std::string, compiler_compile> m; // NOLINT
return m;
}
auto& compiler_op_map()
{
static std::unordered_map<std::string, compiler_compile_op> m; // NOLINT
return m;
}
void register_compiler(const std::string& name, compiler_compile c, compiler_compile_op cop)
{
compiler_map()[name] = std::move(c);
compiler_op_map()[name] = std::move(cop);
}
bool has_compiler_for(const std::string& name) { return compiler_map().count(name) > 0; }
compiler_replace compile(context& ctx, instruction_ref ins, const operation& op)
{
return compiler_map().at(op.name())(ctx, ins, op);
}
operation
compile_op(const std::string& name, context& ctx, const std::vector<shape>& inputs, const value& v)
{
return compiler_op_map().at(name)(ctx, inputs, v);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
+file(GLOB GPU_DRIVER_SRCS ${CONFIGURE_DEPENDS} ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
 add_executable(gpu-driver
-    action.cpp
-    compile_pointwise.cpp
-    main.cpp
-    parser.cpp
-    perf.cpp
-    run_op.cpp
+    ${GPU_DRIVER_SRCS}
 )
 target_include_directories(gpu-driver PRIVATE include)
 target_link_libraries(gpu-driver PRIVATE migraphx_gpu)
 #include <migraphx/gpu/driver/action.hpp>
 #include <migraphx/gpu/driver/perf.hpp>
-#include <migraphx/gpu/compile_pointwise.hpp>
+#include <migraphx/gpu/compiler.hpp>
 #include <migraphx/gpu/context.hpp>
 
 namespace migraphx {
@@ -8,13 +8,13 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace driver {
 
-struct compile_pointwise : action<compile_pointwise>
+struct compile_op : action<compile_op>
 {
     static void apply(const parser& p, const value& v)
     {
         context ctx;
         auto inputs = p.parse_shapes(v.at("inputs"));
-        auto op     = gpu::compile_pointwise(ctx, inputs, v.at("lambda").to<std::string>());
+        auto op     = gpu::compile_op(v.at("name").to<std::string>(), ctx, inputs, v);
         double t    = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
         std::cout << op << ": " << t << "ms" << std::endl;
     }
......
@@ -17,8 +17,6 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
 std::string enum_params(std::size_t count, std::string param);
 
-std::size_t compute_global(std::size_t n, std::size_t local = 1024);
-
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
......
@@ -8,6 +8,8 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 
+struct context;
+
 struct hip_compile_options
 {
     std::size_t global;
@@ -17,8 +19,24 @@ struct hip_compile_options
     std::string kernel_name = "kernel";
     std::string params      = "";
     std::vector<shape> virtual_inputs = {};
+
+    /**
+     * @brief Set the launch parameters, allowing v to override the values
+     *
+     * @param v A value which may contain "global" and/or "local" keys to override the default
+     * global and local sizes
+     * @param compute_global A function used to compute the global based on the local
+     * @param default_local The default local to use if it is missing from the v parameter
+     */
+    void set_launch_params(const value& v,
+                           const std::function<std::size_t(std::size_t local)>& compute_global,
+                           std::size_t default_local = 1024);
 };
 
+/// Compute the global for n elements, but cap it at a target-specific upper limit
+std::function<std::size_t(std::size_t local)>
+compute_global_for(context& ctx, std::size_t n, std::size_t over = 1);
+
 operation compile_hip_code_object(const std::string& content, hip_compile_options options);
 
 } // namespace gpu
......
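To make the override behaviour described above concrete, here is a minimal sketch (not part of the commit) of a helper that would be called from GPU target code with a live context; the helper name make_options and the key "some_attr" are made up, while "local" and "global" are the real override keys:

// Sketch only, assuming migraphx_gpu headers are available and ctx comes from the GPU target.
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/shape.hpp>
#include <migraphx/value.hpp>
#include <vector>

migraphx::gpu::hip_compile_options make_options(migraphx::gpu::context& ctx,
                                                const std::vector<migraphx::shape>& inputs,
                                                const migraphx::value& v)
{
    migraphx::gpu::hip_compile_options options;
    auto sizer = migraphx::gpu::compute_global_for(ctx, inputs.front().elements());

    // 1. v = {{"some_attr", 3}}: neither key present, so local = 1024 and global = sizer(1024).
    // 2. v = {{"local", 256}}: local = 256 and global is recomputed as sizer(256).
    // 3. v = {{"local", 256}, {"global", 65536}}: both are taken verbatim; sizer is not called.
    options.set_launch_params(v, sizer);

    options.inputs = inputs;
    options.output = inputs.back();
    return options;
}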
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_POINTWISE_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_POINTWISE_HPP
#include <migraphx/config.hpp>
#include <migraphx/operation.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct module;
namespace gpu {
struct context;
operation compile_pointwise(context& ctx,
const std::vector<shape>& inputs,
const std::string& lambda,
const std::string& preamble = "");
operation compile_pointwise(context& ctx, const std::vector<shape>& inputs, module m);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_POINTWISE_HPP
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP
#include <migraphx/config.hpp>
#include <migraphx/operation.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
operation compile_roialign(context& ctx, const std::vector<shape>& io_shapes, const value& val);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_SCATTERND_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_SCATTERND_HPP
#include <migraphx/config.hpp>
#include <migraphx/operation.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
operation
compile_scatternd(context& ctx, const std::vector<shape>& io_shapes, const std::string& reduction);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_SCATTERND_HPP
#ifndef MIGRAPHX_GUARD_GPU_COMPILER_HPP
#define MIGRAPHX_GUARD_GPU_COMPILER_HPP
#include <migraphx/config.hpp>
#include <migraphx/auto_register.hpp>
#include <migraphx/operation.hpp>
#include <migraphx/value.hpp>
#include <migraphx/module.hpp>
#include <migraphx/instruction.hpp>
#include <functional>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
using compiler_replace = std::function<void(module& m, instruction_ref ins)>;
using compiler_compile = std::function<compiler_replace(context&, instruction_ref, operation)>;
using compiler_compile_op =
std::function<operation(context&, const std::vector<shape>& inputs, const value&)>;
void register_compiler(const std::string& name, compiler_compile c, compiler_compile_op cop);
bool has_compiler_for(const std::string& name);
compiler_replace compile(context& ctx, instruction_ref ins, const operation& op);
operation
compile_op(const std::string& name, context& ctx, const std::vector<shape>& inputs, const value& v);
template <class T>
void register_compiler()
{
T c;
for(auto&& name : c.names())
{
register_compiler(
name,
[=](auto&&... xs) { return c.compile(std::forward<decltype(xs)>(xs)...); },
[=](auto&&... xs) { return c.compile_op(std::forward<decltype(xs)>(xs)...); });
}
}
struct register_compiler_action
{
template <class T>
static void apply()
{
register_compiler<T>();
}
};
template <class T>
using auto_register_compiler = auto_register<register_compiler_action, T>;
template <class Derived>
struct compiler : auto_register_compiler<Derived>
{
auto replace(const operation& op) const
{
return
[=](module& m, instruction_ref ins) { m.replace_instruction(ins, op, ins->inputs()); };
}
operation compile_op(context&, const std::vector<shape>&, const value&) const { return {}; }
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILER_HPP
@@ -154,6 +154,13 @@ struct hip_device
     std::size_t get_cu_count() const { return device_props.multiProcessorCount; }
 
+    std::size_t get_max_workitems_per_cu() const
+    {
+        return device_props.maxThreadsPerMultiProcessor;
+    }
+
+    std::size_t get_max_workitems_per_block() const { return device_props.maxThreadsPerBlock; }
+
     private:
     std::size_t device_id      = 0;
     std::size_t current_stream = 0;
......
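The two new accessors simply surface values that HIP already reports through hipGetDeviceProperties; a minimal standalone query (not MIGraphX code) would look like this:

// Standalone HIP sketch: prints the fields exposed by the new hip_device accessors.
#include <hip/hip_runtime.h>
#include <cstdio>

int main()
{
    hipDeviceProp_t props{};
    if(hipGetDeviceProperties(&props, 0) != hipSuccess)
        return 1;
    std::printf("CUs: %d, max work-items/CU: %d, max work-items/block: %d\n",
                props.multiProcessorCount,
                props.maxThreadsPerMultiProcessor,
                props.maxThreadsPerBlock);
    return 0;
}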
#include <migraphx/gpu/compiler.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/cpp_generator.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/eliminate_common_subexpression.hpp>
#include <migraphx/module.hpp>
#include <migraphx/pass_manager.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
static const char* const pointwise_kernel = R"__migraphx__(
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/pointwise.hpp>
#include <args.hpp>
namespace migraphx {
${preamble}
extern "C" {
__global__ void kernel(${params})
{
pointwise(${lambda}, ${args});
}
}
} // namespace migraphx
)__migraphx__";
struct pointwise_compiler : compiler<pointwise_compiler>
{
std::vector<std::string> names() const { return {"pointwise"}; }
static std::size_t oversubscribe(const std::vector<shape>& inputs)
{
if(std::any_of(inputs.begin(), inputs.end(), [](const auto& s) { return s.broadcasted(); }))
return 1;
else
return 4;
}
static std::size_t vectorize_elements(const std::vector<shape>& inputs)
{
std::size_t n = inputs.front().elements();
if(std::all_of(inputs.begin(), inputs.end(), [](const auto& s) {
return s.packed() or s.broadcasted();
}))
{
if((n % 4) == 0)
return n / 4;
else if((n % 2) == 0)
return n / 2;
}
return n;
}
operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
{
hip_compile_options options;
options.set_launch_params(
v, compute_global_for(ctx, vectorize_elements(inputs), oversubscribe(inputs)));
options.inputs = inputs;
options.output = inputs.back();
options.virtual_inputs = reduce_dims(inputs);
options.params = "-Wno-float-equal";
auto src = interpolate_string(pointwise_kernel,
{{"params", enum_params(inputs.size(), "void * private_p")},
{"args", enum_params(inputs.size(), "private_p")},
{"lambda", v.at("lambda").to<std::string>()},
{"preamble", v.get("preamble", std::string{})}});
return compile_hip_code_object(src, options);
}
compiler_replace compile(context& ctx, instruction_ref ins, const operation&) const
{
assert(not ins->module_inputs().empty());
auto* pm = ins->module_inputs().front();
run_passes(*pm, {eliminate_common_subexpression{}, dead_code_elimination{}});
cpp_generator g;
g.fmap([](const std::string& fname) { return "migraphx::" + fname; });
g.add_point_op("where", "${function:where}(${0}, ${1}, ${2})");
g.add_point_op("prelu", "${function:where}(${0} < 0, ${0} * ${1}, ${0})");
g.add_point_op("sign",
"${function:where}(${0} > 0, 1, ${function:where}(${0} < 0, -1, 0))");
g.add_point_op("equal", "migraphx::abs(${0} == ${1})");
g.add_point_op("less", "migraphx::abs(${0} < ${1})");
g.add_point_op("greater", "migraphx::abs(${0} > ${1})");
g.add_point_op("not", "migraphx::abs(not ${0})");
// Add explicit conversions
g.fresult(
[](const shape& s) { return "migraphx::convert<" + shape::cpp_type(s.type()) + ">"; });
auto name = g.create_function(
g.generate_module(*pm).set_attributes({"__device__"}).set_generic_types(*pm));
std::string lambda = "MIGRAPHX_LIFT(" + name + ")";
return replace(
compile_op(ctx, to_shapes(ins->inputs()), {{"lambda", lambda}, {"preamble", g.str()}}));
}
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
-#include <migraphx/gpu/compile_roialign.hpp>
+#include <migraphx/gpu/compiler.hpp>
 #include <migraphx/gpu/compile_hip_code_object.hpp>
-#include <migraphx/gpu/compile_hip.hpp>
 #include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/compile_hip.hpp>
-#include <migraphx/cpp_generator.hpp>
 #include <migraphx/ranges.hpp>
 #include <migraphx/reduce_dims.hpp>
 #include <migraphx/stringutils.hpp>
-#include <migraphx/dead_code_elimination.hpp>
-#include <migraphx/eliminate_common_subexpression.hpp>
-#include <migraphx/module.hpp>
-#include <migraphx/pass_manager.hpp>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -37,46 +43,46 @@ __global__ void roialign_kernel(void* in_x, void* in_rois, void* in_ind, void* y)
 
 } // namespace migraphx
 
-int main() {}
-
 )__migraphx__";
 
-operation compile_roialign(context&, const std::vector<shape>& io_shapes, const value& val)
+struct roialign_compiler : compiler<roialign_compiler>
 {
-    hip_compile_options options;
-    auto out_s = io_shapes.back();
-    options.local          = 128;
-    options.global         = compute_global(out_s.elements(), options.local);
-    options.inputs         = io_shapes;
-    options.output         = out_s;
-    options.kernel_name    = "roialign_kernel";
-    options.virtual_inputs = io_shapes;
+    std::vector<std::string> names() const { return {"roialign"}; }
+
+    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
+    {
+        hip_compile_options options;
+        options.set_launch_params(v, compute_global_for(ctx, inputs.back().elements()), 128);
+        options.output      = inputs.back();
+        options.inputs      = inputs;
+        options.kernel_name = "roialign_kernel";
 
-    // sampling_ratio
-    assert(val.contains("sampling_ratio"));
-    auto sampling_ratio = val.at("sampling_ratio").to<int64_t>();
-    options.params += " -DSAMPLING_RATIO=" + std::to_string(sampling_ratio);
+        // sampling_ratio
+        options.params += " -DSAMPLING_RATIO=" + v.at("sampling_ratio").to<std::string>();
 
-    // pooling_mode
-    assert(val.contains("mode"));
-    auto mode           = val.at("mode").to<migraphx::op::pooling_mode>();
-    bool is_avg_pooling = (mode == migraphx::op::pooling_mode::average);
-    options.params += " -DIS_AVG_POOLING=" + std::to_string(static_cast<int>(is_avg_pooling));
+        // pooling_mode
+        auto mode = v.at("mode").to<migraphx::op::pooling_mode>();
+        std::string is_avg_pooling =
+            (mode == migraphx::op::pooling_mode::average) ? "true" : "false";
+        options.params += " -DIS_AVG_POOLING=" + is_avg_pooling;
 
-    // coord_trans_mode
-    assert(val.contains("coordinate_transformation_mode"));
-    auto ctm          = val.at("coordinate_transformation_mode").to<std::string>();
-    float rois_offset = (ctm == "output_half_pixel") ? -0.5f : 0.0f;
-    options.params += " -DROIS_OFFSET=" + std::to_string(rois_offset);
+        // coord_trans_mode
+        auto ctm          = v.at("coordinate_transformation_mode").to<std::string>();
+        float rois_offset = (ctm == "output_half_pixel") ? -0.5f : 0.0f;
+        options.params += " -DROIS_OFFSET=" + std::to_string(rois_offset);
 
-    // spatial_scale
-    assert(val.contains("spatial_scale"));
-    float spatial_scale = val.at("spatial_scale").to<float>();
-    options.params += " -DSPATIAL_SCALE=" + std::to_string(spatial_scale);
+        // spatial_scale
+        options.params += " -DSPATIAL_SCALE=" + v.at("spatial_scale").to<std::string>();
 
-    return compile_hip_code_object(roialign_kernel, options);
-}
+        return compile_hip_code_object(roialign_kernel, options);
+    }
+
+    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
+    {
+        return replace(compile_op(ctx, to_shapes(ins->inputs()), op.to_value()));
+    }
+};
 
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
-#include <migraphx/gpu/compile_scatternd.hpp>
+#include <migraphx/gpu/compiler.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/compile_hip_code_object.hpp>
 #include <migraphx/gpu/compile_hip.hpp>
-#include <migraphx/gpu/context.hpp>
 #include <migraphx/ranges.hpp>
 #include <migraphx/reduce_dims.hpp>
 #include <migraphx/stringutils.hpp>
-#include <migraphx/dead_code_elimination.hpp>
-#include <migraphx/eliminate_common_subexpression.hpp>
-#include <migraphx/module.hpp>
-#include <migraphx/pass_manager.hpp>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -25,7 +31,7 @@ extern "C" {
 __global__ void scatternd_kernel(void* in_indices, void* in_updates, void* output)
 {
     make_tensors()(in_indices, in_updates, output)([](auto&&... xs) {
-        scatternd(xs..., REDUCTION);
+        scatternd(xs..., ${reduction}{});
     });
 }
@@ -33,28 +39,50 @@ __global__ void scatternd_kernel(void* in_indices, void* in_updates, void* output)
 
 } // namespace migraphx
 
-int main() {}
-
 )__migraphx__";
 
-operation
-compile_scatternd(context&, const std::vector<shape>& io_shapes, const std::string& reduction)
-{
-    hip_compile_options options;
-    auto out_s = io_shapes.back();
-    options.local          = 1024;
-    options.global         = compute_global(io_shapes.at(1).elements(), options.local);
-    options.inputs         = io_shapes;
-    options.output         = out_s;
-    options.kernel_name    = "scatternd_kernel";
-    options.virtual_inputs = io_shapes;
-    options.params += " -DREDUCTION=assign_" + reduction + "{}";
-    return compile_hip_code_object(scatternd_kernel, options);
-}
+struct scatternd_compiler : compiler<scatternd_compiler>
+{
+    std::vector<std::string> names() const
+    {
+        return {"scatternd_none", "scatternd_add", "scatternd_mul"};
+    }
+
+    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
+    {
+        hip_compile_options options;
+        options.set_launch_params(v, compute_global_for(ctx, inputs.at(1).elements()));
+        auto out_s             = inputs.back();
+        options.inputs         = inputs;
+        options.output         = out_s;
+        options.kernel_name    = "scatternd_kernel";
+        options.virtual_inputs = inputs;
+        auto reduction         = "assign_" + v.get("reduction", std::string{"none"});
+        auto src               = interpolate_string(scatternd_kernel, {{"reduction", reduction}});
+        return compile_hip_code_object(src, options);
+    }
+
+    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
+    {
+        assert(starts_with(op.name(), "scatternd_"));
+        auto reduction = op.name().substr(10);
+        return insert(compile_op(ctx,
+                                 to_shapes({ins->inputs().begin() + 1, ins->inputs().end()}),
+                                 {{"reduction", reduction}}));
+    }
+
+    compiler_replace insert(const operation& op) const
+    {
+        return [=](module& m, instruction_ref ins) {
+            auto args = ins->inputs();
+            args.back() =
+                m.insert_instruction(ins, make_op("hip::copy"), args.front(), args.back());
+            args.erase(args.begin());
+            return m.replace_instruction(ins, op, args);
+        };
+    }
+};
 
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
@@ -20,8 +20,6 @@
 #include <migraphx/gpu/abs.hpp>
 #include <migraphx/gpu/batch_norm_inference.hpp>
-#include <migraphx/gpu/compile_roialign.hpp>
-#include <migraphx/gpu/compile_scatternd.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/convolution.hpp>
 #include <migraphx/gpu/deconvolution.hpp>
@@ -42,6 +40,7 @@
 #include <migraphx/gpu/rocblas.hpp>
 #include <migraphx/gpu/unary_not.hpp>
 #include <migraphx/gpu/where.hpp>
+#include <migraphx/gpu/compiler.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/program.hpp>
 #include <utility>
@@ -195,8 +194,6 @@ struct miopen_apply
         add_extend_op("softmax");
         add_extend_op("topk");
 
-        add_precompile_op("pointwise");
-
         add_batch_norm_inference_op();
         add_convolution_op();
         add_deconvolution_op();
@@ -207,8 +204,6 @@ struct miopen_apply
         add_neg_op();
         add_nms_op();
        add_quant_convolution_op();
-        add_roialign();
-        add_scatternd();
     }
 
     void copy_params()
@@ -262,11 +257,28 @@ struct miopen_apply
             {
                check_shape(s, apply_map.at(it->name())(it));
             }
+            else if(has_compiler_for(it->name()))
+            {
+                check_shape(s, insert_precompile_op(it));
+            }
         }
         copy_params();
     }
 
+    instruction_ref insert_precompile_op(instruction_ref ins)
+    {
+        auto output                       = insert_allocation(ins, ins->get_shape());
+        std::vector<instruction_ref> refs = ins->inputs();
+        refs.push_back(output);
+
+        return mod->replace_instruction(
+            ins,
+            make_op("gpu::precompile_op", {{"op", to_value(ins->get_operator())}}),
+            refs,
+            ins->module_inputs());
+    }
+
     instruction_ref insert_allocation(instruction_ref ins, const shape& s, std::string tag = "")
     {
         // Instruction's output is an input of the ret instruction
@@ -396,21 +408,6 @@ struct miopen_apply
         });
     }
 
-    void add_precompile_op(const std::string& name)
-    {
-        apply_map.emplace(name, [=](instruction_ref ins) {
-            auto output                       = insert_allocation(ins, ins->get_shape());
-            std::vector<instruction_ref> refs = ins->inputs();
-            refs.push_back(output);
-
-            return mod->replace_instruction(
-                ins,
-                make_op("gpu::precompile_op", {{"op", to_value(ins->get_operator())}}),
-                refs,
-                ins->module_inputs());
-        });
-    }
-
     void add_batch_norm_inference_op()
     {
         apply_map.emplace("batch_norm_inference", [=](instruction_ref ins) {
@@ -501,75 +498,6 @@ struct miopen_apply
         });
     }
 
-    void add_roialign()
-    {
-        apply_map.emplace("roialign", [=](instruction_ref ins) {
-            auto s      = ins->get_shape();
-            auto op_val = ins->get_operator().to_value();
-            auto output = insert_allocation(ins, s);
-            auto args   = ins->inputs();
-            args.push_back(output);
-            auto io_shapes = to_shapes(args);
-            auto co        = compile_roialign(get_context(), io_shapes, op_val);
-            return mod->replace_instruction(ins, co, args);
-        });
-    }
-
-    void add_scatternd()
-    {
-        apply_map.emplace("scatternd_none", [=](instruction_ref ins) {
-            auto s      = ins->get_shape();
-            auto op_val = ins->get_operator().to_value();
-            auto output = insert_allocation(ins, s);
-            auto args   = ins->inputs();
-            args.push_back(output);
-            auto io_shapes = to_shapes(args);
-            io_shapes.erase(io_shapes.begin());
-            const std::string reduction = "none";
-            auto co   = compile_scatternd(get_context(), io_shapes, reduction);
-            auto copy = mod->insert_instruction(ins, make_op("hip::copy"), args.front(), output);
-            args.back() = copy;
-            args.erase(args.begin());
-            return mod->replace_instruction(ins, co, args);
-        });
-
-        apply_map.emplace("scatternd_add", [=](instruction_ref ins) {
-            auto s      = ins->get_shape();
-            auto op_val = ins->get_operator().to_value();
-            auto output = insert_allocation(ins, s);
-            auto args   = ins->inputs();
-            args.push_back(output);
-            auto io_shapes = to_shapes(args);
-            io_shapes.erase(io_shapes.begin());
-            const std::string reduction = "add";
-            auto co   = compile_scatternd(get_context(), io_shapes, reduction);
-            auto copy = mod->insert_instruction(ins, make_op("hip::copy"), args.front(), output);
-            args.back() = copy;
-            args.erase(args.begin());
-            return mod->replace_instruction(ins, co, args);
-        });
-
-        apply_map.emplace("scatternd_mul", [=](instruction_ref ins) {
-            auto s      = ins->get_shape();
-            auto op_val = ins->get_operator().to_value();
-            auto output = insert_allocation(ins, s);
-            auto args   = ins->inputs();
-            args.push_back(output);
-            auto io_shapes = to_shapes(args);
-            io_shapes.erase(io_shapes.begin());
-            const std::string reduction = "mul";
-            auto co   = compile_scatternd(get_context(), io_shapes, reduction);
-            auto copy = mod->insert_instruction(ins, make_op("hip::copy"), args.front(), output);
-            args.back() = copy;
-            args.erase(args.begin());
-            return mod->replace_instruction(ins, co, args);
-        });
-    }
-
     // replace the loop operator with gpu_loop operator
     void add_loop_op()
     {
......