Commit 3c95b34d authored by Shucai Xiao

Merge branch 'layernorm_half2' into branch_for_ort2

parents af110526 789f86fb
......@@ -58,17 +58,20 @@ void module_pass_manager_apply(const T& x, module_pass_manager& mpm)
} // namespace detail
/*
* Type-erased interface for:
*
* struct pass
* {
* std::string name() const;
* void apply(module_pass_manager & mpm) const;
* void apply(program & p) const;
* };
*
*/
#ifdef TYPE_ERASED_DECLARATION
// Type-erased interface for:
struct pass
{
//
std::string name() const;
// (optional)
void apply(module_pass_manager& mpm) const;
// (optional)
void apply(program& p) const;
};
#else
struct pass
{
......@@ -303,6 +306,7 @@ inline const ValueType& any_cast(const pass& x)
throw std::bad_cast();
return *y;
}
#endif
#endif
......
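The regenerated comment block above documents the type-erased `pass` interface: any struct providing `name()` plus at least one of the `apply` overloads can be stored in a `pass`. A minimal sketch, assuming illustrative struct names and include paths that are not part of this commit:

#include <migraphx/pass.hpp>
#include <migraphx/pass_manager.hpp> // module_pass_manager (header path assumed)

struct trim_dead_code_pass // illustrative name
{
    std::string name() const { return "trim_dead_code"; }
    // apply(program&) is marked "(optional)" above, so only this overload is provided
    void apply(migraphx::module_pass_manager& mpm) const
    {
        // mutate the module owned by mpm here
    }
};

migraphx::pass p = trim_dead_code_pass{}; // concrete type erased behind the pass interface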
......@@ -26,30 +26,35 @@ struct schedule_model
/// Get the number of concurrent instructions allowed
std::size_t concurrency() const;
/// Schedule a concurrent instruction
void sched(module& p, instruction_ref ins, std::size_t n) const;
void sched(module& m, instruction_ref ins, std::size_t n) const;
// Insert necessary waits before an instruction
void wait(module& p, instruction_ref ins, std::size_t wait_id) const;
void wait(module& m, instruction_ref ins, std::size_t wait_id) const;
// Insert necessary records after an instruction
void record(module& p, instruction_ref ins, std::size_t wait_id) const;
void record(module& m, instruction_ref ins, std::size_t wait_id) const;
/// Compute weights for an operation
std::size_t weight(const operation& op) const;
};
#else
/*
* Type-erased interface for:
*
* struct schedule_model
* {
* std::size_t concurrency() const;
* void sched(module& p,instruction_ref ins,std::size_t n) const;
* void wait(module& p,instruction_ref ins,std::size_t wait_id) const;
* void record(module& p,instruction_ref ins,std::size_t wait_id) const;
* std::size_t weight(const operation& op) const;
* };
*
*/
#ifdef TYPE_ERASED_DECLARATION
// Type-erased interface for:
struct schedule_model
{
//
std::size_t concurrency() const;
//
void sched(module& m, instruction_ref ins, std::size_t n) const;
//
void wait(module& m, instruction_ref ins, std::size_t wait_id) const;
//
void record(module& m, instruction_ref ins, std::size_t wait_id) const;
//
std::size_t weight(const operation& op) const;
};
#else
struct schedule_model
{
......@@ -120,22 +125,22 @@ struct schedule_model
return (*this).private_detail_te_get_handle().concurrency();
}
void sched(module& p, instruction_ref ins, std::size_t n) const
void sched(module& m, instruction_ref ins, std::size_t n) const
{
assert((*this).private_detail_te_handle_mem_var);
(*this).private_detail_te_get_handle().sched(p, ins, n);
(*this).private_detail_te_get_handle().sched(m, ins, n);
}
void wait(module& p, instruction_ref ins, std::size_t wait_id) const
void wait(module& m, instruction_ref ins, std::size_t wait_id) const
{
assert((*this).private_detail_te_handle_mem_var);
(*this).private_detail_te_get_handle().wait(p, ins, wait_id);
(*this).private_detail_te_get_handle().wait(m, ins, wait_id);
}
void record(module& p, instruction_ref ins, std::size_t wait_id) const
void record(module& m, instruction_ref ins, std::size_t wait_id) const
{
assert((*this).private_detail_te_handle_mem_var);
(*this).private_detail_te_get_handle().record(p, ins, wait_id);
(*this).private_detail_te_get_handle().record(m, ins, wait_id);
}
std::size_t weight(const operation& op) const
......@@ -159,9 +164,9 @@ struct schedule_model
virtual const std::type_info& type() const = 0;
virtual std::size_t concurrency() const = 0;
virtual void sched(module& p, instruction_ref ins, std::size_t n) const = 0;
virtual void wait(module& p, instruction_ref ins, std::size_t wait_id) const = 0;
virtual void record(module& p, instruction_ref ins, std::size_t wait_id) const = 0;
virtual void sched(module& m, instruction_ref ins, std::size_t n) const = 0;
virtual void wait(module& m, instruction_ref ins, std::size_t wait_id) const = 0;
virtual void record(module& m, instruction_ref ins, std::size_t wait_id) const = 0;
virtual std::size_t weight(const operation& op) const = 0;
};
......@@ -195,22 +200,22 @@ struct schedule_model
std::size_t concurrency() const override { return private_detail_te_value.concurrency(); }
void sched(module& p, instruction_ref ins, std::size_t n) const override
void sched(module& m, instruction_ref ins, std::size_t n) const override
{
private_detail_te_value.sched(p, ins, n);
private_detail_te_value.sched(m, ins, n);
}
void wait(module& p, instruction_ref ins, std::size_t wait_id) const override
void wait(module& m, instruction_ref ins, std::size_t wait_id) const override
{
private_detail_te_value.wait(p, ins, wait_id);
private_detail_te_value.wait(m, ins, wait_id);
}
void record(module& p, instruction_ref ins, std::size_t wait_id) const override
void record(module& m, instruction_ref ins, std::size_t wait_id) const override
{
private_detail_te_value.record(p, ins, wait_id);
private_detail_te_value.record(m, ins, wait_id);
}
std::size_t weight(const operation& op) const override
......@@ -283,6 +288,7 @@ inline const ValueType& any_cast(const schedule_model& x)
throw std::bad_cast();
return *y;
}
#endif
#endif
......
......@@ -36,20 +36,26 @@ struct stream_model
#else
/*
* Type-erased interface for:
*
* struct stream_model
* {
* std::size_t get_nstream() const;
* std::size_t get_stream(instruction_ref ins) const;
* std::size_t get_event_id(instruction_ref ins) const;
* bool has_stream(instruction_ref ins) const;
* bool is_record(instruction_ref ins) const;
* bool is_wait(instruction_ref ins) const;
* };
*
*/
#ifdef TYPE_ERASED_DECLARATION
// Type-erased interface for:
struct stream_model
{
//
std::size_t get_nstream() const;
//
std::size_t get_stream(instruction_ref ins) const;
//
std::size_t get_event_id(instruction_ref ins) const;
//
bool has_stream(instruction_ref ins) const;
//
bool is_record(instruction_ref ins) const;
//
bool is_wait(instruction_ref ins) const;
};
#else
struct stream_model
{
......@@ -296,6 +302,7 @@ inline const ValueType& any_cast(const stream_model& x)
throw std::bad_cast();
return *y;
}
#endif
#endif
......
......@@ -82,20 +82,26 @@ argument copy_from_target(T&, const argument& arg)
return arg;
}
/*
* Type-erased interface for:
*
* struct target
* {
* std::string name() const;
* std::vector<pass> get_passes(context& ctx,const compile_options& options) const;
* context get_context() const;
* argument copy_to(const argument& input) const;
* argument copy_from(const argument& input) const;
* argument allocate(const shape& s) const;
* };
*
*/
#ifdef TYPE_ERASED_DECLARATION
// Type-erased interface for:
struct target
{
//
std::string name() const;
//
std::vector<pass> get_passes(context& ctx, const compile_options& options) const;
//
context get_context() const;
// (optional)
argument copy_to(const argument& input) const;
// (optional)
argument copy_from(const argument& input) const;
// (optional)
argument allocate(const shape& s) const;
};
#else
struct target
{
......@@ -382,6 +388,7 @@ inline const ValueType& any_cast(const target& x)
throw std::bad_cast();
return *y;
}
#endif
#endif
......
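The target interface above marks copy_to, copy_from, and allocate as "(optional)"; the copy_from_target helper at the top of this hunk is a pass-through that presumably serves as the fallback when the optional members are absent. A hedged sketch of a do-nothing target supplying only the required members (all names illustrative, not from this commit):

struct null_target // illustrative
{
    std::string name() const { return "null"; }
    std::vector<migraphx::pass>
    get_passes(migraphx::context&, const migraphx::compile_options&) const
    {
        return {};
    }
    migraphx::context get_context() const { return {}; } // default-constructed, type-erased context
};

// a real backend would then register it:
// migraphx::register_target(null_target{});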
......@@ -178,6 +178,7 @@ struct value
value(std::nullptr_t);
value(const char* i);
value(const std::string& pkey, const char* i);
#define MIGRAPHX_VALUE_GENERATE_DECL_METHODS(vt, cpp_type) \
value(cpp_type i); \
......@@ -188,6 +189,12 @@ struct value
const cpp_type* if_##vt() const;
MIGRAPHX_VISIT_VALUE_TYPES(MIGRAPHX_VALUE_GENERATE_DECL_METHODS)
template <class T>
using literal_to_string = std::conditional_t<(std::is_convertible<T, const char*>{} and
std::is_convertible<T, std::string>{}),
std::string,
T>;
template <class T>
using pick_numeric = std::conditional_t<
std::is_floating_point<T>{},
......@@ -246,6 +253,7 @@ struct value
return *this = from_values(rhs); // NOLINT
}
value& operator=(const char* c);
value& operator=(std::nullptr_t);
value& operator=(const std::initializer_list<value>& i);
......@@ -370,11 +378,11 @@ struct value
}
template <class To>
To value_or(const To& default_value) const
literal_to_string<To> value_or(const To& default_value) const
{
if(this->is_null())
return default_value;
return to<To>();
return to<literal_to_string<To>>();
}
template <class To>
......@@ -390,12 +398,12 @@ struct value
}
template <class To>
To get(const std::string& pkey, const To& default_value) const
literal_to_string<To> get(const std::string& pkey, const To& default_value) const
{
const auto* v = find(pkey);
if(v == this->end())
return default_value;
return v->to<To>();
return v->to<literal_to_string<To>>();
}
template <class To>
......@@ -408,10 +416,11 @@ struct value
}
template <class To>
std::vector<To> get(const std::string& pkey,
const std::initializer_list<To>& default_value) const
std::vector<literal_to_string<To>> get(const std::string& pkey,
const std::initializer_list<To>& default_value) const
{
return get<std::vector<To>>(pkey, default_value);
return get(pkey,
std::vector<literal_to_string<To>>{default_value.begin(), default_value.end()});
}
friend bool operator==(const value& x, const value& y);
......
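literal_to_string is what makes the pybind11 hunk below legal: when the default argument is a string literal, the deduced result type of value_or/get is promoted from const char* to std::string. A small hedged illustration (the keys and values are chosen for the example only):

// value(const std::string& pkey, const char* i), declared above, builds a keyed entry
migraphx::value v = {{"type", "half"}};

std::string t = v.get("type", "float");     // -> "half"
std::string d = v.get("missing", "float");  // -> "float"; the result is deduced as
                                            //    std::string, not const char*, via literal_to_string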
......@@ -213,7 +213,7 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
py::class_<migraphx::shape>(m, "shape")
.def(py::init([](py::kwargs kwargs) {
auto v = migraphx::to_value(kwargs);
auto t = migraphx::shape::parse_type(v.get("type", std::string{"float"}));
auto t = migraphx::shape::parse_type(v.get("type", "float"));
auto lens = v.get<std::size_t>("lens", {1});
if(v.contains("strides"))
return migraphx::shape(t, lens, v.at("strides").to_vector<std::size_t>());
......
#include <migraphx/register_target.hpp>
#include <unordered_map>
#include <migraphx/register_target.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -11,7 +11,17 @@ std::unordered_map<std::string, target>& target_map()
}
void register_target(const target& t) { target_map()[t.name()] = t; }
target make_target(const std::string& name) { return target_map().at(name); }
target make_target(const std::string& name)
{
const auto it = target_map().find(name);
if(it == target_map().end())
{
MIGRAPHX_THROW("Requested target '" + name + "' is not enabled or not supported");
}
return it->second;
}
std::vector<std::string> get_targets()
{
std::vector<std::string> result;
......
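make_target now reports a missing backend explicitly instead of surfacing the bare std::out_of_range from unordered_map::at. A hedged usage sketch (the target name is illustrative, and the exception type is an assumption about what MIGRAPHX_THROW produces):

#include <migraphx/register_target.hpp>
#include <iostream>

int main()
{
    try
    {
        auto t = migraphx::make_target("nonexistent"); // illustrative, unregistered name
    }
    catch(const migraphx::exception& e) // assumed: MIGRAPHX_THROW throws migraphx::exception
    {
        // "Requested target 'nonexistent' is not enabled or not supported"
        std::cerr << e.what() << std::endl;
    }
}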
......@@ -119,6 +119,7 @@ target_link_libraries(kernel_file_check compile_for_gpu)
rocm_clang_tidy_check(kernel_file_check)
file(GLOB JIT_GPU_SRCS ${CONFIGURE_DEPENDS} ${CMAKE_CURRENT_SOURCE_DIR}/jit/*.cpp)
add_library(migraphx_gpu
abs.cpp
analyze_streams.cpp
......@@ -131,9 +132,7 @@ add_library(migraphx_gpu
compile_ops.cpp
compile_hip.cpp
compile_hip_code_object.cpp
compile_pointwise.cpp
compile_roialign.cpp
compile_scatternd.cpp
compiler.cpp
concat.cpp
convert.cpp
convolution.cpp
......@@ -171,6 +170,7 @@ add_library(migraphx_gpu
target.cpp
topk.cpp
write_literals.cpp
${JIT_GPU_SRCS}
)
set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
......@@ -331,6 +331,12 @@ target_compile_definitions(migraphx_gpu PRIVATE
"-DMIGRAPHX_EXTRACT_KERNEL=${MIGRAPHX_EXTRACT_KERNEL}"
"-DMIGRAPHX_USE_HIPRTC=0"
)
if(DEFINED CMAKE_CXX_COMPILER_LAUNCHER)
execute_process(COMMAND which ${CMAKE_CXX_COMPILER_LAUNCHER} OUTPUT_VARIABLE MIGRAPHX_HIP_COMPILER_LAUNCHER)
string(STRIP "${MIGRAPHX_HIP_COMPILER_LAUNCHER}" MIGRAPHX_HIP_COMPILER_LAUNCHER)
target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_HIP_COMPILER_LAUNCHER=${MIGRAPHX_HIP_COMPILER_LAUNCHER}")
endif()
endif()
# Check miopen find mode api
......
......@@ -178,6 +178,12 @@ bool is_hip_clang_compiler()
return result;
}
bool has_compiler_launcher()
{
static const auto result = fs::exists(MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER_LAUNCHER));
return result;
}
std::vector<std::vector<char>>
compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std::string& arch)
{
......@@ -210,6 +216,10 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
src_compiler compiler;
compiler.flags = params;
compiler.compiler = MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER);
#ifdef MIGRAPHX_HIP_COMPILER_LAUNCHER
if(has_compiler_launcher())
compiler.launcher = MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER_LAUNCHER);
#endif
if(is_hcc_compiler())
compiler.process = [&](const fs::path& obj_path) -> fs::path {
......@@ -238,14 +248,6 @@ std::string enum_params(std::size_t count, std::string param)
return join_strings(items, ",");
}
std::size_t compute_global(std::size_t n, std::size_t local)
{
std::size_t groups = (n + local - 1) / local;
// max possible number of blocks is set to 1B (1,073,741,824)
std::size_t nglobal = std::min<std::size_t>(1073741824, groups) * local;
return nglobal;
}
#endif // MIGRAPHX_USE_HIPRTC
} // namespace gpu
......
......@@ -93,6 +93,32 @@ const std::vector<std::string>& compiler_warnings()
return warnings;
}
void hip_compile_options::set_launch_params(
const value& v,
const std::function<std::size_t(std::size_t local)>& compute_global,
std::size_t default_local)
{
local = v.get("local", default_local);
if(v.contains("global"))
global = v.at("global").to<std::size_t>();
else
global = compute_global(local);
}
std::function<std::size_t(std::size_t local)>
compute_global_for(context& ctx, std::size_t n, std::size_t over)
{
assert(over > 0);
std::size_t max_global = ctx.get_current_device().get_cu_count() *
ctx.get_current_device().get_max_workitems_per_cu();
return [n, over, max_global](std::size_t local) {
std::size_t groups = (n + local - 1) / local;
std::size_t max_blocks = max_global / local;
std::size_t nglobal = std::min(max_blocks * over, groups) * local;
return nglobal;
};
}
operation compile_hip_code_object(const std::string& content, hip_compile_options options)
{
std::vector<src_file> srcs;
......
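compute_global_for replaces the removed fixed-limit compute_global with a per-device cap: max_global is the CU count times the maximum work-items per CU, and the returned lambda never requests more than `over` times that many work-items. A worked example with made-up device numbers (120 CUs, 2048 work-items per CU, so max_global = 245760):

// Illustrative only; n and the device limits are assumptions, not queried values.
auto cg = compute_global_for(ctx, /*n=*/1'000'000, /*over=*/1);

// For local = 1024:
//   groups     = (1'000'000 + 1023) / 1024  = 977
//   max_blocks = 245'760 / 1024             = 240
//   nglobal    = min(240 * 1, 977) * 1024   = 245'760
std::size_t nglobal = cg(1024);

Passing over > 1 simply scales the cap, allowing that many times the resident block count.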
......@@ -6,7 +6,7 @@
#include <migraphx/par_for.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/op/identity.hpp>
#include <migraphx/gpu/compile_pointwise.hpp>
#include <migraphx/gpu/compiler.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -40,35 +40,9 @@ struct precompile_op
MIGRAPHX_REGISTER_OP(precompile_op);
struct pointwise_compiler
{
std::string name() const { return "pointwise"; }
operation apply(context& ctx, instruction_ref ins, const operation&) const
{
assert(not ins->module_inputs().empty());
auto* pm = ins->module_inputs().front();
return compile_pointwise(ctx, to_shapes(ins->inputs()), *pm);
}
};
using compiler_function = std::function<operation(context&, instruction_ref, operation)>;
template <class T>
compiler_function make_compiler_function(T x)
{
return {[=](auto&&... xs) { return x.apply(xs...); }};
}
template <class... Ts>
std::unordered_map<std::string, compiler_function> make_compilers(Ts... xs)
{
return {{xs.name(), make_compiler_function(xs)}...};
}
struct compiled_result
{
operation op;
compiler_replace replace;
instruction_ref ins;
};
......@@ -82,7 +56,6 @@ void par_compile(std::size_t n, F f)
void compile_ops::apply(module& m) const
{
auto compilers = make_compilers(pointwise_compiler{});
std::vector<std::function<compiled_result()>> compiles;
for(auto ins : iterator_for(m))
......@@ -90,15 +63,15 @@ void compile_ops::apply(module& m) const
if(ins->name() != "gpu::precompile_op")
continue;
operation preop = any_cast<precompile_op>(ins->get_operator()).op;
assert(contains(compilers, preop.name()));
auto c = compilers[preop.name()];
compiles.emplace_back([=]() -> compiled_result { return {c(*ctx, ins, preop), ins}; });
compiles.emplace_back([=]() -> compiled_result {
return {compile(*ctx, ins, preop), ins};
});
}
std::vector<compiled_result> results(compiles.size());
par_compile(compiles.size(), [&](auto i) { results[i] = compiles[i](); });
for(const auto& cr : results)
{
m.replace_instruction(cr.ins, cr.op, cr.ins->inputs());
cr.replace(m, cr.ins);
}
}
......
#include <migraphx/gpu/compile_pointwise.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/cpp_generator.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/eliminate_common_subexpression.hpp>
#include <migraphx/module.hpp>
#include <migraphx/pass_manager.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
static const char* const pointwise_kernel = R"__migraphx__(
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/pointwise.hpp>
#include <args.hpp>
namespace migraphx {
${preamble}
extern "C" {
__global__ void kernel(${params})
{
pointwise(${lambda}, ${args});
}
}
} // namespace migraphx
int main() {}
)__migraphx__";
operation compile_pointwise(context&,
const std::vector<shape>& inputs,
const std::string& lambda,
const std::string& preamble)
{
hip_compile_options options;
options.global = compute_global(inputs.front().elements());
options.local = 1024;
options.inputs = inputs;
options.output = inputs.back();
options.virtual_inputs = reduce_dims(inputs);
options.params = "-Wno-float-equal";
auto src = interpolate_string(pointwise_kernel,
{{"params", enum_params(inputs.size(), "void * private_p")},
{"args", enum_params(inputs.size(), "private_p")},
{"lambda", lambda},
{"preamble", preamble}});
return compile_hip_code_object(src, options);
}
operation compile_pointwise(context& ctx, const std::vector<shape>& inputs, module m)
{
run_passes(m, {eliminate_common_subexpression{}, dead_code_elimination{}});
cpp_generator g;
g.fmap([](const std::string& fname) { return "migraphx::" + fname; });
g.add_point_op("where", "${function:where}(${0}, ${1}, ${2})");
g.add_point_op("prelu", "${function:where}(${0} < 0, ${0} * ${1}, ${0})");
g.add_point_op("sign", "${function:where}(${0} > 0, 1, ${function:where}(${0} < 0, -1, 0))");
g.add_point_op("equal", "migraphx::abs(${0} == ${1})");
g.add_point_op("less", "migraphx::abs(${0} < ${1})");
g.add_point_op("greater", "migraphx::abs(${0} > ${1})");
g.add_point_op("not", "migraphx::abs(not ${0})");
// Add explicit conversions
g.fresult(
[](const shape& s) { return "migraphx::convert<" + shape::cpp_type(s.type()) + ">"; });
auto name =
g.create_function(g.generate_module(m).set_attributes({"__device__"}).set_generic_types(m));
return compile_pointwise((ctx), inputs, "MIGRAPHX_LIFT(" + name + ")", g.str());
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/compiler.hpp>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
auto& compiler_map()
{
static std::unordered_map<std::string, compiler_compile> m; // NOLINT
return m;
}
auto& compiler_op_map()
{
static std::unordered_map<std::string, compiler_compile_op> m; // NOLINT
return m;
}
void register_compiler(const std::string& name, compiler_compile c, compiler_compile_op cop)
{
compiler_map()[name] = std::move(c);
compiler_op_map()[name] = std::move(cop);
}
bool has_compiler_for(const std::string& name) { return compiler_map().count(name) > 0; }
compiler_replace compile(context& ctx, instruction_ref ins, const operation& op)
{
return compiler_map().at(op.name())(ctx, ins, op);
}
operation
compile_op(const std::string& name, context& ctx, const std::vector<shape>& inputs, const value& v)
{
return compiler_op_map().at(name)(ctx, inputs, v);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
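This registry is what compile_ops consults above in place of its removed hard-coded compiler map. A hedged sketch of the lookup path, mirroring the compile_ops.cpp hunk (names follow that hunk, with ctx taken as a context& and error handling omitted):

operation preop = any_cast<precompile_op>(ins->get_operator()).op;
if(has_compiler_for(preop.name()))
{
    compiler_replace cr = compile(ctx, ins, preop); // dispatch through compiler_map()
    cr(m, ins);                                     // apply the replacement to module m
}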
......@@ -215,36 +215,22 @@ __device__ __half2 block_reduce_half2(
// m = x - mean(x)
// m / sqrt(mean(m ^ 2) + 1e-12)
__global__ void triadd_layernorm_kernel_half2(
void* in1, void* in2, void* in3, void* data_out, index_int batch_item_num, index_int block_size)
__device__ void layernorm_kernel_half2(__half2* in_data,
__half2* in_data_reduce,
__half2* out,
index_int batch_item_num,
index_int block_size,
float rbatch_num)
{
__half2* input1 = reinterpret_cast<__half2*>(in1);
__half2* input2 = reinterpret_cast<__half2*>(in2);
__half2* input3 = reinterpret_cast<__half2*>(in3);
__half2* output = reinterpret_cast<__half2*>(data_out);
auto rnum = __float2half2_rn(1.0f / batch_item_num);
batch_item_num /= 2;
auto rnum = __float2half2_rn(rbatch_num);
extern MIGRAPHX_DEVICE_SHARED __half2 buffer2[];
__half2* in_data_reduce = buffer2;
__half2* in_data = buffer2 + batch_item_num;
int start = blockIdx.x * batch_item_num;
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
int idx = i + start;
in_data[i] = __hadd2(__hadd2(input1[idx], input2[idx]), input3[idx]);
in_data_reduce[i] = in_data[i];
// in_data_reduce[i] = __hmul2(in_data[i], rnum);
}
auto m =
block_reduce_half2(in_data_reduce, batch_item_num, threadIdx.x, block_size, half2_sum{});
m = __hmul2(m, rnum);
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
in_data[i] = __hsub2(in_data[i], m);
// in_data_reduce[i] = __hmul2(__hmul2(in_data[i], in_data[i]), rnum);
in_data[i] = __hsub2(in_data[i], m);
in_data_reduce[i] = __hmul2(in_data[i], in_data[i]);
}
......@@ -255,11 +241,36 @@ __global__ void triadd_layernorm_kernel_half2(
auto r = __hadd2(m, eps);
r = h2rsqrt(r);
int start = blockIdx.x * batch_item_num;
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
int idx = i + start;
out[idx] = __hmul2(in_data[i], r);
}
}
__global__ void triadd_layernorm_half2(
void* in1, void* in2, void* in3, void* data_out, index_int batch_item_num, index_int block_size)
{
__half2* input1 = reinterpret_cast<__half2*>(in1);
__half2* input2 = reinterpret_cast<__half2*>(in2);
__half2* input3 = reinterpret_cast<__half2*>(in3);
__half2* output = reinterpret_cast<__half2*>(data_out);
float rnum = 1.0f / batch_item_num;
batch_item_num /= 2;
extern MIGRAPHX_DEVICE_SHARED __half2 buffer2[];
__half2* in_data_reduce = buffer2;
__half2* in_data = buffer2 + batch_item_num;
int start = blockIdx.x * batch_item_num;
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
int idx = i + start;
output[idx] = __hmul2(in_data[i], r);
int idx = i + start;
in_data[i] = __hadd2(__hadd2(input1[idx], input2[idx]), input3[idx]);
in_data_reduce[i] = in_data[i];
}
layernorm_kernel_half2(in_data, in_data_reduce, output, batch_item_num, block_size, rnum);
}
template <class T>
......@@ -281,105 +292,59 @@ block_reduce_half(T* buffer, index_int batch_item_num, index_int tid, index_int
// m = x - mean(x)
// m / sqrt(mean(m ^ 2) + 1e-12)
__global__ void triadd_layernorm_kernel_half(
void* in1, void* in2, void* in3, void* data_out, index_int batch_item_num, index_int block_size)
__device__ void layernorm_kernel_half(__half* in_data,
__half* in_data_reduce,
__half* out,
index_int batch_item_num,
index_int block_size,
float rnum)
{
__half* input1 = reinterpret_cast<__half*>(in1);
__half* input2 = reinterpret_cast<__half*>(in2);
__half* input3 = reinterpret_cast<__half*>(in3);
__half* output = reinterpret_cast<__half*>(data_out);
extern MIGRAPHX_DEVICE_SHARED __half bufferh[];
__half* in_data_reduce = bufferh;
__half* in_data = bufferh + batch_item_num;
auto m = block_reduce_half(in_data_reduce, batch_item_num, threadIdx.x, block_size);
m *= rnum;
int start = blockIdx.x * batch_item_num;
auto rnum = 1.0f / batch_item_num;
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
int idx = i + start;
in_data[i] = __float2half(__half2float(input1[idx]) + __half2float(input2[idx]) +
__half2float(input3[idx]));
in_data_reduce[i] = __float2half(__half2float(in_data[i]) * __half2float(rnum));
in_data[i] = __float2half(__half2float(in_data[i]) - __half2float(m));
in_data_reduce[i] = __float2half(__half2float(in_data[i]) * __half2float(in_data[i]));
}
auto m = block_reduce_half(in_data_reduce, batch_item_num, threadIdx.x, block_size);
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
in_data[i] = __float2half(__half2float(in_data[i]) - __half2float(m));
in_data_reduce[i] =
__float2half(__half2float(in_data[i]) * __half2float(in_data[i]) * __half2float(rnum));
}
m = block_reduce_half(in_data_reduce, batch_item_num, threadIdx.x, block_size);
m *= rnum;
m += 1.0e-12f;
m = __float2half(
__half2float(block_reduce_half(in_data_reduce, batch_item_num, threadIdx.x, block_size)) +
1.0e-12f);
auto r = __float2half(rsqrt(__half2float(m)));
int start = blockIdx.x * batch_item_num;
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
int idx = i + start;
output[idx] = __float2half(__half2float(in_data[i]) * __half2float(r));
}
}
template <class T>
__device__ T block_reduce(T* buffer, index_int batch_item_num, index_int tid, index_int block_size)
{
__syncthreads();
for(index_int s = block_size; s > 0; s >>= 1)
{
if(tid < s and tid + s < batch_item_num)
{
buffer[tid] = buffer[tid] + buffer[tid + s];
}
__syncthreads();
int idx = i + start;
out[idx] = __float2half(__half2float(in_data[i]) * __half2float(r));
}
return buffer[0];
}
// m = x - mean(x)
// m / sqrt(mean(m ^ 2) + 1e-12)
template <class T>
__global__ void triadd_layernorm_kernel(
__global__ void triadd_layernorm_half(
void* in1, void* in2, void* in3, void* data_out, index_int batch_item_num, index_int block_size)
{
T* input1 = reinterpret_cast<T*>(in1);
T* input2 = reinterpret_cast<T*>(in2);
T* input3 = reinterpret_cast<T*>(in3);
T* output = reinterpret_cast<T*>(data_out);
extern MIGRAPHX_DEVICE_SHARED T buffer[];
T* in_data_reduce = buffer;
T* in_data = buffer + batch_item_num;
__half* input1 = reinterpret_cast<__half*>(in1);
__half* input2 = reinterpret_cast<__half*>(in2);
__half* input3 = reinterpret_cast<__half*>(in3);
__half* output = reinterpret_cast<__half*>(data_out);
float rnum = 1.0f / batch_item_num;
extern MIGRAPHX_DEVICE_SHARED __half bufferh[];
__half* in_data_reduce = bufferh;
__half* in_data = bufferh + batch_item_num;
int start = blockIdx.x * batch_item_num;
auto rnum = 1.0f / batch_item_num;
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
int idx = i + start;
in_data[i] = input1[idx] + input2[idx] + input3[idx];
in_data_reduce[i] = in_data[i];
// in_data_reduce[i] = __half2float(in_data[i]) * rnum;
}
auto m = block_reduce(in_data_reduce, batch_item_num, threadIdx.x, block_size);
m = m * rnum;
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
in_data[i] = in_data[i] - m;
in_data_reduce[i] = in_data[i] * in_data[i];
// in_data_reduce[i] = __half2float(in_data[i] * in_data[i]) * rnum;
int idx = i + start;
in_data[i] = __float2half(__half2float(input1[idx]) + __half2float(input2[idx]) +
__half2float(input3[idx]));
}
m = block_reduce(in_data_reduce, batch_item_num, threadIdx.x, block_size);
m = m * rnum + 1.0e-12f;
auto r = rsqrt(m);
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
int idx = i + start;
// output[idx] = __half2float(in_data[i]) * r;
output[idx] = in_data[i] * r;
}
layernorm_kernel_half(in_data, in_data_reduce, output, batch_item_num, block_size, rnum);
}
void triadd_layernorm(hipStream_t stream,
......@@ -393,12 +358,17 @@ void triadd_layernorm(hipStream_t stream,
auto batch_item_num = in_s.lens().back();
if(type == shape::half_type and (batch_item_num % 2) == 0)
{
auto half2_block_size = compute_block_size(batch_item_num, 1024);
auto block_size = compute_block_size(batch_item_num, 1024);
int block_num = in_s.elements() / batch_item_num;
int shared_size = batch_item_num * 2 * in_s.type_size();
half2_block_size = half2_block_size / 4;
triadd_layernorm_kernel_half2<<<block_num, half2_block_size, shared_size, stream>>>(
auto half2_block_size = block_size / 4;
triadd_layernorm_half2<<<block_num, half2_block_size, shared_size, stream>>>(
arg1.data(), arg2.data(), arg3.data(), result.data(), batch_item_num, half2_block_size);
// auto half_block_size = block_size / 2;
// triadd_layernorm_half2<<<block_num, half_block_size, shared_size, stream>>>(
// arg1.data(), arg2.data(), arg3.data(), result.data(), batch_item_num,
// half_block_size);
}
else
{
......@@ -409,11 +379,11 @@ void triadd_layernorm(hipStream_t stream,
}
__global__ void
layernorm_kernel_half2(void* in1, void* data_out, index_int batch_item_num, index_int block_size)
layernorm_half2(void* in1, void* data_out, index_int batch_item_num, index_int block_size)
{
__half2* input1 = reinterpret_cast<__half2*>(in1);
__half2* output = reinterpret_cast<__half2*>(data_out);
auto rnum = __float2half2_rn(1.0f / batch_item_num);
float rnum = 1.0f / batch_item_num;
batch_item_num /= 2;
extern MIGRAPHX_DEVICE_SHARED __half2 buffer2[];
__half2* in_data_reduce = buffer2;
......@@ -427,28 +397,28 @@ layernorm_kernel_half2(void* in1, void* data_out, index_int batch_item_num, inde
in_data_reduce[i] = in_data[i];
}
auto m =
block_reduce_half2(in_data_reduce, batch_item_num, threadIdx.x, block_size, half2_sum{});
m = __hmul2(m, rnum);
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
in_data[i] = __hsub2(in_data[i], m);
in_data_reduce[i] = __hmul2(in_data[i], in_data[i]);
}
m = block_reduce_half2(in_data_reduce, batch_item_num, threadIdx.x, block_size, half2_sum{});
m = __hmul2(m, rnum);
layernorm_kernel_half2(in_data, in_data_reduce, output, batch_item_num, block_size, rnum);
}
auto eps = __float2half2_rn(1.0e-12f);
auto r = __hadd2(m, eps);
r = h2rsqrt(r);
__global__ void
layernorm_half(void* in1, void* data_out, index_int batch_item_num, index_int block_size)
{
__half* input1 = reinterpret_cast<__half*>(in1);
__half* output = reinterpret_cast<__half*>(data_out);
float rnum = 1.0f / batch_item_num;
extern MIGRAPHX_DEVICE_SHARED __half buffer3[];
__half* in_data_reduce = buffer3;
__half* in_data = buffer3 + batch_item_num;
int start = blockIdx.x * batch_item_num;
for(int i = threadIdx.x; i < batch_item_num; i += block_size)
{
int idx = i + start;
output[idx] = __hmul2(in_data[i], r);
int idx = i + start;
in_data[i] = input1[idx];
in_data_reduce[i] = in_data[i];
}
layernorm_kernel_half(in_data, in_data_reduce, output, batch_item_num, block_size, rnum);
}
void layernorm(hipStream_t stream, const argument& result, const argument& arg1)
......@@ -458,12 +428,16 @@ void layernorm(hipStream_t stream, const argument& result, const argument& arg1)
auto batch_item_num = in_s.lens().back();
if(type == shape::half_type and (batch_item_num % 2) == 0)
{
auto half2_block_size = compute_block_size(batch_item_num, 1024);
auto block_size = compute_block_size(batch_item_num, 1024);
int block_num = in_s.elements() / batch_item_num;
int shared_size = batch_item_num * 2 * in_s.type_size();
half2_block_size = half2_block_size / 4;
layernorm_kernel_half2<<<block_num, half2_block_size, shared_size, stream>>>(
auto half2_block_size = block_size / 4;
layernorm_half2<<<block_num, half2_block_size, shared_size, stream>>>(
arg1.data(), result.data(), batch_item_num, half2_block_size);
// auto half_block_size = block_size / 2;
// layernorm_half2<<<block_num, half_block_size, shared_size, stream>>>(
// arg1.data(), result.data(), batch_item_num, half_block_size);
}
else
{
......
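The host-side launch arithmetic is easiest to see with concrete numbers. Assuming a hypothetical row length of batch_item_num = 1024 half values (2 bytes each): shared_size = 1024 * 2 * 2 = 4096 bytes, enough for the two row-sized staging buffers (in_data_reduce and in_data); on the half2 path batch_item_num is halved to 512, so the same 4096 bytes hold two buffers of 512 __half2 each, and the kernel is launched with block_size / 4 threads per block, each thread striding over several __half2 elements of its row.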
file(GLOB GPU_DRIVER_SRCS ${CONFIGURE_DEPENDS} ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
add_executable(gpu-driver
action.cpp
compile_pointwise.cpp
main.cpp
parser.cpp
perf.cpp
run_op.cpp
${GPU_DRIVER_SRCS}
)
target_include_directories(gpu-driver PRIVATE include)
target_link_libraries(gpu-driver PRIVATE migraphx_gpu)
#include <migraphx/gpu/driver/action.hpp>
#include <migraphx/gpu/driver/perf.hpp>
#include <migraphx/gpu/compile_pointwise.hpp>
#include <migraphx/gpu/compiler.hpp>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
......@@ -8,13 +8,13 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace driver {
struct compile_pointwise : action<compile_pointwise>
struct compile_op : action<compile_op>
{
static void apply(const parser& p, const value& v)
{
context ctx;
auto inputs = p.parse_shapes(v.at("inputs"));
auto op = gpu::compile_pointwise(ctx, inputs, v.at("lambda").to<std::string>());
auto op = gpu::compile_op(v.at("name").to<std::string>(), ctx, inputs, v);
double t = time_op(ctx, op, inputs, p.get(v, "iterations", 100));
std::cout << op << ": " << t << "ms" << std::endl;
}
......
......@@ -17,8 +17,6 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
std::string enum_params(std::size_t count, std::string param);
std::size_t compute_global(std::size_t n, std::size_t local = 1024);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......
......@@ -8,6 +8,8 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
struct hip_compile_options
{
std::size_t global;
......@@ -17,8 +19,24 @@ struct hip_compile_options
std::string kernel_name = "kernel";
std::string params = "";
std::vector<shape> virtual_inputs = {};
/**
 * @brief Set the launch parameters, allowing v to override the default values
 *
 * @param v A value which may contain "global" and/or "local" keys that override the default
 * global and local sizes
 * @param compute_global A function used to compute the global size from the local size
 * @param default_local The default local size to use if it is missing from the v parameter
*/
void set_launch_params(const value& v,
const std::function<std::size_t(std::size_t local)>& compute_global,
std::size_t default_local = 1024);
};
/// Compute the global launch size for n elements, capped at a target-specific upper limit
std::function<std::size_t(std::size_t local)>
compute_global_for(context& ctx, std::size_t n, std::size_t over = 1);
operation compile_hip_code_object(const std::string& content, hip_compile_options options);
} // namespace gpu
......
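Together, set_launch_params and compute_global_for let a compiler take its launch configuration from the incoming value instead of hard-coding it. A hedged usage sketch (v, ctx, and out_elements stand in for whatever the caller already has in scope):

hip_compile_options options;
// v may carry {"local": 256} and/or an explicit "global"; anything missing
// falls back to default_local and to compute_global_for's per-device cap.
options.set_launch_params(v, compute_global_for(ctx, out_elements), 1024);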
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_POINTWISE_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_POINTWISE_HPP
#include <migraphx/config.hpp>
#include <migraphx/operation.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct module;
namespace gpu {
struct context;
operation compile_pointwise(context& ctx,
const std::vector<shape>& inputs,
const std::string& lambda,
const std::string& preamble = "");
operation compile_pointwise(context& ctx, const std::vector<shape>& inputs, module m);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_POINTWISE_HPP
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP
#include <migraphx/config.hpp>
#include <migraphx/operation.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
operation compile_roialign(context& ctx, const std::vector<shape>& io_shapes, const value& val);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP