Commit d3267bb3 authored by Shucai Xiao

Merge branch 'develop' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into test_runner_match_input_output
parents 2d9e620b a33d6fa2
......@@ -218,7 +218,7 @@ jobs:
run: |
echo "leak:dnnl::impl::malloc" > suppressions.txt
export LSAN_OPTIONS="suppressions=$(pwd)/suppressions.txt"
rbuild build -d cget -s gh -t check \
rbuild build -d cget -s gh -T check \
-DCMAKE_BUILD_TYPE=${{matrix.configuration}} \
-DMIGRAPHX_ENABLE_PYTHON=${{matrix.configuration == 'release' && 'On' || 'Off'}} \
-DCMAKE_CXX_FLAGS_DEBUG="-g1 -Os -fdebug-prefix-map=$PWD=. -fdebug-types-section -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined" \
......
......@@ -17,6 +17,7 @@
#include <migraphx/type_name.hpp>
#include <migraphx/functional.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/rank.hpp>
namespace migraphx {
namespace driver {
......@@ -106,10 +107,22 @@ struct argument_parser
return to_string_range(x);
}
template <class T>
auto as_string_value(rank<1>, const T& x) -> decltype(to_string(x))
{
return to_string(x);
}
template <class T>
std::string as_string_value(rank<0>, const T&)
{
throw std::runtime_error("Can't convert to string");
}
template <class T, MIGRAPHX_REQUIRES(not is_multi_value<T>{})>
std::string as_string_value(const T& x)
{
return to_string(x);
return as_string_value(rank<1>{}, x);
}
template <class T, class... Fs>
......@@ -122,10 +135,11 @@ struct argument_parser
return false;
}});
argument& arg = arguments.back();
arg.type = migraphx::get_type_name<T>();
arg.default_value = as_string_value(x);
argument& arg = arguments.back();
arg.type = migraphx::get_type_name<T>();
migraphx::each_args([&](auto f) { f(x, arg); }, fs...);
if(not arg.default_value.empty() and arg.nargs > 0)
arg.default_value = as_string_value(x);
}
template <class... Fs>
......
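
The as_string_value change above uses MIGraphX's rank-based tag dispatch: rank<N> derives from rank<N-1>, so the rank<1> overload is preferred whenever its trailing decltype (here, that to_string(x) is well-formed) succeeds, and the rank<0> overload serves as the throwing fallback. A self-contained sketch of the idiom follows; the rank type and the as_string overloads below are illustrative stand-ins, not the MIGraphX headers.

    #include <sstream>
    #include <stdexcept>
    #include <string>
    #include <utility>

    // Stand-alone analog of migraphx::rank<N>: each rank derives from the one
    // below it, so rank<1>{} prefers an overload taking rank<1> but can still
    // bind to rank<0>.
    template <int N>
    struct rank : rank<N - 1>
    {
    };
    template <>
    struct rank<0>
    {
    };

    // Preferred overload: participates only when `os << x` is well-formed.
    template <class T>
    auto as_string(rank<1>, const T& x)
        -> decltype(std::declval<std::ostringstream&>() << x, std::string{})
    {
        std::ostringstream os;
        os << x;
        return os.str();
    }

    // Fallback for non-streamable types, mirroring the rank<0> overload above.
    template <class T>
    std::string as_string(rank<0>, const T&)
    {
        throw std::runtime_error("Can't convert to string");
    }

    template <class T>
    std::string as_string(const T& x)
    {
        // rank<1>{} converts to rank<0>, so both overloads stay viable.
        return as_string(rank<1>{}, x);
    }

    int main()
    {
        struct opaque
        {
        };
        std::string ok = as_string(42); // picks the streaming overload
        try
        {
            as_string(opaque{}); // falls back to rank<0> and throws
        }
        catch(const std::runtime_error&)
        {
        }
        return ok == "42" ? 0 : 1;
    }
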
#include "verify.hpp"
#include "argument_parser.hpp"
#include "command.hpp"
#include "verify.hpp"
#include "precision.hpp"
#include "perf.hpp"
#include "models.hpp"
#include "marker_roctx.hpp"
......@@ -288,14 +289,12 @@ struct compiler_target
struct compiler
{
static const int q_fp16 = 1;
static const int q_int8 = 2;
loader l;
program_params parameters;
compiler_target ct;
bool offload_copy = false;
bool fast_math = true;
int quantize = 0;
bool offload_copy = false;
bool fast_math = true;
precision quantize = precision::fp32;
std::vector<std::string> fill0;
std::vector<std::string> fill1;
......@@ -312,8 +311,8 @@ struct compiler
{"--disable-fast-math"},
ap.help("Disable fast math optimization"),
ap.set_value(false));
ap(quantize, {"--fp16"}, ap.help("Quantize for fp16"), ap.set_value(q_fp16));
ap(quantize, {"--int8"}, ap.help("Quantize for int8"), ap.set_value(q_int8));
ap(quantize, {"--fp16"}, ap.help("Quantize for fp16"), ap.set_value(precision::fp16));
ap(quantize, {"--int8"}, ap.help("Quantize for int8"), ap.set_value(precision::int8));
}
auto params(const program& p) { return parameters.generate(p, ct.get_target(), offload_copy); }
......@@ -325,11 +324,11 @@ struct compiler
if(p.is_compiled())
return p;
auto t = ct.get_target();
if(quantize == q_fp16)
if(quantize == precision::fp16)
{
quantize_fp16(p);
}
else if(quantize == q_int8)
else if(quantize == precision::int8)
{
quantize_int8(p, t, {params(p)});
}
......@@ -377,6 +376,7 @@ struct verify : command<verify>
bool reduce = false;
bool offload_copy = false;
bool fast_math = true;
precision quantize = precision::fp32;
void parse(argument_parser& ap)
{
l.parse(ap);
......@@ -396,6 +396,7 @@ struct verify : command<verify>
ap.help("Verify each instruction"),
ap.set_value(true));
ap(reduce, {"-r", "--reduce"}, ap.help("Reduce program and verify"), ap.set_value(true));
ap(quantize, {"--fp16"}, ap.help("Quantize for fp16"), ap.set_value(precision::fp16));
}
void run()
......@@ -412,15 +413,15 @@ struct verify : command<verify>
if(per_instruction)
{
verify_instructions(p, t, options, tolerance);
verify_instructions(p, t, options, quantize, tolerance);
}
else if(reduce)
{
verify_reduced_program(p, t, options, m, tolerance);
verify_reduced_program(p, t, options, quantize, m, tolerance);
}
else
{
verify_program(l.file, p, t, options, m, tolerance);
verify_program(l.file, p, t, options, quantize, m, tolerance);
}
}
};
......
#ifndef MIGRAPHX_GUARD_RTGLIB_PRECISION_HPP
#define MIGRAPHX_GUARD_RTGLIB_PRECISION_HPP
namespace migraphx {
namespace driver {
inline namespace MIGRAPHX_INLINE_NS {
enum class precision
{
fp32,
fp16,
int8
};
} // namespace MIGRAPHX_INLINE_NS
} // namespace driver
} // namespace migraphx
#endif
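
This new precision.hpp replaces the driver's q_fp16/q_int8 integer flags with an enum class. An enum class has no built-in stream insertion, so the rank-dispatched as_string_value (backed by the now SFINAE-constrained to_string) still compiles for such option types, selecting the throwing fallback instead of producing a hard compile error. If a printable default were ever wanted, a stream operator along the following lines could be added; this is a hypothetical sketch using a standalone copy of the enum, not part of the commit.

    #include <iostream>

    // Standalone analog of the driver's precision enum; the operator<< below is
    // NOT in the commit, which relies on the rank<0> fallback instead.
    enum class precision
    {
        fp32,
        fp16,
        int8
    };

    inline std::ostream& operator<<(std::ostream& os, precision p)
    {
        switch(p)
        {
        case precision::fp32: return os << "fp32";
        case precision::fp16: return os << "fp16";
        case precision::int8: return os << "int8";
        }
        return os;
    }

    int main()
    {
        std::cout << precision::fp16 << std::endl; // prints "fp16"
        return 0;
    }
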
......@@ -6,6 +6,7 @@
#include <migraphx/verify_args.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/compile_options.hpp>
#include <migraphx/quantization.hpp>
namespace migraphx {
namespace driver {
......@@ -19,9 +20,16 @@ std::vector<argument> run_ref(program p, const parameter_map& inputs)
return out;
}
std::vector<argument>
run_target(program p, const target& t, const compile_options& options, const parameter_map& inputs)
std::vector<argument> run_target(program p,
const target& t,
const compile_options& options,
precision quantize,
const parameter_map& inputs)
{
if(quantize == precision::fp16)
{
quantize_fp16(p);
}
p.compile(t, options);
parameter_map m;
......@@ -43,24 +51,24 @@ void verify_program(const std::string& name,
const program& p,
const target& t,
compile_options options,
precision quantize,
const parameter_map& inputs,
double tolerance)
{
auto x = run_ref(p, inputs);
auto y = run_target(p, t, options, inputs);
auto y = run_target(p, t, options, quantize, inputs);
std::size_t output_num = x.size();
for(std::size_t i = 0; i < output_num; ++i)
{
verify_args(name, x[i], y[i], tolerance);
}
// std::cout << "cpu: " << x << std::endl;
// std::cout << "gpu: " << y << std::endl;
}
void verify_instructions(const program& prog,
const target& t,
compile_options options,
precision quantize,
double tolerance)
{
const auto* mm_prog = prog.get_main_module();
......@@ -92,7 +100,8 @@ void verify_instructions(const program& prog,
{
std::cout << "Verify: " << ins.name() << std::endl;
std::cout << p << std::endl;
verify_program(ins.name(), p, t, options, create_param_map(p, false), tolerance);
verify_program(
ins.name(), p, t, options, quantize, create_param_map(p, false), tolerance);
}
catch(...)
{
......@@ -106,6 +115,7 @@ void verify_reduced(program p,
int n,
const target& t,
compile_options options,
precision quantize,
const parameter_map& inputs,
double tolerance)
{
......@@ -114,12 +124,13 @@ void verify_reduced(program p,
mm->remove_instructions(last, mm->end());
std::cout << "Verify: " << std::endl;
std::cout << p << std::endl;
verify_program(std::to_string(n), p, t, options, inputs, tolerance);
verify_program(std::to_string(n), p, t, options, quantize, inputs, tolerance);
}
void verify_reduced_program(const program& p,
const target& t,
compile_options options,
precision quantize,
const parameter_map& inputs,
double tolerance)
{
......@@ -127,7 +138,7 @@ void verify_reduced_program(const program& p,
auto n = std::distance(mm->begin(), mm->end());
for(std::size_t i = 0; i < n; i++)
{
verify_reduced(p, i, t, options, inputs, tolerance);
verify_reduced(p, i, t, options, quantize, inputs, tolerance);
}
}
......
#ifndef MIGRAPHX_GUARD_RTGLIB_DRIVER_VERIFY_HPP
#define MIGRAPHX_GUARD_RTGLIB_DRIVER_VERIFY_HPP
#include "precision.hpp"
#include <migraphx/program.hpp>
namespace migraphx {
......@@ -11,15 +12,18 @@ void verify_program(const std::string& name,
const program& p,
const target& t,
compile_options options = compile_options{},
precision quantize = precision::fp32,
const parameter_map& inputs = {},
double tolerance = 100);
void verify_instructions(const program& prog,
const target& t,
compile_options options = compile_options{},
precision quantize = precision::fp32,
double tolerance = 80);
void verify_reduced_program(const program& p,
const target& t,
compile_options options = compile_options{},
precision quantize = precision::fp32,
const parameter_map& inputs = {},
double tolerance = 80);
......
......@@ -11,11 +11,13 @@
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
static bool try_compute_shape(instruction_ref ins, const std::vector<shape>& inputs)
static bool try_compute_shape(instruction_ref ins,
const std::vector<shape>& inputs,
const std::vector<module_ref>& mods)
{
try
{
shape new_shape = ins->get_operator().compute_shape(inputs);
shape new_shape = ins->get_operator().compute_shape(inputs, mods);
// If the output shape is a standard shape, no need to try its output
if(new_shape.standard())
{
......@@ -45,7 +47,7 @@ static bool try_compute_shape(instruction_ref ins, const std::vector<shape>& inp
return (arg == ins) ? new_shape : arg->get_shape();
});
if(!try_compute_shape(output, input_shapes))
if(!try_compute_shape(output, input_shapes, mods))
{
return false;
}
......@@ -59,10 +61,12 @@ static bool try_compute_shape(instruction_ref ins, const std::vector<shape>& inp
return true;
}
static bool try_compute_shape(instruction_ref ins, const std::vector<instruction_ref>& args)
static bool try_compute_shape(instruction_ref ins,
const std::vector<instruction_ref>& args,
const std::vector<module_ref>& mods)
{
auto inputs = to_shapes(args);
return try_compute_shape(ins, inputs);
return try_compute_shape(ins, inputs, mods);
}
void eliminate_contiguous::apply(module& p) const
......@@ -82,7 +86,7 @@ void eliminate_contiguous::apply(module& p) const
auto new_args = args;
auto prev = arg->inputs().front();
replace(new_args, arg, prev);
if(try_compute_shape(ins, new_args))
if(try_compute_shape(ins, new_args, ins->module_inputs()))
{
instruction::replace_argument(ins, arg, prev);
}
......
......@@ -103,7 +103,14 @@ auto operator==(const T& x, const U& y) -> decltype(x.name() == y.name())
} // namespace operation_operators
template <class T>
auto normalize_compute_shape_op(rank<2>, const T& x, const std::vector<shape>& inputs)
auto compute_shape_op(rank<3>, const T& x, const std::vector<shape>& inputs)
-> decltype(x.compute_shape(inputs))
{
return x.compute_shape(inputs);
}
template <class T>
auto compute_shape_op(rank<2>, const T& x, const std::vector<shape>& inputs)
-> decltype(x.normalize_compute_shape(inputs))
{
dependent_type<operation, T> y = x;
......@@ -112,77 +119,53 @@ auto normalize_compute_shape_op(rank<2>, const T& x, const std::vector<shape>& i
}
template <class T>
auto normalize_compute_shape_op(rank<1>, const T& x, const std::vector<shape>& inputs)
auto compute_shape_op(rank<1>, const T& x, const std::vector<shape>& inputs)
-> decltype(x.compute_shape(inputs, {}))
{
return x.compute_shape(inputs, {});
}
template <class T>
shape normalize_compute_shape_op(rank<0>, const T& x, const std::vector<shape>&)
shape compute_shape_op(rank<0>, const T& x, const std::vector<shape>&)
{
std::string name = x.name();
MIGRAPHX_THROW("Shape not computable: " + name);
}
template <class T>
shape normalize_compute_shape_op(const T& x, const std::vector<shape>& inputs)
shape compute_shape_op(const T& x, const std::vector<shape>& inputs)
{
return normalize_compute_shape_op(rank<2>{}, x, inputs);
return compute_shape_op(rank<3>{}, x, inputs);
}
template <class T>
auto compute_shape_op(rank<1>,
const T& x,
const std::vector<shape>& inputs,
const std::vector<module_ref>& mod_args)
auto mod_compute_shape_op(rank<1>,
const T& x,
const std::vector<shape>& inputs,
const std::vector<module_ref>& mod_args)
-> decltype(x.compute_shape(inputs, mod_args))
{
return x.compute_shape(inputs, mod_args);
}
template <class T>
shape
compute_shape_op(rank<0>, const T& x, const std::vector<shape>&, const std::vector<module_ref>&)
{
std::string name = x.name();
MIGRAPHX_THROW("Shape not computable: " + name);
}
template <class T>
shape compute_shape_op(const T& x,
const std::vector<shape>& inputs,
const std::vector<module_ref>& mod_args)
{
return compute_shape_op(rank<1>{}, x, inputs, mod_args);
}
template <class T>
auto normalize_compute_shape_op(rank<1>,
const T& x,
const std::vector<shape>& inputs,
std::vector<module_ref>& mod_args)
-> decltype(x.normalize_compute_shape(inputs, mod_args))
{
return x.normalize_compute_shape(inputs, mod_args);
}
template <class T>
shape normalize_compute_shape_op(rank<0>,
const T& x,
const std::vector<shape>&,
const std::vector<module_ref>&)
shape mod_compute_shape_op(rank<0>,
const T& x,
const std::vector<shape>& inputs,
const std::vector<module_ref>& mod_args)
{
if(mod_args.empty())
return compute_shape_op(x, inputs);
std::string name = x.name();
MIGRAPHX_THROW("Shape not computable: " + name);
}
template <class T>
shape normalize_compute_shape_op(const T& x,
const std::vector<shape>& inputs,
std::vector<module_ref>& mod_args)
shape mod_compute_shape_op(const T& x,
const std::vector<shape>& inputs,
const std::vector<module_ref>& mod_args)
{
return normalize_compute_shape_op(rank<1>{}, x, inputs, mod_args);
return mod_compute_shape_op(rank<1>{}, x, inputs, mod_args);
}
template <class T>
......@@ -855,7 +838,7 @@ struct operation
T&& private_detail_te_self,
const std::vector<shape>& input)
{
return detail::normalize_compute_shape_op(private_detail_te_self, input);
return detail::compute_shape_op(private_detail_te_self, input);
}
template <class T>
......@@ -874,7 +857,7 @@ struct operation
const std::vector<shape>& inputs,
const std::vector<module_ref>& mod_args)
{
return detail::compute_shape_op(private_detail_te_self, inputs, mod_args);
return detail::mod_compute_shape_op(private_detail_te_self, inputs, mod_args);
}
template <class T>
......@@ -1276,7 +1259,7 @@ template <class T>
inline auto compute_shape(const T& op, const std::vector<shape>& inputs)
-> decltype(op.normalize_compute_shape(inputs))
{
return detail::normalize_compute_shape_op(op, inputs);
return detail::compute_shape_op(op, inputs);
}
inline shape compute_shape(const operation& op,
......@@ -1301,7 +1284,7 @@ inline auto compute_shape(const T& op,
const std::vector<module_ref>& mod_args)
-> decltype(op.normalize_compute_shape(inputs, mod_args))
{
return detail::normalize_compute_shape_op(op, inputs, mod_args);
return detail::compute_shape_op(op, inputs, mod_args);
}
inline bool is_context_free(const operation& op) { return op.is_context_free(); }
......
......@@ -168,7 +168,8 @@ inline std::string to_string_range(const std::initializer_list<T>& r)
}
template <class T>
inline std::string to_string(const T& x)
inline auto to_string(const T& x)
-> decltype((std::declval<std::stringstream>() << x), std::string{})
{
std::stringstream ss;
ss << x;
......
......@@ -15,6 +15,8 @@
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_PASSES);
void validate_pass(module& mod, const pass& p, tracer trace)
{
(void)mod;
......@@ -82,6 +84,8 @@ module& get_module(module_pass_manager& mpm) { return mpm.get_module(); }
void run_passes(module& mod, const std::vector<pass>& passes, tracer trace)
{
if(enabled(MIGRAPHX_TRACE_PASSES{}))
trace = tracer{std::cout};
for(const auto& p : passes)
{
module_pm{&mod, nullptr, &trace}.run_pass(p);
......@@ -90,6 +94,8 @@ void run_passes(module& mod, const std::vector<pass>& passes, tracer trace)
void run_passes(program& prog, const std::vector<pass>& passes, tracer trace)
{
if(enabled(MIGRAPHX_TRACE_PASSES{}))
trace = tracer{std::cout};
for(const auto& p : passes)
{
auto mods = prog.get_modules();
......
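
The pass_manager change gates tracing on the MIGRAPHX_TRACE_PASSES environment variable: when it is set, the caller-supplied (normally silent) tracer is replaced with one writing to std::cout before the pass loop runs. A standalone sketch of this enable-by-environment pattern is below; env_enabled and trace_sink are illustrative stand-ins, not the MIGraphX enabled() helper or tracer class.

    #include <cstdlib>
    #include <iostream>
    #include <ostream>

    // Illustrative stand-in for enabled(MIGRAPHX_TRACE_PASSES{}).
    static bool env_enabled(const char* name)
    {
        const char* v = std::getenv(name);
        return v != nullptr && *v != '\0' && *v != '0';
    }

    // Illustrative stand-in for the tracer: a null sink stays silent.
    struct trace_sink
    {
        std::ostream* os = nullptr;
        template <class T>
        void operator()(const T& x) const
        {
            if(os != nullptr)
                *os << x << std::endl;
        }
    };

    int main()
    {
        trace_sink trace{}; // callers pass a silent tracer by default
        if(env_enabled("MIGRAPHX_TRACE_PASSES"))
            trace = trace_sink{&std::cout}; // swapped in before running passes
        trace("running pass: eliminate_contiguous");
        return 0;
    }
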
......@@ -91,28 +91,34 @@ add_library(migraphx_device
device/unary_not.cpp
device/where.cpp
)
set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
rocm_set_soversion(migraphx_device ${MIGRAPHX_SO_VERSION})
rocm_clang_tidy_check(migraphx_device)
target_compile_options(migraphx_device PRIVATE -std=c++17 -fno-gpu-rdc -Wno-unused-command-line-argument -Xclang -fallow-half-arguments-and-returns)
target_link_libraries(migraphx_device migraphx hip::device -fno-gpu-rdc -Wno-invalid-command-line-argument -Wno-unused-command-line-argument)
if(CMAKE_CXX_COMPILER MATCHES ".*hcc")
set(AMDGPU_TARGETS "gfx803;gfx900;gfx906" CACHE STRING "")
foreach(AMDGPU_TARGET ${AMDGPU_TARGETS})
target_compile_options(migraphx_device PRIVATE -amdgpu-target=${AMDGPU_TARGET})
target_link_libraries(migraphx_device -amdgpu-target=${AMDGPU_TARGET})
endforeach()
else()
target_compile_options(migraphx_device PRIVATE -Wno-cuda-compat)
endif()
add_library(compile_for_gpu INTERFACE)
target_compile_options(compile_for_gpu INTERFACE -std=c++17 -fno-gpu-rdc -Wno-cuda-compat -Wno-unused-command-line-argument -Xclang -fallow-half-arguments-and-returns)
target_link_libraries(compile_for_gpu INTERFACE hip::device -fno-gpu-rdc -Wno-invalid-command-line-argument -Wno-unused-command-line-argument)
check_cxx_compiler_flag("--cuda-host-only -fhip-lambda-host-device -x hip" HAS_HIP_LAMBDA_HOST_DEVICE)
if(HAS_HIP_LAMBDA_HOST_DEVICE)
message(STATUS "Enable -fhip-lambda-host-device")
target_compile_options(migraphx_device PRIVATE -fhip-lambda-host-device)
target_compile_options(compile_for_gpu INTERFACE -fhip-lambda-host-device)
endif()
set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
rocm_set_soversion(migraphx_device ${MIGRAPHX_SO_VERSION})
rocm_clang_tidy_check(migraphx_device)
target_link_libraries(migraphx_device PUBLIC migraphx)
target_link_libraries(migraphx_device PRIVATE compile_for_gpu)
target_include_directories(migraphx_device PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
target_include_directories(migraphx_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/device/include>)
add_library(kernel_file_check EXCLUDE_FROM_ALL)
foreach(KERNEL_FILE ${KERNEL_FILES})
get_filename_component(KERNEL_BASE_FILE ${KERNEL_FILE} NAME_WE)
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/kernels/include/migraphx/kernels/${KERNEL_BASE_FILE}.cpp "#include <migraphx/kernels/${KERNEL_BASE_FILE}.hpp>\n")
target_sources(kernel_file_check PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/kernels/include/migraphx/kernels/${KERNEL_BASE_FILE}.cpp)
endforeach()
target_include_directories(kernel_file_check PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/>)
target_link_libraries(kernel_file_check compile_for_gpu)
rocm_clang_tidy_check(kernel_file_check)
add_library(migraphx_gpu
abs.cpp
analyze_streams.cpp
......@@ -341,7 +347,7 @@ target_link_libraries(migraphx_gpu PRIVATE migraphx_device migraphx_kernels)
add_subdirectory(driver)
rocm_install_targets(
TARGETS migraphx_gpu migraphx_device
TARGETS migraphx_gpu migraphx_device compile_for_gpu
INCLUDE
${CMAKE_CURRENT_SOURCE_DIR}/include
)
......
......@@ -3,6 +3,7 @@
#include <migraphx/module.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/op/identity.hpp>
#include <migraphx/gpu/compile_pointwise.hpp>
......@@ -63,17 +64,31 @@ std::unordered_map<std::string, compiler_function> make_compilers(Ts... xs)
return {{xs.name(), make_compiler_function(xs)}...};
}
struct compiled_result
{
operation op;
instruction_ref ins;
};
void compile_ops::apply(module& m) const
{
auto compilers = make_compilers(pointwise_compiler{});
std::vector<std::function<compiled_result()>> compiles;
for(auto ins : iterator_for(m))
{
if(ins->name() != "gpu::precompile_op")
continue;
operation preop = any_cast<precompile_op>(ins->get_operator()).op;
assert(contains(compilers, preop.name()));
auto op = compilers[preop.name()](*ctx, ins, preop);
m.replace_instruction(ins, op, ins->inputs());
auto c = compilers[preop.name()];
compiles.emplace_back([=]() -> compiled_result { return {c(*ctx, ins, preop), ins}; });
}
std::vector<compiled_result> results(compiles.size());
par_for(compiles.size(), 1, [&](auto i) { results[i] = compiles[i](); });
for(const auto& cr : results)
{
m.replace_instruction(cr.ins, cr.op, cr.ins->inputs());
}
}
......
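
The compile_ops change switches from compiling each gpu::precompile_op inline to a collect-then-parallelize pattern: deferred compile jobs are gathered into a vector, executed in parallel with par_for, and only then applied to the module serially, presumably so that module mutation stays single-threaded. A minimal standalone sketch of that three-phase pattern follows; it uses plain std::thread in place of MIGraphX's par_for, and the job_result fields merely stand in for the compiled operation and its instruction.

    #include <cstddef>
    #include <functional>
    #include <thread>
    #include <vector>

    struct job_result
    {
        int value = 0; // stands in for the compiled operation
        int index = 0; // stands in for the instruction to replace
    };

    int main()
    {
        // Phase 1: collect deferred jobs without touching shared state.
        std::vector<std::function<job_result()>> jobs;
        for(int i = 0; i < 8; i++)
            jobs.emplace_back([i]() -> job_result { return {i * i, i}; });

        // Phase 2: run the jobs in parallel; each writes only its own slot.
        std::vector<job_result> results(jobs.size());
        std::vector<std::thread> workers;
        workers.reserve(jobs.size());
        for(std::size_t i = 0; i < jobs.size(); i++)
            workers.emplace_back([&, i] { results[i] = jobs[i](); });
        for(auto& w : workers)
            w.join();

        // Phase 3: apply the results serially, mirroring m.replace_instruction.
        int sum = 0;
        for(const auto& r : results)
            sum += r.value;
        return sum == 140 ? 0 : 1; // 0+1+4+9+16+25+36+49 = 140
    }
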
......@@ -14,17 +14,29 @@ namespace gpu {
static const char* const roialign_kernel = R"__migraphx__(
#include <migraphx/kernels/roialign.hpp>
#include <migraphx/kernels/basic_ops.hpp>
#include <migraphx/kernels/integral_constant.hpp>
#include <migraphx/kernels/generic_constant.hpp>
#include <args.hpp>
using namespace migraphx;
namespace migraphx {
extern "C" {
__global__ void roialign_kernel(void* in_x, void* in_rois, void* in_ind, void* y)
{
make_tensors()(in_x, in_rois, in_ind, y)([](auto&&... xs) { roialign(xs...); });
make_tensors()(in_x, in_rois, in_ind, y)([](auto&&... xs) {
auto settings = make_roalign_settings(MIGRAPHX_MAKE_CONSTANT(float{ROIS_OFFSET}),
_c<bool{IS_AVG_POOLING}>,
_c<int64_t{SAMPLING_RATIO}>,
MIGRAPHX_MAKE_CONSTANT(float{SPATIAL_SCALE}));
roialign(xs..., settings);
});
}
}
} // namespace migraphx
int main() {}
)__migraphx__";
......
......@@ -176,23 +176,23 @@ struct array
}
};
template <class T, T... xs>
struct integral_const_array : array<T, sizeof...(xs)>
template <class T, T... Xs>
struct integral_const_array : array<T, sizeof...(Xs)>
{
using base_array = array<T, sizeof...(xs)>;
MIGRAPHX_DEVICE_CONSTEXPR integral_const_array() : base_array({xs...}) {}
using base_array = array<T, sizeof...(Xs)>;
MIGRAPHX_DEVICE_CONSTEXPR integral_const_array() : base_array({Xs...}) {}
};
template <class T, T... xs, class F>
constexpr auto transform(integral_const_array<T, xs...>, F f)
template <class T, T... Xs, class F>
constexpr auto transform(integral_const_array<T, Xs...>, F f)
{
return integral_const_array<T, f(xs)...>{};
return integral_const_array<T, f(Xs)...>{};
}
template <class T, T... xs, class U, U... ys, class F>
constexpr auto transform(integral_const_array<T, xs...>, integral_const_array<U, ys...>, F f)
template <class T, T... Xs, class U, U... Ys, class F>
constexpr auto transform(integral_const_array<T, Xs...>, integral_const_array<U, Ys...>, F f)
{
return integral_const_array<T, f(xs, ys)...>{};
return integral_const_array<T, f(Xs, Ys)...>{};
}
template <index_int... Ns>
......
#ifndef MIGRAPHX_GUARD_KERNELS_DEBUG_HPP
#define MIGRAPHX_GUARD_KERNELS_DEBUG_HPP
#include <hip/hip_runtime.h>
#include <migraphx/kernels/hip.hpp>
namespace migraphx {
inline __host__ __device__ void
// Workaround hip's broken abort on device code
#ifdef __HIP_DEVICE_COMPILE__
// NOLINTNEXTLINE
#define MIGRAPHX_HIP_NORETURN
#else
// NOLINTNEXTLINE
#define MIGRAPHX_HIP_NORETURN [[noreturn]]
#endif
// noreturn cannot be used on this function because abort in hip is broken
MIGRAPHX_HIP_NORETURN inline __host__ __device__ void
assert_fail(const char* assertion, const char* file, unsigned int line, const char* function)
{
printf("%s:%u: %s: assertion '%s' failed.\n", file, line, function, assertion);
......
......@@ -168,6 +168,7 @@ constexpr auto transform_args(F f, Fs... fs)
return [=](auto... xs) { return transform_args(f)(xs...)(transform_args(fs...)); };
}
// NOLINTNEXTLINE
#define MIGRAPHX_LIFT(...) \
([](auto&&... xs) { return (__VA_ARGS__)(static_cast<decltype(xs)>(xs)...); })
......
#ifndef MIGRAPHX_GUARD_KERNELS_GENERIC_CONSTANT_HPP
#define MIGRAPHX_GUARD_KERNELS_GENERIC_CONSTANT_HPP
namespace migraphx {
template <class F>
struct generic_constant
{
static constexpr auto value = F{}();
using value_type = decltype(value);
using type = generic_constant;
constexpr operator value_type() const noexcept { return value; }
constexpr value_type operator()() const noexcept { return value; }
};
template <class F>
constexpr generic_constant<F> make_generic_constant(F)
{
return {};
}
// NOLINTNEXTLINE
#define MIGRAPHX_MAKE_CONSTANT(x) \
make_generic_constant([] { \
struct fun \
{ \
constexpr auto operator()() const { return x; } \
}; \
return fun{}; \
}())
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_GENERIC_CONSTANT_HPP
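
generic_constant captures a value produced by a stateless callable so it can be recreated from the type alone and used as a constant expression; this is how the roialign kernel above receives ROIS_OFFSET and SPATIAL_SCALE. A small host-side sketch of MIGRAPHX_MAKE_CONSTANT follows; it assumes C++17 and that the MIGraphX kernels include directory is on the include path, whereas the header is normally consumed only by device code.

    // Host-side sketch only; values and usage here are illustrative.
    #include <migraphx/kernels/generic_constant.hpp>

    int main()
    {
        using namespace migraphx;
        constexpr auto offset = MIGRAPHX_MAKE_CONSTANT(0.5f);
        static_assert(offset() == 0.5f, "the value is recoverable as a constant expression");
        static_assert(decltype(offset)::value == 0.5f, "and is carried entirely in the type");
        return 0;
    }
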
#ifndef MIGRAPHX_GUARD_KERNELS_HIP_HPP
#define MIGRAPHX_GUARD_KERNELS_HIP_HPP
// Workaround macro redefinition issue with clang tidy
#if defined(__HIP_PLATFORM_HCC__) && defined(MIGRAPHX_USE_CLANG_TIDY)
#undef __HIP_PLATFORM_HCC__ // NOLINT
#endif
#include <hip/hip_runtime.h>
#endif // MIGRAPHX_GUARD_KERNELS_HIP_HPP
#ifndef MIGRAPHX_GUARD_KERNELS_INDEX_HPP
#define MIGRAPHX_GUARD_KERNELS_INDEX_HPP
#include <hip/hip_runtime.h>
#include <migraphx/kernels/hip.hpp>
#include <migraphx/kernels/types.hpp>
namespace migraphx {
......@@ -17,7 +17,7 @@ struct index
#ifdef MIGRAPHX_NGLOBAL
return MIGRAPHX_NGLOBAL;
#else
return blockDim.x * gridDim.x;
return blockDim.x * gridDim.x; // NOLINT
#endif
}
......@@ -26,7 +26,7 @@ struct index
#ifdef MIGRAPHX_NLOCAL
return MIGRAPHX_NLOCAL;
#else
return blockDim.x;
return blockDim.x; // NOLINT
#endif
}
......@@ -53,7 +53,7 @@ struct index
inline __device__ index make_index()
{
return index{blockIdx.x * blockDim.x + threadIdx.x, threadIdx.x, blockIdx.x};
return index{blockIdx.x * blockDim.x + threadIdx.x, threadIdx.x, blockIdx.x}; // NOLINT
}
} // namespace migraphx
......
......@@ -5,28 +5,30 @@
namespace migraphx {
template <class T, T v>
template <class T, T V>
struct integral_constant
{
static constexpr T value = v;
static constexpr T value = V;
using value_type = T;
using type = integral_constant;
constexpr operator value_type() const noexcept { return value; }
constexpr value_type operator()() const noexcept { return value; }
};
// NOLINTNEXTLINE
#define MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(op) \
template <class T, T v, class U, U w> \
constexpr inline integral_constant<decltype(v op w), (v op w)> operator op( \
integral_constant<T, v>, integral_constant<U, w>) noexcept \
template <class T, T V, class U, U w> \
constexpr inline integral_constant<decltype(V op w), (V op w)> operator op( \
integral_constant<T, V>, integral_constant<U, w>) noexcept \
{ \
return {}; \
}
// NOLINTNEXTLINE
#define MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP(op) \
template <class T, T v> \
constexpr inline integral_constant<decltype(op v), (op v)> operator op( \
integral_constant<T, v>) noexcept \
template <class T, T V> \
constexpr inline integral_constant<decltype(op V), (op V)> operator op( \
integral_constant<T, V>) noexcept \
{ \
return {}; \
}
......@@ -64,8 +66,8 @@ using false_type = bool_constant<false>;
template <index_int N>
using index_constant = integral_constant<index_int, N>;
template <auto v>
static constexpr auto _c = integral_constant<decltype(v), v>{};
template <auto V>
static constexpr auto _c = integral_constant<decltype(V), V>{}; // NOLINT
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_INTEGRAL_CONSTANT_HPP
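
The _c variable template at the end of this header turns any value into an integral_constant carried entirely in the type, mirroring std::integral_constant with added operators. A short standalone sketch using a copy of the relevant pieces (not the kernel header itself) follows; it assumes C++17 for the auto template parameter.

    #include <cstdint>

    // Standalone analog of the kernel's integral_constant and _c, for illustration.
    template <class T, T V>
    struct integral_constant
    {
        static constexpr T value = V;
        using value_type         = T;
        constexpr operator value_type() const noexcept { return value; }
        constexpr value_type operator()() const noexcept { return value; }
    };

    template <auto V>
    static constexpr auto _c = integral_constant<decltype(V), V>{};

    int main()
    {
        constexpr auto five = _c<std::int64_t{5}>;
        static_assert(five == 5, "implicit conversion yields the stored value");
        static_assert(decltype(five)::value == 5, "the value itself lives in the type");
        return 0;
    }
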