Commit 7e297b13 authored by Paul

Merge

parents 86ea5e91 aa7ff911
@@ -213,7 +213,6 @@ template <std::size_t N, class... Xs>
bool is_vectorizable(const Xs&... xs)
{
return all_of({xs...}, [](const auto& s) {
if(s.standard() and (s.lens().back() % N) == 0)
return true;
if(s.broadcasted())
@@ -320,11 +319,10 @@ struct cpu_unary : reduce_dims_base, auto_register_op<cpu_unary<Op>>
shape compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(2);
auto s = inputs.at(0);
const auto& s = inputs.at(0);
return {s.type(), s.lens()};
}
argument
// cppcheck-suppress constParameter
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
{
argument result = get_arg(args, args.size() - 1);
@@ -358,12 +356,11 @@ struct cpu_binary : reduce_dims_base, auto_register_op<cpu_binary<Op>>
shape compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(3);
auto s = inputs.at(0);
const auto& s = inputs.at(0);
return {s.type(), s.lens()};
}
argument
// cppcheck-suppress constParameter
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
{
argument result = get_arg(args, args.size() - 1);
......
@@ -31,7 +31,7 @@ struct dnnl_layernorm : dnnl_op<dnnl_layernorm, dnnl::layer_normalization_forwar
get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
return {dnnl::prop_kind::forward_inference,
m.at(DNNL_ARG_SRC),
m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)),
1e-12f,
dnnl::normalization_flags::none};
}
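The MIGRAPHX_DNNL_PREFIX(ARG_SRC) wrapper replaces the bare DNNL_ARG_SRC constant so the same descriptor code can build against stock oneDNN or a ZenDNN build (see the MIGRAPHX_ENABLE_ZENDNN guard further down), where the constants presumably carry a different prefix. A minimal sketch of such a macro, with the ZENDNN_ naming assumed; the real definition lives in a header outside this diff:

// Hedged sketch only; not the actual definition from the tree.
#ifdef MIGRAPHX_ENABLE_ZENDNN
#define MIGRAPHX_DNNL_PREFIX(x) ZENDNN_##x // assumed ZenDNN symbol naming
#else
#define MIGRAPHX_DNNL_PREFIX(x) DNNL_##x
#endif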
......
@@ -12,7 +12,7 @@ struct dnnl_logsoftmax : dnnl_extend_op<dnnl_logsoftmax, dnnl::logsoftmax_forwar
get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
int axis = this->op.axis;
return {dnnl::prop_kind::forward_inference, m.at(DNNL_ARG_SRC_0), axis};
return {dnnl::prop_kind::forward_inference, m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC_0)), axis};
}
};
......
@@ -223,7 +223,7 @@ struct cpu_unary2 : auto_register_op<cpu_unary2<Op>>
shape compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(1);
auto s = inputs.at(0);
const auto& s = inputs.at(0);
return {s.type(), s.lens()};
}
@@ -352,7 +352,7 @@ struct cpu_apply
std::transform(bind_inputs.begin(),
bind_inputs.end(),
std::back_inserter(inputs),
[&](const auto& s) { return r.instructions.at(s); });
[&](const auto& s) { return r.instructions[s]; });
inputs.push_back(this->insert_allocation(ins, ins->get_shape()));
modl->replace_instruction(ins, op, inputs);
});
@@ -392,8 +392,10 @@ struct cpu_apply
extend_op("concat", "dnnl::concat");
extend_op("contiguous", "dnnl::reorder");
extend_op("convolution", "dnnl::convolution");
#ifndef MIGRAPHX_ENABLE_ZENDNN
extend_op("deconvolution", "dnnl::deconvolution");
extend_op("dot", "dnnl::dot");
#endif
extend_op("erf", "cpu::erf");
extend_op("gather", "cpu::gather");
extend_op("logsoftmax", "dnnl::logsoftmax");
@@ -458,11 +460,6 @@ struct cpu_apply
if(has_op("dnnl::pooling") and ins->get_shape().type() == shape::type_t::float_type and
not v["ceil_mode"].to<bool>())
return replace(ins, make_op("dnnl::pooling", op.to_value()));
std::string mode = v["mode"].to<std::string>();
if(mode == "max")
return replace(ins, make_op("cpu::pooling_max", v));
else if(mode == "average")
return replace(ins, make_op("cpu::pooling_average", v));
return ins;
}
......
@@ -12,7 +12,7 @@ struct dnnl_lrn : dnnl_extend_op<dnnl_lrn, dnnl::lrn_forward, op::lrn>
{
return {dnnl::prop_kind::forward_inference,
dnnl::algorithm::lrn_across_channels,
m.at(DNNL_ARG_SRC_0),
m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC_0)),
this->op.size,
this->op.alpha,
this->op.beta,
......
@@ -11,132 +11,21 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct max_pool
{
static std::string name() { return "max"; }
template <class T>
static T start()
{
return std::numeric_limits<T>::lowest();
}
static double apply(double x, double y)
{
double m = std::max(x, y);
return (m);
}
static double final(double x, std::size_t) { return (x); }
};
struct avg_pool
{
static std::string name() { return "average"; }
template <class T>
static double start()
{
return 0.0;
}
static double apply(double x, double y) { return x + y; }
static double final(double x, std::size_t y) { return (y == 0) ? 0.0 : (x / y); }
};
template <class Op>
struct cpu_pooling : auto_register_op<cpu_pooling<Op>>
{
cpu_pooling() = default;
cpu_pooling(op::pooling pop) : op(std::move(pop)) {}
op::pooling op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::pooling_" + Op::name(); }
shape compute_shape(std::vector<shape> inputs) const
{
inputs.pop_back();
return op.normalize_compute_shape(inputs);
}
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
visit_all(args.back(), args[0])([&](auto output, auto input) {
using type = typename decltype(output)::value_type;
auto in_s = input.get_shape();
auto in_lens = in_s.lens();
std::vector<std::size_t> vec_len(in_lens.begin() + 2, in_lens.end());
par_for(output_shape.elements(), [&](auto i) {
auto idx_o = output_shape.multi(i);
auto n_dim = idx_o.size();
std::vector<std::size_t> win_start;
std::vector<std::size_t> win_size;
for(std::size_t dim = 2; dim < n_dim; ++dim)
{
auto d_2 = dim - 2;
int start = static_cast<int>(idx_o[dim] * op.stride[d_2]) -
static_cast<int>(op.padding[d_2]);
int end = std::min(start + op.lengths[d_2], in_lens[dim]);
start = std::max(start, 0);
win_start.push_back(start);
win_size.push_back(end - start);
}
shape win_shape{output_shape.type(), win_size};
auto pool_size = win_shape.elements();
double acc = Op::template start<type>();
shape_for_each(win_shape, [&](auto idx_w) {
auto idx = idx_o;
std::transform(idx_w.begin(),
idx_w.end(),
win_start.begin(),
idx.begin() + 2,
[](auto ii, auto jj) { return ii + jj; });
if(std::all_of(idx.begin() + 2, idx.end(), [&](auto ii) { return ii >= 0; }) and
idx < in_lens)
{
acc = Op::apply(acc, input[in_s.index(idx)]);
}
});
output[i] = type(Op::final(acc, pool_size));
});
});
return args.back();
}
};
template struct cpu_pooling<avg_pool>;
template struct cpu_pooling<max_pool>;
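For reference, the per-dimension window clamping inside cpu_pooling::compute above, extracted as a standalone sketch; the helper name and plain int types are illustrative, not part of the tree:

#include <algorithm>
#include <utility>

// Mirrors the start/end arithmetic in cpu_pooling::compute: offset by the
// stride, shift left by the padding, then clip the window to the input.
std::pair<int, int> pool_window(int out_idx, int stride, int padding, int length, int in_len)
{
    int start = out_idx * stride - padding;       // may be negative inside the padding
    int end   = std::min(start + length, in_len); // clip to the input extent
    start     = std::max(start, 0);
    return {start, end - start};                  // {win_start, win_size}
}
// pool_window(0, 2, 1, 3, 5) == {0, 2}: the left padding shrinks the
// effective window from 3 elements to 2.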
struct dnnl_pooling : dnnl_extend_op<dnnl_pooling, dnnl::pooling_forward, op::pooling>
{
std::vector<int> arg_map(int) const { return {DNNL_ARG_SRC}; }
std::vector<int> arg_map(int) const { return {MIGRAPHX_DNNL_PREFIX(ARG_SRC)}; }
dnnl::pooling_forward::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
auto algo = op.mode == "max" ? dnnl::algorithm::pooling_max : dnnl::algorithm::pooling_avg;
auto algo = op.mode == op::pooling_mode::max ? dnnl::algorithm::pooling_max
: dnnl::algorithm::pooling_avg;
auto kdims = op.kdims();
std::vector<size_t> padding_l(op.padding.begin(), op.padding.begin() + kdims);
std::vector<size_t> padding_r(op.padding.begin() + kdims, op.padding.end());
return {dnnl::prop_kind::forward_inference,
algo,
m.at(DNNL_ARG_SRC),
m.at(DNNL_ARG_DST),
m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)),
m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST)),
to_dnnl_dims(op.stride),
to_dnnl_dims(op.lengths),
to_dnnl_dims(padding_l),
@@ -145,5 +34,6 @@ struct dnnl_pooling : dnnl_extend_op<dnnl_pooling, dnnl::pooling_forward, op::po
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
File mode changed from 100755 to 100644
@@ -37,7 +37,11 @@ struct dnnl_reduction : dnnl_op<dnnl_reduction, dnnl::reduction>
dnnl::reduction::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
return {to_dnnl_algo(algo), m.at(DNNL_ARG_SRC), m.at(DNNL_ARG_DST), 0, 0};
return {to_dnnl_algo(algo),
m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)),
m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST)),
0,
0};
}
};
......
@@ -27,7 +27,7 @@ struct dnnl_reorder : dnnl_op<dnnl_reorder, dnnl::reorder>
};
desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
return {m.at(DNNL_ARG_SRC), m.at(DNNL_ARG_DST)};
return {m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC)), m.at(MIGRAPHX_DNNL_PREFIX(ARG_DST))};
}
auto get_primitive_desc(const desc& d, const dnnl::primitive_attr& attr) const
......
@@ -11,7 +11,7 @@ struct dnnl_softmax : dnnl_extend_op<dnnl_softmax, dnnl::softmax_forward, op::so
dnnl::softmax_forward::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
int axis = this->op.axis;
return {dnnl::prop_kind::forward_inference, m.at(DNNL_ARG_SRC_0), axis};
return {dnnl::prop_kind::forward_inference, m.at(MIGRAPHX_DNNL_PREFIX(ARG_SRC_0)), axis};
}
};
......
@@ -3,7 +3,6 @@
#include <migraphx/check_context.hpp>
#include <migraphx/adjust_allocation.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/decompose.hpp>
#include <migraphx/eliminate_allocation.hpp>
#include <migraphx/eliminate_common_subexpression.hpp>
#include <migraphx/eliminate_concat.hpp>
@@ -15,7 +14,6 @@
#include <migraphx/memory_coloring.hpp>
#include <migraphx/propagate_constant.hpp>
#include <migraphx/register_target.hpp>
#include <migraphx/remap.hpp>
#include <migraphx/rewrite_batchnorm.hpp>
#include <migraphx/rewrite_pooling.hpp>
#include <migraphx/rewrite_quantization.hpp>
@@ -23,6 +21,7 @@
#include <migraphx/schedule.hpp>
#include <migraphx/memory_coloring.hpp>
#include <migraphx/simplify_algebra.hpp>
#include <migraphx/simplify_qdq.hpp>
#include <migraphx/simplify_reshapes.hpp>
#include <migraphx/preallocate_param.hpp>
#include <migraphx/cpu/fuse_ops.hpp>
@@ -52,8 +51,6 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
dead_code_elimination{},
eliminate_data_type{unsupported_types, shape::type_t::float_type},
dead_code_elimination{},
decompose{},
dead_code_elimination{},
simplify_reshapes{},
eliminate_identity{},
eliminate_pad{},
......
@@ -11,7 +11,7 @@ if(NOT TARGET MIOpen)
endif()
include(Embed)
file(GLOB KERNEL_FILES
file(GLOB KERNEL_FILES ${CONFIGURE_DEPENDS}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/*.hpp)
message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
add_embed_library(migraphx_kernels ${KERNEL_FILES})
@@ -41,6 +41,7 @@ add_library(migraphx_device
device/equal.cpp
device/erf.cpp
device/exp.cpp
device/fill.cpp
device/floor.cpp
device/gather.cpp
device/gelu.cpp
@@ -58,6 +59,8 @@ add_library(migraphx_device
device/mul.cpp
device/mul_add.cpp
device/mul_add_relu.cpp
device/multinomial.cpp
device/nonzero.cpp
device/pad.cpp
device/pow.cpp
device/prelu.cpp
@@ -84,30 +87,40 @@ add_library(migraphx_device
device/sub.cpp
device/tan.cpp
device/tanh.cpp
device/topk.cpp
device/unary_not.cpp
device/where.cpp
)
set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
rocm_set_soversion(migraphx_device ${MIGRAPHX_SO_VERSION})
rocm_clang_tidy_check(migraphx_device)
target_compile_options(migraphx_device PRIVATE -std=c++17 -fno-gpu-rdc -Wno-unused-command-line-argument -Xclang -fallow-half-arguments-and-returns)
target_link_libraries(migraphx_device migraphx hip::device -fno-gpu-rdc -Wno-invalid-command-line-argument -Wno-unused-command-line-argument)
if(CMAKE_CXX_COMPILER MATCHES ".*hcc")
set(AMDGPU_TARGETS "gfx803;gfx900;gfx906" CACHE STRING "")
foreach(AMDGPU_TARGET ${AMDGPU_TARGETS})
target_compile_options(migraphx_device PRIVATE -amdgpu-target=${AMDGPU_TARGET})
target_link_libraries(migraphx_device -amdgpu-target=${AMDGPU_TARGET})
endforeach()
else()
target_compile_options(migraphx_device PRIVATE -Wno-cuda-compat)
endif()
add_library(compile_for_gpu INTERFACE)
target_compile_options(compile_for_gpu INTERFACE -std=c++17 -fno-gpu-rdc -Wno-cuda-compat -Wno-unused-command-line-argument -Xclang -fallow-half-arguments-and-returns)
target_link_libraries(compile_for_gpu INTERFACE hip::device -fno-gpu-rdc -Wno-invalid-command-line-argument -Wno-unused-command-line-argument -Wno-option-ignored)
check_cxx_compiler_flag("--cuda-host-only -fhip-lambda-host-device -x hip" HAS_HIP_LAMBDA_HOST_DEVICE)
if(HAS_HIP_LAMBDA_HOST_DEVICE)
message(STATUS "Enable -fhip-lambda-host-device")
target_compile_options(migraphx_device PRIVATE -fhip-lambda-host-device)
target_compile_options(compile_for_gpu INTERFACE -fhip-lambda-host-device)
endif()
set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
rocm_set_soversion(migraphx_device ${MIGRAPHX_SO_VERSION})
rocm_clang_tidy_check(migraphx_device)
target_link_libraries(migraphx_device PUBLIC migraphx)
target_link_libraries(migraphx_device PRIVATE compile_for_gpu)
target_include_directories(migraphx_device PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
target_include_directories(migraphx_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/device/include>)
add_library(kernel_file_check EXCLUDE_FROM_ALL)
foreach(KERNEL_FILE ${KERNEL_FILES})
get_filename_component(KERNEL_BASE_FILE ${KERNEL_FILE} NAME_WE)
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/kernels/include/migraphx/kernels/${KERNEL_BASE_FILE}.cpp "#include <migraphx/kernels/${KERNEL_BASE_FILE}.hpp>\n")
target_sources(kernel_file_check PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/kernels/include/migraphx/kernels/${KERNEL_BASE_FILE}.cpp)
endforeach()
target_compile_definitions(kernel_file_check PRIVATE -DMIGRAPHX_NLOCAL=256)
target_include_directories(kernel_file_check PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/>)
target_link_libraries(kernel_file_check compile_for_gpu)
rocm_clang_tidy_check(kernel_file_check)
file(GLOB JIT_GPU_SRCS ${CONFIGURE_DEPENDS} ${CMAKE_CURRENT_SOURCE_DIR}/jit/*.cpp)
add_library(migraphx_gpu
abs.cpp
analyze_streams.cpp
@@ -117,9 +130,11 @@ add_library(migraphx_gpu
batch_norm_inference.cpp
clip.cpp
code_object_op.cpp
compile_ops.cpp
compile_gen.cpp
compile_hip.cpp
compile_hip_code_object.cpp
compile_pointwise.cpp
compiler.cpp
concat.cpp
convert.cpp
convolution.cpp
@@ -136,11 +151,15 @@ add_library(migraphx_gpu
kernel.cpp
lowering.cpp
logsoftmax.cpp
loop.cpp
lrn.cpp
leaky_relu.cpp
mlir_conv.cpp
multinomial.cpp
nonzero.cpp
pack_args.cpp
pack_int8_args.cpp
prefuse_ops.cpp
pad.cpp
pooling.cpp
quant_convolution.cpp
@@ -152,7 +171,9 @@ add_library(migraphx_gpu
softmax.cpp
sync_device.cpp
target.cpp
topk.cpp
write_literals.cpp
${JIT_GPU_SRCS}
)
set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
@@ -190,9 +211,12 @@ register_migraphx_gpu_ops(hip_
logical_and
logical_or
logical_xor
loop
max
min
mul
multinomial
nonzero
pad
pow
prelu
@@ -218,7 +242,9 @@ register_migraphx_gpu_ops(hip_
sub
tanh
tan
topk
unary_not
where
)
register_migraphx_gpu_ops(miopen_
abs
@@ -284,19 +310,37 @@ if(MIGRAPHX_ENABLE_MLIR)
target_link_libraries(migraphx_gpu PUBLIC ${LIBMLIRMIOPEN})
endif()
set(MIGRAPHX_USE_HIPRTC OFF CACHE BOOL "")
if(MIGRAPHX_USE_HIPRTC)
target_compile_definitions(migraphx_gpu PRIVATE -DMIGRAPHX_USE_HIPRTC=1)
else()
# Get flags needed to compile hip
include(TargetFlags)
target_flags(HIP_COMPILER_FLAGS hip::device)
# Remove cuda arch flags
string(REGEX REPLACE "--cuda-gpu-arch=[^ \t\r\n]+" "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
string(REGEX REPLACE "--offload-arch=[^ \t\r\n]+" "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
string(REGEX REPLACE --cuda-gpu-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
string(REGEX REPLACE --offload-arch=[a-z0-9:+-]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
# Skip library paths since hip will incorrectly treat it as a source file
string(APPEND HIP_COMPILER_FLAGS " ")
foreach(_unused RANGE 2)
string(REGEX REPLACE " /[^ ]+\\.(a|so) " " " HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
endforeach()
message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}")
target_compile_definitions(migraphx_gpu PRIVATE
"-DMIGRAPHX_HIP_COMPILER=${CMAKE_CXX_COMPILER}"
"-DMIGRAPHX_HIP_COMPILER_FLAGS=${HIP_COMPILER_FLAGS}"
"-DMIGRAPHX_OFFLOADBUNDLER_BIN=${MIGRAPHX_OFFLOADBUNDLER_BIN}"
"-DMIGRAPHX_EXTRACT_KERNEL=${MIGRAPHX_EXTRACT_KERNEL}"
"-DMIGRAPHX_USE_HIPRTC=0"
)
if(DEFINED CMAKE_CXX_COMPILER_LAUNCHER)
execute_process(COMMAND which ${CMAKE_CXX_COMPILER_LAUNCHER} OUTPUT_VARIABLE MIGRAPHX_HIP_COMPILER_LAUNCHER)
string(STRIP "${MIGRAPHX_HIP_COMPILER_LAUNCHER}" MIGRAPHX_HIP_COMPILER_LAUNCHER)
target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_HIP_COMPILER_LAUNCHER=${MIGRAPHX_HIP_COMPILER_LAUNCHER}")
endif()
endif()
# Check miopen find mode api
include(CheckLibraryExists)
@@ -317,7 +361,7 @@ target_link_libraries(migraphx_gpu PRIVATE migraphx_device migraphx_kernels)
add_subdirectory(driver)
rocm_install_targets(
TARGETS migraphx_gpu migraphx_device
TARGETS migraphx_gpu migraphx_device compile_for_gpu
INCLUDE
${CMAKE_CURRENT_SOURCE_DIR}/include
)
......
#include <migraphx/gpu/allocation_model.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/module.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......
@@ -28,30 +28,30 @@ struct hip_stream_model
bool is_wait(migraphx::instruction_ref ins) const { return ins->name() == "gpu::wait_event"; }
};
stream_model make_stream_model(const module& p)
stream_model make_stream_model(const module& m)
{
hip_stream_model m;
hip_stream_model hsm;
std::size_t stream = 0;
for(auto ins : iterator_for(p))
for(auto ins : iterator_for(m))
{
if(ins->name() == "gpu::set_stream")
{
auto v = ins->get_operator().to_value();
stream = v["stream"].to<std::size_t>();
m.max_stream = std::max(stream, m.max_stream);
auto v = ins->get_operator().to_value();
stream = v["stream"].to<std::size_t>();
hsm.max_stream = std::max(stream, hsm.max_stream);
}
if(ins->get_operator().is_context_free())
continue;
if(contains({"hip::hip_allocate_memory", "hip::hip_copy_literal", "@param"}, ins->name()))
continue;
m.ins2stream[ins] = stream;
hsm.ins2stream[ins] = stream;
}
return m;
return hsm;
}
std::vector<stream_race> analyze_streams(const module& p)
std::vector<stream_race> analyze_streams(const module& m)
{
return migraphx::analyze_streams(p, make_stream_model(p));
return migraphx::analyze_streams(m, make_stream_model(m));
}
} // namespace gpu
......
@@ -9,7 +9,7 @@ namespace gpu {
shape hip_argmax::compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(2).standard();
check_shapes{inputs, *this}.has(2);
return op.normalize_compute_shape({inputs.at(0)});
}
......
@@ -9,7 +9,7 @@ namespace gpu {
shape hip_argmin::compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(2).standard();
check_shapes{inputs, *this}.has(2);
return op.normalize_compute_shape({inputs.at(0)});
}
......
#include <migraphx/gpu/compile_gen.hpp>
#include <migraphx/shape.hpp>
#include <migraphx/permutation.hpp>
#include <migraphx/stringutils.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace gen {
static std::vector<std::size_t> vector_sizes(const std::vector<shape>& inputs)
{
// If all inputs are half then only use half2
if(std::all_of(inputs.begin(), inputs.end(), [](const auto& s) {
return s.type() == shape::half_type;
}))
return {2};
return {4, 2};
}
vectorize vectorize::elements(std::size_t axis, const std::vector<shape>& inputs)
{
auto sizes = vector_sizes(inputs);
std::vector<std::size_t> max_vec_size;
std::transform(inputs.begin(),
inputs.end(),
std::back_inserter(max_vec_size),
[&](const auto& input) -> std::size_t {
auto stride = input.strides()[axis];
auto len = input.lens()[axis];
if(stride != 0 and stride != 1)
return 1;
if(len == 1 and input.elements() > sizes.front())
return sizes.front();
auto it = std::find_if(
sizes.begin(), sizes.end(), [&](auto i) { return (len % i) == 0; });
if(it != sizes.end())
return *it;
return 1;
});
return {*std::min_element(max_vec_size.begin(), max_vec_size.end()), axis};
}
std::string vectorize::str() const
{
return "vectorize<" + to_string(size) + ", " + to_string(axis) + ">()";
}
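A worked example of the size selection above, with invented shapes: for two standard float shapes {2, 6}, axis 1 has stride 1 and len 6, so 4 is rejected (it does not divide 6) and both inputs settle on 2.

#include <cassert>
// Hypothetical check (shapes invented); exercises the selection above.
void vectorize_example()
{
    shape a{shape::float_type, {2, 6}};
    shape b{shape::float_type, {2, 6}};
    auto v = vectorize::elements(1, {a, b});
    assert(v.str() == "vectorize<2, 1>()"); // 4 rejected since 6 % 4 != 0
}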
preload preload::broadcasts(std::size_t axis, const std::vector<shape>& inputs)
{
const std::size_t max_lds_bytes = 4096;
std::vector<bool> result;
std::transform(inputs.begin(),
inputs.end(),
std::back_inserter(result),
[&](const shape& input) { return input.strides()[axis] == 0; });
auto bytes = std::inner_product(inputs.begin(),
inputs.end(),
result.begin(),
std::size_t{0},
std::plus<>{},
[](const shape& s, bool b) -> std::size_t {
if(b)
return s.bytes();
return 0;
});
if(bytes < max_lds_bytes)
return {result};
// TODO: Try to partially preload items
std::fill(result.begin(), result.end(), false);
return {result};
}
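For instance, with invented sizes:

// inputs = { a 256-element float bias broadcast along the axis (stride 0,
// 1024 bytes), a {64, 256} float tensor (stride != 0) } gives
// result = {true, false}; 1024 < 4096, so the bias is staged in LDS.
// If the flagged inputs together exceeded the budget, the whole mask is
// cleared rather than partially preloaded, as the TODO notes.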
std::string preload::str() const
{
std::vector<std::string> bool_strs;
std::transform(args.begin(), std::prev(args.end()), std::back_inserter(bool_strs), [](bool b) {
if(b)
return "true";
return "false";
});
return "auto_preload<false, " + join_strings(bool_strs, ", ") + ">(idx)";
}
bool preload::is_preloading() const
{
return std::accumulate(args.begin(), args.end(), false, std::logical_or<>{});
}
std::size_t find_fast_axis(const std::vector<shape>& inputs)
{
auto permutation = find_permutation(inputs);
auto it = std::max_element(permutation.begin(), permutation.end());
return it - permutation.begin();
}
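The fast axis is read off the permutation; the exact semantics of find_permutation are assumed here:

// Sketch of the intent (assumed): for a standard shape the permutation is
// {0, 1, ..., n-1}, its maximum sits at the last position, and
// find_fast_axis returns that unit-stride axis.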
std::string make_transformer_args(std::vector<std::string> transformers)
{
return join_strings(std::move(transformers), ", ");
}
} // namespace gen
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/errors.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/compile_src.hpp>
#include <migraphx/process.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/env.hpp>
#include <cassert>
#include <iostream>
#if MIGRAPHX_USE_HIPRTC
#include <hip/hiprtc.h>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/env.hpp>
#else
#include <migraphx/compile_src.hpp>
#include <migraphx/process.hpp>
#endif
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DEBUG);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_OPTIMIZE);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_ASM);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_SRC);
#if MIGRAPHX_USE_HIPRTC
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_HIPRTC)
std::string hiprtc_error(hiprtcResult err, const std::string& msg)
{
return "hiprtc: " + (hiprtcGetErrorString(err) + (": " + msg));
}
void hiprtc_check_error(hiprtcResult err, const std::string& msg, const std::string& ctx)
{
if(err != HIPRTC_SUCCESS)
throw make_exception(ctx, hiprtc_error(err, msg));
}
#define MIGRAPHX_HIPRTC(...) \
hiprtc_check_error(__VA_ARGS__, #__VA_ARGS__, MIGRAPHX_MAKE_SOURCE_CTX())
#define MIGRAPHX_HIPRTC_THROW(error, msg) MIGRAPHX_THROW(hiprtc_error(error, msg))
// Workaround hiprtc's broken API
void hiprtc_program_destroy(hiprtcProgram prog) { hiprtcDestroyProgram(&prog); }
using hiprtc_program_ptr = MIGRAPHX_MANAGE_PTR(hiprtcProgram, hiprtc_program_destroy);
template <class... Ts>
hiprtc_program_ptr hiprtc_program_create(Ts... xs)
{
hiprtcProgram prog = nullptr;
auto result = hiprtcCreateProgram(&prog, xs...);
hiprtc_program_ptr p{prog};
if(result != HIPRTC_SUCCESS)
MIGRAPHX_HIPRTC_THROW(result, "Create program failed.");
return p;
}
struct hiprtc_program
{
struct string_array
{
std::vector<std::string> strings{};
std::vector<const char*> c_strs{};
string_array() {}
string_array(const string_array&) = delete;
std::size_t size() const { return strings.size(); }
const char** data() { return c_strs.data(); }
void push_back(std::string s)
{
strings.push_back(std::move(s));
c_strs.push_back(strings.back().c_str());
}
};
hiprtc_program_ptr prog = nullptr;
string_array headers{};
string_array include_names{};
std::string cpp_src = "";
std::string cpp_name = "";
hiprtc_program(const std::vector<src_file>& srcs)
{
for(auto&& src : srcs)
{
std::string content{src.content.first, src.content.second};
std::string path = src.path.string();
if(src.path.extension().string() == ".cpp")
{
cpp_src = std::move(content);
cpp_name = std::move(path);
}
else
{
headers.push_back(std::move(content));
include_names.push_back(std::move(path));
}
}
prog = hiprtc_program_create(cpp_src.c_str(),
cpp_name.c_str(),
headers.size(),
headers.data(),
include_names.data());
}
void compile(const std::vector<std::string>& options)
{
if(enabled(MIGRAPHX_TRACE_HIPRTC{}))
std::cout << "hiprtc " << join_strings(options, " ") << " " << cpp_name << std::endl;
std::vector<const char*> c_options;
std::transform(options.begin(),
options.end(),
std::back_inserter(c_options),
[](const std::string& s) { return s.c_str(); });
auto result = hiprtcCompileProgram(prog.get(), c_options.size(), c_options.data());
std::cerr << log() << std::endl;
if(result != HIPRTC_SUCCESS)
MIGRAPHX_HIPRTC_THROW(result, "Compilation failed.");
}
std::string log()
{
std::size_t n = 0;
MIGRAPHX_HIPRTC(hiprtcGetProgramLogSize(prog.get(), &n));
if(n < 2)
return {};
std::vector<char> buffer(n);
MIGRAPHX_HIPRTC(hiprtcGetProgramLog(prog.get(), buffer.data()));
assert(buffer.back() == 0);
return {buffer.begin(), buffer.end() - 1};
}
std::vector<char> get_code_obj()
{
std::size_t n = 0;
MIGRAPHX_HIPRTC(hiprtcGetCodeSize(prog.get(), &n));
std::vector<char> buffer(n);
MIGRAPHX_HIPRTC(hiprtcGetCode(prog.get(), buffer.data()));
return buffer;
}
};
std::vector<std::vector<char>>
compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std::string& arch)
{
hiprtc_program prog(srcs);
auto options = split_string(params, ' ');
if(enabled(MIGRAPHX_GPU_DEBUG{}))
options.push_back("-DMIGRAPHX_DEBUG");
if(std::none_of(options.begin(), options.end(), [](const std::string& s) {
return starts_with(s, "--std=") or starts_with(s, "-std=");
}))
options.push_back("-std=c++17");
options.push_back("-fno-gpu-rdc");
options.push_back(" -O" + string_value_of(MIGRAPHX_GPU_OPTIMIZE{}, "3"));
options.push_back("-Wno-cuda-compat");
options.push_back("--cuda-gpu-arch=" + arch);
prog.compile(options);
return {prog.get_code_obj()};
}
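A hedged usage sketch of this hiprtc path; the kernel string, flag, and gfx target are all invented:

// Hypothetical driver, mirroring the src_file construction that
// compile_hip_code_object uses further down.
std::vector<std::vector<char>> compile_noop_example()
{
    const std::string src = "__global__ void noop() {}";
    std::vector<src_file> srcs;
    srcs.push_back(src_file{fs::path{"main.cpp"},
                            std::make_pair(src.data(), src.data() + src.size())});
    return compile_hip_src(srcs, "-DMIGRAPHX_NLOCAL=256", "gfx908");
}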
#else // MIGRAPHX_USE_HIPRTC
bool is_hcc_compiler()
{
static const auto result = ends_with(MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER), "hcc");
@@ -22,8 +180,18 @@ bool is_hip_clang_compiler()
return result;
}
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DEBUG);
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_OPTIMIZE);
bool has_compiler_launcher()
{
static const auto result = fs::exists(MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER_LAUNCHER));
return result;
}
src_compiler assemble(src_compiler compiler)
{
compiler.out_ext = ".S";
compiler.flags = replace_string(compiler.flags, " -c", " -S");
return compiler;
}
std::vector<std::vector<char>>
compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std::string& arch)
@@ -57,6 +225,10 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
src_compiler compiler;
compiler.flags = params;
compiler.compiler = MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER);
#ifdef MIGRAPHX_HIP_COMPILER_LAUNCHER
if(has_compiler_launcher())
compiler.launcher = MIGRAPHX_STRINGIZE(MIGRAPHX_HIP_COMPILER_LAUNCHER);
#endif
if(is_hcc_compiler())
compiler.process = [&](const fs::path& obj_path) -> fs::path {
@@ -75,9 +247,34 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
MIGRAPHX_THROW("Missing hsaco");
};
if(enabled(MIGRAPHX_GPU_DUMP_SRC{}))
{
for(const auto& src : srcs)
{
if(src.path.extension() != ".cpp")
continue;
std::cout << std::string(src.content.first, src.len()) << std::endl;
}
}
if(enabled(MIGRAPHX_GPU_DUMP_ASM{}))
{
std::cout << assemble(compiler).compile(srcs).data() << std::endl;
}
return {compiler.compile(srcs)};
}
std::string enum_params(std::size_t count, std::string param)
{
std::vector<std::string> items(count);
transform(range(count), items.begin(), [&](auto i) { return param + std::to_string(i); });
return join_strings(items, ",");
}
#endif // MIGRAPHX_USE_HIPRTC
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
@@ -68,8 +68,72 @@ __content__
return replace_string(args_hpp, "__content__", inner);
}
const std::vector<std::string>& compiler_warnings()
{
static std::vector<std::string> warnings = {"-Weverything",
"-Wno-c++98-compat",
"-Wno-c++98-compat-pedantic",
"-Wno-conversion",
"-Wno-double-promotion",
"-Wno-exit-time-destructors",
"-Wno-extra-semi",
"-Wno-extra-semi-stmt",
"-Wno-float-conversion",
"-Wno-gnu-anonymous-struct",
"-Wno-gnu-zero-variadic-macro-arguments",
"-Wno-missing-prototypes",
"-Wno-nested-anon-types",
"-Wno-padded",
"-Wno-shorten-64-to-32",
"-Wno-sign-conversion",
"-Wno-sign-compare",
"-Wno-unused-command-line-argument",
"-Wno-weak-vtables",
"-Wno-c99-extensions"};
return warnings;
}
void hip_compile_options::set_launch_params(
const value& v,
const std::function<std::size_t(std::size_t local)>& compute_global,
std::size_t default_local)
{
local = v.get("local", default_local);
if(v.contains("global"))
global = v.at("global").to<std::size_t>();
else
global = compute_global(local);
}
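A small sketch of the lookup above; the value contents are invented:

// With v = {{"local", 256}}, local becomes 256 and global is derived via
// compute_global(256); a "global" key in v would take precedence instead.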
std::function<std::size_t(std::size_t local)>
compute_global_for(context& ctx, std::size_t n, std::size_t over)
{
assert(over > 0);
std::size_t max_global = ctx.get_current_device().get_cu_count() *
ctx.get_current_device().get_max_workitems_per_cu();
return [n, over, max_global](std::size_t local) {
std::size_t groups = (n + local - 1) / local;
std::size_t max_blocks = max_global / local;
std::size_t nglobal = std::min(max_blocks * over, groups) * local;
return nglobal;
};
}
std::size_t compute_block_size(std::size_t n, std::size_t max_block_size)
{
size_t block_size = 128;
while(block_size <= max_block_size and block_size <= n)
block_size *= 2;
return block_size / 2;
}
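Putting the two helpers together; the element count, block-size cap, and oversubscription factor are invented:

// Hypothetical launch-parameter selection for a 1M-element kernel.
std::pair<std::size_t, std::size_t> launch_params_example(context& ctx)
{
    std::size_t n      = 1 << 20;
    std::size_t local  = compute_block_size(n, 1024); // 1024: largest 2^k <= the cap
    std::size_t global = compute_global_for(ctx, n, 256)(local);
    return {global, local}; // global = min(groups, 256 * max_blocks) * local
}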
operation compile_hip_code_object(const std::string& content, hip_compile_options options)
{
assert(options.global > 0);
assert(options.local > 0);
assert(not options.inputs.empty());
assert(options.inputs.size() == options.virtual_inputs.size() or
options.virtual_inputs.empty());
std::vector<src_file> srcs;
std::transform(migraphx_kernels().begin(),
migraphx_kernels().end(),
@@ -83,12 +147,14 @@ operation compile_hip_code_object(const std::string& content, hip_compile_option
srcs.push_back(src_file{fs::path{"main.cpp"},
std::make_pair(content.data(), content.data() + content.size())});
auto args_hpp =
generate_args_hpp(options.reduced_inputs.empty() ? options.inputs : options.reduced_inputs);
generate_args_hpp(options.virtual_inputs.empty() ? options.inputs : options.virtual_inputs);
srcs.push_back(src_file{fs::path{"args.hpp"},
std::make_pair(args_hpp.data(), args_hpp.data() + args_hpp.size())});
options.params += " -DMIGRAPHX_NGLOBAL=" + std::to_string(options.global);
options.params += " -DMIGRAPHX_NLOCAL=" + std::to_string(options.local);
options.params += " -I.";
options.params += " " + join_strings(compiler_warnings(), " ");
options.params += " -ftemplate-backtrace-limit=0";
options.params += " -Werror";
auto cos = compile_hip_src(srcs, std::move(options.params), get_device_name());
if(cos.size() != 1)
MIGRAPHX_THROW("No code object");
......
#include <migraphx/gpu/compile_ops.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/module.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/op/identity.hpp>
#include <migraphx/gpu/compiler.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_COMPILE_PARALLEL);
struct precompile_op
{
operation op = op::identity{};
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.op, "op"));
}
std::string name() const { return "gpu::precompile_op"; }
shape compute_shape(std::vector<shape> inputs, const std::vector<module_ref>& mods) const
{
inputs.pop_back();
return op.compute_shape(inputs, mods);
}
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
MIGRAPHX_REGISTER_OP(precompile_op);
struct compiled_result
{
compiler_replace replace;
instruction_ref ins;
};
template <class F>
void par_compile(std::size_t n, F f)
{
if(n == 0)
return;
par_for(n, n / value_of(MIGRAPHX_GPU_COMPILE_PARALLEL{}, n), f);
}
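How the parallelism knob plays out, assuming par_for treats its second argument as a per-thread grain size:

// With 8 pending compiles and MIGRAPHX_GPU_COMPILE_PARALLEL=4 the grain is
// 8/4 = 2, i.e. roughly four compile jobs in flight; with the variable
// unset the grain is n/n = 1, fully parallel.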
void compile_ops::apply(module& m) const
{
std::vector<std::function<compiled_result()>> compiles;
for(auto ins : iterator_for(m))
{
if(ins->name() != "gpu::precompile_op")
continue;
operation preop = any_cast<precompile_op>(ins->get_operator()).op;
compiles.emplace_back([=]() -> compiled_result {
return {compile(*ctx, ins, preop), ins};
});
}
std::vector<compiled_result> results(compiles.size());
par_compile(compiles.size(), [&](auto i) { results[i] = compiles[i](); });
for(const auto& cr : results)
{
cr.replace(m, cr.ins);
}
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx