Merge branch 'develop' into jit-reduce-reg

4f12db9e · Paul Fultz II · GitHub · c2923b44 · 91cc7242 · 4f12db9e
Unverified Commit 4f12db9e authored Jan 31, 2023 by Paul Fultz II Committed by GitHub Jan 31, 2023
20 changed files
--- a/.github/workflows/sync-onnxrt-main.yaml
+++ b/.github/workflows/sync-onnxrt-main.yaml
 name: Onnxruntime main weekly sync
 on:
  schedule:
-    - cron: "05 09 * * 5"
+    - cron: "05 17 * * 1"

 jobs:
  sync:
+    # runs-on is a per-job key, so it belongs inside the sync job
+    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
-        ref: develop
-        path: ../
+          ref: develop
+          path: ../

  get_date:
    steps:

--- a/Dockerfile
+++ b/Dockerfile
@@ -95,20 +95,16 @@ RUN cget -p $PREFIX install facebook/zstd@v1.4.5 -X subdir -DCMAKE_DIR=build/cma
 RUN cget -p $PREFIX install ccache@v4.1 -DENABLE_TESTING=OFF
 RUN cget -p /opt/cmake install kitware/cmake@v3.24.3

-RUN export ONNXRT_COMMIT=$(cat test/onnx/.onnxrt-commit)
+COPY ./test/onnx/.onnxrt-commit /

 ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime
 ARG ONNXRUNTIME_BRANCH=main
-ARG ONNXRUNTIME_COMMIT=$ONNXRT_COMMIT
-
-# Let us know which commit where're using for CI
-RUN echo "Onnxruntime Commit:" && echo $ONNXRUNTIME_COMMIT
+ARG ONNXRUNTIME_COMMIT

 RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime && \
    cd onnxruntime && \
-    git checkout ${ONNXRUNTIME_COMMIT} && \
-    /bin/sh dockerfiles/scripts/install_common_deps.sh
-
+    if [ -z "$ONNXRUNTIME_COMMIT" ] ; then git checkout $(cat /.onnxrt-commit) ; else git checkout ${ONNXRUNTIME_COMMIT} ; fi && \
+    /bin/sh /onnxruntime/dockerfiles/scripts/install_common_deps.sh


 ADD tools/build_and_test_onnxrt.sh /onnxruntime/build_and_test_onnxrt.sh

--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -15,11 +15,13 @@ def rocmtestnode(Map conf) {
        def compiler = bconf.get("compiler", "/opt/rocm/llvm/bin/clang++")
        def flags = bconf.get("flags", "")
        def gpu_debug = bconf.get("gpu_debug", "0")
+        def hiprtc_workarounds = bconf.get("hiprtc_workarounds", "0")
        def cmd = """
            ulimit -c unlimited
            echo "leak:dnnl::impl::malloc" > suppressions.txt
            export LSAN_OPTIONS="suppressions=\$(pwd)/suppressions.txt"
            export MIGRAPHX_GPU_DEBUG=${gpu_debug}
+            export MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS=${hiprtc_workarounds}
            export CXX=${compiler}
            export CXXFLAGS='-Werror'
            env
@@ -110,6 +112,10 @@ rocmtest clang_debug: rocmnode('vega') { cmake_build ->
        cmake_build(flags: "-DCMAKE_BUILD_TYPE=release")
        stash includes: 'build/*.deb', name: 'migraphx-package'
    }
+}, hiprtc_gpu_debug: rocmnode('vega') { cmake_build ->
+    stage('HipRTC GPU Debug') {
+        cmake_build(flags: "-DCMAKE_BUILD_TYPE=release -DMIGRAPHX_USE_HIPRTC=On", gpu_debug: true, hiprtc_workarounds:  true)
+    }
 }, mlir_debug: rocmnode('vega') { cmake_build ->
    stage('MLIR Debug') {
        def sanitizers = "undefined"

--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -64,6 +64,7 @@ add_library(migraphx
    normalize_ops.cpp
    op_enums.cpp
    operation.cpp
+    optimize_module.cpp
    opt/memory_coloring.cpp
    opt/memory_coloring_impl.cpp
    pad_calc.cpp

--- a/src/include/migraphx/match/layernorm.hpp
+++ b/src/include/migraphx/match/layernorm.hpp
@@ -48,10 +48,11 @@ struct layernorm_matcher

    auto layernorm_onnx() const
    {
-        return f("div")(arg(0)(x_minus_mean()),
+        auto add_eps = f("add")(either_arg(0, 1)(variance(), is_constant().bind("eps"))); // add of variance() and a constant (bound as eps), in either argument order
+        return f("div")(
+            arg(0)(x_minus_mean()), // first div argument

-                        arg(1)(skip_broadcasts(f("sqrt")(arg(0)(
-                            f("add")(either_arg(0, 1)(variance(), is_constant().bind("eps"))))))));
+            arg(1)(skip_broadcasts(f("sqrt")(arg(0)(match::any_of(add_eps, variance())))))); // second div argument: sqrt whose input is add_eps or variance() alone, wrapped in skip_broadcasts
    }

    auto matcher() const { return layernorm_onnx(); }

--- a/src/include/migraphx/op/gather.hpp
+++ b/src/include/migraphx/op/gather.hpp
@@ -26,6 +26,7 @@

 #include <array>
 #include <migraphx/check_shapes.hpp>
+#include <migraphx/dyn_output.hpp>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/streamutils.hpp>
 #include <migraphx/literal.hpp>
@@ -61,35 +62,59 @@ struct gather

    shape normalize_compute_shape(std::vector<shape> inputs) const
    {
-        check_shapes{inputs, *this}.has(2);
-        auto lens = inputs[0].lens();
-        auto type = inputs[0].type();
-        lens.erase(lens.begin() + axis);
-        if(not inputs[1].scalar())
+        check_shapes{inputs, *this, true}.has(2); // NOTE(review): the added third argument presumably permits dynamic shapes; confirm against check_shapes
+        shape data    = inputs[0];
+        shape indices = inputs[1];
+        auto type     = data.type(); // output type is taken from the data input
+        // If the indices shape is dynamic, convert the data shape to dynamic too,
+        // so that the dynamic branch below handles the mixed case.
+        if(indices.dynamic())
        {
-            auto ind_lens = inputs[1].lens();
-            lens.insert(lens.begin() + axis, ind_lens.begin(), ind_lens.end());
+            data = data.to_dynamic();
        }
-
-        // for scalar output
-        if(lens.empty())
+        if(data.dynamic())
        {
-            return {type};
+            auto dims = data.dyn_dims();
+            dims.erase(dims.begin() + axis); // drop the data dimension at axis
+
+            if(not indices.scalar())
+            {
+                auto index_dims = indices.to_dynamic().dyn_dims();
+                dims.insert(dims.begin() + axis, index_dims.begin(), index_dims.end()); // put the index dimensions where the erased dimension was
+            }
+            return {type, dims};
        }
+        else
+        {
+            // Both data and indices are static here; indices may be scalar.
+            auto lens = data.lens();
+            lens.erase(lens.begin() + axis); // drop the data dimension at axis

-        return {type, lens};
+            if(not indices.scalar())
+            {
+                auto ind_lens = indices.lens();
+                lens.insert(lens.begin() + axis, ind_lens.begin(), ind_lens.end()); // put the index dimensions where the erased dimension was
+            }
+
+            // No dimensions left (scalar output): return a shape built from the type only.
+            if(lens.empty())
+            {
+                return {type};
+            }
+
+            return {type, lens};
+        }
    }

-    argument compute(const shape& output_shape, std::vector<argument> args) const
+    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
    {
-        argument result{output_shape};
+        argument result{dyn_out.computed_shape};
        // negative axis means counting dimensions from back
        auto lens                 = args[0].get_shape().lens();
        std::size_t axis_dim_size = lens[axis];
        // max dimension in axis
        visit_all(result, args[0])([&](auto output, auto data) {
            args[1].visit([&](auto indices) {
-                if(output_shape.scalar())
+                if(dyn_out.computed_shape.scalar())
                {
                    auto in_index = indices.front();
                    in_index      = (in_index < 0) ? in_index + axis_dim_size : in_index;

--- a/src/include/migraphx/optimize_module.hpp
+++ b/src/include/migraphx/optimize_module.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_RTGLIB_OPTIMIZE_MODULE_HPP
+#define MIGRAPHX_GUARD_RTGLIB_OPTIMIZE_MODULE_HPP
+
+#include <string>
+#include <migraphx/instruction_ref.hpp>
+#include <migraphx/config.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+struct module_pass_manager;
+
+/**
+ * Pass that runs several other passes in a loop.
+ *
+ * apply() is only declared here; its definition in optimize_module.cpp runs
+ * simplify_reshapes, simplify_algebra, eliminate_common_subexpression,
+ * dead_code_elimination, propagate_constant and dead_code_elimination again,
+ * in that order, for two iterations, through the given module_pass_manager.
+ * name() returns the fixed pass name.
+ */
+struct optimize_module
+{
+    std::string name() const { return "optimize_module"; }
+    void apply(module_pass_manager& mpm) const;
+};
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/optimize_module.cpp
+++ b/src/optimize_module.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/optimize_module.hpp>
+#include <migraphx/pass_manager.hpp>
+#include <migraphx/simplify_reshapes.hpp>
+#include <migraphx/simplify_algebra.hpp>
+#include <migraphx/eliminate_common_subexpression.hpp>
+#include <migraphx/dead_code_elimination.hpp>
+#include <migraphx/propagate_constant.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+void optimize_module::apply(module_pass_manager& mpm) const // runs the pass sequence below through mpm
+{
+    for(int i = 0; i < 2; i++) // fixed count: the whole sequence runs twice; presumably so rewrites exposed late in round one are picked up (TODO confirm)
+    {
+        mpm.run_pass(simplify_reshapes{});
+        mpm.run_pass(simplify_algebra{});
+        mpm.run_pass(eliminate_common_subexpression{});
+        mpm.run_pass(dead_code_elimination{}); // first dead_code_elimination run, before propagate_constant
+        mpm.run_pass(propagate_constant{});
+        mpm.run_pass(dead_code_elimination{}); // second dead_code_elimination run, after propagate_constant
+    }
+}
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
-#####################################################################################
+# ####################################################################################
 # The MIT License (MIT)
 #
 # Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
@@ -20,7 +20,7 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
-#####################################################################################
+# ####################################################################################

 list(APPEND CMAKE_PREFIX_PATH /opt/rocm /opt/rocm/hip /opt/rocm/hcc)
 find_package(miopen)
@@ -33,6 +33,8 @@ if(NOT TARGET MIOpen)
    message(SEND_ERROR "Cant find miopen")
 endif()

+set(MIGRAPHX_USE_HIPRTC OFF CACHE BOOL "Use hipRTC APIs")
+
 include(Embed)
 file(GLOB KERNEL_FILES ${CONFIGURE_DEPENDS}
    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/*.hpp)
@@ -46,9 +48,10 @@ add_library(compile_for_gpu INTERFACE)
 target_compile_options(compile_for_gpu INTERFACE -std=c++17 -fno-gpu-rdc -Wno-cuda-compat -Wno-unused-command-line-argument -Xclang -fallow-half-arguments-and-returns)
 target_link_libraries(compile_for_gpu INTERFACE hip::device -fno-gpu-rdc -Wno-invalid-command-line-argument -Wno-unused-command-line-argument -Wno-option-ignored)
 check_cxx_compiler_flag("--cuda-host-only -fhip-lambda-host-device -x hip" HAS_HIP_LAMBDA_HOST_DEVICE)
+
 if(HAS_HIP_LAMBDA_HOST_DEVICE)
-  message(STATUS "Enable -fhip-lambda-host-device")
-  target_compile_options(compile_for_gpu INTERFACE -fhip-lambda-host-device)
+    message(STATUS "Enable -fhip-lambda-host-device")
+    target_compile_options(compile_for_gpu INTERFACE -fhip-lambda-host-device)
 endif()

 set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
@@ -60,11 +63,13 @@ target_include_directories(migraphx_device PUBLIC $<BUILD_INTERFACE:${CMAKE_CURR
 target_include_directories(migraphx_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/device/include>)

 add_library(kernel_file_check EXCLUDE_FROM_ALL)
+
 foreach(KERNEL_FILE ${KERNEL_FILES})
    get_filename_component(KERNEL_BASE_FILE ${KERNEL_FILE} NAME_WE)
    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/kernels/include/migraphx/kernels/${KERNEL_BASE_FILE}.cpp "#include <migraphx/kernels/${KERNEL_BASE_FILE}.hpp>\n")
    target_sources(kernel_file_check PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/kernels/include/migraphx/kernels/${KERNEL_BASE_FILE}.cpp)
 endforeach()
+
 target_compile_definitions(kernel_file_check PRIVATE -DMIGRAPHX_NLOCAL=256)
 target_include_directories(kernel_file_check PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/>)
 target_link_libraries(kernel_file_check compile_for_gpu)
@@ -125,6 +130,7 @@ function(register_migraphx_gpu_ops PREFIX)
        register_op(migraphx_gpu HEADER migraphx/gpu/${OP}.hpp OPERATORS gpu::${PREFIX}${OP} INCLUDES migraphx/gpu/context.hpp)
    endforeach()
 endfunction()
+
 register_migraphx_gpu_ops(hip_
    argmax
    argmin
@@ -146,47 +152,41 @@ register_migraphx_gpu_ops(miopen_
    lrn
    pooling
 )
-register_op(migraphx_gpu 
-    HEADER migraphx/gpu/rnn_variable_seq_lens.hpp 
+register_op(migraphx_gpu
+    HEADER migraphx/gpu/rnn_variable_seq_lens.hpp
    OPERATORS gpu::hip_rnn_var_sl_shift_sequence gpu::hip_rnn_var_sl_shift_output gpu::hip_rnn_var_sl_last_output
    INCLUDES migraphx/gpu/context.hpp)
-register_op(migraphx_gpu 
-    HEADER migraphx/gpu/int8_gemm_pack.hpp 
+register_op(migraphx_gpu
+    HEADER migraphx/gpu/int8_gemm_pack.hpp
    OPERATORS gpu::hip_int8_gemm_pack_a gpu::hip_int8_gemm_pack_b
    INCLUDES migraphx/gpu/context.hpp)
-register_op(migraphx_gpu 
-    HEADER migraphx/gpu/gemm.hpp 
+register_op(migraphx_gpu
+    HEADER migraphx/gpu/gemm.hpp
    OPERATORS gpu::rocblas_gemm<op::dot> gpu::rocblas_gemm<op::quant_dot>
    INCLUDES migraphx/gpu/context.hpp)
-register_op(migraphx_gpu HEADER migraphx/gpu/convolution.hpp 
+register_op(migraphx_gpu HEADER migraphx/gpu/convolution.hpp
    OPERATORS gpu::miopen_convolution<op::convolution> gpu::miopen_convolution<op::deconvolution> gpu::miopen_convolution<op::quant_convolution>
    INCLUDES migraphx/gpu/context.hpp)
 rocm_set_soversion(migraphx_gpu ${MIGRAPHX_SO_VERSION})
 rocm_clang_tidy_check(migraphx_gpu)

-# look for offload bundler
 get_filename_component(CMAKE_CXX_COMPILER_PATH "${CMAKE_CXX_COMPILER}" PATH)
-if(CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+$")
-    find_program(MIGRAPHX_OFFLOADBUNDLER_BIN clang-offload-bundler
-        HINTS ${CMAKE_CXX_COMPILER_PATH}
-        PATH_SUFFIXES bin
-        PATHS /opt/rocm/llvm
-    )
-else()
+
+if(NOT CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+$")
    find_program(MIGRAPHX_EXTRACT_KERNEL extractkernel
        PATH_SUFFIXES bin
        HINTS ${CMAKE_CXX_COMPILER_PATH}
        PATHS
-            /opt/rocm/hip
-            /opt/rocm/hcc
-            /opt/rocm
+        /opt/rocm/hip
+        /opt/rocm/hcc
+        /opt/rocm
    )
 endif()

-message(STATUS "clang-offload-bundler: ${MIGRAPHX_OFFLOADBUNDLER_BIN}")
 message(STATUS "extractkernel: ${MIGRAPHX_EXTRACT_KERNEL}")

 set(MIGRAPHX_ENABLE_MLIR OFF CACHE BOOL "")
+
 if(MIGRAPHX_ENABLE_MLIR)
    # Find package rocMLIR
    find_package(rocMLIR 1.0.0 CONFIG REQUIRED)
@@ -195,36 +195,39 @@ if(MIGRAPHX_ENABLE_MLIR)
    target_link_libraries(migraphx_gpu PUBLIC rocMLIR::rockCompiler)
 endif()

-set(MIGRAPHX_USE_HIPRTC OFF CACHE BOOL "")
 if(MIGRAPHX_USE_HIPRTC)
-target_compile_definitions(migraphx_gpu PRIVATE -DMIGRAPHX_USE_HIPRTC=1)
+    message(STATUS "MIGraphX is using hipRTC")
+    target_compile_definitions(migraphx_gpu PRIVATE -DMIGRAPHX_USE_HIPRTC=1)
 else()
-# Get flags needed to compile hip
-include(TargetFlags)
-target_flags(HIP_COMPILER_FLAGS hip::device)
-# Remove cuda arch flags
-string(REGEX REPLACE --cuda-gpu-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
-string(REGEX REPLACE --offload-arch=[a-z0-9:+-]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
-# Skip library paths since hip will incorrectly treat it as a source file
-string(APPEND HIP_COMPILER_FLAGS " ")
-foreach(_unused RANGE 2)
-    string(REGEX REPLACE " /[^ ]+\\.(a|so) " " " HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
-endforeach()
+    message(STATUS "MIGraphX is using HIP Clang")

-message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}")
-target_compile_definitions(migraphx_gpu PRIVATE 
-    "-DMIGRAPHX_HIP_COMPILER=${CMAKE_CXX_COMPILER}" 
-    "-DMIGRAPHX_HIP_COMPILER_FLAGS=${HIP_COMPILER_FLAGS}" 
-    "-DMIGRAPHX_OFFLOADBUNDLER_BIN=${MIGRAPHX_OFFLOADBUNDLER_BIN}"
-    "-DMIGRAPHX_EXTRACT_KERNEL=${MIGRAPHX_EXTRACT_KERNEL}"
-    "-DMIGRAPHX_USE_HIPRTC=0"
-)
-if(DEFINED CMAKE_CXX_COMPILER_LAUNCHER)
-execute_process(COMMAND which ${CMAKE_CXX_COMPILER_LAUNCHER} OUTPUT_VARIABLE MIGRAPHX_HIP_COMPILER_LAUNCHER)
-string(STRIP "${MIGRAPHX_HIP_COMPILER_LAUNCHER}" MIGRAPHX_HIP_COMPILER_LAUNCHER)
-target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_HIP_COMPILER_LAUNCHER=${MIGRAPHX_HIP_COMPILER_LAUNCHER}")
-endif()
+    # Get flags needed to compile hip
+    include(TargetFlags)
+    target_flags(HIP_COMPILER_FLAGS hip::device)
+
+    # Remove cuda arch flags
+    string(REGEX REPLACE --cuda-gpu-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
+    string(REGEX REPLACE --offload-arch=[a-z0-9:+-]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
+
+    # Skip library paths since hip will incorrectly treat it as a source file
+    string(APPEND HIP_COMPILER_FLAGS " ")

+    foreach(_unused RANGE 2)
+        string(REGEX REPLACE " /[^ ]+\\.(a|so) " " " HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
+    endforeach()
+
+    message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}")
+    target_compile_definitions(migraphx_gpu PRIVATE
+        "-DMIGRAPHX_HIP_COMPILER=${CMAKE_CXX_COMPILER}"
+        "-DMIGRAPHX_HIP_COMPILER_FLAGS=${HIP_COMPILER_FLAGS}"
+        "-DMIGRAPHX_EXTRACT_KERNEL=${MIGRAPHX_EXTRACT_KERNEL}"
+    )
+
+    if(DEFINED CMAKE_CXX_COMPILER_LAUNCHER)
+        execute_process(COMMAND which ${CMAKE_CXX_COMPILER_LAUNCHER} OUTPUT_VARIABLE MIGRAPHX_HIP_COMPILER_LAUNCHER)
+        string(STRIP "${MIGRAPHX_HIP_COMPILER_LAUNCHER}" MIGRAPHX_HIP_COMPILER_LAUNCHER)
+        target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_HIP_COMPILER_LAUNCHER=${MIGRAPHX_HIP_COMPILER_LAUNCHER}")
+    endif()
 endif()

 # Check miopen find mode api
@@ -236,7 +239,7 @@ check_library_exists(MIOpen "miopenFindSolutions" "${MIOPEN_LOCATION}" HAS_FIND_
 # TODO: Set default to HAS_FIND_2_API
 set(MIGRAPHX_USE_FIND_2_API OFF CACHE BOOL "")

-if(MIGRAPHX_USE_FIND_2_API) 
+if(MIGRAPHX_USE_FIND_2_API)
    target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API)
    message(STATUS "MIGraphx is using Find-2.0 API of MIOpen")
 else()
@@ -258,8 +261,7 @@ target_link_libraries(migraphx_gpu PRIVATE migraphx_device migraphx_kernels)
 add_subdirectory(driver)

 rocm_install_targets(
-  TARGETS migraphx_gpu migraphx_device compile_for_gpu
-  INCLUDE
+    TARGETS migraphx_gpu migraphx_device compile_for_gpu
+    INCLUDE
    ${CMAKE_CURRENT_SOURCE_DIR}/include
 )
-
--- a/src/targets/gpu/compile_hip.cpp
+++ b/src/targets/gpu/compile_hip.cpp
@@ -29,10 +29,9 @@
 #include <cassert>
 #include <iostream>

-#if MIGRAPHX_USE_HIPRTC
+#ifdef MIGRAPHX_USE_HIPRTC
 #include <hip/hiprtc.h>
 #include <migraphx/manage_ptr.hpp>
-#include <migraphx/env.hpp>
 #else
 #include <migraphx/compile_src.hpp>
 #include <migraphx/process.hpp>
@@ -48,9 +47,10 @@ MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_OPTIMIZE);
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_ASM);
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_SRC);

-#if MIGRAPHX_USE_HIPRTC
+#ifdef MIGRAPHX_USE_HIPRTC

-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_HIPRTC)
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_HIPRTC);
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS);

 std::string hiprtc_error(hiprtcResult err, const std::string& msg)
 {
@@ -143,25 +143,29 @@ struct hiprtc_program
                       options.end(),
                       std::back_inserter(c_options),
                       [](const std::string& s) { return s.c_str(); });
-        auto result = hiprtcCompileProgram(prog.get(), c_options.size(), c_options.data());
-        std::cerr << log() << std::endl;
+        auto result   = hiprtcCompileProgram(prog.get(), c_options.size(), c_options.data());
+        auto prog_log = log();
+        if(not prog_log.empty())
+        {
+            std::cerr << prog_log << std::endl;
+        }
        if(result != HIPRTC_SUCCESS)
            MIGRAPHX_HIPRTC_THROW(result, "Compilation failed.");
    }

-    std::string log()
+    std::string log() const // returns the hipRTC program log for prog, or an empty string when the reported size is 0
    {
        std::size_t n = 0;
        MIGRAPHX_HIPRTC(hiprtcGetProgramLogSize(prog.get(), &n));
-        if(n < 2)
+        if(n == 0) // nothing was logged
            return {};
-        std::vector<char> buffer(n);
+        std::string buffer(n, '\0'); // n characters, zero-filled, written in place by the call below
        MIGRAPHX_HIPRTC(hiprtcGetProgramLog(prog.get(), buffer.data()));
-        assert(buffer.back() == 0);
-        return {buffer.begin(), buffer.end() - 1};
+        assert(buffer.back() != 0); // NOTE(review): assumes the reported size excludes a trailing NUL (the removed code assumed the opposite); confirm against the hipRTC documentation
+        return buffer;
    }

-    std::vector<char> get_code_obj()
+    std::vector<char> get_code_obj() const
    {
        std::size_t n = 0;
        MIGRAPHX_HIPRTC(hiprtcGetCodeSize(prog.get(), &n));
@@ -176,6 +180,17 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
 {
    hiprtc_program prog(srcs);
    auto options = split_string(params, ' ');
+    options.push_back("-DMIGRAPHX_USE_HIPRTC=1");
+    // Remove the following workaround compilation flags for HIPRTC once the fixes from hipRTC are available.
+    if(enabled(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS{}))
+    {
+        options.push_back("-DMIGRAPHX_HAS_DPP=0");
+        options.push_back("-DMIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS=1");
+        options.push_back("-Wno-reserved-identifier");
+        options.push_back("-Wno-gnu-line-marker");
+        options.push_back("-Wno-old-style-cast");
+    }
+
    if(enabled(MIGRAPHX_GPU_DEBUG{}))
        options.push_back("-DMIGRAPHX_DEBUG");
    if(std::none_of(options.begin(), options.end(), [](const std::string& s) {
@@ -183,7 +198,7 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
       }))
        options.push_back("-std=c++17");
    options.push_back("-fno-gpu-rdc");
-    options.push_back(" -O" + string_value_of(MIGRAPHX_GPU_OPTIMIZE{}, "3"));
+    options.push_back("-O" + string_value_of(MIGRAPHX_GPU_OPTIMIZE{}, "3"));
    options.push_back("-Wno-cuda-compat");
    options.push_back("--offload-arch=" + arch);
    prog.compile(options);
@@ -292,6 +307,8 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
    return {compiler.compile(srcs)};
 }

+#endif // MIGRAPHX_USE_HIPRTC
+
 std::string enum_params(std::size_t count, std::string param)
 {
    std::vector<std::string> items(count);
@@ -299,8 +316,6 @@ std::string enum_params(std::size_t count, std::string param)
    return join_strings(items, ",");
 }

-#endif // MIGRAPHX_USE_HIPRTC
-
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
--- a/src/targets/gpu/compile_hip_code_object.cpp
+++ b/src/targets/gpu/compile_hip_code_object.cpp
@@ -29,7 +29,6 @@
 #include <migraphx/context.hpp>
 #include <migraphx_kernels.hpp>
 #include <migraphx/stringutils.hpp>
-#include <hip/hip_runtime_api.h>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -80,6 +79,7 @@ std::string generate_args_hpp(const std::vector<shape>& inputs)

 #include <migraphx/kernels/args.hpp>
 #include <migraphx/kernels/tensor_view.hpp>
+#include <migraphx/kernels/types.hpp>

 namespace migraphx {


--- a/src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
@@ -36,6 +36,7 @@ namespace gpu {
 namespace device {

 #ifdef MIGRAPHX_NO_DPP
+
 template <index_int N,
          class Op,
          class T,
@@ -62,6 +63,7 @@ __device__ auto block_reduce(index idx, Op op, T init, ForStride fs, F f)
    }
    return buffer[0];
 }
+
 #else
 constexpr unsigned int dpp_row_shr(unsigned int x) { return 0x110u | x; }

@@ -96,11 +98,7 @@ __device__ T dpp_mov(T& x)
    input.data = x;
    for(index_int i = 0; i < n; i++)
    {
-#if defined(__HCC__)
-        output.reg[i] = __llvm_amdgcn_move_dpp(input.reg[i], DppCtrl, RowMask, BankMask, BoundCtrl);
-#else
        output.reg[i] = __hip_move_dpp(input.reg[i], DppCtrl, RowMask, BankMask, BoundCtrl);
-#endif
    }
    return output.data;
 }
@@ -310,4 +308,4 @@ void reduce(hipStream_t stream,
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

-#endif
+#endif // MIGRAPHX_NO_DPP
--- a/src/targets/gpu/fuse_ops.cpp
+++ b/src/targets/gpu/fuse_ops.cpp
@@ -553,11 +553,13 @@ struct find_gemm_pointwise
 {
    auto matcher() const
    {
-        return precompile_name("pointwise")(
+        auto gemm_op   = match::name("gpu::gemm")(match::nargs(3), match::used_once()).bind("gemm"); // a three-argument gpu::gemm that is used once, bound as gemm
+        auto binary_op = match::all_of( // three-argument form: gemm_op plus an input bound as c, in either of the first two positions
            match::nargs(3),
            match::either_arg(0, 1)(
-                match::any_of(match::standard_shape(), match::is_constant()).bind("c"),
-                match::name("gpu::gemm")(match::nargs(3), match::used_once()).bind("gemm")));
+                match::any_of(match::standard_shape(), match::is_constant()).bind("c"), gemm_op));
+        auto unary_op = match::all_of(match::nargs(2), match::arg(0)(gemm_op)); // two-argument form: gemm_op as argument 0 and no c binding
+        return precompile_name("pointwise")(match::any_of(binary_op, unary_op)); // accept either form
    }

    // TODO: Move to matcher.hpp
@@ -589,61 +591,84 @@ struct find_gemm_pointwise
        return match::name("@return")(match::args(match::any_of(add, mul_add, add_mul)));
    }

+    static auto match_mul(const std::string& input) // matcher for a @return whose argument is the mul pattern built below for parameter input
+    {
+        auto mul = match_mul_const(match_param(input), "alpha"); // presumably parameter times a constant, with the constant bound as alpha; verify against match_mul_const
+        return match::name("@return")(match::args(mul));
+    }
+
    static float get_float(instruction_ref ins) { return ins->get_literal().at<float>(); }

    template <class Gemm>
    static bool update_gemm(Gemm& gemm, module_ref pm, unsigned input)
    {
        auto names = pm->get_parameter_names();
-        if(names.size() != 2)
-            return false;
        std::sort(names.begin(), names.end());
-        unsigned output = input == 0 ? 1 : 0;
-        auto mr         = match::match_instruction(
-            *pm, std::prev(pm->end()), match_add(names[input], names[output]));
-        if(mr.result == pm->end())
-            return false;
-        if(contains(mr.instructions, "alpha_mul"))
+        if(names.size() == 1) // single-parameter module: only the match_mul pattern is tried
+        {
+            auto mr = match::match_instruction(*pm, std::prev(pm->end()), match_mul(names[input])); // NOTE(review): names has one element here, so input must be 0 or names[input] is out of range; confirm callers
+            if(mr.result == pm->end()) // last instruction did not match
+                return false;
            gemm.alpha *= get_float(mr.instructions["alpha"]);
-        else if(contains(mr.instructions, "beta_mul"))
-            gemm.beta *= get_float(mr.instructions["beta"]);
-        else if(contains(mr.instructions, "gamma_mul"))
+            return true;
+        }
+        else if(names.size() == 2) // two-parameter module: the match_add pattern is tried, as before this change
        {
-            gemm.alpha *= get_float(mr.instructions["gamma"]);
-            gemm.beta *= get_float(mr.instructions["gamma"]);
+            unsigned output = input == 0 ? 1 : 0; // index of the other parameter
+            auto mr         = match::match_instruction( // match against the last instruction of pm
+                *pm, std::prev(pm->end()), match_add(names[input], names[output]));
+            if(mr.result == pm->end()) // last instruction did not match
+                return false;
+            if(contains(mr.instructions, "alpha_mul"))
+                gemm.alpha *= get_float(mr.instructions["alpha"]);
+            else if(contains(mr.instructions, "beta_mul"))
+                gemm.beta *= get_float(mr.instructions["beta"]);
+            else if(contains(mr.instructions, "gamma_mul"))
+            {
+                gemm.alpha *= get_float(mr.instructions["gamma"]); // gamma scales both alpha and beta
+                gemm.beta *= get_float(mr.instructions["gamma"]);
+            }
+            return true; // also reached when none of the three bindings is present, leaving alpha and beta unchanged
+        }
+        else
+        {
+            return false; // any other parameter count is not handled
+        }
-        return true;
    }

    void apply(module& m, const match::matcher_result& r) const
    {
        auto ins      = r.result;
        auto gemm_ins = r.instructions["gemm"];
-        auto c_ins    = r.instructions["c"];

        auto gemm = any_cast<rocblas_gemm<op::dot>>(gemm_ins->get_operator());

        // Already fused gemm
        if(not float_equal(gemm.beta, 0))
            return;
-        gemm.beta = 1;
+        if(ins->inputs().size() == 3)
+            gemm.beta = 1;

        if(not update_gemm(
               gemm, ins->module_inputs().front(), ins->inputs().front() == gemm_ins ? 0 : 1))
            return;

-        // const-fold input if not standard shape since rocblas can't handle it
-        if(not c_ins->get_shape().standard())
-        {
-            auto c = make_op("contiguous");
-            auto l = c.compute(c.compute_shape({c_ins->get_shape()}), {c_ins->eval()});
-            c_ins  = m.add_literal(l.get_shape(), l.data());
-        }
-
        auto inputs = gemm_ins->inputs();
        inputs.pop_back();

-        inputs.push_back(c_ins);
+        if(ins->inputs().size() == 3)
+        {
+            auto c_ins = r.instructions["c"];
+            // const-fold input if not standard shape since rocblas can't handle it
+            if(not c_ins->get_shape().standard())
+            {
+                auto c = make_op("contiguous");
+                auto l = c.compute(c.compute_shape({c_ins->get_shape()}), {c_ins->eval()});
+                c_ins  = m.add_literal(l.get_shape(), l.data());
+            }
+            inputs.push_back(c_ins);
+        }
+
        inputs.push_back(ins->inputs().back());

        m.replace_instruction(ins, gemm, inputs);

--- a/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
@@ -105,7 +105,7 @@ constexpr auto array_for_each(T& x, Ts&... xs)
            }
            else
            {
-                using vec_type = std::remove_reference_t<decltype(array2vec(x))>;
+                using vec_type = remove_reference_t<decltype(array2vec(x))>;
                f(array2vec(x), __builtin_convertvector(array2vec(xs), vec_type)...);
            }
        }

--- a/src/targets/gpu/kernels/include/migraphx/kernels/dpp.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/dpp.hpp
@@ -72,7 +72,7 @@ __device__ T dpp_mov(T& x)
    }
    return output.data;
 }
-#endif
+#endif // MIGRAPHX_HAS_DPP

 } // namespace migraphx
 #endif // MIGRAPHX_GUARD_KERNELS_DPP_HPP
--- a/src/targets/gpu/kernels/include/migraphx/kernels/gathernd.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/gathernd.hpp
@@ -26,7 +26,7 @@

 #include <migraphx/kernels/index.hpp>
 #include <migraphx/kernels/algorithm.hpp>
-
+#include <migraphx/kernels/ops.hpp>
 namespace migraphx {

 template <class T>
@@ -53,23 +53,17 @@ __device__ void gathernd(const T& data_t, const U& indices_t, const V& output_t,
    auto indices_shape_lens = indices_shape.lens;
    auto data_shape_lens    = data_shape.lens;
    auto num_slice_dims     = indices_shape_lens.back();
-    std::size_t num_slices  = accumulate(indices_shape_lens.begin(),
-                                        indices_shape_lens.end() - 1,
-                                        1,
-                                        std::multiplies<std::size_t>());
-    std::size_t slice_size  = accumulate(data_shape_lens.begin() + num_slice_dims + batch_dims,
+    std::size_t num_slices =
+        accumulate(indices_shape_lens.begin(), indices_shape_lens.end() - 1, 1, op::product{});
+    std::size_t slice_size = accumulate(data_shape_lens.begin() + num_slice_dims + batch_dims,
                                        data_shape_lens.end(),
                                        1,
-                                        std::multiplies<std::size_t>());
-    const std::size_t num_batches       = accumulate(data_shape_lens.begin(),
-                                               data_shape_lens.begin() + batch_dims,
-                                               1,
-                                               std::multiplies<std::size_t>());
-    const std::size_t data_batch_stride = accumulate(data_shape_lens.begin() + batch_dims,
-                                                     data_shape_lens.end(),
-                                                     1,
-                                                     std::multiplies<std::size_t>());
-    const auto num_slices_per_batch     = num_slices / num_batches;
+                                        op::product{});
+    const std::size_t num_batches =
+        accumulate(data_shape_lens.begin(), data_shape_lens.begin() + batch_dims, 1, op::product{});
+    const std::size_t data_batch_stride =
+        accumulate(data_shape_lens.begin() + batch_dims, data_shape_lens.end(), 1, op::product{});
+    const auto num_slices_per_batch = num_slices / num_batches;

    ind.global_stride(output_shape.elements(), [&](auto i) {
        const auto* indices_ptr     = indices_t.data();
@@ -83,15 +77,15 @@ __device__ void gathernd(const T& data_t, const U& indices_t, const V& output_t,
            int64_t index                   = slice_indices[idx];
            const std::size_t input_dim_idx = batch_dims + idx;
            const auto input_dim            = data_shape_lens[input_dim_idx];
-            assert(index >= -static_cast<int64_t>(input_dim) and
-                   index < static_cast<int64_t>(input_dim));
+            MIGRAPHX_ASSERT(index >= -static_cast<int64_t>(input_dim) and
+                            index < static_cast<int64_t>(input_dim));
            if(index < 0)
                index += input_dim;
            std::size_t size_from_slice_dims =
                accumulate(data_shape_lens.begin() + batch_dims + idx + 1,
                           data_shape_lens.begin() + batch_dims + num_slice_dims,
                           slice_size,
-                           std::multiplies<std::size_t>());
+                           op::product{});
            relative_slice_offset += index * size_from_slice_dims;
        }


--- a/src/targets/gpu/kernels/include/migraphx/kernels/hip.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/hip.hpp
@@ -24,11 +24,18 @@
 #ifndef MIGRAPHX_GUARD_KERNELS_HIP_HPP
 #define MIGRAPHX_GUARD_KERNELS_HIP_HPP

+#ifndef MIGRAPHX_USE_HIPRTC
 // Workaround macro redefinition issue with clang tidy
 #if defined(__HIP_PLATFORM_HCC__) && defined(MIGRAPHX_USE_CLANG_TIDY)
 #undef __HIP_PLATFORM_HCC__ // NOLINT
 #endif
-
 #include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#include <hip/math_functions.h>
+#include <hip/hip_math_constants.h>
+#elif defined(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS)
+#include <hip/hip_common.h>
+#include <hip/hip_math_constants.h>
+#endif

 #endif // MIGRAPHX_GUARD_KERNELS_HIP_HPP
--- a/src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
@@ -28,8 +28,7 @@
 #include <migraphx/kernels/vec.hpp>
 #include <migraphx/kernels/functional.hpp>
 #include <migraphx/kernels/type_traits.hpp>
-#include <hip/hip_fp16.h>
-#include <hip/math_functions.h>
+#include <migraphx/kernels/hip.hpp>

 namespace migraphx {

@@ -222,7 +221,7 @@ constexpr auto min(const T& a, const U& b)
 template <class T, MIGRAPHX_REQUIRES(is_same<vec_type<T>, half>{})>
 constexpr T sin(T x)
 {
-    constexpr const T shift = M_PI_2;
+    constexpr const T shift = HIP_PIO2_F;
    return migraphx::cos(shift - x);
 }


--- a/src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp
@@ -76,14 +76,6 @@ struct shape

    constexpr index_int index(index_array x) const { return x.dot(strides); }

-    constexpr index_int index(std::initializer_list<index_int> x) const
-    {
-        index_int idx = 0;
-        for(index_int i = 0; i < x.size(); i++)
-            idx += *(x.begin() + i) * strides[i];
-        return idx;
-    }
-
    constexpr index_int index(index_int i) const
    {
        if(this->standard())

--- a/src/targets/gpu/kernels/include/migraphx/kernels/types.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/types.hpp
@@ -28,8 +28,45 @@

 namespace migraphx {

-using index_int = std::uint32_t;
-using diff_int  = std::int32_t;
+#if defined(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS) and defined(MIGRAPHX_USE_HIPRTC)
+using int8_t   = signed char;
+using uint8_t  = unsigned char;
+using int16_t  = signed short;
+using uint16_t = unsigned short;
+using int32_t  = signed int;
+using uint32_t = unsigned int;
+using int64_t  = signed long long;
+using uint64_t = unsigned long long;
+#elif defined(MIGRAPHX_USE_HIPRTC)
+using int8_t   = __hip_int8_t;
+using uint8_t  = __hip_uint8_t;
+using int16_t  = __hip_int16_t;
+using uint16_t = __hip_uint16_t;
+using int32_t  = __hip_int32_t;
+using uint32_t = __hip_uint32_t;
+using int64_t  = __hip_int64_t;
+using uint64_t = __hip_uint64_t;
+#else
+using int8_t   = std::int8_t;
+using uint8_t  = std::uint8_t;
+using int16_t  = std::int16_t;
+using uint16_t = std::uint16_t;
+using int32_t  = std::int32_t;
+using uint32_t = std::uint32_t;
+using int64_t  = std::int64_t;
+using uint64_t = std::uint64_t;
+#endif // MIGRAPHX_USE_HIPRTC
+using index_int = uint32_t;
+using diff_int  = int32_t;
+
+static_assert(sizeof(int8_t) == 1, "int8_t must be 1 bytes");
+static_assert(sizeof(uint8_t) == 1, "uint8_t must be 1 bytes");
+static_assert(sizeof(int16_t) == 2, "int16_t must be 2 bytes");
+static_assert(sizeof(uint16_t) == 2, "uint16_t must be 2 bytes");
+static_assert(sizeof(int32_t) == 4, "int32_t must be 4 bytes");
+static_assert(sizeof(uint32_t) == 4, "uint32_t must be 4 bytes");
+static_assert(sizeof(int64_t) == 8, "int64_t must be 8 bytes");
+static_assert(sizeof(uint64_t) == 8, "uint64_t must be 8 bytes");

 #define MIGRAPHX_DEVICE_CONSTEXPR constexpr __device__ __host__ // NOLINT