Merge remote-tracking branch 'origin/bert-opt' into HEAD

3eaa0969 · Alan Turner · e3d0c287 · 22aa9c5e · 3eaa0969 · 3eaa0969
Commit 3eaa0969 authored Feb 01, 2023 by Alan Turner
20 changed files
--- a/.github/workflows/sync-onnxrt-main.yaml
+++ b/.github/workflows/sync-onnxrt-main.yaml
+name: Onnxruntime main weekly sync
+on:
+  schedule:
+    - cron: "05 17 * * 1"
+jobs:
+  runs-on: ubuntu-latest
+  sync:
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          ref: develop
+          path: ../
+  get_date:
+    steps:
+      - run: echo "::set-output name=date::$(date +'%Y-%m-%d')"
+  update_file:
+    needs: [sync get_date]
+    steps:
+      - run: git clone https://github.com/microsoft/onnxruntime.git && cd onnxruntime && git rev-parse HEAD >> ../test/onnx/.onnxrt-commit
+  Add_commit:
+    needs: update_file
+    steps:
+      - name: Add & Commit
+        uses: EndBug/add-and-commit@v9.1.1
+        with:
+          new_branch: onnxruntime-sync-${{ steps.date.outputs.date }}
+          add: ../test/onnx/.onnxrt-commit
+          message: Update Onnxruntime commit to latest release
+          default_author: github_actions
+          push: true
+  PR:
+    needs: Add_commit
+    steps:
+      - name: GitHub Action for creating Pull Requests
+        uses: devops-infra/action-pull-request@v0.5.3
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          title: Sync Onnxruntime main
+          reviewer: pfultz2, causten
+          assignee: TedThemistokleous
+          label: automatic, onnxruntime
+          target_branch: develop
--- a/Dockerfile
+++ b/Dockerfile
@@ -5,6 +5,10 @@ ARG PREFIX=/usr/local
 # Support multiarch
 RUN dpkg --add-architecture i386
+# Install rocm key
+RUN apt-get update && apt-get install -y gnupg2 --no-install-recommends curl && \
+    curl -sL http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - 
 # Add rocm repository
 RUN sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/5.3/ ubuntu main > /etc/apt/sources.list.d/rocm.list'
@@ -32,10 +36,27 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    libnuma-dev \
    miopen-hip \
    rocblas \
+    hipfft \
+    rocthrust \
+    rocrand \
+    hipsparse \
+    rccl \
+    rccl-dev \
+    rocm-smi-lib \
+    rocm-dev \
+    roctracer-dev \
+    hipcub  \
+    hipblas  \
+    hipify-clang \
+    half \
+    libssl-dev \
    zlib1g-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
+# add this for roctracer dependancies
+RUN pip3 install CppHeaderParser packaging==22.0
 # Workaround broken rocm packages
 RUN ln -s /opt/rocm-* /opt/rocm
 RUN echo "/opt/rocm/lib" > /etc/ld.so.conf.d/rocm.conf
@@ -72,18 +93,19 @@ RUN /download_models.sh && rm /download_models.sh
 # Install latest ccache version
 RUN cget -p $PREFIX install facebook/zstd@v1.4.5 -X subdir -DCMAKE_DIR=build/cmake
 RUN cget -p $PREFIX install ccache@v4.1 -DENABLE_TESTING=OFF
+RUN cget -p /opt/cmake install kitware/cmake@v3.24.3
-# Install newer cmake for onnx runtime
+COPY ./test/onnx/.onnxrt-commit /
-ARG CMAKE_VERSION=3.24.2
-RUN cget -p /opt/cmake install -X binary https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz
 ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime
 ARG ONNXRUNTIME_BRANCH=main
-ARG ONNXRUNTIME_COMMIT=24f1bd6156cf5968bbc76dfb0e801a9b9c56b9fc
+ARG ONNXRUNTIME_COMMIT
 RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime && \
    cd onnxruntime && \
-    git checkout ${ONNXRUNTIME_COMMIT} && \
+    if [ -z "$ONNXRUNTIME_COMMIT" ] ; then git checkout $(cat /.onnxrt-commit) ; else git checkout ${ONNXRUNTIME_COMMIT} ; fi && \
-    /bin/sh dockerfiles/scripts/install_common_deps.sh
+    /bin/sh /onnxruntime/dockerfiles/scripts/install_common_deps.sh
 ADD tools/build_and_test_onnxrt.sh /onnxruntime/build_and_test_onnxrt.sh

--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -15,11 +15,13 @@ def rocmtestnode(Map conf) {
        def compiler = bconf.get("compiler", "/opt/rocm/llvm/bin/clang++")
        def flags = bconf.get("flags", "")
        def gpu_debug = bconf.get("gpu_debug", "0")
+        def hiprtc_workarounds = bconf.get("hiprtc_workarounds", "0")
        def cmd = """
            ulimit -c unlimited
            echo "leak:dnnl::impl::malloc" > suppressions.txt
            export LSAN_OPTIONS="suppressions=\$(pwd)/suppressions.txt"
            export MIGRAPHX_GPU_DEBUG=${gpu_debug}
+            export MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS=${hiprtc_workarounds}
            export CXX=${compiler}
            export CXXFLAGS='-Werror'
            env
@@ -110,6 +112,10 @@ rocmtest clang_debug: rocmnode('vega') { cmake_build ->
        cmake_build(flags: "-DCMAKE_BUILD_TYPE=release")
        stash includes: 'build/*.deb', name: 'migraphx-package'
    }
+}, hiprtc_gpu_debug: rocmnode('vega') { cmake_build ->
+    stage('HipRTC GPU Debug') {
+        cmake_build(flags: "-DCMAKE_BUILD_TYPE=release -DMIGRAPHX_USE_HIPRTC=On", gpu_debug: true, hiprtc_workarounds:  true)
+    }
 }, mlir_debug: rocmnode('vega') { cmake_build ->
    stage('MLIR Debug') {
        def sanitizers = "undefined"

--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -64,7 +64,7 @@ add_library(migraphx
    normalize_ops.cpp
    op_enums.cpp
    operation.cpp
-    optimize.cpp
+    optimize_module.cpp
    opt/memory_coloring.cpp
    opt/memory_coloring_impl.cpp
    pad_calc.cpp

--- a/src/include/migraphx/convolution.hpp
+++ b/src/include/migraphx/convolution.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_RTGLIB_CONVOLUTION_HPP
+#define MIGRAPHX_GUARD_RTGLIB_CONVOLUTION_HPP
+#include <migraphx/config.hpp>
+#include <migraphx/dfor.hpp>
+#include <migraphx/par_for.hpp>
+#include <migraphx/shape_for_each.hpp>
+#include <migraphx/tensor_view.hpp>
+#include <vector>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+template <class Output, class T, class Padding, class Stride>
+void convolution(Output output, T input, T weights, Padding padding, Stride stride, int group)
+{
+    auto output_shape = output.get_shape();
+    auto in_lens      = input.get_shape().lens();
+    auto wei_lens = weights.get_shape().lens();
+    auto wei_n    = wei_lens[0];
+    auto wei_c    = wei_lens[1];
+    std::vector<std::size_t> win_size(wei_lens.begin() + 1, wei_lens.end());
+    par_for(output_shape.elements(), [&](auto i) {
+        auto idx_o = output_shape.multi(i);
+        auto w     = idx_o[1];
+        auto n_dim = idx_o.size();
+        std::vector<std::ptrdiff_t> win_start;
+        for(std::size_t dim = 2; dim < n_dim; ++dim)
+        {
+            auto d_2 = dim - 2;
+            win_start.push_back(std::ptrdiff_t(idx_o[dim] * stride[d_2]) -
+                                std::ptrdiff_t(padding[d_2]));
+        }
+        const auto group_id = w / (wei_n / group);
+        shape win_shape{output_shape.type(), win_size};
+        double acc = 0.0;
+        shape_for_each(win_shape, [&](auto idx_win) {
+            auto k           = idx_win[0];
+            const auto in_ch = group_id * wei_c + k;
+            std::vector<std::ptrdiff_t> idx(idx_o.begin(), idx_o.end());
+            idx[1] = in_ch;
+            std::transform(idx_win.begin() + 1,
+                           idx_win.end(),
+                           win_start.begin(),
+                           idx.begin() + 2,
+                           [](std::ptrdiff_t ii, std::ptrdiff_t jj) { return ii + jj; });
+            std::vector<std::ptrdiff_t> idx_wei(idx_o.size());
+            idx_wei[0] = w;
+            std::copy(idx_win.begin(), idx_win.end(), idx_wei.begin() + 1);
+            if(std::all_of(idx.begin() + 2, idx.end(), [&](auto ii) { return ii >= 0; }) and
+               std::equal(idx.begin(),
+                          idx.end(),
+                          in_lens.begin(),
+                          in_lens.end(),
+                          std::less<std::ptrdiff_t>{}))
+            {
+                acc += input(idx.begin(), idx.end()) * weights(idx_wei.begin(), idx_wei.end());
+            }
+        });
+        output[i] = acc;
+    });
+}
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/include/migraphx/match/layernorm.hpp
+++ b/src/include/migraphx/match/layernorm.hpp
@@ -43,16 +43,23 @@ struct layernorm_matcher
    auto variance() const
    {
-        return f("reduce_mean")(arg(0)(f("pow")(arg(0)(x_minus_mean()), arg(1)(has_value(2.0f)))));
+        return f("reduce_mean")(arg(0)(any_of(
+            f("pow")(arg(0)(x_minus_mean()), arg(1)(has_value(2.0f))),
+            f("mul")(arg(0)(x_minus_mean()), arg(1)(x_minus_mean())),
+            f("sqdiff")(either_arg(0, 1)(any().bind("x"), skip_broadcasts(f("reduce_mean")))))));
    }
-    auto layernorm_onnx() const
+    auto sqrt_add_eps(const std::string& name) const
    {
        auto add_eps = f("add")(either_arg(0, 1)(variance(), is_constant().bind("eps")));
-        return f("div")(
+        return skip_broadcasts(f(name)(arg(0)(any_of(add_eps, variance()))));
-            arg(0)(x_minus_mean()),
+    }
-            arg(1)(skip_broadcasts(f("sqrt")(arg(0)(match::any_of(add_eps, variance()))))));
+    auto layernorm_onnx() const
+    {
+        auto div_sqrt  = f("div")(arg(0)(x_minus_mean()), arg(1)(sqrt_add_eps("sqrt")));
+        auto mul_rsqrt = f("mul")(either_arg(0, 1)(x_minus_mean(), sqrt_add_eps("rsqrt")));
+        return any(any_of(div_sqrt, mul_rsqrt));
    }
    auto matcher() const { return layernorm_onnx(); }

--- a/src/include/migraphx/op/convolution.hpp
+++ b/src/include/migraphx/op/convolution.hpp
@@ -24,9 +24,12 @@
 #ifndef MIGRAPHX_GUARD_OPERATORS_CONVOLUTION_HPP
 #define MIGRAPHX_GUARD_OPERATORS_CONVOLUTION_HPP
+#include <migraphx/argument.hpp>
 #include <migraphx/op/common.hpp>
 #include <migraphx/check_shapes.hpp>
 #include <migraphx/config.hpp>
+#include <migraphx/convolution.hpp>
+#include <migraphx/pad_calc.hpp>
 #include <migraphx/value.hpp>
 #include <cmath>
 #include <utility>
@@ -201,6 +204,37 @@ struct convolution
        check_attribute_size();
        return stride.size();
    }
+    argument compute(shape output_shape, std::vector<argument> args) const
+    {
+        std::vector<std::size_t> new_padding;
+        if(padding_mode != op::padding_mode_t::default_)
+        {
+            auto input_lens   = args[0].get_shape().lens();
+            auto weights_lens = args[1].get_shape().lens();
+            new_padding =
+                padding_mode == op::same_upper
+                    ? calc_dyn_auto_pad(input_lens, weights_lens, stride, dilation, true)
+                    : calc_dyn_auto_pad(input_lens, weights_lens, stride, dilation, false);
+            output_shape = compute_padded_shape(
+                args[0].get_shape(), args[1].get_shape(), new_padding, stride, dilation);
+        }
+        else
+        {
+            new_padding = padding;
+            if(output_shape.dynamic())
+            {
+                output_shape =
+                    normalize_compute_shape({args.at(0).get_shape(), args.at(1).get_shape()});
+            }
+        }
+        argument result{output_shape};
+        visit_all(result, args[0], args[1])([&](auto output, auto input, auto weights) {
+            migraphx::convolution(output, input, weights, new_padding, stride, group);
+        });
+        return result;
+    }
 };
 } // namespace op

--- a/src/include/migraphx/op/gather.hpp
+++ b/src/include/migraphx/op/gather.hpp
@@ -26,6 +26,7 @@
 #include <array>
 #include <migraphx/check_shapes.hpp>
+#include <migraphx/dyn_output.hpp>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/streamutils.hpp>
 #include <migraphx/literal.hpp>
@@ -61,13 +62,36 @@ struct gather
    shape normalize_compute_shape(std::vector<shape> inputs) const
    {
-        check_shapes{inputs, *this}.has(2);
+        check_shapes{inputs, *this, true}.has(2);
-        auto lens = inputs[0].lens();
+        shape data    = inputs[0];
-        auto type = inputs[0].type();
+        shape indices = inputs[1];
+        auto type     = data.type();
+        // If index_dims is dynamic, convert the data to dynamic too.
+        if(indices.dynamic())
+        {
+            data = data.to_dynamic();
+        }
+        if(data.dynamic())
+        {
+            auto dims = data.dyn_dims();
+            dims.erase(dims.begin() + axis);
+            if(not indices.scalar())
+            {
+                auto index_dims = indices.to_dynamic().dyn_dims();
+                dims.insert(dims.begin() + axis, index_dims.begin(), index_dims.end());
+            }
+            return {type, dims};
+        }
+        else
+        {
+            // Both data and indices are static.  indices may be scalar
+            auto lens = data.lens();
            lens.erase(lens.begin() + axis);
-        if(not inputs[1].scalar())
+            if(not indices.scalar())
            {
-            auto ind_lens = inputs[1].lens();
+                auto ind_lens = indices.lens();
                lens.insert(lens.begin() + axis, ind_lens.begin(), ind_lens.end());
            }
@@ -79,17 +103,18 @@ struct gather
            return {type, lens};
        }
+    }
-    argument compute(const shape& output_shape, std::vector<argument> args) const
+    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
    {
-        argument result{output_shape};
+        argument result{dyn_out.computed_shape};
        // negative axis means counting dimensions from back
        auto lens                 = args[0].get_shape().lens();
        std::size_t axis_dim_size = lens[axis];
        // max dimension in axis
        visit_all(result, args[0])([&](auto output, auto data) {
            args[1].visit([&](auto indices) {
-                if(output_shape.scalar())
+                if(dyn_out.computed_shape.scalar())
                {
                    auto in_index = indices.front();
                    in_index      = (in_index < 0) ? in_index + axis_dim_size : in_index;

--- a/src/include/migraphx/op/quant_convolution.hpp
+++ b/src/include/migraphx/op/quant_convolution.hpp
@@ -25,8 +25,10 @@
 #define MIGRAPHX_GUARD_OPERATORS_QUANT_CONVOLUTION_HPP
 #include <migraphx/op/common.hpp>
+#include <migraphx/argument.hpp>
 #include <migraphx/check_shapes.hpp>
 #include <migraphx/config.hpp>
+#include <migraphx/convolution.hpp>
 #include <migraphx/value.hpp>
 #include <cmath>
 #include <utility>
@@ -114,6 +116,17 @@ struct quant_convolution
        check_attribute_size();
        return stride.size();
    }
+    argument compute(shape output_shape, std::vector<argument> args) const
+    {
+        argument result{output_shape};
+        result.visit([&](auto output) {
+            visit_all(args[0], args[1])([&](auto input, auto weights) {
+                migraphx::convolution(output, input, weights, padding, stride, group);
+            });
+        });
+        return result;
+    }
 };
 } // namespace op

--- a/src/include/migraphx/optimize_module.hpp
+++ b/src/include/migraphx/optimize_module.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_RTGLIB_OPTIMIZE_MODULE_HPP
+#define MIGRAPHX_GUARD_RTGLIB_OPTIMIZE_MODULE_HPP
+#include <string>
+#include <migraphx/instruction_ref.hpp>
+#include <migraphx/config.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+struct module_pass_manager;
+/**
+ * Runs several passes in a loop
+ */
+struct optimize_module
+{
+    std::string name() const { return "optimize_module"; }
+    void apply(module_pass_manager& mpm) const;
+};
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/module.cpp
+++ b/src/module.cpp
@@ -822,7 +822,8 @@ static void print_make_op(std::ostream& os, const operation& op)
 static void print_py_shape(std::ostream& os, const migraphx::shape& s)
 {
-    os << "migraphx.shape(" << s.type_string() << ", lens=" << to_json_string(s.lens());
+    os << "migraphx.shape(type=" << to_json_string(s.type_string())
+       << ", lens=" << to_json_string(s.lens());
    if(not s.standard())
        os << ", strides=" << to_json_string(s.strides());
    os << ")";

--- a/src/onnx/parse_gemm.cpp
+++ b/src/onnx/parse_gemm.cpp
@@ -90,41 +90,45 @@ struct parse_gemm : op_parser<parse_gemm>
                    ? info.add_instruction(make_op("transpose", {{"permutation", perm}}), args[1])
                    : args[1];
-        auto ret = info.add_instruction(make_op("dot"), a_arg, b_arg);
+        auto dot_ins = info.add_instruction(make_op("dot"), a_arg, b_arg);
        if(args.size() == 3)
        {
-            // TODO: support dynamic C input
+            if(not float_equal(beta, 0.0f))
-            if(std::any_of(args.cbegin(), args.cend(), [](auto in_arg) {
-                   return in_arg->get_shape().dynamic();
-               }))
            {
-                MIGRAPHX_THROW("PARSE_GEMM: C input not handled for dynamic input shapes");
+                auto c_arg = args[2];
+                if(dot_ins->get_shape().dynamic())
+                {
+                    c_arg = info.add_instruction(make_op("multibroadcast"), args[2], dot_ins);
                }
-            if(not float_equal(beta, 0.0f) and args[2]->get_shape().elements() > 0)
+                else
                {
                    auto out_lens   = a_arg->get_shape().lens();
                    out_lens.back() = b_arg->get_shape().lens().back();
-                auto c_arg      = args[2];
                    auto c_lens     = c_arg->get_shape().lens();
-                if(not std::equal(out_lens.begin(), out_lens.end(), c_lens.begin(), c_lens.end()))
+                    if(not std::equal(
+                           out_lens.begin(), out_lens.end(), c_lens.begin(), c_lens.end()))
                    {
                        c_arg = info.add_instruction(
                            make_op("multibroadcast", {{"out_lens", out_lens}}), args[2]);
                    }
+                }
+                if(not float_equal(beta, 1.0f))
+                {
                    auto beta_literal = info.add_literal(beta);
-                auto beta_c       = info.add_broadcastable_binary_op("mul", c_arg, beta_literal);
+                    c_arg = info.add_broadcastable_binary_op("mul", c_arg, beta_literal);
-                if(beta_c->get_shape().type() != dot_type)
+                    if(c_arg->get_shape().type() != dot_type)
                    {
-                    beta_c = info.add_instruction(make_op("convert", {{"target_type", dot_type}}),
+                        c_arg = info.add_instruction(
-                                                  beta_c);
+                            make_op("convert", {{"target_type", dot_type}}), c_arg);
+                    }
                }
-                return info.add_instruction(make_op("add"), ret, beta_c);
+                return info.add_instruction(make_op("add"), dot_ins, c_arg);
            }
        }
+        return dot_ins;
-        return ret;
    }
 };

--- a/src/onnx/parse_trilu.cpp
+++ b/src/onnx/parse_trilu.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/onnx/op_parser.hpp>
+#include <migraphx/onnx/checks.hpp>
+#include <migraphx/ranges.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/make_op.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace onnx {
+struct parse_trilu : op_parser<parse_trilu>
+{
+    std::vector<op_desc> operators() const { return {{"Trilu"}}; }
+    instruction_ref parse(const op_desc&,
+                          const onnx_parser&,
+                          const onnx_parser::node_info& info,
+                          std::vector<instruction_ref> args) const
+    {
+        auto input_shape = args[0]->get_shape();
+        assert(input_shape.ndim() >= 2);
+        auto input_lens = input_shape.lens();
+        size_t num_rows = *(input_lens.rbegin() + 1);
+        size_t num_cols = input_lens.back();
+        int k           = 0;
+        bool upper      = true;
+        if(args.size() > 1)
+        {
+            auto arg_k = args[1]->eval();
+            check_arg_empty(arg_k, "PARSE_TRILU: dynamic k not supported");
+            k = arg_k.at<int>();
+        }
+        if(k < 0)
+            MIGRAPHX_THROW("PARSE_TRILU: negative k values not supported");
+        if(contains(info.attributes, "upper"))
+        {
+            upper = static_cast<bool>(info.attributes.at("upper").i());
+        }
+        shape::type_t output_type = args[0]->get_shape().type();
+        // when creating the mask, if upper == 1,
+        // the inner triangle will have values set to 0
+        std::vector<bool> mask_mat(num_rows * num_cols, upper);
+        for(size_t i = 0; i < num_rows; i++)
+        {
+            for(size_t j = 0; j < std::min(k, static_cast<int>(num_cols)); j++)
+            {
+                mask_mat[i * num_cols + j] = not upper;
+            }
+            k++;
+        }
+        auto mask = info.add_literal(
+            migraphx::literal{migraphx::shape{output_type, {num_rows, num_cols}}, mask_mat});
+        return info.add_broadcastable_binary_op("mul", mask, args[0]);
+    }
+};
+} // namespace onnx
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/optimize_module.cpp
+++ b/src/optimize_module.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/optimize_module.hpp>
+#include <migraphx/pass_manager.hpp>
+#include <migraphx/simplify_reshapes.hpp>
+#include <migraphx/simplify_algebra.hpp>
+#include <migraphx/eliminate_common_subexpression.hpp>
+#include <migraphx/dead_code_elimination.hpp>
+#include <migraphx/propagate_constant.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+void optimize_module::apply(module_pass_manager& mpm) const
+{
+    for(int i = 0; i < 2; i++)
+    {
+        mpm.run_pass(simplify_reshapes{});
+        mpm.run_pass(simplify_algebra{});
+        mpm.run_pass(eliminate_common_subexpression{});
+        mpm.run_pass(dead_code_elimination{});
+        mpm.run_pass(propagate_constant{});
+        mpm.run_pass(dead_code_elimination{});
+    }
+}
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/simplify_algebra.cpp
+++ b/src/simplify_algebra.cpp
@@ -31,6 +31,7 @@
 #include <migraphx/op/reshape.hpp>
 #include <migraphx/op/transpose.hpp>
 #include <migraphx/matcher.hpp>
+#include <migraphx/common.hpp>
 #include <migraphx/literal.hpp>
 #include <migraphx/make_op.hpp>
 #include <migraphx/serialize.hpp>
@@ -51,8 +52,9 @@ auto op_lit_broadcast(std::string op, std::string x, std::string y)
 auto conv_const_weights()
 {
-    return match::name("convolution")(match::used_once(),
+    return match::name("convolution")(
-                                      match::args(match::any(), match::is_constant().bind("w")));
+        match::used_once(),
+        match::args(match::none_of(match::is_constant()), match::is_constant().bind("w")));
 }
 auto reduction() { return match::name_contains("reduce"); }
@@ -267,6 +269,32 @@ struct find_dot_add
    }
 };
+struct find_conv_add
+{
+    auto matcher() const
+    {
+        auto add = match::name("add")(
+            match::either_arg(0, 1)(match::any().bind("x"),
+                                    match::any_of(match::is_constant()).bind("a")),
+            match::used_once());
+        return match::name("convolution")(match::used_once(),
+                                          match::args(add, match::is_constant().bind("w")));
+    }
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins   = r.result;
+        auto a_ins = r.instructions["a"];
+        auto x_ins = r.instructions["x"];
+        auto w_ins = r.instructions["w"];
+        auto conv1 = m.insert_instruction(ins, ins->get_operator(), a_ins, w_ins);
+        auto conv2 = m.insert_instruction(ins, ins->get_operator(), x_ins, w_ins);
+        m.replace_instruction(ins, make_op("add"), conv1, conv2);
+    }
+};
 struct find_add_lit_broadcast
 {
    auto matcher() const
@@ -340,12 +368,18 @@ struct find_inner_broadcast
                       std::back_inserter(inputs),
                       [](auto i) { return i->inputs().front(); });
        if(std::any_of(inputs.begin(), inputs.end(), [&](auto i) {
-               return i->get_shape() != inputs.front()->get_shape();
+               return i->get_shape() != inputs.front()->get_shape() and
+                      i->get_shape().elements() != 1;
           }))
            return;
-        auto op = m.insert_instruction(ins, ins->get_operator(), inputs);
+        auto b_it = std::find_if(broadcasts.begin(), broadcasts.end(), [&](auto i) {
-        m.replace_instruction(ins, broadcasts.front()->get_operator(), op);
+            return not i->get_shape().scalar();
+        });
+        if(b_it == broadcasts.end())
+            b_it = broadcasts.begin();
+        auto op = insert_common_op(m, ins, ins->get_operator(), inputs);
+        m.replace_instruction(ins, (*b_it)->get_operator(), op);
    }
 };
@@ -1232,6 +1266,7 @@ void simplify_algebra::apply(module& m) const
                            find_neg_unit_ops{},
                            find_zero_ops{},
                            find_dot_add{},
+                            find_conv_add{},
                            find_div_const{},
                            find_sub_const{},
                            find_rsqrt{},

--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
-#####################################################################################
+# ####################################################################################
 # The MIT License (MIT)
 #
 # Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
@@ -20,7 +20,7 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
-#####################################################################################
+# ####################################################################################
 list(APPEND CMAKE_PREFIX_PATH /opt/rocm /opt/rocm/hip /opt/rocm/hcc)
 find_package(miopen)
@@ -33,6 +33,8 @@ if(NOT TARGET MIOpen)
    message(SEND_ERROR "Cant find miopen")
 endif()
+set(MIGRAPHX_USE_HIPRTC OFF CACHE BOOL "Use hipRTC APIs")
 include(Embed)
 file(GLOB KERNEL_FILES ${CONFIGURE_DEPENDS}
    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/migraphx/kernels/*.hpp)
@@ -46,6 +48,7 @@ add_library(compile_for_gpu INTERFACE)
 target_compile_options(compile_for_gpu INTERFACE -std=c++17 -fno-gpu-rdc -Wno-cuda-compat -Wno-unused-command-line-argument -Xclang -fallow-half-arguments-and-returns)
 target_link_libraries(compile_for_gpu INTERFACE hip::device -fno-gpu-rdc -Wno-invalid-command-line-argument -Wno-unused-command-line-argument -Wno-option-ignored)
 check_cxx_compiler_flag("--cuda-host-only -fhip-lambda-host-device -x hip" HAS_HIP_LAMBDA_HOST_DEVICE)
 if(HAS_HIP_LAMBDA_HOST_DEVICE)
    message(STATUS "Enable -fhip-lambda-host-device")
    target_compile_options(compile_for_gpu INTERFACE -fhip-lambda-host-device)
@@ -60,11 +63,13 @@ target_include_directories(migraphx_device PUBLIC $<BUILD_INTERFACE:${CMAKE_CURR
 target_include_directories(migraphx_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/device/include>)
 add_library(kernel_file_check EXCLUDE_FROM_ALL)
 foreach(KERNEL_FILE ${KERNEL_FILES})
    get_filename_component(KERNEL_BASE_FILE ${KERNEL_FILE} NAME_WE)
    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/kernels/include/migraphx/kernels/${KERNEL_BASE_FILE}.cpp "#include <migraphx/kernels/${KERNEL_BASE_FILE}.hpp>\n")
    target_sources(kernel_file_check PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/kernels/include/migraphx/kernels/${KERNEL_BASE_FILE}.cpp)
 endforeach()
 target_compile_definitions(kernel_file_check PRIVATE -DMIGRAPHX_NLOCAL=256)
 target_include_directories(kernel_file_check PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/>)
 target_link_libraries(kernel_file_check compile_for_gpu)
@@ -127,6 +132,7 @@ function(register_migraphx_gpu_ops PREFIX)
        register_op(migraphx_gpu HEADER migraphx/gpu/${OP}.hpp OPERATORS gpu::${PREFIX}${OP} INCLUDES migraphx/gpu/context.hpp)
    endforeach()
 endfunction()
 register_migraphx_gpu_ops(hip_
    argmax
    argmin
@@ -166,15 +172,9 @@ register_op(migraphx_gpu HEADER migraphx/gpu/convolution.hpp
 rocm_set_soversion(migraphx_gpu ${MIGRAPHX_SO_VERSION})
 rocm_clang_tidy_check(migraphx_gpu)
-# look for offload bundler
 get_filename_component(CMAKE_CXX_COMPILER_PATH "${CMAKE_CXX_COMPILER}" PATH)
-if(CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+$")
-    find_program(MIGRAPHX_OFFLOADBUNDLER_BIN clang-offload-bundler
+if(NOT CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+$")
-        HINTS ${CMAKE_CXX_COMPILER_PATH}
-        PATH_SUFFIXES bin
-        PATHS /opt/rocm/llvm
-    )
-else()
    find_program(MIGRAPHX_EXTRACT_KERNEL extractkernel
        PATH_SUFFIXES bin
        HINTS ${CMAKE_CXX_COMPILER_PATH}
@@ -185,10 +185,10 @@ else()
    )
 endif()
-message(STATUS "clang-offload-bundler: ${MIGRAPHX_OFFLOADBUNDLER_BIN}")
 message(STATUS "extractkernel: ${MIGRAPHX_EXTRACT_KERNEL}")
 set(MIGRAPHX_ENABLE_MLIR OFF CACHE BOOL "")
 if(MIGRAPHX_ENABLE_MLIR)
    # Find package rocMLIR
    find_package(rocMLIR 1.0.0 CONFIG REQUIRED)
@@ -197,40 +197,43 @@ if(MIGRAPHX_ENABLE_MLIR)
    target_link_libraries(migraphx_gpu PUBLIC rocMLIR::rockCompiler)
 endif()
-set(MIGRAPHX_USE_HIPRTC OFF CACHE BOOL "")
 if(MIGRAPHX_USE_HIPRTC)
-target_compile_definitions(migraphx_gpu PRIVATE -DMIGRAPHX_USE_HIPRTC=1)
+    message(STATUS "MIGraphX is using hipRTC")
+    target_compile_definitions(migraphx_gpu PRIVATE -DMIGRAPHX_USE_HIPRTC=1)
 else()
-# Get flags needed to compile hip
+    message(STATUS "MIGraphX is using HIP Clang")
-include(TargetFlags)
-target_flags(HIP_COMPILER_FLAGS hip::device)
+    # Get flags needed to compile hip
-# Remove cuda arch flags
+    include(TargetFlags)
-string(REGEX REPLACE --cuda-gpu-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
+    target_flags(HIP_COMPILER_FLAGS hip::device)
-string(REGEX REPLACE --offload-arch=[a-z0-9:+-]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
-# Skip library paths since hip will incorrectly treat it as a source file
+    # Remove cuda arch flags
-string(APPEND HIP_COMPILER_FLAGS " ")
+    string(REGEX REPLACE --cuda-gpu-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
-# Add ck includes
+    string(REGEX REPLACE --offload-arch=[a-z0-9:+-]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
-find_path(CK_INCLUDE_PATH ck/ck.hpp)
-message(STATUS "CK path: ${CK_INCLUDE_PATH}")
+    # Skip library paths since hip will incorrectly treat it as a source file
-string(APPEND HIP_COMPILER_FLAGS " -isystem ${CK_INCLUDE_PATH}")
+    string(APPEND HIP_COMPILER_FLAGS " ")
-foreach(_unused RANGE 2)
+    # Add ck includes
+    find_path(CK_INCLUDE_PATH ck/ck.hpp)
+    message(STATUS "CK path: ${CK_INCLUDE_PATH}")
+    string(APPEND HIP_COMPILER_FLAGS " -isystem ${CK_INCLUDE_PATH}")
+    foreach(_unused RANGE 2)
        string(REGEX REPLACE " /[^ ]+\\.(a|so) " " " HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
-endforeach()
+    endforeach()
-message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}")
+    message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}")
-target_compile_definitions(migraphx_gpu PRIVATE 
+    target_compile_definitions(migraphx_gpu PRIVATE
        "-DMIGRAPHX_HIP_COMPILER=${CMAKE_CXX_COMPILER}"
        "-DMIGRAPHX_HIP_COMPILER_FLAGS=${HIP_COMPILER_FLAGS}"
-    "-DMIGRAPHX_OFFLOADBUNDLER_BIN=${MIGRAPHX_OFFLOADBUNDLER_BIN}"
        "-DMIGRAPHX_EXTRACT_KERNEL=${MIGRAPHX_EXTRACT_KERNEL}"
-    "-DMIGRAPHX_USE_HIPRTC=0"
+    )
-)
-if(DEFINED CMAKE_CXX_COMPILER_LAUNCHER)
-execute_process(COMMAND which ${CMAKE_CXX_COMPILER_LAUNCHER} OUTPUT_VARIABLE MIGRAPHX_HIP_COMPILER_LAUNCHER)
-string(STRIP "${MIGRAPHX_HIP_COMPILER_LAUNCHER}" MIGRAPHX_HIP_COMPILER_LAUNCHER)
-target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_HIP_COMPILER_LAUNCHER=${MIGRAPHX_HIP_COMPILER_LAUNCHER}")
-endif()
+    if(DEFINED CMAKE_CXX_COMPILER_LAUNCHER)
+        execute_process(COMMAND which ${CMAKE_CXX_COMPILER_LAUNCHER} OUTPUT_VARIABLE MIGRAPHX_HIP_COMPILER_LAUNCHER)
+        string(STRIP "${MIGRAPHX_HIP_COMPILER_LAUNCHER}" MIGRAPHX_HIP_COMPILER_LAUNCHER)
+        target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_HIP_COMPILER_LAUNCHER=${MIGRAPHX_HIP_COMPILER_LAUNCHER}")
+    endif()
 endif()
 # Check miopen find mode api
@@ -268,4 +271,3 @@ rocm_install_targets(
    INCLUDE
    ${CMAKE_CURRENT_SOURCE_DIR}/include
 )
--- a/src/targets/gpu/compile_hip.cpp
+++ b/src/targets/gpu/compile_hip.cpp
@@ -29,10 +29,9 @@
 #include <cassert>
 #include <iostream>
-#if MIGRAPHX_USE_HIPRTC
+#ifdef MIGRAPHX_USE_HIPRTC
 #include <hip/hiprtc.h>
 #include <migraphx/manage_ptr.hpp>
-#include <migraphx/env.hpp>
 #else
 #include <migraphx/compile_src.hpp>
 #include <migraphx/process.hpp>
@@ -48,9 +47,10 @@ MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_OPTIMIZE);
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_ASM);
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_GPU_DUMP_SRC);
-#if MIGRAPHX_USE_HIPRTC
+#ifdef MIGRAPHX_USE_HIPRTC
-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_HIPRTC)
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_HIPRTC);
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS);
 std::string hiprtc_error(hiprtcResult err, const std::string& msg)
 {
@@ -144,24 +144,28 @@ struct hiprtc_program
                       std::back_inserter(c_options),
                       [](const std::string& s) { return s.c_str(); });
        auto result   = hiprtcCompileProgram(prog.get(), c_options.size(), c_options.data());
-        std::cerr << log() << std::endl;
+        auto prog_log = log();
+        if(not prog_log.empty())
+        {
+            std::cerr << prog_log << std::endl;
+        }
        if(result != HIPRTC_SUCCESS)
            MIGRAPHX_HIPRTC_THROW(result, "Compilation failed.");
    }
-    std::string log()
+    std::string log() const
    {
        std::size_t n = 0;
        MIGRAPHX_HIPRTC(hiprtcGetProgramLogSize(prog.get(), &n));
-        if(n < 2)
+        if(n == 0)
            return {};
-        std::vector<char> buffer(n);
+        std::string buffer(n, '\0');
        MIGRAPHX_HIPRTC(hiprtcGetProgramLog(prog.get(), buffer.data()));
-        assert(buffer.back() == 0);
+        assert(buffer.back() != 0);
-        return {buffer.begin(), buffer.end() - 1};
+        return buffer;
    }
-    std::vector<char> get_code_obj()
+    std::vector<char> get_code_obj() const
    {
        std::size_t n = 0;
        MIGRAPHX_HIPRTC(hiprtcGetCodeSize(prog.get(), &n));
@@ -176,6 +180,17 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
 {
    hiprtc_program prog(srcs);
    auto options = split_string(params, ' ');
+    options.push_back("-DMIGRAPHX_USE_HIPRTC=1");
+    // remove following three compilation flags for HIPRTC once fixes from hipRTC are available in
+    if(enabled(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS{}))
+    {
+        options.push_back("-DMIGRAPHX_HAS_DPP=0");
+        options.push_back("-DMIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS=1");
+        options.push_back("-Wno-reserved-identifier");
+        options.push_back("-Wno-gnu-line-marker");
+        options.push_back("-Wno-old-style-cast");
+    }
    if(enabled(MIGRAPHX_GPU_DEBUG{}))
        options.push_back("-DMIGRAPHX_DEBUG");
    if(std::none_of(options.begin(), options.end(), [](const std::string& s) {
@@ -183,7 +198,7 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
       }))
        options.push_back("-std=c++17");
    options.push_back("-fno-gpu-rdc");
-    options.push_back(" -O" + string_value_of(MIGRAPHX_GPU_OPTIMIZE{}, "3"));
+    options.push_back("-O" + string_value_of(MIGRAPHX_GPU_OPTIMIZE{}, "3"));
    options.push_back("-Wno-cuda-compat");
    options.push_back("--offload-arch=" + arch);
    prog.compile(options);
@@ -292,6 +307,8 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
    return {compiler.compile(srcs)};
 }
+#endif // MIGRAPHX_USE_HIPRTC
 std::string enum_params(std::size_t count, std::string param)
 {
    std::vector<std::string> items(count);
@@ -299,8 +316,6 @@ std::string enum_params(std::size_t count, std::string param)
    return join_strings(items, ",");
 }
-#endif // MIGRAPHX_USE_HIPRTC
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
--- a/src/targets/gpu/compile_hip_code_object.cpp
+++ b/src/targets/gpu/compile_hip_code_object.cpp
@@ -29,7 +29,6 @@
 #include <migraphx/context.hpp>
 #include <migraphx_kernels.hpp>
 #include <migraphx/stringutils.hpp>
-#include <hip/hip_runtime_api.h>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -80,6 +79,7 @@ std::string generate_args_hpp(const std::vector<shape>& inputs)
 #include <migraphx/kernels/args.hpp>
 #include <migraphx/kernels/tensor_view.hpp>
+#include <migraphx/kernels/types.hpp>
 namespace migraphx {

--- a/src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/reduce.hpp
@@ -36,6 +36,7 @@ namespace gpu {
 namespace device {
 #ifdef MIGRAPHX_NO_DPP
 template <index_int N,
          class Op,
          class T,
@@ -62,6 +63,7 @@ __device__ auto block_reduce(index idx, Op op, T init, ForStride fs, F f)
    }
    return buffer[0];
 }
 #else
 constexpr unsigned int dpp_row_shr(unsigned int x) { return 0x110u | x; }
@@ -96,11 +98,7 @@ __device__ T dpp_mov(T& x)
    input.data = x;
    for(index_int i = 0; i < n; i++)
    {
-#if defined(__HCC__)
-        output.reg[i] = __llvm_amdgcn_move_dpp(input.reg[i], DppCtrl, RowMask, BankMask, BoundCtrl);
-#else
        output.reg[i] = __hip_move_dpp(input.reg[i], DppCtrl, RowMask, BankMask, BoundCtrl);
-#endif
    }
    return output.data;
 }
@@ -310,4 +308,4 @@ void reduce(hipStream_t stream,
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
-#endif
+#endif // MIGRAPHX_NO_DPP
--- a/src/targets/gpu/include/migraphx/gpu/convolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/convolution.hpp
@@ -21,8 +21,8 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_CONVOLUTION_HPP
+#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_CONVOLUTION_HPP
-#define MIGRAPHX_GUARD_RTGLIB_CONVOLUTION_HPP
+#define MIGRAPHX_GUARD_RTGLIB_GPU_CONVOLUTION_HPP
 #include <migraphx/shape.hpp>
 #include <migraphx/generate.hpp>