gaoqiong / MIGraphX / Commits / a5c87ec5

Commit a5c87ec5, authored Jan 17, 2023 by Paul
Merge branch 'develop' into jit-reduce-reg
Parents: dae94657, 3f49f8eb
Changes: 57 files in total; this page shows 20 changed files with 515 additions and 102 deletions (+515 -102).
.github/workflows/add-to-project.yaml                            +19 -0
Dockerfile                                                        +1 -1
Jenkinsfile                                                      +18 -8
src/include/migraphx/op/pad.hpp                                  +21 -10
src/include/migraphx/op/reduce_op.hpp                            +39 -15
src/include/migraphx/op/reshape.hpp                              +71 -7
src/include/migraphx/shape.hpp                                    +7 -0
src/onnx/onnx_parser.cpp                                         +13 -3
src/onnx/parse_matmul.cpp                                        +58 -34
src/onnx/parse_pad.cpp                                            +6 -0
src/onnx/parse_reduce_op.cpp                                      +1 -2
src/onnx/parse_reshape.cpp                                        +1 -1
src/shape.cpp                                                    +42 -0
src/simplify_algebra.cpp                                         +14 -2
src/simplify_reshapes.cpp                                        +11 -4
src/targets/gpu/jit/gather.cpp                                   +89 -0
src/targets/gpu/jit/reduce.cpp                                   +13 -4
src/targets/gpu/kernels/include/migraphx/kernels/functional.hpp   +8 -0
src/targets/gpu/kernels/include/migraphx/kernels/gather.hpp      +64 -0
src/targets/gpu/kernels/include/migraphx/kernels/math.hpp        +19 -11
.github/workflows/add-to-project.yaml (new file, mode 100644)

name: Add items to GH project

on:
  pull_request:
    types:
      - opened
  issues:
    types:
      - opened

jobs:
  add-to-project:
    name: Add PRs and issues to MIGX project
    runs-on: ubuntu-latest
    steps:
      - uses: actions/add-to-project@v0.4.0
        with:
          project-url: https://github.com/orgs/ROCmSoftwarePlatform/projects/20
          github-token: ${{ secrets.TEST_PR_WORKFLOW }}
Dockerfile

@@ -87,7 +87,7 @@ RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXR
 ADD tools/build_and_test_onnxrt.sh /onnxruntime/build_and_test_onnxrt.sh

-RUN cget -p /usr/local install ROCmSoftwarePlatform/rocMLIR@0f38fb33f518b53b94b541feb9b079668c5518e8 -DBUILD_MIXR_TARGET=On -DLLVM_ENABLE_ZSTD=Off -DLLVM_ENABLE_THREADS=Off
+RUN cget -p /usr/local install ROCmSoftwarePlatform/rocMLIR@78b706fe9879587ab98b6614ae539265374a3fae -DBUILD_MIXR_TARGET=On -DLLVM_ENABLE_ZSTD=Off -DLLVM_ENABLE_THREADS=Off

 ENV MIOPEN_FIND_DB_PATH=/tmp/miopen/find-db
 ENV MIOPEN_USER_DB_PATH=/tmp/miopen/user-db
Jenkinsfile

@@ -11,16 +11,22 @@ def rocmtestnode(Map conf) {
     def image = 'migraphxlib'
     env.CCACHE_COMPRESSLEVEL = 7
     env.CCACHE_DIR = ccache
-    def cmake_build = { compiler, flags ->
+    def cmake_build = { bconf ->
+        def compiler = bconf.get("compiler", "/opt/rocm/llvm/bin/clang++")
+        def flags = bconf.get("flags", "")
+        def gpu_debug = bconf.get("gpu_debug", "0")
         def cmd = """
-            env
             ulimit -c unlimited
            echo "leak:dnnl::impl::malloc" > suppressions.txt
            export LSAN_OPTIONS="suppressions=\$(pwd)/suppressions.txt"
+            export MIGRAPHX_GPU_DEBUG=${gpu_debug}
+            export CXX=${compiler}
+            export CXXFLAGS='-Werror'
+            env
            rm -rf build
            mkdir build
            cd build
-            CXX=${compiler} CXXFLAGS='-Werror' cmake -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache ${flags} ..
+            cmake -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache ${flags} ..
            make -j\$(nproc) generate all doc package check VERBOSE=1
        """
        echo cmd
@@ -93,28 +99,32 @@ rocmtest clang_debug: rocmnode('vega') { cmake_build ->
     stage('Hip Clang Debug') {
         def sanitizers = "undefined"
         def debug_flags = "-g -O2 -fsanitize=${sanitizers} -fno-sanitize-recover=${sanitizers}"
-        cmake_build("/opt/rocm/llvm/bin/clang++", "-DCMAKE_BUILD_TYPE=debug -DMIGRAPHX_ENABLE_PYTHON=Off -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}' -DCMAKE_C_FLAGS_DEBUG='${debug_flags}'")
+        cmake_build(flags: "-DCMAKE_BUILD_TYPE=debug -DMIGRAPHX_ENABLE_PYTHON=Off -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}' -DCMAKE_C_FLAGS_DEBUG='${debug_flags}'")
     }
+}, clang_gpu_debug: rocmnode('vega') { cmake_build ->
+    stage('Hip Clang GPU Debug') {
+        cmake_build(flags: "-DCMAKE_BUILD_TYPE=release", gpu_debug: true)
+    }
 }, clang_release: rocmnode('vega') { cmake_build ->
     stage('Hip Clang Release') {
-        cmake_build("/opt/rocm/llvm/bin/clang++", "-DCMAKE_BUILD_TYPE=release")
+        cmake_build(flags: "-DCMAKE_BUILD_TYPE=release")
         stash includes: 'build/*.deb', name: 'migraphx-package'
     }
 }, mlir_debug: rocmnode('vega') { cmake_build ->
     stage('MLIR Debug') {
         def sanitizers = "undefined"
         def debug_flags = "-g -O2 -fsanitize=${sanitizers} -fno-sanitize-recover=${sanitizers}"
-        cmake_build("/opt/rocm/llvm/bin/clang++", "-DCMAKE_BUILD_TYPE=debug -DMIGRAPHX_ENABLE_PYTHON=Off -DMIGRAPHX_ENABLE_MLIR=On -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}' -DCMAKE_C_FLAGS_DEBUG='${debug_flags}'")
+        cmake_build(flags: "-DCMAKE_BUILD_TYPE=debug -DMIGRAPHX_ENABLE_PYTHON=Off -DMIGRAPHX_ENABLE_MLIR=On -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}' -DCMAKE_C_FLAGS_DEBUG='${debug_flags}'")
     }
 }, clang_asan: rocmnode('nogpu') { cmake_build ->
     stage('Clang ASAN') {
         def sanitizers = "undefined,address"
         def debug_flags = "-g -O2 -fno-omit-frame-pointer -fsanitize=${sanitizers} -fno-sanitize-recover=${sanitizers}"
-        cmake_build("/opt/rocm/llvm/bin/clang++", "-DCMAKE_BUILD_TYPE=debug -DMIGRAPHX_ENABLE_PYTHON=Off -DMIGRAPHX_ENABLE_GPU=Off -DMIGRAPHX_ENABLE_CPU=On -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}' -DCMAKE_C_FLAGS_DEBUG='${debug_flags}'")
+        cmake_build(flags: "-DCMAKE_BUILD_TYPE=debug -DMIGRAPHX_ENABLE_PYTHON=Off -DMIGRAPHX_ENABLE_GPU=Off -DMIGRAPHX_ENABLE_CPU=On -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}' -DCMAKE_C_FLAGS_DEBUG='${debug_flags}'")
     }
 }
 //, clang_release_navi: rocmnode('navi21') { cmake_build ->
 //    stage('HIP Clang Release Navi') {
-//        cmake_build("/opt/rocm/llvm/bin/clang++", "-DCMAKE_BUILD_TYPE=release")
+//        cmake_build(flags: "-DCMAKE_BUILD_TYPE=release")
 //    }
 //}
src/include/migraphx/op/pad.hpp

@@ -59,18 +59,29 @@ struct pad
     std::string name() const { return "pad"; }
     shape compute_shape(std::vector<shape> inputs) const
     {
-        check_shapes{inputs, *this}.has(1);
-        auto&& idims = inputs.front().lens();
-        std::vector<std::size_t> rdims(idims.begin(), idims.end());
-        std::size_t num_dims = rdims.size();
-        for(std::size_t i = 0; i < num_dims; i++)
+        check_shapes{inputs, *this, true}.has(1);
+        const auto& s0 = inputs.front();
+        if(s0.dynamic())
         {
-            rdims[i] += pads[i] + pads[i + num_dims];
+            auto out_dyn_dims = s0.dyn_dims();
+            for(std::size_t i = 0; i < s0.ndim(); ++i)
+            {
+                out_dyn_dims[i] += pads[i] + pads[i + s0.ndim()];
+            }
+            return {s0.type(), out_dyn_dims};
+        }
+        else
+        {
+            auto&& idims = s0.lens();
+            std::vector<std::size_t> rdims(idims.begin(), idims.end());
+            std::size_t num_dims = rdims.size();
+            for(std::size_t i = 0; i < num_dims; i++)
+            {
+                rdims[i] += pads[i] + pads[i + num_dims];
+            }
+            shape s{s0.type(), rdims};
+            return s;
         }
-        shape s{inputs.front().type(), rdims};
-        return s;
     }

     std::size_t pad_ndims() const
src/include/migraphx/op/reduce_op.hpp

@@ -26,6 +26,7 @@
 #include <migraphx/op/name.hpp>
 #include <migraphx/check_shapes.hpp>
+#include <migraphx/dyn_output.hpp>
 #include <migraphx/argument.hpp>
 #include <migraphx/tensor_view.hpp>
 #include <migraphx/shape_for_each.hpp>
@@ -105,18 +106,41 @@ struct reduce_op : op_name<Derived>
         return tuned_axes;
     }

+    /**
+     * @brief returns a shape in which the axis or axes named
+     * for reduction by this op are set to size 1.
+     *
+     * @param inputs list of input shapes
+     * @return shape
+     */
     shape normalize_compute_shape(std::vector<shape> inputs) const
     {
-        check_shapes{inputs, *this}.has(1);
+        check_shapes{inputs, *this, true}.has(1);
         auto s = inputs.at(0);
-        auto lens       = s.lens();
-        auto tuned_axes = tune_axes(lens.size());
-        for(auto axis : tuned_axes)
+        if(s.dynamic())
         {
-            lens[axis] = 1;
+            auto output_dyn_dims = s.dyn_dims();
+            auto tuned_axes      = tune_axes(output_dyn_dims.size());
+            for(const auto& axis : tuned_axes)
+            {
+                // At the time of writing, there's no functional difference between
+                // optimum of 0 (no opt) or 1.
+                output_dyn_dims[axis] = {1, 1, 0};
+            }
+            return shape{s.type(), output_dyn_dims};
         }
-        return inputs[0].with_lens(lens);
+        else
+        {
+            auto lens       = s.lens();
+            auto tuned_axes = tune_axes(lens.size());
+            for(const auto& axis : tuned_axes)
+            {
+                lens[axis] = 1;
+            }
+            return inputs[0].with_lens(lens);
+        }
     }

     template <class T>
@@ -124,7 +148,7 @@ struct reduce_op : op_name<Derived>
               const std::vector<T>& in_lens,
               std::vector<T>& out_lens) const
     {
-        for(auto axis : tuned_axes)
+        for(const auto& axis : tuned_axes)
         {
             out_lens[axis] = in_lens[axis];
         }
@@ -151,17 +175,17 @@ struct reduce_op : op_name<Derived>
         static_cast<const Derived&>(*this).output(batch_shape)(val);
     }

-    argument compute(const shape& output_shape, std::vector<argument> args) const
+    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
     {
-        argument result{output_shape};
+        argument result{dyn_out.computed_shape};
         auto arg_lens   = args.front().get_shape().lens();
         auto tuned_axes = tune_axes(arg_lens.size());
-        std::vector<std::size_t> batch_lens(output_shape.lens().size(), 1);
+        std::vector<std::size_t> batch_lens(dyn_out.computed_shape.lens().size(), 1);
         tune_dims(tuned_axes, arg_lens, batch_lens);
-        shape batch_shape{output_shape.type(), batch_lens};
+        shape batch_shape{dyn_out.computed_shape.type(), batch_lens};
         visit_all(result, args[0])([&](auto output, auto input) {
-            par_for(output_shape.elements(), [&](auto i) {
-                auto out_idx = output_shape.multi(i);
+            par_for(dyn_out.computed_shape.elements(), [&](auto i) {
+                auto out_idx = dyn_out.computed_shape.multi(i);
                 this->reduce(input, batch_shape, tuned_axes, out_idx, output);
             });
         });
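To make the dynamic branch of normalize_compute_shape concrete, here is a minimal standalone sketch, using plain structs rather than the MIGraphX types, of how each reduced axis collapses to the fixed dynamic_dimension {1, 1, 0}:

// Minimal sketch: reduced axes become {min = 1, max = 1, opt = 0}.
// `dd` is an illustrative stand-in for shape::dynamic_dimension.
#include <cassert>
#include <cstddef>
#include <vector>

struct dd
{
    std::size_t min;
    std::size_t max;
    std::size_t opt; // 0 means "no optimal value"
};

std::vector<dd> reduce_dyn_dims(std::vector<dd> dims, const std::vector<std::size_t>& axes)
{
    for(auto axis : axes)
        dims[axis] = {1, 1, 0};
    return dims;
}

int main()
{
    std::vector<dd> in = {{1, 4, 2}, {8, 8, 8}, {3, 6, 0}};
    auto out = reduce_dyn_dims(in, {1, 2});
    assert(out[1].min == 1 and out[1].max == 1 and out[1].opt == 0);
    assert(out[0].min == 1 and out[0].max == 4); // untouched axis kept as-is
}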
src/include/migraphx/op/reshape.hpp

@@ -28,6 +28,7 @@
 #include <migraphx/argument.hpp>
 #include <migraphx/config.hpp>
 #include <migraphx/value.hpp>
+#include <migraphx/dyn_output.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -46,14 +47,60 @@ struct reshape
     value attributes() const { return {{"require_std_shape", true}}; }

     std::string name() const { return "reshape"; }

-    shape compute_shape(std::vector<shape> inputs) const
+    shape dyn_compute_shape(shape s0) const
+    {
+        auto dyn_dims      = s0.dyn_dims();
+        auto num_not_fixed = std::count_if(
+            dyn_dims.cbegin(), dyn_dims.cend(), [](auto dd) { return not dd.is_fixed(); });
+        if(num_not_fixed != 1)
+        {
+            MIGRAPHX_THROW("Reshape: Only supports one non-fixed dynamic_dimension");
+        }
+        // track number of fixed elements in input and output
+        std::size_t num_dims_ele = 1;
+        std::size_t num_dd_ele   = 1;
+        for(std::size_t i = 0; i < dyn_dims.size(); ++i)
+        {
+            if(dyn_dims[i].is_fixed())
+            {
+                num_dims_ele *= dims[i];
+                num_dd_ele *= dyn_dims[i].min;
+            }
+            else
+            {
+                if(dims[i] != 0 and dims[i] != -1)
+                {
+                    MIGRAPHX_THROW("Reshape: Non-fixed dynamic_dimension doesn't match with 0 or -1 "
+                                   "output dimension");
+                }
+            }
+        }
+        if(num_dims_ele != num_dd_ele)
+        {
+            MIGRAPHX_THROW("Reshape: Number of fixed elements must match. Input: " +
+                           std::to_string(num_dd_ele) +
+                           " Output: " + std::to_string(num_dims_ele));
+        }
+        // construct output dynamic shape from dims attribute
+        std::vector<shape::dynamic_dimension> output_dyn_dims(dims.size());
+        std::transform(dims.cbegin(),
+                       dims.cend(),
+                       dyn_dims.cbegin(),
+                       output_dyn_dims.begin(),
+                       [](std::size_t dim, auto dyn_dim) {
+                           if(not dyn_dim.is_fixed())
+                               return dyn_dim;
+                           return shape::dynamic_dimension{dim, dim};
+                       });
+        return {s0.type(), output_dyn_dims};
+    }
+
+    shape static_compute_shape(std::vector<shape> inputs, std::size_t n_neg_dims) const
     {
-        check_shapes{inputs, *this}.has(1).standard();
+        check_shapes{inputs, *this}.standard();
         auto&& idims = inputs.front().lens();
         std::vector<std::size_t> rdims(dims.begin(), dims.end());
-        auto n_neg_dims = std::count(dims.begin(), dims.end(), -1);
-        if(n_neg_dims > 1)
-            MIGRAPHX_THROW("Reshape: Dimensions for reshape can only have one -1 dim");
         for(std::size_t i = 0; i < dims.size(); i++)
         {
@@ -86,9 +133,26 @@ struct reshape
         return s;
     }

-    argument compute(shape output_shape, std::vector<argument> args) const
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        check_shapes{inputs, *this, true}.has(1);
+        auto n_neg_dims = std::count(dims.begin(), dims.end(), -1);
+        if(n_neg_dims > 1)
+            MIGRAPHX_THROW("Reshape: Dimensions for reshape can only have one -1 dim");
+        auto s0 = inputs[0];
+        if(s0.dynamic())
+        {
+            return dyn_compute_shape(s0);
+        }
+        else
+        {
+            return static_compute_shape(inputs, n_neg_dims);
+        }
+    }
+
+    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
     {
-        return args[0].reshape(output_shape);
+        return args[0].reshape(dyn_out.computed_shape);
     }
     std::ptrdiff_t output_alias(const std::vector<shape>&) const { return 0; }
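The validation rules in dyn_compute_shape reduce to a short checklist: exactly one non-fixed dynamic dimension, that dimension must map to a 0 or -1 entry in dims, and the products of the fixed elements on both sides must agree. A self-contained sketch of that checklist, with illustrative names rather than the MIGraphX API, and assuming the input rank equals the size of the dims attribute as the operator's loop does:

// Hypothetical stand-in for the dynamic-reshape validation above.
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <vector>

struct dd
{
    std::size_t min, max;
    bool is_fixed() const { return min == max; }
};

void check_dyn_reshape(const std::vector<dd>& in, const std::vector<int64_t>& dims)
{
    std::size_t num_not_fixed = 0, in_ele = 1, out_ele = 1;
    for(std::size_t i = 0; i < in.size(); ++i)
    {
        if(in[i].is_fixed())
        {
            in_ele *= in[i].min; // fixed input elements
            out_ele *= dims[i];  // fixed output elements
        }
        else
        {
            ++num_not_fixed;
            if(dims[i] != 0 and dims[i] != -1)
                throw std::runtime_error("non-fixed dim must map to 0 or -1");
        }
    }
    if(num_not_fixed != 1)
        throw std::runtime_error("exactly one non-fixed dynamic_dimension supported");
    if(in_ele != out_ele)
        throw std::runtime_error("number of fixed elements must match");
}

int main()
{
    // (2, [1,10], 3) reshaped with dims (6, -1, 1): fixed elements 2*3 == 6*1
    check_dyn_reshape({{2, 2}, {1, 10}, {3, 3}}, {6, -1, 1});
}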
src/include/migraphx/shape.hpp

@@ -107,6 +107,13 @@ struct shape
         friend bool operator==(const std::size_t& x, const dynamic_dimension& y);
         friend bool operator!=(const dynamic_dimension& x, const std::size_t& y);
         friend bool operator!=(const std::size_t& x, const dynamic_dimension& y);
+
+        // add and subtract fixed std::size_t dimension
+        dynamic_dimension& operator+=(const std::size_t& x);
+        dynamic_dimension& operator-=(const std::size_t& x);
+        friend dynamic_dimension operator+(const dynamic_dimension& x, const std::size_t& y);
+        friend dynamic_dimension operator+(const std::size_t& x, const dynamic_dimension& y);
+        friend dynamic_dimension operator-(const dynamic_dimension& x, const std::size_t& y);
     };

     static const std::vector<type_t>& types();
src/onnx/onnx_parser.cpp

@@ -110,9 +110,19 @@ instruction_ref onnx_parser::node_info::add_bias(const std::vector<instruction_r
 {
     if(args.size() == 3)
     {
-        auto bias_bcast = mod->add_instruction(
-            make_op("broadcast", {{"axis", axis}, {"out_lens", curr_ins->get_shape().lens()}}),
-            args[2]);
+        instruction_ref bias_bcast;
+        // if curr_ins has a dynamic output shape use 2 input broadcast
+        if(curr_ins->get_shape().dynamic())
+        {
+            bias_bcast =
+                mod->add_instruction(make_op("broadcast", {{"axis", axis}}), args[2], curr_ins);
+        }
+        else
+        {
+            bias_bcast = mod->add_instruction(
+                make_op("broadcast", {{"axis", axis}, {"out_lens", curr_ins->get_shape().lens()}}),
+                args[2]);
+        }
         return mod->add_instruction(make_op("add"), curr_ins, bias_bcast);
     }
     return curr_ins;
src/onnx/parse_matmul.cpp

@@ -43,55 +43,79 @@ struct parse_matmul : op_parser<parse_matmul>
                           const onnx_parser::node_info& info,
                           std::vector<instruction_ref> args) const
     {
-        auto l0      = args[0];
-        auto l1      = args[1];
-        auto l0_lens = l0->get_shape().lens();
-        auto l1_lens = l1->get_shape().lens();
+        auto a0 = args[0];
+        auto a1 = args[1];
+        auto s0 = a0->get_shape();
+        auto s1 = a1->get_shape();

-        // args[0] is a vector, prepend 1 to the shape
+        instruction_ref dot_res;
         bool is_a_prepended = false;
-        if(l0_lens.size() == 1)
+        bool is_b_appended  = false;
+        if(s0.ndim() == 1)
         {
             is_a_prepended = true;
-            l0_lens.insert(l0_lens.begin(), 1);
-            l0 = info.add_instruction(make_op("unsqueeze", {{"axes", {0}}}), args[0]);
+            a0 = info.add_instruction(make_op("unsqueeze", {{"axes", {0}}}), args[0]);
         }
-        bool is_b_appended = false;
-        if(l1_lens.size() == 1)
+        if(s1.ndim() == 1)
         {
             is_b_appended = true;
-            l1_lens.push_back(1);
-            l1 = info.add_instruction(make_op("unsqueeze", {{"axes", {1}}}), args[1]);
+            a1 = info.add_instruction(make_op("unsqueeze", {{"axes", {1}}}), args[1]);
         }
-        instruction_ref bl0 = l0;
-        instruction_ref bl1 = l1;
-        if(not std::equal(
-               l0_lens.rbegin() + 2, l0_lens.rend(), l1_lens.rbegin() + 2, l1_lens.rend()))
+        if(s0.dynamic() or s1.dynamic())
         {
-            auto l0_it = l0_lens.begin() + l0_lens.size() - 2;
-            std::vector<std::size_t> l0_broadcasted_lens(l0_lens.begin(), l0_it);
-            auto l1_it = l1_lens.begin() + l1_lens.size() - 2;
-            std::vector<std::size_t> l1_broadcasted_lens(l1_lens.begin(), l1_it);
-            auto output_lens = compute_broadcasted_lens(l0_broadcasted_lens, l1_broadcasted_lens);
-            l0_broadcasted_lens = output_lens;
-            l0_broadcasted_lens.insert(l0_broadcasted_lens.end(), l0_it, l0_lens.end());
-            l1_broadcasted_lens = output_lens;
-            l1_broadcasted_lens.insert(l1_broadcasted_lens.end(), l1_it, l1_lens.end());
-            if(l0_lens != l0_broadcasted_lens)
+            if(opd.op_name == "quant_dot")
             {
-                bl0 = info.add_instruction(
-                    make_op("multibroadcast", {{"out_lens", l0_broadcasted_lens}}), l0);
+                MIGRAPHX_THROW("PARSE_MATMUL: dynamic MatMulInteger not supported");
             }
-            if(l1_lens != l1_broadcasted_lens)
+            auto s0_dds = a0->get_shape().to_dynamic().dyn_dims();
+            auto s1_dds = a1->get_shape().to_dynamic().dyn_dims();
+            // TODO: handling this case requires a new multibroadcast mode
+            if(not std::equal(
+                   s0_dds.rbegin() + 2, s0_dds.rend(), s1_dds.rbegin() + 2, s1_dds.rend()))
             {
-                bl1 = info.add_instruction(
-                    make_op("multibroadcast", {{"out_lens", l1_broadcasted_lens}}), l1);
+                MIGRAPHX_THROW("PARSE_MATMUL: dynamic shape broadcasting not supported");
             }
+            dot_res = info.add_instruction(make_op(opd.op_name), a0, a1);
         }
-        instruction_ref dot_res = info.add_instruction(make_op(opd.op_name), bl0, bl1);
-        int64_t num_axis = static_cast<int64_t>(dot_res->get_shape().lens().size());
+        else
+        {
+            auto s0_lens        = a0->get_shape().lens();
+            auto s1_lens        = a1->get_shape().lens();
+            instruction_ref ba0 = a0;
+            instruction_ref ba1 = a1;
+            // try broadcasting if dimensions other than last two do not match
+            if(not std::equal(
+                   s0_lens.rbegin() + 2, s0_lens.rend(), s1_lens.rbegin() + 2, s1_lens.rend()))
+            {
+                auto l0_it = s0_lens.begin() + s0_lens.size() - 2;
+                std::vector<std::size_t> l0_broadcasted_lens(s0_lens.begin(), l0_it);
+                auto l1_it = s1_lens.begin() + s1_lens.size() - 2;
+                std::vector<std::size_t> l1_broadcasted_lens(s1_lens.begin(), l1_it);
+                auto output_lens =
+                    compute_broadcasted_lens(l0_broadcasted_lens, l1_broadcasted_lens);
+                l0_broadcasted_lens = output_lens;
+                l0_broadcasted_lens.insert(l0_broadcasted_lens.end(), l0_it, s0_lens.end());
+                l1_broadcasted_lens = output_lens;
+                l1_broadcasted_lens.insert(l1_broadcasted_lens.end(), l1_it, s1_lens.end());
+                if(s0_lens != l0_broadcasted_lens)
+                {
+                    ba0 = info.add_instruction(
+                        make_op("multibroadcast", {{"out_lens", l0_broadcasted_lens}}), a0);
+                }
+                if(s1_lens != l1_broadcasted_lens)
+                {
+                    ba1 = info.add_instruction(
+                        make_op("multibroadcast", {{"out_lens", l1_broadcasted_lens}}), a1);
+                }
+            }
+            dot_res = info.add_instruction(make_op(opd.op_name), ba0, ba1);
+        }
+
+        // squeeze the appended or prepended dimensions
+        int64_t num_axis = dot_res->get_shape().ndim();
         if(is_a_prepended)
         {
             dot_res = info.add_instruction(make_op("squeeze", {{"axes", {num_axis - 2}}}), dot_res);
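For the static path, batch broadcasting right-aligns every dimension except the trailing two and computes a common batch shape before the dot. The sketch below approximates what compute_broadcasted_lens is assumed to do (NumPy-style, with 1 broadcastable); the real helper also validates incompatible pairs, which this stand-in skips:

// Rough stand-in for the assumed batch-broadcast semantics; not the
// MIGraphX helper itself, and without validation of incompatible dims.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

using lens = std::vector<std::size_t>;

lens broadcast_batches(const lens& a, const lens& b)
{
    lens out(std::max(a.size(), b.size()), 1);
    auto fill = [&](const lens& v) {
        // right-align v against out and take the larger extent
        for(std::size_t i = 0; i < v.size(); ++i)
        {
            auto& o = out[out.size() - v.size() + i];
            o       = std::max(o, v[i]);
        }
    };
    fill(a);
    fill(b);
    return out;
}

int main()
{
    // A: (3, 1, 4, 5), B: (2, 5, 6) -> batch dims (3, 1) vs (2) -> (3, 2)
    assert((broadcast_batches({3, 1}, {2}) == lens{3, 2}));
}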
src/onnx/parse_pad.cpp

@@ -147,7 +147,13 @@ struct parse_pad : op_parser<parse_pad>
         {
             auto mode = info.attributes.at("mode").s();
             if(mode == "reflect")
+            {
+                if(args.front()->get_shape().dynamic())
+                {
+                    MIGRAPHX_THROW("PARSE_PAD: reflect padding with dynamic shape not supported");
+                }
                 return reflect_pad(info, pads, args.front());
+            }
             if(mode != "constant")
             {
                 MIGRAPHX_THROW(
src/onnx/parse_reduce_op.cpp

@@ -68,8 +68,7 @@ instruction_ref parse_reduce_oper(const std::string& op_name,
     }
     else
     {
-        std::size_t n_dim = args.front()->get_shape().lens().size();
-        axes.resize(n_dim);
+        axes.resize(args.front()->get_shape().ndim());
         std::iota(axes.begin(), axes.end(), 0);
     }
...
src/onnx/parse_reshape.cpp
View file @
a5c87ec5
...
@@ -49,7 +49,7 @@ struct parse_reshape : op_parser<parse_reshape>
...
@@ -49,7 +49,7 @@ struct parse_reshape : op_parser<parse_reshape>
if
(
args
.
size
()
==
2
)
if
(
args
.
size
()
==
2
)
{
{
auto
s
=
args
[
1
]
->
eval
();
auto
s
=
args
[
1
]
->
eval
();
check_arg_empty
(
s
,
"Reshape:
dynamic shape
is not supported"
);
check_arg_empty
(
s
,
"Reshape:
non-constant shape input
is not supported"
);
s
.
visit
([
&
](
auto
v
)
{
copy
(
v
,
std
::
back_inserter
(
dims
));
});
s
.
visit
([
&
](
auto
v
)
{
copy
(
v
,
std
::
back_inserter
(
dims
));
});
}
}
...
...
src/shape.cpp

@@ -504,6 +504,31 @@ bool shape::dynamic_dimension::is_fixed() const { return this->min == this->max;
 bool shape::dynamic_dimension::has_optimal() const { return opt != 0; }

+shape::dynamic_dimension& shape::dynamic_dimension::operator+=(const std::size_t& x)
+{
+    this->min += x;
+    this->max += x;
+    if(this->opt != 0)
+    {
+        this->opt += x;
+    }
+    return *this;
+}
+
+shape::dynamic_dimension& shape::dynamic_dimension::operator-=(const std::size_t& x)
+{
+    assert(this->min >= x);
+    assert(this->max >= x);
+    this->min -= x;
+    this->max -= x;
+    if(this->opt != 0)
+    {
+        assert(this->opt >= x);
+        this->opt -= x;
+    }
+    return *this;
+}
+
 bool operator==(const shape::dynamic_dimension& x, const shape::dynamic_dimension& y)
 {
     // don't check opt if both are fixed
@@ -529,6 +554,23 @@ bool operator==(const std::size_t& x, const shape::dynamic_dimension& y) { retur
 bool operator!=(const shape::dynamic_dimension& x, const std::size_t& y) { return not(x == y); }
 bool operator!=(const std::size_t& x, const shape::dynamic_dimension& y) { return not(x == y); }

+shape::dynamic_dimension operator+(const shape::dynamic_dimension& x, const std::size_t& y)
+{
+    auto dd = x;
+    return dd += y;
+}
+
+shape::dynamic_dimension operator+(const std::size_t& x, const shape::dynamic_dimension& y)
+{
+    return y + x;
+}
+
+shape::dynamic_dimension operator-(const shape::dynamic_dimension& x, const std::size_t& y)
+{
+    auto dd = x;
+    return dd -= y;
+}
+
 bool operator==(const shape& x, const shape& y)
 {
     if(x.dynamic() and y.dynamic())
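A standalone sketch, not the MIGraphX type itself, of the arithmetic these operators implement: min and max shift by the fixed offset, while opt shifts only when it is set, so "no optimal value" (opt == 0) survives the shift:

// Stand-in for shape::dynamic_dimension illustrating the += semantics.
#include <cassert>
#include <cstddef>

struct dyn_dim
{
    std::size_t min;
    std::size_t max;
    std::size_t opt; // 0 means "no optimal value"

    dyn_dim& operator+=(std::size_t x)
    {
        min += x;
        max += x;
        if(opt != 0)
            opt += x; // only shift a real optimal value
        return *this;
    }
};

int main()
{
    dyn_dim d{2, 8, 4};
    d += 3; // e.g. total padding of 3 along this axis
    assert(d.min == 5 and d.max == 11 and d.opt == 7);

    dyn_dim no_opt{2, 8, 0};
    no_opt += 3; // opt stays 0: "no optimal value" is preserved
    assert(no_opt.opt == 0);
}

This is the shape arithmetic the dynamic branch of the pad operator relies on when it does out_dyn_dims[i] += pads[i] + pads[i + ndim].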
src/simplify_algebra.cpp

@@ -1065,11 +1065,23 @@ struct find_split_reshape
             return;
         }

+        // Only want to apply this optimization if each split output is followed by
+        // a contiguous op and a reshape
+        if(std::any_of(split_outputs.begin(), split_outputs.end(), [](auto i) {
+               if(i->outputs().size() == 1)
+               {
+                   auto cont = i->outputs().front();
+                   return cont->outputs().size() != 1;
+               }
+               return false;
+           }))
+        {
+            return;
+        }
+
         std::vector<instruction_ref> vec_rsp(split_outputs.size());
         std::transform(split_outputs.begin(), split_outputs.end(), vec_rsp.begin(), [](auto i) {
-            assert(i->outputs().size() == 1);
             auto cont = i->outputs().front();
-            assert(cont->outputs().size() == 1);
             return cont->outputs().front();
         });
src/simplify_reshapes.cpp

@@ -763,16 +763,23 @@ struct find_transpose_slice
         // Compute axis before transpose to use for unsqueeze
         auto perm = ins->get_operator().to_value()["permutation"].to_vector<int64_t>();
         auto preaxis = std::find(perm.begin(), perm.end(), axis) - perm.begin();
-        // Make unsqeeze
+        // Make unsqueeze
+        std::vector<int64_t> steps(sdistance.size());
+        std::transform(slice.axes.begin(),
+                       slice.axes.end(),
+                       sdistance.begin(),
+                       steps.begin(),
+                       [&](const auto ax, const auto sdis) {
+                           return ins->get_shape().lens().at(ax) / sdis;
+                       });
         auto unsqueeze = m.insert_instruction(
-            ins, make_op("unsqueeze", {{"axes", {preaxis}}, {"steps", sdistance}}), ins->inputs());
+            ins, make_op("unsqueeze", {{"axes", {preaxis}}, {"steps", steps}}), ins->inputs());
         // Make transpose
         std::transform(perm.begin(), perm.end(), perm.begin(), [&](auto i) {
-            if(i > preaxis)
+            if(i >= preaxis)
                 return i + 1;
             return i;
         });
-        perm.insert(perm.begin(), preaxis + 1);
+        perm.insert(perm.begin(), preaxis);
         auto transpose =
             m.insert_instruction(ins, make_op("transpose", {{"permutation", perm}}), unsqueeze);
         // Slice and squeeze
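The permutation fix above (i > preaxis becoming i >= preaxis, and inserting preaxis rather than preaxis + 1) can be spot-checked with a tiny standalone example:

// After unsqueezing a new axis at `preaxis`, existing axes at or after
// that position shift right by one; the new axis is then placed first.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
    std::vector<int64_t> perm = {2, 0, 1};
    int64_t preaxis           = 1;
    std::transform(perm.begin(), perm.end(), perm.begin(), [&](auto i) {
        return i >= preaxis ? i + 1 : i;
    });
    perm.insert(perm.begin(), preaxis);
    // result is a valid permutation of {0, 1, 2, 3}
    assert((perm == std::vector<int64_t>{1, 3, 0, 2}));
}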
src/targets/gpu/jit/gather.cpp (new file, mode 100644)

/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include <migraphx/gpu/compiler.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/compile_hip.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

// NOLINTNEXTLINE
static const char* const gather_kernel = R"__migraphx__(
#include <migraphx/kernels/gather.hpp>
#include <migraphx/kernels/ops.hpp>
#include <migraphx/kernels/integral_constant.hpp>
#include <migraphx/kernels/generic_constant.hpp>
#include <args.hpp>

namespace migraphx {

extern "C" {

__global__ void gather_kernel(void* in_data, void* in_indices, void* output)
{
    make_tensors()(in_data, in_indices, output)(
        [](auto&&... xs) { gather<${axis}>(xs...); });
}

}

} // namespace migraphx

)__migraphx__";

struct gather_compiler : compiler<gather_compiler>
{
    std::vector<std::string> names() const { return {"gather"}; }

    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
    {
        hip_compile_options options;
        const auto& out_s = inputs.back();
        options.set_launch_params(v, compute_global_for(ctx, out_s.elements()));
        options.inputs         = inputs;
        options.output         = out_s;
        options.kernel_name    = "gather_kernel";
        options.virtual_inputs = inputs;

        auto axis = v.at("axis").to<std::string>();
        auto src  = interpolate_string(gather_kernel, {{"axis", axis}});
        return compile_hip_code_object(src, options);
    }

    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
    {
        return replace(compile_op(ctx, to_shapes(ins->inputs()), op.to_value()));
    }
};

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
src/targets/gpu/jit/reduce.cpp

@@ -156,16 +156,25 @@ struct reduce_compiler : compiler<reduce_compiler>
     compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
     {
         value v = value::object{};
+        auto reduce_elements = get_reduce_elements(ins->inputs());
         if(op.name() == "reduce_sum")
         {
             v["reduction"] = "op::sum{}";
         }
         else if(op.name() == "reduce_mean")
         {
-            v["reduction"]       = "op::sum{}";
-            auto reduce_elements = get_reduce_elements(ins->inputs());
-            v["write"]           = "op::mean{" + std::to_string(reduce_elements) + "}";
+            auto reduce_type = ins->inputs().front()->get_shape().type();
+            v["reduction"]   = "op::sum{}";
+            std::string mean = "op::mean{" + std::to_string(reduce_elements) + "}";
+            // Use float accumulator when reduction size is too large for half
+            if(reduce_type == shape::half_type and reduce_elements > 16384)
+                v["read"] = "compose(" + mean + ", op::convert_to<float>{})";
+            else if(contains({shape::float_type, shape::half_type, shape::double_type},
+                             reduce_type))
+                v["read"] = mean;
+            else
+                v["write"] = mean;
         }
         else if(op.name() == "reduce_max")
         {
src/targets/gpu/kernels/include/migraphx/kernels/functional.hpp

@@ -187,6 +187,14 @@ constexpr auto fold(F f)
     return [=](auto&&... xs) { return fold_impl(f, static_cast<decltype(xs)&&>(xs)...); };
 }

+template <class... Fs>
+constexpr auto compose(Fs... fs)
+{
+    return fold([](auto f, auto g) {
+        return [=](auto&&... xs) { return f(g(static_cast<decltype(xs)>(xs)...)); };
+    })(fs...);
+}
+
 template <class... Ts>
 constexpr auto pack(Ts... xs)
 {
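The new compose helper folds right-to-left, so compose(f, g)(x) evaluates f(g(x)). A minimal host-side illustration with plain lambdas, mirroring the semantics rather than reusing the kernel header:

// Two-function version of the fold that compose() performs.
#include <cassert>

template <class F, class G>
constexpr auto compose2(F f, G g)
{
    return [=](auto&&... xs) { return f(g(static_cast<decltype(xs)>(xs)...)); };
}

int main()
{
    auto add1 = [](int x) { return x + 1; };
    auto dbl  = [](int x) { return x * 2; };
    auto f    = compose2(add1, dbl); // dbl runs first, then add1
    assert(f(3) == 7);
}

Given that order, the reduce compiler's read op compose(mean, op::convert_to<float>{}) converts each value to float before the mean scaling is applied, which is what makes the wider accumulator effective for large half-precision reductions.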
src/targets/gpu/kernels/include/migraphx/kernels/gather.hpp (new file, mode 100644)

/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#ifndef MIGRAPHX_GUARD_KERNELS_GATHER_HPP
#define MIGRAPHX_GUARD_KERNELS_GATHER_HPP

#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/shape.hpp>
#include <migraphx/kernels/algorithm.hpp>
#include <migraphx/kernels/tensor_view.hpp>

namespace migraphx {

template <int Axis, class Input, class Indices>
constexpr auto gather_shape(Input input, Indices indices)
{
    auto lengths  = input.lens;
    lengths[Axis] = indices.elements();
    return make_shape(lengths, input.strides);
}

template <int Axis, class Input, class Indices, class Output>
__device__ void gather(Input input, Indices indices, Output output)
{
    auto ind           = make_index();
    auto axis_dim_size = input.get_shape().lens[Axis];
    constexpr auto out_comp =
        gather_shape<Axis>(get_shape_c<Input>{}, get_shape_c<Indices>{});
    ind.global_stride(output.get_shape().elements(), [&](auto i) {
        auto idx          = out_comp.multi(i);
        auto in_index     = indices[idx[Axis]];
        auto new_in_index = (in_index < 0) ? in_index + axis_dim_size : in_index;
        idx[Axis]         = new_in_index;
        output[i]         = input[idx];
    });
}

} // namespace migraphx
#endif
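A host-side sketch of the index arithmetic in the kernel above: the output coordinate along Axis is replaced by the gathered index, with negative indices wrapped by the axis length:

// 1-D illustration of the gather indexing, including negative-index wrap.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

int main()
{
    std::vector<float> input   = {10, 20, 30, 40}; // gather axis has length 4
    std::vector<int64_t> index = {3, -1, 0};       // -1 wraps to 3
    std::vector<float> output(index.size());

    const auto axis_dim = static_cast<int64_t>(input.size());
    for(std::size_t i = 0; i < index.size(); ++i)
    {
        auto in_index = index[i];
        if(in_index < 0)
            in_index += axis_dim; // same wrap as the kernel's new_in_index
        output[i] = input[in_index];
    }
    assert(output[0] == 40 and output[1] == 40 and output[2] == 10);
}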
src/targets/gpu/kernels/include/migraphx/kernels/math.hpp

@@ -132,9 +132,14 @@ MIGRAPHX_DEVICE_MATH_FOR(float, fmod, ::fmodf)
 // Builtin half functions
 MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, abs, ::__habs)
+MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, ceil, ::hceil)
+MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, cos, ::hcos)
 MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, exp, ::hexp)
+MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, floor, ::hfloor)
+MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, isnan, ::__hisnan)
 MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, log, ::hlog)
 MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, rsqrt, ::hrsqrt)
+// MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, sin, ::hsin)
 MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, sqrt, ::hsqrt)

 // Use float to compute half overload
@@ -144,16 +149,11 @@ MIGRAPHX_DEVICE_MATH_HALF(asin, ::asin)
 MIGRAPHX_DEVICE_MATH_HALF(asinh, ::asinh)
 MIGRAPHX_DEVICE_MATH_HALF(atan, ::atan)
 MIGRAPHX_DEVICE_MATH_HALF(atanh, ::atanh)
-MIGRAPHX_DEVICE_MATH_HALF(ceil, ::ceil)
-MIGRAPHX_DEVICE_MATH_HALF(cos, ::cos)
 MIGRAPHX_DEVICE_MATH_HALF(cosh, ::cosh)
 MIGRAPHX_DEVICE_MATH_HALF(erf, ::erf)
-MIGRAPHX_DEVICE_MATH_HALF(floor, ::floor)
-MIGRAPHX_DEVICE_MATH_HALF(isnan, ::isnan)
 MIGRAPHX_DEVICE_MATH_HALF(pow, ::pow)
 MIGRAPHX_DEVICE_MATH_HALF(remainder, ::remainder)
 MIGRAPHX_DEVICE_MATH_HALF(round, ::round)
-MIGRAPHX_DEVICE_MATH_HALF(sin, ::sin)
 MIGRAPHX_DEVICE_MATH_HALF(sinh, ::sinh)
 MIGRAPHX_DEVICE_MATH_HALF(tan, ::tan)
 MIGRAPHX_DEVICE_MATH_HALF(tanh, ::tanh)
@@ -166,19 +166,19 @@ MIGRAPHX_DEVICE_MATH_HALF(fmod, ::fmod)
 // at this time are: exp2, exp10, log2, log10, isinf
 MIGRAPHX_DEVICE_MATH_HALF2(abs, ::__habs2)
 MIGRAPHX_DEVICE_MATH_HALF2(ceil, ::h2ceil)
-MIGRAPHX_DEVICE_MATH_HALF2(floor, ::h2floor)
-MIGRAPHX_DEVICE_MATH_HALF2(sin, ::h2sin)
 MIGRAPHX_DEVICE_MATH_HALF2(cos, ::h2cos)
 MIGRAPHX_DEVICE_MATH_HALF2(exp, ::h2exp)
-MIGRAPHX_DEVICE_MATH_HALF2(exp2, ::h2exp2)
 MIGRAPHX_DEVICE_MATH_HALF2(exp10, ::h2exp10)
-MIGRAPHX_DEVICE_MATH_HALF2(log2, ::h2log2)
+MIGRAPHX_DEVICE_MATH_HALF2(exp2, ::h2exp2)
+MIGRAPHX_DEVICE_MATH_HALF2(floor, ::h2floor)
+MIGRAPHX_DEVICE_MATH_HALF2(isinf, ::__hisinf2)
+MIGRAPHX_DEVICE_MATH_HALF2(isnan, ::__hisnan2)
 MIGRAPHX_DEVICE_MATH_HALF2(log, ::h2log)
 MIGRAPHX_DEVICE_MATH_HALF2(log10, ::h2log10)
+MIGRAPHX_DEVICE_MATH_HALF2(log2, ::h2log2)
 MIGRAPHX_DEVICE_MATH_HALF2(rsqrt, ::h2rsqrt)
+// MIGRAPHX_DEVICE_MATH_HALF2(sin, ::h2sin)
 MIGRAPHX_DEVICE_MATH_HALF2(sqrt, ::h2sqrt)
-MIGRAPHX_DEVICE_MATH_HALF2(isinf, ::__hisinf2)
-MIGRAPHX_DEVICE_MATH_HALF2(isnan, ::__hisnan2)

 template <class T, class U>
 constexpr auto where(bool cond, const T& a, const U& b)
@@ -218,6 +218,14 @@ constexpr auto min(const T& a, const U& b)
     return min<common_type_t<T, U>>(a, b);
 }

+// Sin for half is broken on hip, so use cos instead
+template <class T, MIGRAPHX_REQUIRES(is_same<vec_type<T>, half>{})>
+constexpr T sin(T x)
+{
+    constexpr const T shift = M_PI_2;
+    return migraphx::cos(shift - x);
+}
+
 MIGRAPHX_DEVICE_MATH_VEC(abs)
 MIGRAPHX_DEVICE_MATH_VEC(acos)
 MIGRAPHX_DEVICE_MATH_VEC(acosh)
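The new half-precision sin overload leans on the identity sin(x) = cos(pi/2 - x), which is why a working hcos can stand in for the broken hsin. A quick double-precision check of the identity:

// Numerical spot-check of sin(x) == cos(pi/2 - x).
#include <cassert>
#include <cmath>

int main()
{
    const double half_pi = std::acos(0.0); // pi/2
    for(double x : {0.0, 0.5, 1.0, -2.0})
        assert(std::abs(std::sin(x) - std::cos(half_pi - x)) < 1e-12);
}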