"vscode:/vscode.git/clone" did not exist on "5d881a5753c72ebaa8995e38d140c3f8141dcc49"
Commit c9497134 authored by charlie

Merge branch 'develop' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into select_module_op

parents e833a916 67f23675
@@ -9,6 +9,8 @@ CheckOptions:
     value: risky
   - key: modernize-loop-convert.NamingStyle
     value: lower_case
+  - key: misc-const-correctness.AnalyzeValues
+    value: 'false'
   - key: performance-unnecessary-copy-initialization.AllowedTypes
     value: 'shape'
   - key: performance-unnecessary-value-param.AllowedTypes
......
@@ -32,7 +32,8 @@ jobs:
       # In this step, this action saves a list of existing images,
       # the cache is created without them in the post run.
       # It also restores the cache if it exists.
-      - uses: satackey/action-docker-layer-caching@v0.0.11
+      # name: Docker Layer Caching2
+      - uses: jpribyl/action-docker-layer-caching@v0.1.1
        # Ignore the failure of a step and avoid terminating the job.
        continue-on-error: true
@@ -81,7 +82,7 @@ jobs:
       # In this step, this action saves a list of existing images,
       # the cache is created without them in the post run.
       # It also restores the cache if it exists.
-      - uses: satackey/action-docker-layer-caching@v0.0.11
+      - uses: jpribyl/action-docker-layer-caching@v0.1.1
        # Ignore the failure of a step and avoid terminating the job.
        continue-on-error: true
@@ -126,7 +127,7 @@ jobs:
       # In this step, this action saves a list of existing images,
       # the cache is created without them in the post run.
       # It also restores the cache if it exists.
-      - uses: satackey/action-docker-layer-caching@v0.0.11
+      - uses: jpribyl/action-docker-layer-caching@v0.1.1
        # Ignore the failure of a step and avoid terminating the job.
        continue-on-error: true
......
name: History
on:
  workflow_dispatch:
    inputs:
      start_date:
        description: Start date for results analysis
        required: true
        default: 'yyyy-mm-dd'
      end_date:
        description: End date for results analysis
        required: true
        default: 'yyyy-mm-dd'
      history_repo:
        description: Repository for history results between dates
        required: true
        default: 'ROCmSoftwarePlatform/migraphx-reports'
jobs:
  release:
    uses: migraphx-benchmark/actions/.github/workflows/history.yml@main
    with:
      start_date: ${{ github.event.inputs.start_date || 'yyyy-mm-dd' }}
      end_date: ${{ github.event.inputs.end_date || 'yyyy-mm-dd' }}
      history_repo: ${{ github.event.inputs.history_repo || 'ROCmSoftwarePlatform/migraphx-reports' }}
    secrets:
      gh_token: ${{ secrets.MIGRAPHX_BOT_TOKEN }}
      mail_user: ${{ secrets.MAIL_USERNAME }}
      mail_pass: ${{ secrets.MAIL_PASSWORD }}
@@ -12,7 +12,7 @@ on:
     rocm_release:
       description: ROCm Version
       required: true
-      default: '5.3'
+      default: '5.4.2'
     performance_reports_repo:
       description: Result repository
       required: true
@@ -24,7 +24,7 @@ on:
     flags:
       description: -m for Max value; -s for Std dev; -r for Threshold file
       required: true
-      default: '-s'
+      default: '-r'
 concurrency: "perftest-${{ github.head_ref || github.base_ref || 'schedule' }}"
@@ -32,9 +32,9 @@ jobs:
   release:
     uses: ROCmSoftwarePlatform/migraphx-benchmark/.github/workflows/perf-test.yml@main
     with:
-      rocm_release: ${{ github.event.inputs.rocm_release || '5.3' }}
+      rocm_release: ${{ github.event.inputs.rocm_release || '5.4.2' }}
       result_number: ${{ github.event.inputs.result_number || '10' }}
-      flags: ${{ github.event.inputs.flags || '-s' }}
+      flags: ${{ github.event.inputs.flags || '-r' }}
       performance_reports_repo: ${{ github.event.inputs.performance_reports_repo || 'ROCmSoftwarePlatform/migraphx-reports' }}
     secrets:
       gh_token: ${{ secrets.MIGRAPHX_BOT_TOKEN }}
......
name: Onnxruntime main weekly sync
on:
  schedule:
    - cron: "05 17 * * 1"
jobs:
  sync:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          ref: develop
          path: ../
  get_date:
    runs-on: ubuntu-latest
    outputs:
      date: ${{ steps.date.outputs.date }}
    steps:
      - id: date
        run: echo "::set-output name=date::$(date +'%Y-%m-%d')"
  update_file:
    runs-on: ubuntu-latest
    needs: [sync, get_date]
    steps:
      - run: git clone https://github.com/microsoft/onnxruntime.git && cd onnxruntime && git rev-parse HEAD >> ../test/onnx/.onnxrt-commit
  Add_commit:
    runs-on: ubuntu-latest
    needs: [update_file, get_date]
    steps:
      - name: Add & Commit
        uses: EndBug/add-and-commit@v9.1.1
        with:
          new_branch: onnxruntime-sync-${{ needs.get_date.outputs.date }}
          add: ../test/onnx/.onnxrt-commit
          message: Update Onnxruntime commit to latest release
          default_author: github_actions
          push: true
  PR:
    runs-on: ubuntu-latest
    needs: Add_commit
    steps:
      - name: GitHub Action for creating Pull Requests
        uses: devops-infra/action-pull-request@v0.5.3
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          title: Sync Onnxruntime main
          reviewer: pfultz2, causten
          assignee: TedThemistokleous
          label: automatic, onnxruntime
          target_branch: develop
@@ -5,8 +5,12 @@ ARG PREFIX=/usr/local
 # Support multiarch
 RUN dpkg --add-architecture i386
+# Install rocm key
+RUN apt-get update && apt-get install -y gnupg2 --no-install-recommends curl && \
+    curl -sL http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
 # Add rocm repository
-RUN sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/5.3/ ubuntu main > /etc/apt/sources.list.d/rocm.list'
+RUN sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/5.4.2/ ubuntu main > /etc/apt/sources.list.d/rocm.list'
 # Install dependencies
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
@@ -32,10 +36,27 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
     libnuma-dev \
     miopen-hip \
     rocblas \
+    hipfft \
+    rocthrust \
+    rocrand \
+    hipsparse \
+    rccl \
+    rccl-dev \
+    rocm-smi-lib \
+    rocm-dev \
+    roctracer-dev \
+    hipcub \
+    hipblas \
+    hipify-clang \
+    half \
+    libssl-dev \
     zlib1g-dev && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
+# add this for roctracer dependencies
+RUN pip3 install CppHeaderParser packaging==22.0
 # Workaround broken rocm packages
 RUN ln -s /opt/rocm-* /opt/rocm
 RUN echo "/opt/rocm/lib" > /etc/ld.so.conf.d/rocm.conf
@@ -65,25 +86,26 @@ ADD doc/requirements.txt /doc-requirements.txt
 RUN pip3 install -r /doc-requirements.txt
 # Download real models to run onnx unit tests
-ENV ONNX_HOME=$HOME
+ENV ONNX_HOME=/.onnx
 COPY ./tools/download_models.sh /
 RUN /download_models.sh && rm /download_models.sh
 # Install latest ccache version
 RUN cget -p $PREFIX install facebook/zstd@v1.4.5 -X subdir -DCMAKE_DIR=build/cmake
 RUN cget -p $PREFIX install ccache@v4.1 -DENABLE_TESTING=OFF
-# Install newer cmake for onnx runtime
-ARG CMAKE_VERSION=3.24.2
-RUN cget -p /opt/cmake install -X binary https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz
+RUN cget -p /opt/cmake install kitware/cmake@v3.24.3
+COPY ./test/onnx/.onnxrt-commit /
 ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime
 ARG ONNXRUNTIME_BRANCH=main
-ARG ONNXRUNTIME_COMMIT=24f1bd6156cf5968bbc76dfb0e801a9b9c56b9fc
+ARG ONNXRUNTIME_COMMIT
 RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime && \
     cd onnxruntime && \
-    git checkout ${ONNXRUNTIME_COMMIT} && \
-    /bin/sh dockerfiles/scripts/install_common_deps.sh
+    if [ -z "$ONNXRUNTIME_COMMIT" ] ; then git checkout $(cat /.onnxrt-commit) ; else git checkout ${ONNXRUNTIME_COMMIT} ; fi && \
+    /bin/sh /onnxruntime/dockerfiles/scripts/install_common_deps.sh
 ADD tools/build_and_test_onnxrt.sh /onnxruntime/build_and_test_onnxrt.sh
......
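For context on the new ARG: building with plain `docker build .` now checks onnxruntime out at the commit pinned in test/onnx/.onnxrt-commit (copied into the image as /.onnxrt-commit), while `docker build --build-arg ONNXRUNTIME_COMMIT=<sha> .` still overrides the pin explicitly (`<sha>` standing in for a real commit hash).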
@@ -15,11 +15,13 @@ def rocmtestnode(Map conf) {
     def compiler = bconf.get("compiler", "/opt/rocm/llvm/bin/clang++")
     def flags = bconf.get("flags", "")
     def gpu_debug = bconf.get("gpu_debug", "0")
+    def hiprtc_workarounds = bconf.get("hiprtc_workarounds", "0")
     def cmd = """
         ulimit -c unlimited
         echo "leak:dnnl::impl::malloc" > suppressions.txt
         export LSAN_OPTIONS="suppressions=\$(pwd)/suppressions.txt"
         export MIGRAPHX_GPU_DEBUG=${gpu_debug}
+        export MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS=${hiprtc_workarounds}
         export CXX=${compiler}
         export CXXFLAGS='-Werror'
         env
@@ -110,6 +112,10 @@ rocmtest clang_debug: rocmnode('vega') { cmake_build ->
         cmake_build(flags: "-DCMAKE_BUILD_TYPE=release")
         stash includes: 'build/*.deb', name: 'migraphx-package'
     }
+}, hiprtc_gpu_debug: rocmnode('vega') { cmake_build ->
+    stage('HipRTC GPU Debug') {
+        cmake_build(flags: "-DCMAKE_BUILD_TYPE=release -DMIGRAPHX_USE_HIPRTC=On", gpu_debug: true, hiprtc_workarounds: true)
+    }
 }, mlir_debug: rocmnode('vega') { cmake_build ->
     stage('MLIR Debug') {
         def sanitizers = "undefined"
......
@@ -6,7 +6,7 @@ ARG PREFIX=/usr/local
 RUN dpkg --add-architecture i386
 # Add rocm repository
-RUN sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/5.3/ ubuntu main > /etc/apt/sources.list.d/rocm.list'
+RUN sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/5.4.2/ ubuntu main > /etc/apt/sources.list.d/rocm.list'
 # Install dependencies
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
......
@@ -58,14 +58,14 @@ add_library(migraphx
     layout_nhwc.cpp
     load_save.cpp
     make_op.cpp
+    memory_coloring.cpp
     module.cpp
     msgpack.cpp
     normalize_attributes.cpp
     normalize_ops.cpp
     op_enums.cpp
     operation.cpp
-    opt/memory_coloring.cpp
-    opt/memory_coloring_impl.cpp
+    optimize_module.cpp
     pad_calc.cpp
     pass_manager.cpp
     permutation.cpp
......
@@ -87,7 +87,7 @@ struct check_shapes
     }
     /*!
-     * Check if the number of shape objects is equal to atleast one of the
+     * Require the number of shape objects to equal one of the
      * given sizes.
      * \param ns template parameter pack of sizes to check against
      */
@@ -100,6 +100,23 @@ struct check_shapes
         return *this;
     }
+    /*!
+     * Require the number of shape objects to equal at least a given amount. Use this
+     * method for ops that can take any number (variadic) of inputs.
+     * \param n min. number of shapes
+     */
+    const check_shapes& has_at_least(std::size_t n) const
+    {
+        if(this->size() < n)
+            MIGRAPHX_THROW(prefix() + "Wrong number of arguments: expected at least " +
+                           to_string(n) + " but given " + std::to_string(size()));
+        return *this;
+    }
+    /*!
+     * Require all shapes to have the same number of elements.
+     * \param n number of elements
+     */
     const check_shapes& nelements(std::size_t n) const
     {
         if(not this->all_of([&](const shape& s) { return s.elements() == n; }))
......
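As a usage sketch of the new `has_at_least` check (a hypothetical variadic op, following the conventions used by the ops later in this diff):

    // Hypothetical op: accept two or more inputs of the same type.
    shape normalize_compute_shape(std::vector<shape> inputs) const
    {
        check_shapes{inputs, *this}.has_at_least(2).same_type();
        return inputs.front();
    }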
@@ -58,12 +58,12 @@ using deduce = typename detail::deduce<T>::type;
 namespace std {
 template <class T>
-struct common_type<migraphx::half, T> : std::common_type<float, T>
+struct common_type<migraphx::half, T> : std::common_type<float, T> // NOLINT
 {
 };
 template <class T>
-struct common_type<T, migraphx::half> : std::common_type<float, T>
+struct common_type<T, migraphx::half> : std::common_type<float, T> // NOLINT
 {
 };
......
@@ -41,7 +41,7 @@ migraphx::instruction* as_address(const instruction_ref& ins) noexcept;
 namespace std {
 template <>
-struct hash<migraphx::instruction_ref>
+struct hash<migraphx::instruction_ref> // NOLINT
 {
     using argument_type = migraphx::instruction_ref;
     using result_type   = std::size_t;
@@ -52,7 +52,7 @@ struct hash<migraphx::instruction_ref>
 };
 template <>
-struct equal_to<migraphx::instruction_ref>
+struct equal_to<migraphx::instruction_ref> // NOLINT
 {
     using argument_type = migraphx::instruction_ref;
     using result_type   = bool;
......
@@ -36,22 +36,46 @@ template <class F>
 struct layernorm_matcher
 {
     F f;
+    auto last_axis() const
+    {
+        return make_basic_pred_matcher([](instruction_ref ins) {
+            auto v = ins->get_operator().to_value();
+            if(not v.contains("axes"))
+                return false;
+            auto axes = v["axes"].to_vector<std::size_t>();
+            if(axes.size() != 1)
+                return false;
+            return axes.front() == ins->inputs().front()->get_shape().lens().size() - 1;
+        });
+    }
+    auto reduce_mean() const { return f("reduce_mean")(last_axis()); }
     auto x_minus_mean() const
     {
-        return f("sub")(arg(0)(any().bind("x")), arg(1)(skip_broadcasts(f("reduce_mean"))));
+        return f("sub")(arg(0)(any().bind("x")), arg(1)(skip_broadcasts(reduce_mean())));
     }
     auto variance() const
     {
-        return f("reduce_mean")(arg(0)(f("pow")(arg(0)(x_minus_mean()), arg(1)(has_value(2.0f)))));
+        return reduce_mean()(arg(0)(any_of(
+            f("pow")(arg(0)(x_minus_mean()), arg(1)(has_value(2.0f))),
+            f("mul")(arg(0)(x_minus_mean()), arg(1)(x_minus_mean())),
+            f("sqdiff")(either_arg(0, 1)(any().bind("x"), skip_broadcasts(reduce_mean()))))));
     }
+    auto sqrt_add_eps(const std::string& name) const
+    {
+        auto add_eps = f("add")(either_arg(0, 1)(variance(), is_constant().bind("eps")));
+        return skip_broadcasts(f(name)(arg(0)(any_of(add_eps, variance()))));
+    }
     auto layernorm_onnx() const
     {
-        return f("div")(arg(0)(x_minus_mean()),
-                        arg(1)(skip_broadcasts(f("sqrt")(arg(0)(
-                            f("add")(either_arg(0, 1)(variance(), is_constant().bind("eps"))))))));
+        auto div_sqrt  = f("div")(arg(0)(x_minus_mean()), arg(1)(sqrt_add_eps("sqrt")));
+        auto mul_rsqrt = f("mul")(either_arg(0, 1)(x_minus_mean(), sqrt_add_eps("rsqrt")));
+        return any(any_of(div_sqrt, mul_rsqrt));
     }
     auto matcher() const { return layernorm_onnx(); }
......
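The rewritten matcher accepts several algebraically equivalent graphs. A minimal standalone sketch of the computation it is meant to recognize (plain C++, not the matcher API; the variance may be expressed via pow, mul, or sqdiff, and the divide may instead be a multiply by rsqrt):

    #include <cmath>
    #include <cstddef>
    #include <numeric>
    #include <vector>

    // y = (x - mean(x)) / sqrt(var(x) + eps), reduced over the last axis.
    std::vector<float> layernorm(const std::vector<float>& x, float eps)
    {
        float m = std::accumulate(x.begin(), x.end(), 0.0f) / x.size();
        float v = 0.0f;
        for(float xi : x)
            v += (xi - m) * (xi - m);
        v /= x.size();
        std::vector<float> y(x.size());
        for(std::size_t i = 0; i < x.size(); i++)
            y[i] = (x[i] - m) / std::sqrt(v + eps);
        return y;
    }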
@@ -39,7 +39,7 @@ struct memory_coloring
 {
     std::string allocation_op{};
     bool verify = false;
-    std::string name() const { return "memory coloring"; }
+    std::string name() const { return "memory_coloring"; }
     void apply(module& m) const;
 };
......
@@ -26,6 +26,7 @@
 #include <array>
 #include <migraphx/check_shapes.hpp>
+#include <migraphx/dyn_output.hpp>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/streamutils.hpp>
 #include <migraphx/literal.hpp>
@@ -73,49 +74,87 @@ struct concat
         }
         return offsets;
     }
     shape normalize_compute_shape(std::vector<shape> inputs) const
     {
-        if(inputs.empty())
-        {
-            MIGRAPHX_THROW("CONCAT: Number of input tensors should exceed 0");
-        }
-        const auto& first_shape_lens = inputs.front().lens();
-        const auto& type             = inputs.front().type();
-        for(std::size_t l = 0; l < first_shape_lens.size(); l++)
-        {
-            if(l != axis)
-            {
-                if(not std::all_of(inputs.begin(), inputs.end(), [&](auto s) {
-                       return s.lens()[l] == first_shape_lens[l];
-                   }))
-                {
-                    MIGRAPHX_THROW("CONCAT: Non-axis dimensions should match");
-                }
-            }
-        }
-        std::size_t new_dim_axis = 0;
-        for(const auto& input : inputs)
-        {
-            const auto& lens = input.lens();
-            new_dim_axis += lens[axis];
-        }
-        std::vector<std::size_t> new_lens;
-        std::copy(first_shape_lens.begin(), first_shape_lens.end(), std::back_inserter(new_lens));
-        new_lens[axis] = new_dim_axis;
-        return shape::from_permutation(type, new_lens, find_permutation(inputs));
+        // inputs can contain 1 or more shapes (variadic). compute_shape_op ensures there must
+        // be at least 1.
+        check_shapes{inputs, *this, true}.same_ndims().same_type();
+        if(std::none_of(inputs.begin(), inputs.end(), [&](const shape& s) { return s.dynamic(); }))
+        {
+            // Static input shapes
+            const auto& first_shape_lens = inputs.front().lens();
+            const auto& type             = inputs.front().type();
+            for(std::size_t ll = 0; ll < first_shape_lens.size(); ll++)
+            {
+                if(ll != axis)
+                {
+                    if(not std::all_of(inputs.begin(), inputs.end(), [&](auto s) {
+                           return s.lens()[ll] == first_shape_lens[ll];
+                       }))
+                    {
+                        MIGRAPHX_THROW("CONCAT: all input dimensions should match along axis " +
+                                       std::to_string(ll));
+                    }
+                }
+            }
+            std::size_t new_dim_axis = 0;
+            for(const auto& input : inputs)
+            {
+                const auto& lens = input.lens();
+                new_dim_axis += lens[axis];
+            }
+            std::vector<std::size_t> new_lens = first_shape_lens;
+            new_lens[axis]                    = new_dim_axis;
+            return shape::from_permutation(type, new_lens, find_permutation(inputs));
+        }
+        else if(std::all_of(
+                    inputs.begin(), inputs.end(), [&](const shape& s) { return s.dynamic(); }))
+        {
+            // Dynamic input shapes
+            for(std::size_t index = 0; index < inputs[0].ndim(); index++)
+            {
+                if(index != axis)
+                {
+                    if(not std::all_of(inputs.begin(), inputs.end(), [&](const shape& s) {
+                           return s.dyn_dims()[index] == inputs[0].dyn_dims()[index];
+                       }))
+                        MIGRAPHX_THROW("CONCAT: all input dimensions should match in axis " +
+                                       std::to_string(index));
+                }
+            }
+            std::size_t new_min = 0;
+            std::size_t new_max = 0;
+            for(const auto& input : inputs)
+            {
+                auto ddim = input.dyn_dims()[axis];
+                new_min += ddim.min;
+                new_max += ddim.max;
+            }
+            auto new_dims  = inputs[0].dyn_dims();
+            new_dims[axis] = migraphx::shape::dynamic_dimension{new_min, new_max, 0};
+            return {inputs[0].type(), new_dims};
+        }
+        else
+        {
+            MIGRAPHX_THROW("CONCAT: Cannot mix static and dynamic input shapes.");
+        }
     }
-    argument compute(const shape& output_shape, std::vector<argument> args) const
+    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
     {
-        argument result{output_shape};
-        std::vector<std::size_t> coffsets = compute_offsets(output_shape, args);
+        argument result{dyn_out.computed_shape};
+        std::vector<std::size_t> coffsets = compute_offsets(dyn_out.computed_shape, args);
         for(std::size_t l = 0; l < args.size(); l++)
         {
             auto argl = args[l];
             visit_all(result, argl)([&](auto output, auto input) {
-                auto slice_shape =
-                    shape{output_shape.type(), input.get_shape().lens(), output_shape.strides()};
-                auto slice = make_view(slice_shape, output.data() + coffsets[l]);
+                auto slice_shape = shape{dyn_out.computed_shape.type(),
+                                         input.get_shape().lens(),
+                                         dyn_out.computed_shape.strides()};
+                auto slice       = make_view(slice_shape, output.data() + coffsets[l]);
                 std::copy(input.begin(), input.end(), slice.begin());
             });
         }
......
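A worked example of the dynamic rule above: concatenating axis-0 dynamic dimensions {2, 4} and {3, 5} yields {5, 9}, with the optimum reset to 0. A minimal sketch (dyn_dim is a stand-in for migraphx::shape::dynamic_dimension, assumed to carry {min, max, opt} as the constructor call above suggests):

    #include <cstddef>

    struct dyn_dim
    {
        std::size_t min;
        std::size_t max;
        std::size_t opt;
    };

    // Ranges add along the concat axis; all other axes must match exactly.
    dyn_dim concat_axis(const dyn_dim& a, const dyn_dim& b)
    {
        return {a.min + b.min, a.max + b.max, 0};
    }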
@@ -26,6 +26,7 @@
 #include <array>
 #include <migraphx/check_shapes.hpp>
+#include <migraphx/dyn_output.hpp>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/streamutils.hpp>
 #include <migraphx/literal.hpp>
@@ -61,35 +62,59 @@ struct gather
     shape normalize_compute_shape(std::vector<shape> inputs) const
     {
-        check_shapes{inputs, *this}.has(2);
-        auto lens = inputs[0].lens();
-        auto type = inputs[0].type();
-        lens.erase(lens.begin() + axis);
-        if(not inputs[1].scalar())
+        check_shapes{inputs, *this, true}.has(2);
+        shape data    = inputs[0];
+        shape indices = inputs[1];
+        auto type     = data.type();
+        // If index_dims is dynamic, convert the data to dynamic too.
+        if(indices.dynamic())
         {
-            auto ind_lens = inputs[1].lens();
-            lens.insert(lens.begin() + axis, ind_lens.begin(), ind_lens.end());
+            data = data.to_dynamic();
         }
-        // for scalar output
-        if(lens.empty())
+        if(data.dynamic())
         {
-            return {type};
+            auto dims = data.dyn_dims();
+            dims.erase(dims.begin() + axis);
+            if(not indices.scalar())
+            {
+                auto index_dims = indices.to_dynamic().dyn_dims();
+                dims.insert(dims.begin() + axis, index_dims.begin(), index_dims.end());
+            }
+            return {type, dims};
         }
-        return {type, lens};
+        else
+        {
+            // Both data and indices are static. indices may be scalar
+            auto lens = data.lens();
+            lens.erase(lens.begin() + axis);
+            if(not indices.scalar())
+            {
+                auto ind_lens = indices.lens();
+                lens.insert(lens.begin() + axis, ind_lens.begin(), ind_lens.end());
+            }
+            // for scalar output
+            if(lens.empty())
+            {
+                return {type};
+            }
+            return {type, lens};
+        }
     }
-    argument compute(const shape& output_shape, std::vector<argument> args) const
+    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
     {
-        argument result{output_shape};
+        argument result{dyn_out.computed_shape};
         // negative axis means counting dimensions from back
         auto lens                 = args[0].get_shape().lens();
         std::size_t axis_dim_size = lens[axis];
         // max dimension in axis
         visit_all(result, args[0])([&](auto output, auto data) {
             args[1].visit([&](auto indices) {
-                if(output_shape.scalar())
+                if(dyn_out.computed_shape.scalar())
                 {
                     auto in_index = indices.front();
                     in_index      = (in_index < 0) ? in_index + axis_dim_size : in_index;
......
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -25,6 +25,7 @@
 #define MIGRAPHX_GUARD_OPERATORS_GATHERND_HPP
 #include <migraphx/check_shapes.hpp>
+#include <migraphx/dyn_output.hpp>
 #include <migraphx/shape_for_each.hpp>
 #include <migraphx/par_for.hpp>
 #include <migraphx/argument.hpp>
@@ -47,33 +48,103 @@ struct gathernd
     shape compute_shape(std::vector<shape> inputs) const
     {
-        check_shapes{inputs, *this}.has(2);
-        auto r = inputs.front().lens().size();
-        auto q = inputs.back().lens().size();
-        auto k = inputs.back().lens().back();
+        check_shapes{inputs, *this, true}.has(2);
+        auto i_shape    = inputs.back();
+        auto data_shape = inputs.front();
+        auto r          = data_shape.ndim();
+        auto q          = i_shape.ndim();
+        size_t k;
+        if(i_shape.dynamic())
+        {
+            // the rank of the output is a function of k, so it must be fixed.
+            if(not i_shape.dyn_dims().back().is_fixed())
+            {
+                MIGRAPHX_THROW(
+                    "GATHERND: last dimension of indices tensor must be fixed (min=max)");
+            }
+            k = i_shape.dyn_dims().back().min;
+        }
+        else
+            k = i_shape.lens().back();
+        // Begin input validation checks.
+        int output_ndim = int(q) + r - k - batch_dims - 1;
         if(k > r - batch_dims)
         {
             MIGRAPHX_THROW("GATHERND: Indices of length " + std::to_string(k) +
                            " cannot be used to access data of rank " +
                            std::to_string(r - batch_dims));
         }
-        auto indices_lens_iter = inputs.back().lens().begin();
-        auto output_lens_size  = q + r - k - batch_dims - 1;
-        std::vector<std::size_t> output_lens(output_lens_size);
-        std::copy(indices_lens_iter, indices_lens_iter + (q - 1), output_lens.begin());
-        if(k < r - batch_dims)
+        if(batch_dims >= q or batch_dims >= r)
+        {
+            MIGRAPHX_THROW("GATHERND: rank of an input cannot be less than batch_dims=" +
+                           std::to_string(batch_dims));
+        }
+        if(output_ndim < 0)
+        {
+            MIGRAPHX_THROW("GATHERND: Indices too large for static data input: k=" +
+                           std::to_string(k));
+        }
+        if(migraphx::none_of(inputs, [](auto v) { return v.dynamic(); }))
+        {
+            auto indices_lens_iter = i_shape.lens().begin();
+            // A rank 0 output is a scalar
+            if(output_ndim == 0)
+                return shape{data_shape.type(), {1}};
+            // Part of the output shape comes from indices tensor, part from data tensor
+            std::vector<std::size_t> output_lens(output_ndim);
+            std::copy(indices_lens_iter, indices_lens_iter + (q - 1), output_lens.begin());
+            // fill the rest of output shape from data tensor
+            if(k + batch_dims < r)
+            {
+                auto data_lens = data_shape.lens();
+                std::copy(data_lens.begin() + batch_dims + k,
+                          data_lens.end(),
+                          output_lens.begin() + q - 1);
+            }
+            shape output_shape{data_shape.type(), output_lens};
+            return output_shape;
+        }
+        else
         {
-            auto data_lens = inputs.front().lens();
-            std::copy(
-                data_lens.begin() + batch_dims + k, data_lens.end(), output_lens.begin() + q - 1);
+            // If one or both inputs are dynamic shapes, the output is dynamic.
+            // Make both inputs dynamic to simplify computations.
+            data_shape = data_shape.to_dynamic();
+            i_shape    = i_shape.to_dynamic();
+            // A rank 0 output is a scalar
+            if(output_ndim == 0)
+                return shape(data_shape.type(), {shape::dynamic_dimension({1, 1, 0})});
+            // Part of the output shape comes from indices tensor, part from data tensor
+            std::vector<shape::dynamic_dimension> output_dims(output_ndim);
+            std::copy(i_shape.dyn_dims().begin(),
+                      i_shape.dyn_dims().begin() + q - 1,
+                      output_dims.begin());
+            // fill the rest of output shape from data tensor
+            if(k + batch_dims < r)
+            {
+                auto data_dims = data_shape.dyn_dims();
+                std::copy(data_dims.begin() + batch_dims + k,
+                          data_dims.begin() + r,
+                          output_dims.begin() + q - 1);
+            }
+            shape output_shape(data_shape.type(), output_dims);
+            return output_shape;
         }
-        shape output_shape{inputs.front().type(), output_lens};
-        return output_shape;
     }
-    argument compute(const shape& output_shape, std::vector<argument> args) const
+    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
     {
-        argument result{output_shape};
+        argument result{dyn_out.computed_shape};
         visit_all(result, args[0])([&](auto output, auto data) {
             args[1].visit([&](auto indices) {
                 auto indices_shape = indices.get_shape();
......
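A worked check of the rank arithmetic: data of shape (2, 3, 4) gives r = 3, indices of shape (5, 2) give q = 2 and k = 2, so with batch_dims = 0 the output rank is q + r - k - batch_dims - 1 = 2 + 3 - 2 - 0 - 1 = 2, and the output shape is (5, 4): the leading 5 from the indices, the trailing 4 from the un-indexed data dimension.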
@@ -31,18 +31,30 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace op {
-// different attributes
-// 1) use_input(default)/use_output
-// 2) use_rank(default)/use_len
-// 3) clip_min(default)/not_clip_min
-// 3.1) include_min(default)/exclude_min
-// 4) clip_max(default)/not_clip_max
-// 4.1) exclude_max(default)/include_max
-// 5) normalize padding
+/**
+ * `normalize_attribute` settings:
+ * Note that default options are not included as enums.
+ * 1. `use_input` (default) vs. `use_output`:
+ *    Affects the rank of the attribute.
+ *    `use_input -> lens.size()`, `use_output -> lens.size() + vec.size()`.
+ * 2. `use_rank` (default) vs. `use_len`:
+ *    `use_rank` sets the max value/index of the attribute to the rank of lens.
+ *    `use_len` sets the max value/index to the corresponding value in lens at the axes index.
+ * 3. `clip_min` vs. `not_clip_min` (default):
+ *    Whether to clip values less than the minimum.
+ * 4. `include_min` vs. `exclude_min` (default):
+ *    Include or exclude the minimum value/index for range checking and clipping.
+ * 5. `clip_max` vs. `not_clip_max` (default):
+ *    Whether to clip values greater than the maximum.
+ * 6. `include_max` vs. `exclude_max` (default):
+ *    Include or exclude the maximum value/index for range checking and clipping.
+ * 7. `normalize_padding`:
+ *    Normalize the padding to `2*(pad ndim)` dimensions.
+ */
 enum class normalize_attribute
 {
-    use_len,
     use_output,
+    use_len,
     clip_max,
     clip_min,
     include_max,
......
@@ -28,44 +28,89 @@
 #include <migraphx/check_shapes.hpp>
 #include <migraphx/argument.hpp>
 #include <migraphx/par_for.hpp>
+#include <migraphx/ranges.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace op {
+/**
+ * @brief N-dimensional scatter operations.
+ *
+ * This struct is the parent class of ops that differ in the formula used to reduce
+ * (combine old and new values of) the scattered value. It was originally based on the
+ * Onnx ScatterND operation (see
+ * https://github.com/onnx/onnx/blob/main/docs/Operators.md#ScatterND) and is also
+ * similar to numpy.add.at().
+ *
+ * @tparam Derived a template parameter in the CRTP inheritance idiom; represents one of
+ * the child operations.
+ */
 template <class Derived>
 struct scatternd_op : op_name<Derived>
 {
+    /** Validate input shapes and return the correct output shape. For scatter ops, the
+     * output has the same shape as the data tensor (first input), but cast to a
+     * standard shape.
+     */
     shape compute_shape(std::vector<shape> inputs) const
     {
-        check_shapes{inputs, *this}.has(3);
-        auto r         = inputs.front().lens().size();
-        auto q         = inputs.at(1).lens().size();
-        auto k         = inputs.at(1).lens().back();
-        auto ind_lens  = inputs.at(1).lens();
-        auto upd_lens  = inputs.back().lens();
-        auto data_lens = inputs.front().lens();
+        check_shapes{inputs, *this, true}.has(3);
+        auto data_shape  = inputs.front();
+        auto index_shape = inputs.at(1);
+        auto upd_shape   = inputs.back();
+        auto r = data_shape.ndim();
+        auto q = index_shape.ndim();
+        size_t k;
+        if(index_shape.dynamic())
+        {
+            // the rank of the output is a function of k, so k must be fixed.
+            if(not index_shape.dyn_dims().back().is_fixed())
+            {
+                MIGRAPHX_THROW(
+                    "ScatterND: last dimension of indices tensor must be fixed (min=max)");
+            }
+            k = index_shape.dyn_dims().back().min;
+        }
+        else
+            k = index_shape.lens().back();
+        // Checks on the sizes of input tensors
+        if(q + r != upd_shape.ndim() + k + 1)
+            MIGRAPHX_THROW("ScatterND: ranks of inputs don't match. " + std::to_string(q) + " + " +
+                           std::to_string(r) + " - " + std::to_string(k) +
+                           " - 1 != " + std::to_string(upd_shape.ndim()));
         if(k > r)
             MIGRAPHX_THROW("ScatterND: index of size " + std::to_string(k) +
                            " is too large for tensor of rank " + std::to_string(r));
-        if(not(std::equal(ind_lens.begin(), ind_lens.begin() + q - 1, upd_lens.begin()) and
-               std::equal(data_lens.begin() + k, data_lens.end(), upd_lens.begin() + q - 1)))
-            MIGRAPHX_THROW("ScatterND: incorrect update shape. update.lens != indices.lens[0:q-1] "
-                           "++ data.lens[k:r-1]");
-        auto s = inputs.front();
-        if(s.broadcasted())
+        // Convert all static shape dimensions to dynamic so they can be compared.
+        // It's possible for some of the 3 inputs to be dynamic shapes and some static,
+        // but any dynamic dimension that's compared to a static dimension must be fixed.
+        auto ind_dims  = index_shape.to_dynamic().dyn_dims();
+        auto upd_dims  = upd_shape.to_dynamic().dyn_dims();
+        auto data_dims = data_shape.to_dynamic().dyn_dims();
+        // Check that corresponding portions of tensor shapes match.
+        if(not(std::equal(ind_dims.begin(), ind_dims.begin() + q - 1, upd_dims.begin()) and
+               std::equal(data_dims.begin() + k, data_dims.end(), upd_dims.begin() + q - 1)))
+            MIGRAPHX_THROW("ScatterND: incorrect update shape. Update dimensions must match "
+                           "indices and data.");
+        if(data_shape.dynamic())
+            return data_shape;
+        else if(data_shape.broadcasted())
         {
-            return {s.type(), s.lens()};
+            return {data_shape.type(), data_shape.lens()};
         }
         else
         {
-            return s.with_lens(s.lens());
+            return data_shape.with_lens(data_shape.lens());
         }
     }
-    argument compute(const shape& output_shape, std::vector<argument> args) const
+    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
     {
-        argument result{output_shape};
+        argument result{dyn_out.computed_shape};
         auto& self = static_cast<const Derived&>(*this);
         visit_all(result, args[0], args[2])([&](auto output, auto data, auto updates) {
             std::copy(data.begin(), data.end(), output.begin());
@@ -74,8 +119,8 @@ struct scatternd_op : op_name<Derived>
                 auto updates_std   = shape{updates_shape.type(), updates_shape.lens()};
                 auto indices_shape = indices.get_shape();
                 auto k             = indices_shape.lens().back();
-                auto q             = indices_shape.lens().size();
-                auto r             = output_shape.lens().size();
+                auto q             = indices_shape.ndim();
+                auto r             = dyn_out.computed_shape.ndim();
                 par_for(updates_shape.elements(), [&](const auto i) {
                     auto updates_idx = updates_std.multi(i);
                     std::vector<std::size_t> indices_idx(q, 0);
@@ -89,7 +134,7 @@ struct scatternd_op : op_name<Derived>
                     std::copy(index_start, index_end, out_idx.begin());
                     std::copy(updates_idx.begin() + q - 1, updates_idx.end(), out_idx.begin() + k);
-                    self.reduction()(output[output_shape.index(out_idx)], updates[i]);
+                    self.reduction()(output[dyn_out.computed_shape.index(out_idx)], updates[i]);
                 });
             });
         });
......
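A worked check of the new rank test: data of shape (4, 4, 4) gives r = 3 and indices of shape (2, 1) give q = 2 and k = 1, so updates must have shape indices.lens[0:q-1] ++ data.lens[k:r] = (2) ++ (4, 4) = (2, 4, 4); then q + r = 5 equals upd.ndim() + k + 1 = 3 + 1 + 1.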
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -42,9 +42,17 @@ struct where
     shape compute_shape(std::vector<shape> inputs) const
     {
-        check_shapes{inputs, *this}.has(3).same_dims();
+        check_shapes{inputs, *this, true}.has(3).same_dims();
         auto s1 = inputs.at(1);
         auto s2 = inputs.at(2);
+        if(s1.dynamic() or s2.dynamic())
+        {
+            if(s1 == s2)
+                return s1;
+            MIGRAPHX_THROW("WHERE: dynamic input shapes must be the same");
+        }
+        // Compare two static shapes, returning a standard shape
         if(s1 == s2 and s1.packed())
         {
             return s1;
@@ -63,12 +71,12 @@ struct where
         }
     }
-    argument compute(const shape& output_shape, std::vector<argument> args) const
+    argument compute(const dyn_output& dyn_out, std::vector<argument> args) const
     {
-        argument result{output_shape};
+        argument result{dyn_out.computed_shape};
         visit_all(result, args[1], args[2])([&](auto output, const auto x, const auto y) {
             args[0].visit([&](const auto condition) {
-                par_for(output_shape.elements(),
+                par_for(dyn_out.computed_shape.elements(),
                         [&](auto i) { output[i] = condition[i] ? x[i] : y[i]; });
             });
         });
......
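The element selection itself is unchanged by this patch: with condition {1, 0, 1}, x = {1, 2, 3} and y = {10, 20, 30}, the output is {1, 20, 3}; the new dynamic path only requires that both value inputs carry identical dynamic shapes.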