Unverified commit 18cf0435 authored by Umang Yadav, committed by GitHub

Merge branch 'develop' into blas_tuning

parents 12258d8f 3e8d7196
# Ignore everything
**
# Allow files and directories
!*.txt
!*.ini
!/tools/*.sh
!/doc/*.txt
!/test/onnx/.onnxrt-commit
name: migraphx
on: [push, pull_request]
on:
pull_request:
push:
branches:
- develop
- master
- 'release/**'
jobs:
cancel:
......@@ -17,40 +24,29 @@ jobs:
- name: Free space
run: |
sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android /usr/local/graalvm /usr/local/aws* /usr/local/lib/heroku
du . --max-depth=1 -h
ls -la
cd /usr/local
du . --max-depth=1 -h
ls -la
cd /usr/local/lib
echo $(pwd)
du . --max-depth=1 -h
ls -la
- uses: actions/checkout@v3
# In this step, this action saves a list of existing images,
# the cache is created without them in the post run.
# It also restores the cache if it exists.
# name: Docker Layer Caching2
- uses: jpribyl/action-docker-layer-caching@v0.1.1
- name: Docker layer cache
uses: jpribyl/action-docker-layer-caching@v0.1.1
with:
key: docker-layer-caching-migraphx-${{hashFiles('hip-clang.docker', '**/*requirements.txt', '**/install_prereqs.sh', 'rbuild.ini')}}
restore-keys:
docker-layer-caching-migraphx-
# Ignore the failure of a step and avoid terminating the job.
continue-on-error: true
- name: Prepare timestamp
id: cache_timestamp
shell: bash
run: echo timestamp="$(date +'%Y-%m-%dT%H:%M:%S')" >> $GITHUB_OUTPUT
- name: Cache files for tidy
uses: pat-s/always-upload-cache@v3.0.11
- name: Restore cache files for tidy
uses: actions/cache/restore@v3
id: tidy_restore
with:
path: tidy-cache
key: tidy-cache-${{ steps.cache_timestamp.outputs.timestamp }}
restore-keys: |
tidy-cache-${{ steps.cache_timestamp.outputs.timestamp }}
tidy-cache-
key: tidy-cache-${{ github.ref }}
restore-keys: tidy-cache-
- name: Build the Docker image
run: docker build . --file hip-clang.docker --tag migraphx
......@@ -70,6 +66,25 @@ jobs:
..
make -j2 -k onnx-proto tf-proto tidy
# GH Actions cannot update an existing cache; as a workaround, clear the cache and then save it
- name: Clear tidy cache before saving
if: ${{ steps.tidy_restore.outputs.cache-hit }}
shell: bash
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
gh extension install actions/gh-actions-cache --pin v1.0.1
gh actions-cache delete ${{ steps.tidy_restore.outputs.cache-matched-key }} --confirm
continue-on-error: true
- name: Save cache files for tidy
uses: actions/cache/save@v3
if: always()
with:
path: tidy-cache
key: tidy-cache-${{ github.ref }}
cppcheck:
runs-on: ubuntu-20.04
......@@ -81,23 +96,22 @@ jobs:
# In this step, this action saves a list of existing images,
# the cache is created without them in the post run.
# It also restores the cache if it exists.
- uses: jpribyl/action-docker-layer-caching@v0.1.1
- name: Docker layer cache
uses: jpribyl/action-docker-layer-caching@v0.1.1
with:
key: docker-layer-caching-migraphx-${{hashFiles('hip-clang.docker', '**/*requirements.txt', '**/install_prereqs.sh', 'rbuild.ini')}}
restore-keys:
docker-layer-caching-migraphx-
# Ignore the failure of a step and avoid terminating the job.
continue-on-error: true
- name: Prepare timestamp
id: cache_timestamp
shell: bash
run: echo timestamp="$(date +'%Y-%m-%dT%H:%M:%S')" >> $GITHUB_OUTPUT
- name: Cache files for cppcheck
uses: pat-s/always-upload-cache@v2.1.3
- name: Restore cache files for cppcheck
id: cppcheck_restore
uses: actions/cache/restore@v3
with:
path: cppcheck-cache
key: cppcheck-cache-${{ hashFiles('cppcheck.rules', 'CMakeLists.txt') }}-${{ steps.cache_timestamp.outputs.timestamp }}
restore-keys: |
cppcheck-cache-${{ hashFiles('cppcheck.rules', 'CMakeLists.txt') }}-${{ steps.cache_timestamp.outputs.timestamp }}
cppcheck-cache-${{ hashFiles('cppcheck.rules', 'CMakeLists.txt') }}-
key: cppcheck-cache-${{ hashFiles('cppcheck.rules', 'CMakeLists.txt') }}-${{ github.ref }}
restore-keys: cppcheck-cache-${{ hashFiles('cppcheck.rules', 'CMakeLists.txt') }}-
- name: Build the Docker image
run: docker build . --file hip-clang.docker --tag migraphx
......@@ -114,6 +128,25 @@ jobs:
..
make -j2 cppcheck
# GH Actions cannot update an existing cache; as a workaround, clear the cache and then save it
- name: Clear cppcheck cache before saving
if: ${{ steps.cppcheck_restore.outputs.cache-hit }}
shell: bash
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
gh extension install actions/gh-actions-cache --pin v1.0.1
gh actions-cache delete ${{ steps.cppcheck_restore.outputs.cache-matched-key }} --confirm
continue-on-error: true
- name: Save cache files for cppcheck
uses: actions/cache/save@v3
if: always()
with:
path: cppcheck-cache
key: cppcheck-cache-${{ hashFiles('cppcheck.rules', 'CMakeLists.txt') }}-${{ github.ref }}
format:
runs-on: ubuntu-20.04
......@@ -125,7 +158,12 @@ jobs:
# In this step, this action saves a list of existing images,
# the cache is created without them in the post run.
# It also restores the cache if it exists.
- uses: jpribyl/action-docker-layer-caching@v0.1.1
- name: Docker layer cache
uses: jpribyl/action-docker-layer-caching@v0.1.1
with:
key: docker-layer-caching-migraphx-${{hashFiles('hip-clang.docker', '**/*requirements.txt', '**/install_prereqs.sh', 'rbuild.ini')}}
restore-keys:
docker-layer-caching-migraphx-
# Ignore the failure of a step and avoid terminating the job.
continue-on-error: true
......@@ -206,8 +244,13 @@ jobs:
- codecov
steps:
- name: Free space
run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android /usr/local/graalvm /usr/local/aws* /usr/local/lib/heroku
- name: Free space and install rbuild, lld
run: |
sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android /usr/local/graalvm /usr/local/aws* /usr/local/lib/heroku
sudo apt-get install -y lld
python -m pip install --upgrade pip
pip install https://github.com/RadeonOpenCompute/rbuild/archive/master.tar.gz
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
......@@ -217,36 +260,25 @@ jobs:
# Ignore the failure of a step and avoid terminating the job.
continue-on-error: true
uses: actions/cache@v3
id: deps_cache
with:
# This path is specific to Ubuntu
path: ${{ github.workspace }}/cget
# Look to see if there is a cache hit for the corresponding requirements file
key:
${{ matrix.os }}-cget-4-${{ hashFiles('requirements.txt', 'dev-requirements.txt') }}
${{ matrix.os }}-cget-4-
key: ${{ matrix.os }}-cget-4-${{ hashFiles('requirements.txt', 'dev-requirements.txt') }}
restore-keys: ${{ matrix.os }}-cget-4-
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install https://github.com/RadeonOpenCompute/rbuild/archive/master.tar.gz
rbuild prepare -d cget -s gh
sudo apt-get install -y lld
- name: Prepare timestamp
id: cache_timestamp
shell: bash
run: echo timestamp="$(date +'%Y-%m-%dT%H:%M:%S')" >> $GITHUB_OUTPUT
if: steps.deps_cache.outputs.cache-hit != 'true'
run: rbuild prepare -d cget -s gh
- name: Cache files for ccache
# Ignore the failure of a step and avoid terminating the job.
continue-on-error: true
uses: pat-s/always-upload-cache@v2.1.3
- name: Restore cache files for ccache
uses: actions/cache/restore@v3
id: ccache_restore
with:
path: ccache
key: ${{ matrix.os }}-${{ matrix.configuration }}-ccache-${{ steps.cache_timestamp.outputs.timestamp }}
restore-keys: |
${{ matrix.os }}-${{ matrix.configuration }}-ccache-${{ steps.cache_timestamp.outputs.timestamp }}
${{ matrix.os }}-${{ matrix.configuration }}-ccache-
path: ${{ github.workspace }}/ccache
key: ${{ matrix.os }}-${{ matrix.configuration }}-ccache-${{ github.ref }}
restore-keys: ${{ matrix.os }}-${{ matrix.configuration }}-ccache-
- name: Build and test
env:
......@@ -266,6 +298,23 @@ jobs:
-DCMAKE_SHARED_LINKER_FLAGS='-fuse-ld=lld'
${{ github.workspace }}/cget/bin/ccache -s
# GH Actions cannot update an existing cache; as a workaround, clear the cache and then save it
- name: Clear ccache cache before saving
if: ${{ steps.ccache_restore.outputs.cache-hit }}
shell: bash
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
gh extension install actions/gh-actions-cache --pin v1.0.1
gh actions-cache delete ${{ steps.ccache_restore.outputs.cache-matched-key }} --confirm
- name: Save cache files for ccache
uses: actions/cache/save@v3
if: always()
with:
path: ${{ github.workspace }}/ccache
key: ${{ matrix.os }}-${{ matrix.configuration }}-ccache-${{ github.ref }}
- name: Upload code coverage
if: "matrix.configuration == 'codecov'"
env:
......@@ -309,6 +358,7 @@ jobs:
uses: actions/setup-python@v4
with:
python-version: 3.7
- name: Cache dependencies
# Ignore the failure of a step and avoid terminating the job.
continue-on-error: true
......@@ -317,9 +367,8 @@ jobs:
# This path is specific to Ubuntu
path: ${{ github.workspace }}/cget
# Look to see if there is a cache hit for the corresponding requirements file
key:
${{ matrix.os }}-cget-4-${{ hashFiles('requirements.txt', 'dev-requirements.txt') }}
${{ matrix.os }}-cget-4-
key: ${{ matrix.os }}-cget-4-${{ hashFiles('requirements.txt', 'dev-requirements.txt') }}
restore-keys: ${{ matrix.os }}-cget-4-
- name: Install dependencies
......@@ -328,22 +377,15 @@ jobs:
pip install https://github.com/RadeonOpenCompute/rbuild/archive/master.tar.gz
rbuild prepare -d cget -s gh
sudo apt-get install -y lld
- name: Prepare timestamp
id: cache_timestamp
shell: bash
run: echo timestamp="$(date +'%Y-%m-%dT%H:%M:%S')" >> $GITHUB_OUTPUT
- name: Cache files for ccache
# Ignore the failure of a step and avoid terminating the job.
continue-on-error: true
uses: pat-s/always-upload-cache@v2.1.3
- name: Restore cache files for ccache
id: ccache_restore_fpga
uses: actions/cache/restore@v3
with:
path: ccache
key: ${{ matrix.os }}-${{ matrix.configuration }}-ccache-${{ steps.cache_timestamp.outputs.timestamp }}
restore-keys: |
${{ matrix.os }}-${{ matrix.configuration }}-ccache-${{ steps.cache_timestamp.outputs.timestamp }}
${{ matrix.os }}-${{ matrix.configuration }}-ccache-
path: ${{ github.workspace }}/ccache
key: ${{ matrix.os }}-${{ matrix.configuration }}-ccache-${{ github.ref }}
restore-keys: ${{ matrix.os }}-${{ matrix.configuration }}-ccache-
- name: Build and test
env:
CMAKE_PREFIX_PATH: ${{ github.workspace }}/cget
......@@ -363,17 +405,36 @@ jobs:
-DMIGRAPHX_ENABLE_FPGA=On
${{ github.workspace }}/cget/bin/ccache -s
#- name: Upload code coverage
# if: "matrix.configuration == 'codecov'"
# env:
# CODECOV_TOKEN: "8545af1c-f90b-4345-92a5-0d075503ca56"
# run: |
# sudo apt-get install -y lcov
# cd build
# lcov --directory . --capture --output-file $(pwd)/coverage.info
# lcov --remove $(pwd)/coverage.info '/usr/*' --output-file $(pwd)/coverage.info
# lcov --list $(pwd)/coverage.info
# curl -Os https://uploader.codecov.io/latest/linux/codecov
# chmod +x codecov
# ./codecov -t ${CODECOV_TOKEN}
# echo "Uploaded"
# Workaround: GH Actions cannot update an existing cache, so clear it and then save it again
- name: Clear ccache cache before saving
if: ${{ steps.ccache_restore_fpga.outputs.cache-hit }}
shell: bash
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
gh extension install actions/gh-actions-cache
gh actions-cache delete ${{ steps.ccache_restore_fpga.outputs.cache-matched-key }} --confirm
continue-on-error: true
- name: Save cache files for ccache
uses: actions/cache/save@v3
if: always()
with:
path: ${{ github.workspace }}/ccache
key: ${{ matrix.os }}-${{ matrix.configuration }}-ccache-${{ github.ref }}
#- name: Upload code coverage
# if: "matrix.configuration == 'codecov'"
# env:
# CODECOV_TOKEN: "8545af1c-f90b-4345-92a5-0d075503ca56"
# run: |
# sudo apt-get install -y lcov
# cd build
# lcov --directory . --capture --output-file $(pwd)/coverage.info
# lcov --remove $(pwd)/coverage.info '/usr/*' --output-file $(pwd)/coverage.info
# lcov --list $(pwd)/coverage.info
# curl -Os https://uploader.codecov.io/latest/linux/codecov
# chmod +x codecov
# ./codecov -t ${CODECOV_TOKEN}
# echo "Uploaded"
name: Cleanup caches of closed PR
on:
pull_request:
types:
- closed
jobs:
cleanup:
runs-on: ubuntu-latest
steps:
- name: Check out code
uses: actions/checkout@v3
- name: Cleanup
run: |
gh extension install actions/gh-actions-cache --pin v1.0.1
REPO=${{ github.repository }}
BRANCH="refs/pull/${{ github.event.pull_request.number }}/merge"
echo "Fetching list of cache key"
cacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH | cut -f 1 | tail -n +3)
## Don't fail the workflow if deleting a cache key fails.
set +e
echo "Deleting caches..."
for cacheKey in $cacheKeysForPR
do
gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm
done
echo "Done"
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
......@@ -10,12 +10,38 @@ on:
description: Repository for benchmark utils
required: true
default: 'ROCmSoftwarePlatform/migraphx-benchmark-utils'
base_image:
description: Base image for rocm Docker build
required: true
default: "rocm/dev-ubuntu-20.04"
docker_image:
description: Docker image name for rocm Docker build
required: true
default: "rocm-migraphx"
build_navi:
description: Build navi number
required: true
default: "0"
organization:
type: string
description: Organization based on which location of files will be different
required: true
default: "AMD"
overwrite:
type: boolean
description: Overwrite image if it already exists
required: true
jobs:
release:
uses: ROCmSoftwarePlatform/migraphx-benchmark/.github/workflows/rocm-release.yml@main
with:
rocm_release: ${{ github.event.inputs.rocm_release }}
rocm_release: ${{ github.event.inputs.rocm_release || '5.1' }}
benchmark-utils_repo: ${{ github.event.inputs.benchmark-utils_repo || 'ROCmSoftwarePlatform/migraphx-benchmark-utils' }}
organization: ${{ github.event.inputs.organization || 'AMD' }}
base_image: ${{ github.event.inputs.base_image || 'rocm/dev-ubuntu-20.04' }}
docker_image: ${{ github.event.inputs.docker_image || 'rocm-migraphx' }}
build_navi: ${{ github.event.inputs.build_navi || '0' }}
overwrite: ${{ github.event.inputs.overwrite == 'true' }}
secrets:
gh_token: ${{ secrets.MIGRAPHX_BOT_TOKEN }}
name: Onnxruntime main weekly sync
on:
schedule:
- cron: '07 17 * * 5'
......
......@@ -110,7 +110,7 @@ RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXR
ADD tools/build_and_test_onnxrt.sh /onnxruntime/build_and_test_onnxrt.sh
RUN cget -p /usr/local install ROCmSoftwarePlatform/rocMLIR@acb727b348086b58a7f261b32c0e4f0686a4c0ee -DBUILD_MIXR_TARGET=On -DLLVM_ENABLE_ZSTD=Off -DLLVM_ENABLE_THREADS=Off
RUN cget -p /usr/local install ROCmSoftwarePlatform/rocMLIR@55c6ee66cc7502db7950693b3e845676cbf400b1 -DBUILD_MIXR_TARGET=On -DLLVM_ENABLE_ZSTD=Off -DLLVM_ENABLE_THREADS=Off
ENV MIOPEN_FIND_DB_PATH=/tmp/miopen/find-db
ENV MIOPEN_USER_DB_PATH=/tmp/miopen/user-db
......
......@@ -56,7 +56,7 @@ build MIGraphX. The specific steps are as follows:
1) Install rocm-cmake, pip3, rocblas, and miopen-hip with the command
```
sudo apt update && sudo apt install -y rocm-cmake python3-pip rocblas miopen-hip
sudo apt install -y rocm-cmake python3-pip rocblas miopen-hip
```
2) Install [rbuild](https://github.com/RadeonOpenCompute/rbuild) (sudo may be required here.)
......@@ -68,14 +68,11 @@ pip3 install https://github.com/RadeonOpenCompute/rbuild/archive/master.tar.gz
3) Build MIGraphX source code
```
rbuild build -d depend -B build --cxx=/opt/rocm/llvm/bin/clang++
rbuild build -d depend -B build
```
then all the prerequisites are in the folder `depend`, and MIGraphX is built in the `build` directory.
Note that ROCm 3.7 and later releases require Ubuntu 18.04 or later.
Instructions for upgrading are available at [Upgrade Ubuntu to 18.04](https://github.com/ROCmSoftwarePlatform/AMDMIGraphX/wiki/Upgrade-to-Ubuntu-18.04-for-ROCM3.7-or-later-releases).
Also note that you may see the error `rbuild: command not found`. This happens because rbuild is installed
to `$HOME/.local/bin`, which is not in `PATH` by default. Either add that folder to `PATH` or pass the
`--prefix /usr/local` option to the pip3 command when installing rbuild, as shown below.
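For example (a sketch, assuming pip3 installed rbuild into the default user location):
```
# Option 1: add the user-level install location to PATH
export PATH=$HOME/.local/bin:$PATH

# Option 2: install rbuild system-wide instead (sudo may be required)
pip3 install --prefix /usr/local https://github.com/RadeonOpenCompute/rbuild/archive/master.tar.gz
```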
......@@ -89,7 +86,7 @@ If using this approach, we need to install the prerequisites, configure the cmak
For convenience, the prerequisites can be built automatically with rbuild as:
```
rbuild build -d depend --cxx=/opt/rocm/llvm/bin/clang++
rbuild prepare -d depend
```
then all the prerequisites are in the folder `depend`, and they can be used in the `cmake` configuration
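A sketch of how the `depend` folder can then be referenced when configuring (the build directory name is arbitrary and the source directory is assumed to be the current one):
```
cmake -S . -B build -DCMAKE_PREFIX_PATH=$PWD/depend
```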
......@@ -174,7 +171,6 @@ To install:
dpkg -i <path_to_deb_file>
```
### Calling MIGraphX APIs
To use MIGraphX's C/C++ API in your CMake project, set `CMAKE_PREFIX_PATH` to the MIGraphX
installation location and then do
......@@ -184,8 +180,24 @@ target_link_libraries(myApp migraphx::c)
```
Where `myApp` is the cmake target in your project.
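For example, a minimal configure-and-build sequence might look like the following (a sketch; the install prefix `/opt/rocm` is an assumption, substitute your MIGraphX installation location):
```
# Tell CMake where to find the MIGraphX package config, then build as usual
cmake -S . -B build -DCMAKE_PREFIX_PATH=/opt/rocm
cmake --build build
```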
## Building for development
Using rbuild, the dependencies for development can be installed with:
```
rbuild develop
```
This will install the dependencies for development into the `deps` directory and
configure `cmake` to use those dependencies in the `build` directory. These
directories can be changed by passing the `--deps-dir` and `--build-dir` flags
to the `rbuild` command:
```
rbuild develop --build-dir build_rocm_55 --deps-dir /home/user/deps_dir
```
### Building the documentation
## Building the documentation
HTML and PDF documentation can be built using:
......
......@@ -32,6 +32,10 @@ Disable fast math optimization
Perform an exhaustive search to find the fastest version of generated kernels for selected backend
.. option:: --split-single-dyn-dim
Enable the split single dynamic dimension pass
.. option:: --fp16
Quantize for fp16
......
......@@ -24,7 +24,7 @@ Load as MIGraphX JSON
.. option:: --batch [unsigned int] (Default: 1)
Set batch size for model
For a static model, sets the batch size. For a dynamic batch model, sets the batch size at runtime.
.. option:: --nhwc
......@@ -46,6 +46,14 @@ Trim instructions from the end (Default: 0)
Dim of a parameter (format: "@name d1 d2 dn")
.. option:: --dyn-input-dim [std::vector<std::string>]
Set dynamic dimensions of a parameter using JSON formatting (format "@name" "dynamic_dimension_json")
.. option:: --default-dyn-dim
Set the default dynamic dimension (format {min:x, max:y, optimals:[o1,o2,...]})
.. option:: --optimize, -O
Optimize when reading
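For example, the dynamic-dimension options above take a JSON-like string per parameter (an illustrative sketch; the driver command, model file, and parameter name ``data`` are placeholders)::

    migraphx-driver compile resnet50.onnx --dyn-input-dim "@data" "[{min:1, max:64, optimals:[1, 8, 64]}, 3, 224, 224]"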
......
......@@ -29,7 +29,7 @@ See below for a comprehensive list of commands and option arguments, as well as
| --tf | Load file as a tensorflow graph |
| --migraphx | Load file as a migraphx graph |
| --migraphx-json | Load file as a migraphx JSON graph |
| --batch | Set batch size for the model |
| --batch | For a static model, sets the batch size. For a dynamic batch model, sets the batch size at runtime. |
| --nhwc | Treat tensorflow format as nhwc |
| --nchw | Treat tensorflow format as nchw |
| --skip-unknown-operators | Skip unknown operators when parsing and continue to parse |
......@@ -44,12 +44,16 @@ See below for a comprehensive list of commands and option arguments, as well as
| --output \| -o | Output to file |
| --fill0 | Fill parameter with 0s |
| --fill1 | Fill parameter with 1s |
| --input-dim | Set static dimensions of a parameter |
| --dyn-input-dim | Set dynamic dimensions of a parameter |
| --default-dyn-dim | Set default dynamic dimension |
| --gpu | Compile on the gpu |
| --cpu | Compile on the cpu |
| --ref | Compile on the reference implementation |
| --enable-offload-copy | Enable implicit offload copying |
| --disable-fast-math | Disable fast math optimization |
| --exhaustive-tune | Enable exhaustive search to find fastest kernel |
| --split-single-dyn-dim | Enable split_single_dyn_dim compiler pass |
| --fp16 | Quantize for fp16 |
| --int8 | Quantize for int8 |
| --tolerance | Tolerance for errors |
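A typical invocation combining several of these flags might look like the following (illustrative only; the driver command name and model file are placeholders):
```
migraphx-driver perf resnet50.onnx --gpu --fp16 --batch 32 --exhaustive-tune
```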
......
......@@ -14,6 +14,7 @@ define =
CMAKE_C_COMPILER_LAUNCHER=${deps_dir}/bin/ccache
CMAKE_CXX_COMPILER_LAUNCHER=${deps_dir}/bin/ccache
MIGRAPHX_ENABLE_CPU=On
BUILD_DEV=On
[develop]
cxx = ${rocm_path}/llvm/bin/clang++
......@@ -25,3 +26,4 @@ define =
CMAKE_C_COMPILER_LAUNCHER=${deps_dir}/bin/ccache
CMAKE_CXX_COMPILER_LAUNCHER=${deps_dir}/bin/ccache
MIGRAPHX_ENABLE_CPU=On
BUILD_DEV=On
......@@ -50,6 +50,7 @@ add_library(migraphx
env.cpp
file_buffer.cpp
fuse_pointwise.cpp
fuse_reduce.cpp
generate.cpp
inline_module.cpp
insert_pad.cpp
......
......@@ -148,10 +148,8 @@ shape common_shape(const std::vector<shape>& shapes)
return {compute_common_types(shapes), compute_common_lens(shapes)};
}
instruction_ref insert_common_op(module& m,
instruction_ref ins,
const operation& op,
std::vector<instruction_ref> inputs)
std::vector<instruction_ref>
insert_common_args(module& m, instruction_ref ins, std::vector<instruction_ref> inputs)
{
if(std::any_of(
inputs.cbegin(), inputs.cend(), [](auto input) { return input->get_shape().dynamic(); }))
......@@ -210,7 +208,20 @@ instruction_ref insert_common_op(module& m,
return input;
});
}
return m.insert_instruction(ins, op, inputs);
return inputs;
}
std::vector<instruction_ref> add_common_args(module& m, std::vector<instruction_ref> inputs)
{
return insert_common_args(m, m.end(), std::move(inputs));
}
instruction_ref insert_common_op(module& m,
instruction_ref ins,
const operation& op,
std::vector<instruction_ref> inputs)
{
return m.insert_instruction(ins, op, insert_common_args(m, ins, std::move(inputs)));
}
instruction_ref add_common_op(module& m, const operation& op, std::vector<instruction_ref> inputs)
......
......@@ -106,6 +106,13 @@ cpp_generator::function& cpp_generator::function::set_generic_types(const module
return *this;
}
cpp_generator::function& cpp_generator::function::add_generic_param(const std::string& pname)
{
params.push_back({pname, "T" + pname});
tparams.push_back("class T" + pname);
return *this;
}
struct cpp_generator_impl
{
std::stringstream fs{};
......@@ -182,7 +189,8 @@ std::string cpp_generator::generate_point_op(const operation& op,
std::string cpp_generator::str() const { return impl->fs.str(); }
cpp_generator::function cpp_generator::generate_module(const module& m)
cpp_generator::function cpp_generator::generate_module(const module& m,
const generate_module_callback& g)
{
function f;
auto name = transform_string(m.name(), [](char c) {
......@@ -195,13 +203,7 @@ cpp_generator::function cpp_generator::generate_module(const module& m)
if(ins->name() == "@literal")
return shape::cpp_type(ins->get_shape().type()) + "(" +
ins->get_literal().to_string() + ")";
std::vector<std::string> args;
std::transform(ins->inputs().begin(),
ins->inputs().end(),
std::back_inserter(args),
[&](auto i) { return names.at(i); });
auto s = this->generate_point_op(ins->get_operator(), args);
auto s = g(ins, names);
if(impl->fresult)
return impl->fresult(ins->get_shape()) + '(' + s + ')';
else
......@@ -210,6 +212,24 @@ cpp_generator::function cpp_generator::generate_module(const module& m)
return f;
}
std::vector<std::string>
cpp_generator::to_args(const std::vector<instruction_ref>& inputs,
const std::unordered_map<instruction_ref, std::string>& names)
{
std::vector<std::string> args;
std::transform(inputs.begin(), inputs.end(), std::back_inserter(args), [&](auto i) {
return names.at(i);
});
return args;
}
cpp_generator::function cpp_generator::generate_module(const module& m)
{
return this->generate_module(m, [&](auto ins, const auto& names) {
return this->generate_point_op(ins->get_operator(), to_args(ins->inputs(), names));
});
}
std::string cpp_generator::create_function(const cpp_generator::function& f)
{
impl->function_count++;
......@@ -218,6 +238,8 @@ std::string cpp_generator::create_function(const cpp_generator::function& f)
std::string name = f.name.empty() ? "f" + std::to_string(impl->function_count) : f.name;
impl->fs << join_strings(f.attributes, " ") << " " << f.return_type << " " << name;
char delim = '(';
if(f.params.empty())
impl->fs << delim;
for(auto&& p : f.params)
{
impl->fs << delim << p.type << " " << p.name;
......
......@@ -148,13 +148,21 @@ struct value_parser
template <MIGRAPHX_REQUIRES(not std::is_enum<T>{} and not is_multi_value<T>{})>
static T apply(const std::string& x)
{
T result;
std::stringstream ss;
ss.str(x);
ss >> result;
if(ss.fail())
throw std::runtime_error("Failed to parse '" + x + "' as " + type_name<T>::apply());
return result;
// return std::string values as-is; stream extraction would stop at the first whitespace
if constexpr(std::is_same<T, std::string>{})
{
return x;
}
else
{
T result;
std::stringstream ss;
ss.str(x);
ss >> result;
if(ss.fail())
throw std::runtime_error("Failed to parse '" + x + "' as " + type_name<T>::apply());
return result;
}
}
template <MIGRAPHX_REQUIRES(std::is_enum<T>{} and not is_multi_value<T>{})>
......
......@@ -33,6 +33,7 @@
#include <migraphx/tf.hpp>
#include <migraphx/onnx.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/convert_to_json.hpp>
#include <migraphx/load_save.hpp>
#include <migraphx/json.hpp>
#include <migraphx/version.h>
......@@ -68,7 +69,9 @@ struct loader
bool brief = false;
std::string output_type;
std::string output;
std::string default_dyn_dim;
std::vector<std::string> param_dims;
std::vector<std::string> dyn_param_dims;
std::vector<std::string> output_names;
void parse(argument_parser& ap)
......@@ -83,7 +86,11 @@ struct loader
ap(file_type, {"--tf"}, ap.help("Load as tensorflow"), ap.set_value("tf"));
ap(file_type, {"--migraphx"}, ap.help("Load as MIGraphX"), ap.set_value("migraphx"));
ap(file_type, {"--migraphx-json"}, ap.help("Load as MIGraphX JSON"), ap.set_value("json"));
ap(batch, {"--batch"}, ap.help("Set batch size for model"));
ap(batch,
{"--batch"},
ap.help("For a static model, sets default_dim_value size (commonly batch size). For a "
"dynamic batch model, sets the batch "
"size at runtime."));
ap(is_nhwc, {"--nhwc"}, ap.help("Treat tensorflow format as nhwc"), ap.set_value(true));
ap(skip_unknown_operators,
{"--skip-unknown-operators"},
......@@ -96,7 +103,16 @@ struct loader
ap.help("Dim of a parameter (format: \"@name d1 d2 dn\")"),
ap.append(),
ap.nargs(2));
ap(dyn_param_dims,
{"--dyn-input-dim"},
ap.help("Dynamic dimensions of a parameter (format: \"@name_1\" \"[{min:x, max:y, "
"optimals:[o1,o2,...]}, dim2,dim3, ...]\", \"@name_2\", ... You can supply a "
"single integer value for a dimension to specify it as fixed."),
ap.append(),
ap.nargs(2));
ap(default_dyn_dim,
{"--default-dyn-dim"},
ap.help("Default dynamic dimension (format: \"{min:x, max:y, optimals:[o1,o2]}\")."));
ap(output_names,
{"--output-names"},
ap.help("Names of node output (format: \"name_1 name_2 name_n\")"),
......@@ -147,6 +163,40 @@ struct loader
return map_input_dims;
}
static auto parse_dyn_dims_json(const std::string& dd_json)
{
// expecting a json string like "[{min:1,max:64,optimals:[1,2,4,8]},3,224,224]"
auto v = from_json_string(convert_to_json(dd_json));
std::vector<migraphx::shape::dynamic_dimension> dyn_dims;
std::transform(v.begin(), v.end(), std::back_inserter(dyn_dims), [&](auto x) {
if(x.is_object())
return from_value<migraphx::shape::dynamic_dimension>(x);
auto d = x.template to<std::size_t>();
return migraphx::shape::dynamic_dimension{d, d};
});
return dyn_dims;
}
static auto parse_dyn_dims_map(const std::vector<std::string>& param_dyn_dims)
{
// expecting vector of strings formatted like
// {"@param_name_0", "dd_json_0", "@param_name_1", "dd_json_1", ...}
std::unordered_map<std::string, std::vector<shape::dynamic_dimension>> map_dyn_input_dims;
std::string name = "";
for(auto&& x : param_dyn_dims)
{
if(x[0] == '@')
{
name = x.substr(1);
}
else
{
map_dyn_input_dims[name] = parse_dyn_dims_json(x);
}
}
return map_dyn_input_dims;
}
static auto parse_output_names(const std::vector<std::string>& output_names_info)
{
std::vector<std::string> output_node_names;
......@@ -158,13 +208,44 @@ struct loader
return output_node_names;
}
tf_options get_tf_options() const
{
auto map_input_dims = parse_param_dims(param_dims);
auto output_node_names = parse_output_names(output_names);
tf_options options;
options.is_nhwc = is_nhwc;
options.batch_size = batch;
options.map_input_dims = map_input_dims;
options.output_node_names = output_node_names;
return options;
}
onnx_options get_onnx_options() const
{
auto map_input_dims = parse_param_dims(param_dims);
auto map_dyn_input_dims = parse_dyn_dims_map(dyn_param_dims);
onnx_options options;
if(default_dyn_dim.empty())
{
options.default_dim_value = batch;
}
else
{
auto v = from_json_string(convert_to_json(default_dyn_dim));
options.default_dyn_dim_value = from_value<migraphx::shape::dynamic_dimension>(v);
}
options.skip_unknown_operators = skip_unknown_operators;
options.print_program_on_error = true;
options.map_input_dims = map_input_dims;
options.map_dyn_input_dims = map_dyn_input_dims;
return options;
}
program load()
{
program p;
if(model.empty())
{
auto map_input_dims = parse_param_dims(param_dims);
auto output_node_names = parse_output_names(output_names);
if(file_type.empty())
{
if(ends_with(file, ".onnx"))
......@@ -179,16 +260,11 @@ struct loader
std::cout << "Reading: " << file << std::endl;
if(file_type == "onnx")
{
onnx_options options;
options.default_dim_value = batch;
options.skip_unknown_operators = skip_unknown_operators;
options.print_program_on_error = true;
options.map_input_dims = map_input_dims;
p = parse_onnx(file, options);
p = parse_onnx(file, get_onnx_options());
}
else if(file_type == "tf")
{
p = parse_tf(file, tf_options{is_nhwc, batch, map_input_dims, output_node_names});
p = parse_tf(file, get_tf_options());
}
else if(file_type == "json")
{
......@@ -289,14 +365,21 @@ struct program_params
ap(fill1, {"--fill1"}, ap.help("Fill parameter with 1s"), ap.append(), ap.nargs(2));
}
auto generate(const program& p, const target& t, bool offload)
auto generate(const program& p, const target& t, bool offload, unsigned batch)
{
parameter_map m;
auto param_shapes = p.get_parameter_shapes();
std::unordered_map<std::string, shape> static_param_shapes;
std::transform(
param_shapes.cbegin(),
param_shapes.cend(),
std::inserter(static_param_shapes, static_param_shapes.end()),
[&](const auto& x) { return std::make_pair(x.first, x.second.to_static(batch)); });
for(auto&& s : fill0)
m[s] = fill_argument(p.get_parameter_shape(s), 0);
m[s] = fill_argument(static_param_shapes.at(s), 0);
for(auto&& s : fill1)
m[s] = fill_argument(p.get_parameter_shape(s), 1);
fill_param_map(m, p, t, offload);
m[s] = fill_argument(static_param_shapes.at(s), 1);
fill_param_map(m, static_param_shapes, t, offload);
return m;
}
};
......@@ -305,12 +388,12 @@ struct compiler_target
{
#ifdef HAVE_GPU
std::string target_name = "gpu";
#elif HAVE_CPU
#elif defined(HAVE_CPU)
std::string target_name = "cpu";
#elif HAVE_FPGA
std::string target_name = "fpga"
#elif defined(HAVE_FPGA)
std::string target_name = "fpga";
#else
std::string target_name = "ref"
std::string target_name = "ref";
#endif
void parse(argument_parser& ap)
......@@ -353,13 +436,18 @@ struct compiler
{"--exhaustive-tune"},
ap.help("Exhastively search for best tuning parameters for kernels"),
ap.set_value(true));
ap(co.split_single_dyn_dim,
{"--split-single-dyn-dim"},
ap.help("If there is a single non-fixed dynamic dimension in the model, then split to "
"static submodules"),
ap.set_value(true));
ap(quantize, {"--fp16"}, ap.help("Quantize for fp16"), ap.set_value(precision::fp16));
ap(quantize, {"--int8"}, ap.help("Quantize for int8"), ap.set_value(precision::int8));
}
auto params(const program& p)
{
return parameters.generate(p, ct.get_target(), co.offload_copy);
return parameters.generate(p, ct.get_target(), co.offload_copy, l.batch);
}
program compile()
......@@ -432,7 +520,7 @@ struct verify : command<verify>
std::cout << p << std::endl;
auto t = c.ct.get_target();
auto m = c.parameters.generate(p, t, true);
auto m = c.parameters.generate(p, t, true, c.l.batch);
if(per_instruction)
{
......
......@@ -39,36 +39,25 @@ auto get_hash(const T& x)
return std::hash<T>{}(x);
}
parameter_map fill_param_map(parameter_map& m, const program& p, const target& t, bool offload)
parameter_map fill_param_map(parameter_map& m,
const std::unordered_map<std::string, shape>& param_shapes,
const target& t,
bool offload)
{
for(auto&& x : p.get_parameter_shapes())
for(auto&& x : param_shapes)
{
argument& arg = m[x.first];
if(arg.empty())
{
assert(not x.second.dynamic());
arg = generate_argument(x.second, get_hash(x.first));
}
if(not offload)
arg = t.copy_to(arg);
}
return m;
}
parameter_map fill_param_map(parameter_map& m, const program& p, bool gpu)
{
for(auto&& x : p.get_parameter_shapes())
{
argument& arg = m[x.first];
if(arg.empty())
arg = generate_argument(x.second, get_hash(x.first));
#ifdef HAVE_GPU
if(gpu)
arg = gpu::to_gpu(arg);
#else
(void)gpu;
#endif
}
return m;
}
parameter_map create_param_map(const program& p, const target& t, bool offload)
{
parameter_map m;
......
......@@ -30,8 +30,10 @@ namespace migraphx {
namespace driver {
inline namespace MIGRAPHX_INLINE_NS {
parameter_map
fill_param_map(parameter_map& m, const program& p, const target& t, bool offload = false);
parameter_map fill_param_map(parameter_map& m,
const std::unordered_map<std::string, shape>& param_shapes,
const target& t,
bool offload = false);
parameter_map create_param_map(const program& p, const target& t, bool offload = false);
parameter_map fill_param_map(parameter_map& m, const program& p, bool gpu);
......
......@@ -74,6 +74,7 @@ static void create_pointwise_modules(module_pass_manager& mpm)
std::unordered_map<instruction_ref, instruction_ref> param_map;
std::vector<instruction_ref> pointwise_inputs;
std::size_t i = 0;
for(auto input : ins->inputs())
{
if(contains(param_map, input))
......@@ -92,6 +93,10 @@ static void create_pointwise_modules(module_pass_manager& mpm)
}
}
// Don't create pointwise module if no inputs are detected
if(pointwise_inputs.empty())
continue;
std::vector<instruction_ref> inputs;
std::transform(ins->inputs().begin(),
ins->inputs().end(),
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/fuse_reduce.hpp>
#include <migraphx/pass_manager.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/program.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/matcher.hpp>
#include <migraphx/register_op.hpp>
#include <iterator>
#include <map>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct fused_reduce
{
std::vector<std::int64_t> axes{};
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.axes, "axes"));
}
shape compute_shape(const std::vector<shape>& inputs, std::vector<module_ref> mods) const
{
if(mods.size() != 1)
MIGRAPHX_THROW("should have one submodule.");
auto* sm = mods.front();
if(sm->get_output_shapes().size() != 1)
MIGRAPHX_THROW("Only one output supported");
auto names = sm->get_parameter_names();
check_shapes{inputs, *this}.has(names.size()).same_ndims();
std::sort(names.begin(), names.end());
auto shapes = sm->get_parameter_shapes();
// Check dimension matches for each input
if(not equal(names, inputs, [&](const auto& name, const auto& input) {
return shapes.at(name).lens() == input.lens();
}))
MIGRAPHX_THROW("Dimenstion does not match the submodule.");
const auto& s = inputs.at(0);
auto lens = s.lens();
if(lens != sm->get_output_shapes().front().lens())
{
for(const auto& axis : axes)
{
lens[axis] = 1;
}
}
return shape::from_permutation(
sm->get_output_shapes().front().type(), lens, find_permutation(inputs));
}
std::string name() const { return "fused_reduce"; }
};
MIGRAPHX_REGISTER_OP(fused_reduce);
static std::unordered_map<instruction_ref, instruction_ref>
get_ins_param_map(const std::vector<instruction_ref>& inputs, const_module_ref sm)
{
std::unordered_map<instruction_ref, instruction_ref> result;
auto names = sm->get_parameter_names();
std::sort(names.begin(), names.end());
assert(names.size() == inputs.size());
std::transform(names.begin(),
names.end(),
inputs.begin(),
std::inserter(result, result.end()),
[&](const auto& name, auto input) {
return std::make_pair(input, sm->get_parameter(name));
});
return result;
}
static void insert_params(module_ref sm,
instruction_ref ins,
std::unordered_map<instruction_ref, instruction_ref>& map_ins)
{
auto n = sm->get_parameter_shapes().size();
for(auto input : ins->inputs())
{
if(contains(map_ins, input))
continue;
auto s = shape{input->get_shape().type(), input->get_shape().lens()};
map_ins[input] = sm->add_parameter("x" + std::to_string(n++), s);
}
}
static auto insert_ins_in_submodule(module_ref sm,
instruction_ref ins,
std::unordered_map<instruction_ref, instruction_ref>& map_ins)
{
insert_params(sm, ins, map_ins);
return sm->add_instructions({ins}, map_ins);
}
static auto insert_ins_in_submodule(module_ref sm, instruction_ref ins)
{
std::unordered_map<instruction_ref, instruction_ref> map_ins;
return insert_ins_in_submodule(sm, ins, map_ins);
}
static auto
insert_module_in_submodule(module_ref sm,
instruction_ref ins,
std::unordered_map<instruction_ref, instruction_ref>& map_ins)
{
insert_params(sm, ins, map_ins);
auto* m = ins->module_inputs().front();
auto param_map = get_ins_param_map(ins->inputs(), m);
for(auto&& [input, param] : param_map)
{
map_ins[param] = map_ins.at(input);
}
return sm->add_instructions(m, map_ins);
}
static std::vector<instruction_ref>
find_inputs(module_ref sm,
const module& parent,
const std::unordered_map<instruction_ref, instruction_ref>& map_ins)
{
std::vector<instruction_ref> result;
std::map<std::string, instruction_ref> names;
for(auto&& [input, param] : map_ins)
{
if(not sm->has_instruction(param))
continue;
if(param->name() != "@param")
continue;
if(not parent.has_instruction(input))
continue;
auto v = param->get_operator().to_value();
auto name = v.at("parameter").to<std::string>();
names[name] = input;
}
std::transform(names.begin(), names.end(), std::back_inserter(result), [](const auto& p) {
return p.second;
});
assert(result.size() == sm->get_parameter_shapes().size());
return result;
}
static void create_reduce_modules(module_pass_manager& mpm)
{
std::size_t n = 0;
for(auto ins : iterator_for(mpm.get_module()))
{
if(not ins->get_operator().attributes().get("reduce", false))
continue;
if(ins->inputs().size() != 1)
continue;
auto* rm =
mpm.create_module(mpm.get_module().name() + ":" + ins->name() + std::to_string(n++));
rm->set_bypass();
rm->add_return(insert_ins_in_submodule(rm, ins));
auto v = ins->get_operator().to_value();
mpm.get_module().replace_instruction(
ins, make_op("fused_reduce", {{"axes", v["axes"]}}), ins->inputs(), {rm});
}
}
template <class... Ms>
static auto match_broadcast(Ms... ms)
{
return match::skip(match::name("contiguous"))(
match::name("multibroadcast")(match::arg(0)(ms...), match::used_once()).bind("broadcast"));
}
template <class... Ms>
static auto any_input(Ms... ms)
{
return match::any_of[match::inputs()](match::any(ms...).bind("input"));
}
static auto match_broadcastable_input(const std::string& op, const std::string& name)
{
auto match_op = match::name(op)(match::used_once()).bind(name);
auto match_op_input = any_input(match_op, match::used_once());
auto broadcast_match_op_input = any_input(match_broadcast(match_op), match::used_once());
return match::any_of(match_op_input, broadcast_match_op_input);
}
namespace {
struct find_pointwise_reduce
{
auto matcher() const
{
return match::name("fused_reduce")(match_broadcastable_input("pointwise", "pointwise"));
}
void apply(module_pass_manager& mpm, const match::matcher_result& r) const
{
auto reduce = r.result;
auto input = r.instructions["pointwise"];
const auto* pm = input->module_inputs().front();
const auto* old_rm = reduce->module_inputs().front();
auto* rm = mpm.create_module(pm->name() + ":" + old_rm->name());
rm->set_bypass();
std::unordered_map<instruction_ref, instruction_ref> map_ins;
// Insert pointwise
auto rins = insert_ins_in_submodule(rm, input, map_ins).front();
map_ins[input] = rins;
if(contains(r.instructions, "broadcast"))
{
auto broadcast = r.instructions["broadcast"];
map_ins[broadcast] = insert_ins_in_submodule(rm, broadcast, map_ins).front();
}
// Insert fused_reduce
rm->add_return(insert_module_in_submodule(rm, reduce, map_ins));
auto new_inputs = find_inputs(rm, mpm.get_module(), map_ins);
mpm.get_module().replace_instruction(reduce, reduce->get_operator(), new_inputs, {rm});
}
};
struct find_reduce_pointwise
{
auto matcher() const
{
return match::name("pointwise")(match_broadcastable_input("fused_reduce", "reduce"));
}
void apply(module_pass_manager& mpm, const match::matcher_result& r) const
{
auto pw = r.result;
auto reduce = r.instructions["reduce"];
auto input = r.instructions["input"];
const auto* pm = pw->module_inputs().front();
const auto* old_rm = reduce->module_inputs().front();
auto* rm = mpm.create_module(old_rm->name() + ":" + pm->name());
rm->set_bypass();
std::unordered_map<instruction_ref, instruction_ref> map_ins;
// Copy module instructions
insert_module_in_submodule(rm, reduce, map_ins);
if(contains(r.instructions, "broadcast"))
{
auto broadcast = r.instructions["broadcast"];
map_ins[broadcast->inputs().front()] = rm->get_returns().front();
auto bout = insert_ins_in_submodule(rm, broadcast, map_ins);
map_ins[input] = bout.front();
}
else
{
map_ins[input] = rm->get_returns().front();
}
auto out = insert_ins_in_submodule(rm, pw, map_ins);
rm->replace_return(out);
auto new_inputs = find_inputs(rm, mpm.get_module(), map_ins);
mpm.get_module().replace_instruction(pw, reduce->get_operator(), new_inputs, {rm});
}
};
struct find_reduce_reduce
{
auto matcher() const
{
return match::name("fused_reduce")(match_broadcastable_input("fused_reduce", "reduce"));
}
void apply(module_pass_manager& mpm, const match::matcher_result& r) const
{
auto reduce1 = r.result;
auto reduce2 = r.instructions["reduce"];
auto input = r.instructions["input"];
if(reduce1->get_operator() != reduce2->get_operator())
return;
const auto* rm1 = reduce1->module_inputs().front();
const auto* rm2 = reduce2->module_inputs().front();
auto* rm = mpm.create_module(rm1->name() + ":" + rm2->name());
rm->set_bypass();
std::unordered_map<instruction_ref, instruction_ref> map_ins;
// Copy reduce1 instructions
insert_module_in_submodule(rm, reduce2, map_ins);
if(contains(r.instructions, "broadcast"))
{
auto broadcast = r.instructions["broadcast"];
map_ins[broadcast->inputs().front()] = rm->get_returns().front();
auto bout = insert_ins_in_submodule(rm, broadcast, map_ins);
map_ins[input] = bout.front();
}
else
{
map_ins[input] = rm->get_returns().front();
}
auto out = insert_module_in_submodule(rm, reduce1, map_ins);
rm->replace_return(out);
auto new_inputs = find_inputs(rm, mpm.get_module(), map_ins);
mpm.get_module().replace_instruction(reduce1, reduce1->get_operator(), new_inputs, {rm});
}
};
} // namespace
void fuse_reduce::apply(module_pass_manager& mpm) const
{
create_reduce_modules(mpm);
mpm.run_pass(dead_code_elimination{});
for(int i = 0; i < 4; i++)
{
match::find_matches(
mpm, find_reduce_pointwise{}, find_pointwise_reduce{}, find_reduce_reduce{});
mpm.run_pass(dead_code_elimination{});
}
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx