"tests/vscode:/vscode.git/clone" did not exist on "ba0bfd40e21cacfd5da6a1e43028a37258a29cb4"
Commit 0d874a4e authored by wenjh's avatar wenjh
Browse files

Merge branch 'nv_main' of v2.12

parents a68e5f87 dfdd3820
# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

# Throwaway build container for TransformerEngine PyTorch wheels.
FROM ubuntu:22.04

# DEBIAN_FRONTEND=noninteractive leaks into the runtime env; acceptable here
# because this image is only ever used as a build container, never deployed.
ENV DEBIAN_FRONTEND=noninteractive
ENV CUDA_HOME=/usr/local/cuda
ENV PATH=$PATH:$CUDA_HOME/bin
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
# CUDA compute capabilities to compile extensions for (Pascal through Hopper).
ENV TORCH_CUDA_ARCH_LIST="6.0;6.1;7.0;7.5;8.0;8.6;9.0"

ARG PYTHON_VERSION=3.12
ARG TORCH_VERSION=2.9.1
ARG CUDA_VERSION=12.9.1
ARG CUDNN_MAJOR_VERSION=9
# NVIDIA apt repository flavor: x86_64 or sbsa (arm64 server).
ARG AARCH=x86_64

# Put the venv first on PATH so "python" / "pip" resolve into it.
ENV PATH=/opt/venv/bin:$PATH
ENV PYTHONUNBUFFERED=1

# Install Python from the deadsnakes PPA and create the virtualenv.
# add-apt-repository -y also refreshes the package lists for the new PPA,
# so no extra apt-get update is needed before the python install.
RUN apt-get update && \
    apt-get install -y software-properties-common wget && \
    add-apt-repository ppa:deadsnakes/ppa -y && \
    apt-get install -y python$PYTHON_VERSION-dev python$PYTHON_VERSION-venv python3-pip && \
    python$PYTHON_VERSION -m venv /opt/venv && \
    rm -rf /var/lib/apt/lists/*

# Install the CUDA toolkit, cuDNN and NCCL from NVIDIA's apt repository.
# Stale CUDA repo entries (if any) are dropped before installing the keyring.
RUN CUDA_MAJOR_VERSION=$(echo $CUDA_VERSION | awk -F. '{print $1}') && \
    CUDA_MINOR_VERSION=$(echo $CUDA_VERSION | awk -F. '{print $2}') && \
    rm -f /etc/apt/sources.list.d/cuda*.list /etc/apt/sources.list.d/nvidia-cuda.list && \
    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${AARCH}/cuda-keyring_1.1-1_all.deb && \
    dpkg -i cuda-keyring_1.1-1_all.deb && \
    rm cuda-keyring_1.1-1_all.deb && \
    apt-get update && \
    apt-get install -y \
        cuda-toolkit-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
        cudnn-cuda-$CUDA_MAJOR_VERSION \
        libcudnn$CUDNN_MAJOR_VERSION-cuda-$CUDA_MAJOR_VERSION \
        libnccl2 \
        libnccl-dev \
        cmake && \
    rm -rf /var/lib/apt/lists/*

# Install PyTorch, selecting the newest CUDA wheel flavor supported by the
# requested torch minor release (oldest supported flavor when CUDA < 12.0).
# The backslash-newlines inside the quoted python -c source collapse it into
# a single semicolon-joined line before python sees it.
RUN export MATRIX_CUDA_VERSION=$(echo $CUDA_VERSION | awk -F. '{print $1 $2}') && \
    export MATRIX_TORCH_VERSION=$(echo $TORCH_VERSION | awk -F. '{print $1 "." $2}') && \
    export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
        minv = {'2.5': 118, '2.6': 118, '2.7': 118, '2.8': 126, '2.9': 126}[env['MATRIX_TORCH_VERSION']]; \
        maxv = {'2.5': 124, '2.6': 126, '2.7': 128, '2.8': 129, '2.9': 130}[env['MATRIX_TORCH_VERSION']]; \
        print(minv if int(env['MATRIX_CUDA_VERSION']) < 120 else maxv)" \
    ) && \
    pip install --no-cache-dir torch==${TORCH_VERSION} --index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION}
\ No newline at end of file
# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

# Composite action that builds a TransformerEngine PyTorch wheel either from
# source (Dockerfile in this directory) or on top of a prebuilt base image.
name: Build PyTorch Wheel
description: Builds a PyTorch wheel for TransformerEngine

inputs:
  release-version:
    description: 'The release version to use for the build'
    required: true
  python-version:
    description: 'The Python version to use for the build'
    required: true
  cuda-version:
    description: 'The CUDA version to use for the build'
    required: true
  cudnn-version:
    description: 'The cuDNN version to use for the build'
    required: true
  torch-version:
    description: 'The PyTorch version to use for the build'
    required: true
  cxx11_abi:
    description: 'Enable torch flag C++11 ABI (TRUE/FALSE)'
    required: true
  base-image:
    description: 'The base image to use for the build'
    required: false
  aarch:
    description: 'The architecture to use for the build'
    required: true

outputs:
  wheel_name:
    description: 'The name of the built wheel'
    value: ${{ steps.build_wheel.outputs.wheel_name }}

runs:
  using: 'composite'
  steps:
    # Relocate Docker's data directory onto the large scratch volume created
    # by maximize-build-space so big CUDA images fit on the hosted runner.
    - name: Move /var/lib/docker/
      shell: bash -euxo pipefail {0}
      run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker"

    - name: Maximize build space
      uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794
      with:
        root-reserve-mb: 5120
        temp-reserve-mb: 32
        swap-size-mb: 10240
        remove-dotnet: 'true'
        remove-android: 'true'
        remove-haskell: 'true'
        remove-codeql: 'true'
        build-mount-path: '/var/lib/docker/'

    - name: Restore /var/lib/docker/
      shell: bash -euxo pipefail {0}
      run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker"

    # Source tree at the release ref being built.
    - name: Checkout
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.release-version }}
        submodules: recursive

    # Current build tooling (Dockerfile, build.sh), independent of the release ref.
    - name: Checkout build tools
      uses: actions/checkout@v4
      with:
        path: build-tools
        submodules: recursive

    - name: Build image
      shell: bash -euxo pipefail {0}
      env:
        BASE_IMAGE: ${{ inputs.base-image }}
      run: |
        if [[ "${BASE_IMAGE}" == "" ]]; then
          docker build \
            -t transformer-engine-build \
            -f build-tools/.github/actions/build-pytorch-wheel/Dockerfile \
            --build-arg PYTHON_VERSION=${{ inputs.python-version }} \
            --build-arg TORCH_VERSION=${{ inputs.torch-version }} \
            --build-arg CUDA_VERSION=${{ inputs.cuda-version }} \
            --build-arg CUDNN_MAJOR_VERSION=${{ inputs.cudnn-version }} \
            --build-arg AARCH=${{ inputs.aarch }} \
            .
        else
          docker pull ${BASE_IMAGE}
          docker tag ${BASE_IMAGE} transformer-engine-build
        fi

    - name: Build wheel
      shell: bash -euxo pipefail {0}
      id: build_wheel
      env:
        CXX11_ABI: ${{ inputs.cxx11_abi }}
      run: |
        echo ::group::Build wheel
        # build.sh prints the build's exit code as its last line of output;
        # `tail -n 1` captures it so it can be propagated explicitly below.
        EXIT_CODE=$(docker run \
          --rm \
          --shm-size=64g \
          --workdir /workspace/transformer_engine/pytorch \
          --volume $(pwd):/workspace \
          --volume $GITHUB_OUTPUT:$GITHUB_OUTPUT \
          -e PIP_CONSTRAINT= \
          -e CXX11_ABI=$CXX11_ABI \
          -e GITHUB_OUTPUT=$GITHUB_OUTPUT \
          transformer-engine-build bash /workspace/build-tools/.github/actions/build-pytorch-wheel/build.sh | tail -n 1)
        # BUGFIX: ::endgroup:: used to come after `exit`, making it unreachable.
        echo ::endgroup::
        exit $EXIT_CODE

    - name: Log Built Wheels
      shell: bash -euxo pipefail {0}
      run: |
        ls transformer_engine/pytorch/dist
#!/bin/bash
# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
#
# Builds the TransformerEngine PyTorch wheel inside the build container.
# Protocol: the LAST line this script prints is the build's exit code; the
# calling workflow captures it with `tail -n 1` and decides whether to fail.

# Canonical option ordering (-o must sit directly before its "pipefail"
# argument); was `set -eoxu pipefail`. Matches the workflow's shell flags.
set -euxo pipefail

export NVTE_PYTORCH_FORCE_BUILD=TRUE
export NVTE_NO_LOCAL_VERSION=1
export NVTE_PYTORCH_FORCE_CXX11_ABI=$CXX11_ABI
# Neutralize any pip constraint file inherited from the base image.
export PIP_CONSTRAINT=

pip install wheel packaging nvidia-mathdx ninja pybind11

# 5h timeout since GH allows max 6h and we want some buffer.
# `|| EXIT_CODE=$?` keeps set -e from aborting so the code reaches the caller.
EXIT_CODE=0
timeout 5h python setup.py bdist_wheel --dist-dir=dist || EXIT_CODE=$?

if [ $EXIT_CODE -eq 0 ]; then
    # Rename the built wheel to the canonical name computed by setup.py.
    wheel_name=$(python -c "import setup; print(setup.get_wheel_url()[1])" | tail -n 1)
    ls dist/*whl | xargs -I {} mv {} dist/${wheel_name}
    echo "wheel_name=${wheel_name}" | tee -a "$GITHUB_OUTPUT"
fi

# Must stay the last line of output (consumed by the caller via tail -n 1).
echo $EXIT_CODE
#!/bin/bash
# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
#
# Checks which of the last N monthly NGC PyTorch images exist and writes the
# matching image references to ngc_images.json as a JSON array of strings.
set -euo pipefail

# Configuration
BASE_IMAGE="nvcr.io/nvidia/pytorch"
TAG_SUFFIX="-py3"
MONTHS_TO_CHECK=5 # Check current month and previous 4 months (total 5)

# Collected existing image references.
EXISTING_TAGS=()
# Default keeps ngc_images.json valid JSON even when no tag exists or jq is
# missing (previously the file could contain [""] or be empty).
JSON_STRING="[]"

echo "Checking for existence of the last ${MONTHS_TO_CHECK} NGC PyTorch images: ${BASE_IMAGE}:YY.MM${TAG_SUFFIX}"
echo "---------------------------------------------------------------------"

# Anchor the date computation once so all iterations stay consistent even if
# the loop happens to run across a month boundary.
CURRENT_YEAR=$(date +%Y)
CURRENT_MONTH=$(date +%m)

for i in $(seq 0 $((MONTHS_TO_CHECK - 1))); do
    # Tag for the month $i months before the current one, as YY.MM.
    TARGET_DATE=$(date -d "$CURRENT_YEAR-$CURRENT_MONTH-01 -$i months" +%y.%m)
    IMAGE_TAG="${TARGET_DATE}${TAG_SUFFIX}"
    FULL_IMAGE="${BASE_IMAGE}:${IMAGE_TAG}"
    echo "Checking: ${FULL_IMAGE}"
    # 'docker manifest inspect' verifies existence without pulling the image.
    if docker manifest inspect "${FULL_IMAGE}" > /dev/null 2>&1; then
        echo "✅ EXISTS: Found."
        EXISTING_TAGS+=("${BASE_IMAGE}:${IMAGE_TAG}")
    else
        echo "❌ MISSING: Not found."
    fi
done

echo "---------------------------------------------------------------------"

## JSON Output Generation
if [ ${#EXISTING_TAGS[@]} -eq 0 ]; then
    # BUGFIX: the old newline-split pipeline produced [""] for an empty array;
    # keep the default "[]" instead.
    echo "No existing tags found."
elif command -v jq &> /dev/null; then
    # One tag per line into jq, split on newlines; .[:-1] drops the trailing
    # empty element left by the final newline.
    JSON_STRING=$(printf "%s\n" "${EXISTING_TAGS[@]}" | jq -R -s 'split("\n") | .[:-1]')
    echo "Generated JSON String of Existing Tags:"
    echo "${JSON_STRING}"
else
    echo "WARNING: 'jq' is not installed. Cannot format output as JSON."
    echo "Found Tags: ${EXISTING_TAGS[*]}"
fi

echo "---"
echo "Check complete."
echo "${JSON_STRING}" > ngc_images.json
# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

# This workflow will:
# - Create a new Github release
# - Build wheels for supported architectures
# - Deploy the wheels to the Github release
# - Release the static code to PyPi
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
name: Attach wheels to release

on:
  release:
    types: [published]
  workflow_dispatch:
    inputs:
      runs-on:
        description: 'The runner to use for the build'
        required: true
        type: string
        default: ubuntu-22.04
      release-version:
        description: 'Release version'
        required: true
        default: '0.1.0'
      python-version:
        description: 'Python version'
        required: true
        default: '3.12'
      torch-version:
        description: 'Torch version'
        required: true
        default: '2.8.0'
      cuda-version:
        description: 'CUDA version'
        required: true
        default: '12.9.1'
      cudnn-version:
        description: 'CUDNN version'
        required: true
        default: '9'
      cxx11_abi:
        description: 'C++11 ABI'
        required: true
        type: choice
        default: 'TRUE'
        options:
          - 'TRUE'
          - 'FALSE'
      ngc-image:
        description: 'NGC PyTorch image (will take precedence over the source build)'
        required: false
        type: string
        default: ''

jobs:
  # Computes the build matrix, release upload URL and NGC image list consumed
  # by the two build jobs below.
  pre-flight:
    runs-on: ubuntu-latest
    outputs:
      build-wheel-matrix: ${{ steps.matrix.outputs.matrix }}
      release-assets-url: ${{ steps.release-assets-url.outputs.upload_url }}
      ngc-images: ${{ steps.check_for_ngc_images.outputs.IMAGES }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # NOTE(review): workflow inputs are interpolated directly into the heredoc
      # JSON below; values come from maintainers, but passing them through env
      # vars would be safer against shell/script injection.
      - name: Build release matrix
        id: matrix
        env:
          EVENT: ${{ github.event_name }}
        run: |
          if [[ "$EVENT" == "release" ]]; then
            MATRIX=$(echo '{
              "os": ["ubuntu-22.04", "ubuntu-22.04-arm"],
              "release-version": ["${{ github.event.release.tag_name }}"],
              "python-version": ["3.12"],
              "torch-version": ["2.8.0"],
              "cuda-version": ["12.9.1"],
              "cudnn-version": ["9"],
              "cxx11_abi": ["TRUE"]
            }' | jq -rc)
          else
            MATRIX=$(echo '{
              "os": ["${{ inputs.runs-on }}"],
              "release-version": ["${{ inputs.release-version }}"],
              "python-version": ["${{ inputs.python-version }}"],
              "torch-version": ["${{ inputs.torch-version }}"],
              "cuda-version": ["${{ inputs.cuda-version }}"],
              "cudnn-version": ["${{ inputs.cudnn-version }}"],
              "cxx11_abi": ["${{ inputs.cxx11_abi }}"]
            }' | jq -rc)
          fi
          echo "matrix=$MATRIX" | tee -a "$GITHUB_OUTPUT"

      # Manual runs target an existing release identified by its tag.
      - name: Get Release with tag
        id: get_current_release
        uses: joutvhu/get-release@9a8271732adc3299a22f8ad09b0a67eb3aa836ac
        if: ${{ github.event_name == 'workflow_dispatch' }}
        with:
          tag_name: ${{ inputs.release-version }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Get release assets url
        env:
          EVENT: ${{ github.event_name }}
        if: ${{ (success() || !failure()) && !cancelled() }}
        id: release-assets-url
        run: |
          if [[ "$EVENT" == "release" ]]; then
            echo "upload_url=${{ github.event.release.upload_url }}" | tee -a "$GITHUB_OUTPUT"
          else
            echo "upload_url=${{ steps.get_current_release.outputs.upload_url }}" | tee -a "$GITHUB_OUTPUT"
          fi

      - name: Check for NGC PyTorch images
        id: check_for_ngc_images
        if: ${{ (success() || !failure()) && !cancelled() }}
        env:
          EVENT: ${{ github.event_name }}
        run: |
          if [[ "$EVENT" == "release" ]]; then
            bash ./.github/scripts/check_for_ngc_images.sh
            echo "IMAGES=$(cat ngc_images.json | jq -cr)" | tee -a "$GITHUB_OUTPUT"
          else
            echo 'IMAGES=["${{ inputs.ngc-image }}"]' | tee -a "$GITHUB_OUTPUT"
          fi

  # Source builds: run for releases, or for manual runs with no NGC image given.
  build_wheels:
    name: Build Wheel
    runs-on: ${{ matrix.os }}
    needs: pre-flight
    if: ${{ github.event_name == 'release' || inputs.ngc-image == '' }}
    strategy:
      fail-fast: false
      matrix: ${{ fromJson(needs.pre-flight.outputs.build-wheel-matrix) }}
    steps:
      - name: 'Checkout'
        # v3 -> v4 for consistency with the pre-flight job.
        uses: actions/checkout@v4
      - name: 'Build PyTorch Wheel'
        uses: ./.github/actions/build-pytorch-wheel
        id: build-pytorch-wheel
        with:
          release-version: ${{ matrix.release-version }}
          python-version: ${{ matrix.python-version }}
          cuda-version: ${{ matrix.cuda-version }}
          cudnn-version: ${{ matrix.cudnn-version }}
          torch-version: ${{ matrix.torch-version }}
          cxx11_abi: ${{ matrix.cxx11_abi }}
          aarch: ${{ matrix.os == 'ubuntu-22.04' && 'x86_64' || 'sbsa' }}
        env:
          NVTE_FRAMEWORK: pytorch
          MAX_JOBS: 1
      # NOTE(review): actions/upload-release-asset is archived/unmaintained;
      # consider migrating to gh CLI or softprops/action-gh-release.
      - name: Upload Release Asset
        id: upload_release_asset
        uses: actions/upload-release-asset@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          upload_url: ${{ needs.pre-flight.outputs.release-assets-url }}
          asset_path: ./transformer_engine/pytorch/dist/${{ steps.build-pytorch-wheel.outputs.wheel_name }}
          asset_name: ${{ steps.build-pytorch-wheel.outputs.wheel_name }}
          asset_content_type: application/*

  # NGC-based builds: run for releases, or when a manual run supplies an image.
  build_wheels_for_ngc:
    name: Build Wheels for NGC PyTorch images
    runs-on: ${{ matrix.os }}
    needs: pre-flight
    if: ${{ github.event_name == 'release' || inputs.ngc-image != '' }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-22.04]
        container-image: ${{ fromJson(needs.pre-flight.outputs.ngc-images) }}
    steps:
      - name: 'Checkout'
        # v3 -> v4 for consistency with the pre-flight job.
        uses: actions/checkout@v4
      - name: 'Build PyTorch Wheel'
        uses: ./.github/actions/build-pytorch-wheel
        id: build-pytorch-wheel
        with:
          base-image: ${{ matrix.container-image }}
      # NOTE(review): actions/upload-release-asset is archived/unmaintained;
      # consider migrating to gh CLI or softprops/action-gh-release.
      - name: Upload Release Asset
        id: upload_release_asset
        uses: actions/upload-release-asset@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          upload_url: ${{ needs.pre-flight.outputs.release-assets-url }}
          asset_path: ./transformer_engine/pytorch/dist/${{ steps.build-pytorch-wheel.outputs.wheel_name }}
          asset_name: ${{ steps.build-pytorch-wheel.outputs.wheel_name }}
          asset_content_type: application/*
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
...@@ -24,37 +24,66 @@ jobs: ...@@ -24,37 +24,66 @@ jobs:
uses: actions/checkout@v3 uses: actions/checkout@v3
with: with:
submodules: recursive submodules: recursive
- name: ccache
uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad
- name: 'Build' - name: 'Build'
run: pip install --no-build-isolation . -v run: NVTE_USE_CCACHE=1 NVTE_CCACHE_BIN=sccache pip install --no-build-isolation . -v
env: env:
NVTE_FRAMEWORK: none NVTE_FRAMEWORK: none
MAX_JOBS: 1 MAX_JOBS: 1
SCCACHE_GHA_ENABLED: "true"
- name: 'Sanity check' - name: 'Sanity check'
run: python3 -c "import transformer_engine" run: python3 -c "import transformer_engine"
working-directory: / working-directory: /
pytorch: pytorch:
name: 'PyTorch' name: 'PyTorch'
runs-on: ubuntu-latest runs-on: ubuntu-latest
container:
image: nvcr.io/nvidia/cuda:12.8.0-devel-ubuntu22.04
options: --user root
steps: steps:
- name: 'Dependencies' - name: Move /var/lib/docker/
run: | shell: bash -euxo pipefail {0}
apt-get update run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker"
apt-get install -y git python3.9 pip cudnn9-cuda-12
pip install cmake torch ninja pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops onnxscript - name: Maximize build space
uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794
with:
root-reserve-mb: 5120
temp-reserve-mb: 32
swap-size-mb: 10240
remove-dotnet: 'true'
remove-android: 'true'
remove-haskell: 'true'
remove-codeql: 'true'
build-mount-path: '/var/lib/docker/'
- name: Restore /var/lib/docker/
shell: bash -euxo pipefail {0}
run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker"
- name: 'Checkout' - name: 'Checkout'
uses: actions/checkout@v3 uses: actions/checkout@v3
with: with:
submodules: recursive submodules: recursive
- name: Start named container
run: |
docker run -v $(pwd):$(pwd) -w $(pwd) --name builder -d nvcr.io/nvidia/cuda:12.8.0-devel-ubuntu22.04 sleep infinity
- name: 'Dependencies'
run: |
docker exec builder bash -c '\
apt-get update && \
apt-get install -y git python3.9 pip cudnn9-cuda-12 && \
pip install cmake torch ninja pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops onnxscript && \
apt-get clean \
'
- name: 'Build' - name: 'Build'
run: pip install --no-build-isolation . -v --no-deps run: docker exec builder bash -c 'pip install --no-build-isolation . -v --no-deps'
env: env:
NVTE_FRAMEWORK: pytorch NVTE_FRAMEWORK: pytorch
MAX_JOBS: 1 MAX_JOBS: 1
- name: 'Sanity check' - name: 'Sanity check'
run: python3 tests/pytorch/test_sanity_import.py run: docker exec builder bash -c 'python3 tests/pytorch/test_sanity_import.py'
jax: jax:
name: 'JAX' name: 'JAX'
runs-on: ubuntu-latest runs-on: ubuntu-latest
...@@ -63,37 +92,65 @@ jobs: ...@@ -63,37 +92,65 @@ jobs:
options: --user root options: --user root
steps: steps:
- name: 'Dependencies' - name: 'Dependencies'
run: pip install pybind11[global] run: pip install cmake==3.21.0 pybind11[global]
- name: 'Checkout' - name: 'Checkout'
uses: actions/checkout@v3 uses: actions/checkout@v3
with: with:
submodules: recursive submodules: recursive
- name: ccache
uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad
- name: 'Build' - name: 'Build'
run: pip install --no-build-isolation . -v run: |
NVTE_CCACHE_BIN=sccache NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v
env: env:
NVTE_FRAMEWORK: jax NVTE_FRAMEWORK: jax
MAX_JOBS: 1 MAX_JOBS: 1
SCCACHE_GHA_ENABLED: "true"
- name: 'Sanity check' - name: 'Sanity check'
run: python3 tests/jax/test_sanity_import.py run: python3 tests/jax/test_sanity_import.py
all: all:
name: 'All' name: 'All'
runs-on: ubuntu-latest runs-on: ubuntu-latest
container:
image: ghcr.io/nvidia/jax:jax
options: --user root
steps: steps:
- name: 'Dependencies' - name: Move /var/lib/docker/
run: | shell: bash -euxo pipefail {0}
pip install pybind11[global] einops onnxscript run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker"
pip install torch --index-url https://download.pytorch.org/whl/cu130
- name: Maximize build space
uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794
with:
root-reserve-mb: 5120
temp-reserve-mb: 32
swap-size-mb: 10240
remove-dotnet: 'true'
remove-android: 'true'
remove-haskell: 'true'
remove-codeql: 'true'
build-mount-path: '/var/lib/docker/'
- name: Restore /var/lib/docker/
shell: bash -euxo pipefail {0}
run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker"
- name: 'Checkout' - name: 'Checkout'
uses: actions/checkout@v3 uses: actions/checkout@v3
with: with:
submodules: recursive submodules: recursive
- name: Start named container
run: |
docker run -v $(pwd):$(pwd) -w $(pwd) --name builder -d ghcr.io/nvidia/jax:jax sleep infinity
- name: 'Dependencies'
run: |
docker exec builder bash -c '\
pip install cmake==3.21.0 pybind11[global] einops onnxscript && \
pip install torch --no-cache-dir --index-url https://download.pytorch.org/whl/cu130
'
- name: 'Build' - name: 'Build'
run: pip install --no-build-isolation . -v --no-deps run: docker exec builder bash -c 'pip install --no-cache-dir --no-build-isolation . -v --no-deps'
env: env:
NVTE_FRAMEWORK: all NVTE_FRAMEWORK: all
MAX_JOBS: 1 MAX_JOBS: 1
- name: 'Sanity check' - name: 'Sanity check'
run: python3 tests/pytorch/test_sanity_import.py && python3 tests/jax/test_sanity_import.py run: docker exec builder bash -c 'python3 tests/pytorch/test_sanity_import.py && python3 tests/jax/test_sanity_import.py'
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
...@@ -17,15 +17,15 @@ jobs: ...@@ -17,15 +17,15 @@ jobs:
uses: actions/checkout@v3 uses: actions/checkout@v3
- name: 'Install dependencies' - name: 'Install dependencies'
run: | run: |
pip install sphinx==8.1.3 sphinx_rtd_theme==3.0.1 nbsphinx==0.9.5 IPython ipython_genutils==0.2.0 ipywidgets==8.0.2 astroid==3.3.2 pip install sphinx==8.1.3 sphinx_rtd_theme==3.0.1 nbsphinx==0.9.5 IPython ipython_genutils==0.2.0 ipywidgets==8.0.2 astroid==3.3.2 sphinx-tabs==3.4.7
pip install breathe==4.35.0 sphinx-autoapi==3.3.2 pip install breathe==4.35.0 sphinx-autoapi==3.3.2
sudo apt-get install -y pandoc graphviz doxygen sudo apt-get install -y pandoc graphviz doxygen
export GIT_SHA=$(git show-ref --hash HEAD) export GIT_SHA=$(git show-ref --hash HEAD)
- name: 'Build docs' - name: 'Build docs'
run: | run: | # SPHINXOPTS="-W" errors out on warnings
doxygen docs/Doxyfile doxygen docs/Doxyfile
cd docs cd docs
make html make html SPHINXOPTS="-W"
- name: 'Upload docs' - name: 'Upload docs'
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
...@@ -56,8 +56,8 @@ jobs: ...@@ -56,8 +56,8 @@ jobs:
|| github.actor == 'vcherepanov-nv' || github.actor == 'vcherepanov-nv'
|| github.actor == 'tdophung' || github.actor == 'tdophung'
|| github.actor == 'vthumbe1503' || github.actor == 'vthumbe1503'
|| github.actor == 'janekb04'
|| github.actor == 'shengfangd' || github.actor == 'shengfangd'
|| github.actor == 'kainzhong'
) )
steps: steps:
- name: Check if comment is issued by authorized person - name: Check if comment is issued by authorized person
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
......
.venv
*.o *.o
*.swp *.swp
*.ii *.ii
...@@ -40,3 +41,4 @@ compile_commands.json ...@@ -40,3 +41,4 @@ compile_commands.json
.nfs .nfs
tensor_dumps/ tensor_dumps/
artifacts/ artifacts/
*.DS_Store
.. ..
Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
See LICENSE for license information. See LICENSE for license information.
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
......
.. ..
Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
See LICENSE for license information. See LICENSE for license information.
...@@ -13,23 +13,14 @@ Transformer Engine ...@@ -13,23 +13,14 @@ Transformer Engine
Latest News Latest News
=========== ===========
* [11/2025] `NVIDIA Blackwell Architecture Sweeps MLPerf Training v5.1 Benchmarks <https://developer.nvidia.com/blog/nvidia-blackwell-architecture-sweeps-mlperf-training-v5-1-benchmarks/>`_
* [11/2025] `Scale Biology Transformer Models with PyTorch and NVIDIA BioNeMo Recipes <https://developer.nvidia.com/blog/scale-biology-transformer-models-with-pytorch-and-nvidia-bionemo-recipes/>`_
* [11/2025] `FP8 Training of Large-Scale RL Models <https://lmsys.org/blog/2025-11-25-fp8-rl/>`_
* [09/2025] `Pretraining Large Language Models with NVFP4 <https://www.arxiv.org/pdf/2509.25149>`_ * [09/2025] `Pretraining Large Language Models with NVFP4 <https://www.arxiv.org/pdf/2509.25149>`_
* [09/2025] `Native FP8 Mixed Precision Training for Ling 2.0, Open Sourced! <https://huggingface.co/blog/im0qianqian/ling-mini-2-fp8-mixed-precision-training-solution>`_ * [09/2025] `Native FP8 Mixed Precision Training for Ling 2.0, Open Sourced! <https://huggingface.co/blog/im0qianqian/ling-mini-2-fp8-mixed-precision-training-solution>`_
* [09/2025] `Faster Training Throughput in FP8 Precision with NVIDIA NeMo <https://developer.nvidia.com/blog/faster-training-throughput-in-fp8-precision-with-nvidia-nemo/>`_ * [09/2025] `Faster Training Throughput in FP8 Precision with NVIDIA NeMo <https://developer.nvidia.com/blog/faster-training-throughput-in-fp8-precision-with-nvidia-nemo/>`_
* [08/2025] `How we built DeepL's next-generation LLMs with FP8 for training and inference <https://www.deepl.com/en/blog/tech/next-generation-llm-fp8-training>`_ * [08/2025] `How we built DeepL's next-generation LLMs with FP8 for training and inference <https://www.deepl.com/en/blog/tech/next-generation-llm-fp8-training>`_
* [08/2025] `NVFP4 Trains with Precision of 16-bit and Speed and Efficiency of 4-bit <https://developer.nvidia.com/blog/nvfp4-trains-with-precision-of-16-bit-and-speed-and-efficiency-of-4-bit/>`_ * [08/2025] `NVFP4 Trains with Precision of 16-bit and Speed and Efficiency of 4-bit <https://developer.nvidia.com/blog/nvfp4-trains-with-precision-of-16-bit-and-speed-and-efficiency-of-4-bit/>`_
* [06/2025] `Floating Point 8: An Introduction to Efficient, Lower-Precision AI Training <https://developer.nvidia.com/blog/floating-point-8-an-introduction-to-efficient-lower-precision-ai-training/>`_
* [05/2025] `Advanced Optimization Strategies for LLM Training on NVIDIA Grace Hopper <https://developer.nvidia.com/blog/advanced-optimization-strategies-for-llm-training-on-nvidia-grace-hopper/>`_
* [03/2025] `Stable and Scalable FP8 Deep Learning Training on Blackwell | GTC 2025 <https://www.nvidia.com/en-us/on-demand/session/gtc25-s72778/>`_
* [03/2025] `Measure and Improve AI Workload Performance with NVIDIA DGX Cloud Benchmarking <https://developer.nvidia.com/blog/measure-and-improve-ai-workload-performance-with-nvidia-dgx-cloud-benchmarking/>`_
.. image:: docs/examples/comparison-fp8-bf16-training-nvidia-dgx-cloud-benchmarking-performance-explorer.jpg
:width: 600
:alt: Comparison of FP8 versus BF16 training, as seen in NVIDIA DGX Cloud Benchmarking Performance Explorer
* [02/2025] `Understanding the Language of Life's Biomolecules Across Evolution at a New Scale with Evo 2 <https://developer.nvidia.com/blog/understanding-the-language-of-lifes-biomolecules-across-evolution-at-a-new-scale-with-evo-2/>`_
* [02/2025] `NVIDIA DGX Cloud Introduces Ready-To-Use Templates to Benchmark AI Platform Performance <https://developer.nvidia.com/blog/nvidia-dgx-cloud-introduces-ready-to-use-templates-to-benchmark-ai-platform-performance/>`_
* [01/2025] `Continued Pretraining of State-of-the-Art LLMs for Sovereign AI and Regulated Industries with iGenius and NVIDIA DGX Cloud <https://developer.nvidia.com/blog/continued-pretraining-of-state-of-the-art-llms-for-sovereign-ai-and-regulated-industries-with-igenius-and-nvidia-dgx-cloud/>`_
`Previous News <#previous-news>`_ `Previous News <#previous-news>`_
...@@ -259,6 +250,7 @@ These environment variables can be set before installation to customize the buil ...@@ -259,6 +250,7 @@ These environment variables can be set before installation to customize the buil
* **NVTE_FRAMEWORK**: Comma-separated list of frameworks to build for (e.g., ``pytorch,jax``) * **NVTE_FRAMEWORK**: Comma-separated list of frameworks to build for (e.g., ``pytorch,jax``)
* **MAX_JOBS**: Limit number of parallel build jobs (default varies by system) * **MAX_JOBS**: Limit number of parallel build jobs (default varies by system)
* **NVTE_BUILD_THREADS_PER_JOB**: Control threads per build job * **NVTE_BUILD_THREADS_PER_JOB**: Control threads per build job
* **NVTE_CUDA_ARCHS**: Semicolon-separated list of CUDA compute architectures to compile for (e.g., ``80;90`` for A100 and H100). If not set, automatically determined based on CUDA version. Setting this can significantly reduce build time and binary size.
Compiling with FlashAttention Compiling with FlashAttention
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
...@@ -424,6 +416,18 @@ Videos ...@@ -424,6 +416,18 @@ Videos
Previous News Previous News
============= =============
* [06/2025] `Floating Point 8: An Introduction to Efficient, Lower-Precision AI Training <https://developer.nvidia.com/blog/floating-point-8-an-introduction-to-efficient-lower-precision-ai-training/>`_
* [05/2025] `Advanced Optimization Strategies for LLM Training on NVIDIA Grace Hopper <https://developer.nvidia.com/blog/advanced-optimization-strategies-for-llm-training-on-nvidia-grace-hopper/>`_
* [03/2025] `Stable and Scalable FP8 Deep Learning Training on Blackwell | GTC 2025 <https://www.nvidia.com/en-us/on-demand/session/gtc25-s72778/>`_
* [03/2025] `Measure and Improve AI Workload Performance with NVIDIA DGX Cloud Benchmarking <https://developer.nvidia.com/blog/measure-and-improve-ai-workload-performance-with-nvidia-dgx-cloud-benchmarking/>`_
.. image:: docs/examples/comparison-fp8-bf16-training-nvidia-dgx-cloud-benchmarking-performance-explorer.jpg
:width: 600
:alt: Comparison of FP8 versus BF16 training, as seen in NVIDIA DGX Cloud Benchmarking Performance Explorer
* [02/2025] `Understanding the Language of Life's Biomolecules Across Evolution at a New Scale with Evo 2 <https://developer.nvidia.com/blog/understanding-the-language-of-lifes-biomolecules-across-evolution-at-a-new-scale-with-evo-2/>`_
* [02/2025] `NVIDIA DGX Cloud Introduces Ready-To-Use Templates to Benchmark AI Platform Performance <https://developer.nvidia.com/blog/nvidia-dgx-cloud-introduces-ready-to-use-templates-to-benchmark-ai-platform-performance/>`_
* [01/2025] `Continued Pretraining of State-of-the-Art LLMs for Sovereign AI and Regulated Industries with iGenius and NVIDIA DGX Cloud <https://developer.nvidia.com/blog/continued-pretraining-of-state-of-the-art-llms-for-sovereign-ai-and-regulated-industries-with-igenius-and-nvidia-dgx-cloud/>`_
* [11/2024] `Developing a 172B LLM with Strong Japanese Capabilities Using NVIDIA Megatron-LM <https://developer.nvidia.com/blog/developing-a-172b-llm-with-strong-japanese-capabilities-using-nvidia-megatron-lm/>`_ * [11/2024] `Developing a 172B LLM with Strong Japanese Capabilities Using NVIDIA Megatron-LM <https://developer.nvidia.com/blog/developing-a-172b-llm-with-strong-japanese-capabilities-using-nvidia-megatron-lm/>`_
* [11/2024] `How FP8 boosts LLM training by 18% on Amazon SageMaker P5 instances <https://aws.amazon.com/blogs/machine-learning/how-fp8-boosts-llm-training-by-18-on-amazon-sagemaker-p5-instances/>`_ * [11/2024] `How FP8 boosts LLM training by 18% on Amazon SageMaker P5 instances <https://aws.amazon.com/blogs/machine-learning/how-fp8-boosts-llm-training-by-18-on-amazon-sagemaker-p5-instances/>`_
* [11/2024] `Efficiently train models with large sequence lengths using Amazon SageMaker model parallel <https://aws.amazon.com/blogs/machine-learning/efficiently-train-models-with-large-sequence-lengths-using-amazon-sagemaker-model-parallel/>`_ * [11/2024] `Efficiently train models with large sequence lengths using Amazon SageMaker model parallel <https://aws.amazon.com/blogs/machine-learning/efficiently-train-models-with-large-sequence-lengths-using-amazon-sagemaker-model-parallel/>`_
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
...@@ -45,6 +45,16 @@ nsys profile \ ...@@ -45,6 +45,16 @@ nsys profile \
--trace=cuda,nvtx,cudnn,cublas \ --trace=cuda,nvtx,cudnn,cublas \
python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe nvfp4 python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe nvfp4
# Example for jagged input benchmark to simulate unbalanced token splits
python benchmarks/linear/benchmark_grouped_linear.py --recipe nvfp4 --jagged-input "15296,8960,14656,14784,11712,7936,14080,10880"
# Example to look at a single kernel target with NCU, like the fused hadamard amax kernel for NVFP4 recipe
ncu -f -o ./benchmarks/linear/ncu_b200_numgemm_8_nvfp4_rht_amax \
--set=full \
--kernel-name "GroupHadamardAmaxTmaKernel" \
-s 5 -c 5 \
python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe nvfp4
""" """
RECIPES = { RECIPES = {
...@@ -163,7 +173,9 @@ def benchmark_linear( ...@@ -163,7 +173,9 @@ def benchmark_linear(
return timing_ms return timing_ms
def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4): def run_benchmark_linear(
mkns, recipe_name, use_bias, num_gemms=4, m_splits_provided=None, fwd_only=False
):
data = [] data = []
assert not use_bias, "Bias is not supported for GroupedLinear benchmark" assert not use_bias, "Bias is not supported for GroupedLinear benchmark"
...@@ -172,13 +184,14 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4): ...@@ -172,13 +184,14 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4):
device = "cuda" device = "cuda"
x = torch.randn((m, k), dtype=torch.bfloat16, device=device, requires_grad=True) x = torch.randn((m, k), dtype=torch.bfloat16, device=device, requires_grad=True)
ws = [torch.randn((n, k), dtype=torch.bfloat16, device=device) for _ in range(num_gemms)] ws = [torch.randn((n, k), dtype=torch.bfloat16, device=device) for _ in range(num_gemms)]
assert m % num_gemms == 0 m_splits = [m // num_gemms] * num_gemms if m_splits_provided is None else m_splits_provided
m_splits = [m // num_gemms] * num_gemms
# Bias is not supported for GroupedLinear benchmark # Bias is not supported for GroupedLinear benchmark
bias = None bias = None
# Run the benchmark # Run the benchmark
print(f"fwd_m={m}, fwd_k={k}, fwd_n={n}") print(f"fwd_m={m}, fwd_k={k}, fwd_n={n}")
print(f"m_splits: {m_splits}")
print(f"fwd_only: {fwd_only}")
grouped_fwd_bwd_timing_ms = benchmark_linear( grouped_fwd_bwd_timing_ms = benchmark_linear(
x, x,
...@@ -186,7 +199,7 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4): ...@@ -186,7 +199,7 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4):
m_splits, m_splits,
bias, bias,
recipe_name, recipe_name,
mode="fwd_bwd", mode="fwd_only" if fwd_only else "fwd_bwd",
num_gemms=num_gemms, num_gemms=num_gemms,
) )
...@@ -202,6 +215,8 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4): ...@@ -202,6 +215,8 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4):
] ]
) )
timing_notation = "grouped_fwd_time_ms" if fwd_only else "grouped_fwd_bwd_time_ms"
df = pd.DataFrame( df = pd.DataFrame(
data=data, data=data,
columns=[ columns=[
...@@ -210,7 +225,7 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4): ...@@ -210,7 +225,7 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4):
"n", "n",
"recipe", "recipe",
"num_gemms", "num_gemms",
"grouped_fwd_bwd_time_ms", timing_notation,
], ],
) )
...@@ -223,7 +238,7 @@ if __name__ == "__main__": ...@@ -223,7 +238,7 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--profile", action="store_true", help="Enable profiling mode") parser.add_argument("--profile", action="store_true", help="Enable profiling mode")
parser.add_argument( parser.add_argument(
"--output_dir", "--output-dir",
type=str, type=str,
default="benchmark_output/", default="benchmark_output/",
help="output path for report", help="output path for report",
...@@ -235,8 +250,41 @@ if __name__ == "__main__": ...@@ -235,8 +250,41 @@ if __name__ == "__main__":
default="bf16", default="bf16",
help="Recipe to use, options are fp8_sub_channel, mxfp8, bf16, or all", help="Recipe to use, options are fp8_sub_channel, mxfp8, bf16, or all",
) )
# add an argument for the jagged input
# example: [15296, 8960, 14656, 14784, 11712, 7936, 14080, 10880] => sums up to 98304
parser.add_argument(
"--jagged-input",
type=str,
default=None,
help="Jagged input to use, example: [15296, 8960, 14656, 14784, 11712, 7936, 14080, 10880]",
)
parser.add_argument(
"--hidden-dim",
type=int,
default=7168,
help="Hidden dimension to use, default is 7168",
)
parser.add_argument(
"--output-dim",
type=int,
default=2048,
help="Output dimension to use, default is 2048",
)
parser.add_argument(
"--fwd-only",
action="store_true",
default=False,
help="Run forward pass only, default is both forward and backward passes",
)
args = parser.parse_args() args = parser.parse_args()
jagged_input_splits = None
if args.jagged_input is not None:
jagged_input_splits = [int(x) for x in args.jagged_input.split(",")]
print(f"Jagged input splits: {jagged_input_splits}")
print(f"Jagged input splits sum: {sum(jagged_input_splits)}")
print(f"Jagged input splits num_gemms: {len(jagged_input_splits)}")
use_bias = False use_bias = False
# Set the MKN values to benchmark # Set the MKN values to benchmark
# Deepseek V3 EP64, SEQ_LEN=8192, topK8 # Deepseek V3 EP64, SEQ_LEN=8192, topK8
...@@ -256,11 +304,28 @@ if __name__ == "__main__": ...@@ -256,11 +304,28 @@ if __name__ == "__main__":
# 4 or 8local experts per rank # 4 or 8local experts per rank
num_gemms_list = [4, 8] num_gemms_list = [4, 8]
if jagged_input_splits is not None:
num_gemms_list = [len(jagged_input_splits)]
token_dim_list = [16384, 32768, 65536, 98304]
hidden_dim_list = [7168]
output_dim_list = [2048]
# override the default targets to benchmark if specified
if jagged_input_splits is not None:
token_dim_list = [sum(jagged_input_splits)]
if args.hidden_dim is not None:
hidden_dim_list = [args.hidden_dim]
if args.output_dim is not None:
output_dim_list = [args.output_dim]
# MKN for group linear # MKN for group linear
mkns = [] mkns = []
for m in [65536]: for m in token_dim_list:
for k in [7168]: for k in hidden_dim_list:
for n in [2048]: for n in output_dim_list:
mkns.append((m, k, n)) mkns.append((m, k, n))
# default recipes to run if not specified # default recipes to run if not specified
...@@ -272,14 +337,20 @@ if __name__ == "__main__": ...@@ -272,14 +337,20 @@ if __name__ == "__main__":
recipe_list = [args.recipe] recipe_list = [args.recipe]
if args.profile: if args.profile:
mkns = [(8192 * 8, 7168, 2048)] num_gemms_list = [8]
hidden_dim_to_profile = 7168 if args.hidden_dim is None else args.hidden_dim
output_dim_to_profile = 2048 if args.output_dim is None else args.output_dim
token_dim_to_profile = 8192 * 8
if jagged_input_splits is not None:
num_gemms_list = [len(jagged_input_splits)]
token_dim_to_profile = sum(jagged_input_splits)
mkns = [(token_dim_to_profile, hidden_dim_to_profile, output_dim_to_profile)]
# in profile mode, only run one recipe specified in args.recipe # in profile mode, only run one recipe specified in args.recipe
assert args.recipe != "all", ( assert args.recipe != "all", (
"In profile mode, only one recipe can be specified, please specify the recipe as" "In profile mode, only one recipe can be specified, please specify the recipe as"
" fp8_sub_channel, mxfp8, nvfp4, or bf16" " fp8_sub_channel, mxfp8, nvfp4, or bf16"
) )
recipe_list = [args.recipe] recipe_list = [args.recipe]
num_gemms_list = [8]
torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__() torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__()
# Initialize a dataframe to store the results # Initialize a dataframe to store the results
...@@ -310,6 +381,8 @@ if __name__ == "__main__": ...@@ -310,6 +381,8 @@ if __name__ == "__main__":
recipe_name, recipe_name,
use_bias, use_bias,
num_gemms=num_gemms, num_gemms=num_gemms,
m_splits_provided=jagged_input_splits,
fwd_only=args.fwd_only,
) )
df_linears = pd.concat([df_linears, df]) df_linears = pd.concat([df_linears, df])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment