Commit 970620a5 authored by wenjh

merge nv_release_v2.10 to release_v2.10


Signed-off-by: wenjh <wenjh@sugon.com>
parents c1a1c04e 769ed778
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
FROM ubuntu:22.04
ENV DEBIAN_FRONTEND=noninteractive
ENV CUDA_HOME=/usr/local/cuda
ENV PATH=$PATH:$CUDA_HOME/bin
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
ENV TORCH_CUDA_ARCH_LIST="6.0;6.1;7.0;7.5;8.0;8.6;9.0"
ARG PYTHON_VERSION=3.12
ARG TORCH_VERSION=2.9.1
ARG CUDA_VERSION=12.9.1
ARG CUDNN_MAJOR_VERSION=9
ENV PATH=/opt/venv/bin:$PATH
ENV PYTHONUNBUFFERED=1
ARG AARCH=x86_64
# Install Python
RUN apt-get update && \
    apt-get install -y software-properties-common wget && \
    add-apt-repository ppa:deadsnakes/ppa -y && \
    apt-get install -y python$PYTHON_VERSION-dev python$PYTHON_VERSION-venv python3-pip && \
    python$PYTHON_VERSION -m venv /opt/venv
# Install cuda-toolkit
RUN CUDA_MAJOR_VERSION=$(echo $CUDA_VERSION | awk -F \. {'print $1'}) && \
    CUDA_MINOR_VERSION=$(echo $CUDA_VERSION | awk -F \. {'print $2'}) && \
    rm /etc/apt/sources.list.d/cuda*.list || true && \
    rm /etc/apt/sources.list.d/nvidia-cuda.list || true && \
    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${AARCH}/cuda-keyring_1.1-1_all.deb && \
    dpkg -i cuda-keyring_1.1-1_all.deb && \
    rm cuda-keyring_1.1-1_all.deb && \
    apt-get update && \
    apt-get install -y cuda-toolkit-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} cudnn-cuda-$CUDA_MAJOR_VERSION libcudnn$CUDNN_MAJOR_VERSION-cuda-$CUDA_MAJOR_VERSION libnccl2 libnccl-dev cmake
# Install PyTorch
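# The inline Python below maps each torch minor release to the oldest/newest
# CUDA wheel suffix it ships (e.g. cu126..cu130 for torch 2.9) and picks the
# newest one unless the requested CUDA toolkit is older than 12.0.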
RUN export MATRIX_CUDA_VERSION=$(echo $CUDA_VERSION | awk -F \. {'print $1 $2'}) && \
    export MATRIX_TORCH_VERSION=$(echo $TORCH_VERSION | awk -F \. {'print $1 "." $2'}) && \
    export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
        minv = {'2.5': 118, '2.6': 118, '2.7': 118, '2.8': 126, '2.9': 126}[env['MATRIX_TORCH_VERSION']]; \
        maxv = {'2.5': 124, '2.6': 126, '2.7': 128, '2.8': 129, '2.9': 130}[env['MATRIX_TORCH_VERSION']]; \
        print(minv if int(env['MATRIX_CUDA_VERSION']) < 120 else maxv)" \
    ) && \
    pip install --no-cache-dir torch==${TORCH_VERSION} --index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION}
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
name: Build PyTorch Wheel
description: Builds a PyTorch wheel for TransformerEngine
inputs:
  release-version:
    description: 'The release version to use for the build'
    required: true
  python-version:
    description: 'The Python version to use for the build'
    required: true
  cuda-version:
    description: 'The CUDA version to use for the build'
    required: true
  cudnn-version:
    description: 'The cuDNN version to use for the build'
    required: true
  torch-version:
    description: 'The PyTorch version to use for the build'
    required: true
  cxx11_abi:
    description: 'Enable torch flag C++11 ABI (TRUE/FALSE)'
    required: true
  base-image:
    description: 'The base image to use for the build'
    required: false
  aarch:
    description: 'The architecture to use for the build'
    required: true
outputs:
  wheel_name:
    description: 'The name of the built wheel'
    value: ${{ steps.build_wheel.outputs.wheel_name }}
runs:
  using: 'composite'
  steps:
    - name: Move /var/lib/docker/
      shell: bash -euxo pipefail {0}
      run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker"
    - name: Maximize build space
      uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794
      with:
        root-reserve-mb: 5120
        temp-reserve-mb: 32
        swap-size-mb: 10240
        remove-dotnet: 'true'
        remove-android: 'true'
        remove-haskell: 'true'
        remove-codeql: 'true'
        build-mount-path: '/var/lib/docker/'
    - name: Restore /var/lib/docker/
      shell: bash -euxo pipefail {0}
      run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker"
    - name: Checkout
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.release-version }}
        submodules: recursive
    - name: Checkout build tools
      uses: actions/checkout@v4
      with:
        path: build-tools
        submodules: recursive
    - name: Build image
      shell: bash -euxo pipefail {0}
      env:
        BASE_IMAGE: ${{ inputs.base-image }}
      run: |
        if [[ "${BASE_IMAGE}" == "" ]]; then
          docker build \
            -t transformer-engine-build \
            -f build-tools/.github/actions/build-pytorch-wheel/Dockerfile \
            --build-arg PYTHON_VERSION=${{ inputs.python-version }} \
            --build-arg TORCH_VERSION=${{ inputs.torch-version }} \
            --build-arg CUDA_VERSION=${{ inputs.cuda-version }} \
            --build-arg CUDNN_MAJOR_VERSION=${{ inputs.cudnn-version }} \
            --build-arg AARCH=${{ inputs.aarch }} \
            .
        else
          docker pull ${BASE_IMAGE}
          docker tag ${BASE_IMAGE} transformer-engine-build
        fi
    - name: Build wheel
      shell: bash -euxo pipefail {0}
      id: build_wheel
      env:
        CXX11_ABI: ${{ inputs.cxx11_abi }}
      run: |
        echo ::group::Build wheel
        EXIT_CODE=$(docker run \
          --rm \
          --shm-size=64g \
          --workdir /workspace/transformer_engine/pytorch \
          --volume $(pwd):/workspace \
          --volume $GITHUB_OUTPUT:$GITHUB_OUTPUT \
          -e PIP_CONSTRAINT= \
          -e CXX11_ABI=$CXX11_ABI \
          -e GITHUB_OUTPUT=$GITHUB_OUTPUT \
          transformer-engine-build bash /workspace/build-tools/.github/actions/build-pytorch-wheel/build.sh | tail -n 1)
        echo ::endgroup::
        # Propagate the exit code that build.sh prints as its last line of output
        exit $EXIT_CODE
    - name: Log Built Wheels
      shell: bash -euxo pipefail {0}
      run: |
        ls transformer_engine/pytorch/dist
#!/bin/bash
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
set -euxo pipefail
export NVTE_PYTORCH_FORCE_BUILD=TRUE
export NVTE_NO_LOCAL_VERSION=1
export NVTE_PYTORCH_FORCE_CXX11_ABI=$CXX11_ABI
export PIP_CONSTRAINT=
pip install wheel packaging nvidia-mathdx ninja pybind11
# 5h timeout since GH allows max 6h and we want some buffer
EXIT_CODE=0
timeout 5h python setup.py bdist_wheel --dist-dir=dist || EXIT_CODE=$?
if [ $EXIT_CODE -eq 0 ]; then
    wheel_name=$(python -c "import setup; print(setup.get_wheel_url()[1])" | tail -n 1)
    ls dist/*.whl | xargs -I {} mv {} "dist/${wheel_name}"
    echo "wheel_name=${wheel_name}" | tee -a "$GITHUB_OUTPUT"
fi
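# The composite action captures this script's output through `tail -n 1`,
# so the exit code echoed below must stay the last line this script prints.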
echo $EXIT_CODE
#!/bin/bash
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
# Configuration
BASE_IMAGE="nvcr.io/nvidia/pytorch"
TAG_SUFFIX="-py3"
MONTHS_TO_CHECK=5 # Check current month and previous 4 months (total 5)
# Initialize an array to store existing tags
EXISTING_TAGS=()
echo "Checking for existence of the last ${MONTHS_TO_CHECK} NGC PyTorch images: ${BASE_IMAGE}:YY.MM${TAG_SUFFIX}"
echo "---------------------------------------------------------------------"
# Loop through the last N months
for i in $(seq 0 $((MONTHS_TO_CHECK - 1))); do
    # Calculate Year and Month for the tag
    CURRENT_YEAR=$(date +%Y)
    CURRENT_MONTH=$(date +%m)
    # Calculate target month and year
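    # Anchoring to day 01 keeps GNU date's month arithmetic from rolling over
    # at month ends (e.g. "May 31 -1 month" would normalize to May 1, not April).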
    TARGET_DATE=$(date -d "$CURRENT_YEAR-$CURRENT_MONTH-01 -$i months" +%y.%m)
    # Construct the full image tag and the tag-only string
    IMAGE_TAG="${TARGET_DATE}${TAG_SUFFIX}"
    FULL_IMAGE="${BASE_IMAGE}:${IMAGE_TAG}"
    echo "Checking: ${FULL_IMAGE}"
    # Use 'docker manifest inspect' to check for image existence without pulling.
    if docker manifest inspect "${FULL_IMAGE}" > /dev/null 2>&1; then
        echo "✅ EXISTS: Found."
        # Add the tag-only string to the array
        EXISTING_TAGS+=("nvcr.io/nvidia/pytorch:${IMAGE_TAG}")
    else
        echo "❌ MISSING: Not found."
    fi
done
echo "---------------------------------------------------------------------"
## JSON Output Generation
# This uses the collected array to build a JSON string.
# 1. Convert the shell array to a newline-separated string.
TAGS_NL_SEP=$(printf "%s\n" "${EXISTING_TAGS[@]}")
# 2. Use jq to read the newline-separated list and format it into a JSON array.
# . | split("\n") | .[:-1] reads the input, splits it by newline, and removes the trailing empty element.
if command -v jq &> /dev/null; then
    JSON_STRING=$(echo -e "${TAGS_NL_SEP}" | jq -R -s 'split("\n") | .[:-1]')
    echo "Generated JSON String of Existing Tags:"
    echo "${JSON_STRING}"
    # Optional: Save the JSON string to a variable for further use
    # echo "JSON_STRING is now available in the shell if you source this script."
else
    echo "WARNING: 'jq' is not installed. Cannot format output as JSON."
    echo "Found Tags: ${EXISTING_TAGS[*]}"
fi
echo "---"
echo "Check complete."
echo "${JSON_STRING}" > ngc_images.json
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
# This workflow will:
# - Create a new GitHub release
# - Build wheels for supported architectures
# - Deploy the wheels to the GitHub release
# - Release the static code to PyPI
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
name: Attach wheels to release
on:
  release:
    types: [published]
  workflow_dispatch:
    inputs:
      runs-on:
        description: 'The runner to use for the build'
        required: true
        type: string
        default: ubuntu-22.04
      release-version:
        description: 'Release version'
        required: true
        default: '0.1.0'
      python-version:
        description: 'Python version'
        required: true
        default: '3.12'
      torch-version:
        description: 'Torch version'
        required: true
        default: '2.8.0'
      cuda-version:
        description: 'CUDA version'
        required: true
        default: '12.9.1'
      cudnn-version:
        description: 'CUDNN version'
        required: true
        default: '9'
      cxx11_abi:
        description: 'C++11 ABI'
        required: true
        type: choice
        default: 'TRUE'
        options:
          - 'TRUE'
          - 'FALSE'
      ngc-image:
        description: 'NGC PyTorch image (will take precedence over the source build)'
        required: false
        type: string
        default: ''
jobs:
  pre-flight:
    runs-on: ubuntu-latest
    outputs:
      build-wheel-matrix: ${{ steps.matrix.outputs.matrix }}
      release-assets-url: ${{ steps.release-assets-url.outputs.upload_url }}
      ngc-images: ${{ steps.check_for_ngc_images.outputs.IMAGES }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Build release matrix
        id: matrix
        env:
          EVENT: ${{ github.event_name }}
        run: |
          if [[ "$EVENT" == "release" ]]; then
            MATRIX=$(echo '{
              "os": ["ubuntu-22.04", "ubuntu-22.04-arm"],
              "release-version": ["${{ github.event.release.tag_name }}"],
              "python-version": ["3.12"],
              "torch-version": ["2.8.0"],
              "cuda-version": ["12.9.1"],
              "cudnn-version": ["9"],
              "cxx11_abi": ["TRUE"]
            }' | jq -rc)
          else
            MATRIX=$(echo '{
              "os": ["${{ inputs.runs-on }}"],
              "release-version": ["${{ inputs.release-version }}"],
              "python-version": ["${{ inputs.python-version }}"],
              "torch-version": ["${{ inputs.torch-version }}"],
              "cuda-version": ["${{ inputs.cuda-version }}"],
              "cudnn-version": ["${{ inputs.cudnn-version }}"],
              "cxx11_abi": ["${{ inputs.cxx11_abi }}"]
            }' | jq -rc)
          fi
          echo "matrix=$MATRIX" | tee -a "$GITHUB_OUTPUT"
      - name: Get Release with tag
        id: get_current_release
        uses: joutvhu/get-release@v1
        if: ${{ github.event_name == 'workflow_dispatch' }}
        with:
          tag_name: ${{ inputs.release-version }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Get release assets url
        env:
          EVENT: ${{ github.event_name }}
        if: ${{ (success() || !failure()) && !cancelled() }}
        id: release-assets-url
        run: |
          if [[ "$EVENT" == "release" ]]; then
            echo "upload_url=${{ github.event.release.upload_url }}" | tee -a "$GITHUB_OUTPUT"
          else
            echo "upload_url=${{ steps.get_current_release.outputs.upload_url }}" | tee -a "$GITHUB_OUTPUT"
          fi
      - name: Check for NGC PyTorch images
        id: check_for_ngc_images
        if: ${{ (success() || !failure()) && !cancelled() }}
        env:
          EVENT: ${{ github.event_name }}
        run: |
          if [[ "$EVENT" == "release" ]]; then
            bash ./.github/scripts/check_for_ngc_images.sh
            echo "IMAGES=$(cat ngc_images.json | jq -cr)" | tee -a $GITHUB_OUTPUT
          else
            echo 'IMAGES=["${{ inputs.ngc-image }}"]' | tee -a "$GITHUB_OUTPUT"
          fi
  build_wheels:
    name: Build Wheel
    runs-on: ${{ matrix.os }}
    needs: pre-flight
    if: ${{ github.event_name == 'release' || inputs.ngc-image == '' }}
    strategy:
      fail-fast: false
      matrix: ${{ fromJson(needs.pre-flight.outputs.build-wheel-matrix) }}
    steps:
      - name: 'Checkout'
        uses: actions/checkout@v3
      - name: 'Build PyTorch Wheel'
        uses: ./.github/actions/build-pytorch-wheel
        id: build-pytorch-wheel
        with:
          release-version: ${{ matrix.release-version }}
          python-version: ${{ matrix.python-version }}
          cuda-version: ${{ matrix.cuda-version }}
          cudnn-version: ${{ matrix.cudnn-version }}
          torch-version: ${{ matrix.torch-version }}
          cxx11_abi: ${{ matrix.cxx11_abi }}
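          # GitHub expressions have no ternary operator; the `cond && a || b`
          # idiom selects x86_64 on the x86 runner and sbsa on the arm runner.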
          aarch: ${{ matrix.os == 'ubuntu-22.04' && 'x86_64' || 'sbsa' }}
        env:
          NVTE_FRAMEWORK: pytorch
          MAX_JOBS: 1
      - name: Upload Release Asset
        id: upload_release_asset
        uses: actions/upload-release-asset@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          upload_url: ${{ needs.pre-flight.outputs.release-assets-url }}
          asset_path: ./transformer_engine/pytorch/dist/${{ steps.build-pytorch-wheel.outputs.wheel_name }}
          asset_name: ${{ steps.build-pytorch-wheel.outputs.wheel_name }}
          asset_content_type: application/*
  build_wheels_for_ngc:
    name: Build Wheels for NGC PyTorch images
    runs-on: ${{ matrix.os }}
    needs: pre-flight
    if: ${{ github.event_name == 'release' || inputs.ngc-image != '' }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-22.04]
        container-image: ${{ fromJson(needs.pre-flight.outputs.ngc-images) }}
    steps:
      - name: 'Checkout'
        uses: actions/checkout@v3
      - name: 'Build PyTorch Wheel'
        uses: ./.github/actions/build-pytorch-wheel
        id: build-pytorch-wheel
        with:
          base-image: ${{ matrix.container-image }}
      - name: Upload Release Asset
        id: upload_release_asset
        uses: actions/upload-release-asset@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          upload_url: ${{ needs.pre-flight.outputs.release-assets-url }}
          asset_path: ./transformer_engine/pytorch/dist/${{ steps.build-pytorch-wheel.outputs.wheel_name }}
          asset_name: ${{ steps.build-pytorch-wheel.outputs.wheel_name }}
          asset_content_type: application/*
@@ -35,26 +35,52 @@ jobs:
   pytorch:
     name: 'PyTorch'
     runs-on: ubuntu-latest
-    container:
-      image: nvcr.io/nvidia/cuda:12.8.0-devel-ubuntu22.04
-      options: --user root
     steps:
-      - name: 'Dependencies'
-        run: |
-          apt-get update
-          apt-get install -y git python3.9 pip cudnn9-cuda-12
-          pip install cmake torch ninja pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops onnxscript
+      - name: Move /var/lib/docker/
+        shell: bash -euxo pipefail {0}
+        run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker"
+      - name: Maximize build space
+        uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794
+        with:
+          root-reserve-mb: 5120
+          temp-reserve-mb: 32
+          swap-size-mb: 10240
+          remove-dotnet: 'true'
+          remove-android: 'true'
+          remove-haskell: 'true'
+          remove-codeql: 'true'
+          build-mount-path: '/var/lib/docker/'
+      - name: Restore /var/lib/docker/
+        shell: bash -euxo pipefail {0}
+        run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker"
       - name: 'Checkout'
         uses: actions/checkout@v3
         with:
           submodules: recursive
+      - name: Start named container
+        run: |
+          docker run -v $(pwd):$(pwd) -w $(pwd) --name builder -d nvcr.io/nvidia/cuda:12.8.0-devel-ubuntu22.04 sleep infinity
+      - name: 'Dependencies'
+        run: |
+          docker exec builder bash -c '\
+            apt-get update && \
+            apt-get install -y git python3.9 pip cudnn9-cuda-12 && \
+            pip install cmake torch ninja pydantic "importlib-metadata>=1.0" packaging pybind11 numpy einops onnxscript && \
+            apt-get clean \
+          '
       - name: 'Build'
-        run: pip install --no-build-isolation . -v --no-deps
+        run: docker exec builder bash -c 'pip install --no-build-isolation . -v --no-deps'
         env:
           NVTE_FRAMEWORK: pytorch
           MAX_JOBS: 1
       - name: 'Sanity check'
-        run: python3 tests/pytorch/test_sanity_import.py
+        run: docker exec builder bash -c 'python3 tests/pytorch/test_sanity_import.py'
   jax:
     name: 'JAX'
     runs-on: ubuntu-latest
@@ -78,22 +104,47 @@ jobs:
   all:
     name: 'All'
     runs-on: ubuntu-latest
-    container:
-      image: ghcr.io/nvidia/jax:jax
-      options: --user root
     steps:
-      - name: 'Dependencies'
-        run: |
-          pip install pybind11[global] einops onnxscript
-          pip install torch --index-url https://download.pytorch.org/whl/cu130
+      - name: Move /var/lib/docker/
+        shell: bash -euxo pipefail {0}
+        run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker"
+      - name: Maximize build space
+        uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794
+        with:
+          root-reserve-mb: 5120
+          temp-reserve-mb: 32
+          swap-size-mb: 10240
+          remove-dotnet: 'true'
+          remove-android: 'true'
+          remove-haskell: 'true'
+          remove-codeql: 'true'
+          build-mount-path: '/var/lib/docker/'
+      - name: Restore /var/lib/docker/
+        shell: bash -euxo pipefail {0}
+        run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker"
       - name: 'Checkout'
        uses: actions/checkout@v3
         with:
           submodules: recursive
+      - name: Start named container
+        run: |
+          docker run -v $(pwd):$(pwd) -w $(pwd) --name builder -d ghcr.io/nvidia/jax:jax sleep infinity
+      - name: 'Dependencies'
+        run: |
+          docker exec builder bash -c '\
+            pip install pybind11[global] einops onnxscript && \
+            pip install torch --no-cache-dir --index-url https://download.pytorch.org/whl/cu130
+          '
       - name: 'Build'
-        run: pip install --no-build-isolation . -v --no-deps
+        run: docker exec builder bash -c 'pip install --no-cache-dir --no-build-isolation . -v --no-deps'
         env:
           NVTE_FRAMEWORK: all
           MAX_JOBS: 1
       - name: 'Sanity check'
-        run: python3 tests/pytorch/test_sanity_import.py && python3 tests/jax/test_sanity_import.py
+        run: docker exec builder bash -c 'python3 tests/pytorch/test_sanity_import.py && python3 tests/jax/test_sanity_import.py'
@@ -22,10 +22,10 @@ jobs:
           sudo apt-get install -y pandoc graphviz doxygen
           export GIT_SHA=$(git show-ref --hash HEAD)
       - name: 'Build docs'
-        run: |
+        run: | # SPHINXOPTS="-W" errors out on warnings
           doxygen docs/Doxyfile
           cd docs
-          make html
+          make html SPHINXOPTS="-W"
       - name: 'Upload docs'
         uses: actions/upload-artifact@v4
         with:
...
.venv
*.o
*.swp
*.ii
...
@@ -17,3 +17,5 @@ Common API
.. autoapiclass:: transformer_engine.common.recipe.Float8CurrentScaling(fp8_format=Format.HYBRID)
.. autoapiclass:: transformer_engine.common.recipe.Float8BlockScaling(fp8_format=Format.E4M3)
.. autoapiclass:: transformer_engine.common.recipe.CustomRecipe(qfactory, fp8_dpa=False, fp8_mha=False)
@@ -4,7 +4,7 @@
See LICENSE for license information.
Jax
=======
===
Pre-defined Variable of Logical Axes
------------------------------------
@@ -20,11 +20,11 @@ Variables are available in `transformer_engine.jax.sharding`.
Checkpointing
------------------------------------
-------------
When using checkpointing with Transformer Engine JAX, please be aware of the checkpointing policy being applied to your model. Any JAX checkpointing policy using `dot`, such as `jax.checkpoint_policies.dots_with_no_batch_dims`, may not work with GEMMs provided by Transformer Engine as they do not always use the `jax.lax.dot_general` primitive. Instead, you can use `transformer_engine.jax.checkpoint_policies.dots_and_te_gemms_with_no_batch_dims` or similar policies that are designed to work with Transformer Engine's GEMMs and `jax.lax.dot_general` GEMMs. You may also use any JAX policies that do not filter by primitive, such as `jax.checkpoint_policies.save_only_these_names` or `jax.checkpoint_policies.everything_saveable`.
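A minimal sketch of applying such a policy (only the policy path is taken from the paragraph above; ``loss_fn`` is a hypothetical function built from Transformer Engine JAX modules):

.. code-block:: python

    import jax
    import transformer_engine.jax as te_jax

    def loss_fn(params, batch):
        # Hypothetical loss built from Transformer Engine JAX modules.
        ...

    # A TE-aware policy keeps TE GEMM results saveable even when they bypass
    # the `jax.lax.dot_general` primitive.
    checkpointed_loss_fn = jax.checkpoint(
        loss_fn,
        policy=te_jax.checkpoint_policies.dots_and_te_gemms_with_no_batch_dims,
    )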
Modules
------------------------------------
-------
.. autoapiclass:: transformer_engine.jax.flax.TransformerLayerType
.. autoapiclass:: transformer_engine.jax.MeshResource()
...
@@ -3,7 +3,7 @@
See LICENSE for license information.
pyTorch
PyTorch
=======
.. autoapiclass:: transformer_engine.pytorch.Linear(in_features, out_features, bias=True, **kwargs)
@@ -37,9 +37,6 @@ pyTorch
.. autoapiclass:: transformer_engine.pytorch.CudaRNGStatesTracker()
:members: reset, get_states, set_states, add, fork
.. autoapifunction:: transformer_engine.pytorch.fp8_autocast
.. autoapifunction:: transformer_engine.pytorch.fp8_model_init
.. autoapifunction:: transformer_engine.pytorch.autocast
@@ -47,6 +44,16 @@ pyTorch
.. autoapifunction:: transformer_engine.pytorch.checkpoint
.. autoapifunction:: transformer_engine.pytorch.make_graphed_callables
.. autoapifunction:: transformer_engine.pytorch.get_cpu_offload_context
.. autoapifunction:: transformer_engine.pytorch.parallel_cross_entropy
Recipe availability
-------------------
.. autoapifunction:: transformer_engine.pytorch.is_fp8_available
.. autoapifunction:: transformer_engine.pytorch.is_mxfp8_available
@@ -63,9 +70,8 @@ pyTorch
.. autoapifunction:: transformer_engine.pytorch.get_default_recipe
.. autoapifunction:: transformer_engine.pytorch.make_graphed_callables
.. autoapifunction:: transformer_engine.pytorch.get_cpu_offload_context
Mixture of Experts (MoE) functions
----------------------------------
.. autoapifunction:: transformer_engine.pytorch.moe_permute
@@ -75,13 +81,71 @@ pyTorch
.. autoapifunction:: transformer_engine.pytorch.moe_sort_chunks_by_index
.. autoapifunction:: transformer_engine.pytorch.parallel_cross_entropy
.. autoapifunction:: transformer_engine.pytorch.moe_sort_chunks_by_index_with_probs
Communication-computation overlap
---------------------------------
.. autoapifunction:: transformer_engine.pytorch.initialize_ub
.. autoapifunction:: transformer_engine.pytorch.destroy_ub
.. autoapiclass:: transformer_engine.pytorch.UserBufferQuantizationMode
:members: FP8, NONE
Quantized tensors
-----------------
.. autoapiclass:: transformer_engine.pytorch.QuantizedTensorStorage
:members: update_usage, prepare_for_saving, restore_from_saved
.. autoapiclass:: transformer_engine.pytorch.QuantizedTensor(shape, dtype, *, requires_grad=False, device=None)
:members: dequantize, quantize_
.. autoapiclass:: transformer_engine.pytorch.Float8TensorStorage(data, fp8_scale_inv, fp8_dtype, data_transpose=None, quantizer=None)
.. autoapiclass:: transformer_engine.pytorch.MXFP8TensorStorage(rowwise_data, rowwise_scale_inv, columnwise_data, columnwise_scale_inv, fp8_dtype, quantizer)
.. autoapiclass:: transformer_engine.pytorch.Float8BlockwiseQTensorStorage(rowwise_data, rowwise_scale_inv, columnwise_data, columnwise_scale_inv, fp8_dtype, quantizer, is_2D_scaled, data_format)
.. autoapiclass:: transformer_engine.pytorch.NVFP4TensorStorage(rowwise_data, rowwise_scale_inv, columnwise_data, columnwise_scale_inv, amax_rowwise, amax_columnwise, fp4_dtype, quantizer)
.. autoapiclass:: transformer_engine.pytorch.Float8Tensor(shape, dtype, data, fp8_scale_inv, fp8_dtype, requires_grad=False, data_transpose=None, quantizer=None)
.. autoapiclass:: transformer_engine.pytorch.MXFP8Tensor(rowwise_data, rowwise_scale_inv, columnwise_data, columnwise_scale_inv, fp8_dtype, quantizer)
.. autoapiclass:: transformer_engine.pytorch.Float8BlockwiseQTensor(rowwise_data, rowwise_scale_inv, columnwise_data, columnwise_scale_inv, fp8_dtype, quantizer, is_2D_scaled, data_format)
.. autoapiclass:: transformer_engine.pytorch.NVFP4Tensor(rowwise_data, rowwise_scale_inv, columnwise_data, columnwise_scale_inv, amax_rowwise, amax_columnwise, fp4_dtype, quantizer)
Quantizers
----------
.. autoapiclass:: transformer_engine.pytorch.Quantizer(rowwise, columnwise)
:members: update_quantized, quantize
.. autoapiclass:: transformer_engine.pytorch.Float8Quantizer(scale, amax, fp8_dtype, *, rowwise=True, columnwise=True)
.. autoapiclass:: transformer_engine.pytorch.Float8CurrentScalingQuantizer(fp8_dtype, device, *, rowwise=True, columnwise=True, **kwargs)
.. autoapiclass:: transformer_engine.pytorch.MXFP8Quantizer(fp8_dtype, *, rowwise=True, columnwise=True)
.. autoapiclass:: transformer_engine.pytorch.Float8BlockQuantizer(fp8_dtype, *, rowwise, columnwise, **kwargs)
.. autoapiclass:: transformer_engine.pytorch.NVFP4Quantizer(fp4_dtype, *, rowwise=True, columnwise=True, **kwargs)
Tensor saving and restoring functions
-------------------------------------
.. autoapifunction:: transformer_engine.pytorch.prepare_for_saving
.. autoapifunction:: transformer_engine.pytorch.restore_from_saved
Deprecated functions
--------------------
.. autoapifunction:: transformer_engine.pytorch.fp8_autocast
.. autoapifunction:: transformer_engine.pytorch.fp8_model_init
@@ -61,7 +61,11 @@ extensions = [
]
templates_path = ["_templates"]
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
exclude_patterns = [
    "_build",
    "Thumbs.db",
    "sphinx_rtd_theme",
]
source_suffix = ".rst"
@@ -94,6 +98,7 @@ napoleon_custom_sections = [
("Values", "params_style"),
("Graphing parameters", "params_style"),
("FP8-related parameters", "params_style"),
("Quantization parameters", "params_style"),
]
breathe_projects = {"TransformerEngine": root_path / "docs" / "doxygen" / "xml"}
@@ -101,3 +106,23 @@ breathe_default_project = "TransformerEngine"
autoapi_generate_api_docs = False
autoapi_dirs = [root_path / "transformer_engine"]
autoapi_ignore = ["*test*"]
# There are 2 warnings about the same namespace (transformer_engine) in two different C++ API
# docs pages. This seems to be the only way to suppress these warnings.
def setup(app):
    """Custom Sphinx setup to filter warnings."""
    import logging

    # Filter out duplicate C++ declaration warnings
    class DuplicateDeclarationFilter(logging.Filter):
        def filter(self, record):
            message = record.getMessage()
            if "Duplicate C++ declaration" in message and "transformer_engine" in message:
                return False
            return True

    # Apply the filter to the Sphinx logger
    logger = logging.getLogger("sphinx")
    logger.addFilter(DuplicateDeclarationFilter())
@@ -2,8 +2,9 @@
Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
See LICENSE for license information.
Precision debug tools
==============================================
=====================
.. toctree::
:caption: Precision debug tools
...
@@ -4,7 +4,7 @@
See LICENSE for license information.
Getting started
==============
===============
.. note::
@@ -38,7 +38,7 @@ To start debugging, one needs to create a configuration YAML file. This file lis
one - ``UserProvidedPrecision`` - is a custom feature implemented by the user. Nvidia-DL-Framework-Inspect inserts features into the layers according to the config.
Example training script
----------------------
-----------------------
Let's look at a simple example of training a Transformer layer using Transformer Engine with FP8 precision. This example demonstrates how to set up the layer, define an optimizer, and perform a few training iterations using synthetic data.
@@ -81,7 +81,7 @@ We will demonstrate two debug features on the code above:
2. Logging statistics for other GEMM operations, such as gradient statistics for data gradient GEMM within the LayerNormLinear sub-layer of the TransformerLayer.
Config file
----------
-----------
We need to prepare the configuration YAML file, as below
@@ -114,7 +114,8 @@ We need to prepare the configuration YAML file, as below
Further explanation on how to create config files is in the :doc:`next part of the documentation <2_config_file_structure>`.
Adjusting Python file
--------------------
---------------------
.. code-block:: python
@@ -145,7 +146,8 @@ In the modified code above, the following changes were made:
3. Added ``debug_api.step()`` after each forward-backward pass.
Inspecting the logs
------------------
-------------------
Let's look at the files with the logs. Two files will be created:
@@ -213,7 +215,8 @@ The second log file (``nvdlfw_inspect_statistics_logs/nvdlfw_inspect_globalrank-
INFO - transformer_layer.self_attention.layernorm_qkv_activation_l1_norm iteration=000004 value=130776.7969
Logging using TensorBoard
------------------------
-------------------------
Precision debug tools support logging using `TensorBoard <https://www.tensorflow.org/tensorboard>`_. To enable it, one needs to pass the argument ``tb_writer`` to the ``debug_api.initialize()``. Let's modify ``train.py`` file.
...
@@ -4,13 +4,14 @@
See LICENSE for license information.
Config File Structure
====================
=====================
To enable debug features, create a configuration YAML file to specify the desired behavior, such as determining which GEMMs (General Matrix Multiply operations) should run in higher precision rather than FP8 and defining which statistics to log.
Below, we outline how to structure the configuration YAML file.
General Format
-------------
--------------
A config file can have one or more sections, each containing settings for specific layers and features:
@@ -55,7 +56,8 @@ Sections may have any name and must contain:
3. Additional fields describing features for those layers.
Layer Specification
------------------
-------------------
Debug layers can be identified by a ``name`` parameter:
@@ -89,7 +91,8 @@ Examples:
(...)
Names in Transformer Layers
--------------------------
---------------------------
There are three ways to assign a name to a layer in the Transformer Engine:
@@ -154,7 +157,7 @@ Below is an example ``TransformerLayer`` with four linear layers that can be inf
Structured Configuration for GEMMs and Tensors
---------------------------------------------
----------------------------------------------
Sometimes a feature is parameterized by a list of tensors or by a list of GEMMs.
There are multiple ways of describing this parameterization.
@@ -216,7 +219,7 @@ We can use both structs for tensors and GEMMs. The tensors_struct should be nest
gemm_feature_param1: value
Enabling or Disabling Sections and Features
------------------------------------------
-------------------------------------------
Debug features can be enabled or disabled with the ``enabled`` keyword:
...
@@ -11,7 +11,8 @@ Please refer to the Nvidia-DL-Framework-Inspect `documentation <https://github.c
Below, we outline the steps for debug initialization.
initialize()
-----------
------------
Must be called once on every rank in the global context to initialize Nvidia-DL-Framework-Inspect.
@@ -34,7 +35,7 @@ Must be called once on every rank in the global context to initialize Nvidia-DL-
log_dir="./log_dir")
set_tensor_reduction_group()
--------------------------
----------------------------
Needed only for logging tensor stats. In multi-GPU training, activation and gradient tensors are distributed across multiple nodes. This method lets you specify the group for the reduction of stats; see the `reduction group section <./4_distributed.rst#reduction-groups>`_ for more details.
@@ -61,7 +62,7 @@ If the tensor reduction group is not specified, then statistics are reduced acro
# activation/gradient tensor statistics are reduced along pipeline_parallel_group
set_weight_tensor_tp_group_reduce()
---------------------------------
-----------------------------------
By default, weight tensor statistics are reduced within the tensor parallel group. This function allows you to disable that behavior; for more details, see `reduction group section <./4_distributed.rst#reduction-groups>`_.
...
@@ -4,7 +4,7 @@
See LICENSE for license information.
Debug features
==========
==============
.. autoapiclass:: transformer_engine.debug.features.log_tensor_stats.LogTensorStats
.. autoapiclass:: transformer_engine.debug.features.log_fp8_tensor_stats.LogFp8TensorStats
...
@@ -4,7 +4,7 @@
See LICENSE for license information.
Distributed training
===================
====================
Nvidia-Pytorch-Inspect with Transformer Engine supports multi-GPU training. This guide describes how to run it and how the supported features work in the distributed setting.
@@ -14,7 +14,8 @@ To use precision debug tools in multi-GPU training, one needs to:
2. If one wants to log stats, one may want to invoke ``debug_api.set_tensor_reduction_group`` with a proper reduction group.
Behavior of the features
-----------------------
------------------------
In a distributed setting, **DisableFP8GEMM** and **DisableFP8Layer** function similarly to the single-GPU case, with no notable differences.
@@ -28,7 +29,8 @@ In a distributed setting, **DisableFP8GEMM** and **DisableFP8Layer** function si
Logging-related features are more complex and will be discussed further in the next sections.
Reduction groups
--------------
----------------
In setups with tensor, data, or pipeline parallelism, some tensors are distributed across multiple GPUs, requiring a reduction operation to compute statistics for these tensors.
@@ -65,7 +67,8 @@ Below, we illustrate configurations for a 4-node setup with tensor parallelism s
Microbatching
-----------
-------------
Let's dive into how statistics collection works with microbatching. By microbatching, we mean invoking multiple ``forward()`` calls for each ``debug_api.step()``. The behavior is as follows:
@@ -73,7 +76,7 @@ Let's dive into how statistics collection works with microbatc
- For other tensors, the stats are accumulated.
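As a rough sketch of that behavior (``model`` and ``microbatches`` are hypothetical; ``debug_api`` is the handle initialized earlier):

.. code-block:: python

    # All forward/backward calls between two debug_api.step() calls fall into
    # the same statistics window, so activation/gradient stats accumulate.
    for microbatch in microbatches:
        loss = model(microbatch).sum()
        loss.backward()
    debug_api.step()  # close this window and advance to the next iteration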
Logging to files and TensorBoard
------------------------------
--------------------------------
In a single-node setup with ``default_logging_enabled=True``, all logs are saved by default to ``log_dir/nvdlfw_inspect_statistics_logs/nvdlfw_inspect_globalrank-0.log``. In multi-GPU training, each node writes its reduced statistics to its unique file, named ``log_dir/nvdlfw_inspect_statistics_logs/nvdlfw_inspect_globalrank-i.log`` for rank i. Because these logs contain reduced statistics, the logged values are identical for all nodes within a reduction group.
...
@@ -2,8 +2,9 @@
Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
See LICENSE for license information.
API
============
===
.. toctree::
:caption: Precision debug tools API
...