Commit c1a1c04e authored by wenjh

Merge nv_main(2.10) to main


Signed-off-by: wenjh <wenjh@sugon.com>
parents e698a0a7 66aed3ae
......@@ -19,7 +19,7 @@ jobs:
run: |
apt-get update
apt-get install -y git python3.9 pip cudnn9-cuda-12
pip install cmake==3.21.0 pybind11[global] ninja nvidia-mathdx==25.1.1
pip install cmake==3.21.0 pybind11[global] ninja
- name: 'Checkout'
uses: actions/checkout@v3
with:
......@@ -43,7 +43,7 @@ jobs:
run: |
apt-get update
apt-get install -y git python3.9 pip cudnn9-cuda-12
pip install cmake torch ninja pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops onnxscript nvidia-mathdx==25.1.1
pip install cmake torch ninja pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops onnxscript
- name: 'Checkout'
uses: actions/checkout@v3
with:
......@@ -63,7 +63,7 @@ jobs:
options: --user root
steps:
- name: 'Dependencies'
run: pip install pybind11[global] nvidia-mathdx==25.1.1
run: pip install pybind11[global]
- name: 'Checkout'
uses: actions/checkout@v3
with:
......@@ -83,7 +83,9 @@ jobs:
options: --user root
steps:
- name: 'Dependencies'
run: pip install torch pybind11[global] einops onnxscript nvidia-mathdx==25.1.1
run: |
pip install pybind11[global] einops onnxscript
pip install torch --index-url https://download.pytorch.org/whl/cu130
- name: 'Checkout'
uses: actions/checkout@v3
with:
......
recursive-include transformer_engine/common/include *.*
......@@ -205,7 +205,7 @@ pip Installation
**Prerequisites for pip installation:**
* A compatible C++ compiler
* CUDA Toolkit with cuDNN and NVCC (NVIDIA CUDA Compiler) installed
* CUDA Toolkit with cuDNN and NVCC (NVIDIA CUDA Compiler) if installing from source.
To install the latest stable version with pip:
......
......@@ -8,53 +8,67 @@ import torch.utils.benchmark as benchmark
import pandas as pd
from transformer_engine.pytorch.module import GroupedLinear
from transformer_engine.common.recipe import Float8BlockScaling, MXFP8BlockScaling
from transformer_engine.common.recipe import (
Float8BlockScaling,
MXFP8BlockScaling,
NVFP4BlockScaling,
)
from transformer_engine.pytorch.quantization import autocast, FP8GlobalStateManager
from contextlib import nullcontext
"""
# Profile BF16 recipe with Nsight Systems
nsys profile \
--output=./benchmarks/linear/b200_mkn_4096_4096_4096_numgemm_8_bf16 \
--output=./benchmarks/linear/b200_numgemm_8_bf16 \
--force-overwrite true \
--trace=cuda,nvtx,cudnn,cublas \
python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe bf16
# Profile FP8 sub-channel recipe with Nsight Systems
nsys profile \
--output=./benchmarks/linear/h100hbm_mkn_4096_4096_4096_numgemm_8_fp8_sub_channel \
--output=./benchmarks/linear/h100hbm_numgemm_8_fp8_sub_channel \
--force-overwrite true \
--trace=cuda,nvtx,cudnn,cublas \
python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe fp8_sub_channel
# Profile MXFP8 recipe with Nsight Systems
nsys profile \
--output=./benchmarks/linear/b200_mkn_4096_4096_4096_numgemm_8_mxfp8 \
--output=./benchmarks/linear/b200_numgemm_8_mxfp8 \
--force-overwrite true \
--trace=cuda,nvtx,cudnn,cublas \
python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe mxfp8
# Profile NVFP4 recipe with Nsight Systems
nsys profile \
--output=./benchmarks/linear/b200_numgemm_8_nvfp4 \
--force-overwrite true \
--trace=cuda,nvtx,cudnn,cublas \
python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe nvfp4
"""
RECIPES = {
"bf16": None,
"fp8_sub_channel": Float8BlockScaling(),
"mxfp8": MXFP8BlockScaling(),
"nvfp4": NVFP4BlockScaling(),
}
mxfp8_available, reason_for_no_mxfp8 = FP8GlobalStateManager.is_mxfp8_available()
fp8_block_scaling_available, reason_for_no_fp8_block_scaling = (
FP8GlobalStateManager.is_fp8_block_scaling_available()
)
nvfp4_available, reason_for_no_nvfp4 = FP8GlobalStateManager.is_nvfp4_available()
def run_linear_multiple_steps(layer, x, m_splits, mode, gradient, run_num_steps=1, recipe=None):
assert mode in ["fwd_only", "fwd_bwd"]
fp8_context = autocast(enabled=True, fp8_recipe=recipe) if recipe is not None else nullcontext()
# print(f"fp8_context: {fp8_context} and is it nullcontext? {isinstance(fp8_context, nullcontext)}")
quantization_context = (
autocast(enabled=True, recipe=recipe) if recipe is not None else nullcontext()
)
if mode == "fwd_only":
with torch.no_grad(), fp8_context:
with torch.no_grad(), quantization_context:
for i in range(run_num_steps):
y_q = layer.forward(
x,
......@@ -67,7 +81,7 @@ def run_linear_multiple_steps(layer, x, m_splits, mode, gradient, run_num_steps=
layer.zero_grad()
x.grad = None
with fp8_context:
with quantization_context:
for i in range(run_num_steps):
label = f"step_{i}"
torch.cuda.nvtx.range_push(label)
......@@ -142,7 +156,7 @@ def benchmark_linear(
"recipe": recipe,
},
num_threads=1,
).blocked_autorange(min_run_time=5)
).blocked_autorange(min_run_time=10)
print(f"{recipe_name}: {timing} \n")
timing_ms = timing.median * 1000 / num_microbatches
......@@ -225,30 +239,44 @@ if __name__ == "__main__":
use_bias = False
# Set the MKN values to benchmark
# Deepseek V3 EP64, SEQ_LEN=8192, topK8
# 256 expert => 4 local experts
# Avg M per expert: AvgM = SEQ_LEN * topK / localExperts = 16384
# M = AvgM * localExperts = 65536
# K = 7168
# N = 2048
# Deepseek V3 EP32, SEQ_LEN=8192, topK8
# 256 expert => 8 local experts
# Avg M per expert: AvgM = SEQ_LEN * topK / localExperts = 8192
# M = AvgM * localExperts = 65536
# K = 7168
# N = 2048
# 4 or 8 local experts per rank
num_gemms_list = [4, 8]
# MKN for group linear
mkns = []
for m in [8192]:
# for m in [4096, 8192, 16384]:
# for n in [1024, 2048, 4096, 8192, 16384]:
for n in [8192]:
for k in [4096]:
for m in [65536]:
for k in [7168]:
for n in [2048]:
mkns.append((m, k, n))
# default recipes to run if not specified
recipe_list = ["bf16"]
if args.recipe == "all":
recipe_list = ["bf16", "fp8_sub_channel", "mxfp8"]
recipe_list = ["bf16", "fp8_sub_channel", "mxfp8", "nvfp4"]
else:
recipe_list = [args.recipe]
num_gemms_list = [8]
if args.profile:
mkns = [(4096 * 8, 4096, 4096)]
mkns = [(8192 * 8, 7168, 2048)]
# in profile mode, only run one recipe specified in args.recipe
assert args.recipe != "all", (
"In profile mode, only one recipe can be specified, please specify the recipe as"
" fp8_sub_channel, mxfp8, or bf16"
" fp8_sub_channel, mxfp8, nvfp4, or bf16"
)
recipe_list = [args.recipe]
num_gemms_list = [8]
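As a quick standalone check (an illustrative sketch, not part of the benchmark script), the default M/K/N values above follow directly from the DeepSeek V3 sizing comments:

    seq_len, topk, total_experts = 8192, 8, 256
    for ep in (64, 32):
        local_experts = total_experts // ep      # 4 local experts for EP64, 8 for EP32
        avg_m = seq_len * topk // local_experts  # 16384 or 8192 tokens per local expert
        assert avg_m * local_experts == 65536    # M = 65536 in both cases; K = 7168, N = 2048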
......@@ -265,13 +293,17 @@ if __name__ == "__main__":
"bf16",
"fp8_sub_channel",
"mxfp8",
], "Recipe must be one of bf16, fp8_sub_channel, or mxfp8"
"nvfp4",
], "Recipe must be one of bf16, fp8_sub_channel, mxfp8, or nvfp4"
if recipe_name == "mxfp8" and not mxfp8_available:
print(f"MXFP8 is not available, skipping {recipe_name}")
continue
if recipe_name == "fp8_sub_channel" and not fp8_block_scaling_available:
print(f"FP8 block scaling is not available, skipping {recipe_name}")
continue
if recipe_name == "nvfp4" and not nvfp4_available:
print(f"NVFP4 is not available, skipping {recipe_name}")
continue
df = run_benchmark_linear(
mkns,
......
......@@ -295,11 +295,9 @@ def cuda_archs() -> str:
if archs is None:
version = cuda_version()
if version >= (13, 0):
archs = "75;80;89;90;100;100a;103a;120"
elif version >= (12, 9):
archs = "70;80;89;90;100;100a;103a;120"
archs = "75;80;89;90;100;120"
elif version >= (12, 8):
archs = "70;80;89;90;100;100a;120"
archs = "70;80;89;90;100;120"
else:
archs = "70;80;89;90"
return archs
......
......@@ -7,23 +7,34 @@ FROM quay.io/pypa/manylinux_2_28_aarch64
WORKDIR /TransformerEngine/
COPY ../.. /TransformerEngine/
ARG VER="12-3"
ARG ARCH="aarch64"
RUN dnf -y install vim
ARG CUDA_MAJOR="12"
ARG CUDA_MINOR="3"
# Args for build_wheels.sh
ARG BUILD_METAPACKAGE=true
ARG BUILD_COMMON=true
ARG BUILD_PYTORCH=true
ARG BUILD_JAX=true
ENV BUILD_METAPACKAGE=${BUILD_METAPACKAGE}
ENV BUILD_COMMON=${BUILD_COMMON}
ENV BUILD_PYTORCH=${BUILD_PYTORCH}
ENV BUILD_JAX=${BUILD_JAX}
ENV CUDA_MAJOR=${CUDA_MAJOR}
# Cuda toolkit, cudnn, driver.
RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
RUN dnf -y install epel-release
RUN dnf -y install cuda-compiler-${VER}.${ARCH} \
cuda-libraries-${VER}.${ARCH} \
cuda-libraries-devel-${VER}.${ARCH}
RUN dnf -y install --allowerasing cudnn9-cuda-12
RUN dnf -y install cuda-compiler-${CUDA_MAJOR}-${CUDA_MINOR}.aarch64 \
cuda-libraries-${CUDA_MAJOR}-${CUDA_MINOR}.aarch64 \
cuda-libraries-devel-${CUDA_MAJOR}-${CUDA_MINOR}.aarch64
RUN dnf -y install --allowerasing cudnn9-cuda-${CUDA_MAJOR}
RUN dnf clean all
RUN rm -rf /var/cache/dnf/*
RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/999_nvidia_cuda.conf
RUN dnf -y install cuda-toolkit
RUN dnf -y install cuda-toolkit-${CUDA_MAJOR}
RUN dnf clean all
RUN dnf -y install glog.aarch64 glog-devel.aarch64
RUN dnf -y install libnccl libnccl-devel libnccl-static
ENV PATH="/usr/local/cuda/bin:${PATH}"
ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
......@@ -33,4 +44,4 @@ ENV CUDA_PATH=/usr/local/cuda
ENV CUDADIR=/usr/local/cuda
ENV NVTE_RELEASE_BUILD=1
CMD ["/bin/bash", "/TransformerEngine/build_tools/wheel_utils/build_wheels.sh", "manylinux_2_28_aarch64", "true", "true", "false", "false", "false"]
CMD ["/bin/bash", "-c", "bash /TransformerEngine/build_tools/wheel_utils/build_wheels.sh manylinux_2_28_aarch64 $BUILD_METAPACKAGE $BUILD_COMMON $BUILD_PYTORCH $BUILD_JAX $CUDA_MAJOR"]
......@@ -7,23 +7,34 @@ FROM quay.io/pypa/manylinux_2_28_x86_64
WORKDIR /TransformerEngine/
COPY ../.. /TransformerEngine/
ARG VER="12-3"
ARG ARCH="x86_64"
RUN dnf -y install vim
ARG CUDA_MAJOR="12"
ARG CUDA_MINOR="3"
# Args for build_wheels.sh
ARG BUILD_METAPACKAGE=true
ARG BUILD_COMMON=true
ARG BUILD_PYTORCH=true
ARG BUILD_JAX=true
ENV BUILD_METAPACKAGE=${BUILD_METAPACKAGE}
ENV BUILD_COMMON=${BUILD_COMMON}
ENV BUILD_PYTORCH=${BUILD_PYTORCH}
ENV BUILD_JAX=${BUILD_JAX}
ENV CUDA_MAJOR=${CUDA_MAJOR}
# Cuda toolkit, cudnn, driver.
RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
RUN dnf -y install epel-release
RUN dnf -y install cuda-compiler-${VER}.${ARCH} \
cuda-libraries-${VER}.${ARCH} \
cuda-libraries-devel-${VER}.${ARCH}
RUN dnf -y install --allowerasing cudnn9-cuda-12
RUN dnf -y install cuda-compiler-${CUDA_MAJOR}-${CUDA_MINOR}.x86_64 \
cuda-libraries-${CUDA_MAJOR}-${CUDA_MINOR}.x86_64 \
cuda-libraries-devel-${CUDA_MAJOR}-${CUDA_MINOR}.x86_64
RUN dnf -y install --allowerasing cudnn9-cuda-${CUDA_MAJOR}
RUN dnf clean all
RUN rm -rf /var/cache/dnf/*
RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/999_nvidia_cuda.conf
RUN dnf -y install cuda-toolkit
RUN dnf -y install cuda-toolkit-${CUDA_MAJOR}
RUN dnf clean all
RUN dnf -y install glog.x86_64 glog-devel.x86_64
RUN dnf -y install libnccl libnccl-devel libnccl-static
ENV PATH="/usr/local/cuda/bin:${PATH}"
ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
......@@ -33,4 +44,4 @@ ENV CUDA_PATH=/usr/local/cuda
ENV CUDADIR=/usr/local/cuda
ENV NVTE_RELEASE_BUILD=1
CMD ["/bin/bash", "/TransformerEngine/build_tools/wheel_utils/build_wheels.sh", "manylinux_2_28_x86_64", "true", "true", "true", "true", "true"]
CMD ["/bin/bash", "-c", "bash /TransformerEngine/build_tools/wheel_utils/build_wheels.sh manylinux_2_28_x86_64 $BUILD_METAPACKAGE $BUILD_COMMON $BUILD_PYTORCH $BUILD_JAX $CUDA_MAJOR"]
\ No newline at end of file
......@@ -9,8 +9,10 @@ BUILD_METAPACKAGE=${2:-true}
BUILD_COMMON=${3:-true}
BUILD_PYTORCH=${4:-true}
BUILD_JAX=${5:-true}
CUDA_MAJOR=${6:-12}
export NVTE_RELEASE_BUILD=1
export PIP_CONSTRAINT=""
export TARGET_BRANCH=${TARGET_BRANCH:-}
mkdir -p /wheelhouse/logs
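For reference, a hedged sketch of invoking the script directly with the six positional arguments above (platform, metapackage, common, PyTorch, and JAX builds, then CUDA major version); the concrete values are only an example:

    # Build only the common wheel for CUDA 13 on x86_64 (example values).
    bash build_tools/wheel_utils/build_wheels.sh manylinux_2_28_x86_64 false true false false 13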
......@@ -21,7 +23,7 @@ git checkout $TARGET_BRANCH
git submodule update --init --recursive
# Install deps
/opt/python/cp310-cp310/bin/pip install cmake pybind11[global] ninja
/opt/python/cp310-cp310/bin/pip install cmake pybind11[global] ninja setuptools wheel
if $BUILD_METAPACKAGE ; then
cd /TransformerEngine
......@@ -36,18 +38,18 @@ if $BUILD_COMMON ; then
# Create the wheel.
/opt/python/cp310-cp310/bin/python setup.py bdist_wheel --verbose --python-tag=py3 --plat-name=$PLATFORM 2>&1 | tee /wheelhouse/logs/common.txt
# Repack the wheel for cuda specific package, i.e. cu12.
# Repack the wheel for the specific CUDA version.
/opt/python/cp310-cp310/bin/wheel unpack dist/*
# From python 3.10 to 3.11, the package name delimiter in metadata got changed from - (hyphen) to _ (underscore).
sed -i "s/Name: transformer-engine/Name: transformer-engine-cu12/g" "transformer_engine-${VERSION}/transformer_engine-${VERSION}.dist-info/METADATA"
sed -i "s/Name: transformer_engine/Name: transformer_engine_cu12/g" "transformer_engine-${VERSION}/transformer_engine-${VERSION}.dist-info/METADATA"
mv "${WHL_BASE}/${WHL_BASE}.dist-info" "${WHL_BASE}/transformer_engine_cu12-${VERSION}.dist-info"
sed -i "s/Name: transformer-engine/Name: transformer-engine-cu${CUDA_MAJOR}/g" "transformer_engine-${VERSION}/transformer_engine-${VERSION}.dist-info/METADATA"
sed -i "s/Name: transformer_engine/Name: transformer_engine_cu${CUDA_MAJOR}/g" "transformer_engine-${VERSION}/transformer_engine-${VERSION}.dist-info/METADATA"
mv "${WHL_BASE}/${WHL_BASE}.dist-info" "${WHL_BASE}/transformer_engine_cu${CUDA_MAJOR}-${VERSION}.dist-info"
/opt/python/cp310-cp310/bin/wheel pack ${WHL_BASE}
# Rename the wheel to make it python version agnostic.
whl_name=$(basename dist/*)
IFS='-' read -ra whl_parts <<< "$whl_name"
whl_name_target="${whl_parts[0]}_cu12-${whl_parts[1]}-py3-none-${whl_parts[4]}"
whl_name_target="${whl_parts[0]}_cu${CUDA_MAJOR}-${whl_parts[1]}-py3-none-${whl_parts[4]}"
rm -rf $WHL_BASE dist
mv *.whl /wheelhouse/"$whl_name_target"
fi
......@@ -61,7 +63,7 @@ fi
if $BUILD_JAX ; then
cd /TransformerEngine/transformer_engine/jax
/opt/python/cp310-cp310/bin/pip install "jax[cuda12_local]" jaxlib
/opt/python/cp310-cp310/bin/pip install "jax[cuda${CUDA_MAJOR}_local]" jaxlib
/opt/python/cp310-cp310/bin/python setup.py sdist 2>&1 | tee /wheelhouse/logs/jax.txt
cp dist/* /wheelhouse/
fi
......@@ -2,7 +2,29 @@
#
# See LICENSE for license information.
docker build --no-cache -t "aarch_wheel" -f build_tools/wheel_utils/Dockerfile.aarch .
# Remove leftovers.
rm -rf aarch_wheelhouse_cu12 aarch_wheelhouse_cu13
# CUDA 12.
docker build --no-cache \
--build-arg CUDA_MAJOR=12 \
--build-arg CUDA_MINOR=3 \
--build-arg BUILD_METAPACKAGE=false \
--build-arg BUILD_COMMON=true \
--build-arg BUILD_PYTORCH=false \
--build-arg BUILD_JAX=false \
-t "aarch_wheel" -f build_tools/wheel_utils/Dockerfile.aarch .
docker run --runtime=nvidia --gpus=all --ipc=host "aarch_wheel"
docker cp $(docker ps -aq | head -1):/wheelhouse aarch_wheelhouse_cu12
# CUDA 13.
docker build --no-cache \
--build-arg CUDA_MAJOR=13 \
--build-arg CUDA_MINOR=0 \
--build-arg BUILD_METAPACKAGE=false \
--build-arg BUILD_COMMON=true \
--build-arg BUILD_PYTORCH=false \
--build-arg BUILD_JAX=false \
-t "aarch_wheel" -f build_tools/wheel_utils/Dockerfile.aarch .
docker run --runtime=nvidia --gpus=all --ipc=host "aarch_wheel"
rm -rf aarch_wheelhouse
docker cp $(docker ps -aq | head -1):/wheelhouse/ aarch_wheelhouse
docker cp $(docker ps -aq | head -1):/wheelhouse aarch_wheelhouse_cu13
......@@ -2,7 +2,29 @@
#
# See LICENSE for license information.
docker build --no-cache -t "x86_wheel" -f build_tools/wheel_utils/Dockerfile.x86 .
# Remove leftovers.
rm -rf x86_wheelhouse_cu12 x86_wheelhouse_cu13
# CUDA 12.
docker build --no-cache \
--build-arg CUDA_MAJOR=12 \
--build-arg CUDA_MINOR=3 \
--build-arg BUILD_METAPACKAGE=true \
--build-arg BUILD_COMMON=true \
--build-arg BUILD_PYTORCH=true \
--build-arg BUILD_JAX=true \
-t "x86_wheel" -f build_tools/wheel_utils/Dockerfile.x86 .
docker run --runtime=nvidia --gpus=all --ipc=host "x86_wheel"
docker cp $(docker ps -aq | head -1):/wheelhouse x86_wheelhouse_cu12
# CUDA 13.
docker build --no-cache \
--build-arg CUDA_MAJOR=13 \
--build-arg CUDA_MINOR=0 \
--build-arg BUILD_METAPACKAGE=false \
--build-arg BUILD_COMMON=true \
--build-arg BUILD_PYTORCH=false \
--build-arg BUILD_JAX=false \
-t "x86_wheel" -f build_tools/wheel_utils/Dockerfile.x86 .
docker run --runtime=nvidia --gpus=all --ipc=host "x86_wheel"
rm -rf x86_wheelhouse
docker cp $(docker ps -aq | head -1):/wheelhouse x86_wheelhouse
docker cp $(docker ps -aq | head -1):/wheelhouse x86_wheelhouse_cu13
......@@ -38,6 +38,14 @@ Transformer Engine can be directly installed from `our PyPI <https://pypi.org/pr
To obtain the necessary Python bindings for Transformer Engine, the frameworks needed must be explicitly specified as extra dependencies in a comma-separated list (e.g. [jax,pytorch]). Transformer Engine ships wheels for the core library. Source distributions are shipped for the JAX and PyTorch extensions.
The core package from Transformer Engine (without any framework extensions) can be installed via:
.. code-block:: bash
pip3 install transformer_engine[core]
By default, this installs the core library compiled for CUDA 12. The CUDA major version can be selected by changing the extra dependency to `core_cu12` or `core_cu13`.
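For example, to target CUDA 13 instead (assuming a `core_cu13` wheel is published for your platform):
.. code-block:: bash
pip3 install transformer_engine[core_cu13]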
pip - from GitHub
-----------------------
......
......@@ -3,6 +3,9 @@
# See LICENSE for license information.
"""Shared functions for the encoder tests"""
from functools import lru_cache
import os
import pathlib
import zipfile
import jax
import jax.numpy
......@@ -118,3 +121,50 @@ def get_quantization_recipe_from_name_string(name: str):
return recipe.NVFP4BlockScaling()
case _:
raise ValueError(f"Invalid quantization_recipe, got {name}")
@lru_cache(maxsize=None)
def _get_example_artifacts_dir() -> pathlib.Path:
"""Path to directory with pre-downloaded datasets"""
# Check environment variable
path = os.getenv("NVTE_TEST_CHECKPOINT_ARTIFACT_PATH")
if path:
return pathlib.Path(path).resolve()
# Fallback to path in root dir
root_dir = pathlib.Path(__file__).resolve().parent.parent.parent
return root_dir / "artifacts" / "examples" / "jax"
def _unpack_cached_dataset(artifacts_dir: pathlib.Path, folder_name: str) -> None:
"""Unpack a cached dataset if available"""
dataset_dir = artifacts_dir / folder_name
if not dataset_dir.exists():
print(f"Cached dataset {folder_name} not found at {dataset_dir}, skipping unpack")
return
# Disable any HF network calls since the dataset is cached locally
os.environ["HF_HUB_OFFLINE"] = "1"
for filename in os.listdir(dataset_dir):
filepath = dataset_dir / filename
if not filename.endswith(".zip"):
continue
print(f"Unpacking cached dataset {folder_name} from {filepath}")
with zipfile.ZipFile(filepath, "r") as zip_ref:
zip_ref.extractall(pathlib.Path.home() / ".cache" / "huggingface")
print(
f"Unpacked cached dataset {folder_name} to"
f" {pathlib.Path.home() / '.cache' / 'huggingface'}"
)
# This is cached so we don't have to unpack datasets multiple times
@lru_cache(maxsize=None)
def unpack_cached_datasets_if_available() -> None:
"""Unpack cached datasets if available"""
artifacts_dir = _get_example_artifacts_dir()
_unpack_cached_dataset(artifacts_dir, "mnist")
_unpack_cached_dataset(artifacts_dir, "encoder")
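A minimal usage sketch for these helpers, assuming the zipped Hugging Face caches are staged under a local directory with `mnist/` and `encoder/` subfolders (the path below is hypothetical):

    import os
    # Hypothetical artifact location; must contain mnist/*.zip and encoder/*.zip
    os.environ["NVTE_TEST_CHECKPOINT_ARTIFACT_PATH"] = "/data/te_artifacts/examples/jax"
    from common import unpack_cached_datasets_if_available
    unpack_cached_datasets_if_available()  # extracts the archives into ~/.cache/huggingface once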
......@@ -23,12 +23,14 @@ from common import (
is_bf16_supported,
get_quantization_recipe_from_name_string,
assert_params_sufficiently_sharded,
unpack_cached_datasets_if_available,
)
import transformer_engine.jax as te
import transformer_engine.jax.cpp_extensions as tex
import transformer_engine.jax.flax as te_flax
from transformer_engine.jax.quantize import is_scaling_mode_supported, ScalingMode
unpack_cached_datasets_if_available()
DEVICE_DP_AXIS = "data"
DEVICE_TP_AXIS = "model"
......
......@@ -19,12 +19,17 @@ from flax.training import train_state
from jax.experimental import mesh_utils
from jax.sharding import PartitionSpec, NamedSharding
from common import is_bf16_supported, get_quantization_recipe_from_name_string
from common import (
is_bf16_supported,
get_quantization_recipe_from_name_string,
unpack_cached_datasets_if_available,
)
import transformer_engine.jax as te
import transformer_engine.jax.cpp_extensions as tex
import transformer_engine.jax.flax as te_flax
from transformer_engine.jax.quantize import is_scaling_mode_supported, ScalingMode
unpack_cached_datasets_if_available()
DEVICE_DP_AXIS = "data"
PARAMS_KEY = "params"
......
......@@ -27,11 +27,13 @@ from common import (
is_mxfp8_supported,
is_nvfp4_supported,
get_quantization_recipe_from_name_string,
unpack_cached_datasets_if_available,
)
import transformer_engine.jax as te
import transformer_engine.jax.cpp_extensions as tex
import transformer_engine.jax.flax as te_flax
unpack_cached_datasets_if_available()
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
DEVICE_DP_AXIS = "data"
......@@ -670,7 +672,7 @@ class TestEncoder(unittest.TestCase):
def test_te_nvfp4(self):
"""Test Transformer Engine with NVFP4"""
result = self.exec(True, "NVFP4BlockScaling")
assert result[0] < 0.451 and result[1] > 0.79
assert result[0] < 0.451 and result[1] > 0.787
@unittest.skipIf(not is_bf16_supported(), "Device compute capability 8.0+ is required for BF16")
def test_te_bf16_shardy(self):
......@@ -708,7 +710,7 @@ class TestEncoder(unittest.TestCase):
def test_te_nvfp4_shardy(self):
"""Test Transformer Engine with NVFP4"""
result = self.exec(True, "NVFP4BlockScaling", enable_shardy=True)
assert result[0] < 0.451 and result[1] > 0.79
assert result[0] < 0.451 and result[1] > 0.787
if __name__ == "__main__":
......
......@@ -16,11 +16,16 @@ from datasets import load_dataset
from flax import linen as nn
from flax.training import train_state
from common import is_bf16_supported, get_quantization_recipe_from_name_string
from common import (
is_bf16_supported,
get_quantization_recipe_from_name_string,
unpack_cached_datasets_if_available,
)
import transformer_engine.jax as te
import transformer_engine.jax.flax as te_flax
from transformer_engine.jax.quantize import is_scaling_mode_supported, ScalingMode
unpack_cached_datasets_if_available()
PARAMS_KEY = "params"
DROPOUT_KEY = "dropout"
......@@ -385,7 +390,7 @@ class TestEncoder(unittest.TestCase):
self.args.use_fp8 = True
self.args.fp8_recipe = "NVFP4BlockScaling"
actual = train_and_evaluate(self.args)
assert actual[0] < 0.476 and actual[1] > 0.775
assert actual[0] < 0.477 and actual[1] > 0.769
if __name__ == "__main__":
......
......@@ -22,7 +22,13 @@ from transformer_engine.jax.quantize import is_scaling_mode_supported, ScalingMo
DIR = str(Path(__file__).resolve().parents[1])
sys.path.append(str(DIR))
from encoder.common import is_bf16_supported, get_quantization_recipe_from_name_string
from encoder.common import (
is_bf16_supported,
get_quantization_recipe_from_name_string,
unpack_cached_datasets_if_available,
)
unpack_cached_datasets_if_available()
IMAGE_H = 28
IMAGE_W = 28
......
......@@ -3,8 +3,7 @@
# See LICENSE for license information.
[build-system]
requires = ["setuptools>=61.0", "cmake>=3.21", "wheel", "pybind11[global]", "ninja", "nvidia-mathdx==25.1.1", "pip", "torch>=2.1", "jax>=0.5.0", "flax>=0.7.1"]
requires = ["setuptools>=61.0", "cmake>=3.21", "wheel", "pybind11[global]", "ninja", "pip", "torch>=2.1", "jax>=0.5.0", "flax>=0.7.1"]
# Use legacy backend to import local packages in setup.py
build-backend = "setuptools.build_meta:__legacy__"
......@@ -2,7 +2,19 @@
#
# See LICENSE for license information.
function error_exit() {
echo "Error: $1"
exit 1
}
function test_fail() {
RET=1
FAILED_CASES="$FAILED_CASES $1"
echo "Error: sub-test failed: $1"
}
RET=0
FAILED_CASES=""
: ${TE_PATH:=/opt/transformerengine}
: ${NVTE_TEST_NVINSPECT_FEATURE_DIRS:=$TE_PATH/transformer_engine/debug/features}
......@@ -14,24 +26,27 @@ mkdir -p "$XML_LOG_DIR"
# Nvinspect will be disabled if no feature is active.
: ${NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE:=$TE_PATH/tests/pytorch/debug/test_configs/dummy_feature.yaml}
FAIL=0
# It is not installed as a requirement,
# because it is not available on PyPI.
pip uninstall -y nvdlfw-inspect
pip install git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git
pip install pytest==8.2.1
pytest -v -s --junitxml=$XML_LOG_DIR/test_sanity.xml $TE_PATH/tests/pytorch/debug/test_sanity.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
pytest -v -s --junitxml=$XML_LOG_DIR/test_config.xml $TE_PATH/tests/pytorch/debug/test_config.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
pytest -v -s --junitxml=$XML_LOG_DIR/test_numerics.xml $TE_PATH/tests/pytorch/debug/test_numerics.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
pytest -v -s --junitxml=$XML_LOG_DIR/test_log.xml $TE_PATH/tests/pytorch/debug/test_log.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
NVTE_TORCH_COMPILE=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_api_features.xml $TE_PATH/tests/pytorch/debug/test_api_features.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
pytest -v -s --junitxml=$XML_LOG_DIR/test_perf.xml $TE_PATH/tests/pytorch/debug/test_perf.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
pip install pytest==8.2.1 || error_exit "Failed to install pytest"
pytest -v -s --junitxml=$XML_LOG_DIR/test_sanity.xml $TE_PATH/tests/pytorch/debug/test_sanity.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || test_fail "test_sanity.py"
pytest -v -s --junitxml=$XML_LOG_DIR/test_config.xml $TE_PATH/tests/pytorch/debug/test_config.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || test_fail "test_config.py"
pytest -v -s --junitxml=$XML_LOG_DIR/test_numerics.xml $TE_PATH/tests/pytorch/debug/test_numerics.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || test_fail "test_numerics.py"
pytest -v -s --junitxml=$XML_LOG_DIR/test_log.xml $TE_PATH/tests/pytorch/debug/test_log.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || test_fail "test_log.py"
NVTE_TORCH_COMPILE=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_api_features.xml $TE_PATH/tests/pytorch/debug/test_api_features.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || test_fail "test_api_features.py"
pytest -v -s --junitxml=$XML_LOG_DIR/test_perf.xml $TE_PATH/tests/pytorch/debug/test_perf.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || test_fail "test_perf.py"
# standard sanity and numerics tests with initialized debug
NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_sanity_2.xml $TE_PATH/tests/pytorch/test_sanity.py || FAIL=1
NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_numerics_2.xml $TE_PATH/tests/pytorch/test_numerics.py || FAIL=1
exit $FAIL
NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_sanity_2.xml $TE_PATH/tests/pytorch/test_sanity.py || test_fail "debug test_sanity.py"
NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_numerics_2.xml $TE_PATH/tests/pytorch/test_numerics.py || test_fail "debug test_numerics.py"
if [ "$RET" -ne 0 ]; then
echo "Error in the following test cases:$FAILED_CASES"
exit 1
fi
echo "All tests passed"
exit 0