Commit c1a1c04e authored by wenjh

Merge nv_main(2.10) to main


Signed-off-by: wenjh <wenjh@sugon.com>
parents e698a0a7 66aed3ae
......@@ -19,7 +19,7 @@ jobs:
run: |
apt-get update
apt-get install -y git python3.9 pip cudnn9-cuda-12
pip install cmake==3.21.0 pybind11[global] ninja nvidia-mathdx==25.1.1
pip install cmake==3.21.0 pybind11[global] ninja
- name: 'Checkout'
uses: actions/checkout@v3
with:
......@@ -43,7 +43,7 @@ jobs:
run: |
apt-get update
apt-get install -y git python3.9 pip cudnn9-cuda-12
pip install cmake torch ninja pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops onnxscript nvidia-mathdx==25.1.1
pip install cmake torch ninja pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops onnxscript
- name: 'Checkout'
uses: actions/checkout@v3
with:
......@@ -63,7 +63,7 @@ jobs:
options: --user root
steps:
- name: 'Dependencies'
run: pip install pybind11[global] nvidia-mathdx==25.1.1
run: pip install pybind11[global]
- name: 'Checkout'
uses: actions/checkout@v3
with:
......@@ -83,7 +83,9 @@ jobs:
options: --user root
steps:
- name: 'Dependencies'
run: pip install torch pybind11[global] einops onnxscript nvidia-mathdx==25.1.1
run: |
pip install pybind11[global] einops onnxscript
pip install torch --index-url https://download.pytorch.org/whl/cu130
- name: 'Checkout'
uses: actions/checkout@v3
with:
......
recursive-include transformer_engine/common/include *.*
......@@ -205,7 +205,7 @@ pip Installation
**Prerequisites for pip installation:**
* A compatible C++ compiler
* CUDA Toolkit with cuDNN and NVCC (NVIDIA CUDA Compiler) installed
* CUDA Toolkit with cuDNN and NVCC (NVIDIA CUDA Compiler) if installing from source.
To install the latest stable version with pip:
......
......@@ -8,53 +8,67 @@ import torch.utils.benchmark as benchmark
import pandas as pd
from transformer_engine.pytorch.module import GroupedLinear
from transformer_engine.common.recipe import Float8BlockScaling, MXFP8BlockScaling
from transformer_engine.common.recipe import (
Float8BlockScaling,
MXFP8BlockScaling,
NVFP4BlockScaling,
)
from transformer_engine.pytorch.quantization import autocast, FP8GlobalStateManager
from contextlib import nullcontext
"""
# Profile BF16 recipe with Nsight Systems
nsys profile \
--output=./benchmarks/linear/b200_mkn_4096_4096_4096_numgemm_8_bf16 \
--output=./benchmarks/linear/b200_numgemm_8_bf16 \
--force-overwrite true \
--trace=cuda,nvtx,cudnn,cublas \
python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe bf16
# Profile FP8 sub-channel recipe with Nsight Systems
nsys profile \
--output=./benchmarks/linear/h100hbm_mkn_4096_4096_4096_numgemm_8_fp8_sub_channel \
--output=./benchmarks/linear/h100hbm_numgemm_8_fp8_sub_channel \
--force-overwrite true \
--trace=cuda,nvtx,cudnn,cublas \
python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe fp8_sub_channel
# Profile MXFP8 recipe with Nsight Systems
nsys profile \
--output=./benchmarks/linear/b200_mkn_4096_4096_4096_numgemm_8_mxfp8 \
--output=./benchmarks/linear/b200_numgemm_8_mxfp8 \
--force-overwrite true \
--trace=cuda,nvtx,cudnn,cublas \
python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe mxfp8
# Profile NVFP4 recipe with Nsight Systems
nsys profile \
--output=./benchmarks/linear/b200_numgemm_8_nvfp4 \
--force-overwrite true \
--trace=cuda,nvtx,cudnn,cublas \
python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe nvfp4
"""
RECIPES = {
"bf16": None,
"fp8_sub_channel": Float8BlockScaling(),
"mxfp8": MXFP8BlockScaling(),
"nvfp4": NVFP4BlockScaling(),
}
mxfp8_available, reason_for_no_mxfp8 = FP8GlobalStateManager.is_mxfp8_available()
fp8_block_scaling_available, reason_for_no_fp8_block_scaling = (
FP8GlobalStateManager.is_fp8_block_scaling_available()
)
nvfp4_available, reason_for_no_nvfp4 = FP8GlobalStateManager.is_nvfp4_available()
def run_linear_multiple_steps(layer, x, m_splits, mode, gradient, run_num_steps=1, recipe=None):
assert mode in ["fwd_only", "fwd_bwd"]
fp8_context = autocast(enabled=True, fp8_recipe=recipe) if recipe is not None else nullcontext()
# print(f"fp8_context: {fp8_context} and is it nullcontext? {isinstance(fp8_context, nullcontext)}")
quantization_context = (
autocast(enabled=True, recipe=recipe) if recipe is not None else nullcontext()
)
if mode == "fwd_only":
with torch.no_grad(), fp8_context:
with torch.no_grad(), quantization_context:
for i in range(run_num_steps):
y_q = layer.forward(
x,
......@@ -67,7 +81,7 @@ def run_linear_multiple_steps(layer, x, m_splits, mode, gradient, run_num_steps=
layer.zero_grad()
x.grad = None
with fp8_context:
with quantization_context:
for i in range(run_num_steps):
label = f"step_{i}"
torch.cuda.nvtx.range_push(label)
......@@ -142,7 +156,7 @@ def benchmark_linear(
"recipe": recipe,
},
num_threads=1,
).blocked_autorange(min_run_time=5)
).blocked_autorange(min_run_time=10)
print(f"{recipe_name}: {timing} \n")
timing_ms = timing.median * 1000 / num_microbatches
......@@ -225,30 +239,44 @@ if __name__ == "__main__":
use_bias = False
# Set the MKN values to benchmark
# Deepseek V3 EP64, SEQ_LEN=8192, topK8
# 256 expert => 4 local experts
# Avg M per expert: AvgM = SEQ_LEN * topK / localExperts = 16384
# M = AvgM * localExperts = 65536
# K = 7168
# N = 2048
# Deepseek V3 EP32, SEQ_LEN=8192, topK8
# 256 expert => 8 local experts
# Avg M per expert: AvgM = SEQ_LEN * topK / localExperts = 8192
# M = AvgM * localExperts = 65536
# K = 7168
# N = 2048
# 4 or 8 local experts per rank
num_gemms_list = [4, 8]
# MKN for group linear
mkns = []
for m in [8192]:
# for m in [4096, 8192, 16384]:
# for n in [1024, 2048, 4096, 8192, 16384]:
for n in [8192]:
for k in [4096]:
for m in [65536]:
for k in [7168]:
for n in [2048]:
mkns.append((m, k, n))
# default recipes to run if not specified
recipe_list = ["bf16"]
if args.recipe == "all":
recipe_list = ["bf16", "fp8_sub_channel", "mxfp8"]
recipe_list = ["bf16", "fp8_sub_channel", "mxfp8", "nvfp4"]
else:
recipe_list = [args.recipe]
num_gemms_list = [8]
if args.profile:
mkns = [(4096 * 8, 4096, 4096)]
mkns = [(8192 * 8, 7168, 2048)]
# in profile mode, only run one recipe specified in args.recipe
assert args.recipe != "all", (
"In profile mode, only one recipe can be specified, please specify the recipe as"
" fp8_sub_channel, mxfp8, or bf16"
" fp8_sub_channel, mxfp8, nvfp4, or bf16"
)
recipe_list = [args.recipe]
num_gemms_list = [8]
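As a quick standalone check (an illustrative sketch, not part of the benchmark script), the default M/K/N values above follow directly from the DeepSeek V3 sizing comments:

    seq_len, topk, total_experts = 8192, 8, 256
    for ep in (64, 32):
        local_experts = total_experts // ep      # 4 local experts for EP64, 8 for EP32
        avg_m = seq_len * topk // local_experts  # 16384 or 8192 tokens per local expert
        assert avg_m * local_experts == 65536    # M = 65536 in both cases; K = 7168, N = 2048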
......@@ -265,13 +293,17 @@ if __name__ == "__main__":
"bf16",
"fp8_sub_channel",
"mxfp8",
], "Recipe must be one of bf16, fp8_sub_channel, or mxfp8"
"nvfp4",
], "Recipe must be one of bf16, fp8_sub_channel, mxfp8, or nvfp4"
if recipe_name == "mxfp8" and not mxfp8_available:
print(f"MXFP8 is not available, skipping {recipe_name}")
continue
if recipe_name == "fp8_sub_channel" and not fp8_block_scaling_available:
print(f"FP8 block scaling is not available, skipping {recipe_name}")
continue
if recipe_name == "nvfp4" and not nvfp4_available:
print(f"NVFP4 is not available, skipping {recipe_name}")
continue
df = run_benchmark_linear(
mkns,
......
......@@ -295,11 +295,9 @@ def cuda_archs() -> str:
if archs is None:
version = cuda_version()
if version >= (13, 0):
archs = "75;80;89;90;100;100a;103a;120"
elif version >= (12, 9):
archs = "70;80;89;90;100;100a;103a;120"
archs = "75;80;89;90;100;120"
elif version >= (12, 8):
archs = "70;80;89;90;100;100a;120"
archs = "70;80;89;90;100;120"
else:
archs = "70;80;89;90"
return archs
......
......@@ -7,23 +7,34 @@ FROM quay.io/pypa/manylinux_2_28_aarch64
WORKDIR /TransformerEngine/
COPY ../.. /TransformerEngine/
ARG VER="12-3"
ARG ARCH="aarch64"
RUN dnf -y install vim
ARG CUDA_MAJOR="12"
ARG CUDA_MINOR="3"
# Args for build_wheels.sh
ARG BUILD_METAPACKAGE=true
ARG BUILD_COMMON=true
ARG BUILD_PYTORCH=true
ARG BUILD_JAX=true
ENV BUILD_METAPACKAGE=${BUILD_METAPACKAGE}
ENV BUILD_COMMON=${BUILD_COMMON}
ENV BUILD_PYTORCH=${BUILD_PYTORCH}
ENV BUILD_JAX=${BUILD_JAX}
ENV CUDA_MAJOR=${CUDA_MAJOR}
# Cuda toolkit, cudnn, driver.
RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
RUN dnf -y install epel-release
RUN dnf -y install cuda-compiler-${VER}.${ARCH} \
cuda-libraries-${VER}.${ARCH} \
cuda-libraries-devel-${VER}.${ARCH}
RUN dnf -y install --allowerasing cudnn9-cuda-12
RUN dnf -y install cuda-compiler-${CUDA_MAJOR}-${CUDA_MINOR}.aarch64 \
cuda-libraries-${CUDA_MAJOR}-${CUDA_MINOR}.aarch64 \
cuda-libraries-devel-${CUDA_MAJOR}-${CUDA_MINOR}.aarch64
RUN dnf -y install --allowerasing cudnn9-cuda-${CUDA_MAJOR}
RUN dnf clean all
RUN rm -rf /var/cache/dnf/*
RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/999_nvidia_cuda.conf
RUN dnf -y install cuda-toolkit
RUN dnf -y install cuda-toolkit-${CUDA_MAJOR}
RUN dnf clean all
RUN dnf -y install glog.aarch64 glog-devel.aarch64
RUN dnf -y install libnccl libnccl-devel libnccl-static
ENV PATH="/usr/local/cuda/bin:${PATH}"
ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
......@@ -33,4 +44,4 @@ ENV CUDA_PATH=/usr/local/cuda
ENV CUDADIR=/usr/local/cuda
ENV NVTE_RELEASE_BUILD=1
CMD ["/bin/bash", "/TransformerEngine/build_tools/wheel_utils/build_wheels.sh", "manylinux_2_28_aarch64", "true", "true", "false", "false", "false"]
CMD ["/bin/bash", "-c", "bash /TransformerEngine/build_tools/wheel_utils/build_wheels.sh manylinux_2_28_aarch64 $BUILD_METAPACKAGE $BUILD_COMMON $BUILD_PYTORCH $BUILD_JAX $CUDA_MAJOR"]
......@@ -7,23 +7,34 @@ FROM quay.io/pypa/manylinux_2_28_x86_64
WORKDIR /TransformerEngine/
COPY ../.. /TransformerEngine/
ARG VER="12-3"
ARG ARCH="x86_64"
RUN dnf -y install vim
ARG CUDA_MAJOR="12"
ARG CUDA_MINOR="3"
# Args for build_wheels.sh
ARG BUILD_METAPACKAGE=true
ARG BUILD_COMMON=true
ARG BUILD_PYTORCH=true
ARG BUILD_JAX=true
ENV BUILD_METAPACKAGE=${BUILD_METAPACKAGE}
ENV BUILD_COMMON=${BUILD_COMMON}
ENV BUILD_PYTORCH=${BUILD_PYTORCH}
ENV BUILD_JAX=${BUILD_JAX}
ENV CUDA_MAJOR=${CUDA_MAJOR}
# Cuda toolkit, cudnn, driver.
RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
RUN dnf -y install epel-release
RUN dnf -y install cuda-compiler-${VER}.${ARCH} \
cuda-libraries-${VER}.${ARCH} \
cuda-libraries-devel-${VER}.${ARCH}
RUN dnf -y install --allowerasing cudnn9-cuda-12
RUN dnf -y install cuda-compiler-${CUDA_MAJOR}-${CUDA_MINOR}.x86_64 \
cuda-libraries-${CUDA_MAJOR}-${CUDA_MINOR}.x86_64 \
cuda-libraries-devel-${CUDA_MAJOR}-${CUDA_MINOR}.x86_64
RUN dnf -y install --allowerasing cudnn9-cuda-${CUDA_MAJOR}
RUN dnf clean all
RUN rm -rf /var/cache/dnf/*
RUN echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/999_nvidia_cuda.conf
RUN dnf -y install cuda-toolkit
RUN dnf -y install cuda-toolkit-${CUDA_MAJOR}
RUN dnf clean all
RUN dnf -y install glog.x86_64 glog-devel.x86_64
RUN dnf -y install libnccl libnccl-devel libnccl-static
ENV PATH="/usr/local/cuda/bin:${PATH}"
ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
......@@ -33,4 +44,4 @@ ENV CUDA_PATH=/usr/local/cuda
ENV CUDADIR=/usr/local/cuda
ENV NVTE_RELEASE_BUILD=1
CMD ["/bin/bash", "/TransformerEngine/build_tools/wheel_utils/build_wheels.sh", "manylinux_2_28_x86_64", "true", "true", "true", "true", "true"]
CMD ["/bin/bash", "-c", "bash /TransformerEngine/build_tools/wheel_utils/build_wheels.sh manylinux_2_28_x86_64 $BUILD_METAPACKAGE $BUILD_COMMON $BUILD_PYTORCH $BUILD_JAX $CUDA_MAJOR"]
\ No newline at end of file
......@@ -9,8 +9,10 @@ BUILD_METAPACKAGE=${2:-true}
BUILD_COMMON=${3:-true}
BUILD_PYTORCH=${4:-true}
BUILD_JAX=${5:-true}
CUDA_MAJOR=${6:-12}
export NVTE_RELEASE_BUILD=1
export PIP_CONSTRAINT=""
export TARGET_BRANCH=${TARGET_BRANCH:-}
mkdir -p /wheelhouse/logs
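For reference, a hedged sketch of invoking the script directly with the six positional arguments above (platform, metapackage, common, PyTorch, and JAX builds, then CUDA major version); the concrete values are only an example:

    # Build only the common wheel for CUDA 13 on x86_64 (example values).
    bash build_tools/wheel_utils/build_wheels.sh manylinux_2_28_x86_64 false true false false 13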
......@@ -21,7 +23,7 @@ git checkout $TARGET_BRANCH
git submodule update --init --recursive
# Install deps
/opt/python/cp310-cp310/bin/pip install cmake pybind11[global] ninja
/opt/python/cp310-cp310/bin/pip install cmake pybind11[global] ninja setuptools wheel
if $BUILD_METAPACKAGE ; then
cd /TransformerEngine
......@@ -36,18 +38,18 @@ if $BUILD_COMMON ; then
# Create the wheel.
/opt/python/cp310-cp310/bin/python setup.py bdist_wheel --verbose --python-tag=py3 --plat-name=$PLATFORM 2>&1 | tee /wheelhouse/logs/common.txt
# Repack the wheel for cuda specific package, i.e. cu12.
# Repack the wheel for the specific CUDA version.
/opt/python/cp310-cp310/bin/wheel unpack dist/*
# From python 3.10 to 3.11, the package name delimiter in metadata got changed from - (hyphen) to _ (underscore).
sed -i "s/Name: transformer-engine/Name: transformer-engine-cu12/g" "transformer_engine-${VERSION}/transformer_engine-${VERSION}.dist-info/METADATA"
sed -i "s/Name: transformer_engine/Name: transformer_engine_cu12/g" "transformer_engine-${VERSION}/transformer_engine-${VERSION}.dist-info/METADATA"
mv "${WHL_BASE}/${WHL_BASE}.dist-info" "${WHL_BASE}/transformer_engine_cu12-${VERSION}.dist-info"
sed -i "s/Name: transformer-engine/Name: transformer-engine-cu${CUDA_MAJOR}/g" "transformer_engine-${VERSION}/transformer_engine-${VERSION}.dist-info/METADATA"
sed -i "s/Name: transformer_engine/Name: transformer_engine_cu${CUDA_MAJOR}/g" "transformer_engine-${VERSION}/transformer_engine-${VERSION}.dist-info/METADATA"
mv "${WHL_BASE}/${WHL_BASE}.dist-info" "${WHL_BASE}/transformer_engine_cu${CUDA_MAJOR}-${VERSION}.dist-info"
/opt/python/cp310-cp310/bin/wheel pack ${WHL_BASE}
# Rename the wheel to make it python version agnostic.
whl_name=$(basename dist/*)
IFS='-' read -ra whl_parts <<< "$whl_name"
whl_name_target="${whl_parts[0]}_cu12-${whl_parts[1]}-py3-none-${whl_parts[4]}"
whl_name_target="${whl_parts[0]}_cu${CUDA_MAJOR}-${whl_parts[1]}-py3-none-${whl_parts[4]}"
rm -rf $WHL_BASE dist
mv *.whl /wheelhouse/"$whl_name_target"
fi
......@@ -61,7 +63,7 @@ fi
if $BUILD_JAX ; then
cd /TransformerEngine/transformer_engine/jax
/opt/python/cp310-cp310/bin/pip install "jax[cuda12_local]" jaxlib
/opt/python/cp310-cp310/bin/pip install "jax[cuda${CUDA_MAJOR}_local]" jaxlib
/opt/python/cp310-cp310/bin/python setup.py sdist 2>&1 | tee /wheelhouse/logs/jax.txt
cp dist/* /wheelhouse/
fi
......@@ -2,7 +2,29 @@
#
# See LICENSE for license information.
docker build --no-cache -t "aarch_wheel" -f build_tools/wheel_utils/Dockerfile.aarch .
# Remove leftovers.
rm -rf aarch_wheelhouse_cu12 aarch_wheelhouse_cu13
# CUDA 12.
docker build --no-cache \
--build-arg CUDA_MAJOR=12 \
--build-arg CUDA_MINOR=3 \
--build-arg BUILD_METAPACKAGE=false \
--build-arg BUILD_COMMON=true \
--build-arg BUILD_PYTORCH=false \
--build-arg BUILD_JAX=false \
-t "aarch_wheel" -f build_tools/wheel_utils/Dockerfile.aarch .
docker run --runtime=nvidia --gpus=all --ipc=host "aarch_wheel"
docker cp $(docker ps -aq | head -1):/wheelhouse aarch_wheelhouse_cu12
# CUDA 13.
docker build --no-cache \
--build-arg CUDA_MAJOR=13 \
--build-arg CUDA_MINOR=0 \
--build-arg BUILD_METAPACKAGE=false \
--build-arg BUILD_COMMON=true \
--build-arg BUILD_PYTORCH=false \
--build-arg BUILD_JAX=false \
-t "aarch_wheel" -f build_tools/wheel_utils/Dockerfile.aarch .
docker run --runtime=nvidia --gpus=all --ipc=host "aarch_wheel"
rm -rf aarch_wheelhouse
docker cp $(docker ps -aq | head -1):/wheelhouse/ aarch_wheelhouse
docker cp $(docker ps -aq | head -1):/wheelhouse aarch_wheelhouse_cu13
......@@ -2,7 +2,29 @@
#
# See LICENSE for license information.
docker build --no-cache -t "x86_wheel" -f build_tools/wheel_utils/Dockerfile.x86 .
# Remove leftovers.
rm -rf x86_wheelhouse_cu12 x86_wheelhouse_cu13
# CUDA 12.
docker build --no-cache \
--build-arg CUDA_MAJOR=12 \
--build-arg CUDA_MINOR=3 \
--build-arg BUILD_METAPACKAGE=true \
--build-arg BUILD_COMMON=true \
--build-arg BUILD_PYTORCH=true \
--build-arg BUILD_JAX=true \
-t "x86_wheel" -f build_tools/wheel_utils/Dockerfile.x86 .
docker run --runtime=nvidia --gpus=all --ipc=host "x86_wheel"
docker cp $(docker ps -aq | head -1):/wheelhouse x86_wheelhouse_cu12
# CUDA 13.
docker build --no-cache \
--build-arg CUDA_MAJOR=13 \
--build-arg CUDA_MINOR=0 \
--build-arg BUILD_METAPACKAGE=false \
--build-arg BUILD_COMMON=true \
--build-arg BUILD_PYTORCH=false \
--build-arg BUILD_JAX=false \
-t "x86_wheel" -f build_tools/wheel_utils/Dockerfile.x86 .
docker run --runtime=nvidia --gpus=all --ipc=host "x86_wheel"
rm -rf x86_wheelhouse
docker cp $(docker ps -aq | head -1):/wheelhouse x86_wheelhouse
docker cp $(docker ps -aq | head -1):/wheelhouse x86_wheelhouse_cu13
......@@ -38,6 +38,14 @@ Transformer Engine can be directly installed from `our PyPI <https://pypi.org/pr
To obtain the necessary Python bindings for Transformer Engine, the frameworks needed must be explicitly specified as extra dependencies in a comma-separated list (e.g. [jax,pytorch]). Transformer Engine ships wheels for the core library. Source distributions are shipped for the JAX and PyTorch extensions.
The core package from Transformer Engine (without any framework extensions) can be installed via:
.. code-block:: bash
pip3 install transformer_engine[core]
By default, this installs the core library compiled for CUDA 12. The CUDA major version can be selected by changing the extra dependency to `core_cu12` or `core_cu13`.
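For example, to target CUDA 13 instead (assuming a `core_cu13` wheel is published for your platform):
.. code-block:: bash
pip3 install transformer_engine[core_cu13]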
pip - from GitHub
-----------------------
......
......@@ -3,6 +3,9 @@
# See LICENSE for license information.
"""Shared functions for the encoder tests"""
from functools import lru_cache
import os
import pathlib
import zipfile
import jax
import jax.numpy
......@@ -118,3 +121,50 @@ def get_quantization_recipe_from_name_string(name: str):
return recipe.NVFP4BlockScaling()
case _:
raise ValueError(f"Invalid quantization_recipe, got {name}")
@lru_cache(maxsize=None)
def _get_example_artifacts_dir() -> pathlib.Path:
"""Path to directory with pre-downloaded datasets"""
# Check environment variable
path = os.getenv("NVTE_TEST_CHECKPOINT_ARTIFACT_PATH")
if path:
return pathlib.Path(path).resolve()
# Fallback to path in root dir
root_dir = pathlib.Path(__file__).resolve().parent.parent.parent
return root_dir / "artifacts" / "examples" / "jax"
def _unpack_cached_dataset(artifacts_dir: pathlib.Path, folder_name: str) -> None:
"""Unpack a cached dataset if available"""
dataset_dir = artifacts_dir / folder_name
if not dataset_dir.exists():
print(f"Cached dataset {folder_name} not found at {dataset_dir}, skipping unpack")
return
# Disable any HF network calls since the dataset is cached locally
os.environ["HF_HUB_OFFLINE"] = "1"
for filename in os.listdir(dataset_dir):
filepath = dataset_dir / filename
if not filename.endswith(".zip"):
continue
print(f"Unpacking cached dataset {folder_name} from {filepath}")
with zipfile.ZipFile(filepath, "r") as zip_ref:
zip_ref.extractall(pathlib.Path.home() / ".cache" / "huggingface")
print(
f"Unpacked cached dataset {folder_name} to"
f" {pathlib.Path.home() / '.cache' / 'huggingface'}"
)
# This is cached so we don't have to unpack datasets multiple times
@lru_cache(maxsize=None)
def unpack_cached_datasets_if_available() -> None:
"""Unpack cached datasets if available"""
artifacts_dir = _get_example_artifacts_dir()
_unpack_cached_dataset(artifacts_dir, "mnist")
_unpack_cached_dataset(artifacts_dir, "encoder")
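A minimal usage sketch for these helpers, assuming the zipped Hugging Face caches are staged under a local directory with `mnist/` and `encoder/` subfolders (the path below is hypothetical):

    import os
    # Hypothetical artifact location; must contain mnist/*.zip and encoder/*.zip
    os.environ["NVTE_TEST_CHECKPOINT_ARTIFACT_PATH"] = "/data/te_artifacts/examples/jax"
    from common import unpack_cached_datasets_if_available
    unpack_cached_datasets_if_available()  # extracts the archives into ~/.cache/huggingface once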
......@@ -23,12 +23,14 @@ from common import (
is_bf16_supported,
get_quantization_recipe_from_name_string,
assert_params_sufficiently_sharded,
unpack_cached_datasets_if_available,
)
import transformer_engine.jax as te
import transformer_engine.jax.cpp_extensions as tex
import transformer_engine.jax.flax as te_flax
from transformer_engine.jax.quantize import is_scaling_mode_supported, ScalingMode
unpack_cached_datasets_if_available()
DEVICE_DP_AXIS = "data"
DEVICE_TP_AXIS = "model"
......
......@@ -19,12 +19,17 @@ from flax.training import train_state
from jax.experimental import mesh_utils
from jax.sharding import PartitionSpec, NamedSharding
from common import is_bf16_supported, get_quantization_recipe_from_name_string
from common import (
is_bf16_supported,
get_quantization_recipe_from_name_string,
unpack_cached_datasets_if_available,
)
import transformer_engine.jax as te
import transformer_engine.jax.cpp_extensions as tex
import transformer_engine.jax.flax as te_flax
from transformer_engine.jax.quantize import is_scaling_mode_supported, ScalingMode
unpack_cached_datasets_if_available()
DEVICE_DP_AXIS = "data"
PARAMS_KEY = "params"
......
......@@ -27,11 +27,13 @@ from common import (
is_mxfp8_supported,
is_nvfp4_supported,
get_quantization_recipe_from_name_string,
unpack_cached_datasets_if_available,
)
import transformer_engine.jax as te
import transformer_engine.jax.cpp_extensions as tex
import transformer_engine.jax.flax as te_flax
unpack_cached_datasets_if_available()
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
DEVICE_DP_AXIS = "data"
......@@ -670,7 +672,7 @@ class TestEncoder(unittest.TestCase):
def test_te_nvfp4(self):
"""Test Transformer Engine with NVFP4"""
result = self.exec(True, "NVFP4BlockScaling")
assert result[0] < 0.451 and result[1] > 0.79
assert result[0] < 0.451 and result[1] > 0.787
@unittest.skipIf(not is_bf16_supported(), "Device compute capability 8.0+ is required for BF16")
def test_te_bf16_shardy(self):
......@@ -708,7 +710,7 @@ class TestEncoder(unittest.TestCase):
def test_te_nvfp4_shardy(self):
"""Test Transformer Engine with NVFP4"""
result = self.exec(True, "NVFP4BlockScaling", enable_shardy=True)
assert result[0] < 0.451 and result[1] > 0.79
assert result[0] < 0.451 and result[1] > 0.787
if __name__ == "__main__":
......
......@@ -16,11 +16,16 @@ from datasets import load_dataset
from flax import linen as nn
from flax.training import train_state
from common import is_bf16_supported, get_quantization_recipe_from_name_string
from common import (
is_bf16_supported,
get_quantization_recipe_from_name_string,
unpack_cached_datasets_if_available,
)
import transformer_engine.jax as te
import transformer_engine.jax.flax as te_flax
from transformer_engine.jax.quantize import is_scaling_mode_supported, ScalingMode
unpack_cached_datasets_if_available()
PARAMS_KEY = "params"
DROPOUT_KEY = "dropout"
......@@ -385,7 +390,7 @@ class TestEncoder(unittest.TestCase):
self.args.use_fp8 = True
self.args.fp8_recipe = "NVFP4BlockScaling"
actual = train_and_evaluate(self.args)
assert actual[0] < 0.476 and actual[1] > 0.775
assert actual[0] < 0.477 and actual[1] > 0.769
if __name__ == "__main__":
......
......@@ -22,7 +22,13 @@ from transformer_engine.jax.quantize import is_scaling_mode_supported, ScalingMo
DIR = str(Path(__file__).resolve().parents[1])
sys.path.append(str(DIR))
from encoder.common import is_bf16_supported, get_quantization_recipe_from_name_string
from encoder.common import (
is_bf16_supported,
get_quantization_recipe_from_name_string,
unpack_cached_datasets_if_available,
)
unpack_cached_datasets_if_available()
IMAGE_H = 28
IMAGE_W = 28
......
......@@ -3,8 +3,7 @@
# See LICENSE for license information.
[build-system]
requires = ["setuptools>=61.0", "cmake>=3.21", "wheel", "pybind11[global]", "ninja", "nvidia-mathdx==25.1.1", "pip", "torch>=2.1", "jax>=0.5.0", "flax>=0.7.1"]
requires = ["setuptools>=61.0", "cmake>=3.21", "wheel", "pybind11[global]", "ninja", "pip", "torch>=2.1", "jax>=0.5.0", "flax>=0.7.1"]
# Use legacy backend to import local packages in setup.py
build-backend = "setuptools.build_meta:__legacy__"
......@@ -2,7 +2,19 @@
#
# See LICENSE for license information.
function error_exit() {
echo "Error: $1"
exit 1
}
function test_fail() {
RET=1
FAILED_CASES="$FAILED_CASES $1"
echo "Error: sub-test failed: $1"
}
RET=0
FAILED_CASES=""
: ${TE_PATH:=/opt/transformerengine}
: ${NVTE_TEST_NVINSPECT_FEATURE_DIRS:=$TE_PATH/transformer_engine/debug/features}
......@@ -14,24 +26,27 @@ mkdir -p "$XML_LOG_DIR"
# Nvinspect will be disabled if no feature is active.
: ${NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE:=$TE_PATH/tests/pytorch/debug/test_configs/dummy_feature.yaml}
FAIL=0
# It is not installed as a requirement,
# because it is not available on PyPI.
pip uninstall -y nvdlfw-inspect
pip install git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git
pip install pytest==8.2.1
pytest -v -s --junitxml=$XML_LOG_DIR/test_sanity.xml $TE_PATH/tests/pytorch/debug/test_sanity.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
pytest -v -s --junitxml=$XML_LOG_DIR/test_config.xml $TE_PATH/tests/pytorch/debug/test_config.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
pytest -v -s --junitxml=$XML_LOG_DIR/test_numerics.xml $TE_PATH/tests/pytorch/debug/test_numerics.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
pytest -v -s --junitxml=$XML_LOG_DIR/test_log.xml $TE_PATH/tests/pytorch/debug/test_log.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
NVTE_TORCH_COMPILE=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_api_features.xml $TE_PATH/tests/pytorch/debug/test_api_features.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
pytest -v -s --junitxml=$XML_LOG_DIR/test_perf.xml $TE_PATH/tests/pytorch/debug/test_perf.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
pip install pytest==8.2.1 || error_exit "Failed to install pytest"
pytest -v -s --junitxml=$XML_LOG_DIR/test_sanity.xml $TE_PATH/tests/pytorch/debug/test_sanity.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || test_fail "test_sanity.py"
pytest -v -s --junitxml=$XML_LOG_DIR/test_config.xml $TE_PATH/tests/pytorch/debug/test_config.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || test_fail "test_config.py"
pytest -v -s --junitxml=$XML_LOG_DIR/test_numerics.xml $TE_PATH/tests/pytorch/debug/test_numerics.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || test_fail "test_numerics.py"
pytest -v -s --junitxml=$XML_LOG_DIR/test_log.xml $TE_PATH/tests/pytorch/debug/test_log.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || test_fail "test_log.py"
NVTE_TORCH_COMPILE=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_api_features.xml $TE_PATH/tests/pytorch/debug/test_api_features.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || test_fail "test_api_features.py"
pytest -v -s --junitxml=$XML_LOG_DIR/test_perf.xml $TE_PATH/tests/pytorch/debug/test_perf.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || test_fail "test_perf.py"
# standard sanity and numerics tests with initialized debug
NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_sanity_2.xml $TE_PATH/tests/pytorch/test_sanity.py || FAIL=1
NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_numerics_2.xml $TE_PATH/tests/pytorch/test_numerics.py || FAIL=1
exit $FAIL
NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_sanity_2.xml $TE_PATH/tests/pytorch/test_sanity.py || test_fail "debug test_sanity.py"
NVTE_TEST_NVINSPECT_ENABLED=1 NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 NVTE_FUSED_ATTN=0 pytest -v -s --junitxml=$XML_LOG_DIR/test_numerics_2.xml $TE_PATH/tests/pytorch/test_numerics.py || test_fail "debug test_numerics.py"
if [ "$RET" -ne 0 ]; then
echo "Error in the following test cases:$FAILED_CASES"
exit 1
fi
echo "All tests passed"
exit 0