"tests/vscode:/vscode.git/clone" did not exist on "ba0bfd40e21cacfd5da6a1e43028a37258a29cb4"
Commit 0d874a4e authored by wenjh's avatar wenjh
Browse files

Merge branch 'nv_main' of v2.12

parents a68e5f87 dfdd3820
# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

# Throwaway build container for TransformerEngine PyTorch wheels.
FROM ubuntu:22.04

# DEBIAN_FRONTEND=noninteractive leaks into the runtime env; acceptable here
# because this image is only ever used as a build container, never deployed.
ENV DEBIAN_FRONTEND=noninteractive
ENV CUDA_HOME=/usr/local/cuda
ENV PATH=$PATH:$CUDA_HOME/bin
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
# CUDA compute capabilities to compile extensions for (Pascal through Hopper).
ENV TORCH_CUDA_ARCH_LIST="6.0;6.1;7.0;7.5;8.0;8.6;9.0"

ARG PYTHON_VERSION=3.12
ARG TORCH_VERSION=2.9.1
ARG CUDA_VERSION=12.9.1
ARG CUDNN_MAJOR_VERSION=9
# NVIDIA apt repository flavor: x86_64 or sbsa (arm64 server).
ARG AARCH=x86_64

# Put the venv first on PATH so "python" / "pip" resolve into it.
ENV PATH=/opt/venv/bin:$PATH
ENV PYTHONUNBUFFERED=1

# Install Python from the deadsnakes PPA and create the virtualenv.
# add-apt-repository -y also refreshes the package lists for the new PPA,
# so no extra apt-get update is needed before the python install.
RUN apt-get update && \
    apt-get install -y software-properties-common wget && \
    add-apt-repository ppa:deadsnakes/ppa -y && \
    apt-get install -y python$PYTHON_VERSION-dev python$PYTHON_VERSION-venv python3-pip && \
    python$PYTHON_VERSION -m venv /opt/venv && \
    rm -rf /var/lib/apt/lists/*

# Install the CUDA toolkit, cuDNN and NCCL from NVIDIA's apt repository.
# Stale CUDA repo entries (if any) are dropped before installing the keyring.
RUN CUDA_MAJOR_VERSION=$(echo $CUDA_VERSION | awk -F. '{print $1}') && \
    CUDA_MINOR_VERSION=$(echo $CUDA_VERSION | awk -F. '{print $2}') && \
    rm -f /etc/apt/sources.list.d/cuda*.list /etc/apt/sources.list.d/nvidia-cuda.list && \
    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${AARCH}/cuda-keyring_1.1-1_all.deb && \
    dpkg -i cuda-keyring_1.1-1_all.deb && \
    rm cuda-keyring_1.1-1_all.deb && \
    apt-get update && \
    apt-get install -y \
        cuda-toolkit-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
        cudnn-cuda-$CUDA_MAJOR_VERSION \
        libcudnn$CUDNN_MAJOR_VERSION-cuda-$CUDA_MAJOR_VERSION \
        libnccl2 \
        libnccl-dev \
        cmake && \
    rm -rf /var/lib/apt/lists/*

# Install PyTorch, selecting the newest CUDA wheel flavor supported by the
# requested torch minor release (oldest supported flavor when CUDA < 12.0).
# The backslash-newlines inside the quoted python -c source collapse it into
# a single semicolon-joined line before python sees it.
RUN export MATRIX_CUDA_VERSION=$(echo $CUDA_VERSION | awk -F. '{print $1 $2}') && \
    export MATRIX_TORCH_VERSION=$(echo $TORCH_VERSION | awk -F. '{print $1 "." $2}') && \
    export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
        minv = {'2.5': 118, '2.6': 118, '2.7': 118, '2.8': 126, '2.9': 126}[env['MATRIX_TORCH_VERSION']]; \
        maxv = {'2.5': 124, '2.6': 126, '2.7': 128, '2.8': 129, '2.9': 130}[env['MATRIX_TORCH_VERSION']]; \
        print(minv if int(env['MATRIX_CUDA_VERSION']) < 120 else maxv)" \
    ) && \
    pip install --no-cache-dir torch==${TORCH_VERSION} --index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION}
\ No newline at end of file
# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

# Composite action that builds a TransformerEngine PyTorch wheel either from
# source (Dockerfile in this directory) or on top of a prebuilt base image.
name: Build PyTorch Wheel
description: Builds a PyTorch wheel for TransformerEngine

inputs:
  release-version:
    description: 'The release version to use for the build'
    required: true
  python-version:
    description: 'The Python version to use for the build'
    required: true
  cuda-version:
    description: 'The CUDA version to use for the build'
    required: true
  cudnn-version:
    description: 'The cuDNN version to use for the build'
    required: true
  torch-version:
    description: 'The PyTorch version to use for the build'
    required: true
  cxx11_abi:
    description: 'Enable torch flag C++11 ABI (TRUE/FALSE)'
    required: true
  base-image:
    description: 'The base image to use for the build'
    required: false
  aarch:
    description: 'The architecture to use for the build'
    required: true

outputs:
  wheel_name:
    description: 'The name of the built wheel'
    value: ${{ steps.build_wheel.outputs.wheel_name }}

runs:
  using: 'composite'
  steps:
    # Relocate Docker's data directory onto the large scratch volume created
    # by maximize-build-space so big CUDA images fit on the hosted runner.
    - name: Move /var/lib/docker/
      shell: bash -euxo pipefail {0}
      run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker"

    - name: Maximize build space
      uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794
      with:
        root-reserve-mb: 5120
        temp-reserve-mb: 32
        swap-size-mb: 10240
        remove-dotnet: 'true'
        remove-android: 'true'
        remove-haskell: 'true'
        remove-codeql: 'true'
        build-mount-path: '/var/lib/docker/'

    - name: Restore /var/lib/docker/
      shell: bash -euxo pipefail {0}
      run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker"

    # Source tree at the release ref being built.
    - name: Checkout
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.release-version }}
        submodules: recursive

    # Current build tooling (Dockerfile, build.sh), independent of the release ref.
    - name: Checkout build tools
      uses: actions/checkout@v4
      with:
        path: build-tools
        submodules: recursive

    - name: Build image
      shell: bash -euxo pipefail {0}
      env:
        BASE_IMAGE: ${{ inputs.base-image }}
      run: |
        if [[ "${BASE_IMAGE}" == "" ]]; then
          docker build \
            -t transformer-engine-build \
            -f build-tools/.github/actions/build-pytorch-wheel/Dockerfile \
            --build-arg PYTHON_VERSION=${{ inputs.python-version }} \
            --build-arg TORCH_VERSION=${{ inputs.torch-version }} \
            --build-arg CUDA_VERSION=${{ inputs.cuda-version }} \
            --build-arg CUDNN_MAJOR_VERSION=${{ inputs.cudnn-version }} \
            --build-arg AARCH=${{ inputs.aarch }} \
            .
        else
          docker pull ${BASE_IMAGE}
          docker tag ${BASE_IMAGE} transformer-engine-build
        fi

    - name: Build wheel
      shell: bash -euxo pipefail {0}
      id: build_wheel
      env:
        CXX11_ABI: ${{ inputs.cxx11_abi }}
      run: |
        echo ::group::Build wheel
        # build.sh prints the build's exit code as its last line of output;
        # `tail -n 1` captures it so it can be propagated explicitly below.
        EXIT_CODE=$(docker run \
          --rm \
          --shm-size=64g \
          --workdir /workspace/transformer_engine/pytorch \
          --volume $(pwd):/workspace \
          --volume $GITHUB_OUTPUT:$GITHUB_OUTPUT \
          -e PIP_CONSTRAINT= \
          -e CXX11_ABI=$CXX11_ABI \
          -e GITHUB_OUTPUT=$GITHUB_OUTPUT \
          transformer-engine-build bash /workspace/build-tools/.github/actions/build-pytorch-wheel/build.sh | tail -n 1)
        # BUGFIX: ::endgroup:: used to come after `exit`, making it unreachable.
        echo ::endgroup::
        exit $EXIT_CODE

    - name: Log Built Wheels
      shell: bash -euxo pipefail {0}
      run: |
        ls transformer_engine/pytorch/dist
#!/bin/bash
# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
#
# Builds the TransformerEngine PyTorch wheel inside the build container.
# Protocol: the LAST line this script prints is the build's exit code; the
# calling workflow captures it with `tail -n 1` and decides whether to fail.

# Canonical option ordering (-o must sit directly before its "pipefail"
# argument); was `set -eoxu pipefail`. Matches the workflow's shell flags.
set -euxo pipefail

export NVTE_PYTORCH_FORCE_BUILD=TRUE
export NVTE_NO_LOCAL_VERSION=1
export NVTE_PYTORCH_FORCE_CXX11_ABI=$CXX11_ABI
# Neutralize any pip constraint file inherited from the base image.
export PIP_CONSTRAINT=

pip install wheel packaging nvidia-mathdx ninja pybind11

# 5h timeout since GH allows max 6h and we want some buffer.
# `|| EXIT_CODE=$?` keeps set -e from aborting so the code reaches the caller.
EXIT_CODE=0
timeout 5h python setup.py bdist_wheel --dist-dir=dist || EXIT_CODE=$?

if [ $EXIT_CODE -eq 0 ]; then
    # Rename the built wheel to the canonical name computed by setup.py.
    wheel_name=$(python -c "import setup; print(setup.get_wheel_url()[1])" | tail -n 1)
    ls dist/*whl | xargs -I {} mv {} dist/${wheel_name}
    echo "wheel_name=${wheel_name}" | tee -a "$GITHUB_OUTPUT"
fi

# Must stay the last line of output (consumed by the caller via tail -n 1).
echo $EXIT_CODE
#!/bin/bash
# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
#
# Checks which of the last N monthly NGC PyTorch images exist and writes the
# matching image references to ngc_images.json as a JSON array of strings.
set -euo pipefail

# Configuration
BASE_IMAGE="nvcr.io/nvidia/pytorch"
TAG_SUFFIX="-py3"
MONTHS_TO_CHECK=5 # Check current month and previous 4 months (total 5)

# Collected existing image references.
EXISTING_TAGS=()
# Default keeps ngc_images.json valid JSON even when no tag exists or jq is
# missing (previously the file could contain [""] or be empty).
JSON_STRING="[]"

echo "Checking for existence of the last ${MONTHS_TO_CHECK} NGC PyTorch images: ${BASE_IMAGE}:YY.MM${TAG_SUFFIX}"
echo "---------------------------------------------------------------------"

# Anchor the date computation once so all iterations stay consistent even if
# the loop happens to run across a month boundary.
CURRENT_YEAR=$(date +%Y)
CURRENT_MONTH=$(date +%m)

for i in $(seq 0 $((MONTHS_TO_CHECK - 1))); do
    # Tag for the month $i months before the current one, as YY.MM.
    TARGET_DATE=$(date -d "$CURRENT_YEAR-$CURRENT_MONTH-01 -$i months" +%y.%m)
    IMAGE_TAG="${TARGET_DATE}${TAG_SUFFIX}"
    FULL_IMAGE="${BASE_IMAGE}:${IMAGE_TAG}"
    echo "Checking: ${FULL_IMAGE}"
    # 'docker manifest inspect' verifies existence without pulling the image.
    if docker manifest inspect "${FULL_IMAGE}" > /dev/null 2>&1; then
        echo "✅ EXISTS: Found."
        EXISTING_TAGS+=("${BASE_IMAGE}:${IMAGE_TAG}")
    else
        echo "❌ MISSING: Not found."
    fi
done

echo "---------------------------------------------------------------------"

## JSON Output Generation
if [ ${#EXISTING_TAGS[@]} -eq 0 ]; then
    # BUGFIX: the old newline-split pipeline produced [""] for an empty array;
    # keep the default "[]" instead.
    echo "No existing tags found."
elif command -v jq &> /dev/null; then
    # One tag per line into jq, split on newlines; .[:-1] drops the trailing
    # empty element left by the final newline.
    JSON_STRING=$(printf "%s\n" "${EXISTING_TAGS[@]}" | jq -R -s 'split("\n") | .[:-1]')
    echo "Generated JSON String of Existing Tags:"
    echo "${JSON_STRING}"
else
    echo "WARNING: 'jq' is not installed. Cannot format output as JSON."
    echo "Found Tags: ${EXISTING_TAGS[*]}"
fi

echo "---"
echo "Check complete."
echo "${JSON_STRING}" > ngc_images.json
# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

# This workflow will:
# - Create a new Github release
# - Build wheels for supported architectures
# - Deploy the wheels to the Github release
# - Release the static code to PyPi
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
name: Attach wheels to release

on:
  release:
    types: [published]
  workflow_dispatch:
    inputs:
      runs-on:
        description: 'The runner to use for the build'
        required: true
        type: string
        default: ubuntu-22.04
      release-version:
        description: 'Release version'
        required: true
        default: '0.1.0'
      python-version:
        description: 'Python version'
        required: true
        default: '3.12'
      torch-version:
        description: 'Torch version'
        required: true
        default: '2.8.0'
      cuda-version:
        description: 'CUDA version'
        required: true
        default: '12.9.1'
      cudnn-version:
        description: 'CUDNN version'
        required: true
        default: '9'
      cxx11_abi:
        description: 'C++11 ABI'
        required: true
        type: choice
        default: 'TRUE'
        options:
          - 'TRUE'
          - 'FALSE'
      ngc-image:
        description: 'NGC PyTorch image (will take precedence over the source build)'
        required: false
        type: string
        default: ''

jobs:
  # Computes the build matrix, release upload URL and NGC image list consumed
  # by the two build jobs below.
  pre-flight:
    runs-on: ubuntu-latest
    outputs:
      build-wheel-matrix: ${{ steps.matrix.outputs.matrix }}
      release-assets-url: ${{ steps.release-assets-url.outputs.upload_url }}
      ngc-images: ${{ steps.check_for_ngc_images.outputs.IMAGES }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # NOTE(review): workflow inputs are interpolated directly into the heredoc
      # JSON below; values come from maintainers, but passing them through env
      # vars would be safer against shell/script injection.
      - name: Build release matrix
        id: matrix
        env:
          EVENT: ${{ github.event_name }}
        run: |
          if [[ "$EVENT" == "release" ]]; then
            MATRIX=$(echo '{
              "os": ["ubuntu-22.04", "ubuntu-22.04-arm"],
              "release-version": ["${{ github.event.release.tag_name }}"],
              "python-version": ["3.12"],
              "torch-version": ["2.8.0"],
              "cuda-version": ["12.9.1"],
              "cudnn-version": ["9"],
              "cxx11_abi": ["TRUE"]
            }' | jq -rc)
          else
            MATRIX=$(echo '{
              "os": ["${{ inputs.runs-on }}"],
              "release-version": ["${{ inputs.release-version }}"],
              "python-version": ["${{ inputs.python-version }}"],
              "torch-version": ["${{ inputs.torch-version }}"],
              "cuda-version": ["${{ inputs.cuda-version }}"],
              "cudnn-version": ["${{ inputs.cudnn-version }}"],
              "cxx11_abi": ["${{ inputs.cxx11_abi }}"]
            }' | jq -rc)
          fi
          echo "matrix=$MATRIX" | tee -a "$GITHUB_OUTPUT"

      # Manual runs target an existing release identified by its tag.
      - name: Get Release with tag
        id: get_current_release
        uses: joutvhu/get-release@9a8271732adc3299a22f8ad09b0a67eb3aa836ac
        if: ${{ github.event_name == 'workflow_dispatch' }}
        with:
          tag_name: ${{ inputs.release-version }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Get release assets url
        env:
          EVENT: ${{ github.event_name }}
        if: ${{ (success() || !failure()) && !cancelled() }}
        id: release-assets-url
        run: |
          if [[ "$EVENT" == "release" ]]; then
            echo "upload_url=${{ github.event.release.upload_url }}" | tee -a "$GITHUB_OUTPUT"
          else
            echo "upload_url=${{ steps.get_current_release.outputs.upload_url }}" | tee -a "$GITHUB_OUTPUT"
          fi

      - name: Check for NGC PyTorch images
        id: check_for_ngc_images
        if: ${{ (success() || !failure()) && !cancelled() }}
        env:
          EVENT: ${{ github.event_name }}
        run: |
          if [[ "$EVENT" == "release" ]]; then
            bash ./.github/scripts/check_for_ngc_images.sh
            echo "IMAGES=$(cat ngc_images.json | jq -cr)" | tee -a "$GITHUB_OUTPUT"
          else
            echo 'IMAGES=["${{ inputs.ngc-image }}"]' | tee -a "$GITHUB_OUTPUT"
          fi

  # Source builds: run for releases, or for manual runs with no NGC image given.
  build_wheels:
    name: Build Wheel
    runs-on: ${{ matrix.os }}
    needs: pre-flight
    if: ${{ github.event_name == 'release' || inputs.ngc-image == '' }}
    strategy:
      fail-fast: false
      matrix: ${{ fromJson(needs.pre-flight.outputs.build-wheel-matrix) }}
    steps:
      - name: 'Checkout'
        # v3 -> v4 for consistency with the pre-flight job.
        uses: actions/checkout@v4
      - name: 'Build PyTorch Wheel'
        uses: ./.github/actions/build-pytorch-wheel
        id: build-pytorch-wheel
        with:
          release-version: ${{ matrix.release-version }}
          python-version: ${{ matrix.python-version }}
          cuda-version: ${{ matrix.cuda-version }}
          cudnn-version: ${{ matrix.cudnn-version }}
          torch-version: ${{ matrix.torch-version }}
          cxx11_abi: ${{ matrix.cxx11_abi }}
          aarch: ${{ matrix.os == 'ubuntu-22.04' && 'x86_64' || 'sbsa' }}
        env:
          NVTE_FRAMEWORK: pytorch
          MAX_JOBS: 1
      # NOTE(review): actions/upload-release-asset is archived/unmaintained;
      # consider migrating to gh CLI or softprops/action-gh-release.
      - name: Upload Release Asset
        id: upload_release_asset
        uses: actions/upload-release-asset@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          upload_url: ${{ needs.pre-flight.outputs.release-assets-url }}
          asset_path: ./transformer_engine/pytorch/dist/${{ steps.build-pytorch-wheel.outputs.wheel_name }}
          asset_name: ${{ steps.build-pytorch-wheel.outputs.wheel_name }}
          asset_content_type: application/*

  # NGC-based builds: run for releases, or when a manual run supplies an image.
  build_wheels_for_ngc:
    name: Build Wheels for NGC PyTorch images
    runs-on: ${{ matrix.os }}
    needs: pre-flight
    if: ${{ github.event_name == 'release' || inputs.ngc-image != '' }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-22.04]
        container-image: ${{ fromJson(needs.pre-flight.outputs.ngc-images) }}
    steps:
      - name: 'Checkout'
        # v3 -> v4 for consistency with the pre-flight job.
        uses: actions/checkout@v4
      - name: 'Build PyTorch Wheel'
        uses: ./.github/actions/build-pytorch-wheel
        id: build-pytorch-wheel
        with:
          base-image: ${{ matrix.container-image }}
      # NOTE(review): actions/upload-release-asset is archived/unmaintained;
      # consider migrating to gh CLI or softprops/action-gh-release.
      - name: Upload Release Asset
        id: upload_release_asset
        uses: actions/upload-release-asset@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          upload_url: ${{ needs.pre-flight.outputs.release-assets-url }}
          asset_path: ./transformer_engine/pytorch/dist/${{ steps.build-pytorch-wheel.outputs.wheel_name }}
          asset_name: ${{ steps.build-pytorch-wheel.outputs.wheel_name }}
          asset_content_type: application/*
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
...@@ -24,37 +24,66 @@ jobs: ...@@ -24,37 +24,66 @@ jobs:
uses: actions/checkout@v3 uses: actions/checkout@v3
with: with:
submodules: recursive submodules: recursive
- name: ccache
uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad
- name: 'Build' - name: 'Build'
run: pip install --no-build-isolation . -v run: NVTE_USE_CCACHE=1 NVTE_CCACHE_BIN=sccache pip install --no-build-isolation . -v
env: env:
NVTE_FRAMEWORK: none NVTE_FRAMEWORK: none
MAX_JOBS: 1 MAX_JOBS: 1
SCCACHE_GHA_ENABLED: "true"
- name: 'Sanity check' - name: 'Sanity check'
run: python3 -c "import transformer_engine" run: python3 -c "import transformer_engine"
working-directory: / working-directory: /
pytorch: pytorch:
name: 'PyTorch' name: 'PyTorch'
runs-on: ubuntu-latest runs-on: ubuntu-latest
container:
image: nvcr.io/nvidia/cuda:12.8.0-devel-ubuntu22.04
options: --user root
steps: steps:
- name: 'Dependencies' - name: Move /var/lib/docker/
run: | shell: bash -euxo pipefail {0}
apt-get update run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker"
apt-get install -y git python3.9 pip cudnn9-cuda-12
pip install cmake torch ninja pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops onnxscript - name: Maximize build space
uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794
with:
root-reserve-mb: 5120
temp-reserve-mb: 32
swap-size-mb: 10240
remove-dotnet: 'true'
remove-android: 'true'
remove-haskell: 'true'
remove-codeql: 'true'
build-mount-path: '/var/lib/docker/'
- name: Restore /var/lib/docker/
shell: bash -euxo pipefail {0}
run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker"
- name: 'Checkout' - name: 'Checkout'
uses: actions/checkout@v3 uses: actions/checkout@v3
with: with:
submodules: recursive submodules: recursive
- name: Start named container
run: |
docker run -v $(pwd):$(pwd) -w $(pwd) --name builder -d nvcr.io/nvidia/cuda:12.8.0-devel-ubuntu22.04 sleep infinity
- name: 'Dependencies'
run: |
docker exec builder bash -c '\
apt-get update && \
apt-get install -y git python3.9 pip cudnn9-cuda-12 && \
pip install cmake torch ninja pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops onnxscript && \
apt-get clean \
'
- name: 'Build' - name: 'Build'
run: pip install --no-build-isolation . -v --no-deps run: docker exec builder bash -c 'pip install --no-build-isolation . -v --no-deps'
env: env:
NVTE_FRAMEWORK: pytorch NVTE_FRAMEWORK: pytorch
MAX_JOBS: 1 MAX_JOBS: 1
- name: 'Sanity check' - name: 'Sanity check'
run: python3 tests/pytorch/test_sanity_import.py run: docker exec builder bash -c 'python3 tests/pytorch/test_sanity_import.py'
jax: jax:
name: 'JAX' name: 'JAX'
runs-on: ubuntu-latest runs-on: ubuntu-latest
...@@ -63,37 +92,65 @@ jobs: ...@@ -63,37 +92,65 @@ jobs:
options: --user root options: --user root
steps: steps:
- name: 'Dependencies' - name: 'Dependencies'
run: pip install pybind11[global] run: pip install cmake==3.21.0 pybind11[global]
- name: 'Checkout' - name: 'Checkout'
uses: actions/checkout@v3 uses: actions/checkout@v3
with: with:
submodules: recursive submodules: recursive
- name: ccache
uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad
- name: 'Build' - name: 'Build'
run: pip install --no-build-isolation . -v run: |
NVTE_CCACHE_BIN=sccache NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v
env: env:
NVTE_FRAMEWORK: jax NVTE_FRAMEWORK: jax
MAX_JOBS: 1 MAX_JOBS: 1
SCCACHE_GHA_ENABLED: "true"
- name: 'Sanity check' - name: 'Sanity check'
run: python3 tests/jax/test_sanity_import.py run: python3 tests/jax/test_sanity_import.py
all: all:
name: 'All' name: 'All'
runs-on: ubuntu-latest runs-on: ubuntu-latest
container:
image: ghcr.io/nvidia/jax:jax
options: --user root
steps: steps:
- name: 'Dependencies' - name: Move /var/lib/docker/
run: | shell: bash -euxo pipefail {0}
pip install pybind11[global] einops onnxscript run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker"
pip install torch --index-url https://download.pytorch.org/whl/cu130
- name: Maximize build space
uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794
with:
root-reserve-mb: 5120
temp-reserve-mb: 32
swap-size-mb: 10240
remove-dotnet: 'true'
remove-android: 'true'
remove-haskell: 'true'
remove-codeql: 'true'
build-mount-path: '/var/lib/docker/'
- name: Restore /var/lib/docker/
shell: bash -euxo pipefail {0}
run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker"
- name: 'Checkout' - name: 'Checkout'
uses: actions/checkout@v3 uses: actions/checkout@v3
with: with:
submodules: recursive submodules: recursive
- name: Start named container
run: |
docker run -v $(pwd):$(pwd) -w $(pwd) --name builder -d ghcr.io/nvidia/jax:jax sleep infinity
- name: 'Dependencies'
run: |
docker exec builder bash -c '\
pip install cmake==3.21.0 pybind11[global] einops onnxscript && \
pip install torch --no-cache-dir --index-url https://download.pytorch.org/whl/cu130
'
- name: 'Build' - name: 'Build'
run: pip install --no-build-isolation . -v --no-deps run: docker exec builder bash -c 'pip install --no-cache-dir --no-build-isolation . -v --no-deps'
env: env:
NVTE_FRAMEWORK: all NVTE_FRAMEWORK: all
MAX_JOBS: 1 MAX_JOBS: 1
- name: 'Sanity check' - name: 'Sanity check'
run: python3 tests/pytorch/test_sanity_import.py && python3 tests/jax/test_sanity_import.py run: docker exec builder bash -c 'python3 tests/pytorch/test_sanity_import.py && python3 tests/jax/test_sanity_import.py'
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
...@@ -17,15 +17,15 @@ jobs: ...@@ -17,15 +17,15 @@ jobs:
uses: actions/checkout@v3 uses: actions/checkout@v3
- name: 'Install dependencies' - name: 'Install dependencies'
run: | run: |
pip install sphinx==8.1.3 sphinx_rtd_theme==3.0.1 nbsphinx==0.9.5 IPython ipython_genutils==0.2.0 ipywidgets==8.0.2 astroid==3.3.2 pip install sphinx==8.1.3 sphinx_rtd_theme==3.0.1 nbsphinx==0.9.5 IPython ipython_genutils==0.2.0 ipywidgets==8.0.2 astroid==3.3.2 sphinx-tabs==3.4.7
pip install breathe==4.35.0 sphinx-autoapi==3.3.2 pip install breathe==4.35.0 sphinx-autoapi==3.3.2
sudo apt-get install -y pandoc graphviz doxygen sudo apt-get install -y pandoc graphviz doxygen
export GIT_SHA=$(git show-ref --hash HEAD) export GIT_SHA=$(git show-ref --hash HEAD)
- name: 'Build docs' - name: 'Build docs'
run: | run: | # SPHINXOPTS="-W" errors out on warnings
doxygen docs/Doxyfile doxygen docs/Doxyfile
cd docs cd docs
make html make html SPHINXOPTS="-W"
- name: 'Upload docs' - name: 'Upload docs'
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
...@@ -56,8 +56,8 @@ jobs: ...@@ -56,8 +56,8 @@ jobs:
|| github.actor == 'vcherepanov-nv' || github.actor == 'vcherepanov-nv'
|| github.actor == 'tdophung' || github.actor == 'tdophung'
|| github.actor == 'vthumbe1503' || github.actor == 'vthumbe1503'
|| github.actor == 'janekb04'
|| github.actor == 'shengfangd' || github.actor == 'shengfangd'
|| github.actor == 'kainzhong'
) )
steps: steps:
- name: Check if comment is issued by authorized person - name: Check if comment is issued by authorized person
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
......
.venv
*.o *.o
*.swp *.swp
*.ii *.ii
...@@ -40,3 +41,4 @@ compile_commands.json ...@@ -40,3 +41,4 @@ compile_commands.json
.nfs .nfs
tensor_dumps/ tensor_dumps/
artifacts/ artifacts/
*.DS_Store
.. ..
Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
See LICENSE for license information. See LICENSE for license information.
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
......
.. ..
Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
See LICENSE for license information. See LICENSE for license information.
...@@ -13,23 +13,14 @@ Transformer Engine ...@@ -13,23 +13,14 @@ Transformer Engine
Latest News Latest News
=========== ===========
* [11/2025] `NVIDIA Blackwell Architecture Sweeps MLPerf Training v5.1 Benchmarks <https://developer.nvidia.com/blog/nvidia-blackwell-architecture-sweeps-mlperf-training-v5-1-benchmarks/>`_
* [11/2025] `Scale Biology Transformer Models with PyTorch and NVIDIA BioNeMo Recipes <https://developer.nvidia.com/blog/scale-biology-transformer-models-with-pytorch-and-nvidia-bionemo-recipes/>`_
* [11/2025] `FP8 Training of Large-Scale RL Models <https://lmsys.org/blog/2025-11-25-fp8-rl/>`_
* [09/2025] `Pretraining Large Language Models with NVFP4 <https://www.arxiv.org/pdf/2509.25149>`_ * [09/2025] `Pretraining Large Language Models with NVFP4 <https://www.arxiv.org/pdf/2509.25149>`_
* [09/2025] `Native FP8 Mixed Precision Training for Ling 2.0, Open Sourced! <https://huggingface.co/blog/im0qianqian/ling-mini-2-fp8-mixed-precision-training-solution>`_ * [09/2025] `Native FP8 Mixed Precision Training for Ling 2.0, Open Sourced! <https://huggingface.co/blog/im0qianqian/ling-mini-2-fp8-mixed-precision-training-solution>`_
* [09/2025] `Faster Training Throughput in FP8 Precision with NVIDIA NeMo <https://developer.nvidia.com/blog/faster-training-throughput-in-fp8-precision-with-nvidia-nemo/>`_ * [09/2025] `Faster Training Throughput in FP8 Precision with NVIDIA NeMo <https://developer.nvidia.com/blog/faster-training-throughput-in-fp8-precision-with-nvidia-nemo/>`_
* [08/2025] `How we built DeepL's next-generation LLMs with FP8 for training and inference <https://www.deepl.com/en/blog/tech/next-generation-llm-fp8-training>`_ * [08/2025] `How we built DeepL's next-generation LLMs with FP8 for training and inference <https://www.deepl.com/en/blog/tech/next-generation-llm-fp8-training>`_
* [08/2025] `NVFP4 Trains with Precision of 16-bit and Speed and Efficiency of 4-bit <https://developer.nvidia.com/blog/nvfp4-trains-with-precision-of-16-bit-and-speed-and-efficiency-of-4-bit/>`_ * [08/2025] `NVFP4 Trains with Precision of 16-bit and Speed and Efficiency of 4-bit <https://developer.nvidia.com/blog/nvfp4-trains-with-precision-of-16-bit-and-speed-and-efficiency-of-4-bit/>`_
* [06/2025] `Floating Point 8: An Introduction to Efficient, Lower-Precision AI Training <https://developer.nvidia.com/blog/floating-point-8-an-introduction-to-efficient-lower-precision-ai-training/>`_
* [05/2025] `Advanced Optimization Strategies for LLM Training on NVIDIA Grace Hopper <https://developer.nvidia.com/blog/advanced-optimization-strategies-for-llm-training-on-nvidia-grace-hopper/>`_
* [03/2025] `Stable and Scalable FP8 Deep Learning Training on Blackwell | GTC 2025 <https://www.nvidia.com/en-us/on-demand/session/gtc25-s72778/>`_
* [03/2025] `Measure and Improve AI Workload Performance with NVIDIA DGX Cloud Benchmarking <https://developer.nvidia.com/blog/measure-and-improve-ai-workload-performance-with-nvidia-dgx-cloud-benchmarking/>`_
.. image:: docs/examples/comparison-fp8-bf16-training-nvidia-dgx-cloud-benchmarking-performance-explorer.jpg
:width: 600
:alt: Comparison of FP8 versus BF16 training, as seen in NVIDIA DGX Cloud Benchmarking Performance Explorer
* [02/2025] `Understanding the Language of Life's Biomolecules Across Evolution at a New Scale with Evo 2 <https://developer.nvidia.com/blog/understanding-the-language-of-lifes-biomolecules-across-evolution-at-a-new-scale-with-evo-2/>`_
* [02/2025] `NVIDIA DGX Cloud Introduces Ready-To-Use Templates to Benchmark AI Platform Performance <https://developer.nvidia.com/blog/nvidia-dgx-cloud-introduces-ready-to-use-templates-to-benchmark-ai-platform-performance/>`_
* [01/2025] `Continued Pretraining of State-of-the-Art LLMs for Sovereign AI and Regulated Industries with iGenius and NVIDIA DGX Cloud <https://developer.nvidia.com/blog/continued-pretraining-of-state-of-the-art-llms-for-sovereign-ai-and-regulated-industries-with-igenius-and-nvidia-dgx-cloud/>`_
`Previous News <#previous-news>`_ `Previous News <#previous-news>`_
...@@ -259,6 +250,7 @@ These environment variables can be set before installation to customize the buil ...@@ -259,6 +250,7 @@ These environment variables can be set before installation to customize the buil
* **NVTE_FRAMEWORK**: Comma-separated list of frameworks to build for (e.g., ``pytorch,jax``) * **NVTE_FRAMEWORK**: Comma-separated list of frameworks to build for (e.g., ``pytorch,jax``)
* **MAX_JOBS**: Limit number of parallel build jobs (default varies by system) * **MAX_JOBS**: Limit number of parallel build jobs (default varies by system)
* **NVTE_BUILD_THREADS_PER_JOB**: Control threads per build job * **NVTE_BUILD_THREADS_PER_JOB**: Control threads per build job
* **NVTE_CUDA_ARCHS**: Semicolon-separated list of CUDA compute architectures to compile for (e.g., ``80;90`` for A100 and H100). If not set, automatically determined based on CUDA version. Setting this can significantly reduce build time and binary size.
Compiling with FlashAttention Compiling with FlashAttention
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
...@@ -424,6 +416,18 @@ Videos ...@@ -424,6 +416,18 @@ Videos
Previous News Previous News
============= =============
* [06/2025] `Floating Point 8: An Introduction to Efficient, Lower-Precision AI Training <https://developer.nvidia.com/blog/floating-point-8-an-introduction-to-efficient-lower-precision-ai-training/>`_
* [05/2025] `Advanced Optimization Strategies for LLM Training on NVIDIA Grace Hopper <https://developer.nvidia.com/blog/advanced-optimization-strategies-for-llm-training-on-nvidia-grace-hopper/>`_
* [03/2025] `Stable and Scalable FP8 Deep Learning Training on Blackwell | GTC 2025 <https://www.nvidia.com/en-us/on-demand/session/gtc25-s72778/>`_
* [03/2025] `Measure and Improve AI Workload Performance with NVIDIA DGX Cloud Benchmarking <https://developer.nvidia.com/blog/measure-and-improve-ai-workload-performance-with-nvidia-dgx-cloud-benchmarking/>`_
.. image:: docs/examples/comparison-fp8-bf16-training-nvidia-dgx-cloud-benchmarking-performance-explorer.jpg
:width: 600
:alt: Comparison of FP8 versus BF16 training, as seen in NVIDIA DGX Cloud Benchmarking Performance Explorer
* [02/2025] `Understanding the Language of Life's Biomolecules Across Evolution at a New Scale with Evo 2 <https://developer.nvidia.com/blog/understanding-the-language-of-lifes-biomolecules-across-evolution-at-a-new-scale-with-evo-2/>`_
* [02/2025] `NVIDIA DGX Cloud Introduces Ready-To-Use Templates to Benchmark AI Platform Performance <https://developer.nvidia.com/blog/nvidia-dgx-cloud-introduces-ready-to-use-templates-to-benchmark-ai-platform-performance/>`_
* [01/2025] `Continued Pretraining of State-of-the-Art LLMs for Sovereign AI and Regulated Industries with iGenius and NVIDIA DGX Cloud <https://developer.nvidia.com/blog/continued-pretraining-of-state-of-the-art-llms-for-sovereign-ai-and-regulated-industries-with-igenius-and-nvidia-dgx-cloud/>`_
* [11/2024] `Developing a 172B LLM with Strong Japanese Capabilities Using NVIDIA Megatron-LM <https://developer.nvidia.com/blog/developing-a-172b-llm-with-strong-japanese-capabilities-using-nvidia-megatron-lm/>`_ * [11/2024] `Developing a 172B LLM with Strong Japanese Capabilities Using NVIDIA Megatron-LM <https://developer.nvidia.com/blog/developing-a-172b-llm-with-strong-japanese-capabilities-using-nvidia-megatron-lm/>`_
* [11/2024] `How FP8 boosts LLM training by 18% on Amazon SageMaker P5 instances <https://aws.amazon.com/blogs/machine-learning/how-fp8-boosts-llm-training-by-18-on-amazon-sagemaker-p5-instances/>`_ * [11/2024] `How FP8 boosts LLM training by 18% on Amazon SageMaker P5 instances <https://aws.amazon.com/blogs/machine-learning/how-fp8-boosts-llm-training-by-18-on-amazon-sagemaker-p5-instances/>`_
* [11/2024] `Efficiently train models with large sequence lengths using Amazon SageMaker model parallel <https://aws.amazon.com/blogs/machine-learning/efficiently-train-models-with-large-sequence-lengths-using-amazon-sagemaker-model-parallel/>`_ * [11/2024] `Efficiently train models with large sequence lengths using Amazon SageMaker model parallel <https://aws.amazon.com/blogs/machine-learning/efficiently-train-models-with-large-sequence-lengths-using-amazon-sagemaker-model-parallel/>`_
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
...@@ -45,6 +45,16 @@ nsys profile \ ...@@ -45,6 +45,16 @@ nsys profile \
--trace=cuda,nvtx,cudnn,cublas \ --trace=cuda,nvtx,cudnn,cublas \
python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe nvfp4 python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe nvfp4
# Example for jagged input benchmark to simulate unbalanced token splits
python benchmarks/linear/benchmark_grouped_linear.py --recipe nvfp4 --jagged-input "15296,8960,14656,14784,11712,7936,14080,10880"
# Example to look at a single kernel target with NCU, like the fused hadamard amax kernel for NVFP4 recipe
ncu -f -o ./benchmarks/linear/ncu_b200_numgemm_8_nvfp4_rht_amax \
--set=full \
--kernel-name "GroupHadamardAmaxTmaKernel" \
-s 5 -c 5 \
python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe nvfp4
""" """
RECIPES = { RECIPES = {
...@@ -163,7 +173,9 @@ def benchmark_linear( ...@@ -163,7 +173,9 @@ def benchmark_linear(
return timing_ms return timing_ms
def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4): def run_benchmark_linear(
mkns, recipe_name, use_bias, num_gemms=4, m_splits_provided=None, fwd_only=False
):
data = [] data = []
assert not use_bias, "Bias is not supported for GroupedLinear benchmark" assert not use_bias, "Bias is not supported for GroupedLinear benchmark"
...@@ -172,13 +184,14 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4): ...@@ -172,13 +184,14 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4):
device = "cuda" device = "cuda"
x = torch.randn((m, k), dtype=torch.bfloat16, device=device, requires_grad=True) x = torch.randn((m, k), dtype=torch.bfloat16, device=device, requires_grad=True)
ws = [torch.randn((n, k), dtype=torch.bfloat16, device=device) for _ in range(num_gemms)] ws = [torch.randn((n, k), dtype=torch.bfloat16, device=device) for _ in range(num_gemms)]
assert m % num_gemms == 0 m_splits = [m // num_gemms] * num_gemms if m_splits_provided is None else m_splits_provided
m_splits = [m // num_gemms] * num_gemms
# Bias is not supported for GroupedLinear benchmark # Bias is not supported for GroupedLinear benchmark
bias = None bias = None
# Run the benchmark # Run the benchmark
print(f"fwd_m={m}, fwd_k={k}, fwd_n={n}") print(f"fwd_m={m}, fwd_k={k}, fwd_n={n}")
print(f"m_splits: {m_splits}")
print(f"fwd_only: {fwd_only}")
grouped_fwd_bwd_timing_ms = benchmark_linear( grouped_fwd_bwd_timing_ms = benchmark_linear(
x, x,
...@@ -186,7 +199,7 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4): ...@@ -186,7 +199,7 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4):
m_splits, m_splits,
bias, bias,
recipe_name, recipe_name,
mode="fwd_bwd", mode="fwd_only" if fwd_only else "fwd_bwd",
num_gemms=num_gemms, num_gemms=num_gemms,
) )
...@@ -202,6 +215,8 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4): ...@@ -202,6 +215,8 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4):
] ]
) )
timing_notation = "grouped_fwd_time_ms" if fwd_only else "grouped_fwd_bwd_time_ms"
df = pd.DataFrame( df = pd.DataFrame(
data=data, data=data,
columns=[ columns=[
...@@ -210,7 +225,7 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4): ...@@ -210,7 +225,7 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4):
"n", "n",
"recipe", "recipe",
"num_gemms", "num_gemms",
"grouped_fwd_bwd_time_ms", timing_notation,
], ],
) )
...@@ -223,7 +238,7 @@ if __name__ == "__main__": ...@@ -223,7 +238,7 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--profile", action="store_true", help="Enable profiling mode") parser.add_argument("--profile", action="store_true", help="Enable profiling mode")
parser.add_argument( parser.add_argument(
"--output_dir", "--output-dir",
type=str, type=str,
default="benchmark_output/", default="benchmark_output/",
help="output path for report", help="output path for report",
...@@ -235,8 +250,41 @@ if __name__ == "__main__": ...@@ -235,8 +250,41 @@ if __name__ == "__main__":
default="bf16", default="bf16",
help="Recipe to use, options are fp8_sub_channel, mxfp8, bf16, or all", help="Recipe to use, options are fp8_sub_channel, mxfp8, bf16, or all",
) )
# add an argument for the jagged input
# example: [15296, 8960, 14656, 14784, 11712, 7936, 14080, 10880] => sums up to 98304
parser.add_argument(
"--jagged-input",
type=str,
default=None,
help="Jagged input to use, example: [15296, 8960, 14656, 14784, 11712, 7936, 14080, 10880]",
)
parser.add_argument(
"--hidden-dim",
type=int,
default=7168,
help="Hidden dimension to use, default is 7168",
)
parser.add_argument(
"--output-dim",
type=int,
default=2048,
help="Output dimension to use, default is 2048",
)
parser.add_argument(
"--fwd-only",
action="store_true",
default=False,
help="Run forward pass only, default is both forward and backward passes",
)
args = parser.parse_args() args = parser.parse_args()
jagged_input_splits = None
if args.jagged_input is not None:
jagged_input_splits = [int(x) for x in args.jagged_input.split(",")]
print(f"Jagged input splits: {jagged_input_splits}")
print(f"Jagged input splits sum: {sum(jagged_input_splits)}")
print(f"Jagged input splits num_gemms: {len(jagged_input_splits)}")
use_bias = False use_bias = False
# Set the MKN values to benchmark # Set the MKN values to benchmark
# Deepseek V3 EP64, SEQ_LEN=8192, topK8 # Deepseek V3 EP64, SEQ_LEN=8192, topK8
...@@ -256,11 +304,28 @@ if __name__ == "__main__": ...@@ -256,11 +304,28 @@ if __name__ == "__main__":
# 4 or 8local experts per rank # 4 or 8local experts per rank
num_gemms_list = [4, 8] num_gemms_list = [4, 8]
if jagged_input_splits is not None:
num_gemms_list = [len(jagged_input_splits)]
token_dim_list = [16384, 32768, 65536, 98304]
hidden_dim_list = [7168]
output_dim_list = [2048]
# override the default targets to benchmark if specified
if jagged_input_splits is not None:
token_dim_list = [sum(jagged_input_splits)]
if args.hidden_dim is not None:
hidden_dim_list = [args.hidden_dim]
if args.output_dim is not None:
output_dim_list = [args.output_dim]
# MKN for group linear # MKN for group linear
mkns = [] mkns = []
for m in [65536]: for m in token_dim_list:
for k in [7168]: for k in hidden_dim_list:
for n in [2048]: for n in output_dim_list:
mkns.append((m, k, n)) mkns.append((m, k, n))
# default recipes to run if not specified # default recipes to run if not specified
...@@ -272,14 +337,20 @@ if __name__ == "__main__": ...@@ -272,14 +337,20 @@ if __name__ == "__main__":
recipe_list = [args.recipe] recipe_list = [args.recipe]
if args.profile: if args.profile:
mkns = [(8192 * 8, 7168, 2048)] num_gemms_list = [8]
hidden_dim_to_profile = 7168 if args.hidden_dim is None else args.hidden_dim
output_dim_to_profile = 2048 if args.output_dim is None else args.output_dim
token_dim_to_profile = 8192 * 8
if jagged_input_splits is not None:
num_gemms_list = [len(jagged_input_splits)]
token_dim_to_profile = sum(jagged_input_splits)
mkns = [(token_dim_to_profile, hidden_dim_to_profile, output_dim_to_profile)]
# in profile mode, only run one recipe specified in args.recipe # in profile mode, only run one recipe specified in args.recipe
assert args.recipe != "all", ( assert args.recipe != "all", (
"In profile mode, only one recipe can be specified, please specify the recipe as" "In profile mode, only one recipe can be specified, please specify the recipe as"
" fp8_sub_channel, mxfp8, nvfp4, or bf16" " fp8_sub_channel, mxfp8, nvfp4, or bf16"
) )
recipe_list = [args.recipe] recipe_list = [args.recipe]
num_gemms_list = [8]
torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__() torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__()
# Initialize a dataframe to store the results # Initialize a dataframe to store the results
...@@ -310,6 +381,8 @@ if __name__ == "__main__": ...@@ -310,6 +381,8 @@ if __name__ == "__main__":
recipe_name, recipe_name,
use_bias, use_bias,
num_gemms=num_gemms, num_gemms=num_gemms,
m_splits_provided=jagged_input_splits,
fwd_only=args.fwd_only,
) )
df_linears = pd.concat([df_linears, df]) df_linears = pd.concat([df_linears, df])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment