Commit 5b6ef054 authored by yuguo's avatar yuguo
Browse files
parents 76060570 a7eeb28b
---
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -1
AlignAfterOpenBracket: Align
AlignArrayOfStructures: None
AlignConsecutiveAssignments:
Enabled: false
AcrossEmptyLines: false
AcrossComments: false
AlignCompound: false
AlignFunctionPointers: false
PadOperators: true
AlignConsecutiveBitFields:
Enabled: false
AcrossEmptyLines: false
AcrossComments: false
AlignCompound: false
AlignFunctionPointers: false
PadOperators: false
AlignConsecutiveDeclarations:
Enabled: false
AcrossEmptyLines: false
AcrossComments: false
AlignCompound: false
AlignFunctionPointers: false
PadOperators: false
AlignConsecutiveMacros:
Enabled: false
AcrossEmptyLines: false
AcrossComments: false
AlignCompound: false
AlignFunctionPointers: false
PadOperators: false
AlignConsecutiveShortCaseStatements:
Enabled: false
AcrossEmptyLines: false
AcrossComments: false
AlignCaseColons: false
AlignEscapedNewlines: Left
AlignOperands: Align
AlignTrailingComments:
Kind: Always
OverEmptyLines: 0
AllowAllArgumentsOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowBreakBeforeNoexceptSpecifier: Never
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortCompoundRequirementOnASingleLine: true
AllowShortEnumsOnASingleLine: true
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: WithoutElse
AllowShortLambdasOnASingleLine: All
AllowShortLoopsOnASingleLine: true
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: Yes
AttributeMacros:
- __capability
BinPackArguments: true
BinPackParameters: true
BitFieldColonSpacing: Both
BraceWrapping:
AfterCaseLabel: false
AfterClass: false
AfterControlStatement: Never
AfterEnum: false
AfterExternBlock: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
BeforeCatch: false
BeforeElse: false
BeforeLambdaBody: false
BeforeWhile: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
BreakAdjacentStringLiterals: true
BreakAfterAttributes: Leave
BreakAfterJavaFieldAnnotations: false
BreakArrays: true
BreakBeforeBinaryOperators: None
BreakBeforeConceptDeclarations: Always
BreakBeforeBraces: Attach
BreakBeforeInlineASMColon: OnlyMultiline
BreakBeforeTernaryOperators: true
BreakConstructorInitializers: BeforeColon
BreakInheritanceList: BeforeColon
BreakStringLiterals: true
ColumnLimit: 100
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: true
DisableFormat: false
EmptyLineAfterAccessModifier: Never
EmptyLineBeforeAccessModifier: LogicalBlock
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
- foreach
- Q_FOREACH
- BOOST_FOREACH
IfMacros:
- KJ_IF_MAYBE
IncludeBlocks: Regroup
IncludeCategories:
- Regex: '^<ext/.*\.h>'
Priority: 2
SortPriority: 0
CaseSensitive: false
- Regex: '^<.*\.h>'
Priority: 1
SortPriority: 0
CaseSensitive: false
- Regex: '^<.*'
Priority: 2
SortPriority: 0
CaseSensitive: false
- Regex: '.*'
Priority: 3
SortPriority: 0
CaseSensitive: false
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
IndentCaseBlocks: false
IndentCaseLabels: true
IndentExternBlock: AfterExternBlock
IndentGotoLabels: true
IndentPPDirectives: None
IndentRequiresClause: true
IndentWidth: 2
IndentWrappedFunctionNames: false
InsertBraces: false
InsertNewlineAtEOF: false
InsertTrailingCommas: None
IntegerLiteralSeparator:
Binary: 0
BinaryMinDigits: 0
Decimal: 0
DecimalMinDigits: 0
Hex: 0
HexMinDigits: 0
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
KeepEmptyLinesAtEOF: false
LambdaBodyIndentation: Signature
LineEnding: DeriveLF
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Never
ObjCBlockIndentWidth: 2
ObjCBreakBeforeNestedBlockParam: true
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PackConstructorInitializers: NextLine
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakOpenParenthesis: 0
PenaltyBreakScopeResolution: 500
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyIndentedWhitespace: 0
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
PPIndentWidth: -1
QualifierAlignment: Leave
RawStringFormats:
- Language: Cpp
Delimiters:
- cc
- CC
- cpp
- Cpp
- CPP
- 'c++'
- 'C++'
CanonicalDelimiter: ''
BasedOnStyle: google
- Language: TextProto
Delimiters:
- pb
- PB
- proto
- PROTO
EnclosingFunctions:
- EqualsProto
- EquivToProto
- PARSE_PARTIAL_TEXT_PROTO
- PARSE_TEST_PROTO
- PARSE_TEXT_PROTO
- ParseTextOrDie
- ParseTextProtoOrDie
- ParseTestProto
- ParsePartialTestProto
CanonicalDelimiter: pb
BasedOnStyle: google
ReferenceAlignment: Pointer
ReflowComments: false
RemoveBracesLLVM: false
RemoveParentheses: Leave
RemoveSemicolon: false
RequiresClausePosition: OwnLine
RequiresExpressionIndentation: OuterScope
SeparateDefinitionBlocks: Leave
ShortNamespaceLines: 1
SkipMacroDefinitionBody: false
SortIncludes: CaseSensitive
SortJavaStaticImport: Before
SortUsingDeclarations: LexicographicNumeric
SpaceAfterCStyleCast: false
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceAroundPointerQualifiers: Default
SpaceBeforeAssignmentOperators: true
SpaceBeforeCaseColon: false
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeJsonColon: false
SpaceBeforeParens: ControlStatements
SpaceBeforeParensOptions:
AfterControlStatements: true
AfterForeachMacros: true
AfterFunctionDefinitionName: false
AfterFunctionDeclarationName: false
AfterIfMacros: true
AfterOverloadedOperator: false
AfterPlacementOperator: true
AfterRequiresInClause: false
AfterRequiresInExpression: false
BeforeNonEmptyParentheses: false
SpaceBeforeRangeBasedForLoopColon: true
SpaceBeforeSquareBrackets: false
SpaceInEmptyBlock: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: Never
SpacesInContainerLiterals: true
SpacesInLineCommentPrefix:
Minimum: 1
Maximum: -1
SpacesInParens: Never
SpacesInParensOptions:
InCStyleCasts: false
InConditionalStatements: false
InEmptyParentheses: false
Other: false
SpacesInSquareBrackets: false
Standard: Auto
StatementAttributeLikeMacros:
- Q_EMIT
StatementMacros:
- Q_UNUSED
- QT_REQUIRE_VERSION
TabWidth: 8
UseTab: Never
VerilogBreakBetweenInstancePorts: true
WhitespaceSensitiveMacros:
- BOOST_PP_STRINGIZE
- CF_SWIFT_NAME
- NS_SWIFT_NAME
- PP_STRINGIZE
- STRINGIZE
...
# Description
Please include a brief summary of the changes, relevant motivation and context.
Fixes # (issue)
## Type of change
- [ ] Documentation change (change only to the documentation, either a fix or a new content)
- [ ] Bug fix (non-breaking change which fixes an issue)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
- [ ] Infra/Build change
- [ ] Code refactoring
## Changes
Please list the changes introduced in this PR:
- Change A
- Change B
# Checklist:
- [ ] I have read and followed the [contributing guidelines](https://github.com/NVIDIA/TransformerEngine/blob/main/CONTRIBUTING.rst)
- [ ] The functionality is complete
- [ ] I have commented my code, particularly in hard-to-understand areas
- [ ] I have made corresponding changes to the documentation
- [ ] My changes generate no new warnings
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] New and existing unit tests pass locally with my changes
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
# A workflow to trigger ci on hybrid infra (github + self hosted runner)
name: Blossom-CI
on:
issue_comment:
types: [created]
workflow_dispatch:
inputs:
platform:
description: 'runs-on argument'
required: false
args:
description: 'argument'
required: false
jobs:
Authorization:
name: Authorization
runs-on: blossom
outputs:
args: ${{ env.args }}
# This job only runs for pull request comments
if: >
github.event.comment.body == '/blossom-ci'
&& (
github.actor == 'ptrendx'
|| github.actor == 'ksivaman'
)
steps:
- name: Check if comment is issued by authorized person
run: blossom-ci
env:
OPERATION: 'AUTH'
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
Vulnerability-scan:
name: Vulnerability scan
needs: [Authorization]
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v2
with:
repository: ${{ fromJson(needs.Authorization.outputs.args).repo }}
ref: ${{ fromJson(needs.Authorization.outputs.args).ref }}
lfs: 'true'
- name: Run blossom action
uses: NVIDIA/blossom-action@main
env:
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
with:
args1: ${{ fromJson(needs.Authorization.outputs.args).args1 }}
args2: ${{ fromJson(needs.Authorization.outputs.args).args2 }}
args3: ${{ fromJson(needs.Authorization.outputs.args).args3 }}
Job-trigger:
name: Start ci job
needs: [Vulnerability-scan]
runs-on: blossom
steps:
- name: Start ci job
run: blossom-ci
env:
OPERATION: 'START-CI-JOB'
CI_SERVER: ${{ secrets.CI_SERVER }}
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
Upload-Log:
name: Upload log
runs-on: blossom
if : github.event_name == 'workflow_dispatch'
steps:
- name: Jenkins log for pull request ${{ fromJson(github.event.inputs.args).pr }} (click here)
run: blossom-ci
env:
OPERATION: 'POST-PROCESSING'
CI_SERVER: ${{ secrets.CI_SERVER }}
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
# A workflow to trigger TE build on GitHub
name: 'Build'
on:
pull_request:
workflow_dispatch:
jobs:
core:
name: 'Core'
runs-on: ubuntu-latest
container:
image: nvcr.io/nvidia/cuda:12.1.0-devel-ubuntu22.04
options: --user root
steps:
- name: 'Dependencies'
run: |
apt-get update
apt-get install -y git python3.9 pip ninja-build cudnn9-cuda-12
pip install cmake==3.21.0
- name: 'Checkout'
uses: actions/checkout@v3
with:
submodules: recursive
- name: 'Build'
run: pip install . -v
env:
NVTE_FRAMEWORK: none
MAX_JOBS: 1
- name: 'Sanity check'
run: python3 -c "import transformer_engine"
working-directory: /
pytorch:
name: 'PyTorch'
runs-on: ubuntu-latest
container:
image: nvcr.io/nvidia/cuda:12.8.0-devel-ubuntu22.04
options: --user root
steps:
- name: 'Dependencies'
run: |
apt-get update
apt-get install -y git python3.9 pip ninja-build cudnn9-cuda-12
pip install cmake torch pydantic importlib-metadata>=1.0 packaging pybind11
- name: 'Checkout'
uses: actions/checkout@v3
with:
submodules: recursive
- name: 'Build'
run: pip install . -v --no-deps
env:
NVTE_FRAMEWORK: pytorch
MAX_JOBS: 1
- name: 'Sanity check'
if: false # Sanity import test requires Flash Attention
run: python3 tests/pytorch/test_sanity_import.py
jax:
name: 'JAX'
runs-on: ubuntu-latest
container:
image: ghcr.io/nvidia/jax:jax
options: --user root
steps:
- name: 'Checkout'
uses: actions/checkout@v3
with:
submodules: recursive
- name: 'Build'
run: pip install . -v
env:
NVTE_FRAMEWORK: jax
MAX_JOBS: 1
- name: 'Sanity check'
run: python tests/jax/test_sanity_import.py
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
# A workflow to deploy the nightly version of TE documentation to GitHub Pages
name: Deploy nightly docs
on:
push:
branches: [ "main" ]
jobs:
build:
uses: ./.github/workflows/docs.yml
prepare:
needs: build
runs-on: ubuntu-latest
steps:
- name: Download artifact
uses: actions/download-artifact@v4
with:
name: "te_docs"
path: "html"
- name: Prepare for pages
uses: actions/upload-pages-artifact@v1.0.7
with:
name: github-pages
path: "html"
deploy:
needs: prepare
environment:
name: github-pages
url: ${{ steps.deployment.outputs.page_url }}
permissions:
pages: write
id-token: write
runs-on: ubuntu-latest
steps:
- name: Deploy
uses: actions/deploy-pages@v2.0.0
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
# A workflow to trigger the build of TE documentation on GitHub
name: 'Documentation'
on:
pull_request:
workflow_dispatch:
workflow_call:
jobs:
build_docs:
name: 'Build'
runs-on: ubuntu-latest
steps:
- name: 'Checkout'
uses: actions/checkout@v3
- name: 'Install dependencies'
run: |
pip install sphinx==8.1.3 sphinx_rtd_theme==3.0.1 nbsphinx==0.9.5 IPython ipython_genutils==0.2.0 ipywidgets==8.0.2 astroid==3.3.2
pip install breathe==4.35.0 sphinx-autoapi==3.3.2
sudo apt-get install -y pandoc graphviz doxygen
export GIT_SHA=$(git show-ref --hash HEAD)
- name: 'Build docs'
run: |
doxygen docs/Doxyfile
cd docs
make html
- name: 'Upload docs'
uses: actions/upload-artifact@v4
with:
name: te_docs
path: docs/_build/html
retention-days: 7
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
# A workflow to trigger the TE license check on GitHub
name: 'License'
on:
pull_request:
workflow_dispatch:
jobs:
check:
name: 'Check'
runs-on: ubuntu-latest
steps:
- name: 'Checkout'
uses: actions/checkout@v3
- name: 'Check License'
run: |
export TE_PATH=.
bash ./qa/L0_license/test.sh
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
# A workflow to trigger lint tests on GitHub
name: 'Lint'
on:
pull_request:
workflow_dispatch:
jobs:
pytorch_cpplint:
name: 'PyTorch C++'
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3
- name: 'Lint'
run: |
sudo apt-get update
sudo apt-get install pip -y
export CPP_ONLY=1
export TE_PATH=.
bash ./qa/L0_pytorch_lint/test.sh
pytorch_pylint:
name: 'PyTorch Python'
runs-on: ubuntu-latest
steps:
- name: 'Checkout'
uses: actions/checkout@v3
- name: 'Lint'
run: |
sudo apt-get update
sudo apt-get install pip -y
pip install torch numpy
export PYTHON_ONLY=1
export TE_PATH=.
bash ./qa/L0_pytorch_lint/test.sh
jax_cpplint:
name: 'JAX C++'
runs-on: ubuntu-latest
steps:
- name: 'Checkout'
uses: actions/checkout@v3
- name: 'Lint'
run: |
sudo apt-get update
sudo apt-get install pip -y
export CPP_ONLY=1
export TE_PATH=.
bash ./qa/L0_jax_lint/test.sh
jax_pylint:
name: 'JAX Python'
runs-on: ubuntu-latest
steps:
- name: 'Checkout'
uses: actions/checkout@v3
- name: 'Lint'
run: |
sudo apt-get update
sudo apt-get install pip -y
export PYTHON_ONLY=1
export TE_PATH=.
bash ./qa/L0_jax_lint/test.sh
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
# A workflow to trigger ci on hybrid infra (github + self hosted runner)
name: TE-CI Trigger
on:
issue_comment:
types: [created]
jobs:
Authorization:
name: Authorization
runs-on: blossom
outputs:
args: ${{ env.args }}
# This job only runs for pull request comments
if: >
startsWith(github.event.comment.body, '/te-ci')
&& (
github.actor == 'ptrendx'
|| github.actor == 'ksivaman'
|| github.actor == 'schetlur-nv'
|| github.actor == 'timmoon10'
|| github.actor == 'zlsh80826'
|| github.actor == 'mingxu1067'
|| github.actor == 'cyanguwa'
|| github.actor == 'nzmora-nvidia'
|| github.actor == 'galagam'
|| github.actor == 'nouiz'
|| github.actor == 'denera'
|| github.actor == 'sudhakarsingh27'
|| github.actor == 'Oleg-Goncharov'
|| github.actor == 'phu0ngng'
|| github.actor == 'xrennvidia'
|| github.actor == 'yaox12'
|| github.actor == 'huanghua1994'
|| github.actor == 'mgoldfarb-nvidia'
|| github.actor == 'pggPL'
|| github.actor == 'vasunvidia'
|| github.actor == 'erhoo82'
|| github.actor == 'kocchop'
|| github.actor == 'youngeunkwon0405'
|| github.actor == 'KshitijLakhani'
|| github.actor == 'jberchtold-nvidia'
|| github.actor == 'sanandaraj5597'
|| github.actor == 'negvet'
)
steps:
- name: Check if comment is issued by authorized person
run: blossom-ci
env:
OPERATION: 'AUTH'
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
Vulnerability-scan:
name: Vulnerability scan
needs: [Authorization]
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v2
with:
repository: ${{ fromJson(needs.Authorization.outputs.args).repo }}
ref: ${{ fromJson(needs.Authorization.outputs.args).ref }}
lfs: 'true'
- name: Run blossom action
uses: NVIDIA/blossom-action@main
env:
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
with:
args1: ${{ fromJson(needs.Authorization.outputs.args).args1 }}
args2: ${{ fromJson(needs.Authorization.outputs.args).args2 }}
args3: ${{ fromJson(needs.Authorization.outputs.args).args3 }}
Job-trigger:
name: Start ci job
needs: [Vulnerability-scan]
runs-on: blossom
steps:
- name: Start ci job
run: blossom-ci
env:
OPERATION: 'START-CI-JOB'
CI_SERVER: ${{ secrets.CI_SERVER }}
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
# A workflow to trigger ci on hybrid infra (github + self hosted runner)
name: TE-CI Logs
on:
workflow_dispatch:
inputs:
platform:
description: 'runs-on argument'
required: false
args:
description: 'argument'
required: false
job_name:
description: 'name of the job'
required: true
commit_sha:
description: 'SHA of the commit that was tested.'
required: true
result:
description: 'Job result'
required: true
run-name: PR ${{ fromJson(github.event.inputs.args).pr }} - ${{ inputs.job_name }}
jobs:
Upload-Log:
name: Upload log
runs-on: blossom
steps:
- name: Log
run: blossom-ci
env:
OPERATION: 'POST-PROCESSING'
CI_SERVER: ${{ secrets.CI_SERVER }}
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
status_update:
name: Update commit status
runs-on: ubuntu-latest
permissions:
statuses: write
needs: [Upload-Log]
if: ${{ always() }}
steps:
- name: Set status
run: |
curl \
-X POST \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
https://api.github.com/repos/${{ github.repository }}/statuses/${{ inputs.commit_sha }} \
-d "{\"state\":\"${{ inputs.result }}\",\"target_url\":\"${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}\",\"description\":\"\",\"context\":\"te-ci/${{ inputs.job_name }}\"}"
*.o
*.swp
*.ii
*.ptx
*.cubin
*.fatbin*
*.module_id
*.nsys-rep
*.ncu-rep
*.sqlite
*.eggs
build/
*.so
*.egg-info
__pycache__
.ycm_extra_conf.py
.vimrc
.vs
.vscode
.cache
.hypothesis
.devcontainer.json
tests/cpp/build/
.ipynb_checkpoints
*.log
CMakeFiles/CMakeSystem.cmake
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
develop-eggs/
dist/
downloads/
.pytest_cache/
compile_commands.json
.nfs
tensor_dumps/
\ No newline at end of file
[submodule "3rdparty/googletest"]
path = 3rdparty/googletest
url = https://github.com/google/googletest.git
[submodule "3rdparty/cudnn-frontend"]
path = 3rdparty/cudnn-frontend
url = https://github.com/NVIDIA/cudnn-frontend.git
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
default_language_version:
python: python3
ci:
autofix_prs: true
autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
autoupdate_schedule: quarterly
submodules: false
skip: []
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
hooks:
- id: check-merge-conflict
- id: check-added-large-files
- id: end-of-file-fixer
files: .*.(c|cc|cxx|cpp|cu|cuh|h|hpp|py)$
- id: trailing-whitespace
files: .*.(c|cc|cxx|cpp|cu|cuh|h|hpp|py)$
- repo: https://github.com/psf/black
rev: 24.4.2
hooks:
- id: black
name: Format python code
args: [--line-length=100, --preview, --enable-unstable-feature=string_processing]
types: [python]
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v18.1.6
hooks:
- id: clang-format
entry: clang-format -i
args: ["-style=file"]
files: ^transformer_engine.*\.(c|cc|cxx|cpp|cu|cuh|h|hpp)$
Subproject commit 20c28ea798fe99e31d7274e009ee2fbf0e88abfd
Subproject commit f8d7d77c06936315286eb55f8de22cd23c188571
This software includes third-party components under the following licenses:
========================
GoogleTest
Copyright 2008, Google Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
========================
pybind11
Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Please also refer to the file CONTRIBUTING.md, which clarifies licensing of
external contributions to this project including patches, pull requests, etc.
========================
PyTorch
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
and IDIAP Research Institute nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
========================
FlashAttn
Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file.
All rights reserved.
All contributions by Nvidia:
Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
========================
cudnn-frontend
Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
..
Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
See LICENSE for license information.
Contribution Rules
==================
.. role:: bash(code)
:language: bash
Coding Guidelines
-----------------
* We follow `Google C++ Style Guide <https://google.github.io/styleguide/cppguide.html>`_. When no
rules can be found, follow the already occuring conventions. If there is no precedence in our
codebase we are open to discussion.
* Prior to your contribution, please make sure that the code passes the linter check. We do both C++
and Python linting. To invoke the check, please use
.. code-block:: bash
TE_PATH=<path to TE source> bash qa/L0_<framework>_lint/test.sh
* Avoid introducing unnecessary complexity into existing code so that maintainability and
readability are preserved.
* Try to keep pull requests (PRs) as concise as possible:
- Avoid committing commented-out code.
- Wherever possible, each PR should address a single concern. If there are several
otherwise-unrelated things that should be fixed to reach a desired endpoint, our recommendation
is to open several PRs and indicate the dependencies in the description. The more complex the
changes are in a single PR, the more time it will take to review those changes.
* Write PR and commit titles using imperative mood.
- Format commit messages sticking to rules described in
`this <https://chris.beams.io/posts/git-commit/>`_ guide.
* Make sure all `L0_*` tests pass:
- In the :bash:`qa/` directory, there are basic sanity tests scripted in directories named
`L0_...`. A given test can be executed by running the :bash:`./test.sh` command in each test
directory. The :bash:`test.sh` script assumes that the TE source can be found under
:bash:`/opt/transformerengine`. This assumption can be changed by setting :bash:`TE_PATH`
environment variable to the directory containing TE source.
- One of the tests, `L0_license` tests for valid NVIDIA copyright and license text. If you create
a new file and do not want to pass copyright rights to NVIDIA, please add the file to the
`exclude_copyright` list in :bash:`qa/L0_license/config.json`.
* Transformer Engine's default build assumes recent versions of TE's dependencies (CUDA toolkit
etc.). Contributions that add compatibility with older versions of those dependencies will be
considered, but NVIDIA cannot guarantee that all possible build configurations work, are not
broken by future contributions, and retain highest performance.
* Make sure that you can contribute your work to open source (no license and/or patent conflict is
introduced by your code). You need to `Sign Your Work`_.
* Thanks in advance for your patience as we review your contributions; we do appreciate them!
Sign Your Work
--------------
* We require that all contributors "sign-off" on their commits. This certifies that the contribution
is your original work, or you have rights to submit it under the same license, or a compatible
license.
* Any contribution which contains commits that are not Signed-Off will not be accepted.
* To sign off on a commit you simply use the `--signoff` (or `-s`) option when committing your changes:
.. code-block:: bash
$ git commit -s -m "Add cool feature."
This will append the following to your commit message:
.. code-block:: text
Signed-off-by: Your Name <your@email.com>
* Full text of the DCO:
.. code-block:: text
Developer Certificate of Origin
Version 1.1
Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
Everyone is permitted to copy and distribute verbatim copies of this
license document, but changing it is not allowed.
Developer's Certificate of Origin 1.1
By making a contribution to this project, I certify that:
(a) The contribution was created in whole or in part by me and I
have the right to submit it under the open source license
indicated in the file; or
(b) The contribution is based upon previous work that, to the best
of my knowledge, is covered under an appropriate open source
license and I have the right under that license to submit that
work with modifications, whether created in whole or in part
by me, under the same open source license (unless I am
permitted to submit under a different license), as indicated
in the file; or
(c) The contribution was provided directly to me by some other
person who certified (a), (b) or (c) and I have not modified
it.
(d) I understand and agree that this project and the contribution
are public and that a record of the contribution (including all
personal information I submit with it, including my sign-off) is
maintained indefinitely and may be redistributed consistent with
this project or the open source license(s) involved.
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
# Stop searching for additional config files.
set noparent
# Limit line length.
linelength=100
# Ignore the following errors.
filter=-build/include_subdir
filter=-build/namespaces
filter=-readability/todo
filter=-build/header_guard
filter=-build/include
filter=-build/c++11
filter=-runtime/references
filter=-whitespace
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
..
Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
See LICENSE for license information.
|License|
Transformer Engine
==================
`Quickstart <#examples>`_ | `Installation <#installation>`_ | `User Guide <https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/index.html>`_ | `Examples <https://github.com/NVIDIA/TransformerEngine/tree/main/examples>`_ | `FP8 Convergence <#fp8-convergence>`_ | `Integrations <#integrations>`_ | `Release notes <https://docs.nvidia.com/deeplearning/transformer-engine/release-notes/index.html>`_
Latest News
===========
* [03/2024] `Turbocharged Training: Optimizing the Databricks Mosaic AI stack with FP8 <https://www.databricks.com/blog/turbocharged-training-optimizing-databricks-mosaic-ai-stack-fp8>`_
* [03/2024] `FP8 Training Support in SageMaker Model Parallelism Library <https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-release-notes.html>`_
* [12/2023] `New NVIDIA NeMo Framework Features and NVIDIA H200 <https://developer.nvidia.com/blog/new-nvidia-nemo-framework-features-and-nvidia-h200-supercharge-llm-training-performance-and-versatility/>`_
.. image:: docs/examples/H200-NeMo-performance.png
:width: 600
:alt: H200
* [11/2023] `Inflection-2: The Next Step Up <https://inflection.ai/inflection-2>`_
* [11/2023] `Unleashing The Power Of Transformers With NVIDIA Transformer Engine <https://lambdalabs.com/blog/unleashing-the-power-of-transformers-with-nvidia-transformer-engine>`_
* [11/2023] `Accelerating PyTorch Training Workloads with FP8 <https://towardsdatascience.com/accelerating-pytorch-training-workloads-with-fp8-5a5123aec7d7>`_
* [09/2023] `Transformer Engine added to AWS DL Container for PyTorch Training <https://github.com/aws/deep-learning-containers/pull/3315>`_
* [06/2023] `Breaking MLPerf Training Records with NVIDIA H100 GPUs <https://developer.nvidia.com/blog/breaking-mlperf-training-records-with-nvidia-h100-gpus/>`_
* [04/2023] `Benchmarking Large Language Models on NVIDIA H100 GPUs with CoreWeave (Part 1) <https://www.mosaicml.com/blog/coreweave-nvidia-h100-part-1>`_
What is Transformer Engine?
===========================
.. overview-begin-marker-do-not-remove
Transformer Engine (TE) is a library for accelerating Transformer models on NVIDIA GPUs, including
using 8-bit floating point (FP8) precision on Hopper, Ada, and Blackwell GPUs, to provide better
performance with lower memory utilization in both training and inference. TE provides a collection
of highly optimized building blocks for popular Transformer architectures and an automatic mixed
precision-like API that can be used seamlessly with your framework-specific code. TE also includes a
framework agnostic C++ API that can be integrated with other deep learning libraries to enable FP8
support for Transformers.
As the number of parameters in Transformer models continues to grow, training and inference for
architectures such as BERT, GPT and T5 become very memory and compute-intensive. Most deep learning
frameworks train with FP32 by default. This is not essential, however, to achieve full accuracy for
many deep learning models. Using mixed-precision training, which combines single-precision (FP32)
with lower precision (e.g. FP16) format when training a model, results in significant speedups with
minimal differences in accuracy as compared to FP32 training. With Hopper GPU
architecture FP8 precision was introduced, which offers improved performance over FP16 with no
degradation in accuracy. Although all major deep learning frameworks support FP16, FP8 support is
not available natively in frameworks today.
TE addresses the problem of FP8 support by providing APIs that integrate with popular Large Language
Model (LLM) libraries. It provides a Python API consisting of modules to easily build a Transformer
layer as well as a framework-agnostic library in C++ including structs and kernels needed for FP8
support. Modules provided by TE internally maintain scaling factors and other values needed for FP8
training, greatly simplifying mixed precision training for users.
Highlights
==========
* Easy-to-use modules for building Transformer layers with FP8 support
* Optimizations (e.g. fused kernels) for Transformer models
* Support for FP8 on NVIDIA Hopper, Ada, and Blackwell GPUs
* Support for optimizations across all precisions (FP16, BF16) on NVIDIA Ampere GPU architecture generations and later
Examples
========
PyTorch
^^^^^^^
.. code-block:: python
import torch
import transformer_engine.pytorch as te
from transformer_engine.common import recipe
# Set dimensions.
in_features = 768
out_features = 3072
hidden_size = 2048
# Initialize model and inputs.
model = te.Linear(in_features, out_features, bias=True)
inp = torch.randn(hidden_size, in_features, device="cuda")
# Create an FP8 recipe. Note: All input args are optional.
fp8_recipe = recipe.DelayedScaling(margin=0, fp8_format=recipe.Format.E4M3)
# Enable autocasting for the forward pass
with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
out = model(inp)
loss = out.sum()
loss.backward()
JAX
^^^
Flax
~~~~
.. code-block:: python
import flax
import jax
import jax.numpy as jnp
import transformer_engine.jax as te
import transformer_engine.jax.flax as te_flax
from transformer_engine.common import recipe
BATCH = 32
SEQLEN = 128
HIDDEN = 1024
# Initialize RNG and inputs.
rng = jax.random.PRNGKey(0)
init_rng, data_rng = jax.random.split(rng)
inp = jax.random.normal(data_rng, [BATCH, SEQLEN, HIDDEN], jnp.float32)
# Create an FP8 recipe. Note: All input args are optional.
fp8_recipe = recipe.DelayedScaling(margin=0, fp8_format=recipe.Format.HYBRID)
# Enable autocasting for the forward pass
with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
model = te_flax.DenseGeneral(features=HIDDEN)
def loss_fn(params, other_vars, inp):
out = model.apply({'params':params, **other_vars}, inp)
return jnp.mean(out)
# Initialize models.
variables = model.init(init_rng, inp)
other_variables, params = flax.core.pop(variables, 'params')
# Construct the forward and backward function
fwd_bwd_fn = jax.value_and_grad(loss_fn, argnums=(0, 1))
for _ in range(10):
loss, (param_grads, other_grads) = fwd_bwd_fn(params, other_variables, inp)
.. overview-end-marker-do-not-remove
Installation
============
.. installation
Pre-requisites
^^^^^^^^^^^^^^^^^^^^
* Linux x86_64
* CUDA 12.1+ (CUDA 12.8+ for Blackwell)
* NVIDIA Driver supporting CUDA 12.1 or later
* cuDNN 9.3 or later
Docker
^^^^^^^^^^^^^^^^^^^^
The quickest way to get started with Transformer Engine is by using Docker images on
`NVIDIA GPU Cloud (NGC) Catalog <https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch>`_.
For example to use the NGC PyTorch container interactively,
.. code-block:: bash
docker run --gpus all -it --rm nvcr.io/nvidia/pytorch:25.01-py3
Where 25.01 (corresponding to January 2025 release) is the container version.
pip
^^^^^^^^^^^^^^^^^^^^
To install the latest stable version of Transformer Engine,
.. code-block:: bash
pip3 install git+https://github.com/NVIDIA/TransformerEngine.git@stable
This will automatically detect if any supported deep learning frameworks are installed and build
Transformer Engine support for them. To explicitly specify frameworks, set the environment variable
NVTE_FRAMEWORK to a comma-separated list (e.g. NVTE_FRAMEWORK=jax,pytorch).
Alternatively, the package can be directly installed from
`Transformer Engine's PyPI <https://pypi.org/project/transformer-engine/>`_, e.g.
.. code-block:: bash
pip3 install transformer_engine[pytorch]
To obtain the necessary Python bindings for Transformer Engine, the frameworks needed must be
explicitly specified as extra dependencies in a comma-separated list (e.g. [jax,pytorch]).
Transformer Engine ships wheels for the core library. Source distributions are shipped for the JAX
and PyTorch extensions.
From source
^^^^^^^^^^^
`See the installation guide <https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html#installation-from-source>`_.
Compiling with FlashAttention-2
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Transformer Engine release v0.11.0 added support for FlashAttention-2 in PyTorch for improved performance.
It is a known issue that FlashAttention-2 compilation is resource-intensive and requires a large amount of RAM (see `bug <https://github.com/Dao-AILab/flash-attention/issues/358>`_), which may lead to out of memory errors during the installation of Transformer Engine. Please try setting **MAX_JOBS=1** in the environment to circumvent the issue.
Note that NGC PyTorch 23.08+ containers include FlashAttention-2.
Breaking Changes
================
v1.7: Padding mask definition for PyTorch
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In an effort to unify the definition and usage of the attention mask across all three frameworks in Transformer Engine, the padding mask has changed from `True` meaning inclusion of the corresponding position in attention to exclusion of that position in our PyTorch implementation. Since v1.7, all attention mask types follow the same definition where `True` means masking out the corresponding position and `False` means including that position in attention calculation.
An example of this change is,
.. code-block:: bash
# for a batch of 3 sequences where `a`s, `b`s and `c`s are the useful tokens
# and `0`s are the padding tokens,
[a, a, a, 0, 0,
b, b, 0, 0, 0,
c, c, c, c, 0]
# the padding mask for this batch before v1.7 is,
[ True, True, True, False, False,
True, True, False, False, False,
True, True, True, True, False]
# and for v1.7 onwards it should be,
[False, False, False, True, True,
False, False, True, True, True,
False, False, False, False, True]
FP8 Convergence
===============
FP8 has been tested extensively across different model architectures and configurations and we found **no significant difference** between FP8 and BF16 training loss curves. FP8 has also been validated for accuracy on downstream LLM tasks (e.g. LAMBADA and WikiText). Below are examples of models tested for convergence across different frameworks.
+------------+------------------+---------------------------------------------------------------------------------------------------------+
| Model | Framework | Source |
+============+==================+=========================================================================================================+
| T5-770M | JAX/T5x | https://github.com/NVIDIA/JAX-Toolbox/tree/main/rosetta/rosetta/projects/t5x#convergence-and-performance|
+------------+------------------+---------------------------------------------------------------------------------------------------------+
| MPT-1.3B | Mosaic Composer | https://www.mosaicml.com/blog/coreweave-nvidia-h100-part-1 |
+------------+------------------+---------------------------------------------------------------------------------------------------------+
| GPT-5B | JAX/Paxml | https://github.com/NVIDIA/JAX-Toolbox/tree/main/rosetta/rosetta/projects/pax#h100-results |
+------------+------------------+---------------------------------------------------------------------------------------------------------+
| GPT-5B | NeMo Framework | Available on request |
+------------+------------------+---------------------------------------------------------------------------------------------------------+
| LLama2-7B | Alibaba Pai | https://mp.weixin.qq.com/s/NQT0uKXLbXyh5031zBdeBQ |
+------------+------------------+---------------------------------------------------------------------------------------------------------+
| T5-11B | JAX/T5x | Available on request |
+------------+------------------+---------------------------------------------------------------------------------------------------------+
| MPT-13B | Mosaic Composer | https://www.databricks.com/blog/turbocharged-training-optimizing-databricks-mosaic-ai-stack-fp8 |
+------------+------------------+---------------------------------------------------------------------------------------------------------+
| GPT-22B | NeMo Framework | Available on request |
+------------+------------------+---------------------------------------------------------------------------------------------------------+
| LLama2-70B | Alibaba Pai | https://mp.weixin.qq.com/s/NQT0uKXLbXyh5031zBdeBQ |
+------------+------------------+---------------------------------------------------------------------------------------------------------+
| GPT-175B | JAX/Paxml | https://github.com/NVIDIA/JAX-Toolbox/tree/main/rosetta/rosetta/projects/pax#h100-results |
+------------+------------------+---------------------------------------------------------------------------------------------------------+
Integrations
============
Transformer Engine has been integrated with popular LLM frameworks such as:
* `DeepSpeed <https://github.com/microsoft/DeepSpeed/pull/3731>`_
* `Hugging Face Accelerate <https://github.com/huggingface/accelerate/releases/tag/v0.17.0>`_
* `Lightning <https://github.com/Lightning-AI/lightning/issues/17172>`_
* `MosaicML Composer <https://github.com/mosaicml/composer/releases/tag/v0.13.1>`_
* `NVIDIA JAX Toolbox <https://github.com/NVIDIA/JAX-Toolbox>`_
* `NVIDIA Megatron-LM <https://github.com/NVIDIA/Megatron-LM>`_
* `NVIDIA NeMo Framework <https://github.com/NVIDIA/NeMo-Megatron-Launcher>`_
* `Amazon SageMaker Model Parallel Library <https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-core-features-v2-tensor-parallelism.html>`_
* `Levanter <https://github.com/stanford-crfm/levanter>`_
* `GPT-NeoX <https://github.com/EleutherAI/gpt-neox>`_
* `Hugging Face Nanotron <https://github.com/huggingface/nanotron>`_ - Coming soon!
* `Colossal-AI <https://github.com/hpcaitech/ColossalAI>`_ - Coming soon!
* `PeriFlow <https://github.com/friendliai/periflow-python-sdk>`_ - Coming soon!
Contributing
============
We welcome contributions to Transformer Engine! To contribute to Transformer Engine and make pull requests,
follow the guidelines outlined in the `<CONTRIBUTING.rst>`_ guide.
Papers
======
* `Attention original paper <https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf>`_
* `Megatron-LM tensor parallel <https://arxiv.org/pdf/1909.08053.pdf>`_
* `Megatron-LM sequence parallel <https://arxiv.org/pdf/2205.05198.pdf>`_
* `FP8 Formats for Deep Learning <https://arxiv.org/abs/2209.05433>`_
Videos
======
* `What's New in Transformer Engine and FP8 Training | GTC 2024 <https://www.nvidia.com/en-us/on-demand/session/gtc24-s62457/>`_
* `FP8 Training with Transformer Engine | GTC 2023 <https://www.nvidia.com/en-us/on-demand/session/gtcspring23-s51393>`_
* `FP8 for Deep Learning | GTC 2023 <https://www.nvidia.com/en-us/on-demand/session/gtcspring23-s52166/>`_
* `Inside the Hopper Architecture <https://www.nvidia.com/en-us/on-demand/session/gtcspring22-s42663/>`_
.. |License| image:: https://img.shields.io/badge/License-Apache%202.0-blue.svg
:target: https://opensource.org/licenses/Apache-2.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment