Merge branch 'main' of https://github.com/NVIDIA/TransformerEngine

5b6ef054 · yuguo · 76060570 · a7eeb28b · 5b6ef054 · 5b6ef054
Commit 5b6ef054 authored Mar 17, 2025 by yuguo
20 changed files
--- a/.clang-format
+++ b/.clang-format
+---
+Language:        Cpp
+# BasedOnStyle:  Google
+AccessModifierOffset: -1
+AlignAfterOpenBracket: Align
+AlignArrayOfStructures: None
+AlignConsecutiveAssignments:
+  Enabled:         false
+  AcrossEmptyLines: false
+  AcrossComments:  false
+  AlignCompound:   false
+  AlignFunctionPointers: false
+  PadOperators:    true
+AlignConsecutiveBitFields:
+  Enabled:         false
+  AcrossEmptyLines: false
+  AcrossComments:  false
+  AlignCompound:   false
+  AlignFunctionPointers: false
+  PadOperators:    false
+AlignConsecutiveDeclarations:
+  Enabled:         false
+  AcrossEmptyLines: false
+  AcrossComments:  false
+  AlignCompound:   false
+  AlignFunctionPointers: false
+  PadOperators:    false
+AlignConsecutiveMacros:
+  Enabled:         false
+  AcrossEmptyLines: false
+  AcrossComments:  false
+  AlignCompound:   false
+  AlignFunctionPointers: false
+  PadOperators:    false
+AlignConsecutiveShortCaseStatements:
+  Enabled:         false
+  AcrossEmptyLines: false
+  AcrossComments:  false
+  AlignCaseColons: false
+AlignEscapedNewlines: Left
+AlignOperands:   Align
+AlignTrailingComments:
+  Kind:            Always
+  OverEmptyLines:  0
+AllowAllArgumentsOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowBreakBeforeNoexceptSpecifier: Never
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortCompoundRequirementOnASingleLine: true
+AllowShortEnumsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: WithoutElse
+AllowShortLambdasOnASingleLine: All
+AllowShortLoopsOnASingleLine: true
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: Yes
+AttributeMacros:
+  - __capability
+BinPackArguments: true
+BinPackParameters: true
+BitFieldColonSpacing: Both
+BraceWrapping:
+  AfterCaseLabel:  false
+  AfterClass:      false
+  AfterControlStatement: Never
+  AfterEnum:       false
+  AfterExternBlock: false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  BeforeCatch:     false
+  BeforeElse:      false
+  BeforeLambdaBody: false
+  BeforeWhile:     false
+  IndentBraces:    false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakAdjacentStringLiterals: true
+BreakAfterAttributes: Leave
+BreakAfterJavaFieldAnnotations: false
+BreakArrays:     true
+BreakBeforeBinaryOperators: None
+BreakBeforeConceptDeclarations: Always
+BreakBeforeBraces: Attach
+BreakBeforeInlineASMColon: OnlyMultiline
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: BeforeColon
+BreakInheritanceList: BeforeColon
+BreakStringLiterals: true
+ColumnLimit:     100
+CommentPragmas:  '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: true
+DisableFormat:   false
+EmptyLineAfterAccessModifier: Never
+EmptyLineBeforeAccessModifier: LogicalBlock
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IfMacros:
+  - KJ_IF_MAYBE
+IncludeBlocks:   Regroup
+IncludeCategories:
+  - Regex:           '^<ext/.*\.h>'
+    Priority:        2
+    SortPriority:    0
+    CaseSensitive:   false
+  - Regex:           '^<.*\.h>'
+    Priority:        1
+    SortPriority:    0
+    CaseSensitive:   false
+  - Regex:           '^<.*'
+    Priority:        2
+    SortPriority:    0
+    CaseSensitive:   false
+  - Regex:           '.*'
+    Priority:        3
+    SortPriority:    0
+    CaseSensitive:   false
+IncludeIsMainRegex: '([-_](test|unittest))?$'
+IncludeIsMainSourceRegex: ''
+IndentAccessModifiers: false
+IndentCaseBlocks: false
+IndentCaseLabels: true
+IndentExternBlock: AfterExternBlock
+IndentGotoLabels: true
+IndentPPDirectives: None
+IndentRequiresClause: true
+IndentWidth:     2
+IndentWrappedFunctionNames: false
+InsertBraces:    false
+InsertNewlineAtEOF: false
+InsertTrailingCommas: None
+IntegerLiteralSeparator:
+  Binary:          0
+  BinaryMinDigits: 0
+  Decimal:         0
+  DecimalMinDigits: 0
+  Hex:             0
+  HexMinDigits:    0
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+KeepEmptyLinesAtEOF: false
+LambdaBodyIndentation: Signature
+LineEnding:      DeriveLF
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Never
+ObjCBlockIndentWidth: 2
+ObjCBreakBeforeNestedBlockParam: true
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PackConstructorInitializers: NextLine
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakOpenParenthesis: 0
+PenaltyBreakScopeResolution: 500
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyIndentedWhitespace: 0
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+PPIndentWidth:   -1
+QualifierAlignment: Leave
+RawStringFormats:
+  - Language:        Cpp
+    Delimiters:
+      - cc
+      - CC
+      - cpp
+      - Cpp
+      - CPP
+      - 'c++'
+      - 'C++'
+    CanonicalDelimiter: ''
+    BasedOnStyle:    google
+  - Language:        TextProto
+    Delimiters:
+      - pb
+      - PB
+      - proto
+      - PROTO
+    EnclosingFunctions:
+      - EqualsProto
+      - EquivToProto
+      - PARSE_PARTIAL_TEXT_PROTO
+      - PARSE_TEST_PROTO
+      - PARSE_TEXT_PROTO
+      - ParseTextOrDie
+      - ParseTextProtoOrDie
+      - ParseTestProto
+      - ParsePartialTestProto
+    CanonicalDelimiter: pb
+    BasedOnStyle:    google
+ReferenceAlignment: Pointer
+ReflowComments:  false
+RemoveBracesLLVM: false
+RemoveParentheses: Leave
+RemoveSemicolon: false
+RequiresClausePosition: OwnLine
+RequiresExpressionIndentation: OuterScope
+SeparateDefinitionBlocks: Leave
+ShortNamespaceLines: 1
+SkipMacroDefinitionBody: false
+SortIncludes:    CaseSensitive
+SortJavaStaticImport: Before
+SortUsingDeclarations: LexicographicNumeric
+SpaceAfterCStyleCast: false
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceAroundPointerQualifiers: Default
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCaseColon: false
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeJsonColon: false
+SpaceBeforeParens: ControlStatements
+SpaceBeforeParensOptions:
+  AfterControlStatements: true
+  AfterForeachMacros: true
+  AfterFunctionDefinitionName: false
+  AfterFunctionDeclarationName: false
+  AfterIfMacros:   true
+  AfterOverloadedOperator: false
+  AfterPlacementOperator: true
+  AfterRequiresInClause: false
+  AfterRequiresInExpression: false
+  BeforeNonEmptyParentheses: false
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceBeforeSquareBrackets: false
+SpaceInEmptyBlock: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles:  Never
+SpacesInContainerLiterals: true
+SpacesInLineCommentPrefix:
+  Minimum:         1
+  Maximum:         -1
+SpacesInParens:  Never
+SpacesInParensOptions:
+  InCStyleCasts:   false
+  InConditionalStatements: false
+  InEmptyParentheses: false
+  Other:           false
+SpacesInSquareBrackets: false
+Standard:        Auto
+StatementAttributeLikeMacros:
+  - Q_EMIT
+StatementMacros:
+  - Q_UNUSED
+  - QT_REQUIRE_VERSION
+TabWidth:        8
+UseTab:          Never
+VerilogBreakBetweenInstancePorts: true
+WhitespaceSensitiveMacros:
+  - BOOST_PP_STRINGIZE
+  - CF_SWIFT_NAME
+  - NS_SWIFT_NAME
+  - PP_STRINGIZE
+  - STRINGIZE
+...
+
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
+# Description
+
+Please include a brief summary of the changes, relevant motivation and context.
+
+Fixes # (issue)
+
+## Type of change
+
+- [ ] Documentation change (change only to the documentation, either a fix or new content)
+- [ ] Bug fix (non-breaking change which fixes an issue)
+- [ ] New feature (non-breaking change which adds functionality)
+- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
+- [ ] Infra/Build change
+- [ ] Code refactoring
+
+## Changes
+
+Please list the changes introduced in this PR:
+
+- Change A
+- Change B
+
+# Checklist:
+
+- [ ] I have read and followed the [contributing guidelines](https://github.com/NVIDIA/TransformerEngine/blob/main/CONTRIBUTING.rst)
+- [ ] The functionality is complete
+- [ ] I have commented my code, particularly in hard-to-understand areas
+- [ ] I have made corresponding changes to the documentation
+- [ ] My changes generate no new warnings
+- [ ] I have added tests that prove my fix is effective or that my feature works
+- [ ] New and existing unit tests pass locally with my changes
--- a/.github/workflows/blossom-ci.yml
+++ b/.github/workflows/blossom-ci.yml
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+# A workflow to trigger ci on hybrid infra (github + self hosted runner)
+name: Blossom-CI
+on:
+  issue_comment:
+    types: [created]
+  workflow_dispatch:
+      inputs:
+          platform:
+            description: 'runs-on argument'
+            required: false
+          args:
+            description: 'argument'
+            required: false
+jobs:
+  Authorization:
+    name: Authorization
+    runs-on: blossom
+    outputs:
+      args: ${{ env.args }}
+
+    # This job only runs for pull request comments
+    if: >
+         github.event.comment.body == '/blossom-ci'
+         && (
+           github.actor == 'ptrendx'
+           || github.actor == 'ksivaman'
+         )
+    steps:
+      - name: Check if comment is issued by authorized person
+        run: blossom-ci
+        env:
+          OPERATION: 'AUTH'
+          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
+
+  Vulnerability-scan:
+    name: Vulnerability scan
+    needs: [Authorization]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+        with:
+          repository: ${{ fromJson(needs.Authorization.outputs.args).repo }}
+          ref: ${{ fromJson(needs.Authorization.outputs.args).ref }}
+          lfs: 'true'
+
+      - name: Run blossom action
+        uses: NVIDIA/blossom-action@main
+        env:
+          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
+        with:
+          args1: ${{ fromJson(needs.Authorization.outputs.args).args1 }}
+          args2: ${{ fromJson(needs.Authorization.outputs.args).args2 }}
+          args3: ${{ fromJson(needs.Authorization.outputs.args).args3 }}
+
+  Job-trigger:
+    name: Start ci job
+    needs: [Vulnerability-scan]
+    runs-on: blossom
+    steps:
+      - name: Start ci job
+        run: blossom-ci
+        env:
+          OPERATION: 'START-CI-JOB'
+          CI_SERVER: ${{ secrets.CI_SERVER }}
+          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+  Upload-Log:
+    name: Upload log
+    runs-on: blossom
+    if : github.event_name == 'workflow_dispatch'
+    steps:
+      - name: Jenkins log for pull request ${{ fromJson(github.event.inputs.args).pr }} (click here)
+        run: blossom-ci
+        env:
+          OPERATION: 'POST-PROCESSING'
+          CI_SERVER: ${{ secrets.CI_SERVER }}
+          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+# A workflow to trigger TE build on GitHub
+name: 'Build'
+on:
+  pull_request:
+  workflow_dispatch:
+jobs:
+  core:
+    name: 'Core'
+    runs-on: ubuntu-latest
+    container:
+      image: nvcr.io/nvidia/cuda:12.1.0-devel-ubuntu22.04
+      options: --user root
+    steps:
+      - name: 'Dependencies'
+        run: |
+          apt-get update
+          apt-get install -y git python3.9 pip ninja-build cudnn9-cuda-12
+          pip install cmake==3.21.0
+      - name: 'Checkout'
+        uses: actions/checkout@v3
+        with:
+          submodules: recursive
+      - name: 'Build'
+        run: pip install . -v
+        env:
+          NVTE_FRAMEWORK: none
+          MAX_JOBS: 1
+      - name: 'Sanity check'
+        run: python3 -c "import transformer_engine"
+        working-directory: /
+  pytorch:
+    name: 'PyTorch'
+    runs-on: ubuntu-latest
+    container:
+      image: nvcr.io/nvidia/cuda:12.8.0-devel-ubuntu22.04
+      options: --user root
+    steps:
+      - name: 'Dependencies'
+        run: |
+          apt-get update
+          apt-get install -y git python3.9 pip ninja-build cudnn9-cuda-12
+          # Quote the version specifier: an unquoted ">" is a shell redirection, which
+          # drops the ">=1.0" constraint and sends pip's output to a file named "=1.0".
+          pip install cmake torch pydantic "importlib-metadata>=1.0" packaging pybind11
+      - name: 'Checkout'
+        uses: actions/checkout@v3
+        with:
+          submodules: recursive
+      - name: 'Build'
+        run: pip install . -v --no-deps
+        env:
+          NVTE_FRAMEWORK: pytorch
+          MAX_JOBS: 1
+      - name: 'Sanity check'
+        if: false  # Sanity import test requires Flash Attention
+        run: python3 tests/pytorch/test_sanity_import.py
+  jax:
+    name: 'JAX'
+    runs-on: ubuntu-latest
+    container:
+      image: ghcr.io/nvidia/jax:jax
+      options: --user root
+    steps:
+      - name: 'Checkout'
+        uses: actions/checkout@v3
+        with:
+          submodules: recursive
+      - name: 'Build'
+        run: pip install . -v
+        env:
+          NVTE_FRAMEWORK: jax
+          MAX_JOBS: 1
+      - name: 'Sanity check'
+        run: python tests/jax/test_sanity_import.py
--- a/.github/workflows/deploy_nightly_docs.yml
+++ b/.github/workflows/deploy_nightly_docs.yml
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+# A workflow to deploy the nightly version of TE documentation to GitHub Pages
+name: Deploy nightly docs
+on:
+  push:
+    branches: [ "main" ]
+jobs:
+  build:
+    uses: ./.github/workflows/docs.yml
+
+  prepare:
+    needs: build
+    runs-on: ubuntu-latest
+    steps:
+      - name: Download artifact
+        uses: actions/download-artifact@v4
+        with:
+            name: "te_docs"
+            path: "html"
+      - name: Prepare for pages
+        uses: actions/upload-pages-artifact@v1.0.7
+        with:
+          name: github-pages
+          path: "html"
+  # Publishes the previously uploaded "github-pages" artifact to GitHub Pages.
+  deploy:
+    needs: prepare
+    environment:
+      name: github-pages
+      # Resolved from the output of the step whose id is "deployment" (see below).
+      url: ${{ steps.deployment.outputs.page_url }}
+    permissions:
+      pages: write
+      id-token: write
+    runs-on: ubuntu-latest
+    steps:
+    - name: Deploy
+      # The id is required; without it steps.deployment.outputs.page_url is empty.
+      id: deployment
+      uses: actions/deploy-pages@v2.0.0
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+# A workflow to trigger the build of TE documentation on GitHub
+name: 'Documentation'
+on:
+  pull_request:
+  workflow_dispatch:
+  workflow_call:
+jobs:
+  build_docs:
+    name: 'Build'
+    runs-on: ubuntu-latest
+    steps:
+      - name: 'Checkout'
+        uses: actions/checkout@v3
+      - name: 'Install dependencies'
+        run: |
+          pip install sphinx==8.1.3 sphinx_rtd_theme==3.0.1 nbsphinx==0.9.5 IPython ipython_genutils==0.2.0 ipywidgets==8.0.2 astroid==3.3.2
+          pip install breathe==4.35.0 sphinx-autoapi==3.3.2
+          sudo apt-get install -y pandoc graphviz doxygen
+          # Each step runs in its own shell, so a plain `export` is gone before the
+          # 'Build docs' step starts. Persist the value through $GITHUB_ENV instead.
+          # `git rev-parse HEAD` always prints exactly one hash, also on a detached HEAD.
+          # NOTE(review): presumably the docs build reads GIT_SHA - confirm in docs/conf.py.
+          echo "GIT_SHA=$(git rev-parse HEAD)" >> "$GITHUB_ENV"
+      - name: 'Build docs'
+        run: |
+          doxygen docs/Doxyfile
+          cd docs
+          make html
+      - name: 'Upload docs'
+        uses: actions/upload-artifact@v4
+        with:
+          name: te_docs
+          path: docs/_build/html
+          retention-days: 7
--- a/.github/workflows/license.yml
+++ b/.github/workflows/license.yml
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+# A workflow to trigger the TE license check on GitHub
+name: 'License'
+on:
+  pull_request:
+  workflow_dispatch:
+jobs:
+  check:
+    name: 'Check'
+    runs-on: ubuntu-latest
+    steps:
+      - name: 'Checkout'
+        uses: actions/checkout@v3
+      - name: 'Check License'
+        run: |
+          export TE_PATH=.
+          bash ./qa/L0_license/test.sh
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+# A workflow to trigger lint tests on GitHub
+name: 'Lint'
+on:
+  pull_request:
+  workflow_dispatch:
+jobs:
+  pytorch_cpplint:
+    name: 'PyTorch C++'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+      - name: 'Lint'
+        run: |
+          sudo apt-get update
+          sudo apt-get install pip -y
+          export CPP_ONLY=1
+          export TE_PATH=.
+          bash ./qa/L0_pytorch_lint/test.sh
+  pytorch_pylint:
+    name: 'PyTorch Python'
+    runs-on: ubuntu-latest
+    steps:
+      - name: 'Checkout'
+        uses: actions/checkout@v3
+      - name: 'Lint'
+        run: |
+          sudo apt-get update
+          sudo apt-get install pip -y
+          pip install torch numpy
+          export PYTHON_ONLY=1
+          export TE_PATH=.
+          bash ./qa/L0_pytorch_lint/test.sh
+  jax_cpplint:
+    name: 'JAX C++'
+    runs-on: ubuntu-latest
+    steps:
+      - name: 'Checkout'
+        uses: actions/checkout@v3
+      - name: 'Lint'
+        run: |
+          sudo apt-get update
+          sudo apt-get install pip -y
+          export CPP_ONLY=1
+          export TE_PATH=.
+          bash ./qa/L0_jax_lint/test.sh
+  jax_pylint:
+    name: 'JAX Python'
+    runs-on: ubuntu-latest
+    steps:
+      - name: 'Checkout'
+        uses: actions/checkout@v3
+      - name: 'Lint'
+        run: |
+          sudo apt-get update
+          sudo apt-get install pip -y
+          export PYTHON_ONLY=1
+          export TE_PATH=.
+          bash ./qa/L0_jax_lint/test.sh
--- a/.github/workflows/trigger-ci.yml
+++ b/.github/workflows/trigger-ci.yml
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+# A workflow to trigger ci on hybrid infra (github + self hosted runner)
+name: TE-CI Trigger
+on:
+  issue_comment:
+    types: [created]
+jobs:
+  Authorization:
+    name: Authorization
+    runs-on: blossom
+    outputs:
+      args: ${{ env.args }}
+
+    # This job only runs for pull request comments
+    if: >
+         startsWith(github.event.comment.body, '/te-ci')
+         && (
+           github.actor == 'ptrendx'
+           || github.actor == 'ksivaman'
+           || github.actor == 'schetlur-nv'
+           || github.actor == 'timmoon10'
+           || github.actor == 'zlsh80826'
+           || github.actor == 'mingxu1067'
+           || github.actor == 'cyanguwa'
+           || github.actor == 'nzmora-nvidia'
+           || github.actor == 'galagam'
+           || github.actor == 'nouiz'
+           || github.actor == 'denera'
+           || github.actor == 'sudhakarsingh27'
+           || github.actor == 'Oleg-Goncharov'
+           || github.actor == 'phu0ngng'
+           || github.actor == 'xrennvidia'
+           || github.actor == 'yaox12'
+           || github.actor == 'huanghua1994'
+           || github.actor == 'mgoldfarb-nvidia'
+           || github.actor == 'pggPL'
+           || github.actor == 'vasunvidia'
+           || github.actor == 'erhoo82'
+           || github.actor == 'kocchop'
+           || github.actor == 'youngeunkwon0405'
+           || github.actor == 'KshitijLakhani'
+           || github.actor == 'jberchtold-nvidia'
+           || github.actor == 'sanandaraj5597'
+           || github.actor == 'negvet'
+         )
+    steps:
+      - name: Check if comment is issued by authorized person
+        run: blossom-ci
+        env:
+          OPERATION: 'AUTH'
+          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
+
+  Vulnerability-scan:
+    name: Vulnerability scan
+    needs: [Authorization]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+        with:
+          repository: ${{ fromJson(needs.Authorization.outputs.args).repo }}
+          ref: ${{ fromJson(needs.Authorization.outputs.args).ref }}
+          lfs: 'true'
+
+      - name: Run blossom action
+        uses: NVIDIA/blossom-action@main
+        env:
+          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
+        with:
+          args1: ${{ fromJson(needs.Authorization.outputs.args).args1 }}
+          args2: ${{ fromJson(needs.Authorization.outputs.args).args2 }}
+          args3: ${{ fromJson(needs.Authorization.outputs.args).args3 }}
+
+  Job-trigger:
+    name: Start ci job
+    needs: [Vulnerability-scan]
+    runs-on: blossom
+    steps:
+      - name: Start ci job
+        run: blossom-ci
+        env:
+          OPERATION: 'START-CI-JOB'
+          CI_SERVER: ${{ secrets.CI_SERVER }}
+          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/upload-ci-logs.yml
+++ b/.github/workflows/upload-ci-logs.yml
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+# A workflow to upload CI logs from the self hosted runner and update the commit status
+name: TE-CI Logs
+on:
+  workflow_dispatch:
+      inputs:
+          platform:
+            description: 'runs-on argument'
+            required: false
+          args:
+            description: 'argument'
+            required: false
+          job_name:
+            description: 'name of the job'
+            required: true
+          commit_sha:
+            description: 'SHA of the commit that was tested.'
+            required: true
+          result:
+            description: 'Job result'
+            required: true
+run-name: PR ${{ fromJson(github.event.inputs.args).pr }} - ${{ inputs.job_name }}
+jobs:
+  Upload-Log:
+    name: Upload log
+    runs-on: blossom
+    steps:
+      - name: Log
+        run: blossom-ci
+        env:
+          OPERATION: 'POST-PROCESSING'
+          CI_SERVER: ${{ secrets.CI_SERVER }}
+          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  status_update:
+    name: Update commit status
+    runs-on: ubuntu-latest
+    permissions:
+      statuses: write
+    needs: [Upload-Log]
+    if: ${{ always() }}
+    steps:
+      - name: Set status
+        run: |
+          curl \
+          -X POST \
+          -H "Accept: application/vnd.github+json" \
+          -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
+          https://api.github.com/repos/${{ github.repository }}/statuses/${{ inputs.commit_sha }} \
+          -d "{\"state\":\"${{ inputs.result }}\",\"target_url\":\"${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}\",\"description\":\"\",\"context\":\"te-ci/${{ inputs.job_name }}\"}"
--- a/.gitignore
+++ b/.gitignore
+*.o
+*.swp
+*.ii
+*.ptx
+*.cubin
+*.fatbin*
+*.module_id
+*.nsys-rep
+*.ncu-rep
+*.sqlite
+*.eggs
+build/
+*.so
+*.egg-info
+__pycache__
+.ycm_extra_conf.py
+.vimrc
+.vs
+.vscode
+.cache
+.hypothesis
+.devcontainer.json
+tests/cpp/build/
+.ipynb_checkpoints
+*.log
+CMakeFiles/CMakeSystem.cmake
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+develop-eggs/
+dist/
+downloads/
+.pytest_cache/
+compile_commands.json
+.nfs
+tensor_dumps/
\ No newline at end of file
--- a/.gitmodules
+++ b/.gitmodules
+[submodule "3rdparty/googletest"]
+	path = 3rdparty/googletest
+	url = https://github.com/google/googletest.git
+[submodule "3rdparty/cudnn-frontend"]
+	path = 3rdparty/cudnn-frontend
+	url = https://github.com/NVIDIA/cudnn-frontend.git
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+default_language_version:
+  python: python3
+
+ci:
+  autofix_prs: true
+  autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
+  autoupdate_schedule: quarterly
+  submodules: false
+  skip: []
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.6.0
+    hooks:
+      - id: check-merge-conflict
+      - id: check-added-large-files
+      # The dot before the extension group is escaped so that only real source
+      # extensions match; an unescaped "." would also match names such as
+      # "Makefile.inc" or "run.sh", which merely end in one of the listed letters.
+      - id: end-of-file-fixer
+        files: .*\.(c|cc|cxx|cpp|cu|cuh|h|hpp|py)$
+      - id: trailing-whitespace
+        files: .*\.(c|cc|cxx|cpp|cu|cuh|h|hpp|py)$
+
+  - repo: https://github.com/psf/black
+    rev: 24.4.2
+    hooks:
+      - id: black
+        name: Format python code
+        args: [--line-length=100, --preview, --enable-unstable-feature=string_processing]
+        types: [python]
+
+  - repo: https://github.com/pre-commit/mirrors-clang-format
+    rev: v18.1.6
+    hooks:
+      - id: clang-format
+        entry: clang-format -i
+        args: ["-style=file"]
+        files: ^transformer_engine.*\.(c|cc|cxx|cpp|cu|cuh|h|hpp)$
--- a/cudnn-frontend @ 20c28ea7
+++ b/cudnn-frontend @ 20c28ea7
+Subproject commit 20c28ea798fe99e31d7274e009ee2fbf0e88abfd
--- a/googletest @ f8d7d77c
+++ b/googletest @ f8d7d77c
+Subproject commit f8d7d77c06936315286eb55f8de22cd23c188571
--- a/Acknowledgements.txt
+++ b/Acknowledgements.txt
+This software includes third-party components under the following licenses:
+
+========================
+GoogleTest
+
+Copyright 2008, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+    * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+========================
+pybind11
+
+Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+   may be used to endorse or promote products derived from this software
+   without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Please also refer to the file CONTRIBUTING.md, which clarifies licensing of
+external contributions to this project including patches, pull requests, etc.
+
+========================
+PyTorch
+
+Copyright (c) 2016-     Facebook, Inc            (Adam Paszke)
+Copyright (c) 2014-     Facebook, Inc            (Soumith Chintala)
+Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
+Copyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)
+Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
+Copyright (c) 2011-2013 NYU                      (Clement Farabet)
+Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
+Copyright (c) 2006      Idiap Research Institute (Samy Bengio)
+Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
+   and IDIAP Research Institute nor the names of its contributors may be
+   used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+========================
+FlashAttn
+
+Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file.
+All rights reserved.
+
+All contributions by Nvidia:
+Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+========================
+cudnn-frontend
+
+Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
+..
+    Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+    See LICENSE for license information.
+
+Contribution Rules
+==================
+
+.. role:: bash(code)
+   :language: bash
+
+Coding Guidelines
+-----------------
+
+* We follow `Google C++ Style Guide <https://google.github.io/styleguide/cppguide.html>`_. When no
+  rules can be found, follow the already occurring conventions. If there is no precedent in our
+  codebase we are open to discussion.
+* Prior to your contribution, please make sure that the code passes the linter check. We do both C++
+  and Python linting. To invoke the check, please use
+
+  .. code-block:: bash
+
+    TE_PATH=<path to TE source> bash qa/L0_<framework>_lint/test.sh
+
+* Avoid introducing unnecessary complexity into existing code so that maintainability and
+  readability are preserved.
+* Try to keep pull requests (PRs) as concise as possible:
+
+  - Avoid committing commented-out code.
+  - Wherever possible, each PR should address a single concern. If there are several
+    otherwise-unrelated things that should be fixed to reach a desired endpoint, our recommendation
+    is to open several PRs and indicate the dependencies in the description. The more complex the
+    changes are in a single PR, the more time it will take to review those changes.
+
+* Write PR and commit titles using imperative mood.
+
+  - Format commit messages sticking to rules described in
+    `this <https://chris.beams.io/posts/git-commit/>`_ guide.
+
+* Make sure all `L0_*` tests pass:
+
+  - In the :bash:`qa/` directory, there are basic sanity tests scripted in directories named
+    `L0_...`. A given test can be executed by running the :bash:`./test.sh` command in each test
+    directory. The :bash:`test.sh` script assumes that the TE source can be found under
+    :bash:`/opt/transformerengine`. This assumption can be changed by setting :bash:`TE_PATH`
+    environment variable to the directory containing TE source.
+  - One of the tests, `L0_license`, checks for valid NVIDIA copyright and license text. If you
+    create a new file and do not want to assign its copyright to NVIDIA, please add the file to the
+    `exclude_copyright` list in :bash:`qa/L0_license/config.json`.
+
+* Transformer Engine's default build assumes recent versions of TE's dependencies (CUDA toolkit
+  etc.). Contributions that add compatibility with older versions of those dependencies will be
+  considered, but NVIDIA cannot guarantee that all possible build configurations work, are not
+  broken by future contributions, and retain highest performance.
+* Make sure that you can contribute your work to open source (no license and/or patent conflict is
+  introduced by your code). You need to `Sign Your Work`_.
+* Thanks in advance for your patience as we review your contributions; we do appreciate them!
+
+Sign Your Work
+--------------
+
+* We require that all contributors "sign-off" on their commits. This certifies that the contribution
+  is your original work, or you have rights to submit it under the same license, or a compatible
+  license.
+
+  * Any contribution which contains commits that are not Signed-Off will not be accepted.
+
+* To sign off on a commit you simply use the `--signoff` (or `-s`) option when committing your changes:
+
+  .. code-block:: bash
+
+    $ git commit -s -m "Add cool feature."
+
+  This will append the following to your commit message:
+
+  .. code-block:: text
+
+    Signed-off-by: Your Name <your@email.com>
+
+* Full text of the DCO:
+
+  .. code-block:: text
+
+    Developer Certificate of Origin
+    Version 1.1
+
+    Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
+
+    Everyone is permitted to copy and distribute verbatim copies of this
+    license document, but changing it is not allowed.
+
+
+    Developer's Certificate of Origin 1.1
+
+    By making a contribution to this project, I certify that:
+
+    (a) The contribution was created in whole or in part by me and I
+        have the right to submit it under the open source license
+        indicated in the file; or
+
+    (b) The contribution is based upon previous work that, to the best
+        of my knowledge, is covered under an appropriate open source
+        license and I have the right under that license to submit that
+        work with modifications, whether created in whole or in part
+        by me, under the same open source license (unless I am
+        permitted to submit under a different license), as indicated
+        in the file; or
+
+    (c) The contribution was provided directly to me by some other
+        person who certified (a), (b) or (c) and I have not modified
+        it.
+
+    (d) I understand and agree that this project and the contribution
+        are public and that a record of the contribution (including all
+        personal information I submit with it, including my sign-off) is
+        maintained indefinitely and may be redistributed consistent with
+        this project or the open source license(s) involved.
--- a/CPPLINT.cfg
+++ b/CPPLINT.cfg
+# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+# Stop searching for additional config files.
+set noparent
+
+# Limit line length.
+linelength=100
+
+# Ignore the following errors.
+filter=-build/include_subdir
+filter=-build/namespaces
+filter=-readability/todo
+filter=-build/header_guard
+filter=-build/include
+filter=-build/c++11
+filter=-runtime/references
+filter=-whitespace
--- a/LICENSE
+++ b/LICENSE
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
--- a/README.rst
+++ b/README.rst
+..
+    Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+    See LICENSE for license information.
+
+|License|
+
+Transformer Engine
+==================
+
+`Quickstart <#examples>`_ | `Installation <#installation>`_ | `User Guide <https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/index.html>`_ | `Examples <https://github.com/NVIDIA/TransformerEngine/tree/main/examples>`_ | `FP8 Convergence <#fp8-convergence>`_ | `Integrations <#integrations>`_ | `Release notes <https://docs.nvidia.com/deeplearning/transformer-engine/release-notes/index.html>`_
+
+Latest News
+===========
+
+* [03/2024] `Turbocharged Training: Optimizing the Databricks Mosaic AI stack with FP8 <https://www.databricks.com/blog/turbocharged-training-optimizing-databricks-mosaic-ai-stack-fp8>`_
+* [03/2024] `FP8 Training Support in SageMaker Model Parallelism Library <https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-release-notes.html>`_
+* [12/2023] `New NVIDIA NeMo Framework Features and NVIDIA H200 <https://developer.nvidia.com/blog/new-nvidia-nemo-framework-features-and-nvidia-h200-supercharge-llm-training-performance-and-versatility/>`_
+
+.. image:: docs/examples/H200-NeMo-performance.png
+  :width: 600
+  :alt: H200
+
+* [11/2023] `Inflection-2: The Next Step Up <https://inflection.ai/inflection-2>`_
+* [11/2023] `Unleashing The Power Of Transformers With NVIDIA Transformer Engine <https://lambdalabs.com/blog/unleashing-the-power-of-transformers-with-nvidia-transformer-engine>`_
+* [11/2023] `Accelerating PyTorch Training Workloads with FP8 <https://towardsdatascience.com/accelerating-pytorch-training-workloads-with-fp8-5a5123aec7d7>`_
+* [09/2023] `Transformer Engine added to AWS DL Container for PyTorch Training <https://github.com/aws/deep-learning-containers/pull/3315>`_
+* [06/2023] `Breaking MLPerf Training Records with NVIDIA H100 GPUs <https://developer.nvidia.com/blog/breaking-mlperf-training-records-with-nvidia-h100-gpus/>`_
+* [04/2023] `Benchmarking Large Language Models on NVIDIA H100 GPUs with CoreWeave (Part 1) <https://www.mosaicml.com/blog/coreweave-nvidia-h100-part-1>`_
+
+What is Transformer Engine?
+===========================
+.. overview-begin-marker-do-not-remove
+
+Transformer Engine (TE) is a library for accelerating Transformer models on NVIDIA GPUs, including
+using 8-bit floating point (FP8) precision on Hopper, Ada, and Blackwell GPUs, to provide better
+performance with lower memory utilization in both training and inference. TE provides a collection
+of highly optimized building blocks for popular Transformer architectures and an automatic mixed
+precision-like API that can be used seamlessly with your framework-specific code. TE also includes a
+framework-agnostic C++ API that can be integrated with other deep learning libraries to enable FP8
+support for Transformers.
+
+As the number of parameters in Transformer models continues to grow, training and inference for
+architectures such as BERT, GPT and T5 become very memory and compute-intensive. Most deep learning
+frameworks train with FP32 by default. This is not essential, however, to achieve full accuracy for
+many deep learning models. Using mixed-precision training, which combines single-precision (FP32)
+with lower precision (e.g. FP16) format when training a model, results in significant speedups with
+minimal differences in accuracy as compared to FP32 training. With the Hopper GPU
+architecture, FP8 precision was introduced, which offers improved performance over FP16 with no
+degradation in accuracy. Although all major deep learning frameworks support FP16, FP8 support is
+not available natively in frameworks today.
+
+TE addresses the problem of FP8 support by providing APIs that integrate with popular Large Language
+Model (LLM) libraries. It provides a Python API consisting of modules to easily build a Transformer
+layer as well as a framework-agnostic library in C++ including structs and kernels needed for FP8
+support. Modules provided by TE internally maintain scaling factors and other values needed for FP8
+training, greatly simplifying mixed precision training for users.
+
+Highlights
+==========
+
+* Easy-to-use modules for building Transformer layers with FP8 support
+* Optimizations (e.g. fused kernels) for Transformer models
+* Support for FP8 on NVIDIA Hopper, Ada, and Blackwell GPUs
+* Support for optimizations across all precisions (FP16, BF16) on NVIDIA Ampere GPU architecture generations and later
+
+Examples
+========
+
+PyTorch
+^^^^^^^
+
+.. code-block:: python
+
+  import torch
+  import transformer_engine.pytorch as te
+  from transformer_engine.common import recipe
+
+  # Set dimensions.
+  in_features = 768
+  out_features = 3072
+  hidden_size = 2048
+
+  # Initialize model and inputs.
+  model = te.Linear(in_features, out_features, bias=True)
+  inp = torch.randn(hidden_size, in_features, device="cuda")
+
+  # Create an FP8 recipe. Note: All input args are optional.
+  fp8_recipe = recipe.DelayedScaling(margin=0, fp8_format=recipe.Format.E4M3)
+
+  # Enable autocasting for the forward pass
+  with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
+      out = model(inp)
+
+  loss = out.sum()
+  loss.backward()
+
+
+JAX
+^^^
+
+Flax
+~~~~
+
+.. code-block:: python
+
+  import flax
+  import jax
+  import jax.numpy as jnp
+  import transformer_engine.jax as te
+  import transformer_engine.jax.flax as te_flax
+  from transformer_engine.common import recipe
+
+  BATCH = 32
+  SEQLEN = 128
+  HIDDEN = 1024
+
+  # Initialize RNG and inputs.
+  rng = jax.random.PRNGKey(0)
+  init_rng, data_rng = jax.random.split(rng)
+  inp = jax.random.normal(data_rng, [BATCH, SEQLEN, HIDDEN], jnp.float32)
+
+  # Create an FP8 recipe. Note: All input args are optional.
+  fp8_recipe = recipe.DelayedScaling(margin=0, fp8_format=recipe.Format.HYBRID)
+
+  # Enable autocasting for the forward pass
+  with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
+      model = te_flax.DenseGeneral(features=HIDDEN)
+
+      def loss_fn(params, other_vars, inp):
+        out = model.apply({'params':params, **other_vars}, inp)
+        return jnp.mean(out)
+
+      # Initialize models.
+      variables = model.init(init_rng, inp)
+      other_variables, params = flax.core.pop(variables, 'params')
+
+      # Construct the forward and backward function
+      fwd_bwd_fn = jax.value_and_grad(loss_fn, argnums=(0, 1))
+
+      for _ in range(10):
+        loss, (param_grads, other_grads) = fwd_bwd_fn(params, other_variables, inp)
+
+.. overview-end-marker-do-not-remove
+
+Installation
+============
+.. installation
+
+Pre-requisites
+^^^^^^^^^^^^^^^^^^^^
+* Linux x86_64
+* CUDA 12.1+ (CUDA 12.8+ for Blackwell)
+* NVIDIA Driver supporting CUDA 12.1 or later
+* cuDNN 9.3 or later
+
+Docker
+^^^^^^^^^^^^^^^^^^^^
+
+The quickest way to get started with Transformer Engine is by using Docker images on
+`NVIDIA GPU Cloud (NGC) Catalog <https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch>`_.
+For example, to use the NGC PyTorch container interactively,
+
+.. code-block:: bash
+
+    docker run --gpus all -it --rm nvcr.io/nvidia/pytorch:25.01-py3
+
+Here, 25.01 (corresponding to the January 2025 release) is the container version.
+
+pip
+^^^^^^^^^^^^^^^^^^^^
+To install the latest stable version of Transformer Engine,
+
+.. code-block:: bash
+
+    pip3 install git+https://github.com/NVIDIA/TransformerEngine.git@stable
+
+This will automatically detect if any supported deep learning frameworks are installed and build
+Transformer Engine support for them. To explicitly specify frameworks, set the environment variable
+NVTE_FRAMEWORK to a comma-separated list (e.g. NVTE_FRAMEWORK=jax,pytorch).
+
+Alternatively, the package can be directly installed from
+`Transformer Engine's PyPI <https://pypi.org/project/transformer-engine/>`_, e.g.
+
+.. code-block:: bash
+
+    pip3 install transformer_engine[pytorch]
+
+To obtain the necessary Python bindings for Transformer Engine, the frameworks needed must be
+explicitly specified as extra dependencies in a comma-separated list (e.g. [jax,pytorch]).
+Transformer Engine ships wheels for the core library. Source distributions are shipped for the JAX
+and PyTorch extensions.
+
+From source
+^^^^^^^^^^^
+`See the installation guide <https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html#installation-from-source>`_.
+
+Compiling with FlashAttention-2
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Transformer Engine release v0.11.0 added support for FlashAttention-2 in PyTorch for improved performance.
+
+It is a known issue that FlashAttention-2 compilation is resource-intensive and requires a large amount of RAM (see `bug <https://github.com/Dao-AILab/flash-attention/issues/358>`_), which may lead to out-of-memory errors during the installation of Transformer Engine. Please try setting **MAX_JOBS=1** in the environment to circumvent the issue.
+
+Note that NGC PyTorch 23.08+ containers include FlashAttention-2.
+
+Breaking Changes
+================
+
+v1.7: Padding mask definition for PyTorch
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+In an effort to unify the definition and usage of the attention mask across all three frameworks in Transformer Engine, the padding mask has changed from `True` meaning inclusion of the corresponding position in attention to exclusion of that position in our PyTorch implementation. Since v1.7, all attention mask types follow the same definition where `True` means masking out the corresponding position and `False` means including that position in attention calculation.
+
+An example of this change is,
+
+.. code-block:: bash
+
+    # for a batch of 3 sequences where `a`s, `b`s and `c`s are the useful tokens
+    # and `0`s are the padding tokens,
+    [a, a, a, 0, 0,
+     b, b, 0, 0, 0,
+     c, c, c, c, 0]
+    # the padding mask for this batch before v1.7 is,
+    [ True,  True,  True, False, False,
+      True,  True, False, False, False,
+      True,  True,  True,  True, False]
+    # and for v1.7 onwards it should be,
+    [False, False, False,  True,  True,
+     False, False,  True,  True,  True,
+     False, False, False, False,  True]
+
+FP8 Convergence
+===============
+
+FP8 has been tested extensively across different model architectures and configurations and we found **no significant difference** between FP8 and BF16 training loss curves. FP8 has also been validated for accuracy on downstream LLM tasks (e.g. LAMBADA and WikiText). Below are examples of models tested for convergence across different frameworks.
+
+------------+------------------+---------------------------------------------------------------------------------------------------------+
+| Model      | Framework        | Source                                                                                                  |
+============+==================+=========================================================================================================+
+| T5-770M    |  JAX/T5x         | https://github.com/NVIDIA/JAX-Toolbox/tree/main/rosetta/rosetta/projects/t5x#convergence-and-performance|
+------------+------------------+---------------------------------------------------------------------------------------------------------+
+| MPT-1.3B   |  Mosaic Composer | https://www.mosaicml.com/blog/coreweave-nvidia-h100-part-1                                              |
+------------+------------------+---------------------------------------------------------------------------------------------------------+
+| GPT-5B     |  JAX/Paxml       | https://github.com/NVIDIA/JAX-Toolbox/tree/main/rosetta/rosetta/projects/pax#h100-results               |
+------------+------------------+---------------------------------------------------------------------------------------------------------+
+| GPT-5B     |  NeMo Framework  | Available on request                                                                                    |
+------------+------------------+---------------------------------------------------------------------------------------------------------+
+| LLama2-7B  |  Alibaba Pai     | https://mp.weixin.qq.com/s/NQT0uKXLbXyh5031zBdeBQ                                                       |
+------------+------------------+---------------------------------------------------------------------------------------------------------+
+| T5-11B     |  JAX/T5x         | Available on request                                                                                    |
+------------+------------------+---------------------------------------------------------------------------------------------------------+
+| MPT-13B    |  Mosaic Composer | https://www.databricks.com/blog/turbocharged-training-optimizing-databricks-mosaic-ai-stack-fp8         |
+------------+------------------+---------------------------------------------------------------------------------------------------------+
+| GPT-22B    |  NeMo Framework  | Available on request                                                                                    |
+------------+------------------+---------------------------------------------------------------------------------------------------------+
+| LLama2-70B |  Alibaba Pai     | https://mp.weixin.qq.com/s/NQT0uKXLbXyh5031zBdeBQ                                                       |
+------------+------------------+---------------------------------------------------------------------------------------------------------+
+| GPT-175B   |  JAX/Paxml       | https://github.com/NVIDIA/JAX-Toolbox/tree/main/rosetta/rosetta/projects/pax#h100-results               |
+------------+------------------+---------------------------------------------------------------------------------------------------------+
+
+Integrations
+============
+
+Transformer Engine has been integrated with popular LLM frameworks such as:
+
+* `DeepSpeed <https://github.com/microsoft/DeepSpeed/pull/3731>`_
+* `Hugging Face Accelerate <https://github.com/huggingface/accelerate/releases/tag/v0.17.0>`_
+* `Lightning <https://github.com/Lightning-AI/lightning/issues/17172>`_
+* `MosaicML Composer <https://github.com/mosaicml/composer/releases/tag/v0.13.1>`_
+* `NVIDIA JAX Toolbox <https://github.com/NVIDIA/JAX-Toolbox>`_
+* `NVIDIA Megatron-LM <https://github.com/NVIDIA/Megatron-LM>`_
+* `NVIDIA NeMo Framework <https://github.com/NVIDIA/NeMo-Megatron-Launcher>`_
+* `Amazon SageMaker Model Parallel Library <https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-core-features-v2-tensor-parallelism.html>`_
+* `Levanter <https://github.com/stanford-crfm/levanter>`_
+* `GPT-NeoX <https://github.com/EleutherAI/gpt-neox>`_
+* `Hugging Face Nanotron <https://github.com/huggingface/nanotron>`_ - Coming soon!
+* `Colossal-AI <https://github.com/hpcaitech/ColossalAI>`_ - Coming soon!
+* `PeriFlow <https://github.com/friendliai/periflow-python-sdk>`_ - Coming soon!
+
+
+Contributing
+============
+
+We welcome contributions to Transformer Engine! Before opening a pull request,
+please follow the guidelines outlined in the `<CONTRIBUTING.rst>`_ guide.
+
+Papers
+======
+
+* `Attention Is All You Need (original Transformer paper) <https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf>`_
+* `Megatron-LM tensor parallelism <https://arxiv.org/pdf/1909.08053.pdf>`_
+* `Megatron-LM sequence parallelism <https://arxiv.org/pdf/2205.05198.pdf>`_
+* `FP8 Formats for Deep Learning <https://arxiv.org/abs/2209.05433>`_
+
+Videos
+======
+
+* `What's New in Transformer Engine and FP8 Training | GTC 2024 <https://www.nvidia.com/en-us/on-demand/session/gtc24-s62457/>`_
+* `FP8 Training with Transformer Engine | GTC 2023 <https://www.nvidia.com/en-us/on-demand/session/gtcspring23-s51393>`_
+* `FP8 for Deep Learning | GTC 2023 <https://www.nvidia.com/en-us/on-demand/session/gtcspring23-s52166/>`_
+* `Inside the Hopper Architecture <https://www.nvidia.com/en-us/on-demand/session/gtcspring22-s42663/>`_
+
+.. |License| image:: https://img.shields.io/badge/License-Apache%202.0-blue.svg
+   :target: https://opensource.org/licenses/Apache-2.0