Commit 1106877d authored by jerrrrry's avatar jerrrrry
Browse files

“13.0”

parents
Pipeline #2934 failed with stages
in 0 seconds
__pycache__
*.bak
*.log
[submodule "Megatron-LM"]
path = Megatron-LM
url = https://github.com/NVIDIA/Megatron-LM.git
branch = main
[submodule]
Megatron-LM = main
[flake8]
max-line-length = 100
extend-ignore = E203,E501,F401,E402,E714
per-file-ignores = __init__.py:F401
\ No newline at end of file
---
name: BUG
about: Report a bug that needs attention
title: "[BUG]"
labels: ''
assignees: ''
---
**Describe the bug**
A clear and concise description of what the bug is.
**To Reproduce**
Steps to reproduce the behavior. The easier it is to reproduce the faster it will get maintainer attention.
**Expected behavior**
A clear and concise description of what you expected to happen.
**Stack trace/logs**
If applicable, add the stack trace or logs from the time of the error.
**Environment (please complete the following information):**
- Megatron-LM commit ID
- PyTorch version
- CUDA version
- NCCL version
**Proposed fix**
If you have a proposal for how to fix the issue state it here or link to a PR.
**Additional context**
Add any other context about the problem here.
---
name: ENHANCEMENT
about: Suggest an idea to improve this project
title: "[ENHANCEMENT]"
labels: ''
assignees: ''
---
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Proposed implementation**
If you have a proposed implementation for the feature state it here or link to a PR.
**Additional context**
Add any other context or screenshots about the feature request here.
---
name: QUESTION
about: Ask a question about Megatron-LM that is not a bug, regression or enhancement request
title: "[QUESTION]"
labels: ''
assignees: ''
---
**Your question**
Ask a clear and concise question about Megatron-LM.
---
name: REGRESSION
about: Report a regression in speed or accuracy due to a Megatron-LM update
title: "[REGRESSION]"
labels: ''
assignees: ''
---
**Describe the regression**
A clear and concise description of what the regression is.
**To Reproduce**
Steps to reproduce the behavior. The easier it is to reproduce the faster it will get maintainer attention.
**Previous performance**
What speed or accuracy did you previously see.
**New performance**
What speed or accuracy do you see after the update.
**Stack trace/logs**
If applicable, add the stack trace or logs related to the regression.
**Environment (please complete the following information):**
- Previous Megatron-LM commit ID
- New Megatron-LM commit ID
- Previous PyTorch version
- New PyTorch version
- Previous CUDA version
- New CUDA version
- Previous NCCL version
- New NCCL version
**Proposed fix**
If you have a proposal for how to fix the issue state it here or link to a PR.
**Additional context**
Add any other context about the problem here.
# This workflow marks issues and PRs that have had no activity for a specified amount of time as stale (closing is disabled below via days-before-close: -1).
#
# You can adjust the behavior by modifying this file.
# For more information, see:
# https://github.com/actions/stale
name: Mark stale issues and pull requests
on:
schedule:
- cron: '15 18 * * *'
jobs:
stale:
runs-on: ubuntu-latest
permissions:
issues: write
pull-requests: write
steps:
- uses: actions/stale@v5
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
days-before-stale: 60
stale-issue-message: 'Marking as stale. No activity in 60 days.'
stale-pr-message: 'Marking as stale. No activity in 60 days.'
stale-issue-label: 'stale'
stale-pr-label: 'stale'
remove-stale-when-updated: true
operations-per-run: 1000
days-before-close: -1
__pycache__
*.so
build
.coverage_*
*.egg-info
*~
slurm*
logs
.vscode
local/
.gitmodules
wandb/
onelogger.log
onelogger.err
.venv/
.merge_train_rule: &merge_train_rule
UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
INTEGRATION_TEST: "yes"
INTEGRATION_TEST_SCOPE: mr
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: mr-slim
FUNCTIONAL_TEST_REPEAT: 5
FUNCTIONAL_TEST_TIME_LIMIT: 2700
CLUSTER_A100: ""
CLUSTER_H100: ""
PUBLISH: "no"
workflow:
rules:
# Do not trigger for forks
- if: $CI_PROJECT_NAMESPACE != "ADLR" || ($CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_PROJECT_PATH != "ADLR/megatron-lm")
when: never
# ci-branches only for schedule
- if: $CI_COMMIT_BRANCH =~ /ci-/ && $CI_PIPELINE_SOURCE != "schedule"
when: never
# For schedules pipelines
- if: $CI_PIPELINE_SOURCE == "schedule"
auto_cancel:
on_new_commit: none
# For manual pipelines
- if: $CI_PIPELINE_SOURCE == "web"
# For push to main
- if: $CI_PIPELINE_SOURCE == 'push' && $CI_COMMIT_REF_PROTECTED == "true"
variables:
UNIT_TEST: "no"
INTEGRATION_TEST: "no"
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: mr
FUNCTIONAL_TEST_REPEAT: 5
FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no"
FUNCTIONAL_TEST_TIME_LIMIT: 2700
CLUSTER_A100: ""
CLUSTER_H100: ""
PUBLISH: "no"
auto_cancel:
on_new_commit: none
# For merge-trains that need to be fast-tracked
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train' && $CI_MERGE_REQUEST_LABELS =~ /fast-track/
variables:
UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
INTEGRATION_TEST: "no"
FUNCTIONAL_TEST: "no"
CLUSTER_A100: ""
CLUSTER_H100: ""
PUBLISH: "no"
# For normal merge-trains
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train'
variables: *merge_train_rule
# For MRs with integration suite
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run tests/
variables: *merge_train_rule
# For MRs with nightly
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run nightly/
variables:
UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
INTEGRATION_TEST: "no"
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: nightly
FUNCTIONAL_TEST_REPEAT: 5
FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no"
FUNCTIONAL_TEST_TIME_LIMIT: 2700
CLUSTER_A100: ""
CLUSTER_H100: ""
PUBLISH: "no"
# For MRs with weekly
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run weekly/
variables:
UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
INTEGRATION_TEST: "no"
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: weekly
FUNCTIONAL_TEST_REPEAT: 1
FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no"
FUNCTIONAL_TEST_TIME_LIMIT: 9000
CLUSTER_A100: ""
CLUSTER_H100: ""
PUBLISH: "no"
# For MRs with heavy suite
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run functional tests/
variables:
UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
INTEGRATION_TEST: "no"
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: mr
FUNCTIONAL_TEST_REPEAT: 5
FUNCTIONAL_TEST_TIME_LIMIT: 2700
CLUSTER_A100: ""
CLUSTER_H100: ""
PUBLISH: "no"
# Default MRs
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result'
variables:
UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
INTEGRATION_TEST: "no"
FUNCTIONAL_TEST: "no"
PUBLISH: "no"
- when: never
auto_cancel:
on_new_commit: interruptible
stages:
- build
- test
- integration_tests
- functional_tests
- publish
default:
interruptible: true
retry:
max: 2
when: runner_system_failure
variables:
UNIT_TEST:
value: "yes"
options:
- "yes"
- "no"
description: To run the unit test suite
UNIT_TEST_REPEAT:
value: "1"
description: "Number of repetitions"
UNIT_TEST_TIMEOUT:
value: "30"
description: Timeout (minutes) for Unit tests (all repeats)
INTEGRATION_TEST:
value: "yes"
options:
- "yes"
- "no"
description: To run the integration test suite
INTEGRATION_TEST_SCOPE:
value: "mr"
options:
- "mr"
- "nightly"
- "weekly"
- "pre-release"
- "release"
description: "Testsuite to run (only for INTEGRATION_TEST=yes)"
INTEGRATION_TEST_TIME_LIMIT:
value: "900"
description: "Timeout in seconds per test"
INTEGRATION_TEST_CASES:
value: "all"
description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite."
FUNCTIONAL_TEST:
value: "yes"
options:
- "yes"
- "no"
description: To run the functional test suite
FUNCTIONAL_TEST_SCOPE:
value: "mr"
options:
- "mr"
- "nightly"
- "weekly"
- "pre-release"
- "release"
description: "Testsuite to run (only for FUNCTIONAL_TEST=yes)"
FUNCTIONAL_TEST_REPEAT:
value: "5"
description: "Number of repetitions per test"
FUNCTIONAL_TEST_TIME_LIMIT:
value: "2700"
description: "Timeout in seconds per test"
FUNCTIONAL_TEST_CASES:
value: "all"
description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite."
FUNCTIONAL_TEST_NAME:
description: "Name of functional test run (only for pre-release and release)"
value: "$$CI_COMMIT_SHA"
FUNCTIONAL_TEST_RECORD_CHECKPOINTS:
value: "no"
description: "Record golden checkpoints"
options:
- "yes"
- "no"
CLUSTER_A100:
value: "dgxa100_dracooci"
options:
- "dgxa100_dracooci"
- "dgxa100_dracooci-ord"
description: "Cluster for A100 workloads"
CLUSTER_H100:
value: "dgxh100_coreweave"
options:
- "dgxh100_coreweave"
- "dgxh100_eos"
description: "Cluster for H100 workloads"
PUBLISH:
value: "no"
options:
- "yes"
- "no"
description: Build and publish a wheel to PyPi
PUBLISH_COMMIT:
value: "$$CI_COMMIT_SHA"
description: Which commit to publish
PUBLISH_VERSION_BUMP_BRANCH:
value: "$$CI_COMMIT_BRANCH"
description: Which branch to target for version bump
PUBLISH_SCOPE:
value: "code-freeze"
options:
- "code-freeze"
- "release"
- "review-reminder"
description: Type of publish (freeze or final release)
# CI wide variables
CI_MCORE_LTS_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_lts
CI_MCORE_DEV_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_dev
CI_NEMO_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/nemo_ci
UTILITY_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_utility
TE_GIT_REF: ""
include:
- .gitlab/stages/00.pre.yml
- .gitlab/stages/01.build.yml
- .gitlab/stages/02.test.yml
- .gitlab/stages/03.integration-tests.yml
- .gitlab/stages/04.functional-tests.yml
- .gitlab/stages/05.publish.yml
CI:
- .gitlab-ci.yml
- Dockerfile.ci.lts
- Dockerfile.ci.dev
- .github/**
- .gitlab/**
Datasets:
- megatron/core/datasets/**
BERT:
- megatron/core/models/bert/**
GPT:
- megatron/core/models/gpt/**
RETRO:
- megatron/core/models/retro/**
Dist-Ckpt:
- megatron/core/dist_checkpointing
Dist-Opt:
- megatron/core/optimizer/distrib_optimizer
Inference:
- megatron/core/inference
MoE:
- megatron/core/transformer/moe
Tests:
- tests/**
ParallelState:
- megatron/core/parallel_state.py
#! /bin/bash
# CI image build script: builds and pushes a Docker image with buildx, using
# registry-backed layer caching. Expects the GitLab CI job to set: IMAGE (the
# NAME of an env var holding the target image ref), STAGE, FILE, BASE_IMAGE,
# ARTIFACTORY_USER/ARTIFACTORY_TOKEN, the usual CI_* variables, and optionally
# TE_GIT_REF.
set -x
# Dump the environment for debugging. NOTE(review): this prints all job
# variables to the log — confirm masked variables stay masked.
env
# IMAGE holds the *name* of another variable (e.g. "CI_MCORE_LTS_IMAGE");
# resolve it to that variable's value.
eval "IMAGE=\$$IMAGE"
docker context create tls-environment
docker buildx create --name container --driver=docker-container --use tls-environment
# Extra docker-build flags, accumulated per branch/pipeline type below.
ADDITIONAL_PARAMS=()
# On main or the image-rebuild branch: refresh base layers (--pull) and push
# both the build cache and the ":main" tag.
if [[ "$CI_COMMIT_BRANCH" == "ci-rebuild-mcore-nemo-image" || "$CI_COMMIT_BRANCH" == "main" ]]; then
ADDITIONAL_PARAMS+=("--pull")
ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:main,mode=max")
ADDITIONAL_PARAMS+=("-t ${IMAGE}:main")
# For merge requests: per-MR cache and per-MR image tag.
elif [[ -n "$CI_MERGE_REQUEST_IID" ]]; then
ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID},mode=max")
ADDITIONAL_PARAMS+=("-t ${IMAGE}:${CI_MERGE_REQUEST_IID}")
fi
# Nightly branch additionally tags the image ":nightly".
if [[ "$CI_COMMIT_BRANCH" == "ci-nightly" ]]; then
ADDITIONAL_PARAMS+=("-t ${IMAGE}:nightly")
fi
# Optionally pin a git ref via the TE_COMMIT build arg (presumably
# TransformerEngine — TODO confirm against docker/$FILE).
if [[ -n "$TE_GIT_REF" ]]; then
ADDITIONAL_PARAMS+=("--build-arg TE_COMMIT=${TE_GIT_REF}")
fi
# Log the commit being built.
echo $(git rev-parse HEAD)
# Resolve the newest published jet-api version by scraping the Artifactory
# PyPI simple index and sorting the version directory names.
JET_API_VERSION=$(curl -s -u "$ARTIFACTORY_USER:$ARTIFACTORY_TOKEN" "https://sc-hw-artf.nvidia.com/artifactory/api/pypi/hw-joc-pypi/simple/jet-api/" | grep -o 'href="../../jet-api/[0-9.]*/' | sed 's|href="../../jet-api/||;s|/||' | sort -V -r | head -n1)
# Build and push. ADDITIONAL_PARAMS is intentionally left unquoted: each
# element is a "flag value" pair that must word-split into two arguments.
DOCKER_BUILDKIT=1 docker build \
--secret id=JET_INDEX_URLS \
--secret id=LOGGER_INDEX_URL \
--secret id=EXPERIMENTAL_FLASH_ATTN \
--target $STAGE \
-f docker/$FILE \
-t ${IMAGE}:${CI_PIPELINE_ID} \
--builder=container \
--build-arg JET_API_VERSION=$JET_API_VERSION \
--cache-from type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID} \
--cache-from type=registry,ref=${IMAGE}-buildcache:main \
--build-arg FROM_IMAGE_NAME=$BASE_IMAGE \
--push \
--progress plain \
${ADDITIONAL_PARAMS[@]} .
# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/usr/bin/env python3
"""
Import checker script for megatron.hub package.
This script recursively discovers all Python modules in the specified package
and attempts to import them, reporting any import errors.
"""
import importlib
import os
import sys
import traceback
from typing import Dict, List, Tuple
import click
class ImportChecker:
    """Check imports for all modules in a package.

    Recursively discovers every Python module under ``package_name`` and
    attempts to import each one, tallying successes, graceful failures
    (imports whose traceback mentions ``UnavailableError``), and hard
    failures, then prints a summary.
    """

    def __init__(self, package_name: str = "megatron.core", verbose: bool = False):
        """Initialize counters and make the current directory importable.

        Args:
            package_name: Dotted name of the package to scan.
            verbose: Accepted for API compatibility; currently unused.
        """
        self.package_name = package_name
        self.success_count = 0
        self.failure_count = 0
        self.graceful_count = 0
        self.skipped_count = 0
        self.failures: Dict[str, str] = {}
        self.successes: List[str] = []
        self.graceful_failures: Dict[str, str] = {}
        self.skipped: List[str] = []
        # Modules to skip (known problematic ones). A module is skipped when
        # any of these substrings occurs anywhere in its dotted name.
        self.skip_patterns = {
            "__pycache__",
            ".pytest_cache",
            ".git",
            "test_",
            "_test",
        }
        # Add current directory to Python path (if not already there) so the
        # target package resolves when running from the repository root.
        current_dir = os.getcwd()
        if current_dir not in sys.path:
            sys.path.insert(0, current_dir)

    def should_skip_module(self, module_name: str) -> bool:
        """Return True if any skip pattern occurs in ``module_name``."""
        return any(pattern in module_name for pattern in self.skip_patterns)

    def discover_modules(self, package_path: str) -> List[str]:
        """Discover all Python modules in the given package path.

        Args:
            package_path: Importable dotted name of the package to walk.

        Returns:
            Sorted, de-duplicated list of fully-qualified module names,
            each prefixed with ``self.package_name``.
        """
        modules = []
        package = importlib.import_module(package_path)
        package_root = package.__path__[0]
        # Walk through all Python files under the package directory.
        for root, dirs, files in os.walk(package_root):
            # Skip hidden directories and __pycache__ in-place so os.walk
            # does not descend into them.
            dirs[:] = [d for d in dirs if not d.startswith(".") and d != "__pycache__"]
            for file in files:
                if file.endswith(".py") and not file.startswith("."):
                    # Convert file path to module name. Strip only the
                    # trailing ".py" suffix — a plain str.replace(".py", "")
                    # would also corrupt names containing ".py" elsewhere
                    # (e.g. "foo.pyutils.py").
                    rel_path = os.path.relpath(os.path.join(root, file), package_root)
                    module_parts = rel_path[: -len(".py")].replace(os.sep, ".")
                    # An __init__.py denotes the (sub)package itself.
                    if module_parts == "__init__":
                        module_parts = ""
                    elif module_parts.endswith(".__init__"):
                        module_parts = module_parts[: -len(".__init__")]
                    full_module_name = (
                        f"{self.package_name}.{module_parts}"
                        if module_parts
                        else self.package_name
                    )
                    if not self.should_skip_module(full_module_name):
                        modules.append(full_module_name)
        # Remove duplicates and sort for stable output.
        return sorted(set(modules))

    def import_module(self, module_name: str) -> Tuple[str, str]:
        """
        Try to import a module and return success status and error message.

        Returns:
            Tuple of (status: str, error_message: str)
            status can be: "success", "graceful", or "failed"
        """
        try:
            # Drop any cached entry so repeated checks perform a real import.
            if module_name in sys.modules:
                del sys.modules[module_name]
            importlib.import_module(module_name)
            return "success", ""
        except Exception:
            tb = traceback.format_exc()
            # An "UnavailableError" in the traceback indicates an optional
            # dependency handled gracefully by the package itself.
            if "UnavailableError" in tb:
                return "graceful", "UnavailableError detected during import"
            return "failed", tb

    def check_all_imports(self) -> bool:
        """Check imports for all discovered modules and print a summary.

        Returns:
            True when no module produced a hard import failure.
        """
        print(f"Discovering modules in package '{self.package_name}'...")
        modules = self.discover_modules(self.package_name)
        if not modules:
            print("No modules found!")
            # Returned None originally (also falsy); False keeps the exit
            # code behavior while giving a consistent bool return type.
            return False
        print(f"Found {len(modules)} modules to check")
        print("=" * 60)
        for module_name in modules:
            status, error_msg = self.import_module(module_name)
            if status == "success":
                self.success_count += 1
                self.successes.append(module_name)
            elif status == "graceful":
                self.graceful_count += 1
                self.graceful_failures[module_name] = error_msg
            else:  # failed
                self.failure_count += 1
                self.failures[module_name] = error_msg
        self._print_summary()
        return self.failure_count == 0

    def _print_summary(self) -> None:
        """Print a summary of the import check results."""
        total = (
            self.success_count
            + self.failure_count
            + self.graceful_count
            + self.skipped_count
        )
        print("\n" + "=" * 60)
        print("IMPORT CHECK SUMMARY")
        print("=" * 60)
        print(f"Total modules checked: {total}")
        print(
            f"Successful imports: {self.success_count} ({self.success_count / total * 100:.1f}%)"
        )
        print(
            f"Gracefully handled: {self.graceful_count} ({self.graceful_count / total * 100:.1f}%)"
        )
        print(
            f"Failed imports: {self.failure_count} ({self.failure_count / total * 100:.1f}%)"
        )
        if self.skipped_count > 0:
            print(
                f"Skipped modules: {self.skipped_count} ({self.skipped_count / total * 100:.1f}%)"
            )
        if self.graceful_failures:
            print(f"\n🟡 GRACEFULLY HANDLED ({len(self.graceful_failures)}):")
            print("-" * 40)
        if self.failures:
            print(f"\n❌ FAILED IMPORTS ({len(self.failures)}):")
            print("-" * 40)
            for module_name, error_msg in self.failures.items():
                print(f"\n{module_name}")
                # Show only non-empty lines of the traceback to keep output
                # manageable.
                error_lines = error_msg.split("\n")
                for line in error_lines:
                    # if self.package_name.replace(".", os.sep) not in line:
                    #     continue
                    if line.strip():
                        print(f"  {line}")
@click.command()
@click.option(
    "--package-name",
    required=True,
    help="Package name to check imports for",
)
def main(package_name: str):
    """CLI entry point: run the import check over ``package_name``.

    Exits with status 0 when every module imports cleanly, 1 otherwise.
    """
    checker = ImportChecker(package_name=package_name)
    successful = checker.check_all_imports()
    # Use sys.exit rather than the exit() builtin: exit() is provided by the
    # site module for interactive use and is not guaranteed to exist
    # (e.g. under `python -S`).
    sys.exit(0 if successful else 1)


if __name__ == "__main__":
    main()
#!/bin/bash
# Prepare a "legacy" megatron-lm checkout for backwards-compatibility testing:
# clone the requested historical ref into ./megatron-lm-legacy, then overlay
# the current checkout's `megatron` package on top of it (so new code runs
# against the legacy test suite).
set -euxo pipefail
# Default values
MCORE_REPO="https://github.com/nvidia/megatron-lm.git"
# NOTE(review): MCORE_MR_COMMIT is assigned but never used below — possibly
# leftover or intended for callers; verify before removing.
MCORE_MR_COMMIT="main"
MCORE_BACKWARDS_COMMIT=""
# Print usage and exit non-zero. (Heredoc content is emitted verbatim.)
usage() {
cat <<EOF
Usage: $0 [OPTIONS]
Clone and setup megatron-lm repositories for testing.
Options:
--repo URL Git repository URL (default: $MCORE_REPO)
--backwards-commit COMMIT Commit hash or reference for the backwards compatibility test
--help Show this help message
Example:
$0 --repo $MCORE_REPO \\
--backwards-commit core_r0.12.0
EOF
exit 1
}
# Parse arguments
while [[ $# -gt 0 ]]; do
case $1 in
--repo)
MCORE_REPO="$2"
shift 2
;;
--backwards-commit)
MCORE_BACKWARDS_COMMIT="$2"
shift 2
;;
--help)
usage
;;
*)
echo "Unknown option: $1"
usage
;;
esac
done
# Validate required arguments
if [[ -z "${MCORE_BACKWARDS_COMMIT:-}" ]]; then
echo "Error: --backwards-commit is required"
usage
fi
# Checkout backwards-ref: fetch only the requested ref into a fresh repo
# rather than cloning the full history.
rm -rf megatron-lm-legacy
mkdir megatron-lm-legacy
pushd megatron-lm-legacy
git init
git remote add origin $MCORE_REPO
git fetch origin $MCORE_BACKWARDS_COMMIT
git checkout $MCORE_BACKWARDS_COMMIT
git rev-parse HEAD
# Replace the legacy `megatron` package with the one from the current
# checkout (../megatron-lm); the legacy tests remain in place.
rm -rf megatron
cp -a ../megatron-lm/megatron ./
popd
# Copy unit test script
cp megatron-lm/tests/unit_tests/run_ci_test.sh megatron-lm-legacy/tests/unit_tests/run_ci_test.sh
include:
- template: Security/Secret-Detection.gitlab-ci.yml
.pre_rules:
rules:
- if: $CI_PIPELINE_SOURCE == 'main'
allow_failure: true
when: always
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
allow_failure: true
when: always
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result'
when: always
- when: never
stage: .pre
.dind_rules:
image: docker:26.1.4-dind
variables:
DOCKER_HOST: unix:///var/run/docker.sock
before_script:
- docker system prune -a --filter "until=36h" -f || true
- echo "$NGC_API_KEY" | docker login nvcr.io -u '$oauthtoken' --password-stdin
- echo "$CI_REGISTRY_PASSWORD" | docker login $CI_REGISTRY -u $CI_REGISTRY_USER --password-stdin
pre:mirror_to_github:
rules:
- if: '$CI_COMMIT_REF_PROTECTED == "true" && $CI_PIPELINE_SOURCE == "push"'
allow_failure: true
- when: never
tags:
- arch/amd64
- env/prod
- origin/jet-fleet
- owner/jet-core
- purpose/utility
- team/megatron
stage: .pre
image: python:3.10
variables:
GIT_STRATEGY: "clone"
script:
- git checkout $CI_COMMIT_BRANCH
- git remote add github https://ko3n1g:$GH_TOKEN@github.com/NVIDIA/Megatron-LM.git || true
- git push -u github $CI_COMMIT_BRANCH
retry:
max: 2
pre:create_ci_branches:
rules:
- if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH && $CI_PIPELINE_SOURCE == "push"'
allow_failure: true
- when: never
parallel:
matrix:
- branch: ci-unit-test-extended
- branch: ci-rebuild-mcore-nemo-image
- branch: ci-mr
- branch: ci-nightly
- branch: ci-weekly
- branch: ci-pre-release
- branch: ci-review-reminder
tags:
- arch/amd64
- env/prod
- origin/jet-fleet
- owner/jet-core
- purpose/utility
- team/megatron
stage: .pre
image: python:3.10
variables:
GIT_STRATEGY: "clone"
script:
- git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/adlr/megatron-lm.git"
- git switch --force-create $branch
- git push --force -u origin $branch
retry:
max: 2
pre:label_merge_request:
extends: [.pre_rules]
image: golang:1.22
tags:
- arch/amd64
- env/prod
- origin/jet-fleet
- owner/jet-core
- purpose/utility
- team/megatron
before_script:
- git clone -b nv https://${GITLAB_ENDPOINT}/okoenig/gitlab-mr-labeler.git
- cd gitlab-mr-labeler
- go install .
- cd ..
- go install github.com/itchyny/gojq/cmd/gojq@latest
script:
- set -x
- |
LABELS=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}")
- LABELS=$(echo "$LABELS" | gojq '.labels -= ["ParallelState"]')
- |
if git --no-pager diff --merge-base origin/${CI_MERGE_REQUEST_TARGET_BRANCH_NAME} -- 'megatron/core/' | grep -q 'parallel_state'; then
LABELS=$(echo "$LABELS" | gojq '.labels += ["ParallelState"]')
echo "$LABELS"
fi
- echo LABELS=$(echo "$LABELS" | gojq '.labels | join(",")') > labels
- gitlab-mr-labeler -f .gitlab/labeler-config.yml -t ${PROJECT_ACCESS_TOKEN_MCORE} --debug true
- cat labels
after_script:
- |
source labels
curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" --data-urlencode "add_labels=$LABELS" -X PUT
pre:maybe_cherry_pick_commit:
rules:
- if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH && $CI_PIPELINE_SOURCE == "push"'
- when: never
tags:
- arch/amd64
- env/prod
- origin/jet-fleet
- owner/jet-core
- purpose/utility
- team/megatron
stage: .pre
image: nentangso/alpine-git-curl-jq
variables:
GIT_STRATEGY: "clone"
script:
- set -x
- set +e
- SHA=$(git rev-list --no-merges -n 1 HEAD)
- MESSAGE=$(git log -n 1 --pretty=format:%s $SHA)
- MR_ID=$(echo $MESSAGE | awk -F'!' '{print $2}' | awk '{print $1}' )
- git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git"
- git config --global user.email "mcore-bot@nvidia.com"
- git config --global user.name "Mcore Bot"
- |
MR=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${MR_ID}")
LABELS=$(echo -E $MR | jq '.labels | join(",")' | tr -d '"')
AUTHOR_ID=$(echo -E $MR | jq '.author.id' | tr -d '"')
AUTHOR_NAME=$(echo -E $MR | jq '.author.username' | tr -d '"')
TITLE=$(echo -E $MR | jq '.title' | tr -d '"')
MILESTONE_ID=$(echo -E $MR | jq '.milestone.id' | tr -d '"')
TARGET_BRANCHES=$(echo "$LABELS" | grep -o 'core_[^,]*')
if [[ $TARGET_BRANCHES == "" ]]; then
echo Nothing to cherry pick
exit 0
fi
echo $TARGET_BRANCHES | while read -r RELEASE_BRANCH ; do
TARGET_BRANCH_EXISTS_OK=$([[ "$(git ls-remote --heads origin refs/heads/$RELEASE_BRANCH)" != "" ]] && echo true || echo false)
if [[ "$TARGET_BRANCH_EXISTS_OK" == "false" ]]; then
echo Release branch does not yet exist, will not cherry-pick
continue
fi
(
git fetch origin $RELEASE_BRANCH:$RELEASE_BRANCH
git switch --force-create cherry-pick-$MR_ID-$RELEASE_BRANCH $RELEASE_BRANCH
git cherry-pick $SHA
git push -u origin --force cherry-pick-$MR_ID-$RELEASE_BRANCH
git checkout ${CI_DEFAULT_BRANCH:-main}
)
CHERRYPICK_SUCCESSFUL=$?
if [[ $CHERRYPICK_SUCCESSFUL -eq 0 ]]; then
curl \
--header "PRIVATE-TOKEN: $PAT" \
--url https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests \
-d "source_branch=cherry-pick-$MR_ID-$RELEASE_BRANCH" \
-d "target_branch=$RELEASE_BRANCH" \
-d "title=Cherry pick \`$TITLE ($MR_ID)\` into \`$RELEASE_BRANCH\`" \
-d "labels=cherry-pick" \
-d "reviewer_ids=$AUTHOR_ID" \
-d "milestone_id=$MILESTONE_ID" \
-d "description=[🤖]: Hi @$AUTHOR_NAME 👋,<br><br>we've cherry picked \`$TITLE ($MR_ID)\` into \`$RELEASE_BRANCH\` for you! 🚀<br><br>Please review and approve this cherry pick by your convenience\!"
else
URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/merge_requests/$MR_ID
MESSAGE='{
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "beep boop 🤖: Cherry-pick of <'$URL'|!'$MR_ID'> failed\ncc '$SLACK_ADMIN'"
}
}
]
}'
curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${MCORE_NOTIFICATION_HOOK}
fi
done
interruptible: false
pre:check_milestone:
extends: [.pre_rules]
image: badouralix/curl-jq
tags:
- arch/amd64
- env/prod
- origin/jet-fleet
- owner/jet-core
- purpose/utility
- team/megatron
script:
- env
- |
MILESTONE=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" | jq '.milestone')
- |
if [[ "$MILESTONE" == "null" ]]; then
LATEST_MILESTONE=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/milestones?state=active&order_by=due_date&sort=desc" | jq '.[0].id')
curl --request PUT --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" --data "milestone_id=${LATEST_MILESTONE}"
echo "Applied latest milestone (ID: ${LATEST_MILESTONE}) to this MR"
fi
pre:check_status_of_main:
extends: [.pre_rules]
image: python:3.10
timeout: 7 days
variables:
KUBERNETES_SERVICE_MEMORY_REQUEST: 32Gi
KUBERNETES_SERVICE_MEMORY_LIMIT: 32Gi
KUBERNETES_SERVICE_CPU_REQUEST: 8
KUBERNETES_SERVICE_CPU_LIMIT: 12
tags:
- arch/amd64
- env/prod
- origin/jet-fleet
- owner/jet-core
- purpose/utility
- team/megatron
script:
- env
- pip install --no-cache-dir python-gitlab click
- export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
- export GITLAB_ENDPOINT
- python tests/test_utils/python_scripts/check_status_of_main.py --target-branch "$CI_MERGE_REQUEST_TARGET_BRANCH_NAME"
rules:
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train' && $CI_MERGE_REQUEST_LABELS =~ /fast-track/
when: never
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train'
when: always
- when: never
.build_rules:
rules:
- when: on_success
stage: test
.build_image:
extends: [.build_rules, .dind_rules]
stage: build
tags:
- arch/amd64
- origin/jet-fleet
- env/prod
- ${TAG}
services:
- name: docker:24.0.5-dind
variables:
HEALTHCHECK_TCP_PORT: "2376"
timeout: 180m
variables:
DOCKER_HOST: tcp://docker:2376
DOCKER_TLS_CERTDIR: "/certs"
DOCKER_TLS_VERIFY: 1
DOCKER_CERT_PATH: "$DOCKER_TLS_CERTDIR/client"
TAG: purpose/builder-large
STAGE: jet
MCORE_BACKWARDS_REF: core_r0.12.0
KUBERNETES_SERVICE_MEMORY_REQUEST: 90Gi
KUBERNETES_SERVICE_MEMORY_LIMIT: 90Gi
# KUBERNETES_SERVICE_CPU_REQUEST: 60
# KUBERNETES_SERVICE_CPU_LIMIT: 60
script:
- eval PUBLISH_COMMIT=$PUBLISH_COMMIT
- apk add bash curl git
- export TE_GIT_REF=$TE_GIT_REF
- bash .gitlab/scripts/build.sh
- git fetch origin $MCORE_BACKWARDS_REF
- MCORE_BACKWARDS_COMMIT=$(git rev-parse FETCH_HEAD)
- echo "MCORE_MR_COMMIT=$CI_COMMIT_SHA" | tee -a build.env
- echo "MCORE_BACKWARDS_COMMIT=$MCORE_BACKWARDS_COMMIT" | tee -a build.env
- cat build.env
retry:
max: 2
artifacts:
reports:
dotenv: build.env
test:build_image:
extends: [.build_image]
parallel:
matrix:
- IMAGE: CI_MCORE_LTS_IMAGE
FILE: Dockerfile.ci.lts
BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3
- IMAGE: CI_MCORE_DEV_IMAGE
FILE: Dockerfile.ci.dev
BASE_IMAGE: nvcr.io/nvidia/pytorch:25.05-py3
- IMAGE: UTILITY_IMAGE
FILE: Dockerfile.linting
BASE_IMAGE: python:3.10
test:build_nemo_image:
extends: [.build_image]
variables:
IMAGE: CI_NEMO_IMAGE
FILE: Dockerfile.ci.nemo
BASE_IMAGE: nvcr.io/nvidian/nemo:nightly
rules:
- if: $FUNCTIONAL_TEST == "yes" || $INTEGRATION_TEST == "yes" || $CI_COMMIT_BRANCH == "ci-rebuild-mcore-nemo-image"
when: on_success
.test_rules:
rules:
- if: $PUBLISH == "yes"
when: never
- when: on_success
stage: test
include:
- template: Security/Secret-Detection.gitlab-ci.yml
wait_for_resources:
extends: [.test_rules]
needs:
- test:linting_formatting
- test:linting_copyright
- job: test:linting_secret_detection
optional: true
- test:build_image
image: python:3.10
timeout: 7 days
variables:
KUBERNETES_SERVICE_MEMORY_REQUEST: 32Gi
KUBERNETES_SERVICE_MEMORY_LIMIT: 32Gi
KUBERNETES_SERVICE_CPU_REQUEST: 8
KUBERNETES_SERVICE_CPU_LIMIT: 12
tags:
- arch/amd64
- env/prod
- origin/jet-fleet
- owner/jet-core
- purpose/utility
- team/megatron
script:
- env
- pip install --no-cache-dir python-gitlab click
- export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
- export GITLAB_ENDPOINT
- export NUM_CONCURRENT_JOBS
- python tests/test_utils/python_scripts/wait_for_resources.py --pipeline-id $CI_PIPELINE_ID
rules:
- if: $CI_MERGE_REQUEST_LABELS =~ /fast-track/
when: never
- if: $CI_PIPELINE_SOURCE == "merge_request_event"
when: on_success
- when: never
# Generates the unit-test child-pipeline definitions (four YAML artifacts:
# {lts,dev} x {legacy,latest}) consumed by the .unit_tests_run trigger jobs.
test:unit_tests_configure:
  extends: [.test_rules]
  needs:
    - test:build_image
    - job: wait_for_resources
      optional: true
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  before_script:
    # Re-vendor the convergence-test recipes as a submodule on every run.
    - git rm -r tests/test_utils/local_recipes || true
    - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/test_utils/local_recipes
    - ls tests/test_utils/local_recipes
  script:
    - env
    - set -x
    # NOTE(review): A100_CLUSTER/H100_CLUSTER are computed here but unused in
    # this job — the generator below hard-codes dgxh100_coreweave/dgx_h100.
    # Confirm whether the hard-coding is intentional.
    - |
      A100_CLUSTER=$([[ "$CLUSTER_A100" != "" ]] && echo $CLUSTER_A100 || echo $DEFAULT_A100_CLUSTER)
      H100_CLUSTER=$([[ "$CLUSTER_H100" != "" ]] && echo $CLUSTER_H100 || echo $DEFAULT_H100_CLUSTER)
    - |
      ARGS=(
        "--scope unit-tests"
        "--n-repeat ${UNIT_TEST_REPEAT}"
        "--time-limit $(( UNIT_TEST_TIMEOUT * 60 ))"
        "--test-cases all"
        "--cluster dgxh100_coreweave"
        "--platform dgx_h100"
        "--partition batch_short,batch"
        "--container-image ${UTILITY_IMAGE}"
        "--container-tag ${CI_PIPELINE_ID}"
        "--dependent-job test:unit_tests_configure"
        "--slurm-account ${CI_SLURM_ACCOUNT}"
        "--no-enable-warmup"
      )
    # One generator invocation per (environment, tag) combination.
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment "lts" \
        --tag "legacy" \
        --output-path "unit-test-job-lts-legacy.yaml"
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment "lts" \
        --tag "latest" \
        --output-path "unit-test-job-lts-latest.yaml"
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment "dev" \
        --tag "legacy" \
        --output-path "unit-test-job-dev-legacy.yaml"
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment "dev" \
        --tag "latest" \
        --output-path "unit-test-job-dev-latest.yaml"
  rules:
    - if: $UNIT_TEST == 'yes' && $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
      allow_failure: true
      when: on_success
    - if: $UNIT_TEST == 'yes' && $UNIT_TEST_REPEAT != '0'
      when: on_success
  artifacts:
    paths:
      - unit-test-job-dev-legacy.yaml
      - unit-test-job-dev-latest.yaml
      - unit-test-job-lts-legacy.yaml
      - unit-test-job-lts-latest.yaml
      - tests/test_utils/local_recipes
# Hidden template: triggers one generated unit-test child pipeline, selected by
# the ENVIRONMENT/TAG variables of the concrete job that extends it.
.unit_tests_run:
  needs:
    - test:linting_formatting
    - test:linting_copyright
    - job: test:linting_secret_detection
      optional: true
    - test:unit_tests_configure
    - test:build_image
  extends: [.test_rules]
  trigger:
    include:
      - artifact: unit-test-job-$ENVIRONMENT-$TAG.yaml
        job: test:unit_tests_configure
    # Parent pipeline mirrors the child pipeline's status.
    strategy: depend
  variables:
    RO_API_TOKEN: $PAT
    CONTAINER_TAG: $CI_PIPELINE_ID
    CI_MCORE_LTS_IMAGE: $CI_MCORE_LTS_IMAGE
    GITLAB_ENDPOINT: $GITLAB_ENDPOINT
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    MCORE_MR_COMMIT: $MCORE_MR_COMMIT
    MCORE_BACKWARDS_COMMIT: $MCORE_BACKWARDS_COMMIT
  inherit:
    variables: true
  rules:
    - if: $UNIT_TEST == 'yes' && $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
      allow_failure: true
      when: on_success
    - if: $UNIT_TEST == 'yes' && $UNIT_TEST_REPEAT != '0'
      when: on_success
# Concrete unit-test trigger jobs: one per (environment, tag) combination.
test:unit_tests_pyt(DEV)_mcore(latest):
  extends: [.unit_tests_run]
  variables:
    ENVIRONMENT: dev
    TAG: latest
test:unit_tests_pyt(LTS)_mcore(latest):
  extends: [.unit_tests_run]
  variables:
    ENVIRONMENT: lts
    TAG: latest
# Posts a Slack notification with the outcome of the extended unit-test runs.
test:unit_tests_notify:
  extends: [.test_rules]
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  needs:
    - test:unit_tests_pyt(DEV)_mcore(latest)
    - test:unit_tests_pyt(LTS)_mcore(latest)
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  script:
    - env
    - export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK}
    - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
    - export GITLAB_ENDPOINT
    # Fix: the fallback branch previously was `|| "0"`, which executes the
    # string "0" as a command; it must echo the value so TAG_TEAM becomes "0"
    # off the main branch.
    - export TAG_TEAM=$([[ "$CI_COMMIT_BRANCH" == "main" ]] && echo "1" || echo "0")
    - export TEAM_SLUG=$SLACK_ADMIN
    - |
      python tests/test_utils/python_scripts/notify.py \
        --pipeline-id "${CI_PIPELINE_ID}" \
        --check-for unit-tests \
        --pipeline-context "unit-tests-extended" \
        --pipeline-created-at "${CI_PIPELINE_CREATED_AT}"
  artifacts:
    when: always
    paths:
      - scripts
  rules:
    # Only on the scheduled extended-unit-test branch.
    - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_COMMIT_BRANCH == "ci-unit-test-extended"
      when: always
    - when: never
# Builds the docs inside the external documentation repo to catch doc breakage.
test:linting_docs_build:
  extends: [.test_rules]
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  needs: [test:build_image]
  script:
    - cd ..
    - rm -rf documentation && git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/nemo-megatron-core-tme/documentation.git
    # Embed this checkout into the layout the documentation repo expects.
    - mv megatron-lm/ documentation/
    - cd documentation/
    - ./repo docs
# Auto-formats MR branches (pushing a fix-up commit for same-project MRs) and
# then re-runs the formatter in check-only mode to gate the MR.
test:linting_formatting:
  extends: [.test_rules]
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  needs: [test:build_image]
  variables:
    GIT_STRATEGY: "clone"
  script:
    # Only meaningful for merge requests; succeed trivially otherwise.
    - |
      if [[ "$CI_PIPELINE_SOURCE" != "merge_request_event" ]]; then
        exit 0
      fi
    - set +e
    - git fetch origin main:main
    # Same-project MRs only: the bot cannot push to fork branches.
    - |
      if [[ "$CI_MERGE_REQUEST_PROJECT_PATH" == "$CI_MERGE_REQUEST_SOURCE_PROJECT_PATH" ]]; then
        bash tools/autoformat.sh
        set -e
        git fetch origin $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME
        git checkout $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME
        git config --global user.email "mcore-bot@nvidia.com"
        git config --global user.name "Mcore Bot"
        git remote set-url origin "https://gitlab-ci-token:${PAT}@${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git"
        git add -A .
        git commit -m "chore: Format files" || true
        git push -u origin $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME
      fi
    - env
    - BASE_REF="$CI_MERGE_REQUEST_TARGET_BRANCH_NAME" CHECK_ONLY=true SKIP_DOCS=$([[ "$CI_MERGE_REQUEST_LABELS" == *"Skip docs"* ]] && echo "true" || echo "false") bash tools/autoformat.sh
# Verifies that files carry the required copyright header.
test:linting_copyright:
  extends: [.test_rules]
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  needs: [test:build_image]
  script:
    - git fetch origin main
    - bash tools/copyright.sh
# Override from template
# Disable the stock secret_detection job; a customized variant is defined below.
secret_detection:
  rules:
    - when: never
# Inherit and modify template
# Customized secret detection: scans the MR's commit range and fails the job
# when the analyzer report contains any vulnerability.
test:linting_secret_detection:
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  extends: [".secret-analyzer"]
  needs: [test:build_image]
  variables:
    # Full clone depth so the diff range below is always resolvable.
    GIT_DEPTH: 0
    SECRET_DETECTION_LOG_OPTIONS: ${CI_MERGE_REQUEST_DIFF_BASE_SHA}..${CI_COMMIT_SHA}
  # Fix: allow_failure was indented under `variables:` and therefore only
  # defined an environment variable; as a job-level key it actually makes
  # secret-detection failures block the pipeline.
  allow_failure: false
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - when: never
  script:
    - apk add jq
    - /analyzer run
    # Fail explicitly when the analyzer found at least one secret.
    - |
      if [[ $(cat gl-secret-detection-report.json | jq '.vulnerabilities | length > 0') == true ]]; then
        echo "At least one vulnerability has been found"
        cat gl-secret-detection-report.json | jq '.'
        exit 1
      fi
# Combines coverage data from both unit-test child pipelines into one report.
test:unit_tests_x_coverage_report:
  extends: [.test_rules]
  needs:
    - job: test:unit_tests_pyt(DEV)_mcore(latest)
    - job: test:unit_tests_pyt(LTS)_mcore(latest)
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  script:
    - env
    - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
    - export GITLAB_ENDPOINT
    - python tests/test_utils/python_scripts/download_coverage_results.py --pipeline-id ${CI_PIPELINE_ID}
    - coverage combine --keep $(ls coverage_results/*/coverage_report)
    - coverage report
    - coverage xml
  # Regex GitLab uses to extract the total coverage percentage from the log.
  coverage: "/TOTAL.+ ([0-9]{1,3}%)/"
  artifacts:
    reports:
      coverage_report:
        coverage_format: cobertura
        path: coverage.xml
  rules:
    - if: $UNIT_TEST == 'yes' && $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
      allow_failure: true
      when: on_success
    - if: $UNIT_TEST == 'yes' && $UNIT_TEST_REPEAT != '0'
      when: on_success
# Installs the package and verifies `megatron.core` imports cleanly.
test:safe_imports:
  extends: [.test_rules]
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/builder-large
    - team/megatron
  services:
    - name: docker:24.0.5-dind
      variables:
        HEALTHCHECK_TCP_PORT: "2376"
  variables:
    KUBERNETES_SERVICE_MEMORY_REQUEST: 32Gi
    KUBERNETES_SERVICE_MEMORY_LIMIT: 32Gi
    KUBERNETES_SERVICE_CPU_REQUEST: 8
    KUBERNETES_SERVICE_CPU_LIMIT: 12
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  needs: [test:build_image]
  script:
    - env
    - pip install -e .
    - python .gitlab/scripts/check_imports.py --package-name megatron.core
  rules:
    # Skip on MRs that target a non-main branch.
    - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME != 'main'
      when: never
    - if: $UNIT_TEST == 'yes' && $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
      allow_failure: true
      when: on_success
    - if: $UNIT_TEST == 'yes' && $UNIT_TEST_REPEAT != '0'
      when: on_success
  retry:
    max: 2
# Gate for the integration_tests stage.
.integration_tests_rules:
  stage: integration_tests
  rules:
    - if: $INTEGRATION_TEST == "yes"
      when: on_success
    - when: never
# NOTE(review): `default` and `include` are declared again further down in this
# file; duplicate top-level YAML keys are last-wins in most parsers. The copies
# are identical today, but one copy should probably be removed.
default:
  id_tokens:
    VAULT_JWT_TOKEN:
      aud: https://stg.vault.nvidia.com
include:
  - project: dl/jet/gitlab-templates
    ref: main
    file: downstreams.yml
# Generates the integration-test child pipelines ({dev,lts} x {A100,H100}).
integration:configure:
  needs:
    - test:build_image
    - job: test:unit_tests_pyt(DEV)_mcore(latest)
      optional: true
    - job: test:unit_tests_pyt(LTS)_mcore(latest)
      optional: true
    - job: test:build_nemo_image
  extends: [.integration_tests_rules]
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  before_script:
    # Re-vendor the convergence-test recipes as a submodule on every run.
    - git rm -r tests/test_utils/local_recipes || true
    - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/test_utils/local_recipes
    - ls tests/test_utils/local_recipes
  script:
    - set -x
    # Fall back to the default clusters unless explicit overrides are set.
    - |
      A100_CLUSTER=$([[ "$CLUSTER_A100" != "" ]] && echo $CLUSTER_A100 || echo $DEFAULT_A100_CLUSTER)
      H100_CLUSTER=$([[ "$CLUSTER_H100" != "" ]] && echo $CLUSTER_H100 || echo $DEFAULT_H100_CLUSTER)
    - |
      ARGS=(
        "--scope $INTEGRATION_TEST_SCOPE"
        "--n-repeat 1"
        "--time-limit $INTEGRATION_TEST_TIME_LIMIT"
        "--test-cases $INTEGRATION_TEST_CASES"
        "--container-image ${UTILITY_IMAGE}"
        "--container-tag ${CI_PIPELINE_ID}"
        "--slurm-account ${CI_SLURM_ACCOUNT}"
        "--no-enable-warmup"
        "--dependent-job integration:configure"
        "--enable-lightweight-mode"
      )
    # One generator invocation per (environment, platform) combination.
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment dev \
        --platform dgx_a100 \
        --cluster $A100_CLUSTER \
        --output-path "functional-test-job-dev-A100.yaml"
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment dev \
        --platform dgx_h100 \
        --cluster $H100_CLUSTER \
        --output-path "functional-test-job-dev-H100.yaml"
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment lts \
        --platform dgx_a100 \
        --cluster $A100_CLUSTER \
        --output-path "functional-test-job-lts-A100.yaml"
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment lts \
        --platform dgx_h100 \
        --cluster $H100_CLUSTER \
        --output-path "functional-test-job-lts-H100.yaml"
  artifacts:
    paths:
      - functional-test-job-lts-A100.yaml
      - functional-test-job-lts-H100.yaml
      - functional-test-job-dev-H100.yaml
      - functional-test-job-dev-A100.yaml
      - tests/test_utils/local_recipes
# Hidden template: triggers one generated integration-test child pipeline,
# selected by the ENVIRONMENT/CLUSTER variables of the extending job.
.integration_run:
  needs:
    - integration:configure
    - test:build_image
    - wait_for_resources
  extends: [.integration_tests_rules]
  trigger:
    include:
      - artifact: functional-test-job-$ENVIRONMENT-$CLUSTER.yaml
        job: integration:configure
    # Parent pipeline mirrors the child pipeline's status.
    strategy: depend
  variables:
    RO_API_TOKEN: $PAT
    CONTAINER_TAG: $CI_PIPELINE_ID
    CI_MCORE_LTS_IMAGE: $CI_MCORE_LTS_IMAGE
    GITLAB_ENDPOINT: $GITLAB_ENDPOINT
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    DASHBOARD_ENDPOINT: $DASHBOARD_ENDPOINT
    MCORE_MR_COMMIT: $MCORE_MR_COMMIT
    MCORE_BACKWARDS_COMMIT: $MCORE_BACKWARDS_COMMIT
  inherit:
    variables: true
# Concrete integration trigger jobs: one per (environment, cluster) combination.
integration:run_lts_dgx_a100:
  extends: [.integration_run]
  variables:
    ENVIRONMENT: lts
    CLUSTER: A100
integration:run_lts_dgx_h100:
  extends: [.integration_run]
  variables:
    ENVIRONMENT: lts
    CLUSTER: H100
integration:run_dev_dgx_a100:
  extends: [.integration_run]
  variables:
    ENVIRONMENT: dev
    CLUSTER: A100
integration:run_dev_dgx_h100:
  extends: [.integration_run]
  variables:
    ENVIRONMENT: dev
    CLUSTER: H100
# Gate for the functional_tests stage.
.functional_tests_rules:
  stage: functional_tests
  rules:
    - if: $FUNCTIONAL_TEST == "yes"
      when: on_success
    - when: never
# NOTE(review): duplicate of the `default`/`include` declared earlier in this
# file; duplicate top-level YAML keys are last-wins in most parsers. The copies
# are identical, so behavior is unchanged, but one copy should be removed.
default:
  id_tokens:
    VAULT_JWT_TOKEN:
      aud: https://stg.vault.nvidia.com
include:
  - project: dl/jet/gitlab-templates
    ref: main
    file: downstreams.yml
# Generates the functional-test child pipelines ({dev,lts} x {A100,H100}),
# with optional release-mode run naming and W&B experiment wiring.
functional:configure:
  needs:
    - test:build_image
    - test:build_nemo_image
    - job: test:unit_tests_pyt(DEV)_mcore(latest)
      optional: true
    - job: test:unit_tests_pyt(LTS)_mcore(latest)
      optional: true
    - job: integration:run_lts_dgx_a100
      optional: true
    - job: integration:run_dev_dgx_a100
      optional: true
    - job: integration:run_lts_dgx_h100
      optional: true
    - job: integration:run_dev_dgx_h100
      optional: true
  extends: [.functional_tests_rules]
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  before_script:
    # Re-vendor the convergence-test recipes as a submodule on every run.
    - git rm -r tests/test_utils/local_recipes || true
    - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/test_utils/local_recipes
    - ls tests/test_utils/local_recipes
  script:
    - set -x
    # Fall back to the default clusters unless explicit overrides are set.
    - |
      A100_CLUSTER=$([[ "$CLUSTER_A100" != "" ]] && echo $CLUSTER_A100 || echo $DEFAULT_A100_CLUSTER)
      H100_CLUSTER=$([[ "$CLUSTER_H100" != "" ]] && echo $CLUSTER_H100 || echo $DEFAULT_H100_CLUSTER)
    # Checkpoints are recorded when requested via MR label or pipeline variable.
    - |
      RECORD_CHECKPOINTS=$([[ "$CI_MERGE_REQUEST_LABELS" == *"Record checkpoints"* || "$FUNCTIONAL_TEST_RECORD_CHECKPOINTS" == "yes" ]] && echo "true" || echo "false")
    # Release and pre-release scopes additionally name the run and its W&B experiment.
    - |
      if [[ "$FUNCTIONAL_TEST_SCOPE" == "release" || "$FUNCTIONAL_TEST_SCOPE" == "pre-release" ]]; then
        FUNCTIONAL_TEST_NAME=$(eval echo $FUNCTIONAL_TEST_NAME)
        RELEASE_ARGS=(
          "--run-name"
          $FUNCTIONAL_TEST_NAME
          "--wandb-experiment"
          $(echo $FUNCTIONAL_TEST_NAME | tr '/' '-')
        )
      else
        RELEASE_ARGS=()
      fi
    - |
      ARGS=(
        "--scope $FUNCTIONAL_TEST_SCOPE"
        "--n-repeat $FUNCTIONAL_TEST_REPEAT"
        "--time-limit $FUNCTIONAL_TEST_TIME_LIMIT"
        "--test-cases $FUNCTIONAL_TEST_CASES"
        "--container-image ${UTILITY_IMAGE}"
        "--container-tag ${CI_PIPELINE_ID}"
        "--dependent-job functional:configure"
        "--record-checkpoints ${RECORD_CHECKPOINTS}"
        "--slurm-account ${CI_SLURM_ACCOUNT}"
        "--no-enable-warmup"
      )
    # One generator invocation per (environment, platform) combination.
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment dev \
        --platform dgx_a100 \
        --cluster $A100_CLUSTER \
        --output-path "functional-test-job-dev-A100.yaml" \
        ${RELEASE_ARGS[@]}
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment dev \
        --platform dgx_h100 \
        --cluster $H100_CLUSTER \
        --output-path "functional-test-job-dev-H100.yaml" \
        ${RELEASE_ARGS[@]}
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment lts \
        --platform dgx_a100 \
        --cluster $A100_CLUSTER \
        --output-path "functional-test-job-lts-A100.yaml" \
        ${RELEASE_ARGS[@]}
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment lts \
        --platform dgx_h100 \
        --cluster $H100_CLUSTER \
        --output-path "functional-test-job-lts-H100.yaml" \
        ${RELEASE_ARGS[@]}
  artifacts:
    paths:
      - functional-test-job-lts-A100.yaml
      - functional-test-job-lts-H100.yaml
      - functional-test-job-dev-A100.yaml
      - functional-test-job-dev-H100.yaml
      - tests/test_utils/local_recipes
# Hidden template: triggers one generated functional-test child pipeline,
# selected by the ENVIRONMENT/CLUSTER variables of the extending job.
.functional_run:
  needs:
    - functional:configure
    - test:build_image
  extends: [.functional_tests_rules]
  trigger:
    include:
      - artifact: functional-test-job-$ENVIRONMENT-$CLUSTER.yaml
        job: functional:configure
    # Parent pipeline mirrors the child pipeline's status.
    strategy: depend
  variables:
    RO_API_TOKEN: $PAT
    CONTAINER_TAG: $CI_PIPELINE_ID
    CI_MCORE_LTS_IMAGE: $CI_MCORE_LTS_IMAGE
    GITLAB_ENDPOINT: $GITLAB_ENDPOINT
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    DASHBOARD_ENDPOINT: $DASHBOARD_ENDPOINT
    MCORE_MR_COMMIT: $MCORE_MR_COMMIT
    MCORE_BACKWARDS_COMMIT: $MCORE_BACKWARDS_COMMIT
    CLUSTER: $CLUSTER
  inherit:
    variables: true
# Concrete functional trigger jobs: one per (environment, cluster) combination.
functional:run_lts_dgx_a100:
  extends: [.functional_run]
  variables:
    ENVIRONMENT: lts
    CLUSTER: A100
functional:run_lts_dgx_h100:
  extends: [.functional_run]
  variables:
    ENVIRONMENT: lts
    CLUSTER: H100
functional:run_dev_dgx_a100:
  extends: [.functional_run]
  variables:
    ENVIRONMENT: dev
    CLUSTER: A100
functional:run_dev_dgx_h100:
  extends: [.functional_run]
  variables:
    ENVIRONMENT: dev
    CLUSTER: H100
# Manually-triggered downstream NeMo CI run against this commit.
functional:run_nemo:
  extends: [.functional_tests_rules]
  trigger:
    project: "dl/joc/nemo-ci"
    branch: main-mirror
    strategy: depend
  inherit:
    variables: true
  variables:
    MCORE_COMMIT: $CI_COMMIT_SHA
    TEST_NEMO2_MODULE: "True"
    ALLOW_FAILURE_DEPENDENCY: "True"
    TESTS_TO_RUN_ON_THIS_COMMIT: nightly
  rules:
    - if: $FUNCTIONAL_TEST == "yes"
      when: manual
      allow_failure: true
    - when: never
# Posts a Slack notification with the outcome of the functional-test pipelines.
functional:x_notify:
  extends: [.functional_tests_rules]
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  needs:
    - functional:run_lts_dgx_a100
    - functional:run_dev_dgx_a100
    - functional:run_lts_dgx_h100
    - functional:run_dev_dgx_h100
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  variables:
    WEBHOOK_URL: ${MCORE_NOTIFICATION_HOOK}
    RO_API_TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}
    CONTEXT: $FUNCTIONAL_TEST_SCOPE
  script:
    - env
    - export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK}
    - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
    - export GITLAB_ENDPOINT
    - export CONTEXT=$FUNCTIONAL_TEST_SCOPE
    # Fix: the fallback branch previously was `|| "0"`, which executes the
    # string "0" as a command; it must echo the value so TAG_TEAM becomes "0"
    # off the main branch.
    - export TAG_TEAM=$([[ "$CI_COMMIT_BRANCH" == "main" ]] && echo "1" || echo "0")
    - export TEAM_SLUG=$SLACK_ADMIN
    - |
      python tests/test_utils/python_scripts/notify.py \
        --pipeline-id "${CI_PIPELINE_ID}" \
        --check-for functional-tests \
        --pipeline-context $CONTEXT \
        --pipeline-created-at "${CI_PIPELINE_CREATED_AT}"
  artifacts:
    when: always
    paths:
      - scripts
  rules:
    - if: ($CI_PIPELINE_SOURCE == "schedule" || $CI_COMMIT_BRANCH == "main") && $FUNCTIONAL_TEST == "yes"
      when: always
    - when: never
# Manual job to download golden values produced by this pipeline's tests.
functional:x_download_golden_values:
  extends: [.functional_tests_rules]
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  script:
    - env
    - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
    - export GITLAB_ENDPOINT
    - python tests/test_utils/python_scripts/download_golden_values.py --pipeline-id ${CI_PIPELINE_ID}
  artifacts:
    paths:
      - tests/
  rules:
    - if: $FUNCTIONAL_TEST == "yes"
      when: manual
      allow_failure: true
    - when: never
# Manual gate for code-freeze publishing (default branch only).
.publish_common_freeze:
  stage: publish
  rules:
    - if: ($CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH) && $PUBLISH == "yes" && $PUBLISH_SCOPE == "code-freeze"
      when: manual
    - when: never
# Manual gate for release publishing.
.publish_common_release:
  stage: publish
  rules:
    # NOTE(review): this first rule is subsumed by the second one (which
    # matches regardless of branch); confirm whether a branch restriction
    # was intended here.
    - if: $CI_COMMIT_BRANCH =~ /^core_r/ && $PUBLISH == "yes" && $PUBLISH_SCOPE == "release"
      when: manual
    - if: $PUBLISH == "yes" && $PUBLISH_SCOPE == "release"
      when: manual
    - when: never
# Builds manylinux wheels (cp310/cp311 for arm64 and amd64); dry-run by default.
publish:test_release_pypi_build_wheel:
  extends: [.test_rules]
  stage: publish
  image:
    name: ${IMAGE}
    entrypoint: [""]
  services:
    - name: docker:24.0.5-dind
      variables:
        HEALTHCHECK_TCP_PORT: "2376"
  needs: [test:build_image]
  parallel:
    matrix:
      - PLATFORM: arm64
        IMAGE: quay.io/pypa/manylinux_2_28_aarch64
      - PLATFORM: amd64
        IMAGE: quay.io/pypa/manylinux_2_28_x86_64
  tags:
    - arch/${PLATFORM}
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/builder-small
    - team/megatron
  variables:
    PY_ENV: pytorch_25.03
    KUBERNETES_SERVICE_MEMORY_REQUEST: 16Gi
    KUBERNETES_SERVICE_MEMORY_LIMIT: 16Gi
    PUBLISH_DRYRUN: "yes"
    KUBERNETES_SERVICE_CPU_REQUEST: 4
    KUBERNETES_SERVICE_CPU_LIMIT: 8
  before_script:
    - env
    - eval PUBLISH_COMMIT=$PUBLISH_COMMIT
    - env
    - git fetch origin $PUBLISH_COMMIT
    - git checkout $PUBLISH_COMMIT
  script:
    - echo $PUBLISH_DRYRUN
    # Dry runs get a random .devNNNNNN pre-release suffix to avoid collisions
    # on the test index.
    - |
      if [ "$PUBLISH_DRYRUN" = "yes" ]; then
        PRE_RELEASE=$(sed -n "s/.*PRE_RELEASE = '\(.*\)'/\1/p" megatron/core/package_info.py)
        sed -i "/^PRE_RELEASE/c\PRE_RELEASE = '${PRE_RELEASE}.dev$((RANDOM % 900000 + 100000))'" megatron/core/package_info.py
      fi
    - /opt/python/cp310-cp310/bin/python -m build
    - /opt/python/cp311-cp311/bin/python -m build
    # Repaired (manylinux-tagged) wheels land in wheelhouse/; drop the raw ones.
    - auditwheel repair dist/*.whl
    - rm -rf dist/*.whl
    - pushd megatron/core
    - EXPECTED_RELEASE_NUMBER=$(/opt/python/cp311-cp311/bin/python -c "import package_info; print(package_info.__version__)")
    - popd
    # Exported via dotenv so downstream jobs can verify the installed version.
    - echo "EXPECTED_RELEASE_NUMBER_$PLATFORM=$EXPECTED_RELEASE_NUMBER" | tee -a build.env
  artifacts:
    paths:
      - megatron/core/package_info.py
      - wheelhouse/
      - dist/
    reports:
      dotenv: build.env
  retry:
    max: 2
# Installs the built wheels and checks the installed version matches the build.
publish:test_release_pypi_test_wheel:
  extends: [.test_rules]
  stage: publish
  image:
    name: python:3.11
    entrypoint: [""]
  needs:
    - job: publish:test_release_pypi_build_wheel
      optional: true
  parallel:
    matrix:
      - PLATFORM: arm64
      - PLATFORM: amd64
  services:
    - name: docker:24.0.5-dind
      variables:
        HEALTHCHECK_TCP_PORT: "2376"
  tags:
    - arch/${PLATFORM}
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/builder-small
    - team/megatron
  variables:
    KUBERNETES_SERVICE_MEMORY_REQUEST: 16Gi
    KUBERNETES_SERVICE_MEMORY_LIMIT: 16Gi
    KUBERNETES_SERVICE_CPU_REQUEST: 4
    KUBERNETES_SERVICE_CPU_LIMIT: 8
    GIT_STRATEGY: none
    PUBLISH_DRYRUN: "yes"
  script:
    # Remove the source tree so the import resolves to the installed wheel.
    - rm -rf megatron
    - pip install -U --no-cache-dir pip
    - |
      if [[ "$PLATFORM" == "arm64" ]]; then
        for file in wheelhouse/*cp311*aarch64.whl; do
          pip install --no-cache-dir "$file[dev,mlm]"
        done
      else
        for file in wheelhouse/*cp311*x86_64.whl; do
          pip install --no-cache-dir "$file[dev,mlm]"
        done
      fi
    - RELEASE_NUMBER=$(python -c "from megatron import core; print(core.__version__)")
    # Compare against the version exported by the build job's dotenv report.
    - |
      if [[ "$PLATFORM" == "arm64" ]]; then
        test "$EXPECTED_RELEASE_NUMBER_arm64" == "$RELEASE_NUMBER"
      else
        test "$EXPECTED_RELEASE_NUMBER_amd64" == "$RELEASE_NUMBER"
      fi
    - echo "RELEASE_NUMBER=$RELEASE_NUMBER" | tee -a build.env
  artifacts:
    reports:
      dotenv: build.env
    paths:
      - wheelhouse/
      - dist/
  retry:
    max: 2
# Uploads wheels and sdists via twine (TestPyPI on dry runs, PyPI otherwise).
publish:test_release_pypi_push_wheel:
  extends: [.test_rules]
  image: python:3.11
  stage: publish
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  needs:
    - job: publish:test_release_pypi_test_wheel
      optional: true
  variables:
    GIT_STRATEGY: none
    PUBLISH_DRYRUN: "yes"
  timeout: 3m
  script:
    - echo $PUBLISH_DRYRUN
    # Fix: the password variable was misspelled TWINE_PASSWORT throughout;
    # renamed to TWINE_PASSWORD so twine's standard environment variable is
    # honored as well as the explicit -p flag below.
    - |
      if [ "$PUBLISH_DRYRUN" = "yes" ]; then
        REPOSITORY=testpypi
        export TWINE_USERNAME=$TWINE_TEST_USERNAME
        export TWINE_PASSWORD=$TWINE_TEST_PASSWORD
      else
        REPOSITORY=pypi
        export TWINE_USERNAME=$TWINE_PROD_USERNAME
        export TWINE_PASSWORD=$TWINE_PROD_PASSWORD
      fi
    - ls -al dist/
    - ls -al wheelhouse/
    - pip install twine
    # Dry runs stop short of the actual upload.
    - |
      if [[ "$PUBLISH_DRYRUN" != "yes" ]]; then
        twine upload --verbose -u $TWINE_USERNAME -p $TWINE_PASSWORD --repository $REPOSITORY wheelhouse/* dist/*
      fi
# Creates (or, on dry runs, just prints) the GitHub release for this version.
publish:test_release_github:
  extends: [.test_rules]
  needs: [publish:test_release_pypi_test_wheel]
  stage: publish
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  image: nentangso/alpine-git-curl-jq
  before_script:
    - eval PUBLISH_COMMIT=$PUBLISH_COMMIT
    - git fetch origin $PUBLISH_COMMIT
    - git checkout $PUBLISH_COMMIT
  variables:
    PUBLISH_DRYRUN: "yes"
  script:
    - echo $PUBLISH_DRYRUN
    - NAME="NVIDIA Megatron Core $RELEASE_NUMBER"
    - IS_PRERELEASE=$([[ "$RELEASE_NUMBER" == *rc* ]] && echo "true" || echo "false")
    # Release notes: synthesized for prereleases, extracted from CHANGELOG.md
    # (the section headed by $NAME) otherwise.
    - |
      if [[ "$IS_PRERELEASE" == "true" ]]; then
        DATE=$(date +"%Y-%m-%d")
        CHANGELOG="Prerelease: $NAME ($DATE)"
      else
        CHANGELOG=$(awk '/^## '"$NAME"'/{flag=1; next} /^## /{flag=0} flag' CHANGELOG.md)
        CHANGELOG=$(echo "$CHANGELOG" | sed '/./!d')
      fi
    # Build the GitHub release payload with jq so all values are JSON-escaped.
    - |
      PAYLOAD=$(jq -nc \
        --arg TAG_NAME "core_v${RELEASE_NUMBER}" \
        --arg CI_COMMIT_SHA "$PUBLISH_COMMIT" \
        --arg NAME "$NAME" \
        --arg BODY "$CHANGELOG" \
        --argjson PRERELEASE "$IS_PRERELEASE" \
        '{
          "tag_name": $TAG_NAME,
          "target_commitish": $CI_COMMIT_SHA,
          "name": $NAME,
          "body": $BODY,
          "draft": false,
          "prerelease": $PRERELEASE,
          "generate_release_notes": false
        }'
      )
      echo -E "$PAYLOAD" | tee -a payload.txt
    - cat payload.txt
    # The curl command is kept as a string so it can be echoed on dry runs
    # and eval'd on real runs.
    - |
      CMD=$(echo -E 'curl -L \
        -X POST \
        -H "Accept: application/vnd.github+json" \
        -H "Authorization: Bearer '"$GH_TOKEN"'" \
        -H "X-GitHub-Api-Version: 2022-11-28" \
        https://api.github.com/repos/NVIDIA/Megatron-LM/releases \
        -d @payload.txt
      ')
    - |
      if [[ "$PUBLISH_DRYRUN" == "yes" ]]; then
        echo -E "$CMD"
      else
        eval "$CMD"
      fi
# Announces the release on Slack (or prints the curl command on dry runs).
publish:test_release_notify:
  needs: [publish:test_release_pypi_test_wheel, publish:test_release_pypi_push_wheel, publish:test_release_github]
  extends: [.test_rules]
  image: badouralix/curl-jq
  stage: publish
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  variables:
    PUBLISH_DRYRUN: "yes"
  script:
    - echo $PUBLISH_DRYRUN
    - URL="https://github.com/NVIDIA/Megatron-LM/releases/tag/core_v$RELEASE_NUMBER"
    # Slack Block Kit message linking to the GitHub release.
    - |
      cat << EOF > message.json
      {
        "blocks": [
          {
            "type": "section",
            "text": {
              "type": "mrkdwn",
              "text": "Releasebot 🤖: Megatron-Core released <${URL}|core_v${RELEASE_NUMBER}> 🚀"
            }
          }
        ]
      }
      EOF
    - cat message.json
    - |
      CMD=$(echo curl \
        -X POST \
        -H "Content-type: application/json" \
        -d @message.json ${MCORE_NOTIFICATION_HOOK_MAIN}
      )
      if [[ "$PUBLISH_DRYRUN" == "yes" ]]; then
        echo "$CMD"
      else
        eval "$CMD"
      fi
# Opens a version-bump MR after a release (or prints the commands on dry runs).
publish:test_release_version_bump:
  needs: [publish:test_release_pypi_test_wheel, publish:test_release_pypi_push_wheel, publish:test_release_github]
  extends: [.test_rules]
  image: nentangso/alpine-git-curl-jq
  stage: publish
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  before_script:
    - eval PUBLISH_COMMIT=$PUBLISH_COMMIT
    - eval PUBLISH_VERSION_BUMP_BRANCH=$PUBLISH_VERSION_BUMP_BRANCH
    - git fetch origin $PUBLISH_COMMIT
    - git checkout $PUBLISH_COMMIT
  variables:
    PUBLISH_DRYRUN: "yes"
  script:
    - env
    - echo $PUBLISH_DRYRUN
    # Parse the current version components out of package_info.py.
    - MAJOR=$(cat megatron/core/package_info.py | awk '/^MAJOR = /' | awk -F"= " '{print $2}')
    - MINOR=$(cat megatron/core/package_info.py | awk '/^MINOR = /' | awk -F"= " '{print $2}')
    - PATCH=$(cat megatron/core/package_info.py | awk '/^PATCH = /' | awk -F"= " '{print $2}')
    - PRERELEASE=$(cat megatron/core/package_info.py | awk '/^PRE_RELEASE = /' | awk -F"= " '{print $2}' | tr -d '"' | tr -d "'")
    # rc releases bump the rc counter; final releases bump PATCH and clear the
    # pre-release tag.
    - |
      if [[ "$PRERELEASE" != "" ]]; then
        NEXT_PATCH=$PATCH
        NEXT_PRERELEASE=rc$((${PRERELEASE#rc} + 1))
      else
        NEXT_PATCH=$((${PATCH} + 1))
        # Fix: was a no-op self-assignment (NEXT_PRERELEASE=$NEXT_PRERELEASE),
        # which could leak an externally-set value into package_info.py.
        NEXT_PRERELEASE=""
      fi
    - sed -i "/^PATCH/c\PATCH = $NEXT_PATCH" megatron/core/package_info.py
    - sed -i "/^PRE_RELEASE/c\PRE_RELEASE = '$NEXT_PRERELEASE'" megatron/core/package_info.py
    - git config --global user.email "mcore-bot@nvidia.com"
    - git config --global user.name "Mcore Bot"
    - git remote set-url origin "https://gitlab-ci-token:${PAT}@${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git"
    # The command block is a quoted heredoc so it stays literal: it is echoed
    # on dry runs and eval'd (expanding the variables) on real runs.
    # Fix: the EOF terminator must sit at the block scalar's base indent —
    # after YAML strips the common indent it becomes flush-left, which a
    # plain (non-<<-) heredoc requires to terminate. Also fixed the commit
    # message, which previously read "chore: adjust version version".
    - |
      CMD=$(
        cat <<'EOF'
        git switch --force-create bot/chore/bump-version && \
        git add megatron/core/package_info.py && \
        git commit -m "chore: adjust version" && \
        git push -f -u origin bot/chore/bump-version && \
        curl \
          --header "PRIVATE-TOKEN: $PAT" \
          --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests" \
          -d "source_branch=bot/chore/bump-version" \
          -d "target_branch=$PUBLISH_VERSION_BUMP_BRANCH" \
          -d "title=chore: Fix version of \`$PUBLISH_VERSION_BUMP_BRANCH\`" \
          -d "description=[🤖]: Hi @okoenig 👋,<br><br>we've adjusted the version number of \`$PUBLISH_VERSION_BUMP_BRANCH\` for you! 🚀<br><br>Please review and approve this cherry pick by your convenience\!"
      EOF
      )
    - |
      if [[ "$PUBLISH_DRYRUN" == "yes" ]]; then
        echo "$CMD"
      else
        eval "$CMD"
      fi
# Code freeze: cuts the core_r$VERSION release branch from the default branch,
# announces it on Slack, and opens a version-bump MR against the new branch.
publish:code_freeze:
  extends: [.publish_common_freeze]
  image: ${CI_MCORE_LTS_IMAGE}:${CI_PIPELINE_ID}
  needs: [test:build_image]
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  script:
    - git fetch origin $CI_DEFAULT_BRANCH
    - git config --global user.email "mcore-bot@nvidia.com"
    - git config --global user.name "Mcore Bot"
    - git remote set-url origin "https://gitlab-ci-token:${PAT}@${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git"
    # Clear the pre-release tag so the frozen branch carries the final version.
    - sed -i "/^PRE_RELEASE/c\PRE_RELEASE = ''" megatron/core/package_info.py
    - VERSION=$(python -c "from megatron import core; print(core.__version__)")
    - RELEASE_BRANCH=core_r$VERSION
    - git switch --force-create $RELEASE_BRANCH origin/$CI_DEFAULT_BRANCH
    - git push -u origin $RELEASE_BRANCH
    - |
      MESSAGE='{
        "blocks": [
          {
            "type": "section",
            "text": {
              "type": "mrkdwn",
              "text": "Releasebot 🤖: Megatron Core has been frozen 🎉 to branch `'"$RELEASE_BRANCH"'`"
            }
          }
        ]
      }'
    - |
      curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${MCORE_NOTIFICATION_HOOK_MAIN}
    - git switch main
    - git switch --force-create bot/chore/bump-version
    - git add megatron/core/package_info.py
    # Fix: commit message previously read "chore: adjust version version".
    - |
      git commit -m "chore: adjust version"
    - git push -u origin bot/chore/bump-version
    # Open the version-bump MR against the freshly cut release branch.
    - |
      curl \
        --header "PRIVATE-TOKEN: $PAT" \
        --url https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests \
        -d "source_branch=bot/chore/bump-version" \
        -d "target_branch=$RELEASE_BRANCH" \
        -d "title=chore: Fix version of \`$RELEASE_BRANCH\`" \
        -d "description=[🤖]: Hi @okoenig 👋,<br><br>we've adjusted the version number of \`$RELEASE_BRANCH\` for you! 🚀<br><br>Please review and approve this cherry pick by your convenience\!"
# Real (non-dry-run) publish jobs: reuse the test_release definitions with
# PUBLISH_DRYRUN disabled, gated by the manual .publish_common_release rules.
publish:release_pypi_build_wheel:
  extends: [publish:test_release_pypi_build_wheel, .publish_common_release]
  dependencies: []
  variables:
    PUBLISH_DRYRUN: "no"
publish:release_pypi_test_wheel:
  extends: [publish:test_release_pypi_test_wheel, .publish_common_release]
  needs: [publish:release_pypi_build_wheel]
  variables:
    PUBLISH_DRYRUN: "no"
publish:release_pypi_push_wheel:
  extends: [publish:test_release_pypi_push_wheel, .publish_common_release]
  needs: [publish:release_pypi_test_wheel]
  dependencies: [publish:release_pypi_test_wheel]
  variables:
    PUBLISH_DRYRUN: "no"
publish:release_github:
  extends: [publish:test_release_github, .publish_common_release]
  dependencies: [publish:release_pypi_test_wheel]
  needs: [publish:release_pypi_test_wheel]
  variables:
    PUBLISH_DRYRUN: "no"
publish:release_version_bump:
  needs: [publish:release_pypi_test_wheel]
  extends: [publish:test_release_version_bump, .publish_common_release]
  variables:
    PUBLISH_DRYRUN: "no"
publish:release_notify:
  needs: [publish:release_pypi_test_wheel, publish:release_pypi_push_wheel, publish:release_github]
  extends: [publish:test_release_notify, .publish_common_release]
  dependencies: [publish:release_pypi_test_wheel]
  variables:
    PUBLISH_DRYRUN: "no"
# Bumps the megatron-lm submodule pointer inside the external documentation repo.
publish:docs:
  extends: [.publish_common_release]
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  before_script:
    - eval PUBLISH_COMMIT=$PUBLISH_COMMIT
    - git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*'
    - git fetch origin $PUBLISH_COMMIT
    - git checkout $PUBLISH_COMMIT
  script:
    - cd ..
    - rm -rf documentation && git clone --recursive https://gitlab-ci-token:${PAT}@${GITLAB_ENDPOINT}/nemo-megatron-core-tme/documentation.git
    # Check out the published commit inside the submodule, then commit the
    # updated pointer in the documentation repo.
    - cd documentation/megatron-lm
    - git config --global user.email "mcore-bot@nvidia.com"
    - git config --global user.name "Mcore Bot"
    - git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*'
    - git fetch origin $PUBLISH_COMMIT
    - git checkout $PUBLISH_COMMIT
    - cd ..
    - git add megatron-lm
    - |
      git commit -m 'feat: Bump mcore'
    - git push
  rules:
    # NOTE(review): these rules replace the manual gate inherited from
    # .publish_common_release — confirm that is intentional.
    - if: '$CI_COMMIT_REF_PROTECTED == "true" && $CI_PIPELINE_SOURCE == "push"'
      allow_failure: true
    - when: never
# Uploads test statistics from this pipeline to the results dashboard.
publish:upload_statistics:
  stage: publish
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  needs:
    - job: test:unit_tests_pyt(DEV)_mcore(latest)
    - job: test:unit_tests_pyt(LTS)_mcore(latest)
    - job: functional:run_lts_dgx_a100
      optional: true
    - job: functional:run_lts_dgx_h100
      optional: true
    - job: functional:run_dev_dgx_a100
      optional: true
    - job: functional:run_dev_dgx_h100
      optional: true
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  script:
    - env
    - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
    - export GITLAB_ENDPOINT
    - export DASHBOARD_ENDPOINT
    - python tests/test_utils/python_scripts/dashboard.py --pipeline-id ${CI_PIPELINE_ID}
  rules:
    - if: ($CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' || $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train') && ($UNIT_TEST == "yes" || $INTEGRATION_TEST == "yes" || $FUNCTIONAL_TEST == "yes")
      when: always
      allow_failure: true
    - when: never
# Scheduled Slack reminder about merge requests awaiting review.
# NOTE(review): the prefix is "public:" unlike the "publish:" siblings —
# possibly a typo; renaming would change the job name, so it is left as-is.
public:review_reminder:
  stage: publish
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  script:
    - export GITLAB_ENDPOINT
    - export RO_API_TOKEN=${PAT}
    - export SLACK_WEBHOOK_URL=${SLACK_REMINDER_HOOK}
    - export SLACK_API_TOKEN=${SLACK_API_TOKEN}
    - python tests/test_utils/python_scripts/auto_reminder.py
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  rules:
    - if: $CI_COMMIT_BRANCH == "ci-review-reminder" && $PUBLISH == "yes" && $PUBLISH_SCOPE == "review-reminder"
    - when: never
# --- Web-UI residue below (not CI configuration); commented out so the file stays parseable ---
# Markdown is supported
# 0% or .
# You are about to add 0 people to the discussion. Proceed with caution.
# Finish editing this message first!
# Please register or to comment