Commit 1106877d authored by jerrrrry's avatar jerrrrry
Browse files

“13.0”

parents
Pipeline #2934 failed with stages
in 0 seconds
__pycache__
*.bak
*.log
[submodule "Megatron-LM"]
path = Megatron-LM
url = https://github.com/NVIDIA/Megatron-LM.git
branch = main
[submodule]
Megatron-LM = main
[flake8]
max-line-length = 100
extend-ignore = E203,E501,F401,E402,E714
per-file-ignores = __init__.py:F401
\ No newline at end of file
---
name: BUG
about: Report a bug that needs attention
title: "[BUG]"
labels: ''
assignees: ''
---
**Describe the bug**
A clear and concise description of what the bug is.
**To Reproduce**
Steps to reproduce the behavior. The easier it is to reproduce the faster it will get maintainer attention.
**Expected behavior**
A clear and concise description of what you expected to happen.
**Stack trace/logs**
If applicable, add the stack trace or logs from the time of the error.
**Environment (please complete the following information):**
- Megatron-LM commit ID
- PyTorch version
- CUDA version
- NCCL version
**Proposed fix**
If you have a proposal for how to fix the issue state it here or link to a PR.
**Additional context**
Add any other context about the problem here.
---
name: ENHANCEMENT
about: Suggest an idea to improve this project
title: "[ENHANCEMENT]"
labels: ''
assignees: ''
---
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Proposed implementation**
If you have a proposed implementation for the feature state it here or link to a PR.
**Additional context**
Add any other context or screenshots about the feature request here.
---
name: QUESTION
about: Ask a question about Megatron-LM that is not a bug, regression or enhancement request
title: "[QUESTION]"
labels: ''
assignees: ''
---
**Your question**
Ask a clear and concise question about Megatron-LM.
---
name: REGRESSION
about: Report a regression in speed or accuracy due to a Megatron-LM update
title: "[REGRESSION]"
labels: ''
assignees: ''
---
**Describe the regression**
A clear and concise description of what the regression is.
**To Reproduce**
Steps to reproduce the behavior. The easier it is to reproduce the faster it will get maintainer attention.
**Previous performance**
What speed or accuracy did you previously see.
**New performance**
What speed or accuracy do you see after the update.
**Stack trace/logs**
If applicable, add the stack trace or logs related to the regression.
**Environment (please complete the following information):**
- Previous Megatron-LM commit ID
- New Megatron-LM commit ID
- Previous PyTorch version
- New PyTorch version
- Previous CUDA version
- New CUDA version
- Previous NCCL version
- New NCCL version
**Proposed fix**
If you have a proposal for how to fix the issue state it here or link to a PR.
**Additional context**
Add any other context about the problem here.
# This workflow marks issues and PRs that have had no activity for a specified amount of time as stale (closing is disabled below via days-before-close: -1).
#
# You can adjust the behavior by modifying this file.
# For more information, see:
# https://github.com/actions/stale
name: Mark stale issues and pull requests
on:
schedule:
- cron: '15 18 * * *'
jobs:
stale:
runs-on: ubuntu-latest
permissions:
issues: write
pull-requests: write
steps:
- uses: actions/stale@v5
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
days-before-stale: 60
stale-issue-message: 'Marking as stale. No activity in 60 days.'
stale-pr-message: 'Marking as stale. No activity in 60 days.'
stale-issue-label: 'stale'
stale-pr-label: 'stale'
remove-stale-when-updated: true
operations-per-run: 1000
days-before-close: -1
__pycache__
*.so
build
.coverage_*
*.egg-info
*~
slurm*
logs
.vscode
local/
.gitmodules
wandb/
onelogger.log
onelogger.err
.venv/
.merge_train_rule: &merge_train_rule
UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
INTEGRATION_TEST: "yes"
INTEGRATION_TEST_SCOPE: mr
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: mr-slim
FUNCTIONAL_TEST_REPEAT: 5
FUNCTIONAL_TEST_TIME_LIMIT: 2700
CLUSTER_A100: ""
CLUSTER_H100: ""
PUBLISH: "no"
workflow:
rules:
# Do not trigger for forks
- if: $CI_PROJECT_NAMESPACE != "ADLR" || ($CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_PROJECT_PATH != "ADLR/megatron-lm")
when: never
# ci-branches only for schedule
- if: $CI_COMMIT_BRANCH =~ /ci-/ && $CI_PIPELINE_SOURCE != "schedule"
when: never
# For schedules pipelines
- if: $CI_PIPELINE_SOURCE == "schedule"
auto_cancel:
on_new_commit: none
# For manual pipelines
- if: $CI_PIPELINE_SOURCE == "web"
# For push to main
- if: $CI_PIPELINE_SOURCE == 'push' && $CI_COMMIT_REF_PROTECTED == "true"
variables:
UNIT_TEST: "no"
INTEGRATION_TEST: "no"
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: mr
FUNCTIONAL_TEST_REPEAT: 5
FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no"
FUNCTIONAL_TEST_TIME_LIMIT: 2700
CLUSTER_A100: ""
CLUSTER_H100: ""
PUBLISH: "no"
auto_cancel:
on_new_commit: none
# For merge-trains that need to be fast-tracked
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train' && $CI_MERGE_REQUEST_LABELS =~ /fast-track/
variables:
UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
INTEGRATION_TEST: "no"
FUNCTIONAL_TEST: "no"
CLUSTER_A100: ""
CLUSTER_H100: ""
PUBLISH: "no"
# For normal merge-trains
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train'
variables: *merge_train_rule
# For MRs with integration suite
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run tests/
variables: *merge_train_rule
# For MRs with nightly
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run nightly/
variables:
UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
INTEGRATION_TEST: "no"
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: nightly
FUNCTIONAL_TEST_REPEAT: 5
FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no"
FUNCTIONAL_TEST_TIME_LIMIT: 2700
CLUSTER_A100: ""
CLUSTER_H100: ""
PUBLISH: "no"
# For MRs with weekly
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run weekly/
variables:
UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
INTEGRATION_TEST: "no"
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: weekly
FUNCTIONAL_TEST_REPEAT: 1
FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no"
FUNCTIONAL_TEST_TIME_LIMIT: 9000
CLUSTER_A100: ""
CLUSTER_H100: ""
PUBLISH: "no"
# For MRs with heavy suite
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run functional tests/
variables:
UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
INTEGRATION_TEST: "no"
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: mr
FUNCTIONAL_TEST_REPEAT: 5
FUNCTIONAL_TEST_TIME_LIMIT: 2700
CLUSTER_A100: ""
CLUSTER_H100: ""
PUBLISH: "no"
# Default MRs
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result'
variables:
UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
INTEGRATION_TEST: "no"
FUNCTIONAL_TEST: "no"
PUBLISH: "no"
- when: never
auto_cancel:
on_new_commit: interruptible
stages:
- build
- test
- integration_tests
- functional_tests
- publish
default:
interruptible: true
retry:
max: 2
when: runner_system_failure
variables:
UNIT_TEST:
value: "yes"
options:
- "yes"
- "no"
description: To run the unit test suite
UNIT_TEST_REPEAT:
value: "1"
description: "Number of repetitions"
UNIT_TEST_TIMEOUT:
value: "30"
description: Timeout (minutes) for Unit tests (all repeats)
INTEGRATION_TEST:
value: "yes"
options:
- "yes"
- "no"
description: To run the integration test suite
INTEGRATION_TEST_SCOPE:
value: "mr"
options:
- "mr"
- "nightly"
- "weekly"
- "pre-release"
- "release"
description: "Testsuite to run (only for INTEGRATION_TEST=yes)"
INTEGRATION_TEST_TIME_LIMIT:
value: "900"
description: "Timeout in seconds per test"
INTEGRATION_TEST_CASES:
value: "all"
description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite."
FUNCTIONAL_TEST:
value: "yes"
options:
- "yes"
- "no"
description: To run the functional test suite
FUNCTIONAL_TEST_SCOPE:
value: "mr"
options:
- "mr"
- "nightly"
- "weekly"
- "pre-release"
- "release"
description: "Testsuite to run (only for FUNCTIONAL_TEST=yes)"
FUNCTIONAL_TEST_REPEAT:
value: "5"
description: "Number of repetitions per test"
FUNCTIONAL_TEST_TIME_LIMIT:
value: "2700"
description: "Timeout in seconds per test"
FUNCTIONAL_TEST_CASES:
value: "all"
description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite."
FUNCTIONAL_TEST_NAME:
description: "Name of functional test run (only for pre-release and release)"
value: "$$CI_COMMIT_SHA"
FUNCTIONAL_TEST_RECORD_CHECKPOINTS:
value: "no"
description: "Record golden checkpoints"
options:
- "yes"
- "no"
CLUSTER_A100:
value: "dgxa100_dracooci"
options:
- "dgxa100_dracooci"
- "dgxa100_dracooci-ord"
description: "Cluster for A100 workloads"
CLUSTER_H100:
value: "dgxh100_coreweave"
options:
- "dgxh100_coreweave"
- "dgxh100_eos"
description: "Cluster for H100 workloads"
PUBLISH:
value: "no"
options:
- "yes"
- "no"
description: Build and publish a wheel to PyPi
PUBLISH_COMMIT:
value: "$$CI_COMMIT_SHA"
description: Which commit to publish
PUBLISH_VERSION_BUMP_BRANCH:
value: "$$CI_COMMIT_BRANCH"
description: Which branch to target for version bump
PUBLISH_SCOPE:
value: "code-freeze"
options:
- "code-freeze"
- "release"
- "review-reminder"
description: Type of publish (freeze or final release)
# CI wide variables
CI_MCORE_LTS_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_lts
CI_MCORE_DEV_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_dev
CI_NEMO_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/nemo_ci
UTILITY_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_utility
TE_GIT_REF: ""
include:
- .gitlab/stages/00.pre.yml
- .gitlab/stages/01.build.yml
- .gitlab/stages/02.test.yml
- .gitlab/stages/03.integration-tests.yml
- .gitlab/stages/04.functional-tests.yml
- .gitlab/stages/05.publish.yml
CI:
- .gitlab-ci.yml
- Dockerfile.ci.lts
- Dockerfile.ci.dev
- .github/**
- .gitlab/**
Datasets:
- megatron/core/datasets/**
BERT:
- megatron/core/models/bert/**
GPT:
- megatron/core/models/gpt/**
RETRO:
- megatron/core/models/retro/**
Dist-Ckpt:
- megatron/core/dist_checkpointing
Dist-Opt:
- megatron/core/optimizer/distrib_optimizer
Inference:
- megatron/core/inference
MoE:
- megatron/core/transformer/moe
Tests:
- tests/**
ParallelState:
- megatron/core/parallel_state.py
#! /bin/bash
# CI image build script: builds and pushes a Docker image with buildx, using
# registry-backed layer caching. Expects the GitLab CI job to set: IMAGE (the
# NAME of an env var holding the target image ref), STAGE, FILE, BASE_IMAGE,
# ARTIFACTORY_USER/ARTIFACTORY_TOKEN, the usual CI_* variables, and optionally
# TE_GIT_REF.
set -x
# Dump the environment for debugging. NOTE(review): this prints all job
# variables to the log — confirm masked variables stay masked.
env
# IMAGE holds the *name* of another variable (e.g. "CI_MCORE_LTS_IMAGE");
# resolve it to that variable's value.
eval "IMAGE=\$$IMAGE"
docker context create tls-environment
docker buildx create --name container --driver=docker-container --use tls-environment
# Extra docker-build flags, accumulated per branch/pipeline type below.
ADDITIONAL_PARAMS=()
# On main or the image-rebuild branch: refresh base layers (--pull) and push
# both the build cache and the ":main" tag.
if [[ "$CI_COMMIT_BRANCH" == "ci-rebuild-mcore-nemo-image" || "$CI_COMMIT_BRANCH" == "main" ]]; then
ADDITIONAL_PARAMS+=("--pull")
ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:main,mode=max")
ADDITIONAL_PARAMS+=("-t ${IMAGE}:main")
# For merge requests: per-MR cache and per-MR image tag.
elif [[ -n "$CI_MERGE_REQUEST_IID" ]]; then
ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID},mode=max")
ADDITIONAL_PARAMS+=("-t ${IMAGE}:${CI_MERGE_REQUEST_IID}")
fi
# Nightly branch additionally tags the image ":nightly".
if [[ "$CI_COMMIT_BRANCH" == "ci-nightly" ]]; then
ADDITIONAL_PARAMS+=("-t ${IMAGE}:nightly")
fi
# Optionally pin a git ref via the TE_COMMIT build arg (presumably
# TransformerEngine — TODO confirm against docker/$FILE).
if [[ -n "$TE_GIT_REF" ]]; then
ADDITIONAL_PARAMS+=("--build-arg TE_COMMIT=${TE_GIT_REF}")
fi
# Log the commit being built.
echo $(git rev-parse HEAD)
# Resolve the newest published jet-api version by scraping the Artifactory
# PyPI simple index and sorting the version directory names.
JET_API_VERSION=$(curl -s -u "$ARTIFACTORY_USER:$ARTIFACTORY_TOKEN" "https://sc-hw-artf.nvidia.com/artifactory/api/pypi/hw-joc-pypi/simple/jet-api/" | grep -o 'href="../../jet-api/[0-9.]*/' | sed 's|href="../../jet-api/||;s|/||' | sort -V -r | head -n1)
# Build and push. ADDITIONAL_PARAMS is intentionally left unquoted: each
# element is a "flag value" pair that must word-split into two arguments.
DOCKER_BUILDKIT=1 docker build \
--secret id=JET_INDEX_URLS \
--secret id=LOGGER_INDEX_URL \
--secret id=EXPERIMENTAL_FLASH_ATTN \
--target $STAGE \
-f docker/$FILE \
-t ${IMAGE}:${CI_PIPELINE_ID} \
--builder=container \
--build-arg JET_API_VERSION=$JET_API_VERSION \
--cache-from type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID} \
--cache-from type=registry,ref=${IMAGE}-buildcache:main \
--build-arg FROM_IMAGE_NAME=$BASE_IMAGE \
--push \
--progress plain \
${ADDITIONAL_PARAMS[@]} .
# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/usr/bin/env python3
"""
Import checker script for megatron.hub package.
This script recursively discovers all Python modules in the specified package
and attempts to import them, reporting any import errors.
"""
import importlib
import os
import sys
import traceback
from typing import Dict, List, Tuple
import click
class ImportChecker:
    """Check imports for all modules in a package.

    Recursively discovers every Python module under ``package_name`` and
    attempts to import each one, tallying successes, graceful failures
    (imports whose traceback mentions ``UnavailableError``), and hard
    failures, then prints a summary.
    """

    def __init__(self, package_name: str = "megatron.core", verbose: bool = False):
        """Initialize counters and make the current directory importable.

        Args:
            package_name: Dotted name of the package to scan.
            verbose: Accepted for API compatibility; currently unused.
        """
        self.package_name = package_name
        self.success_count = 0
        self.failure_count = 0
        self.graceful_count = 0
        self.skipped_count = 0
        self.failures: Dict[str, str] = {}
        self.successes: List[str] = []
        self.graceful_failures: Dict[str, str] = {}
        self.skipped: List[str] = []
        # Modules to skip (known problematic ones). A module is skipped when
        # any of these substrings occurs anywhere in its dotted name.
        self.skip_patterns = {
            "__pycache__",
            ".pytest_cache",
            ".git",
            "test_",
            "_test",
        }
        # Add current directory to Python path (if not already there) so the
        # target package resolves when running from the repository root.
        current_dir = os.getcwd()
        if current_dir not in sys.path:
            sys.path.insert(0, current_dir)

    def should_skip_module(self, module_name: str) -> bool:
        """Return True if any skip pattern occurs in ``module_name``."""
        return any(pattern in module_name for pattern in self.skip_patterns)

    def discover_modules(self, package_path: str) -> List[str]:
        """Discover all Python modules in the given package path.

        Args:
            package_path: Importable dotted name of the package to walk.

        Returns:
            Sorted, de-duplicated list of fully-qualified module names,
            each prefixed with ``self.package_name``.
        """
        modules = []
        package = importlib.import_module(package_path)
        package_root = package.__path__[0]
        # Walk through all Python files under the package directory.
        for root, dirs, files in os.walk(package_root):
            # Skip hidden directories and __pycache__ in-place so os.walk
            # does not descend into them.
            dirs[:] = [d for d in dirs if not d.startswith(".") and d != "__pycache__"]
            for file in files:
                if file.endswith(".py") and not file.startswith("."):
                    # Convert file path to module name. Strip only the
                    # trailing ".py" suffix — a plain str.replace(".py", "")
                    # would also corrupt names containing ".py" elsewhere
                    # (e.g. "foo.pyutils.py").
                    rel_path = os.path.relpath(os.path.join(root, file), package_root)
                    module_parts = rel_path[: -len(".py")].replace(os.sep, ".")
                    # An __init__.py denotes the (sub)package itself.
                    if module_parts == "__init__":
                        module_parts = ""
                    elif module_parts.endswith(".__init__"):
                        module_parts = module_parts[: -len(".__init__")]
                    full_module_name = (
                        f"{self.package_name}.{module_parts}"
                        if module_parts
                        else self.package_name
                    )
                    if not self.should_skip_module(full_module_name):
                        modules.append(full_module_name)
        # Remove duplicates and sort for stable output.
        return sorted(set(modules))

    def import_module(self, module_name: str) -> Tuple[str, str]:
        """
        Try to import a module and return success status and error message.

        Returns:
            Tuple of (status: str, error_message: str)
            status can be: "success", "graceful", or "failed"
        """
        try:
            # Drop any cached entry so repeated checks perform a real import.
            if module_name in sys.modules:
                del sys.modules[module_name]
            importlib.import_module(module_name)
            return "success", ""
        except Exception:
            tb = traceback.format_exc()
            # An "UnavailableError" in the traceback indicates an optional
            # dependency handled gracefully by the package itself.
            if "UnavailableError" in tb:
                return "graceful", "UnavailableError detected during import"
            return "failed", tb

    def check_all_imports(self) -> bool:
        """Check imports for all discovered modules and print a summary.

        Returns:
            True when no module produced a hard import failure.
        """
        print(f"Discovering modules in package '{self.package_name}'...")
        modules = self.discover_modules(self.package_name)
        if not modules:
            print("No modules found!")
            # Returned None originally (also falsy); False keeps the exit
            # code behavior while giving a consistent bool return type.
            return False
        print(f"Found {len(modules)} modules to check")
        print("=" * 60)
        for module_name in modules:
            status, error_msg = self.import_module(module_name)
            if status == "success":
                self.success_count += 1
                self.successes.append(module_name)
            elif status == "graceful":
                self.graceful_count += 1
                self.graceful_failures[module_name] = error_msg
            else:  # failed
                self.failure_count += 1
                self.failures[module_name] = error_msg
        self._print_summary()
        return self.failure_count == 0

    def _print_summary(self) -> None:
        """Print a summary of the import check results."""
        total = (
            self.success_count
            + self.failure_count
            + self.graceful_count
            + self.skipped_count
        )
        print("\n" + "=" * 60)
        print("IMPORT CHECK SUMMARY")
        print("=" * 60)
        print(f"Total modules checked: {total}")
        print(
            f"Successful imports: {self.success_count} ({self.success_count / total * 100:.1f}%)"
        )
        print(
            f"Gracefully handled: {self.graceful_count} ({self.graceful_count / total * 100:.1f}%)"
        )
        print(
            f"Failed imports: {self.failure_count} ({self.failure_count / total * 100:.1f}%)"
        )
        if self.skipped_count > 0:
            print(
                f"Skipped modules: {self.skipped_count} ({self.skipped_count / total * 100:.1f}%)"
            )
        if self.graceful_failures:
            print(f"\n🟡 GRACEFULLY HANDLED ({len(self.graceful_failures)}):")
            print("-" * 40)
        if self.failures:
            print(f"\n❌ FAILED IMPORTS ({len(self.failures)}):")
            print("-" * 40)
            for module_name, error_msg in self.failures.items():
                print(f"\n{module_name}")
                # Show only non-empty lines of the traceback to keep output
                # manageable.
                error_lines = error_msg.split("\n")
                for line in error_lines:
                    # if self.package_name.replace(".", os.sep) not in line:
                    #     continue
                    if line.strip():
                        print(f"  {line}")
@click.command()
@click.option(
    "--package-name",
    required=True,
    help="Package name to check imports for",
)
def main(package_name: str):
    """CLI entry point: run the import check over ``package_name``.

    Exits with status 0 when every module imports cleanly, 1 otherwise.
    """
    checker = ImportChecker(package_name=package_name)
    successful = checker.check_all_imports()
    # Use sys.exit rather than the exit() builtin: exit() is provided by the
    # site module for interactive use and is not guaranteed to exist
    # (e.g. under `python -S`).
    sys.exit(0 if successful else 1)


if __name__ == "__main__":
    main()
#!/bin/bash
# Prepare a "legacy" megatron-lm checkout for backwards-compatibility testing:
# clone the requested historical ref into ./megatron-lm-legacy, then overlay
# the current checkout's `megatron` package on top of it (so new code runs
# against the legacy test suite).
set -euxo pipefail
# Default values
MCORE_REPO="https://github.com/nvidia/megatron-lm.git"
# NOTE(review): MCORE_MR_COMMIT is assigned but never used below — possibly
# leftover or intended for callers; verify before removing.
MCORE_MR_COMMIT="main"
MCORE_BACKWARDS_COMMIT=""
# Print usage and exit non-zero. (Heredoc content is emitted verbatim.)
usage() {
cat <<EOF
Usage: $0 [OPTIONS]
Clone and setup megatron-lm repositories for testing.
Options:
--repo URL Git repository URL (default: $MCORE_REPO)
--backwards-commit COMMIT Commit hash or reference for the backwards compatibility test
--help Show this help message
Example:
$0 --repo $MCORE_REPO \\
--backwards-commit core_r0.12.0
EOF
exit 1
}
# Parse arguments
while [[ $# -gt 0 ]]; do
case $1 in
--repo)
MCORE_REPO="$2"
shift 2
;;
--backwards-commit)
MCORE_BACKWARDS_COMMIT="$2"
shift 2
;;
--help)
usage
;;
*)
echo "Unknown option: $1"
usage
;;
esac
done
# Validate required arguments
if [[ -z "${MCORE_BACKWARDS_COMMIT:-}" ]]; then
echo "Error: --backwards-commit is required"
usage
fi
# Checkout backwards-ref: fetch only the requested ref into a fresh repo
# rather than cloning the full history.
rm -rf megatron-lm-legacy
mkdir megatron-lm-legacy
pushd megatron-lm-legacy
git init
git remote add origin $MCORE_REPO
git fetch origin $MCORE_BACKWARDS_COMMIT
git checkout $MCORE_BACKWARDS_COMMIT
git rev-parse HEAD
# Replace the legacy `megatron` package with the one from the current
# checkout (../megatron-lm); the legacy tests remain in place.
rm -rf megatron
cp -a ../megatron-lm/megatron ./
popd
# Copy unit test script
cp megatron-lm/tests/unit_tests/run_ci_test.sh megatron-lm-legacy/tests/unit_tests/run_ci_test.sh
include:
- template: Security/Secret-Detection.gitlab-ci.yml
.pre_rules:
rules:
- if: $CI_PIPELINE_SOURCE == 'main'
allow_failure: true
when: always
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
allow_failure: true
when: always
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result'
when: always
- when: never
stage: .pre
.dind_rules:
image: docker:26.1.4-dind
variables:
DOCKER_HOST: unix:///var/run/docker.sock
before_script:
- docker system prune -a --filter "until=36h" -f || true
- echo "$NGC_API_KEY" | docker login nvcr.io -u '$oauthtoken' --password-stdin
- echo "$CI_REGISTRY_PASSWORD" | docker login $CI_REGISTRY -u $CI_REGISTRY_USER --password-stdin
pre:mirror_to_github:
rules:
- if: '$CI_COMMIT_REF_PROTECTED == "true" && $CI_PIPELINE_SOURCE == "push"'
allow_failure: true
- when: never
tags:
- arch/amd64
- env/prod
- origin/jet-fleet
- owner/jet-core
- purpose/utility
- team/megatron
stage: .pre
image: python:3.10
variables:
GIT_STRATEGY: "clone"
script:
- git checkout $CI_COMMIT_BRANCH
- git remote add github https://ko3n1g:$GH_TOKEN@github.com/NVIDIA/Megatron-LM.git || true
- git push -u github $CI_COMMIT_BRANCH
retry:
max: 2
pre:create_ci_branches:
rules:
- if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH && $CI_PIPELINE_SOURCE == "push"'
allow_failure: true
- when: never
parallel:
matrix:
- branch: ci-unit-test-extended
- branch: ci-rebuild-mcore-nemo-image
- branch: ci-mr
- branch: ci-nightly
- branch: ci-weekly
- branch: ci-pre-release
- branch: ci-review-reminder
tags:
- arch/amd64
- env/prod
- origin/jet-fleet
- owner/jet-core
- purpose/utility
- team/megatron
stage: .pre
image: python:3.10
variables:
GIT_STRATEGY: "clone"
script:
- git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/adlr/megatron-lm.git"
- git switch --force-create $branch
- git push --force -u origin $branch
retry:
max: 2
pre:label_merge_request:
extends: [.pre_rules]
image: golang:1.22
tags:
- arch/amd64
- env/prod
- origin/jet-fleet
- owner/jet-core
- purpose/utility
- team/megatron
before_script:
- git clone -b nv https://${GITLAB_ENDPOINT}/okoenig/gitlab-mr-labeler.git
- cd gitlab-mr-labeler
- go install .
- cd ..
- go install github.com/itchyny/gojq/cmd/gojq@latest
script:
- set -x
- |
LABELS=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}")
- LABELS=$(echo "$LABELS" | gojq '.labels -= ["ParallelState"]')
- |
if git --no-pager diff --merge-base origin/${CI_MERGE_REQUEST_TARGET_BRANCH_NAME} -- 'megatron/core/' | grep -q 'parallel_state'; then
LABELS=$(echo "$LABELS" | gojq '.labels += ["ParallelState"]')
echo "$LABELS"
fi
- echo LABELS=$(echo "$LABELS" | gojq '.labels | join(",")') > labels
- gitlab-mr-labeler -f .gitlab/labeler-config.yml -t ${PROJECT_ACCESS_TOKEN_MCORE} --debug true
- cat labels
after_script:
- |
source labels
curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" --data-urlencode "add_labels=$LABELS" -X PUT
pre:maybe_cherry_pick_commit:
rules:
- if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH && $CI_PIPELINE_SOURCE == "push"'
- when: never
tags:
- arch/amd64
- env/prod
- origin/jet-fleet
- owner/jet-core
- purpose/utility
- team/megatron
stage: .pre
image: nentangso/alpine-git-curl-jq
variables:
GIT_STRATEGY: "clone"
script:
- set -x
- set +e
- SHA=$(git rev-list --no-merges -n 1 HEAD)
- MESSAGE=$(git log -n 1 --pretty=format:%s $SHA)
- MR_ID=$(echo $MESSAGE | awk -F'!' '{print $2}' | awk '{print $1}' )
- git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git"
- git config --global user.email "mcore-bot@nvidia.com"
- git config --global user.name "Mcore Bot"
- |
MR=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${MR_ID}")
LABELS=$(echo -E $MR | jq '.labels | join(",")' | tr -d '"')
AUTHOR_ID=$(echo -E $MR | jq '.author.id' | tr -d '"')
AUTHOR_NAME=$(echo -E $MR | jq '.author.username' | tr -d '"')
TITLE=$(echo -E $MR | jq '.title' | tr -d '"')
MILESTONE_ID=$(echo -E $MR | jq '.milestone.id' | tr -d '"')
TARGET_BRANCHES=$(echo "$LABELS" | grep -o 'core_[^,]*')
if [[ $TARGET_BRANCHES == "" ]]; then
echo Nothing to cherry pick
exit 0
fi
echo $TARGET_BRANCHES | while read -r RELEASE_BRANCH ; do
TARGET_BRANCH_EXISTS_OK=$([[ "$(git ls-remote --heads origin refs/heads/$RELEASE_BRANCH)" != "" ]] && echo true || echo false)
if [[ "$TARGET_BRANCH_EXISTS_OK" == "false" ]]; then
echo Release branch does not yet exist, will not cherry-pick
continue
fi
(
git fetch origin $RELEASE_BRANCH:$RELEASE_BRANCH
git switch --force-create cherry-pick-$MR_ID-$RELEASE_BRANCH $RELEASE_BRANCH
git cherry-pick $SHA
git push -u origin --force cherry-pick-$MR_ID-$RELEASE_BRANCH
git checkout ${CI_DEFAULT_BRANCH:-main}
)
CHERRYPICK_SUCCESSFUL=$?
if [[ $CHERRYPICK_SUCCESSFUL -eq 0 ]]; then
curl \
--header "PRIVATE-TOKEN: $PAT" \
--url https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests \
-d "source_branch=cherry-pick-$MR_ID-$RELEASE_BRANCH" \
-d "target_branch=$RELEASE_BRANCH" \
-d "title=Cherry pick \`$TITLE ($MR_ID)\` into \`$RELEASE_BRANCH\`" \
-d "labels=cherry-pick" \
-d "reviewer_ids=$AUTHOR_ID" \
-d "milestone_id=$MILESTONE_ID" \
-d "description=[🤖]: Hi @$AUTHOR_NAME 👋,<br><br>we've cherry picked \`$TITLE ($MR_ID)\` into \`$RELEASE_BRANCH\` for you! 🚀<br><br>Please review and approve this cherry pick by your convenience\!"
else
URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/merge_requests/$MR_ID
MESSAGE='{
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "beep boop 🤖: Cherry-pick of <'$URL'|!'$MR_ID'> failed\ncc '$SLACK_ADMIN'"
}
}
]
}'
curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${MCORE_NOTIFICATION_HOOK}
fi
done
interruptible: false
pre:check_milestone:
extends: [.pre_rules]
image: badouralix/curl-jq
tags:
- arch/amd64
- env/prod
- origin/jet-fleet
- owner/jet-core
- purpose/utility
- team/megatron
script:
- env
- |
MILESTONE=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" | jq '.milestone')
- |
if [[ "$MILESTONE" == "null" ]]; then
LATEST_MILESTONE=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/milestones?state=active&order_by=due_date&sort=desc" | jq '.[0].id')
curl --request PUT --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" --data "milestone_id=${LATEST_MILESTONE}"
echo "Applied latest milestone (ID: ${LATEST_MILESTONE}) to this MR"
fi
pre:check_status_of_main:
extends: [.pre_rules]
image: python:3.10
timeout: 7 days
variables:
KUBERNETES_SERVICE_MEMORY_REQUEST: 32Gi
KUBERNETES_SERVICE_MEMORY_LIMIT: 32Gi
KUBERNETES_SERVICE_CPU_REQUEST: 8
KUBERNETES_SERVICE_CPU_LIMIT: 12
tags:
- arch/amd64
- env/prod
- origin/jet-fleet
- owner/jet-core
- purpose/utility
- team/megatron
script:
- env
- pip install --no-cache-dir python-gitlab click
- export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
- export GITLAB_ENDPOINT
- python tests/test_utils/python_scripts/check_status_of_main.py --target-branch "$CI_MERGE_REQUEST_TARGET_BRANCH_NAME"
rules:
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train' && $CI_MERGE_REQUEST_LABELS =~ /fast-track/
when: never
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train'
when: always
- when: never
.build_rules:
rules:
- when: on_success
stage: test
.build_image:
extends: [.build_rules, .dind_rules]
stage: build
tags:
- arch/amd64
- origin/jet-fleet
- env/prod
- ${TAG}
services:
- name: docker:24.0.5-dind
variables:
HEALTHCHECK_TCP_PORT: "2376"
timeout: 180m
variables:
DOCKER_HOST: tcp://docker:2376
DOCKER_TLS_CERTDIR: "/certs"
DOCKER_TLS_VERIFY: 1
DOCKER_CERT_PATH: "$DOCKER_TLS_CERTDIR/client"
TAG: purpose/builder-large
STAGE: jet
MCORE_BACKWARDS_REF: core_r0.12.0
KUBERNETES_SERVICE_MEMORY_REQUEST: 90Gi
KUBERNETES_SERVICE_MEMORY_LIMIT: 90Gi
# KUBERNETES_SERVICE_CPU_REQUEST: 60
# KUBERNETES_SERVICE_CPU_LIMIT: 60
script:
- eval PUBLISH_COMMIT=$PUBLISH_COMMIT
- apk add bash curl git
- export TE_GIT_REF=$TE_GIT_REF
- bash .gitlab/scripts/build.sh
- git fetch origin $MCORE_BACKWARDS_REF
- MCORE_BACKWARDS_COMMIT=$(git rev-parse FETCH_HEAD)
- echo "MCORE_MR_COMMIT=$CI_COMMIT_SHA" | tee -a build.env
- echo "MCORE_BACKWARDS_COMMIT=$MCORE_BACKWARDS_COMMIT" | tee -a build.env
- cat build.env
retry:
max: 2
artifacts:
reports:
dotenv: build.env
test:build_image:
extends: [.build_image]
parallel:
matrix:
- IMAGE: CI_MCORE_LTS_IMAGE
FILE: Dockerfile.ci.lts
BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3
- IMAGE: CI_MCORE_DEV_IMAGE
FILE: Dockerfile.ci.dev
BASE_IMAGE: nvcr.io/nvidia/pytorch:25.05-py3
- IMAGE: UTILITY_IMAGE
FILE: Dockerfile.linting
BASE_IMAGE: python:3.10
test:build_nemo_image:
extends: [.build_image]
variables:
IMAGE: CI_NEMO_IMAGE
FILE: Dockerfile.ci.nemo
BASE_IMAGE: nvcr.io/nvidian/nemo:nightly
rules:
- if: $FUNCTIONAL_TEST == "yes" || $INTEGRATION_TEST == "yes" || $CI_COMMIT_BRANCH == "ci-rebuild-mcore-nemo-image"
when: on_success
.test_rules:
rules:
- if: $PUBLISH == "yes"
when: never
- when: on_success
stage: test
include:
- template: Security/Secret-Detection.gitlab-ci.yml
wait_for_resources:
extends: [.test_rules]
needs:
- test:linting_formatting
- test:linting_copyright
- job: test:linting_secret_detection
optional: true
- test:build_image
image: python:3.10
timeout: 7 days
variables:
KUBERNETES_SERVICE_MEMORY_REQUEST: 32Gi
KUBERNETES_SERVICE_MEMORY_LIMIT: 32Gi
KUBERNETES_SERVICE_CPU_REQUEST: 8
KUBERNETES_SERVICE_CPU_LIMIT: 12
tags:
- arch/amd64
- env/prod
- origin/jet-fleet
- owner/jet-core
- purpose/utility
- team/megatron
script:
- env
- pip install --no-cache-dir python-gitlab click
- export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
- export GITLAB_ENDPOINT
- export NUM_CONCURRENT_JOBS
- python tests/test_utils/python_scripts/wait_for_resources.py --pipeline-id $CI_PIPELINE_ID
rules:
- if: $CI_MERGE_REQUEST_LABELS =~ /fast-track/
when: never
- if: $CI_PIPELINE_SOURCE == "merge_request_event"
when: on_success
- when: never
# Generates the unit-test child-pipeline definitions (four YAML artifacts:
# {lts,dev} x {legacy,latest}) consumed by the .unit_tests_run trigger jobs.
test:unit_tests_configure:
  extends: [.test_rules]
  needs:
    - test:build_image
    - job: wait_for_resources
      optional: true
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  before_script:
    # Re-vendor the convergence-test recipes as a submodule on every run.
    - git rm -r tests/test_utils/local_recipes || true
    - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/test_utils/local_recipes
    - ls tests/test_utils/local_recipes
  script:
    - env
    - set -x
    # NOTE(review): A100_CLUSTER/H100_CLUSTER are computed here but unused in
    # this job — the generator below hard-codes dgxh100_coreweave/dgx_h100.
    # Confirm whether the hard-coding is intentional.
    - |
      A100_CLUSTER=$([[ "$CLUSTER_A100" != "" ]] && echo $CLUSTER_A100 || echo $DEFAULT_A100_CLUSTER)
      H100_CLUSTER=$([[ "$CLUSTER_H100" != "" ]] && echo $CLUSTER_H100 || echo $DEFAULT_H100_CLUSTER)
    - |
      ARGS=(
        "--scope unit-tests"
        "--n-repeat ${UNIT_TEST_REPEAT}"
        "--time-limit $(( UNIT_TEST_TIMEOUT * 60 ))"
        "--test-cases all"
        "--cluster dgxh100_coreweave"
        "--platform dgx_h100"
        "--partition batch_short,batch"
        "--container-image ${UTILITY_IMAGE}"
        "--container-tag ${CI_PIPELINE_ID}"
        "--dependent-job test:unit_tests_configure"
        "--slurm-account ${CI_SLURM_ACCOUNT}"
        "--no-enable-warmup"
      )
    # One generator invocation per (environment, tag) combination.
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment "lts" \
        --tag "legacy" \
        --output-path "unit-test-job-lts-legacy.yaml"
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment "lts" \
        --tag "latest" \
        --output-path "unit-test-job-lts-latest.yaml"
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment "dev" \
        --tag "legacy" \
        --output-path "unit-test-job-dev-legacy.yaml"
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment "dev" \
        --tag "latest" \
        --output-path "unit-test-job-dev-latest.yaml"
  rules:
    - if: $UNIT_TEST == 'yes' && $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
      allow_failure: true
      when: on_success
    - if: $UNIT_TEST == 'yes' && $UNIT_TEST_REPEAT != '0'
      when: on_success
  artifacts:
    paths:
      - unit-test-job-dev-legacy.yaml
      - unit-test-job-dev-latest.yaml
      - unit-test-job-lts-legacy.yaml
      - unit-test-job-lts-latest.yaml
      - tests/test_utils/local_recipes
# Hidden template: triggers one generated unit-test child pipeline, selected by
# the ENVIRONMENT/TAG variables of the concrete job that extends it.
.unit_tests_run:
  needs:
    - test:linting_formatting
    - test:linting_copyright
    - job: test:linting_secret_detection
      optional: true
    - test:unit_tests_configure
    - test:build_image
  extends: [.test_rules]
  trigger:
    include:
      - artifact: unit-test-job-$ENVIRONMENT-$TAG.yaml
        job: test:unit_tests_configure
    # Parent pipeline mirrors the child pipeline's status.
    strategy: depend
  variables:
    RO_API_TOKEN: $PAT
    CONTAINER_TAG: $CI_PIPELINE_ID
    CI_MCORE_LTS_IMAGE: $CI_MCORE_LTS_IMAGE
    GITLAB_ENDPOINT: $GITLAB_ENDPOINT
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    MCORE_MR_COMMIT: $MCORE_MR_COMMIT
    MCORE_BACKWARDS_COMMIT: $MCORE_BACKWARDS_COMMIT
  inherit:
    variables: true
  rules:
    - if: $UNIT_TEST == 'yes' && $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
      allow_failure: true
      when: on_success
    - if: $UNIT_TEST == 'yes' && $UNIT_TEST_REPEAT != '0'
      when: on_success
# Concrete unit-test trigger jobs: one per (environment, tag) combination.
test:unit_tests_pyt(DEV)_mcore(latest):
  extends: [.unit_tests_run]
  variables:
    ENVIRONMENT: dev
    TAG: latest
test:unit_tests_pyt(LTS)_mcore(latest):
  extends: [.unit_tests_run]
  variables:
    ENVIRONMENT: lts
    TAG: latest
# Posts a Slack notification with the outcome of the extended unit-test runs.
test:unit_tests_notify:
  extends: [.test_rules]
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  needs:
    - test:unit_tests_pyt(DEV)_mcore(latest)
    - test:unit_tests_pyt(LTS)_mcore(latest)
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  script:
    - env
    - export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK}
    - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
    - export GITLAB_ENDPOINT
    # Fix: the fallback branch previously was `|| "0"`, which executes the
    # string "0" as a command; it must echo the value so TAG_TEAM becomes "0"
    # off the main branch.
    - export TAG_TEAM=$([[ "$CI_COMMIT_BRANCH" == "main" ]] && echo "1" || echo "0")
    - export TEAM_SLUG=$SLACK_ADMIN
    - |
      python tests/test_utils/python_scripts/notify.py \
        --pipeline-id "${CI_PIPELINE_ID}" \
        --check-for unit-tests \
        --pipeline-context "unit-tests-extended" \
        --pipeline-created-at "${CI_PIPELINE_CREATED_AT}"
  artifacts:
    when: always
    paths:
      - scripts
  rules:
    # Only on the scheduled extended-unit-test branch.
    - if: $CI_PIPELINE_SOURCE == "schedule" && $CI_COMMIT_BRANCH == "ci-unit-test-extended"
      when: always
    - when: never
# Builds the docs inside the external documentation repo to catch doc breakage.
test:linting_docs_build:
  extends: [.test_rules]
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  needs: [test:build_image]
  script:
    - cd ..
    - rm -rf documentation && git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/nemo-megatron-core-tme/documentation.git
    # Embed this checkout into the layout the documentation repo expects.
    - mv megatron-lm/ documentation/
    - cd documentation/
    - ./repo docs
# Auto-formats MR branches (pushing a fix-up commit for same-project MRs) and
# then re-runs the formatter in check-only mode to gate the MR.
test:linting_formatting:
  extends: [.test_rules]
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  needs: [test:build_image]
  variables:
    GIT_STRATEGY: "clone"
  script:
    # Only meaningful for merge requests; succeed trivially otherwise.
    - |
      if [[ "$CI_PIPELINE_SOURCE" != "merge_request_event" ]]; then
        exit 0
      fi
    - set +e
    - git fetch origin main:main
    # Same-project MRs only: the bot cannot push to fork branches.
    - |
      if [[ "$CI_MERGE_REQUEST_PROJECT_PATH" == "$CI_MERGE_REQUEST_SOURCE_PROJECT_PATH" ]]; then
        bash tools/autoformat.sh
        set -e
        git fetch origin $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME
        git checkout $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME
        git config --global user.email "mcore-bot@nvidia.com"
        git config --global user.name "Mcore Bot"
        git remote set-url origin "https://gitlab-ci-token:${PAT}@${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git"
        git add -A .
        git commit -m "chore: Format files" || true
        git push -u origin $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME
      fi
    - env
    - BASE_REF="$CI_MERGE_REQUEST_TARGET_BRANCH_NAME" CHECK_ONLY=true SKIP_DOCS=$([[ "$CI_MERGE_REQUEST_LABELS" == *"Skip docs"* ]] && echo "true" || echo "false") bash tools/autoformat.sh
# Verifies that files carry the required copyright header.
test:linting_copyright:
  extends: [.test_rules]
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  needs: [test:build_image]
  script:
    - git fetch origin main
    - bash tools/copyright.sh
# Override from template
# Disable the stock secret_detection job; a customized variant is defined below.
secret_detection:
  rules:
    - when: never
# Inherit and modify template
# Customized secret detection: scans the MR's commit range and fails the job
# when the analyzer report contains any vulnerability.
test:linting_secret_detection:
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  extends: [".secret-analyzer"]
  needs: [test:build_image]
  variables:
    # Full clone depth so the diff range below is always resolvable.
    GIT_DEPTH: 0
    SECRET_DETECTION_LOG_OPTIONS: ${CI_MERGE_REQUEST_DIFF_BASE_SHA}..${CI_COMMIT_SHA}
  # Fix: allow_failure was indented under `variables:` and therefore only
  # defined an environment variable; as a job-level key it actually makes
  # secret-detection failures block the pipeline.
  allow_failure: false
  rules:
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
    - when: never
  script:
    - apk add jq
    - /analyzer run
    # Fail explicitly when the analyzer found at least one secret.
    - |
      if [[ $(cat gl-secret-detection-report.json | jq '.vulnerabilities | length > 0') == true ]]; then
        echo "At least one vulnerability has been found"
        cat gl-secret-detection-report.json | jq '.'
        exit 1
      fi
# Combines coverage data from both unit-test child pipelines into one report.
test:unit_tests_x_coverage_report:
  extends: [.test_rules]
  needs:
    - job: test:unit_tests_pyt(DEV)_mcore(latest)
    - job: test:unit_tests_pyt(LTS)_mcore(latest)
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  script:
    - env
    - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
    - export GITLAB_ENDPOINT
    - python tests/test_utils/python_scripts/download_coverage_results.py --pipeline-id ${CI_PIPELINE_ID}
    - coverage combine --keep $(ls coverage_results/*/coverage_report)
    - coverage report
    - coverage xml
  # Regex GitLab uses to extract the total coverage percentage from the log.
  coverage: "/TOTAL.+ ([0-9]{1,3}%)/"
  artifacts:
    reports:
      coverage_report:
        coverage_format: cobertura
        path: coverage.xml
  rules:
    - if: $UNIT_TEST == 'yes' && $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
      allow_failure: true
      when: on_success
    - if: $UNIT_TEST == 'yes' && $UNIT_TEST_REPEAT != '0'
      when: on_success
# Installs the package and verifies `megatron.core` imports cleanly.
test:safe_imports:
  extends: [.test_rules]
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/builder-large
    - team/megatron
  services:
    - name: docker:24.0.5-dind
      variables:
        HEALTHCHECK_TCP_PORT: "2376"
  variables:
    KUBERNETES_SERVICE_MEMORY_REQUEST: 32Gi
    KUBERNETES_SERVICE_MEMORY_LIMIT: 32Gi
    KUBERNETES_SERVICE_CPU_REQUEST: 8
    KUBERNETES_SERVICE_CPU_LIMIT: 12
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  needs: [test:build_image]
  script:
    - env
    - pip install -e .
    - python .gitlab/scripts/check_imports.py --package-name megatron.core
  rules:
    # Skip on MRs that target a non-main branch.
    - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME != 'main'
      when: never
    - if: $UNIT_TEST == 'yes' && $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
      allow_failure: true
      when: on_success
    - if: $UNIT_TEST == 'yes' && $UNIT_TEST_REPEAT != '0'
      when: on_success
  retry:
    max: 2
# Gate for the integration_tests stage.
.integration_tests_rules:
  stage: integration_tests
  rules:
    - if: $INTEGRATION_TEST == "yes"
      when: on_success
    - when: never
# NOTE(review): `default` and `include` are declared again further down in this
# file; duplicate top-level YAML keys are last-wins in most parsers. The copies
# are identical today, but one copy should probably be removed.
default:
  id_tokens:
    VAULT_JWT_TOKEN:
      aud: https://stg.vault.nvidia.com
include:
  - project: dl/jet/gitlab-templates
    ref: main
    file: downstreams.yml
# Generates the integration-test child pipelines ({dev,lts} x {A100,H100}).
integration:configure:
  needs:
    - test:build_image
    - job: test:unit_tests_pyt(DEV)_mcore(latest)
      optional: true
    - job: test:unit_tests_pyt(LTS)_mcore(latest)
      optional: true
    - job: test:build_nemo_image
  extends: [.integration_tests_rules]
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  before_script:
    # Re-vendor the convergence-test recipes as a submodule on every run.
    - git rm -r tests/test_utils/local_recipes || true
    - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/test_utils/local_recipes
    - ls tests/test_utils/local_recipes
  script:
    - set -x
    # Fall back to the default clusters unless explicit overrides are set.
    - |
      A100_CLUSTER=$([[ "$CLUSTER_A100" != "" ]] && echo $CLUSTER_A100 || echo $DEFAULT_A100_CLUSTER)
      H100_CLUSTER=$([[ "$CLUSTER_H100" != "" ]] && echo $CLUSTER_H100 || echo $DEFAULT_H100_CLUSTER)
    - |
      ARGS=(
        "--scope $INTEGRATION_TEST_SCOPE"
        "--n-repeat 1"
        "--time-limit $INTEGRATION_TEST_TIME_LIMIT"
        "--test-cases $INTEGRATION_TEST_CASES"
        "--container-image ${UTILITY_IMAGE}"
        "--container-tag ${CI_PIPELINE_ID}"
        "--slurm-account ${CI_SLURM_ACCOUNT}"
        "--no-enable-warmup"
        "--dependent-job integration:configure"
        "--enable-lightweight-mode"
      )
    # One generator invocation per (environment, platform) combination.
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment dev \
        --platform dgx_a100 \
        --cluster $A100_CLUSTER \
        --output-path "functional-test-job-dev-A100.yaml"
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment dev \
        --platform dgx_h100 \
        --cluster $H100_CLUSTER \
        --output-path "functional-test-job-dev-H100.yaml"
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment lts \
        --platform dgx_a100 \
        --cluster $A100_CLUSTER \
        --output-path "functional-test-job-lts-A100.yaml"
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment lts \
        --platform dgx_h100 \
        --cluster $H100_CLUSTER \
        --output-path "functional-test-job-lts-H100.yaml"
  artifacts:
    paths:
      - functional-test-job-lts-A100.yaml
      - functional-test-job-lts-H100.yaml
      - functional-test-job-dev-H100.yaml
      - functional-test-job-dev-A100.yaml
      - tests/test_utils/local_recipes
# Hidden template: triggers one generated integration-test child pipeline,
# selected by the ENVIRONMENT/CLUSTER variables of the extending job.
.integration_run:
  needs:
    - integration:configure
    - test:build_image
    - wait_for_resources
  extends: [.integration_tests_rules]
  trigger:
    include:
      - artifact: functional-test-job-$ENVIRONMENT-$CLUSTER.yaml
        job: integration:configure
    # Parent pipeline mirrors the child pipeline's status.
    strategy: depend
  variables:
    RO_API_TOKEN: $PAT
    CONTAINER_TAG: $CI_PIPELINE_ID
    CI_MCORE_LTS_IMAGE: $CI_MCORE_LTS_IMAGE
    GITLAB_ENDPOINT: $GITLAB_ENDPOINT
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    DASHBOARD_ENDPOINT: $DASHBOARD_ENDPOINT
    MCORE_MR_COMMIT: $MCORE_MR_COMMIT
    MCORE_BACKWARDS_COMMIT: $MCORE_BACKWARDS_COMMIT
  inherit:
    variables: true
# Concrete integration trigger jobs: one per (environment, cluster) combination.
integration:run_lts_dgx_a100:
  extends: [.integration_run]
  variables:
    ENVIRONMENT: lts
    CLUSTER: A100
integration:run_lts_dgx_h100:
  extends: [.integration_run]
  variables:
    ENVIRONMENT: lts
    CLUSTER: H100
integration:run_dev_dgx_a100:
  extends: [.integration_run]
  variables:
    ENVIRONMENT: dev
    CLUSTER: A100
integration:run_dev_dgx_h100:
  extends: [.integration_run]
  variables:
    ENVIRONMENT: dev
    CLUSTER: H100
# Gate for the functional_tests stage.
.functional_tests_rules:
  stage: functional_tests
  rules:
    - if: $FUNCTIONAL_TEST == "yes"
      when: on_success
    - when: never
# NOTE(review): duplicate of the `default`/`include` declared earlier in this
# file; duplicate top-level YAML keys are last-wins in most parsers. The copies
# are identical, so behavior is unchanged, but one copy should be removed.
default:
  id_tokens:
    VAULT_JWT_TOKEN:
      aud: https://stg.vault.nvidia.com
include:
  - project: dl/jet/gitlab-templates
    ref: main
    file: downstreams.yml
# Generates the functional-test child pipelines ({dev,lts} x {A100,H100}),
# with optional release-mode run naming and W&B experiment wiring.
functional:configure:
  needs:
    - test:build_image
    - test:build_nemo_image
    - job: test:unit_tests_pyt(DEV)_mcore(latest)
      optional: true
    - job: test:unit_tests_pyt(LTS)_mcore(latest)
      optional: true
    - job: integration:run_lts_dgx_a100
      optional: true
    - job: integration:run_dev_dgx_a100
      optional: true
    - job: integration:run_lts_dgx_h100
      optional: true
    - job: integration:run_dev_dgx_h100
      optional: true
  extends: [.functional_tests_rules]
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  before_script:
    # Re-vendor the convergence-test recipes as a submodule on every run.
    - git rm -r tests/test_utils/local_recipes || true
    - git submodule add --force https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/ADLR/megatron-lm-convergence-tests.git tests/test_utils/local_recipes
    - ls tests/test_utils/local_recipes
  script:
    - set -x
    # Fall back to the default clusters unless explicit overrides are set.
    - |
      A100_CLUSTER=$([[ "$CLUSTER_A100" != "" ]] && echo $CLUSTER_A100 || echo $DEFAULT_A100_CLUSTER)
      H100_CLUSTER=$([[ "$CLUSTER_H100" != "" ]] && echo $CLUSTER_H100 || echo $DEFAULT_H100_CLUSTER)
    # Checkpoints are recorded when requested via MR label or pipeline variable.
    - |
      RECORD_CHECKPOINTS=$([[ "$CI_MERGE_REQUEST_LABELS" == *"Record checkpoints"* || "$FUNCTIONAL_TEST_RECORD_CHECKPOINTS" == "yes" ]] && echo "true" || echo "false")
    # Release and pre-release scopes additionally name the run and its W&B experiment.
    - |
      if [[ "$FUNCTIONAL_TEST_SCOPE" == "release" || "$FUNCTIONAL_TEST_SCOPE" == "pre-release" ]]; then
        FUNCTIONAL_TEST_NAME=$(eval echo $FUNCTIONAL_TEST_NAME)
        RELEASE_ARGS=(
          "--run-name"
          $FUNCTIONAL_TEST_NAME
          "--wandb-experiment"
          $(echo $FUNCTIONAL_TEST_NAME | tr '/' '-')
        )
      else
        RELEASE_ARGS=()
      fi
    - |
      ARGS=(
        "--scope $FUNCTIONAL_TEST_SCOPE"
        "--n-repeat $FUNCTIONAL_TEST_REPEAT"
        "--time-limit $FUNCTIONAL_TEST_TIME_LIMIT"
        "--test-cases $FUNCTIONAL_TEST_CASES"
        "--container-image ${UTILITY_IMAGE}"
        "--container-tag ${CI_PIPELINE_ID}"
        "--dependent-job functional:configure"
        "--record-checkpoints ${RECORD_CHECKPOINTS}"
        "--slurm-account ${CI_SLURM_ACCOUNT}"
        "--no-enable-warmup"
      )
    # One generator invocation per (environment, platform) combination.
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment dev \
        --platform dgx_a100 \
        --cluster $A100_CLUSTER \
        --output-path "functional-test-job-dev-A100.yaml" \
        ${RELEASE_ARGS[@]}
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment dev \
        --platform dgx_h100 \
        --cluster $H100_CLUSTER \
        --output-path "functional-test-job-dev-H100.yaml" \
        ${RELEASE_ARGS[@]}
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment lts \
        --platform dgx_a100 \
        --cluster $A100_CLUSTER \
        --output-path "functional-test-job-lts-A100.yaml" \
        ${RELEASE_ARGS[@]}
    - |
      export PYTHONPATH=$(pwd)
      python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
        ${ARGS[@]} \
        --environment lts \
        --platform dgx_h100 \
        --cluster $H100_CLUSTER \
        --output-path "functional-test-job-lts-H100.yaml" \
        ${RELEASE_ARGS[@]}
  artifacts:
    paths:
      - functional-test-job-lts-A100.yaml
      - functional-test-job-lts-H100.yaml
      - functional-test-job-dev-A100.yaml
      - functional-test-job-dev-H100.yaml
      - tests/test_utils/local_recipes
# Hidden template: triggers one generated functional-test child pipeline,
# selected by the ENVIRONMENT/CLUSTER variables of the extending job.
.functional_run:
  needs:
    - functional:configure
    - test:build_image
  extends: [.functional_tests_rules]
  trigger:
    include:
      - artifact: functional-test-job-$ENVIRONMENT-$CLUSTER.yaml
        job: functional:configure
    # Parent pipeline mirrors the child pipeline's status.
    strategy: depend
  variables:
    RO_API_TOKEN: $PAT
    CONTAINER_TAG: $CI_PIPELINE_ID
    CI_MCORE_LTS_IMAGE: $CI_MCORE_LTS_IMAGE
    GITLAB_ENDPOINT: $GITLAB_ENDPOINT
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    DASHBOARD_ENDPOINT: $DASHBOARD_ENDPOINT
    MCORE_MR_COMMIT: $MCORE_MR_COMMIT
    MCORE_BACKWARDS_COMMIT: $MCORE_BACKWARDS_COMMIT
    CLUSTER: $CLUSTER
  inherit:
    variables: true
# Concrete functional trigger jobs: one per (environment, cluster) combination.
functional:run_lts_dgx_a100:
  extends: [.functional_run]
  variables:
    ENVIRONMENT: lts
    CLUSTER: A100
functional:run_lts_dgx_h100:
  extends: [.functional_run]
  variables:
    ENVIRONMENT: lts
    CLUSTER: H100
functional:run_dev_dgx_a100:
  extends: [.functional_run]
  variables:
    ENVIRONMENT: dev
    CLUSTER: A100
functional:run_dev_dgx_h100:
  extends: [.functional_run]
  variables:
    ENVIRONMENT: dev
    CLUSTER: H100
# Manually-triggered downstream NeMo CI run against this commit.
functional:run_nemo:
  extends: [.functional_tests_rules]
  trigger:
    project: "dl/joc/nemo-ci"
    branch: main-mirror
    strategy: depend
  inherit:
    variables: true
  variables:
    MCORE_COMMIT: $CI_COMMIT_SHA
    TEST_NEMO2_MODULE: "True"
    ALLOW_FAILURE_DEPENDENCY: "True"
    TESTS_TO_RUN_ON_THIS_COMMIT: nightly
  rules:
    - if: $FUNCTIONAL_TEST == "yes"
      when: manual
      allow_failure: true
    - when: never
# Posts a Slack notification with the outcome of the functional-test pipelines.
functional:x_notify:
  extends: [.functional_tests_rules]
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  needs:
    - functional:run_lts_dgx_a100
    - functional:run_dev_dgx_a100
    - functional:run_lts_dgx_h100
    - functional:run_dev_dgx_h100
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  variables:
    WEBHOOK_URL: ${MCORE_NOTIFICATION_HOOK}
    RO_API_TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}
    CONTEXT: $FUNCTIONAL_TEST_SCOPE
  script:
    - env
    - export WEBHOOK_URL=${MCORE_NOTIFICATION_HOOK}
    - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
    - export GITLAB_ENDPOINT
    - export CONTEXT=$FUNCTIONAL_TEST_SCOPE
    # Fix: the fallback branch previously was `|| "0"`, which executes the
    # string "0" as a command; it must echo the value so TAG_TEAM becomes "0"
    # off the main branch.
    - export TAG_TEAM=$([[ "$CI_COMMIT_BRANCH" == "main" ]] && echo "1" || echo "0")
    - export TEAM_SLUG=$SLACK_ADMIN
    - |
      python tests/test_utils/python_scripts/notify.py \
        --pipeline-id "${CI_PIPELINE_ID}" \
        --check-for functional-tests \
        --pipeline-context $CONTEXT \
        --pipeline-created-at "${CI_PIPELINE_CREATED_AT}"
  artifacts:
    when: always
    paths:
      - scripts
  rules:
    - if: ($CI_PIPELINE_SOURCE == "schedule" || $CI_COMMIT_BRANCH == "main") && $FUNCTIONAL_TEST == "yes"
      when: always
    - when: never
# Manual job to download golden values produced by this pipeline's tests.
functional:x_download_golden_values:
  extends: [.functional_tests_rules]
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  script:
    - env
    - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
    - export GITLAB_ENDPOINT
    - python tests/test_utils/python_scripts/download_golden_values.py --pipeline-id ${CI_PIPELINE_ID}
  artifacts:
    paths:
      - tests/
  rules:
    - if: $FUNCTIONAL_TEST == "yes"
      when: manual
      allow_failure: true
    - when: never
# Manual gate for code-freeze publishing (default branch only).
.publish_common_freeze:
  stage: publish
  rules:
    - if: ($CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH) && $PUBLISH == "yes" && $PUBLISH_SCOPE == "code-freeze"
      when: manual
    - when: never
# Manual gate for release publishing.
.publish_common_release:
  stage: publish
  rules:
    # NOTE(review): this first rule is subsumed by the second one (which
    # matches regardless of branch); confirm whether a branch restriction
    # was intended here.
    - if: $CI_COMMIT_BRANCH =~ /^core_r/ && $PUBLISH == "yes" && $PUBLISH_SCOPE == "release"
      when: manual
    - if: $PUBLISH == "yes" && $PUBLISH_SCOPE == "release"
      when: manual
    - when: never
# Builds manylinux wheels (cp310/cp311 for arm64 and amd64); dry-run by default.
publish:test_release_pypi_build_wheel:
  extends: [.test_rules]
  stage: publish
  image:
    name: ${IMAGE}
    entrypoint: [""]
  services:
    - name: docker:24.0.5-dind
      variables:
        HEALTHCHECK_TCP_PORT: "2376"
  needs: [test:build_image]
  parallel:
    matrix:
      - PLATFORM: arm64
        IMAGE: quay.io/pypa/manylinux_2_28_aarch64
      - PLATFORM: amd64
        IMAGE: quay.io/pypa/manylinux_2_28_x86_64
  tags:
    - arch/${PLATFORM}
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/builder-small
    - team/megatron
  variables:
    PY_ENV: pytorch_25.03
    KUBERNETES_SERVICE_MEMORY_REQUEST: 16Gi
    KUBERNETES_SERVICE_MEMORY_LIMIT: 16Gi
    PUBLISH_DRYRUN: "yes"
    KUBERNETES_SERVICE_CPU_REQUEST: 4
    KUBERNETES_SERVICE_CPU_LIMIT: 8
  before_script:
    - env
    - eval PUBLISH_COMMIT=$PUBLISH_COMMIT
    - env
    - git fetch origin $PUBLISH_COMMIT
    - git checkout $PUBLISH_COMMIT
  script:
    - echo $PUBLISH_DRYRUN
    # Dry runs get a random .devNNNNNN pre-release suffix to avoid collisions
    # on the test index.
    - |
      if [ "$PUBLISH_DRYRUN" = "yes" ]; then
        PRE_RELEASE=$(sed -n "s/.*PRE_RELEASE = '\(.*\)'/\1/p" megatron/core/package_info.py)
        sed -i "/^PRE_RELEASE/c\PRE_RELEASE = '${PRE_RELEASE}.dev$((RANDOM % 900000 + 100000))'" megatron/core/package_info.py
      fi
    - /opt/python/cp310-cp310/bin/python -m build
    - /opt/python/cp311-cp311/bin/python -m build
    # Repaired (manylinux-tagged) wheels land in wheelhouse/; drop the raw ones.
    - auditwheel repair dist/*.whl
    - rm -rf dist/*.whl
    - pushd megatron/core
    - EXPECTED_RELEASE_NUMBER=$(/opt/python/cp311-cp311/bin/python -c "import package_info; print(package_info.__version__)")
    - popd
    # Exported via dotenv so downstream jobs can verify the installed version.
    - echo "EXPECTED_RELEASE_NUMBER_$PLATFORM=$EXPECTED_RELEASE_NUMBER" | tee -a build.env
  artifacts:
    paths:
      - megatron/core/package_info.py
      - wheelhouse/
      - dist/
    reports:
      dotenv: build.env
  retry:
    max: 2
# Installs the built wheels and checks the installed version matches the build.
publish:test_release_pypi_test_wheel:
  extends: [.test_rules]
  stage: publish
  image:
    name: python:3.11
    entrypoint: [""]
  needs:
    - job: publish:test_release_pypi_build_wheel
      optional: true
  parallel:
    matrix:
      - PLATFORM: arm64
      - PLATFORM: amd64
  services:
    - name: docker:24.0.5-dind
      variables:
        HEALTHCHECK_TCP_PORT: "2376"
  tags:
    - arch/${PLATFORM}
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/builder-small
    - team/megatron
  variables:
    KUBERNETES_SERVICE_MEMORY_REQUEST: 16Gi
    KUBERNETES_SERVICE_MEMORY_LIMIT: 16Gi
    KUBERNETES_SERVICE_CPU_REQUEST: 4
    KUBERNETES_SERVICE_CPU_LIMIT: 8
    GIT_STRATEGY: none
    PUBLISH_DRYRUN: "yes"
  script:
    # Remove the source tree so the import resolves to the installed wheel.
    - rm -rf megatron
    - pip install -U --no-cache-dir pip
    - |
      if [[ "$PLATFORM" == "arm64" ]]; then
        for file in wheelhouse/*cp311*aarch64.whl; do
          pip install --no-cache-dir "$file[dev,mlm]"
        done
      else
        for file in wheelhouse/*cp311*x86_64.whl; do
          pip install --no-cache-dir "$file[dev,mlm]"
        done
      fi
    - RELEASE_NUMBER=$(python -c "from megatron import core; print(core.__version__)")
    # Compare against the version exported by the build job's dotenv report.
    - |
      if [[ "$PLATFORM" == "arm64" ]]; then
        test "$EXPECTED_RELEASE_NUMBER_arm64" == "$RELEASE_NUMBER"
      else
        test "$EXPECTED_RELEASE_NUMBER_amd64" == "$RELEASE_NUMBER"
      fi
    - echo "RELEASE_NUMBER=$RELEASE_NUMBER" | tee -a build.env
  artifacts:
    reports:
      dotenv: build.env
    paths:
      - wheelhouse/
      - dist/
  retry:
    max: 2
# Uploads wheels and sdists via twine (TestPyPI on dry runs, PyPI otherwise).
publish:test_release_pypi_push_wheel:
  extends: [.test_rules]
  image: python:3.11
  stage: publish
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  needs:
    - job: publish:test_release_pypi_test_wheel
      optional: true
  variables:
    GIT_STRATEGY: none
    PUBLISH_DRYRUN: "yes"
  timeout: 3m
  script:
    - echo $PUBLISH_DRYRUN
    # Fix: the password variable was misspelled TWINE_PASSWORT throughout;
    # renamed to TWINE_PASSWORD so twine's standard environment variable is
    # honored as well as the explicit -p flag below.
    - |
      if [ "$PUBLISH_DRYRUN" = "yes" ]; then
        REPOSITORY=testpypi
        export TWINE_USERNAME=$TWINE_TEST_USERNAME
        export TWINE_PASSWORD=$TWINE_TEST_PASSWORD
      else
        REPOSITORY=pypi
        export TWINE_USERNAME=$TWINE_PROD_USERNAME
        export TWINE_PASSWORD=$TWINE_PROD_PASSWORD
      fi
    - ls -al dist/
    - ls -al wheelhouse/
    - pip install twine
    # Dry runs stop short of the actual upload.
    - |
      if [[ "$PUBLISH_DRYRUN" != "yes" ]]; then
        twine upload --verbose -u $TWINE_USERNAME -p $TWINE_PASSWORD --repository $REPOSITORY wheelhouse/* dist/*
      fi
# Creates (or, on dry runs, just prints) the GitHub release for this version.
publish:test_release_github:
  extends: [.test_rules]
  needs: [publish:test_release_pypi_test_wheel]
  stage: publish
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  image: nentangso/alpine-git-curl-jq
  before_script:
    - eval PUBLISH_COMMIT=$PUBLISH_COMMIT
    - git fetch origin $PUBLISH_COMMIT
    - git checkout $PUBLISH_COMMIT
  variables:
    PUBLISH_DRYRUN: "yes"
  script:
    - echo $PUBLISH_DRYRUN
    - NAME="NVIDIA Megatron Core $RELEASE_NUMBER"
    - IS_PRERELEASE=$([[ "$RELEASE_NUMBER" == *rc* ]] && echo "true" || echo "false")
    # Release notes: synthesized for prereleases, extracted from CHANGELOG.md
    # (the section headed by $NAME) otherwise.
    - |
      if [[ "$IS_PRERELEASE" == "true" ]]; then
        DATE=$(date +"%Y-%m-%d")
        CHANGELOG="Prerelease: $NAME ($DATE)"
      else
        CHANGELOG=$(awk '/^## '"$NAME"'/{flag=1; next} /^## /{flag=0} flag' CHANGELOG.md)
        CHANGELOG=$(echo "$CHANGELOG" | sed '/./!d')
      fi
    # Build the GitHub release payload with jq so all values are JSON-escaped.
    - |
      PAYLOAD=$(jq -nc \
        --arg TAG_NAME "core_v${RELEASE_NUMBER}" \
        --arg CI_COMMIT_SHA "$PUBLISH_COMMIT" \
        --arg NAME "$NAME" \
        --arg BODY "$CHANGELOG" \
        --argjson PRERELEASE "$IS_PRERELEASE" \
        '{
          "tag_name": $TAG_NAME,
          "target_commitish": $CI_COMMIT_SHA,
          "name": $NAME,
          "body": $BODY,
          "draft": false,
          "prerelease": $PRERELEASE,
          "generate_release_notes": false
        }'
      )
      echo -E "$PAYLOAD" | tee -a payload.txt
    - cat payload.txt
    # The curl command is kept as a string so it can be echoed on dry runs
    # and eval'd on real runs.
    - |
      CMD=$(echo -E 'curl -L \
        -X POST \
        -H "Accept: application/vnd.github+json" \
        -H "Authorization: Bearer '"$GH_TOKEN"'" \
        -H "X-GitHub-Api-Version: 2022-11-28" \
        https://api.github.com/repos/NVIDIA/Megatron-LM/releases \
        -d @payload.txt
      ')
    - |
      if [[ "$PUBLISH_DRYRUN" == "yes" ]]; then
        echo -E "$CMD"
      else
        eval "$CMD"
      fi
# Announces the release on Slack (or prints the curl command on dry runs).
publish:test_release_notify:
  needs: [publish:test_release_pypi_test_wheel, publish:test_release_pypi_push_wheel, publish:test_release_github]
  extends: [.test_rules]
  image: badouralix/curl-jq
  stage: publish
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  variables:
    PUBLISH_DRYRUN: "yes"
  script:
    - echo $PUBLISH_DRYRUN
    - URL="https://github.com/NVIDIA/Megatron-LM/releases/tag/core_v$RELEASE_NUMBER"
    # Slack Block Kit message linking to the GitHub release.
    - |
      cat << EOF > message.json
      {
        "blocks": [
          {
            "type": "section",
            "text": {
              "type": "mrkdwn",
              "text": "Releasebot 🤖: Megatron-Core released <${URL}|core_v${RELEASE_NUMBER}> 🚀"
            }
          }
        ]
      }
      EOF
    - cat message.json
    - |
      CMD=$(echo curl \
        -X POST \
        -H "Content-type: application/json" \
        -d @message.json ${MCORE_NOTIFICATION_HOOK_MAIN}
      )
      if [[ "$PUBLISH_DRYRUN" == "yes" ]]; then
        echo "$CMD"
      else
        eval "$CMD"
      fi
# Opens a version-bump MR after a release (or prints the commands on dry runs).
publish:test_release_version_bump:
  needs: [publish:test_release_pypi_test_wheel, publish:test_release_pypi_push_wheel, publish:test_release_github]
  extends: [.test_rules]
  image: nentangso/alpine-git-curl-jq
  stage: publish
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  before_script:
    - eval PUBLISH_COMMIT=$PUBLISH_COMMIT
    - eval PUBLISH_VERSION_BUMP_BRANCH=$PUBLISH_VERSION_BUMP_BRANCH
    - git fetch origin $PUBLISH_COMMIT
    - git checkout $PUBLISH_COMMIT
  variables:
    PUBLISH_DRYRUN: "yes"
  script:
    - env
    - echo $PUBLISH_DRYRUN
    # Parse the current version components out of package_info.py.
    - MAJOR=$(cat megatron/core/package_info.py | awk '/^MAJOR = /' | awk -F"= " '{print $2}')
    - MINOR=$(cat megatron/core/package_info.py | awk '/^MINOR = /' | awk -F"= " '{print $2}')
    - PATCH=$(cat megatron/core/package_info.py | awk '/^PATCH = /' | awk -F"= " '{print $2}')
    - PRERELEASE=$(cat megatron/core/package_info.py | awk '/^PRE_RELEASE = /' | awk -F"= " '{print $2}' | tr -d '"' | tr -d "'")
    # rc releases bump the rc counter; final releases bump PATCH and clear the
    # pre-release tag.
    - |
      if [[ "$PRERELEASE" != "" ]]; then
        NEXT_PATCH=$PATCH
        NEXT_PRERELEASE=rc$((${PRERELEASE#rc} + 1))
      else
        NEXT_PATCH=$((${PATCH} + 1))
        # Fix: was a no-op self-assignment (NEXT_PRERELEASE=$NEXT_PRERELEASE),
        # which could leak an externally-set value into package_info.py.
        NEXT_PRERELEASE=""
      fi
    - sed -i "/^PATCH/c\PATCH = $NEXT_PATCH" megatron/core/package_info.py
    - sed -i "/^PRE_RELEASE/c\PRE_RELEASE = '$NEXT_PRERELEASE'" megatron/core/package_info.py
    - git config --global user.email "mcore-bot@nvidia.com"
    - git config --global user.name "Mcore Bot"
    - git remote set-url origin "https://gitlab-ci-token:${PAT}@${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git"
    # The command block is a quoted heredoc so it stays literal: it is echoed
    # on dry runs and eval'd (expanding the variables) on real runs.
    # Fix: the EOF terminator must sit at the block scalar's base indent —
    # after YAML strips the common indent it becomes flush-left, which a
    # plain (non-<<-) heredoc requires to terminate. Also fixed the commit
    # message, which previously read "chore: adjust version version".
    - |
      CMD=$(
        cat <<'EOF'
        git switch --force-create bot/chore/bump-version && \
        git add megatron/core/package_info.py && \
        git commit -m "chore: adjust version" && \
        git push -f -u origin bot/chore/bump-version && \
        curl \
          --header "PRIVATE-TOKEN: $PAT" \
          --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests" \
          -d "source_branch=bot/chore/bump-version" \
          -d "target_branch=$PUBLISH_VERSION_BUMP_BRANCH" \
          -d "title=chore: Fix version of \`$PUBLISH_VERSION_BUMP_BRANCH\`" \
          -d "description=[🤖]: Hi @okoenig 👋,<br><br>we've adjusted the version number of \`$PUBLISH_VERSION_BUMP_BRANCH\` for you! 🚀<br><br>Please review and approve this cherry pick by your convenience\!"
      EOF
      )
    - |
      if [[ "$PUBLISH_DRYRUN" == "yes" ]]; then
        echo "$CMD"
      else
        eval "$CMD"
      fi
# Code freeze: cuts the core_r$VERSION release branch from the default branch,
# announces it on Slack, and opens a version-bump MR against the new branch.
publish:code_freeze:
  extends: [.publish_common_freeze]
  image: ${CI_MCORE_LTS_IMAGE}:${CI_PIPELINE_ID}
  needs: [test:build_image]
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  script:
    - git fetch origin $CI_DEFAULT_BRANCH
    - git config --global user.email "mcore-bot@nvidia.com"
    - git config --global user.name "Mcore Bot"
    - git remote set-url origin "https://gitlab-ci-token:${PAT}@${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git"
    # Clear the pre-release tag so the frozen branch carries the final version.
    - sed -i "/^PRE_RELEASE/c\PRE_RELEASE = ''" megatron/core/package_info.py
    - VERSION=$(python -c "from megatron import core; print(core.__version__)")
    - RELEASE_BRANCH=core_r$VERSION
    - git switch --force-create $RELEASE_BRANCH origin/$CI_DEFAULT_BRANCH
    - git push -u origin $RELEASE_BRANCH
    - |
      MESSAGE='{
        "blocks": [
          {
            "type": "section",
            "text": {
              "type": "mrkdwn",
              "text": "Releasebot 🤖: Megatron Core has been frozen 🎉 to branch `'"$RELEASE_BRANCH"'`"
            }
          }
        ]
      }'
    - |
      curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${MCORE_NOTIFICATION_HOOK_MAIN}
    - git switch main
    - git switch --force-create bot/chore/bump-version
    - git add megatron/core/package_info.py
    # Fix: commit message previously read "chore: adjust version version".
    - |
      git commit -m "chore: adjust version"
    - git push -u origin bot/chore/bump-version
    # Open the version-bump MR against the freshly cut release branch.
    - |
      curl \
        --header "PRIVATE-TOKEN: $PAT" \
        --url https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests \
        -d "source_branch=bot/chore/bump-version" \
        -d "target_branch=$RELEASE_BRANCH" \
        -d "title=chore: Fix version of \`$RELEASE_BRANCH\`" \
        -d "description=[🤖]: Hi @okoenig 👋,<br><br>we've adjusted the version number of \`$RELEASE_BRANCH\` for you! 🚀<br><br>Please review and approve this cherry pick by your convenience\!"
# Real (non-dry-run) publish jobs: reuse the test_release definitions with
# PUBLISH_DRYRUN disabled, gated by the manual .publish_common_release rules.
publish:release_pypi_build_wheel:
  extends: [publish:test_release_pypi_build_wheel, .publish_common_release]
  dependencies: []
  variables:
    PUBLISH_DRYRUN: "no"
publish:release_pypi_test_wheel:
  extends: [publish:test_release_pypi_test_wheel, .publish_common_release]
  needs: [publish:release_pypi_build_wheel]
  variables:
    PUBLISH_DRYRUN: "no"
publish:release_pypi_push_wheel:
  extends: [publish:test_release_pypi_push_wheel, .publish_common_release]
  needs: [publish:release_pypi_test_wheel]
  dependencies: [publish:release_pypi_test_wheel]
  variables:
    PUBLISH_DRYRUN: "no"
publish:release_github:
  extends: [publish:test_release_github, .publish_common_release]
  dependencies: [publish:release_pypi_test_wheel]
  needs: [publish:release_pypi_test_wheel]
  variables:
    PUBLISH_DRYRUN: "no"
publish:release_version_bump:
  needs: [publish:release_pypi_test_wheel]
  extends: [publish:test_release_version_bump, .publish_common_release]
  variables:
    PUBLISH_DRYRUN: "no"
publish:release_notify:
  needs: [publish:release_pypi_test_wheel, publish:release_pypi_push_wheel, publish:release_github]
  extends: [publish:test_release_notify, .publish_common_release]
  dependencies: [publish:release_pypi_test_wheel]
  variables:
    PUBLISH_DRYRUN: "no"
# Bumps the megatron-lm submodule pointer inside the external documentation repo.
publish:docs:
  extends: [.publish_common_release]
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  before_script:
    - eval PUBLISH_COMMIT=$PUBLISH_COMMIT
    - git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*'
    - git fetch origin $PUBLISH_COMMIT
    - git checkout $PUBLISH_COMMIT
  script:
    - cd ..
    - rm -rf documentation && git clone --recursive https://gitlab-ci-token:${PAT}@${GITLAB_ENDPOINT}/nemo-megatron-core-tme/documentation.git
    # Check out the published commit inside the submodule, then commit the
    # updated pointer in the documentation repo.
    - cd documentation/megatron-lm
    - git config --global user.email "mcore-bot@nvidia.com"
    - git config --global user.name "Mcore Bot"
    - git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*'
    - git fetch origin $PUBLISH_COMMIT
    - git checkout $PUBLISH_COMMIT
    - cd ..
    - git add megatron-lm
    - |
      git commit -m 'feat: Bump mcore'
    - git push
  rules:
    # NOTE(review): these rules replace the manual gate inherited from
    # .publish_common_release — confirm that is intentional.
    - if: '$CI_COMMIT_REF_PROTECTED == "true" && $CI_PIPELINE_SOURCE == "push"'
      allow_failure: true
    - when: never
# Uploads test statistics from this pipeline to the results dashboard.
publish:upload_statistics:
  stage: publish
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  needs:
    - job: test:unit_tests_pyt(DEV)_mcore(latest)
    - job: test:unit_tests_pyt(LTS)_mcore(latest)
    - job: functional:run_lts_dgx_a100
      optional: true
    - job: functional:run_lts_dgx_h100
      optional: true
    - job: functional:run_dev_dgx_a100
      optional: true
    - job: functional:run_dev_dgx_h100
      optional: true
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  script:
    - env
    - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
    - export GITLAB_ENDPOINT
    - export DASHBOARD_ENDPOINT
    - python tests/test_utils/python_scripts/dashboard.py --pipeline-id ${CI_PIPELINE_ID}
  rules:
    - if: ($CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' || $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train') && ($UNIT_TEST == "yes" || $INTEGRATION_TEST == "yes" || $FUNCTIONAL_TEST == "yes")
      when: always
      allow_failure: true
    - when: never
# Scheduled Slack reminder about merge requests awaiting review.
# NOTE(review): the prefix is "public:" unlike the "publish:" siblings —
# possibly a typo; renaming would change the job name, so it is left as-is.
public:review_reminder:
  stage: publish
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  script:
    - export GITLAB_ENDPOINT
    - export RO_API_TOKEN=${PAT}
    - export SLACK_WEBHOOK_URL=${SLACK_REMINDER_HOOK}
    - export SLACK_API_TOKEN=${SLACK_API_TOKEN}
    - python tests/test_utils/python_scripts/auto_reminder.py
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  rules:
    - if: $CI_COMMIT_BRANCH == "ci-review-reminder" && $PUBLISH == "yes" && $PUBLISH_SCOPE == "review-reminder"
    - when: never
# --- Web-UI residue below (not CI configuration); commented out so the file stays parseable ---
# Markdown is supported
# 0% or .
# You are about to add 0 people to the discussion. Proceed with caution.
# Finish editing this message first!
# Please register or to comment