Commit 2d2fca6c authored by jerrrrry's avatar jerrrrry
Browse files

Initial commit

parents
Pipeline #3401 failed with stages
in 0 seconds
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Composite-action metadata: the input contract for the NeMo test template.
name: "Test Template"
description: "Template for running NeMo tests in a containerized environment"

inputs:
  # NOTE(review): input names mix kebab-case and snake_case; kept as-is
  # because callers reference these exact names.
  container-image:
    description: "Container image to use for test"
    required: true
  timeout:
    description: "Max runtime of test in minutes"
    required: false
    default: "30"
  script:
    # NOTE(review): declared but not referenced by any step below — confirm
    # whether this input is still consumed anywhere before removing it.
    description: "Test script to execute"
    required: true
  is-optional:
    description: "Pass this job on failure."
    required: false
    default: "false"
  is_unit_test:
    description: "Upload coverage as unit test"
    required: false
    default: "false"
  tag:
    description: "Latest or legacy test suite"
    required: true
  test_case:
    description: "Test case to launch"
    required: true
  model:
    description: "Model to launch"
    required: false
  PAT:
    description: "GitHub Personal Access Token"
    required: true
# Steps of the composite test action: build a job script, run it in the
# pulled container image, then publish coverage/log artifacts.
runs:
  using: "composite"
  steps:
    - name: Checkout repository
      # Bumped v2 -> v4 for consistency with the calling workflows
      # (checkout@v2 runs on a deprecated Node runtime).
      uses: actions/checkout@v4

    - name: Change ownership of /home/runner/
      shell: bash
      run: sudo chown -R $(whoami) /home/runner/

    - name: Setup python
      uses: actions/setup-python@v5
      with:
        # Quoted so YAML does not parse the version as a float.
        python-version: "3.12"

    - name: Install uuidgen
      shell: bash -x -e -u -o pipefail {0}
      run: |
        apt-get update
        apt-get install -y uuid-runtime

    - name: Create run-script (unit test)
      shell: bash -x -e -u -o pipefail {0}
      if: inputs.is_unit_test == 'true'
      run: |
        echo "::group::Create run-script"
        # Quoted heredoc delimiter: bash expands nothing inside it; the
        # ${{ }} tokens are substituted by GitHub before the step runs.
        cmd=$(cat <<'RUN_TEST_EOF'
        #!/bin/bash
        export PYTHONPATH=$(pwd)
        export NEMORUN_HOME=$(pwd)
        pip install --no-cache-dir uv
        uv sync --only-group test
        uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \
        --scope unit-tests \
        --model unit-tests \
        --test-case "${{ inputs.test_case }}" \
        --environment dev \
        --platform dgx_h100 \
        --tag ${{ inputs.tag }} \
        --container-image ${{ inputs.container-image }}
        RUN_TEST_EOF
        )
        echo "$cmd" | tee "job.sh"
        echo "::endgroup::"

    - name: Get PR info
      id: get-pr-info
      if: startsWith(github.ref, 'refs/heads/pull-request/')
      uses: nv-gha-runners/get-pr-info@main

    - name: Install GH CLI
      shell: bash -x -e -u -o pipefail {0}
      run: |
        apt-get update
        apt-get install -y gh

    - name: Has Run tests label
      shell: bash -x -e -u -o pipefail {0}
      id: has-run-tests-label
      env:
        GH_TOKEN: ${{ github.token }}
      run: |
        PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
        # BUGFIX: assign the fallback on failure; the old `|| echo "false"`
        # only printed to stdout and left the variable empty.
        HAS_RUN_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run tests")') || HAS_RUN_TESTS_LABEL="false"
        echo "main=$HAS_RUN_TESTS_LABEL" | tee -a $GITHUB_OUTPUT

    - name: Create run-script (e2e test)
      shell: bash -x -e -u -o pipefail {0}
      if: inputs.is_unit_test == 'false'
      env:
        # NOTE(review): MODEL is exported but the script below interpolates
        # ${{ inputs.model }} directly — confirm whether MODEL is still needed.
        MODEL: ${{ inputs.model }}
      run: |
        echo "::group::Create run-script"
        cmd=$(cat <<'RUN_TEST_EOF'
        #!/bin/bash
        set -euxo pipefail
        export PYTHONPATH=$(pwd)
        export NEMORUN_HOME=$(pwd)
        pip install --no-cache-dir uv
        uv sync --only-group test
        uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \
        --scope mr \
        --model ${{ inputs.model }} \
        --test-case ${{ inputs.test_case }} \
        --environment dev \
        --platform dgx_h100 \
        --container-image ${{ inputs.container-image }} \
        --data-dir /mnt/datadrive/TestData/megatron-lm/artifacts \
        --enable-lightweight-mode
        RUN_TEST_EOF
        )
        echo "$cmd" | tee "job.sh"
        echo "::endgroup::"

    - name: Set timeout
      shell: bash -x -e -u -o pipefail {0}
      id: timeout_in_seconds
      run: |
        echo "::group::Set timeout"
        echo "main=$(( ${{ inputs.timeout }} * 60 ))" | tee -a "$GITHUB_OUTPUT"
        echo "::endgroup::"

    - name: Pull container
      shell: bash -x -e -u -o pipefail {0}
      run: |
        echo "::group::Pull container"
        docker pull ${{ inputs.container-image }}
        echo "::endgroup::"

    - name: Run main script
      shell: bash -x -e -u -o pipefail {0}
      id: run-main-script
      run: |
        echo "::group::Run main script"
        EXIT_CODE=0
        /bin/bash job.sh || EXIT_CODE=$?
        echo "exit_code=$EXIT_CODE" | tee -a "$GITHUB_OUTPUT"
        # BUGFIX: close the log group before exiting; the old order left the
        # ::endgroup:: after `exit`, unreachable.
        echo "::endgroup::"
        exit $EXIT_CODE

    - name: Check result
      id: check
      shell: bash -x -e -u -o pipefail {0}
      if: always()
      env:
        IS_UNIT_TEST: ${{ inputs.is_unit_test == 'true' }}
      run: |
        echo "::group::Check result"
        # Artifact names must not contain '/' or '*'.
        logs_report=logs-${{ inputs.test_case }}-${{ github.run_id }}-$(uuidgen)
        echo "logs_report=$logs_report" | sed 's/\//-/g' | sed 's/\*/-/g' | tee -a "$GITHUB_OUTPUT"
        if [[ "$IS_UNIT_TEST" == "true" ]]; then
          coverage_report=coverage-${{ inputs.is_unit_test == 'true' && 'unit-test' || 'e2e' }}-${{ github.run_id }}-$(uuidgen)
        else
          coverage_report=none
        fi
        echo "coverage_report=$coverage_report" | tee -a "$GITHUB_OUTPUT"
        EXIT_CODE=${{ steps.run-main-script.outputs.exit_code }}
        IS_SUCCESS=$([[ "$EXIT_CODE" -eq 0 ]] && echo "true" || echo "false")
        if [[ "$IS_SUCCESS" == "false" && "${{ inputs.is-optional }}" == "true" ]]; then
          echo "::warning:: Test failed, but displayed as successful because it is marked as optional."
          IS_SUCCESS=true
        fi
        if [[ "$IS_SUCCESS" == "false" ]]; then
          echo Test did not finish successfully.
          echo "::endgroup::"
          exit 1
        fi
        if [[ "$coverage_report" != "none" ]]; then
          uv run coverage report -i
        fi
        echo "::endgroup::"
        # BUGFIX: was `exit $EXIT_CODE`, which re-failed the step for
        # optional tests even after declaring them successful above.
        exit 0

    - name: Upload coverage
      uses: actions/upload-artifact@v4
      if: ${{ always() && steps.check.outputs.coverage_report != 'none' }}
      with:
        name: ${{ steps.check.outputs.coverage_report }}
        path: |
          coverage.xml
          .coverage
        include-hidden-files: true

    - name: Upload logs
      uses: actions/upload-artifact@v4
      if: always()
      with:
        name: ${{ steps.check.outputs.logs_report }}
        path: ${{ inputs.is_unit_test == 'true' && 'logs' || 'assets_dir' }}
        include-hidden-files: true
# NOTE(review): orphaned fragment — these keys do not belong to the composite
# action above; presumably a separate sync-bot configuration file was
# concatenated here during extraction. Verify against the repository layout.
enabled: true
auto_sync_draft: false
auto_sync_ready: true
# What does this PR do ?
<!-- Add a one line overview of what this PR aims to accomplish. -->
:warning: For major changes (either in lines of code or in their impact), please make sure to first share and discuss a design doc with the team.
## Contribution process
```mermaid
flowchart LR
A[Pre-checks] --> B[PR Tests]
subgraph Code Review/Approval
C1[Expert Review] --> C2[Final Review]
end
B --> C1
C2 --> D[Merge]
```
### Pre-checks
- [ ] I want this PR in a versioned release and have added the appropriate Milestone (e.g., `Core 0.8`)
- [ ] I have added relevant unit tests
- [ ] I have added relevant functional tests
- [ ] I have added proper typing to my code [Typing guidelines](https://docs.python.org/3/library/typing.html)
- [ ] I have added relevant documentation
- [ ] I have run the [autoformatter.sh](https://github.com/NVIDIA/Megatron-LM/blob/main/tools/autoformat.sh) on my PR
### Code review
The following process is enforced via the CODEOWNERS file for changes into `megatron/core`. For changes outside of `megatron/core`, it is up to the PR author whether or not to tag the Final Reviewer team.
<details>
<summary>For MRs into `main` branch</summary>
#### (Step 1): Add PR label `Expert Review`
#### (Step 2): Collect the expert reviewers reviews
1. Attach the `Expert Review` label when your PR is ready for review.
2. GitHub auto-assigns expert reviewers based on your changes. They will get notified and pick up your PR soon.
:warning: Only proceed to the next step once all reviewers have approved, merge conflicts are resolved, and the CI is passing.
Final Review might get declined if these requirements are not fulfilled.
#### (Step 3): Final Review
1. Add `Final Review` label
2. GitHub auto-assigns final reviewers based on your changes. They will get notified and pick up your PR soon.
#### (Optional Step 4): Cherry-pick into release branch
If this PR also needs to be merged into `core_r*` release branches, after this PR has been merged, select `Cherry-pick` to open a new PR into the release branch.
</details>
<details>
<summary>For MRs into `dev` branch</summary>
The proposed review process for `dev` branch is under active discussion.
MRs are mergeable after one approval by either `eharper@nvidia.com` or `zijiey@nvidia.com`.
</details>
### Merging your PR
Any member of [`core-adlr`](https://github.com/orgs/NVIDIA/teams/core-adlr) and [`core-nemo`](https://github.com/orgs/NVIDIA/teams/core-nemo) will be able to merge your PR.
# Reusable workflow: bump the uv lock file on a target branch and open a PR.
# Quoted because a leading `~` would otherwise be easy to misread as YAML null.
name: "~Update dependencies template"

on:
  workflow_call:
    inputs:
      target-branch:
        required: true
        type: string
        description: "The target branch to bump"
    secrets:
      PAT:
        required: true
      AZURE_CLIENT_ID:
        required: true
      AZURE_TENANT_ID:
        required: true
      AZURE_SUBSCRIPTION_ID:
        required: true
      SSH_KEY:
        required: true
      SSH_PWD:
        required: true
jobs:
  # Computes the dated bump-branch name shared by the jobs below.
  pre-flight:
    runs-on: ubuntu-latest
    outputs:
      bump-branch: bump-ci-container-${{ steps.ref.outputs.date }}-${{ inputs.target-branch }}
      date: ${{ steps.ref.outputs.date }}
    steps:
      - name: Get date
        id: ref
        run: echo "date=$(date +%F)" | tee -a "$GITHUB_OUTPUT"

  # Builds the CI container, regenerates uv.lock inside it, uploads the result.
  update-lockfile:
    environment: nemo-ci
    runs-on: linux-amd64-cpu16
    needs: [pre-flight]
    env:
      SOURCE_BRANCH: ${{ needs.pre-flight.outputs.bump-branch }}
      TARGET_BRANCH: ${{ inputs.target-branch }}
    steps:
      - name: Install Azure CLI
        run: curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
      - name: Azure Login
        uses: azure/login@v2
        with:
          client-id: ${{ secrets.AZURE_CLIENT_ID }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
      - name: Azure ACR Login
        run: az acr login --name nemoci
      - name: Checkout repo
        uses: actions/checkout@v4
        with:
          ref: ${{ env.TARGET_BRANCH }}
      - name: Build container
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          docker build -f docker/Dockerfile.ci.dev --build-arg FROM_IMAGE_NAME="nvcr.io/nvidia/pytorch:25.06-py3" --target=main -t megatron-core .
      - name: Create bump branch if not exists
        run: |
          if ! git ls-remote --exit-code origin $SOURCE_BRANCH; then
            git checkout -b $SOURCE_BRANCH $TARGET_BRANCH
            git push origin $SOURCE_BRANCH
          fi
      - name: Checkout repo
        uses: actions/checkout@v4
        with:
          ref: ${{ env.SOURCE_BRANCH }}
      - name: Upgrade lock file
        env:
          GH_TOKEN: ${{ secrets.PAT }}
        run: |
          # SECURITY: `-e GH_TOKEN` inherits the value from the step env
          # instead of interpolating the secret into docker's argv (where it
          # would be visible in the process list).
          docker run \
            --rm \
            -v $(pwd):/workspace \
            -w /workspace \
            -e GH_TOKEN \
            megatron-core \
            bash -c 'uv lock --upgrade'
      - name: Upload lock file
        uses: actions/upload-artifact@v4
        with:
          name: lock-file-${{ env.SOURCE_BRANCH }}
          path: uv.lock
create-pr:
needs: [update-lockfile, pre-flight]
runs-on: ubuntu-latest
environment: main
env:
SOURCE_BRANCH: ${{ needs.pre-flight.outputs.bump-branch }}
TARGET_BRANCH: ${{ inputs.target-branch }}
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
token: ${{ secrets.PAT }}
ref: ${{ env.TARGET_BRANCH }}
- name: Install GPG
run: sudo apt-get install -y gnupg2
- name: Import GPG key (for signing)
uses: crazy-max/ghaction-import-gpg@e89d40939c28e39f97cf32126055eeae86ba74ec
id: gpg-action
with:
gpg_private_key: ${{ secrets.SSH_KEY }}
passphrase: ${{ secrets.SSH_PWD }}
git_user_signingkey: true
git_commit_gpgsign: true
- name: Rebase against ${{ env.SOURCE_BRANCH }}
run: |
if git ls-remote --exit-code origin ${{ env.SOURCE_BRANCH }}; then
git fetch origin ${{ env.SOURCE_BRANCH }}
git rebase -S origin/${{ env.SOURCE_BRANCH }}
fi
- name: Download lock file
uses: actions/download-artifact@v4
with:
name: lock-file-${{ env.SOURCE_BRANCH }}
- name: Create Bump PR
uses: peter-evans/create-pull-request@v6
id: create-pull-request
env:
title: "chore(beep boop 🤖): Bump `uv.lock` (${{ inputs.target-branch}}) (${{ needs.pre-flight.outputs.date }})"
with:
branch: ${{ env.SOURCE_BRANCH }}
base: ${{ env.TARGET_BRANCH }}
title: ${{ env.title }}
token: ${{ secrets.PAT }}
body: |
🚀 PR to bump `uv.lock` in `${{ inputs.target-branch }}`.
📝 Please remember the following to-do's before merge:
- [ ] Verify the presubmit CI
🙏 Please merge this PR only if the CI workflow completed successfully.
commit-message: ${{ env.title }}
signoff: true
committer: "${{ steps.gpg-action.outputs.name }} <${{ steps.gpg-action.outputs.email }}>"
# Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: Build, test, and publish a PyPi wheel (to testpypi).

on:
  push:
    branches:
      - main
      - "pull-request/[0-9]+"
      - "deploy-release/*"
  merge_group:
    types: [checks_requested]

defaults:
  run:
    shell: bash -x -e -u -o pipefail {0}

permissions:
  id-token: write
  contents: read

jobs:
  pre-flight:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.65.5

  build-test-publish-wheel:
    needs: [pre-flight]
    if: |
      !(needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_merge_group == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true')
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_test_publish_wheel.yml@v0.63.1
    with:
      dry-run: true
      python-package: megatron.core
      python-version: "3.10"
      packaging: uv
      no-publish: ${{ !(github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) }}
      custom-container: nvcr.io/nvidia/pytorch:25.05-py3
      no-build-isolation: true
      runner: linux-amd64-cpu16
    secrets:
      TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
      TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
      SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
      SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
      GH_TOKEN: ${{ secrets.PAT }}

  # Status-summary gate: green when every completed job succeeded, or when
  # this run falls into an allowed-to-skip category.
  build-test-publish-wheel-summary:
    needs: [pre-flight, build-test-publish-wheel]
    # NOTE(review): `always()` makes the first three clauses redundant;
    # kept for documentation of the skippable cases.
    if: |
      (
        needs.pre-flight.outputs.docs_only == 'true'
        || needs.pre-flight.outputs.is_merge_group == 'true'
        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
        || always()
      )
      && !cancelled()
    runs-on: ubuntu-latest
    env:
      # gh needs a token and an explicit repo (this job performs no checkout).
      GH_TOKEN: ${{ github.token }}
      # BUGFIX: SKIPPING_IS_ALLOWED was referenced but never defined, which
      # aborted the step under the workflow-wide `bash -u` shell. Allow
      # skipping in the same cases the job's `if` marks as skippable.
      SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' }}
    steps:
      - name: Result
        run: |
          # BUGFIX: assign the fallback on failure; `|| echo 0` only printed
          # to stdout and left FAILED_JOBS empty.
          FAILED_JOBS=$(gh run view $GITHUB_RUN_ID -R "$GITHUB_REPOSITORY" --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || FAILED_JOBS=0
          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "${SKIPPING_IS_ALLOWED:-false}" == "true" ]; then
            echo "✅ All previous jobs completed successfully"
            exit 0
          else
            echo "❌ Found $FAILED_JOBS failed job(s)"
            # Show which jobs failed
            gh run view $GITHUB_RUN_ID -R "$GITHUB_REPOSITORY" --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
            exit 1
          fi
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE(review): the name says "to main ... from release", but the trigger is
# a push to main and the pattern matches release branches — presumably this
# cherry-picks main commits into the release branches. Verify against the
# reusable workflow's semantics.
name: Create PR to main with cherry-pick from release

on:
  push:
    branches:
      - main

jobs:
  cherry-pick:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cherry_pick.yml@v0.65.9
    with:
      target-branches-pattern: 'core_(*dev_)?r[0-9]+\.[0-9]+\.[0-9]+'
    secrets:
      PAT: ${{ secrets.PAT }}
      SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Queue manager: every 5 minutes, approve waiting "CICD Megatron-LM" runs
# (oldest first) per target branch, up to a concurrency budget.
name: Approve Test Queue

on:
  schedule:
    - cron: "*/5 * * * *" # Runs every 5 minutes
  workflow_dispatch: # Allows manual triggering

jobs:
  approve-queue:
    runs-on: ubuntu-latest
    environment: main
    strategy:
      matrix:
        branch: [main, dev, others]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install requests
      - name: Approve waiting deployments
        env:
          GITHUB_TOKEN: ${{ secrets.PAT }}
          MAX_CONCURRENCY: ${{ vars.MAX_CONCURRENCY || 1 }}
          PYTHONUNBUFFERED: 1
        shell: python
        run: |
          import os
          import re

          import requests

          # GitHub API configuration.
          GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]
          REPO = os.environ["GITHUB_REPOSITORY"]
          # NOTE(review): budget is halved — presumably to split capacity
          # between suites; confirm the intent of the // 2.
          MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY"]) // 2
          # BUGFIX: use the repository this workflow runs in; REPO was read
          # but unused while owner/name were hard-coded.
          API_BASE = f"https://api.github.com/repos/{REPO}"

          # Headers for GitHub API.
          headers = {
              "Authorization": f"token {GITHUB_TOKEN}",
              "Accept": "application/vnd.github.v3+json",
              "X-GitHub-Api-Version": "2022-11-28",
          }

          def make_request(endpoint, method="GET", data=None):
              """Make a request to the GitHub API; return parsed JSON or None on error."""
              url = f"{API_BASE}/{endpoint}"
              try:
                  if method == "GET":
                      response = requests.get(url, headers=headers)
                  else:
                      response = requests.post(url, headers=headers, json=data)
                  response.raise_for_status()
                  return response.json()
              except requests.exceptions.RequestException as e:
                  print(f"Error making request to {endpoint}: {str(e)}")
                  # e.response is None for connection-level failures.
                  if e.response is not None and hasattr(e.response, 'text'):
                      print(f"Response: {e.response.text}")
                  return None

          def is_pr_targeting_branch(workflow_run, target_branch):
              """
              Check if a workflow run belongs to a PR targeting the given branch.
              Extract PR number from head branch like 'pull-request/1913' and verify base branch.
              """
              print(workflow_run.get("head_branch", ""))
              head_branch = workflow_run.get("head_branch", "")
              match = re.match(r"pull-request/(\d+)", head_branch)
              if not match:
                  return False  # Not a PR branch pattern
              pr_number = int(match.group(1))
              # Fetch PR info from GitHub API.
              pr_info = make_request(f"pulls/{pr_number}")
              if not pr_info:
                  print(f"Failed to fetch PR #{pr_number}")
                  return False
              base_branch = pr_info.get("base", {}).get("ref")
              # "others" bucket collects PRs that target neither main nor dev.
              if (
                  (base_branch == target_branch) or
                  (base_branch != "main" and base_branch != "dev" and target_branch == "others")
              ):
                  print(f"PR #{pr_number} targets {target_branch}")
                  return True
              return False

          # Get current running and queued workflows.
          # BUGFIX: make_request returns None on API errors; `or {}` keeps a
          # transient failure from crashing with AttributeError.
          print("Fetching workflow runs...")
          queued_workflow_runs = (make_request("actions/runs?status=queued") or {}).get("workflow_runs", [])
          in_progress_workflow_runs = (make_request("actions/runs?status=in_progress") or {}).get("workflow_runs", [])

          # Filter for workflows belonging to PRs targeting ${{ matrix.branch }}.
          queued_workflow_runs = [run for run in queued_workflow_runs
                                  if run["name"] == "CICD Megatron-LM" and is_pr_targeting_branch(run, "${{ matrix.branch }}")]
          in_progress_workflow_runs = [run for run in in_progress_workflow_runs
                                       if run["name"] == "CICD Megatron-LM" and is_pr_targeting_branch(run, "${{ matrix.branch }}")]

          # Count running and queued workflows.
          queued_workflows = len(queued_workflow_runs)
          in_progress_workflows = len(in_progress_workflow_runs)
          total_workflows = queued_workflows + in_progress_workflows
          print(f"Current queued workflows (PRs targeting ${{ matrix.branch }}): {queued_workflows}")
          print(f"Current running workflows (PRs targeting ${{ matrix.branch }}): {in_progress_workflows}")
          print(f"Total workflows: {total_workflows}")
          print(f"Max concurrency: {MAX_CONCURRENCY}")
          if total_workflows >= MAX_CONCURRENCY:
              print("Maximum concurrency reached, no new approvals will be made")
              exit(0)

          # Get waiting CI workflows for test environment.
          print("Fetching deployments...")
          pending_workflows = (make_request("actions/runs?status=waiting") or {}).get("workflow_runs", [])
          print("Pending workflows:", len(pending_workflows))
          pending_workflows = [run for run in pending_workflows
                               if run["name"] == "CICD Megatron-LM" and is_pr_targeting_branch(run, "${{ matrix.branch }}")]

          # Approve oldest first until the budget is exhausted.
          print("Sorting workflows...")
          pending_workflows = sorted(pending_workflows, key=lambda x: x["created_at"])
          print(f"Processing {len(pending_workflows)} pending workflows...")
          for workflow in pending_workflows:
              if total_workflows >= MAX_CONCURRENCY:
                  print("Maximum concurrency reached, stopping approvals")
                  break
              workflow_id = workflow["id"]
              workflow_name = workflow["display_title"]
              print(f"Approving workflow {workflow_name} with Run Id: {workflow_id}")
              deployment_url = f"actions/runs/{workflow_id}/pending_deployments"
              # BUGFIX: guard against an error response or an empty list
              # before indexing [0].
              pending_deployments = make_request(deployment_url) or []
              if not pending_deployments:
                  print(f"No pending deployments for run {workflow_id}, skipping")
                  continue
              deployment = pending_deployments[0]
              environment_id = deployment["environment"]["id"]
              # Approve the deployment.
              status_data = {
                  "environment_ids": [environment_id],
                  "state": "approved",
                  "comment": "Automatically approved by queue manager"
              }
              result = make_request(deployment_url, method="POST", data=status_data)
              if result:
                  total_workflows += 1
              else:
                  print(f"Failed to approve deployment {deployment['id']}")
                  exit(1)
notify:
if: failure()
runs-on: ubuntu-latest
needs: [approve-queue]
steps:
- name: Notify
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
SLACK_WEBHOOK_ADMIN: <!subteam^${{ secrets.SLACK_WEBHOOK_ADMIN }}>
GITHUB_RUN_ID: ${{ github.run_id }}
GITHUB_REPOSITORY: ${{ github.repository }}
run: |
curl -X POST \
-H 'Content-type: application/json' \
--data "{\"text\":\":robot_joy: <https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}|Test-queue-approval-bot workflow> failed. Please review manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}" \
$SLACK_WEBHOOK
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Main CI pipeline: nightly schedule, pushes to dev/main/PR/deploy branches,
# merge queue, and manual dispatch.
name: CICD Megatron-LM

on:
  schedule:
    # Quoted: an unquoted cron with '*' is fragile in plain YAML scalars.
    - cron: "0 0 * * *"
  push:
    branches:
      - dev
      - main
      - "pull-request/[0-9]+"
      - "deploy-release/*"
  merge_group:
    types: [checks_requested]
  workflow_dispatch:

# One run per ref/event; newer pushes cancel in-flight runs.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
  cancel-in-progress: true

permissions:
  id-token: write
  contents: read

env:
  container-registry: 766267172432.dkr.ecr.us-east-1.amazonaws.com
jobs:
  # Gate: fails (blocking downstream jobs) unless the PR author is a repo or
  # org collaborator, or the run is main/merge-group/scheduled.
  is-not-external-contributor:
    runs-on: ubuntu-latest
    environment: nemo-ci
    outputs:
      # NOTE(review): 'User' is the account type of every personal account,
      # so this flag alone does not identify external contributors — the real
      # gate is the membership check below. Confirm consumers of this output.
      is_external_contributor: ${{ github.event.pull_request.user.type == 'User' }}
    permissions:
      issues: write
      pull-requests: write
    env:
      GITHUB_TOKEN: ${{ secrets.PAT }}
      REPO: ${{ github.repository }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          token: ${{ env.GITHUB_TOKEN }}
      - name: Get PR info
        id: get-pr-info
        if: startsWith(github.ref, 'refs/heads/pull-request/')
        uses: nv-gha-runners/get-pr-info@main
      - name: Check membership
        id: check-membership
        env:
          IS_MAIN_BRANCH: ${{ github.ref == 'refs/heads/main' }}
          IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }}
          SCHEDULED_JOB: ${{ github.event_name == 'schedule' }}
        run: |
          PR_AUTHOR=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }}
          if [ "${{ env.SCHEDULED_JOB }}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ] || [ "${IS_MERGE_GROUP}" == "true" ]; then
            echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT
            exit 0
          fi
          echo "Checking if $PR_AUTHOR is a repo collaborator..."
          API_URL="https://api.github.com/repos/$REPO/collaborators/$PR_AUTHOR"
          REPO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \
            -H "Accept: application/vnd.github+json" \
            -H "Authorization: Bearer $GITHUB_TOKEN" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
            $API_URL)
          echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA-NeMo..."
          API_URL="https://api.github.com/orgs/NVIDIA-NeMo/members/$PR_AUTHOR"
          ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \
            -H "Accept: application/vnd.github+json" \
            -H "Authorization: Bearer $GITHUB_TOKEN" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
            $API_URL)
          echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA..."
          API_URL="https://api.github.com/orgs/NVIDIA/members/$PR_AUTHOR"
          ORG_NVIDIA_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \
            -H "Accept: application/vnd.github+json" \
            -H "Authorization: Bearer $GITHUB_TOKEN" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
            $API_URL)
          # 204 means membership confirmed for each endpoint.
          if [ "$REPO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_MEMBERSHIP_RESPONSE" -eq 204 ]; then
            echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT
          else
            echo "is_maintainer=false" | tee -a $GITHUB_OUTPUT
          fi
      - name: Find Comment
        uses: peter-evans/find-comment@v4
        if: startsWith(github.ref, 'refs/heads/pull-request/')
        id: fc
        with:
          issue-number: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
          repository: ${{ github.repository }}
          body-includes: "<!--external-contributor-comment-->"
      - name: Delete comment
        uses: actions/github-script@v7
        if: startsWith(github.ref, 'refs/heads/pull-request/') && steps.fc.outputs.comment-id != ''
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            await github.rest.issues.deleteComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              comment_id: ${{ steps.fc.outputs.comment-id }}
            })
      - name: Write pull request comment
        if: startsWith(github.ref, 'refs/heads/pull-request/') && steps.check-membership.outputs.is_maintainer == 'false'
        uses: peter-evans/create-or-update-comment@v5
        with:
          issue-number: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
          repository: ${{ github.repository }}
          body: |
            <!--external-contributor-comment-->
            Thank you for your contribution!
            NVIDIA Megatron-LM is currently transitioning to development on Github. We will aim to review your PR after we complete our transition and stabilize our Github development process.
            Thank you for your understanding.
      - name: exit
        run: |
          if [ "${{ steps.check-membership.outputs.is_maintainer }}" == "true" ]; then
            exit 0
          else
            exit 1
          fi
pre-flight:
needs: [is-not-external-contributor]
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.65.10
linting:
runs-on: ubuntu-latest
needs: [pre-flight]
if: |
(
needs.pre-flight.outputs.is_deployment_workflow == 'false'
&& needs.pre-flight.outputs.is_ci_workload == 'true'
) || (
needs.pre-flight.outputs.is_deployment_workflow == 'false'
&& needs.pre-flight.outputs.is_ci_workload == 'false'
&& needs.pre-flight.outputs.docs_only == 'false'
)
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install uv
uses: astral-sh/setup-uv@v1
with:
version: 0.7.2
- name: Install linting tools
run: |
uv sync --locked --only-group linting
- name: Get PR info
id: get-pr-info
if: startsWith(github.ref, 'refs/heads/pull-request/')
uses: nv-gha-runners/get-pr-info@main
- name: Run linting
if: startsWith(github.ref, 'refs/heads/pull-request/')
run: |
export PATH=".venv/bin:$PATH"
export GITLAB_ENDPOINT=github.com
export CI_PROJECT_NAMESPACE=NVIDIA
export BASE_REF="${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}"
export CHECK_ONLY=true
export SKIP_DOCS=false
bash tools/autoformat.sh
cicd-wait-in-queue:
runs-on: ubuntu-latest
needs: [pre-flight, linting]
environment: ${{ needs.pre-flight.outputs.is_merge_group == 'true' && 'merge-gate' || 'test' }}
if: |
!(needs.pre-flight.outputs.is_ci_workload == 'true'
|| needs.pre-flight.outputs.is_deployment_workflow == 'true'
|| needs.pre-flight.outputs.docs_only == 'true')
steps:
- name: Running CI tests
run: |
echo "Running CI tests"
echo "is_merge_group: ${{ needs.pre-flight.outputs.is_merge_group }}"
cicd-container-build:
needs: [pre-flight, cicd-wait-in-queue]
runs-on: nvidia-ci-aws-gpu-x8
environment: nemo-ci
if: |
(
success()
|| needs.pre-flight.outputs.is_ci_workload == 'true'
|| needs.pre-flight.outputs.force_run_all == 'true'
)
&& needs.pre-flight.outputs.is_merge_group == 'false'
&& !cancelled()
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup python
uses: actions/setup-python@v5
with:
python-version: 3.12
- name: Get PR info
id: get-pr-info
if: startsWith(github.ref, 'refs/heads/pull-request/')
uses: nv-gha-runners/get-pr-info@main
- name: Download test data
shell: bash
env:
GH_TOKEN: ${{ secrets.PAT }}
run: |
echo "::group::Download test data"
pip install --no-cache-dir pygithub click
python tests/test_utils/python_scripts/download_unit_tests_dataset.py --assets-dir ./assets
echo "::endgroup::"
- name: Install GH CLI
shell: bash
run: |
apt-get update
apt-get install -y gh
- name: Pull cache
run: |
docker pull ${{ env.container-registry }}/megatron-lm:main || true
docker pull ${{ env.container-registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} || true
- name: Get last merged PR
id: cache_from
env:
GH_TOKEN: ${{ github.token }}
run: |
LAST_PRS=$(gh api graphql -f query='
query {
repository(owner: "NVIDIA", name: "Megatron-LM") {
pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) {
nodes {
number
}
}
}
}' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do
echo "${{ env.container-registry }}/megatron-lm:$number"
done)
echo "LAST_PRS<<EOF" | tee -a $GITHUB_OUTPUT
echo "$LAST_PRS" | tee -a $GITHUB_OUTPUT
echo "EOF" | tee -a $GITHUB_OUTPUT
- name: Build and push
uses: docker/build-push-action@v5
with:
file: ./docker/Dockerfile.ci.dev
push: true
context: .
target: main
build-args: |
FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:25.09-py3
cache-from: |
${{ env.container-registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}
${{ env.container-registry }}/megatron-lm:main
${{ steps.cache_from.outputs.LAST_PRS }}
no-cache: false
tags: |
${{ env.container-registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}
${{ env.container-registry }}/megatron-lm:${{ github.sha }}
secrets: |
GH_TOKEN=${{ secrets.PAT }}
cicd-parse-unit-tests:
runs-on: ubuntu-latest
outputs:
unit-tests: ${{ steps.parse-unit-tests.outputs.unit-tests }}
needs:
- pre-flight
- cicd-wait-in-queue
- cicd-container-build
if: |
(
success()
|| needs.pre-flight.outputs.is_ci_workload == 'true'
|| needs.pre-flight.outputs.force_run_all == 'true'
)
&& needs.pre-flight.outputs.is_merge_group == 'false'
&& !cancelled()
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Parse unit tests
id: parse-unit-tests
run: |
cat tests/test_utils/recipes/unit-tests.yaml | yq -o json '[.products[].test_case[] | { "bucket": .}] | sort_by(.model, .test_case)' | jq -c > unit-tests.json
echo "unit-tests=$(cat unit-tests.json)" | tee -a $GITHUB_OUTPUT
cicd-unit-tests-latest:
strategy:
fail-fast: false
matrix:
include: ${{ fromJson(needs.cicd-parse-unit-tests.outputs.unit-tests) }}
needs:
- pre-flight
- cicd-wait-in-queue
- cicd-container-build
- cicd-parse-unit-tests
runs-on: nvidia-ci-aws-gpu-x8
name: "${{ matrix.bucket }} - latest"
environment: nemo-ci
if: |
(
success()
|| needs.pre-flight.outputs.is_ci_workload == 'true'
|| needs.pre-flight.outputs.force_run_all == 'true'
)
&& needs.pre-flight.outputs.is_merge_group == 'false'
&& !cancelled()
env:
PIP_DISABLE_PIP_VERSION_CHECK: 1
PIP_NO_PYTHON_VERSION_WARNING: 1
PIP_ROOT_USER_ACTION: ignore
steps:
- name: Checkout
uses: actions/checkout@v4
- name: main
uses: ./.github/actions
with:
test_case: ${{ matrix.bucket }}
tag: latest
timeout: ${{ matrix.timeout || 30 }}
is_unit_test: "true"
PAT: ${{ secrets.PAT }}
container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }}
cicd-parse-integration-tests:
runs-on: ubuntu-latest
needs:
- pre-flight
- cicd-wait-in-queue
- cicd-container-build
- cicd-unit-tests-latest
if: |
(
success()
|| needs.pre-flight.outputs.is_ci_workload == 'true'
|| needs.pre-flight.outputs.force_run_all == 'true'
)
&& needs.pre-flight.outputs.is_merge_group == 'false'
&& !cancelled()
outputs:
integration-tests: ${{ steps.main.outputs.integration-tests }}
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Get PR info
id: get-pr-info
if: startsWith(github.ref, 'refs/heads/pull-request/')
uses: nv-gha-runners/get-pr-info@main
- name: Has Run tests label
id: has-run-tests-label
env:
GH_TOKEN: ${{ secrets.PAT }}
run: |
PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
HAS_RUN_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run tests")') || echo "false"
echo "main=$HAS_RUN_TESTS_LABEL" | tee -a $GITHUB_OUTPUT
- name: Parse functional tests
id: main
env:
HAS_RUN_TESTS_LABEL: ${{ steps.has-run-tests-label.outputs.HAS_RUN_TESTS_LABEL }}
run: |
export PYTHONPATH=$(pwd)
if [ "$HAS_RUN_TESTS_LABEL" == "true" ]; then
ARGS=(
--scope mr
--enable-lightweight-mode
)
else
ARGS=(
--scope mr-slim
)
fi
python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
--n-repeat 5 \
--time-limit 2700 \
--test-cases all \
--container-image mcore_ci_dev \
--container-tag latest \
--dependent-job functional:configure \
--record-checkpoints false \
--slurm-account gh \
--no-enable-warmup \
--environment dev \
--platform dgx_h100 \
--cluster ghci \
${ARGS[@]} \
--output-path integration-tests.yaml
cat integration-tests.yaml | \
yq -o json 'del(.default, .stages, .workflow) | to_entries | map({"model": .value.stage, "test_case": .key}) | sort_by(.model, .test_case)' | jq -c > integration-tests.json
echo "integration-tests=$(cat integration-tests.json)" | tee -a "$GITHUB_OUTPUT"
cicd-integration-tests-latest:
strategy:
fail-fast: false
matrix:
include: ${{ fromJson(needs.cicd-parse-integration-tests.outputs.integration-tests) }}
needs:
- pre-flight
- cicd-wait-in-queue
- cicd-parse-integration-tests
- cicd-unit-tests-latest
runs-on: nvidia-ci-aws-gpu-x8
name: "${{ matrix.model }}/${{ matrix.test_case }} - latest"
environment: nemo-ci
env:
PIP_DISABLE_PIP_VERSION_CHECK: 1
PIP_NO_PYTHON_VERSION_WARNING: 1
PIP_ROOT_USER_ACTION: ignore
if: |
(
success()
|| needs.pre-flight.outputs.is_ci_workload == 'true'
|| needs.pre-flight.outputs.force_run_all == 'true'
)
&& needs.pre-flight.outputs.is_merge_group == 'false'
&& !cancelled()
steps:
- name: Checkout
uses: actions/checkout@v4
- name: main
uses: ./.github/actions
with:
test_case: ${{ matrix.test_case }}
model: ${{ matrix.model }}
tag: latest
timeout: ${{ matrix.timeout || 30 }}
is_unit_test: "false"
PAT: ${{ secrets.PAT }}
container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }}
Nemo_CICD_Test:
needs:
- pre-flight
- cicd-unit-tests-latest
- cicd-integration-tests-latest
if: |
(
needs.pre-flight.outputs.docs_only == 'true'
|| needs.pre-flight.outputs.is_deployment_workflow == 'true'
|| needs.pre-flight.outputs.is_ci_workload == 'true'
|| needs.pre-flight.outputs.is_merge_group == 'true'
|| always()
)
&& !cancelled()
runs-on: ubuntu-latest
permissions: write-all
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Get workflow result
id: result
shell: bash -x -e -u -o pipefail {0}
env:
GH_TOKEN: ${{ github.token }}
RUN_ID: ${{ github.run_id }}
SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' }}
run: |
FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion == "failure")] | length' || echo 0)
SKIPPED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion == "skipped")] | length' || echo 0)
if [ "${FAILED_JOBS:-0}" -eq 0 ] && ([ "${SKIPPED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]); then
echo "✅ All previous jobs completed successfully"
exit 0
else
echo "❌ Found $FAILED_JOBS failed job(s)"
# Show which jobs failed
gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion == "failure") | .name'
exit 1
fi
Coverage_Fake:
runs-on: ubuntu-latest
needs: [Nemo_CICD_Test, pre-flight]
if: |
(
needs.pre-flight.outputs.docs_only == 'true'
|| needs.pre-flight.outputs.is_deployment_workflow == 'true'
|| github.event_name == 'merge_group'
)
&& needs.pre-flight.outputs.is_ci_workload == 'false'
&& !cancelled()
environment: nemo-ci
steps:
- name: Generate fake coverage report
uses: actions/github-script@v6
with:
github-token: ${{ secrets.PAT }}
script: |
await github.rest.repos.createCommitStatus({
owner: context.repo.owner,
repo: context.repo.repo,
sha: context.sha,
state: 'success',
description: 'No code changes - coverage check skipped',
context: 'codecov/patch'
});
Coverage:
runs-on: ubuntu-latest
needs: [Nemo_CICD_Test, pre-flight]
if: |
(
(needs.pre-flight.outputs.is_ci_workload == 'true' && !failure())
|| success()
)
&& !cancelled()
strategy:
matrix:
flag: [unit-test]
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Download coverage reports of current branch
uses: actions/download-artifact@v4
with:
pattern: coverage-${{ matrix.flag }}-*
- name: List coverage files
run: find . -type f -name "*.xml" -o -name "*.lcov"
- name: Get total coverage of current branch
shell: bash -x -e -u -o pipefail {0}
if: always()
run: |
pip install coverage
ls -al .
ls -al coverage-*/
coverage combine --keep $(ls coverage-*/.coverage)
coverage report -i
rm -rf coverage-*
ls -al
- name: Upload coverage reports to Codecov
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
verbose: true
flags: ${{ matrix.flag }}
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: coverage-${{ matrix.flag }}-aggregated
path: |
.coverage
include-hidden-files: true
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: Stale-Close-Inactive-Issues-PRs
on:
schedule:
- cron: "30 1 * * *"
jobs:
close-issues:
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_close_inactive_issue_pr.yml@v0.44.0
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: Community Bot
on:
issues:
types: [opened, edited, reopened, closed, deleted]
issue_comment:
types: [created, edited, deleted]
jobs:
community-bot:
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_community_bot.yml@v0.65.10
secrets:
GH_TOKEN: ${{ secrets.PAT }}
environment: main
# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: Copyright check
on:
push:
branches:
- dev
- main
- "pull-request/[0-9]+"
- "deploy-release/*"
jobs:
pre-flight:
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.64.2
copyright-check:
needs: [pre-flight]
if: |
!(needs.pre-flight.outputs.docs_only == 'true'
|| needs.pre-flight.outputs.is_deployment_workflow == 'true')
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_copyright_check.yml@v0.65.11
copyright-check-summary:
needs: [pre-flight, copyright-check]
if: |
(
needs.pre-flight.outputs.docs_only == 'true'
|| needs.pre-flight.outputs.is_deployment_workflow == 'true'
|| always()
)
&& !cancelled()
runs-on: ubuntu-latest
steps:
- name: Result
env:
GH_TOKEN: ${{ github.token }}
SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' }}
run: |
FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length' || echo 0)
if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
echo "✅ All previous jobs completed successfully"
exit 0
else
echo "❌ Found $FAILED_JOBS failed job(s)"
# Show which jobs failed
gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
exit 1
fi
name: Dependabot
on:
schedule:
- cron: "0 8 * * 1"
workflow_dispatch: # Allow manual triggering
permissions:
id-token: write
contents: write
jobs:
get-release-branch-names:
runs-on: ubuntu-latest
environment: nemo-ci
outputs:
mcore: ${{ steps.get-branch.outputs.mcore_release_branch }}
steps:
- name: Get release branch names
id: get-branch
env:
PAT: ${{ secrets.PAT }}
run: |
latest_branch=$(git ls-remote --heads https://token:${PAT}@github.com/NVIDIA-NeMo/Eval.git 'refs/heads/r*' |
grep -o 'core_r[0-9]\+\.[0-9]\+\.[0-9]\+' |
sort -V |
tail -n1)
echo "mcore_release_branch=$latest_branch" >> $GITHUB_OUTPUT
bump-tags:
needs: [get-release-branch-names]
strategy:
fail-fast: false
matrix:
include:
- target-branch: ${{ needs.get-release-branch-names.outputs.mcore }}
- target-branch: main
uses: ./.github/workflows/_update_dependencies.yml
with:
target-branch: ${{ matrix.target-branch }}
secrets:
PAT: ${{ secrets.PAT }}
AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
SSH_KEY: ${{ secrets.SSH_KEY }}
SSH_PWD: ${{ secrets.SSH_PWD }}
notify:
if: failure()
runs-on: ubuntu-latest
needs: [bump-tags]
steps:
- name: Notify
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
SLACK_WEBHOOK_ADMIN: <!subteam^${{ secrets.SLACK_WEBHOOK_ADMIN }}>
GITHUB_RUN_ID: ${{ github.run_id }}
GITHUB_REPOSITORY: ${{ github.repository }}
run: |
curl -X POST \
-H 'Content-type: application/json' \
--data "{\"text\":\":robot_joy: <https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}|Dependabot workflow> failed. Please fix manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}" \
$SLACK_WEBHOOK
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This workflow verifies that the basic install works across all supported platforms.
# For basic install, all imports need to either be successful or appropriately guarded.
name: Installation Test
on:
push:
branches:
- dev
- main
- "pull-request/[0-9]+"
- "deploy-release/*"
merge_group:
types: [checks_requested]
jobs:
pre-flight:
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.65.5
pip-test-pytorch:
needs: [pre-flight]
if: |
!(needs.pre-flight.outputs.docs_only == 'true'
|| needs.pre-flight.outputs.is_merge_group == 'true'
|| needs.pre-flight.outputs.is_deployment_workflow == 'true')
runs-on: linux-amd64-cpu16
name: Pip - Python${{ matrix.python-version }} - AMD64/Linux - NGC PyTorch
container:
image: nvcr.io/nvidia/pytorch:25.05-py3
environment: nemo-ci
strategy:
fail-fast: false
matrix:
python-version: ["3.12"]
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set PATH
run: |
echo "UV_PROJECT_ENVIRONMENT=/opt/venv" | tee -a "$GITHUB_ENV"
echo "UV_LINK_MODE=copy" | tee -a "$GITHUB_ENV"
echo "CUDA_HOME=/usr/local/cuda" | tee -a "$GITHUB_ENV"
echo "LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" | tee -a "$GITHUB_ENV"
echo "PATH=$HOME/.local/bin:$PATH:$CUDA_HOME/bin" | tee -a "$GITHUB_ENV"
echo "TORCH_CUDA_ARCH_LIST=6.0;6.1;7.0;7.5;8.0;8.6;9.0" | tee -a "$GITHUB_ENV"
- name: Install megatron-core
shell: bash -x -e -u -o pipefail {0}
run: bash docker/common/install.sh --environment dev --base-image pytorch --python-version ${{ matrix.python-version }}
- name: Checkout check-imports
uses: actions/checkout@v4
with:
repository: NVIDIA-NeMo/FW-CI-templates
ref: v0.63.2
path: FW-CI-templates
- name: Check imports for megatron-core
uses: ./FW-CI-templates/.github/actions/check-imports
with:
package-name: megatron.core
python-binary: ${{ env.UV_PROJECT_ENVIRONMENT }}/bin/python
uv-test-pytorch:
needs: [pre-flight]
if: |
!(needs.pre-flight.outputs.docs_only == 'true'
|| needs.pre-flight.outputs.is_merge_group == 'true'
|| needs.pre-flight.outputs.is_deployment_workflow == 'true')
runs-on: linux-amd64-cpu16
name: UV - Python${{ matrix.python-version }} - AMD64/Linux - NGC PyTorch
container:
image: nvcr.io/nvidia/pytorch:25.05-py3
environment: nemo-ci
strategy:
fail-fast: false
matrix:
python-version: ["3.12"]
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set PATH
run: |
echo "UV_PROJECT_ENVIRONMENT=/opt/venv" | tee -a "$GITHUB_ENV"
echo "VIRTUAL_ENV=/opt/venv" | tee -a "$GITHUB_ENV"
echo "UV_LINK_MODE=copy" | tee -a "$GITHUB_ENV"
echo "CUDA_HOME=/usr/local/cuda" | tee -a "$GITHUB_ENV"
echo "LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" | tee -a "$GITHUB_ENV"
echo "PATH=$HOME/.local/bin:$PATH:$CUDA_HOME/bin" | tee -a "$GITHUB_ENV"
echo "CUDACXX=/usr/local/cuda/bin/nvcc" | tee -a "$GITHUB_ENV"
echo "TORCH_CUDA_ARCH_LIST=6.0;6.1;7.0;7.5;8.0;8.6;9.0" | tee -a "$GITHUB_ENV"
- name: Install project
shell: bash
run: bash docker/common/install.sh --environment dev --base-image pytorch --use-uv
# NGC PyTorch 25.05 has a version of triton that is broken on CPU only machines.
# - name: Checkout check-imports
# uses: actions/checkout@v4
# with:
# repository: NVIDIA-NeMo/FW-CI-templates
# ref: v0.63.2
# path: FW-CI-templates
# - name: Check imports for megatron-core
# uses: ./FW-CI-templates/.github/actions/check-imports
# with:
# package-name: megatron.core
# python-binary: ${{ env.UV_PROJECT_ENVIRONMENT }}/bin/python
install-test-summary:
needs: [pre-flight, pip-test-pytorch, uv-test-pytorch]
runs-on: ubuntu-latest
name: Install test summary
if: |
(
needs.pre-flight.outputs.docs_only == 'true'
|| needs.pre-flight.outputs.is_deployment_workflow == 'true'
|| always()
)
&& !cancelled()
steps:
- name: Get workflow result
id: result
shell: bash -x -e -u -o pipefail {0}
env:
GH_TOKEN: ${{ github.token }}
RUN_ID: ${{ github.run_id }}
SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' }}
run: |
FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length' || echo 0)
if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
echo "✅ All previous jobs completed successfully"
exit 0
else
echo "❌ Found $FAILED_JOBS failed job(s)"
# Show which jobs failed
gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
exit 1
fi
__pycache__
*.so
build
.coverage_*
*.egg-info
*~
slurm*
logs
.vscode
local/
.gitmodules
wandb/
onelogger.log
onelogger.err
.venv
runs/
/test_cases/
**/dist/
\ No newline at end of file
.merge_train_rule: &merge_train_rule
UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
INTEGRATION_TEST: "no"
INTEGRATION_TEST_SCOPE: mr
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: mr-slim
FUNCTIONAL_TEST_REPEAT: 1
FUNCTIONAL_TEST_TIME_LIMIT: 2700
CLUSTER_A100: ""
CLUSTER_H100: ""
PUBLISH: "no"
workflow:
rules:
# Do not trigger for forks
- if: $CI_PROJECT_NAMESPACE != "ADLR" || ($CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_PROJECT_PATH != "ADLR/megatron-lm")
when: never
# ci-branches only for schedule
- if: $CI_COMMIT_BRANCH =~ /ci-/ && $CI_PIPELINE_SOURCE != "schedule"
when: never
# For schedules pipelines
- if: $CI_PIPELINE_SOURCE == "schedule"
auto_cancel:
on_new_commit: none
# For manual pipelines
- if: $CI_PIPELINE_SOURCE == "web"
# For push to main
- if: $CI_PIPELINE_SOURCE == 'push' && ($CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH == "dev" || $CI_COMMIT_BRANCH =~ /^core_/)
variables:
UNIT_TEST: "no"
INTEGRATION_TEST: "no"
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: mr
FUNCTIONAL_TEST_REPEAT: 5
FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no"
FUNCTIONAL_TEST_TIME_LIMIT: 3600
CLUSTER_A100: ""
CLUSTER_H100: ""
PUBLISH: "no"
auto_cancel:
on_new_commit: interruptible
# For merge-trains that need to be fast-tracked
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train' && $CI_MERGE_REQUEST_LABELS =~ /fast-track/
variables:
UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
INTEGRATION_TEST: "no"
FUNCTIONAL_TEST: "no"
CLUSTER_A100: ""
CLUSTER_H100: ""
PUBLISH: "no"
# For normal merge-trains
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train'
variables: *merge_train_rule
# For MRs with integration suite
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run tests/
variables:
UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
INTEGRATION_TEST: "yes"
INTEGRATION_TEST_SCOPE: mr
FUNCTIONAL_TEST: "no"
FUNCTIONAL_TEST_SCOPE: mr-slim
FUNCTIONAL_TEST_REPEAT: 1
FUNCTIONAL_TEST_TIME_LIMIT: 2700
CLUSTER_A100: ""
CLUSTER_H100: ""
PUBLISH: "no"
# For MRs with nightly
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run nightly/
variables:
UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
INTEGRATION_TEST: "no"
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: nightly
FUNCTIONAL_TEST_REPEAT: 5
FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no"
FUNCTIONAL_TEST_TIME_LIMIT: 2700
CLUSTER_A100: ""
CLUSTER_H100: ""
PUBLISH: "no"
# For MRs with weekly
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run weekly/
variables:
UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
INTEGRATION_TEST: "no"
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: weekly
FUNCTIONAL_TEST_REPEAT: 1
FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no"
FUNCTIONAL_TEST_TIME_LIMIT: 9000
CLUSTER_A100: ""
CLUSTER_H100: ""
PUBLISH: "no"
# For MRs with heavy suite
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run functional tests/
variables:
UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
INTEGRATION_TEST: "no"
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: mr
FUNCTIONAL_TEST_REPEAT: 1
FUNCTIONAL_TEST_TIME_LIMIT: 2700
CLUSTER_A100: ""
CLUSTER_H100: ""
PUBLISH: "no"
# Default MRs
- if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result'
variables:
UNIT_TEST: "yes"
UNIT_TEST_REPEAT: 1
UNIT_TEST_TIMEOUT: 30
INTEGRATION_TEST: "no"
FUNCTIONAL_TEST: "no"
PUBLISH: "no"
- when: never
auto_cancel:
on_new_commit: interruptible
stages:
- build
- test
- integration_tests
- functional_tests
- publish
default:
interruptible: true
retry:
max: 2
when: runner_system_failure
variables:
UNIT_TEST:
value: "yes"
options:
- "yes"
- "no"
description: To run the unit test suite
UNIT_TEST_REPEAT:
value: "1"
description: "Number of repetitions"
UNIT_TEST_TIMEOUT:
value: "30"
description: Timeout (minutes) for Unit tests (all repeats)
INTEGRATION_TEST:
value: "yes"
options:
- "yes"
- "no"
description: To run the integration test suite
INTEGRATION_TEST_SCOPE:
value: "mr"
options:
- "mr"
- "nightly"
- "weekly"
- "pre-release"
- "release"
description: "Testsuite to run (only for INTEGRATION_TEST=yes)"
INTEGRATION_TEST_TIME_LIMIT:
value: "900"
description: "Timeout in seconds per test"
INTEGRATION_TEST_CASES:
value: "all"
description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite."
FUNCTIONAL_TEST:
value: "yes"
options:
- "yes"
- "no"
description: To run the functional test suite
FUNCTIONAL_TEST_SCOPE:
value: "mr"
options:
- "mr"
- "nightly"
- "weekly"
- "pre-release"
- "release"
description: "Testsuite to run (only for FUNCTIONAL_TEST=yes)"
FUNCTIONAL_TEST_REPEAT:
value: "5"
description: "Number of repetitions per test"
FUNCTIONAL_TEST_TIME_LIMIT:
value: "2700"
description: "Timeout in seconds per test"
FUNCTIONAL_TEST_CASES:
value: "all"
description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite."
FUNCTIONAL_TEST_NAME:
description: "Name of functional test run (only for pre-release and release)"
value: "$$CI_COMMIT_SHA"
FUNCTIONAL_TEST_RECORD_CHECKPOINTS:
value: "no"
description: "Record golden checkpoints"
options:
- "yes"
- "no"
CLUSTER_A100:
value: "dgxa100_dracooci"
options:
- "dgxa100_dracooci"
- "dgxa100_dracooci-ord"
description: "Cluster for A100 workloads"
CLUSTER_H100:
value: "dgxh100_coreweave"
options:
- "dgxh100_coreweave"
- "dgxh100_eos"
description: "Cluster for H100 workloads"
PUBLISH:
value: "no"
options:
- "yes"
- "no"
description: Build and publish a wheel to PyPi
PUBLISH_COMMIT:
value: "$$CI_COMMIT_SHA"
description: Which commit to publish
PUBLISH_VERSION_BUMP_BRANCH:
value: "$$CI_COMMIT_BRANCH"
description: Which branch to target for version bump
PUBLISH_SCOPE:
value: "code-freeze"
options:
- "code-freeze"
- "release"
- "review-reminder"
- "upgrade-dependencies"
description: Type of publish (freeze or final release)
# CI wide variables
CI_MCORE_LTS_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_lts
CI_MCORE_DEV_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_dev
CI_NEMO_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/nemo_ci
UTILITY_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_utility
TE_GIT_REF: ""
include:
- .gitlab/stages/00.pre.yml
- .gitlab/stages/01.build.yml
- .gitlab/stages/02.test.yml
- .gitlab/stages/03.integration-tests.yml
- .gitlab/stages/04.functional-tests.yml
- .gitlab/stages/05.publish.yml
CI:
- .gitlab-ci.yml
- Dockerfile.ci.lts
- Dockerfile.ci.dev
- .github/**
- .gitlab/**
Datasets:
- megatron/core/datasets/**
BERT:
- megatron/core/models/bert/**
GPT:
- megatron/core/models/gpt/**
RETRO:
- megatron/core/models/retro/**
Dist-Ckpt:
- megatron/core/dist_checkpointing
Dist-Opt:
- megatron/core/optimizer/distrib_optimizer
Inference:
- megatron/core/inference
MoE:
- megatron/core/transformer/moe
Tests:
- tests/**
ParallelState:
- megatron/core/parallel_state.py
#! /bin/bash
# CI helper: fetch unit-test assets via a throwaway container, then build and
# push the CI Docker image with registry-backed build caching.
#
# Required environment: IMAGE (name of a variable that itself holds the image
# ref), GH_TOKEN, STAGE, FILE, BASE_IMAGE, CI_PIPELINE_ID, ARTIFACTORY_USER,
# ARTIFACTORY_TOKEN. Optional: CI_COMMIT_BRANCH, CI_MERGE_REQUEST_IID,
# TE_GIT_REF.
set -x
env
# IMAGE holds the *name* of another variable (e.g. "CI_MCORE_DEV_IMAGE");
# dereference it so IMAGE becomes the actual image reference.
eval "IMAGE=\$$IMAGE"
# Start a named container in detached mode
docker run -d --name download_test_data -w /workdir/ python:3.12-slim bash -c 'sleep infinity'
# Copy the test sources in, then download the unit-test dataset inside the
# container (needs GH_TOKEN for the GitHub API).
docker cp tests/. download_test_data:/workdir/tests
docker exec -e GH_TOKEN=$GH_TOKEN download_test_data bash -c '
ls -al /workdir/
pip install --no-cache-dir pygithub click
python tests/test_utils/python_scripts/download_unit_tests_dataset.py --assets-dir ./assets
'
# Extract the downloaded assets to the host and remove the helper container.
docker cp download_test_data:/workdir/assets ./
docker rm -f download_test_data
# Buildx builder backed by the docker-container driver (required for
# --cache-to type=registry below).
docker context create tls-environment
docker buildx create --name container --driver=docker-container --use tls-environment
# Extra build flags depending on branch / merge-request context.
# NOTE: each element deliberately contains a space ("flag value") and the
# array is expanded UNQUOTED below so word splitting turns it into separate
# CLI arguments. Do not quote the expansion.
ADDITIONAL_PARAMS=()
if [[ "$CI_COMMIT_BRANCH" == "ci-rebuild-mcore-nemo-image" || "$CI_COMMIT_BRANCH" == "main" || "$CI_COMMIT_BRANCH" == "dev" ]]; then
ADDITIONAL_PARAMS+=("--pull")
ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:main,mode=max")
ADDITIONAL_PARAMS+=("-t ${IMAGE}:${CI_COMMIT_BRANCH}")
elif [[ -n "$CI_MERGE_REQUEST_IID" ]]; then
ADDITIONAL_PARAMS+=("--cache-to type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID},mode=max")
ADDITIONAL_PARAMS+=("-t ${IMAGE}:${CI_MERGE_REQUEST_IID}")
fi
if [[ "$CI_COMMIT_BRANCH" == "ci-nightly" ]]; then
ADDITIONAL_PARAMS+=("-t ${IMAGE}:nightly")
fi
if [[ -n "$TE_GIT_REF" ]]; then
ADDITIONAL_PARAMS+=("--build-arg TE_COMMIT=${TE_GIT_REF}")
fi
# Log the commit being built.
echo $(git rev-parse HEAD)
# Resolve the newest jet-api version published on the internal PyPI index by
# scraping the simple-index HTML and version-sorting the hrefs.
JET_API_VERSION=$(curl -s -u "$ARTIFACTORY_USER:$ARTIFACTORY_TOKEN" "https://sc-hw-artf.nvidia.com/artifactory/api/pypi/hw-joc-pypi/simple/jet-api/" | grep -o 'href="../../jet-api/[0-9.]*/' | sed 's|href="../../jet-api/||;s|/||' | sort -V -r | head -n1)
# Build and push the image. Caches are pulled from both the MR-specific and
# the main buildcache tags; secrets are forwarded to the build stage.
DOCKER_BUILDKIT=1 docker build \
--secret id=JET_INDEX_URLS \
--secret id=LOGGER_INDEX_URL \
--target $STAGE \
-f docker/$FILE \
-t ${IMAGE}:${CI_PIPELINE_ID} \
--builder=container \
--build-arg JET_API_VERSION=$JET_API_VERSION \
--cache-from type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID} \
--cache-from type=registry,ref=${IMAGE}-buildcache:main \
--build-arg FROM_IMAGE_NAME=$BASE_IMAGE \
--push \
--progress plain \
${ADDITIONAL_PARAMS[@]} .
# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/usr/bin/env python3
"""
Import checker script for megatron.hub package.
This script recursively discovers all Python modules in the specified package
and attempts to import them, reporting any import errors.
"""
import importlib
import os
import sys
import traceback
from typing import Dict, List, Tuple
import click
class ImportChecker:
    """Recursively import every module of a package and report the results.

    Modules are discovered by walking the package's filesystem tree. Each
    module is imported fresh; failures whose traceback mentions
    "UnavailableError" are counted as gracefully-guarded optional
    dependencies rather than hard failures.
    """

    def __init__(self, package_name: str = "megatron.core", verbose: bool = False):
        # Package whose modules will be discovered and imported.
        # NOTE(review): `verbose` is accepted but currently unused; kept for
        # interface compatibility.
        self.package_name = package_name
        self.success_count = 0
        self.failure_count = 0
        self.graceful_count = 0
        # NOTE(review): skipped_count/skipped are never populated (skipping
        # happens during discovery); kept for interface compatibility.
        self.skipped_count = 0
        self.failures: Dict[str, str] = {}
        self.successes: List[str] = []
        self.graceful_failures: Dict[str, str] = {}
        self.skipped: List[str] = []
        # Substrings identifying modules that should not be imported
        # (caches, VCS metadata, test modules).
        self.skip_patterns = {
            "__pycache__",
            ".pytest_cache",
            ".git",
            "test_",
            "_test",
        }
        # Make sure the current working directory is importable.
        current_dir = os.getcwd()
        if current_dir not in sys.path:
            sys.path.insert(0, current_dir)

    def should_skip_module(self, module_name: str) -> bool:
        """Return True if the module name matches any skip pattern."""
        return any(pattern in module_name for pattern in self.skip_patterns)

    def discover_modules(self, package_path: str) -> List[str]:
        """Discover all Python modules under the given importable package.

        Args:
            package_path: Dotted name of an importable package.

        Returns:
            Sorted, de-duplicated list of fully-qualified module names
            (prefixed with ``self.package_name``).
        """
        modules: List[str] = []
        package = importlib.import_module(package_path)
        # Filesystem root of the package (first entry of __path__).
        root_dir = package.__path__[0]
        for root, dirs, files in os.walk(root_dir):
            # Skip hidden directories and __pycache__ in-place so os.walk
            # does not descend into them.
            dirs[:] = [d for d in dirs if not d.startswith(".") and d != "__pycache__"]
            for file in files:
                if file.endswith(".py") and not file.startswith("."):
                    rel_path = os.path.relpath(os.path.join(root, file), root_dir)
                    # Strip the ".py" suffix by slicing instead of
                    # str.replace(".py", ""), which would also corrupt any
                    # path component that merely contains ".py".
                    module_parts = rel_path[:-3].replace(os.sep, ".")
                    # Map package __init__ files onto the package itself.
                    if module_parts.endswith(".__init__"):
                        module_parts = module_parts[: -len(".__init__")]
                    elif module_parts == "__init__":
                        module_parts = ""
                    full_module_name = (
                        f"{self.package_name}.{module_parts}"
                        if module_parts
                        else self.package_name
                    )
                    if not self.should_skip_module(full_module_name):
                        modules.append(full_module_name)
        # De-duplicate and sort for stable output.
        return sorted(set(modules))

    def import_module(self, module_name: str) -> Tuple[str, str]:
        """
        Try to import a module and return success status and error message.

        Returns:
            Tuple of (status: str, error_message: str)
            status can be: "success", "graceful", or "failed"
        """
        try:
            # Drop any cached module so the import is exercised from scratch.
            if module_name in sys.modules:
                del sys.modules[module_name]
            importlib.import_module(module_name)
            return "success", ""
        except Exception:
            tb = traceback.format_exc()
            # "UnavailableError" in the traceback marks an appropriately
            # guarded optional dependency, not a genuine import bug.
            if "UnavailableError" in tb:
                return "graceful", "UnavailableError detected during import"
            return "failed", tb

    def check_all_imports(self):
        """Import every discovered module, print a summary, and return
        True when no hard failures occurred.

        Returns None (falsy) when no modules are discovered at all, which
        callers treat as a failure.
        """
        print(f"Discovering modules in package '{self.package_name}'...")
        modules = self.discover_modules(self.package_name)
        if not modules:
            print("No modules found!")
            return
        print(f"Found {len(modules)} modules to check")
        print("=" * 60)
        for module_name in modules:
            status, error_msg = self.import_module(module_name)
            if status == "success":
                self.success_count += 1
                self.successes.append(module_name)
            elif status == "graceful":
                self.graceful_count += 1
                self.graceful_failures[module_name] = error_msg
            else:  # failed
                self.failure_count += 1
                self.failures[module_name] = error_msg
        self._print_summary()
        return self.failure_count == 0

    def _print_summary(self) -> None:
        """Print aggregate counts plus details for every failed import."""
        total = (
            self.success_count
            + self.failure_count
            + self.graceful_count
            + self.skipped_count
        )
        print("\n" + "=" * 60)
        print("IMPORT CHECK SUMMARY")
        print("=" * 60)
        print(f"Total modules checked: {total}")
        print(
            f"Successful imports: {self.success_count} ({self.success_count / total * 100:.1f}%)"
        )
        print(
            f"Gracefully handled: {self.graceful_count} ({self.graceful_count / total * 100:.1f}%)"
        )
        print(
            f"Failed imports: {self.failure_count} ({self.failure_count / total * 100:.1f}%)"
        )
        if self.skipped_count > 0:
            print(
                f"Skipped modules: {self.skipped_count} ({self.skipped_count / total * 100:.1f}%)"
            )
        if self.graceful_failures:
            print(f"\n🟡 GRACEFULLY HANDLED ({len(self.graceful_failures)}):")
            print("-" * 40)
        if self.failures:
            print(f"\n❌ FAILED IMPORTS ({len(self.failures)}):")
            print("-" * 40)
            for module_name, error_msg in self.failures.items():
                print(f"\n{module_name}")
                # Show only non-empty traceback lines to keep output readable.
                for line in error_msg.split("\n"):
                    if line.strip():
                        print(f"  {line}")
@click.command()
@click.option(
    "--package-name",
    required=True,
    help="Package name to check imports for",
)
def main(package_name: str):
    """CLI entry point: run the import check for one package.

    Exits with status 0 when every module imports cleanly (or fails
    gracefully), and 1 otherwise.
    """
    import_checker = ImportChecker(package_name=package_name)
    all_ok = import_checker.check_all_imports()
    exit(0 if all_ok else 1)


if __name__ == "__main__":
    main()
#!/bin/bash
set -euxo pipefail

# Set up a second checkout of megatron-lm ("megatron-lm-legacy") at an older
# reference, then overlay the current megatron/ package and test harness on
# top of it for backwards-compatibility testing.

# Default values
MCORE_REPO="https://github.com/nvidia/megatron-lm.git"
MCORE_MR_COMMIT="main" # NOTE(review): unused in this script — confirm before removing
MCORE_BACKWARDS_COMMIT=""

# Print usage information and exit with a non-zero status.
usage() {
    cat <<EOF
Usage: $0 [OPTIONS]

Clone and setup megatron-lm repositories for testing.

Options:
    --repo URL                Git repository URL (default: $MCORE_REPO)
    --backwards-commit COMMIT Commit hash or reference for the backwards compatibility test
    --help                    Show this help message

Example:
    $0 --repo $MCORE_REPO \\
       --backwards-commit core_r0.12.0
EOF
    exit 1
}

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
    --repo)
        MCORE_REPO="$2"
        shift 2
        ;;
    --backwards-commit)
        MCORE_BACKWARDS_COMMIT="$2"
        shift 2
        ;;
    --help)
        usage
        ;;
    *)
        echo "Unknown option: $1"
        usage
        ;;
    esac
done

# Validate required arguments
if [[ -z "${MCORE_BACKWARDS_COMMIT:-}" ]]; then
    echo "Error: --backwards-commit is required"
    usage
fi

# Check out the backwards reference into a fresh megatron-lm-legacy/ tree.
rm -rf megatron-lm-legacy
mkdir megatron-lm-legacy
pushd megatron-lm-legacy
git init
# Quote expansions so URLs/refs containing shell-special characters survive
# word splitting and glob expansion intact.
git remote add origin "$MCORE_REPO"
git fetch origin "$MCORE_BACKWARDS_COMMIT"
git checkout "$MCORE_BACKWARDS_COMMIT"
git rev-parse HEAD

# Replace the legacy megatron/ package with the current one.
rm -rf megatron
cp -a ../megatron-lm/megatron ./
popd

# Copy unit test script and project config from the current checkout.
cp megatron-lm/tests/unit_tests/run_ci_test.sh megatron-lm-legacy/tests/unit_tests/run_ci_test.sh
cp megatron-lm/pyproject.toml megatron-lm-legacy/pyproject.toml
\ No newline at end of file
# Pull in GitLab's maintained secret-detection CI template.
include:
  - template: Security/Secret-Detection.gitlab-ci.yml
# Shared rule set for `.pre`-stage jobs: run only on merged-result MR
# pipelines; tolerate failure when the target branch is not protected.
.pre_rules:
  rules:
    # Unprotected target branch: still run, but don't block on failure.
    - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
      allow_failure: true
      when: always
    # Protected target branch: run and enforce the result.
    - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result'
      when: always
    # Any other pipeline source: never run.
    - when: never
  stage: .pre
# Shared Docker-in-Docker setup: prune stale images, then log in to nvcr.io
# and the GitLab registry before the extending job's own script runs.
.dind_rules:
  image: docker:26.1.4-dind
  variables:
    DOCKER_HOST: unix:///var/run/docker.sock
  before_script:
    # Best-effort cleanup of resources older than 36h; never fail the job over it.
    - docker system prune -a --filter "until=36h" -f || true
    # NGC expects the literal user name `$oauthtoken` — single quotes are intentional.
    - echo "$NGC_API_KEY" | docker login nvcr.io -u '$oauthtoken' --password-stdin
    - echo "$CI_REGISTRY_PASSWORD" | docker login $CI_REGISTRY -u $CI_REGISTRY_USER --password-stdin
# Mirror pushes on `main`/`dev` to the public GitHub repository.
pre:mirror_to_github:
  rules:
    - if: '($CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH == "dev") && $CI_PIPELINE_SOURCE == "push"'
      allow_failure: true
    - when: never
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  stage: .pre
  image: python:3.10
  variables:
    # Full clone so the branch history can be pushed to the mirror.
    GIT_STRATEGY: "clone"
  script:
    - git checkout $CI_COMMIT_BRANCH
    # `|| true`: the remote may already exist when the job is retried.
    - git remote add github https://ko3n1g:$GH_TOKEN@github.com/NVIDIA/Megatron-LM.git || true
    - git push -u github $CI_COMMIT_BRANCH
  retry:
    max: 2
# On every push to `main`, force-recreate the helper CI branches (one
# parallel job per branch) that scheduled pipelines run from.
pre:create_ci_branches:
  rules:
    - if: '$CI_COMMIT_BRANCH == "main" && $CI_PIPELINE_SOURCE == "push"'
      allow_failure: true
    - when: never
  parallel:
    matrix:
      # Each entry spawns one job with $branch set accordingly.
      - branch: ci-unit-test-extended
      - branch: ci-rebuild-mcore-nemo-image
      - branch: ci-mr
      - branch: ci-nightly
      - branch: ci-weekly
      - branch: ci-pre-release
      - branch: ci-review-reminder
      - branch: ci-upgrade-dependencies
      - branch: ci-approve-main
      - branch: ci-approve-dev
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  stage: .pre
  image: python:3.10
  variables:
    GIT_STRATEGY: "clone"
  script:
    # Re-authenticate origin with a project access token so the job may push.
    - git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/adlr/megatron-lm.git"
    - git switch --force-create $branch
    - git push --force -u origin $branch
  retry:
    max: 2
# Same as pre:create_ci_branches, but for pushes to `dev` and the
# dev-specific helper branches.
pre:create_ci_branches_dev:
  rules:
    - if: '$CI_COMMIT_BRANCH == "dev" && $CI_PIPELINE_SOURCE == "push"'
      allow_failure: true
    - when: never
  parallel:
    matrix:
      # Each entry spawns one job with $branch set accordingly.
      - branch: ci-dev-unit-test-extended
      - branch: ci-dev-rebuild-mcore-nemo-image
      - branch: ci-dev-mr
      - branch: ci-dev-nightly
      - branch: ci-dev-upgrade-dependencies
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  stage: .pre
  image: python:3.10
  variables:
    GIT_STRATEGY: "clone"
  script:
    # Re-authenticate origin with a project access token so the job may push.
    - git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/adlr/megatron-lm.git"
    - git switch --force-create $branch
    - git push --force -u origin $branch
  retry:
    max: 2
# Auto-label the merge request from changed paths (.gitlab/labeler-config.yml)
# and maintain the "ParallelState" label from the diff contents.
pre:label_merge_request:
  extends: [.pre_rules]
  image: golang:1.22
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  before_script:
    # Build the labeler tool and the gojq JSON processor from source.
    - git clone -b nv https://${GITLAB_ENDPOINT}/okoenig/gitlab-mr-labeler.git
    - cd gitlab-mr-labeler
    - go install .
    - cd ..
    - go install github.com/itchyny/gojq/cmd/gojq@latest
  script:
    - set -x
    # Fetch current MR metadata (labels live in the .labels array).
    - |
      LABELS=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}")
    # Drop "ParallelState" first; re-add it below only if the diff touches it.
    - LABELS=$(echo "$LABELS" | gojq '.labels -= ["ParallelState"]')
    - |
      if git --no-pager diff --merge-base origin/${CI_MERGE_REQUEST_TARGET_BRANCH_NAME} -- 'megatron/core/' | grep -q 'parallel_state'; then
        LABELS=$(echo "$LABELS" | gojq '.labels += ["ParallelState"]')
        echo "$LABELS"
      fi
    # Persist the computed label list for after_script (shell vars don't carry over).
    - echo LABELS=$(echo "$LABELS" | gojq '.labels | join(",")') > labels
    - gitlab-mr-labeler -f .gitlab/labeler-config.yml -t ${PROJECT_ACCESS_TOKEN_MCORE} --debug true
    - cat labels
  after_script:
    # Apply the preserved labels on top of whatever the labeler set.
    - |
      source labels
      curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" --data-urlencode "add_labels=$LABELS" -X PUT
# When an MR labelled `mirror-to-main` merges into `dev`, cherry-pick its
# commit range onto a fresh `cp/<MR>-into-main` branch and open a follow-up
# MR into `main`, with the original author as reviewer.
pre:maybe_cherry_pick_to_main:
  rules:
    - if: "$CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == 'dev' && $CI_MERGE_REQUEST_LABELS =~ /mirror-to-main/"
    - when: never
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  stage: .pre
  image: nentangso/alpine-git-curl-jq
  variables:
    GIT_STRATEGY: "clone"
  script:
    - |
      set -x
      MR_ID=$CI_MERGE_REQUEST_IID
      TARGET_BRANCH="cp/$MR_ID-into-main"

      # Idempotency guard: if the cherry-pick branch already exists remotely,
      # a previous run already handled this MR.
      TARGET_BRANCH_EXISTS_OK=$([[ "$(git ls-remote --heads origin refs/heads/$TARGET_BRANCH)" != "" ]] && echo true || echo false)
      if [[ "$TARGET_BRANCH_EXISTS_OK" == "true" ]]; then
        echo Target branch already exists, will not cherry-pick again.
        exit 0
      fi

      # Fetch MR metadata once. "$MR" is quoted so the JSON payload is not
      # subject to word splitting / glob expansion before jq parses it.
      MR=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${MR_ID}")
      LABELS=$(echo -E "$MR" | jq '.labels | join(",")' | tr -d '"')
      AUTHOR_ID=$(echo -E "$MR" | jq '.author.id' | tr -d '"')
      AUTHOR_NAME=$(echo -E "$MR" | jq '.author.username' | tr -d '"')
      TITLE=$(echo -E "$MR" | jq '.title' | tr -d '"')
      MILESTONE_ID=$(echo -E "$MR" | jq '.milestone.id' | tr -d '"')

      git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/$CI_PROJECT_PATH.git"
      git remote add mr-origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/$CI_MERGE_REQUEST_SOURCE_PROJECT_PATH.git"
      git config --global user.email "mcore-bot@nvidia.com"
      git config --global user.name "Mcore Bot"
      git fetch origin dev
      git fetch mr-origin $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME
      git checkout $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME

      # Commit range of the MR: merge-base with dev up to the source HEAD.
      START_COMMIT=$(git merge-base origin/dev mr-origin/$CI_MERGE_REQUEST_SOURCE_BRANCH_NAME)
      END_COMMIT=$(git rev-parse HEAD)

      git fetch origin main
      git checkout main
      git checkout -b $TARGET_BRANCH
      git cherry-pick $START_COMMIT..$END_COMMIT
      git push -u origin $TARGET_BRANCH

      # Open the follow-up MR into main.
      curl \
        --header "PRIVATE-TOKEN: $PAT" \
        --url https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests \
        -d "source_branch=$TARGET_BRANCH" \
        -d "target_branch=main" \
        -d "title=cp MR !$MR_ID from dev: \`$TITLE\`" \
        -d "labels=cherry-picked-from-dev" \
        -d "reviewer_ids=$AUTHOR_ID" \
        -d "milestone_id=$MILESTONE_ID" \
        -d "description=[🤖]: Hi @$AUTHOR_NAME 👋,<br><br>we've cherry picked \`$TITLE (!$MR_ID)\` into \`main\` for you! 🚀<br><br>Please review and approve this cherry pick by your convenience\!"
# On pushes to `main`, inspect the newest non-merge commit's MR: for every
# `core_*` release label, cherry-pick the commit onto that release branch and
# open an MR — or notify Slack if the cherry-pick fails.
pre:maybe_cherry_pick_commit:
  rules:
    - if: '$CI_COMMIT_BRANCH == "main" && $CI_PIPELINE_SOURCE == "push"'
    - when: never
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  stage: .pre
  image: nentangso/alpine-git-curl-jq
  variables:
    GIT_STRATEGY: "clone"
  script:
    - set -x
    # Best-effort: keep going on errors so one bad branch doesn't stop the rest.
    - set +e
    - SHA=$(git rev-list --no-merges -n 1 HEAD)
    - MESSAGE=$(git log -n 1 --pretty=format:%s $SHA)
    # MR id is parsed from the "!<id>" reference in the commit subject.
    - MR_ID=$(echo $MESSAGE | awk -F'!' '{print $2}' | awk '{print $1}' )
    - git remote set-url origin "https://gitlab-ci-token:${PROJECT_ACCESS_TOKEN_MCORE}@${GITLAB_ENDPOINT}/$CI_PROJECT_NAMESPACE/megatron-lm.git"
    - git config --global user.email "mcore-bot@nvidia.com"
    - git config --global user.name "Mcore Bot"
    - |
      # "$MR" is quoted so the JSON payload is not subject to word splitting
      # or glob expansion before jq parses it.
      MR=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${MR_ID}")
      LABELS=$(echo -E "$MR" | jq '.labels | join(",")' | tr -d '"')
      AUTHOR_ID=$(echo -E "$MR" | jq '.author.id' | tr -d '"')
      AUTHOR_NAME=$(echo -E "$MR" | jq '.author.username' | tr -d '"')
      TITLE=$(echo -E "$MR" | jq '.title' | tr -d '"')
      MILESTONE_ID=$(echo -E "$MR" | jq '.milestone.id' | tr -d '"')
      TARGET_BRANCHES=$(echo "$LABELS" | grep -o 'core_[^,]*')

      if [[ $TARGET_BRANCHES == "" ]]; then
        echo Nothing to cherry pick
        exit 0
      fi

      # BUGFIX: "$TARGET_BRANCHES" must be quoted — grep -o emits one branch
      # per line, and an unquoted expansion collapses the list onto a single
      # line, so `read` would see all branches joined as one value.
      echo "$TARGET_BRANCHES" | while read -r RELEASE_BRANCH ; do
        TARGET_BRANCH_EXISTS_OK=$([[ "$(git ls-remote --heads origin refs/heads/$RELEASE_BRANCH)" != "" ]] && echo true || echo false)
        if [[ "$TARGET_BRANCH_EXISTS_OK" == "false" ]]; then
          echo Release branch does not yet exist, will not cherry-pick
          continue
        fi

        # Subshell: a failed cherry-pick doesn't abort the loop; its exit
        # status selects MR creation vs Slack alert below.
        (
          git fetch origin $RELEASE_BRANCH:$RELEASE_BRANCH
          git switch --force-create cherry-pick-$MR_ID-$RELEASE_BRANCH $RELEASE_BRANCH
          git cherry-pick $SHA
          git push -u origin --force cherry-pick-$MR_ID-$RELEASE_BRANCH
          git checkout main
        )
        CHERRYPICK_SUCCESSFUL=$?

        if [[ $CHERRYPICK_SUCCESSFUL -eq 0 ]]; then
          curl \
            --header "PRIVATE-TOKEN: $PAT" \
            --url https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests \
            -d "source_branch=cherry-pick-$MR_ID-$RELEASE_BRANCH" \
            -d "target_branch=$RELEASE_BRANCH" \
            -d "title=Cherry pick \`$TITLE ($MR_ID)\` into \`$RELEASE_BRANCH\`" \
            -d "labels=cherry-pick" \
            -d "reviewer_ids=$AUTHOR_ID" \
            -d "milestone_id=$MILESTONE_ID" \
            -d "description=[🤖]: Hi @$AUTHOR_NAME 👋,<br><br>we've cherry picked \`$TITLE ($MR_ID)\` into \`$RELEASE_BRANCH\` for you! 🚀<br><br>Please review and approve this cherry pick by your convenience\!"
        else
          URL=https://${GITLAB_ENDPOINT}/ADLR/megatron-lm/-/merge_requests/$MR_ID
          MESSAGE='{
            "blocks": [
              {
                "type": "section",
                "text": {
                  "type": "mrkdwn",
                  "text": "beep boop 🤖: Cherry-pick of <'$URL'|!'$MR_ID'> failed\ncc '$SLACK_ADMIN'"
                }
              }
            ]
          }'
          curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${MCORE_NOTIFICATION_HOOK}
        fi
      done
  interruptible: false
# Ensure every MR has a milestone: if none is set, apply the most recent
# active milestone (ordered by due date, descending).
pre:check_milestone:
  extends: [.pre_rules]
  image: badouralix/curl-jq
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  script:
    - env
    # Read the MR's current milestone (JSON `null` when unset).
    - |
      MILESTONE=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" | jq '.milestone')
    - |
      if [[ "$MILESTONE" == "null" ]]; then
        LATEST_MILESTONE=$(curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/milestones?state=active&order_by=due_date&sort=desc" | jq '.[0].id')
        curl --request PUT --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" --data "milestone_id=${LATEST_MILESTONE}"
        echo "Applied latest milestone (ID: ${LATEST_MILESTONE}) to this MR"
      fi
# Merge-train gate: blocks the train until the target branch is healthy.
# MRs labelled `fast-track` bypass the check entirely.
pre:check_status_of_main:
  extends: [.pre_rules]
  image: python:3.10
  # Long timeout: the checker script may poll for an extended period.
  timeout: 7 days
  variables:
    KUBERNETES_SERVICE_MEMORY_REQUEST: 32Gi
    KUBERNETES_SERVICE_MEMORY_LIMIT: 32Gi
    KUBERNETES_SERVICE_CPU_REQUEST: 8
    KUBERNETES_SERVICE_CPU_LIMIT: 12
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  script:
    - env
    - pip install --no-cache-dir python-gitlab click
    - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
    - export GITLAB_ENDPOINT
    - python tests/test_utils/python_scripts/check_status_of_main.py --target-branch "$CI_MERGE_REQUEST_TARGET_BRANCH_NAME"
  rules:
    # These rules replace the ones inherited from .pre_rules.
    - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train' && $CI_MERGE_REQUEST_LABELS =~ /fast-track/
      when: never
    - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train'
      when: always
    - when: never
# Scheduled job (on the ci-approve-* branches): check the health of the
# target branch and approve or reject every GitHub Actions run waiting on
# the "merge-gate" environment accordingly.
pre:approve_merge_gate:
  extends: [.pre_rules]
  # NOTE(review): the script below calls `python` — confirm the maniator/gh
  # image ships a Python interpreter and the checker's dependencies.
  image: maniator/gh
  tags:
    - arch/amd64
    - env/prod
    - origin/jet-fleet
    - owner/jet-core
    - purpose/utility
    - team/megatron
  script:
    - |
      set -eoux pipefail
      # Health probe: non-zero exit means the branch is unhealthy.
      EXIT_CODE=0
      python tests/test_utils/python_scripts/check_status_of_main.py --target-branch "$CI_COMMIT_BRANCH" --once || EXIT_CODE=$?
      export GH_TOKEN=$GH_TOKEN
      export REPO=NVIDIA/Megatron-LM
      export TARGET_BRANCH="$CI_COMMIT_BRANCH"
      if [[ $EXIT_CODE -eq 0 ]]; then
        STATUS="approved"
        COMMENT="Main is healthy. Submitting PR."
      else
        STATUS="rejected"
        COMMENT="Main is not healthy. An automation engineer is investigating. No need to take any action."
      fi
      # For every workflow run waiting on a deployment approval, resolve the
      # PR it belongs to and approve/reject the "merge-gate" environment when
      # the PR targets our branch.
      gh api "repos/$REPO/actions/runs?status=waiting" --jq '.workflow_runs[].id' \
        | while read run_id; do
          HEAD_BRANCH=$(gh api "repos/$REPO/actions/runs/$run_id" --jq '.head_branch')
          # PR number is the last path segment of the head branch name.
          PR_NUMBER="${HEAD_BRANCH##*/}"
          if [ -n "$PR_NUMBER" ]; then
            PR_BASE=$(gh api "repos/$REPO/pulls/$PR_NUMBER" --jq '.base.ref')
            if [ "$PR_BASE" = "$TARGET_BRANCH" ]; then
              gh api \
                --method POST "repos/$REPO/actions/runs/$run_id/pending_deployments" \
                -F "environment_ids[]=$(gh api "repos/$REPO/environments" --jq '.environments[] | select(.name=="merge-gate") | .id')" \
                -f state="$STATUS" \
                -f comment="$COMMENT";
            fi
          fi
        done
  retry:
    max: 2
  rules:
    # These rules replace the ones inherited from .pre_rules.
    - if: $CI_PIPELINE_SOURCE == "schedule" && ($CI_COMMIT_BRANCH == 'ci-approve-dev' || $CI_COMMIT_BRANCH == 'ci-approve-main')
      when: always
    - when: never
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment