# NOTE(review): this part of the workflow is stored as a few very long physical
# lines. Because the first of them begins with `#`, a YAML parser reads everything
# up to the first line break as a single comment; the original line structure has
# to be restored before the file can be loaded.
#
# Content of this span, in order of appearance:
#   - Apache-2.0 license header.
#   - Workflow `CICD Megatron-LM`: triggered by the `0 0 * * *` schedule, by
#     pushes to `dev`, `main`, `pull-request/[0-9]+` and `deploy-release/*`, by
#     `merge_group` (checks_requested) and by `workflow_dispatch`. Concurrency
#     groups use `cancel-in-progress: true`.
#   - `is-not-external-contributor`: asks the GitHub API whether the PR author is
#     a repo collaborator or a member of the NVIDIA-NeMo / NVIDIA orgs (HTTP 204
#     counts as "maintainer"). Scheduled, main-branch and merge-group runs are
#     treated as maintainer runs without the lookup. Non-maintainers get a PR
#     comment and the final `exit` step fails the job.
#   - `pre-flight`: calls the reusable workflow `_cicd_preflight.yml@v0.65.10`.
#   - `linting`: installs the linting group with uv and, on pull-request
#     branches, runs `tools/autoformat.sh` with `CHECK_ONLY=true`.
#   - `cicd-wait-in-queue`: gate job bound to the `merge-gate` or `test`
#     environment.
#   - `cicd-container-build`: downloads test data, pulls cache images and lists
#     the 100 most recently updated merged PRs.
#
# NOTE(review): text is missing between `echo "LAST_PRS<` and `unit-tests.json`
# near the end of this span. The remainder of `cicd-container-build` and the head
# of the `cicd-parse-unit-tests` job (referenced further down through
# `needs.cicd-parse-unit-tests`) are presumably lost - restore from version
# control before editing this region.
# NOTE(review): `body-includes: ""` in the "Find Comment" step is empty; confirm
# whether a marker string was lost here, because the following step deletes
# whichever comment that step returns.
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. name: CICD Megatron-LM on: schedule: - cron: 0 0 * * * push: branches: - dev - main - "pull-request/[0-9]+" - "deploy-release/*" merge_group: types: [checks_requested] workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }} cancel-in-progress: true permissions: id-token: write contents: read env: container-registry: 766267172432.dkr.ecr.us-east-1.amazonaws.com jobs: is-not-external-contributor: runs-on: ubuntu-latest environment: nemo-ci outputs: is_external_contributor: ${{ github.event.pull_request.user.type == 'User' }} permissions: issues: write pull-requests: write env: GITHUB_TOKEN: ${{ secrets.PAT }} REPO: ${{ github.repository }} steps: - name: Checkout repository uses: actions/checkout@v4 with: token: ${{ env.GITHUB_TOKEN }} - name: Get PR info id: get-pr-info if: startsWith(github.ref, 'refs/heads/pull-request/') uses: nv-gha-runners/get-pr-info@main - name: Check membership id: check-membership env: IS_MAIN_BRANCH: ${{ github.ref == 'refs/heads/main' }} IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }} SCHEDULED_JOB: ${{ github.event_name == 'schedule' }} run: | PR_AUTHOR=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }} if [ "${{ env.SCHEDULED_JOB }}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ] || [ 
"${IS_MERGE_GROUP}" == "true" ]; then echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT exit 0 fi echo "Checking if $PR_AUTHOR is a repo collaborator..." API_URL="https://api.github.com/repos/$REPO/collaborators/$PR_AUTHOR" REPO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \ -H "Accept: application/vnd.github+json" \ -H "Authorization: Bearer $GITHUB_TOKEN" \ -H "X-GitHub-Api-Version: 2022-11-28" \ $API_URL) echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA-NeMo..." API_URL="https://api.github.com/orgs/NVIDIA-NeMo/members/$PR_AUTHOR" ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \ -H "Accept: application/vnd.github+json" \ -H "Authorization: Bearer $GITHUB_TOKEN" \ -H "X-GitHub-Api-Version: 2022-11-28" \ $API_URL) echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA..." API_URL="https://api.github.com/orgs/NVIDIA/members/$PR_AUTHOR" ORG_NVIDIA_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \ -H "Accept: application/vnd.github+json" \ -H "Authorization: Bearer $GITHUB_TOKEN" \ -H "X-GitHub-Api-Version: 2022-11-28" \ $API_URL) if [ "$REPO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_MEMBERSHIP_RESPONSE" -eq 204 ]; then echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT else echo "is_maintainer=false" | tee -a $GITHUB_OUTPUT fi - name: Find Comment uses: peter-evans/find-comment@v4 if: startsWith(github.ref, 'refs/heads/pull-request/') id: fc with: issue-number: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} repository: ${{ github.repository }} body-includes: "" - name: Delete comment uses: actions/github-script@v7 if: startsWith(github.ref, 'refs/heads/pull-request/') && steps.fc.outputs.comment-id != '' with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | await github.rest.issues.deleteComment({ owner: context.repo.owner, repo: context.repo.repo, comment_id: ${{ steps.fc.outputs.comment-id }} }) - 
name: Write pull request comment if: startsWith(github.ref, 'refs/heads/pull-request/') && steps.check-membership.outputs.is_maintainer == 'false' uses: peter-evans/create-or-update-comment@v5 with: issue-number: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} repository: ${{ github.repository }} body: | Thank you for your contribution! NVIDIA Megatron-LM is currently transitioning to development on Github. We will aim to review your PR after we complete our transition and stabilize our Github development process. Thank you for your understanding. - name: exit run: | if [ "${{ steps.check-membership.outputs.is_maintainer }}" == "true" ]; then exit 0 else exit 1 fi pre-flight: needs: [is-not-external-contributor] uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.65.10 linting: runs-on: ubuntu-latest needs: [pre-flight] if: | ( needs.pre-flight.outputs.is_deployment_workflow == 'false' && needs.pre-flight.outputs.is_ci_workload == 'true' ) || ( needs.pre-flight.outputs.is_deployment_workflow == 'false' && needs.pre-flight.outputs.is_ci_workload == 'false' && needs.pre-flight.outputs.docs_only == 'false' ) steps: - name: Checkout uses: actions/checkout@v4 with: fetch-depth: 0 - name: Install uv uses: astral-sh/setup-uv@v1 with: version: 0.7.2 - name: Install linting tools run: | uv sync --locked --only-group linting - name: Get PR info id: get-pr-info if: startsWith(github.ref, 'refs/heads/pull-request/') uses: nv-gha-runners/get-pr-info@main - name: Run linting if: startsWith(github.ref, 'refs/heads/pull-request/') run: | export PATH=".venv/bin:$PATH" export GITLAB_ENDPOINT=github.com export CI_PROJECT_NAMESPACE=NVIDIA export BASE_REF="${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}" export CHECK_ONLY=true export SKIP_DOCS=false bash tools/autoformat.sh cicd-wait-in-queue: runs-on: ubuntu-latest needs: [pre-flight, linting] environment: ${{ needs.pre-flight.outputs.is_merge_group == 'true' && 'merge-gate' || 
'test' }} if: | !(needs.pre-flight.outputs.is_ci_workload == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.docs_only == 'true') steps: - name: Running CI tests run: | echo "Running CI tests" echo "is_merge_group: ${{ needs.pre-flight.outputs.is_merge_group }}" cicd-container-build: needs: [pre-flight, cicd-wait-in-queue] runs-on: nvidia-ci-aws-gpu-x8 environment: nemo-ci if: | ( success() || needs.pre-flight.outputs.is_ci_workload == 'true' || needs.pre-flight.outputs.force_run_all == 'true' ) && needs.pre-flight.outputs.is_merge_group == 'false' && !cancelled() steps: - name: Checkout uses: actions/checkout@v4 - name: Setup python uses: actions/setup-python@v5 with: python-version: 3.12 - name: Get PR info id: get-pr-info if: startsWith(github.ref, 'refs/heads/pull-request/') uses: nv-gha-runners/get-pr-info@main - name: Download test data shell: bash env: GH_TOKEN: ${{ secrets.PAT }} run: | echo "::group::Download test data" pip install --no-cache-dir pygithub click python tests/test_utils/python_scripts/download_unit_tests_dataset.py --assets-dir ./assets echo "::endgroup::" - name: Install GH CLI shell: bash run: | apt-get update apt-get install -y gh - name: Pull cache run: | docker pull ${{ env.container-registry }}/megatron-lm:main || true docker pull ${{ env.container-registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }} || true - name: Get last merged PR id: cache_from env: GH_TOKEN: ${{ github.token }} run: | LAST_PRS=$(gh api graphql -f query=' query { repository(owner: "NVIDIA", name: "Megatron-LM") { pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) { nodes { number } } } }' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do echo "${{ env.container-registry }}/megatron-lm:$number" done) echo "LAST_PRS< unit-tests.json echo "unit-tests=$(cat unit-tests.json)" | tee -a $GITHUB_OUTPUT 
# NOTE: the definitions below are entries of the workflow's `jobs:` mapping.

# Runs every unit-test bucket emitted by `cicd-parse-unit-tests` (one matrix
# entry per bucket) against the image tagged with the current commit SHA.
cicd-unit-tests-latest:
  strategy:
    fail-fast: false
    matrix:
      include: ${{ fromJson(needs.cicd-parse-unit-tests.outputs.unit-tests) }}
  needs:
    - pre-flight
    - cicd-wait-in-queue
    - cicd-container-build
    - cicd-parse-unit-tests
  runs-on: nvidia-ci-aws-gpu-x8
  name: "${{ matrix.bucket }} - latest"
  environment: nemo-ci
  # Never runs for merge-group events or cancelled runs. Otherwise runs when the
  # upstream jobs succeeded, or when pre-flight marks a CI workload / forced run.
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
    )
    && needs.pre-flight.outputs.is_merge_group == 'false'
    && !cancelled()
  env:
    PIP_DISABLE_PIP_VERSION_CHECK: "1"
    PIP_NO_PYTHON_VERSION_WARNING: "1"
    PIP_ROOT_USER_ACTION: ignore
  steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: main
      uses: ./.github/actions
      with:
        test_case: ${{ matrix.bucket }}
        tag: latest
        # Falls back to 30 when the matrix entry does not define `timeout`.
        timeout: ${{ matrix.timeout || 30 }}
        is_unit_test: "true"
        PAT: ${{ secrets.PAT }}
        container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }}

# Generates the integration-test matrix. The scope is `mr` (lightweight mode)
# when the PR carries the "Run tests" label and `mr-slim` otherwise.
cicd-parse-integration-tests:
  runs-on: ubuntu-latest
  needs:
    - pre-flight
    - cicd-wait-in-queue
    - cicd-container-build
    - cicd-unit-tests-latest
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
    )
    && needs.pre-flight.outputs.is_merge_group == 'false'
    && !cancelled()
  outputs:
    integration-tests: ${{ steps.main.outputs.integration-tests }}
  steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: Get PR info
      id: get-pr-info
      if: startsWith(github.ref, 'refs/heads/pull-request/')
      uses: nv-gha-runners/get-pr-info@main

    - name: Has Run tests label
      id: has-run-tests-label
      env:
        GH_TOKEN: ${{ secrets.PAT }}
      run: |
        PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
        # The lookup is best-effort: a failed or empty result must end up as the
        # string "false" in the variable (the previous `|| echo "false"` only
        # printed the fallback and left the variable empty).
        HAS_RUN_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run tests")') || true
        HAS_RUN_TESTS_LABEL=${HAS_RUN_TESTS_LABEL:-false}
        echo "main=$HAS_RUN_TESTS_LABEL" | tee -a $GITHUB_OUTPUT

    - name: Parse functional tests
      id: main
      env:
        # The step above publishes its result under the output name `main`.
        HAS_RUN_TESTS_LABEL: ${{ steps.has-run-tests-label.outputs.main }}
      run: |
        export PYTHONPATH=$(pwd)

        if [ "$HAS_RUN_TESTS_LABEL" == "true" ]; then
          ARGS=(
            --scope mr
            --enable-lightweight-mode
          )
        else
          ARGS=(
            --scope mr-slim
          )
        fi

        python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
          --n-repeat 5 \
          --time-limit 2700 \
          --test-cases all \
          --container-image mcore_ci_dev \
          --container-tag latest \
          --dependent-job functional:configure \
          --record-checkpoints false \
          --slurm-account gh \
          --no-enable-warmup \
          --environment dev \
          --platform dgx_h100 \
          --cluster ghci \
          ${ARGS[@]} \
          --output-path integration-tests.yaml

        # Reduce the generated pipeline to a sorted list of
        # {"model", "test_case"} objects for use as a job matrix.
        cat integration-tests.yaml | \
          yq -o json 'del(.default, .stages, .workflow) | to_entries | map({"model": .value.stage, "test_case": .key}) | sort_by(.model, .test_case)' | jq -c > integration-tests.json

        echo "integration-tests=$(cat integration-tests.json)" | tee -a "$GITHUB_OUTPUT"

# Runs one job per {model, test_case} pair from `cicd-parse-integration-tests`.
cicd-integration-tests-latest:
  strategy:
    fail-fast: false
    matrix:
      include: ${{ fromJson(needs.cicd-parse-integration-tests.outputs.integration-tests) }}
  needs:
    - pre-flight
    - cicd-wait-in-queue
    - cicd-parse-integration-tests
    - cicd-unit-tests-latest
  runs-on: nvidia-ci-aws-gpu-x8
  name: "${{ matrix.model }}/${{ matrix.test_case }} - latest"
  environment: nemo-ci
  env:
    PIP_DISABLE_PIP_VERSION_CHECK: "1"
    PIP_NO_PYTHON_VERSION_WARNING: "1"
    PIP_ROOT_USER_ACTION: ignore
  if: |
    (
      success()
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.force_run_all == 'true'
    )
    && needs.pre-flight.outputs.is_merge_group == 'false'
    && !cancelled()
  steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: main
      uses: ./.github/actions
      with:
        test_case: ${{ matrix.test_case }}
        model: ${{ matrix.model }}
        tag: latest
        timeout: ${{ matrix.timeout || 30 }}
        is_unit_test: "false"
        PAT: ${{ secrets.PAT }}
        container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }}

# Aggregated status job: fails when any job of this run failed, or when jobs
# were skipped although skipping is not allowed for this kind of run.
Nemo_CICD_Test:
  needs:
    - pre-flight
    - cicd-unit-tests-latest
    - cicd-integration-tests-latest
  # Because of the trailing `|| always()` the parenthesised part is always
  # true, so this job runs unless the workflow run was cancelled.
  if: |
    (
      needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true'
      || needs.pre-flight.outputs.is_ci_workload == 'true'
      || needs.pre-flight.outputs.is_merge_group == 'true'
      || always()
    )
    && !cancelled()
  runs-on: ubuntu-latest
  permissions: write-all
  steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: Get workflow result
      id: result
      shell: bash -x -e -u -o pipefail {0}
      env:
        GH_TOKEN: ${{ github.token }}
        RUN_ID: ${{ github.run_id }}
        SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' }}
      run: |
        # If a `gh` query fails the variable stays empty and the `:-0` defaults
        # below count it as zero.
        FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion == "failure")] | length') || echo 0
        SKIPPED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion == "skipped")] | length') || echo 0

        if [ "${FAILED_JOBS:-0}" -eq 0 ] && ([ "${SKIPPED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]); then
          echo "✅ All previous jobs completed successfully"
          exit 0
        else
          echo "❌ Found $FAILED_JOBS failed job(s)"
          # Show which jobs failed
          gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion == "failure") | .name'
          exit 1
        fi

# Posts a successful `codecov/patch` commit status for runs that produce no
# coverage data (docs-only changes, deployment workflows, merge-group events).
Coverage_Fake:
  runs-on: ubuntu-latest
  needs: [Nemo_CICD_Test, pre-flight]
  # `github.event` is the webhook payload object; the event type lives in
  # `github.event_name`, which is what has to be compared to 'merge_group'.
  if: |
    (
      needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true'
      || github.event_name == 'merge_group'
    )
    && needs.pre-flight.outputs.is_ci_workload == 'false'
    && !cancelled()
  environment: nemo-ci
  steps:
    - name: Generate fake coverage report
      uses: actions/github-script@v7
      with:
        github-token: ${{ secrets.PAT }}
        script: |
          await github.rest.repos.createCommitStatus({
            owner: context.repo.owner,
            repo: context.repo.repo,
            sha: context.sha,
            state: 'success',
            description: 'No code changes - coverage check skipped',
            context: 'codecov/patch'
          });

# Combines the per-job coverage artifacts, uploads the result to Codecov and
# stores the combined `.coverage` file as an artifact.
Coverage:
  runs-on: ubuntu-latest
  # `pre-flight` must be listed here: the `needs` context only exposes outputs
  # of jobs that are direct dependencies, and the condition below reads one.
  needs: [Nemo_CICD_Test, pre-flight]
  if: |
    (
      (needs.pre-flight.outputs.is_ci_workload == 'true' && !failure())
      || success()
    )
    && !cancelled()
  strategy:
    matrix:
      flag: [unit-test]
  steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: Download coverage reports of current branch
      uses: actions/download-artifact@v4
      with:
        pattern: coverage-${{ matrix.flag }}-*

    - name: List coverage files
      # Group the two name patterns so that `-type f` applies to both of them.
      run: find . -type f \( -name "*.xml" -o -name "*.lcov" \)

    - name: Get total coverage of current branch
      shell: bash -x -e -u -o pipefail {0}
      if: always()
      run: |
        pip install coverage

        ls -al .
        ls -al coverage-*/
        coverage combine --keep $(ls coverage-*/.coverage)
        coverage report -i
        rm -rf coverage-*
        ls -al

    - name: Upload coverage reports to Codecov
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        verbose: true
        flags: ${{ matrix.flag }}

    - name: Upload artifacts
      uses: actions/upload-artifact@v4
      with:
        name: coverage-${{ matrix.flag }}-aggregated
        path: |
          .coverage
        include-hidden-files: true