# Variable set shared (via YAML alias) by the "normal merge-train" rule and
# the "Run tests" label rule in `workflow:rules` below.
.merge_train_rule: &merge_train_rule
  UNIT_TEST: "yes"
  UNIT_TEST_REPEAT: "1"
  UNIT_TEST_TIMEOUT: "30"
  INTEGRATION_TEST: "yes"
  INTEGRATION_TEST_SCOPE: mr
  FUNCTIONAL_TEST: "yes"
  FUNCTIONAL_TEST_SCOPE: mr-slim
  FUNCTIONAL_TEST_REPEAT: "5"
  FUNCTIONAL_TEST_TIME_LIMIT: "2700"
  CLUSTER_A100: ""
  CLUSTER_H100: ""
  PUBLISH: "no"

# Decides whether a pipeline is created at all and which variable overrides
# it starts with. The order of the rules matters: the more specific
# label-based rules are listed before the generic merge-train / MR rules.
workflow:
  rules:
    # Do not trigger for forks
    - if: $CI_PROJECT_NAMESPACE != "ADLR" || ($CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_PROJECT_PATH != "ADLR/megatron-lm")
      when: never

    # ci-branches only for schedule
    - if: $CI_COMMIT_BRANCH =~ /ci-/ && $CI_PIPELINE_SOURCE != "schedule"
      when: never

    # For scheduled pipelines
    - if: $CI_PIPELINE_SOURCE == "schedule"
      auto_cancel:
        on_new_commit: none

    # For manual pipelines
    - if: $CI_PIPELINE_SOURCE == "web"

    # For push to main
    - if: $CI_PIPELINE_SOURCE == 'push' && $CI_COMMIT_REF_PROTECTED == "true"
      variables:
        UNIT_TEST: "no"
        INTEGRATION_TEST: "no"
        FUNCTIONAL_TEST: "yes"
        FUNCTIONAL_TEST_SCOPE: mr
        FUNCTIONAL_TEST_REPEAT: "5"
        FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no"
        FUNCTIONAL_TEST_TIME_LIMIT: "2700"
        CLUSTER_A100: ""
        CLUSTER_H100: ""
        PUBLISH: "no"
      auto_cancel:
        on_new_commit: none

    # For merge-trains that need to be fast-tracked
    - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train' && $CI_MERGE_REQUEST_LABELS =~ /fast-track/
      variables:
        UNIT_TEST: "yes"
        UNIT_TEST_REPEAT: "1"
        UNIT_TEST_TIMEOUT: "30"
        INTEGRATION_TEST: "no"
        FUNCTIONAL_TEST: "no"
        CLUSTER_A100: ""
        CLUSTER_H100: ""
        PUBLISH: "no"

    # For normal merge-trains
    - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merge_train'
      variables: *merge_train_rule

    # For MRs with integration suite
    - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run tests/
      variables: *merge_train_rule

    # For MRs with nightly
    - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run nightly/
      variables:
        UNIT_TEST: "yes"
        UNIT_TEST_REPEAT: "1"
        UNIT_TEST_TIMEOUT: "30"
        INTEGRATION_TEST: "no"
        FUNCTIONAL_TEST: "yes"
        FUNCTIONAL_TEST_SCOPE: nightly
        FUNCTIONAL_TEST_REPEAT: "5"
        FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no"
        FUNCTIONAL_TEST_TIME_LIMIT: "2700"
        CLUSTER_A100: ""
        CLUSTER_H100: ""
        PUBLISH: "no"

    # For MRs with weekly
    - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run weekly/
      variables:
        UNIT_TEST: "yes"
        UNIT_TEST_REPEAT: "1"
        UNIT_TEST_TIMEOUT: "30"
        INTEGRATION_TEST: "no"
        FUNCTIONAL_TEST: "yes"
        FUNCTIONAL_TEST_SCOPE: weekly
        FUNCTIONAL_TEST_REPEAT: "1"
        FUNCTIONAL_TEST_RECORD_CHECKPOINTS: "no"
        FUNCTIONAL_TEST_TIME_LIMIT: "9000"
        CLUSTER_A100: ""
        CLUSTER_H100: ""
        PUBLISH: "no"

    # For MRs with heavy suite
    - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result' && $CI_MERGE_REQUEST_LABELS =~ /Run functional tests/
      variables:
        UNIT_TEST: "yes"
        UNIT_TEST_REPEAT: "1"
        UNIT_TEST_TIMEOUT: "30"
        INTEGRATION_TEST: "no"
        FUNCTIONAL_TEST: "yes"
        FUNCTIONAL_TEST_SCOPE: mr
        FUNCTIONAL_TEST_REPEAT: "5"
        FUNCTIONAL_TEST_TIME_LIMIT: "2700"
        CLUSTER_A100: ""
        CLUSTER_H100: ""
        PUBLISH: "no"

    # Default MRs
    - if: $CI_MERGE_REQUEST_EVENT_TYPE == 'merged_result'
      variables:
        UNIT_TEST: "yes"
        UNIT_TEST_REPEAT: "1"
        UNIT_TEST_TIMEOUT: "30"
        INTEGRATION_TEST: "no"
        FUNCTIONAL_TEST: "no"
        PUBLISH: "no"

    # Anything that matched none of the rules above creates no pipeline
    - when: never

  # Workflow-wide default; the schedule and push rules above override it
  # with `on_new_commit: none`.
  auto_cancel:
    on_new_commit: interruptible

stages:
  - build
  - test
  - integration_tests
  - functional_tests
  - publish

default:
  interruptible: true
  retry:
    max: 2
    when: runner_system_failure

# Global variables. Entries with `value`/`options`/`description` declare
# their default, allowed choices and help text; the `workflow:rules` above
# override several of them per pipeline type.
variables:
  UNIT_TEST:
    value: "yes"
    options:
      - "yes"
      - "no"
    description: To run the unit test suite
  UNIT_TEST_REPEAT:
    value: "1"
    description: "Number of repetitions"
  UNIT_TEST_TIMEOUT:
    value: "30"
    description: Timeout (minutes) for Unit tests (all repeats)
  INTEGRATION_TEST:
    value: "yes"
    options:
      - "yes"
      - "no"
    description: To run the integration test suite
  INTEGRATION_TEST_SCOPE:
    value: "mr"
    options:
      - "mr"
      - "nightly"
      - "weekly"
      - "pre-release"
      - "release"
    description: "Testsuite to run (only for INTEGRATION_TEST=yes)"
  INTEGRATION_TEST_TIME_LIMIT:
    value: "900"
    description: "Timeout in seconds per test"
  INTEGRATION_TEST_CASES:
    value: "all"
    description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite."
  FUNCTIONAL_TEST:
    value: "yes"
    options:
      - "yes"
      - "no"
    description: To run the functional test suite
  FUNCTIONAL_TEST_SCOPE:
    value: "mr"
    options:
      - "mr"
      - "nightly"
      - "weekly"
      - "pre-release"
      - "release"
    description: "Testsuite to run (only for FUNCTIONAL_TEST=yes)"
  FUNCTIONAL_TEST_REPEAT:
    value: "5"
    description: "Number of repetitions per test"
  FUNCTIONAL_TEST_TIME_LIMIT:
    value: "2700"
    description: "Timeout in seconds per test"
  FUNCTIONAL_TEST_CASES:
    value: "all"
    description: "Comma-separated list of test_cases to run. Use 'all' to run the full suite."
  FUNCTIONAL_TEST_NAME:
    description: "Name of functional test run (only for pre-release and release)"
    # `$$` is kept verbatim from the original; NOTE(review): presumably it
    # escapes the `$` so the reference is not expanded here - confirm.
    value: "$$CI_COMMIT_SHA"
  FUNCTIONAL_TEST_RECORD_CHECKPOINTS:
    value: "no"
    description: "Record golden checkpoints"
    options:
      - "yes"
      - "no"
  CLUSTER_A100:
    value: "dgxa100_dracooci"
    options:
      - "dgxa100_dracooci"
      - "dgxa100_dracooci-ord"
    description: "Cluster for A100 workloads"
  CLUSTER_H100:
    value: "dgxh100_coreweave"
    options:
      - "dgxh100_coreweave"
      - "dgxh100_eos"
    description: "Cluster for H100 workloads"
  PUBLISH:
    value: "no"
    options:
      - "yes"
      - "no"
    description: Build and publish a wheel to PyPi
  PUBLISH_COMMIT:
    value: "$$CI_COMMIT_SHA"
    description: Which commit to publish
  PUBLISH_VERSION_BUMP_BRANCH:
    value: "$$CI_COMMIT_BRANCH"
    description: Which branch to target for version bump
  PUBLISH_SCOPE:
    value: "code-freeze"
    options:
      - "code-freeze"
      - "release"
      - "review-reminder"
    description: Type of publish (freeze or final release)

  # CI wide variables
  CI_MCORE_LTS_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_lts
  CI_MCORE_DEV_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_dev
  CI_NEMO_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/nemo_ci
  UTILITY_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_utility
  TE_GIT_REF: ""

include:
  - .gitlab/stages/00.pre.yml
  - .gitlab/stages/01.build.yml
  - .gitlab/stages/02.test.yml
  - .gitlab/stages/03.integration-tests.yml
  - .gitlab/stages/04.functional-tests.yml
  - .gitlab/stages/05.publish.yml