---
# GitLab CI pipeline: builds the CI images, runs unit tests, and (via the
# included jet-tests.yml) the functional test stage.

# Decides whether a pipeline is created at all and seeds FUNCTIONAL_TEST /
# SCOPE / SLURM_CLUSTER. Rules are evaluated top to bottom and the first match
# wins, so the more specific label ("Run tests and nightly") must stay above
# the more general one ("Run tests").
workflow:
  rules:
    - if: '$CI_PIPELINE_SOURCE == "schedule"'
      variables:
        FUNCTIONAL_TEST: "yes"
    # Manually triggered pipelines keep the defaults from `variables:` below.
    - if: '$CI_PIPELINE_SOURCE == "web"'
    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
      variables:
        FUNCTIONAL_TEST: "no"
    - if: '$CI_COMMIT_BRANCH =~ /^core_r/'
      variables:
        FUNCTIONAL_TEST: "no"
    - if: '$CI_MERGE_REQUEST_LABELS =~ /Run tests and nightly/'
      variables:
        FUNCTIONAL_TEST: "yes"
        SLURM_CLUSTER: dgxa100_dracooci
        SCOPE: mr-and-nightly
    - if: '$CI_MERGE_REQUEST_LABELS =~ /Run tests/'
      variables:
        FUNCTIONAL_TEST: "yes"
        SLURM_CLUSTER: dgxa100_dracooci
        SCOPE: mr
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
      variables:
        FUNCTIONAL_TEST: "no"
    # Anything else (e.g. plain branch pushes) creates no pipeline.
    - when: never
  auto_cancel:
    on_new_commit: interruptible

stages:
  - build
  - unit_tests
  - functional_tests

default:
  interruptible: true

variables:
  FUNCTIONAL_TEST: "yes"
  SCOPE:
    value: "mr"
    options:
      - "mr"
      - "nightly"
      - "mr-and-nightly"
      - "weekly"
      - "release"
    description: "Testsuite to run"
  SLURM_CLUSTER:
    value: "dgxa100_dracooci"
    options:
      - "dgxa100_dracooci"
      - "dgxh100_eos"
    description: '"dgxa100_dracooci" for OCI-IAD, "dgxh100_eos" for EOS'
  # CI wide variables
  CI_MCORE_IMAGE: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_ci
  CI_NEMO_IMAGE: gitlab-master.nvidia.com:5005/adlr/megatron-lm/nemo_ci
  LINTING_IMAGE: gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_linting

# Translates SLURM_CLUSTER and SCOPE into JET_CI_BRANCH / JET_CUSTOM_FILTER and
# publishes both through a dotenv report artifact.
# NOTE(review): presumably consumed by the jobs in jet-tests.yml; confirm there.
metadata:
  image: python:3.10
  stage: .pre
  tags:
    - os/linux
  script:
    - set -x
    - env
    - JET_CUSTOM_FILTER="type == 'basic'"
    # Cluster selection: picks the JET branch and restricts to the platform.
    - |
      if [[ $SLURM_CLUSTER == dgxh100_eos ]]; then
        JET_CI_BRANCH=mcore/eos
        JET_CUSTOM_FILTER="$JET_CUSTOM_FILTER and 'dgx_h100' in spec.platforms"
      elif [[ $SLURM_CLUSTER == dgxa100_dracooci ]]; then
        JET_CI_BRANCH=mcore/draco-oci
        JET_CUSTOM_FILTER="$JET_CUSTOM_FILTER and 'dgx_a100' in spec.platforms"
      fi
    # Scope selection: restricts the filter to the requested test suite(s).
    - |
      if [[ $SCOPE == mr ]]; then
        JET_CUSTOM_FILTER="$JET_CUSTOM_FILTER and 'mr' in spec.scope"
      elif [[ $SCOPE == nightly ]]; then
        JET_CUSTOM_FILTER="$JET_CUSTOM_FILTER and 'nightly' in spec.scope"
      elif [[ $SCOPE == mr-and-nightly ]]; then
        JET_CUSTOM_FILTER="$JET_CUSTOM_FILTER and ('mr' in spec.scope or 'nightly' in spec.scope)"
      elif [[ $SCOPE == weekly ]]; then
        JET_CUSTOM_FILTER="$JET_CUSTOM_FILTER and 'weekly' in spec.scope"
      elif [[ $SCOPE == release ]]; then
        JET_CUSTOM_FILTER="$JET_CUSTOM_FILTER and 'release' in spec.scope"
      fi
    # If neither block above narrowed the filter, select nothing rather than
    # every 'basic' test.
    - |
      if [[ "$JET_CUSTOM_FILTER" == "type == 'basic'" ]]; then
        JET_CUSTOM_FILTER="False"
      fi
    - echo "JET_CI_BRANCH=$JET_CI_BRANCH" | tee -a build.env
    - echo "JET_CUSTOM_FILTER=$JET_CUSTOM_FILTER" | tee -a build.env
  artifacts:
    reports:
      dotenv: build.env
  rules:
    - if: '$FUNCTIONAL_TEST == "yes"'

# Informational job: prints how much of the coreai_dlalgo_mcore capacity quota
# is left for the current half-month window (starting on the 1st or the 15th).
# NOTE(review): this job has no `rules:` and no `allow_failure`, so a failing
# API call or an empty QUOTA fails the `.pre` stage; confirm that is intended.
ppp_capacity_statistics:
  tags:
    - mcore-ssh-agent
  stage: .pre
  script:
    - |
      set -x

      ALL_USER=$(sshare -aP | grep coreai_dlalgo_mcore | tail -n +2 | awk -F '|' '{print $2}' | tr '\n' ',')

      # Get the current year, month, and day
      YEAR=$(date +%Y)
      MONTH=$(date +%m)
      DAY=$([[ $(date +%-d) -le 15 ]] && echo "01" || echo "15")
      TIMESTAMP="${YEAR}-${MONTH}-${DAY}T00:00:01"

      CLUSTER_ID=$(curl "${RESOURCE_ENDPOINT}/api/v1/clusters" \
        -H "accept: application/json, text/plain, */*" \
        -H "accept-language: en-US,en;q=0.9" \
        -H "authorization: Bearer $CSRG_API_KEY" | jq '.[] | select(.name == "draco-oci-iad") | .id' | tr -d '"')

      INITIATIVE_ITEM_ID=$(curl "${RESOURCE_ENDPOINT}/api/v1/initiative-items" \
        -H "accept: application/json, text/plain, */*" \
        -H "accept-language: en-US,en;q=0.9" \
        -H "authorization: Bearer $CSRG_API_KEY" | jq '.[] | select(.name == "coreai_dlalgo_mcore") | .id' | tr -d '"')

      QUOTA=$(curl "${RESOURCE_ENDPOINT}/api/v1/capacity-requests" \
        -H "accept: application/json, text/plain, */*" \
        -H "accept-language: en-US,en;q=0.9" \
        -H "authorization: Bearer $CSRG_API_KEY" | jq --arg CLUSTER_ID $CLUSTER_ID --arg INITIATIVE_ITEM_ID $INITIATIVE_ITEM_ID '[.[] | select(.clusterId == $CLUSTER_ID and .initiativeItemId == $INITIATIVE_ITEM_ID)] | to_entries | [last] | .[0].value.quantity')

      # Sums AllocNodes * ElapsedRaw over all jobs, then scales by 8/3600.
      # NOTE(review): presumably 8 GPUs per node and seconds -> hours; confirm.
      USED_CAPA=$(sacct \
        -u ${ALL_USER} \
        --partition batch_block1,batch_block3,batch_block4 \
        --truncate \
        -A coreai_dlalgo_mcore \
        -S ${TIMESTAMP} \
        -X \
        --format JobID,JobName%20,Partition,AllocNodes,ElapsedRaw \
        -p \
        -n \
        | awk -F "|" '{{sum+=$4*$5}} END {{print sum*8/3600}}')
      TOTAL_CAPA=$(( $QUOTA*24*30 ))

      USAGE=$(echo "$USED_CAPA $TOTAL_CAPA" | awk '{print (1 - $1/$2)*100}')%

      echo "Usage left: $USAGE"
      echo "Disclaimer: Please be careful with this number. Usage does not imply what we are guaranteed to get a slot, SLURM scheduling is more complicated than that. The number is rather a proxy to the FairShare that determines our job-scheduling-priority. Most important take-away of this number is to get a sense how much much we are eating up our budget such that we can discuss this with capacity planning. "

# Builds and pushes one image per matrix entry, tagged with the pipeline ID.
# IMAGE holds the *name* of a CI variable and is dereferenced in the script.
build_image:
  tags:
    - mcore-docker-node
  image: docker:26.1.4-dind
  needs: [] # May start ASAP
  stage: build
  timeout: 45m
  parallel:
    matrix:
      - IMAGE: CI_MCORE_IMAGE
        FILE: Dockerfile.ci
        BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3
      - IMAGE: CI_NEMO_IMAGE
        FILE: Dockerfile.ci
        BASE_IMAGE: nvcr.io/nvidian/nemo:nightly
      - IMAGE: LINTING_IMAGE
        FILE: Dockerfile.linting
        BASE_IMAGE: python:3.10
  before_script:
    - echo "$NGC_API_KEY" | docker login nvcr.io -u '$oauthtoken' --password-stdin
    - echo "$CI_REGISTRY_PASSWORD" | docker login $CI_REGISTRY -u $CI_REGISTRY_USER --password-stdin
  script:
    - |
      set -x
      # Resolve the indirection: IMAGE=CI_MCORE_IMAGE -> IMAGE=<registry path>.
      eval "IMAGE=\$$IMAGE"

      # Remove every local image except the base images and the buildcache
      # tags. The buildcache names are derived from the image variables so
      # they cannot drift from what this job actually tags; previously the
      # NeMo entry protected "mcore_nemo" while the image is "nemo_ci".
      # The legacy "mcore_nemo" exclusion is kept in case old runners still
      # hold such an image.
      OLD_IMAGES=$(docker image ls --format "{{.ID}} {{.Repository}}:{{.Tag}}" \
        | grep -v 'nvcr.io/nvidia/pytorch:24.01-py3' \
        | grep -v "${CI_MCORE_IMAGE}:buildcache" \
        | grep -v "${CI_NEMO_IMAGE}:buildcache" \
        | grep -v 'gitlab-master.nvidia.com:5005/adlr/megatron-lm/mcore_nemo:buildcache' \
        | grep -v "${LINTING_IMAGE}:buildcache" \
        | grep -v 'nvcr.io/nvidian/nemo:nightly' \
        | grep -v 'python:3.10' | awk '{ print $1 }'
      )
      # Best effort: an empty list or an image still in use must not fail the job.
      docker rmi $OLD_IMAGES || true
      docker builder prune -a --filter "until=24h" -f

      # Only default-branch builds force a fresh pull of the base image.
      if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then
        ADDITIONAL_PARAMS="--pull"
      fi

      docker build \
        -f $FILE \
        -t ${IMAGE}:${CI_PIPELINE_ID} \
        --cache-to type=inline \
        --cache-from type=registry,ref=${IMAGE}:buildcache \
        --build-arg FROM_IMAGE_NAME=$BASE_IMAGE \
        ${ADDITIONAL_PARAMS} .

      docker push ${IMAGE}:${CI_PIPELINE_ID}

      # The default branch refreshes the shared build cache tag.
      if [[ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]]; then
        docker tag ${IMAGE}:${CI_PIPELINE_ID} ${IMAGE}:buildcache
        docker push ${IMAGE}:buildcache
      fi

      # Release branches (core_r<version>) additionally get a versioned tag.
      if [[ $CI_COMMIT_BRANCH == core_r* ]]; then
        docker tag ${IMAGE}:${CI_PIPELINE_ID} ${IMAGE}:v${CI_COMMIT_BRANCH#core_r}-${CI_PIPELINE_ID}
        docker push ${IMAGE}:v${CI_COMMIT_BRANCH#core_r}-${CI_PIPELINE_ID}
      fi

# Shared settings for all unit test jobs: image built by this pipeline,
# 8-GPU runner tag, and retries on job timeout.
.unit_test_common:
  image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
  stage: unit_tests
  needs: [build_image]
  tags:
    - 8xL40S
  variables:
    MOE_GROUPED_GEMM: "0" # Set to 1 to enable grouped gemm for MoE
  retry:
    max: 2
    when: job_execution_timeout

# Full unit test suite with coverage; runs only when FUNCTIONAL_TEST == "yes".
# In every unit test job below, the first rule makes failures non-blocking for
# merge requests that target neither the default branch nor a core_r* branch.
unit_tests:
  extends: [.unit_test_common]
  script:
    - torchrun --nproc_per_node=8 -m pytest -x -v -s --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail tests/unit_tests
  coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/'
  artifacts:
    paths:
      - coverage
    expire_in: 30 days
  rules:
    - if: '$FUNCTIONAL_TEST == "yes" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
      allow_failure: true
    - if: '$FUNCTIONAL_TEST == "yes"'

# Per-directory unit test jobs; these run only when FUNCTIONAL_TEST == "no",
# i.e. when the single full `unit_tests` job above does not.
unit_tests-data:
  extends: [.unit_test_common]
  script:
    - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/data
  rules:
    - if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
      allow_failure: true
    - if: '$FUNCTIONAL_TEST == "no"'

unit_tests-dist-checkpointing:
  extends: [.unit_test_common]
  script:
    - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/dist_checkpointing
  rules:
    - if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
      allow_failure: true
    - if: '$FUNCTIONAL_TEST == "no"'

unit_tests-fusions:
  extends: [.unit_test_common]
  script:
    - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/fusions
  rules:
    - if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
      allow_failure: true
    - if: '$FUNCTIONAL_TEST == "no"'

unit_tests-inference:
  extends: [.unit_test_common]
  script:
    - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/inference
  rules:
    - if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
      allow_failure: true
    - if: '$FUNCTIONAL_TEST == "no"'

unit_tests-models:
  extends: [.unit_test_common]
  script:
    - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/models
  rules:
    - if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
      allow_failure: true
    - if: '$FUNCTIONAL_TEST == "no"'

unit_tests-pipeline-parallel:
  extends: [.unit_test_common]
  script:
    - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/pipeline_parallel
  rules:
    - if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
      allow_failure: true
    - if: '$FUNCTIONAL_TEST == "no"'

unit_tests-tensor-parallel:
  extends: [.unit_test_common]
  script:
    - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/tensor_parallel
  rules:
    - if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
      allow_failure: true
    - if: '$FUNCTIONAL_TEST == "no"'

unit_tests-transformer:
  extends: [.unit_test_common]
  script:
    - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/transformer
  rules:
    - if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
      allow_failure: true
    - if: '$FUNCTIONAL_TEST == "no"'

# Test modules that live directly in tests/unit_tests (not in a subdirectory).
unit_tests-top-py:
  extends: [.unit_test_common]
  script:
    - torchrun --nproc_per_node=8 -m pytest -x -v -s tests/unit_tests/*.py
  rules:
    - if: '$FUNCTIONAL_TEST == "no" && $CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
      allow_failure: true
    - if: '$FUNCTIONAL_TEST == "no"'

# Clones the documentation repository next to the checkout, moves this
# checkout into it and runs its docs build. Never blocks the pipeline.
docs_build_test:
  image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/python-format:0.0.1
  stage: unit_tests
  tags:
    - os/linux
  script:
    - cd ..
    - rm -rf documentation && git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab-master.nvidia.com/nemo-megatron-core-tme/documentation.git
    - mv megatron-lm/ documentation/
    - cd documentation/
    - ./repo docs
  allow_failure: true
  except:
    - main
  interruptible: true

# Runs the autoformatter in check-only mode using the linting image built by
# this pipeline. Failures are non-blocking for merge requests that target
# neither the default branch nor a core_r* branch.
formatting:
  image: ${LINTING_IMAGE}:${CI_PIPELINE_ID}
  tags:
    - os/linux
  stage: unit_tests
  before_script:
    - git fetch origin main
  script:
    - CHECK_ONLY=true bash tools/autoformat.sh
  rules:
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && ($CI_MERGE_REQUEST_TARGET_BRANCH_NAME != $CI_DEFAULT_BRANCH && $CI_MERGE_REQUEST_TARGET_BRANCH_NAME !~ /^core_r/)'
      allow_failure: true
    - when: always
  interruptible: true

include:
  - jet-tests.yml