# unit-tests.yaml
type: basic
format_version: 1
maintainers: [mcore]
loggers: [stdout]
spec:
  name: '{test_case}_{environment}_{tag}'
  model: unit-tests
  nodes: 1
  build: mcore-pyt-{environment}
  gpus: 8
  platforms: dgx_h100
  script: |-
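    # Single-brace placeholders (test_case, environment, tag, n_repeat, assets_dir) are
    # substituted from the products matrix below; doubled braces are literal shell braces.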
    ls

    export TAG={tag}
    export ENVIRONMENT={environment}
    export BUCKET="{test_case}"
    export UNIT_TEST_REPEAT={n_repeat}
    export UNIT_TEST_TIMEOUT=10

    set -euxo pipefail

    if [[ "$TAG" == "latest" ]]; then
      TEST_PATH="/opt/megatron-lm"
    else
      TEST_PATH="/opt/megatron-lm-legacy/"
    fi

    cd $TEST_PATH

    MARKER=()
    if [[ "$TAG" == "legacy" ]]; then
      MARKER+=("not internal")
    fi

    if [[ "$ENVIRONMENT" == "lts" ]]; then
      MARKER+=("not flaky")
    fi

    if [[ "$ENVIRONMENT" == "dev" ]]; then
      MARKER+=("not flaky_in_dev")
    fi

    MARKER_ARG=$(printf "%s" "${{MARKER[0]}}")
    for element in "${{MARKER[@]:1}}"; do
      MARKER_ARG+=" and $element"
    done
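    # e.g. TAG=legacy with ENVIRONMENT=dev yields MARKER_ARG="not internal and not flaky_in_dev"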

    # Collect the test_case paths of every other bucket in this recipe; they are passed
    # to pytest as --ignore arguments so each bucket runs a disjoint set of tests.
    IGNORE_TEST_CASES=$(cat /opt/megatron-lm/tests/test_utils/recipes/unit-tests.yaml | yq eval 'with(.products[].test_case; del(.[] | select(. == env(BUCKET)))) | .products[].test_case[]' | tr " " "\n")
    IGNORE_ARGS=()
    while IFS= read -r test_case; do
      if [[ $test_case == *\** ]]; then
        # Expand glob patterns into individual files before ignoring them.
        FILES=($(ls $test_case))
        echo ${{FILES[@]}}
        for file in "${{FILES[@]}}"; do
          IGNORE_ARGS+=("--ignore='$file'")
        done
      else
        IGNORE_ARGS+=("--ignore=$test_case")
      fi
    done <<< "$IGNORE_TEST_CASES"

    echo "------ARGUMENTS for SLURM ---"
    MASTER_ADDR=${{MASTER_ADDR:-localhost}}
    MASTER_PORT=${{MASTER_PORT:-6000}}
    NUM_NODES=${{NUM_NODES:-${{SLURM_NNODES}}}}
    GPUS_PER_NODE=${{GPUS_PER_NODE:-8}}
    NODE_RANK=${{NODE_RANK:-${{SLURM_NODEID}}}}
    DISTRIBUTED_ARGS=(
        --nproc_per_node $GPUS_PER_NODE
        --nnodes $NUM_NODES
        --master_addr $MASTER_ADDR
        --master_port $MASTER_PORT
        --node_rank $NODE_RANK
        --log-dir {assets_dir}
        --tee "0:3"
        --redirects "3"
    )

    # Reduce memory usage by NCCL
    export NCCL_MAX_NCHANNELS=1
    export NCCL_NVLS_ENABLE=0

    # The command is assembled as a string and eval'd so that the quotes embedded in
    # IGNORE_ARGS and MARKER_ARG are re-parsed by the shell.
    for i in $(seq $UNIT_TEST_REPEAT); do
      CMD=$(echo torchrun ${{DISTRIBUTED_ARGS[@]}} -m pytest \
        -xvs \
        --cov-report=term \
        --cov-branch \
        --cov=megatron/core \
        --cov-report xml:coverage.xml \
        --no-cov-on-fail ${{IGNORE_ARGS[@]}} \
        -m "'${{MARKER_ARG}}'" $BUCKET)

      eval "$CMD"
    done

    # Export coverage artifacts for the CI to collect.
    ls -al
    cp .coverage_0 {assets_dir}/coverage_report
    cp coverage.xml {assets_dir}

products:
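  # Each bucket below runs for every environment/tag combination listed under it.
  # The catch-all tests/unit_tests bucket relies on the ignore logic in the script
  # above to skip test cases already covered by the more specific buckets.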
  - test_case: [tests/unit_tests/data/]
    products:
      - environment: [lts, dev]
        tag: [latest, legacy]
        scope: [unit-tests]
        n_repeat: [1]
        time_limit: [1800]
  - test_case: [tests/unit_tests/dist_checkpointing/*.py]
    products:
      - environment: [lts, dev]
        tag: [latest, legacy]
        scope: [unit-tests]
        n_repeat: [1]
        time_limit: [1800]
  - test_case: [tests/unit_tests/dist_checkpointing/models/]
    products:
      - environment: [lts, dev]
        tag: [latest, legacy]
        scope: [unit-tests]
        n_repeat: [1]
        time_limit: [1800]
  - test_case: [tests/unit_tests/transformer/*.py]
    products:
      - environment: [lts, dev]
        tag: [latest, legacy]
        scope: [unit-tests]
        n_repeat: [1]
        time_limit: [1800]
  - test_case: [tests/unit_tests/transformer/moe]
    products:
      - environment: [lts, dev]
        tag: [latest, legacy]
        scope: [unit-tests]
        n_repeat: [1]
        time_limit: [1800]
  - test_case: [tests/unit_tests]
    products:
      - environment: [lts, dev]
        tag: [latest, legacy]
        scope: [unit-tests]
        n_repeat: [1]
        time_limit: [1800]