bert.yaml 1.51 KB
Newer Older
liangjing's avatar
liangjing committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# Header fields for this test definition. Their exact meaning is defined by
# the tool that consumes this file.
type: basic
format_version: 1
maintainers:
  - mcore
loggers:
  - stdout
# Job specification: one node with eight GPUs running the BERT functional
# test script below.
# NOTE(review): single-brace tokens (test_case, model, assets_dir) look like
# template placeholders filled in before the script runs, and doubled braces
# look like escapes for literal shell braces -- confirm against the launcher.
spec:
  name: "{test_case}"
  model: bert
  build: mcore-pyt
  nodes: 1
  gpus: 8
  platforms: dgx_a100
  # Presumably maps a path inside the job to the dataset staged there --
  # TODO confirm.
  artifacts:
    /workspace/data/bert_data: text/the_pile/bert_shard00
  # The script collects KEY=VALUE settings in a bash array and passes them to
  # run_ci_test.sh. The array expansion is double-quoted so every element
  # arrives as exactly one argument, even if a substituted path contains
  # whitespace or glob characters.
  script: |-
    ls
    cd /workspace/megatron-lm

    ARGUMENTS=(
        "DATA_PATH=/workspace/data/bert_data"
        "DATA_CACHE_PATH=/workspace/data/cache"
        "OUTPUT_PATH={assets_dir}"
        "TENSORBOARD_PATH={assets_dir}/tensorboard"
        "CHECKPOINT_PATH=/workspace/checkpoints"
        "TRAINING_SCRIPT_PATH=pretrain_bert.py"
        "TEST_CASE_PATH=./tests/functional_tests/test_cases/{model}/{test_case}"
    )

    bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh "${{ARGUMENTS[@]}}"

# Test matrix: each entry pairs a scope and a time limit with a list of test
# case names.
# NOTE(review): the test_case values presumably fill the test_case placeholder
# used in `spec`; the unit of time_limit is not visible here -- confirm both
# against the launcher.
products:
  # Scope "mr" (presumably merge-request runs).
  - scope: [mr]
    time_limit: [1200]
    test_case:
      - bert_mr_mcore_tp2_pp2_dgx_a100_1N8G
      - bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G
      - bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G
      - bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G
      - bert_mr_tp1_pp4_vp2_dgx_a100_1N8G
      - bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G
      - bert_mr_tp2_pp2_dgx_a100_1N8G
      - bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G
  # Scope "nightly", with a time limit ten times that of the "mr" entry.
  - scope: [nightly]
    time_limit: [12000]
    test_case:
      - bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp2
      - bert_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2
      - bert_345m_nightly_dgx_a100_1N8G_mcore_tp4_pp1
      - bert_345m_nightly_dgx_a100_1N8G_tp1_pp2
      - bert_345m_nightly_dgx_a100_1N8G_tp4_pp1