t5.yaml 3.62 KB
Newer Older
silencealiang's avatar
silencealiang committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# Functional-test recipe for the T5 model. `spec` is the job template; the
# `products` section of this file lists the test_case/environment values that
# are substituted into its `{...}` placeholders.
type: basic
format_version: 1
maintainers: [mcore]
loggers: [stdout]
spec:
  # Rendered job name; the script below reuses it as the checkpoint directory
  # name in CHECKPOINT_LOAD_PATH.
  name: '{test_case}_{environment}'
  model: t5
  build: mcore-pyt-{environment}
  nodes: 1
  gpus: 8
  platforms: dgx_a100
  # NOTE(review): presumably maps a path inside the job (key) to a stored
  # artifact identifier (value) — confirm against the launcher.
  # The checkpoint entry's directory name equals the rendered `name` of the
  # frozen-resume dev job, i.e. the path CHECKPOINT_LOAD_PATH resolves to for
  # that job.
  artifacts:
    /workspace/data/t5_data: text/the_pile/t5_shard00
    /workspace/checkpoints/t5_220m_mr_mcore_te_tp2_pp2_frozen_resume_torch_dgx_a100_1N8G_dev: model/mcore_t5/t5_220m_mr_mcore_te_tp2_pp2_frozen_resume_torch_dgx_a100_1N8G_dev/22410107
  # Bash script template. Single-brace `{...}` tokens are placeholders; the
  # doubled braces around ARGUMENTS[@] appear to be escapes so the rendered
  # script contains a literal bash array expansion (confirm against the
  # renderer). The expansion is double-quoted so every KEY=VALUE element is
  # passed to run_ci_test.sh as exactly one argument, even if a substituted
  # path contains whitespace or glob characters.
  script: |-
    ls
    cd /opt/megatron-lm

    ARGUMENTS=(
        "DATA_PATH=/workspace/data/t5_data"
        "DATA_CACHE_PATH=/workspace/data/cache"
        "OUTPUT_PATH={assets_dir}"
        "TENSORBOARD_PATH={assets_dir}/tensorboard"
        "CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints"
        "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}"
        "TRAINING_SCRIPT_PATH=pretrain_t5.py"
        "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
        "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
        "N_REPEAT={n_repeat}"
    )

    bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh "${{ARGUMENTS[@]}}"

# Test matrix: each entry names one test case and, under its nested
# `products`, the environment/scope combinations it is listed for.
# NOTE(review): `test_case` and `environment` presumably fill the matching
# `{...}` placeholders in `spec` — confirm against the recipe loader.
products:
  # "mr" test cases: the dev environment is scoped to mr, lts to nightly.
  - test_case: [t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
      - environment: [lts]
        scope: [nightly]
  - test_case: [t5_220m_mr_mcore_te_tp2_pp2_resume_torch_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
      - environment: [lts]
        scope: [nightly]
  - test_case: [t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
      - environment: [lts]
        scope: [nightly]
  - test_case: [t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
      - environment: [lts]
        scope: [nightly]
  # NOTE(review): unlike its siblings, this case has no dev/mr entry —
  # confirm that the omission is intentional.
  - test_case: [t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G]
    products:
      - environment: [lts]
        scope: [nightly]
  - test_case: [t5_220m_mr_mcore_tp2_pp2_resume_torch_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
      - environment: [lts]
        scope: [nightly]
  - test_case: [t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
      - environment: [lts]
        scope: [nightly]
  - test_case: [t5_220m_mr_mcore_tp4_pp1_resume_torch_dist_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
      - environment: [lts]
        scope: [nightly]
  # "nightly" test cases: dev and lts are both listed with the nightly scope.
  - test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch]
    products:
      - environment: [dev, lts]
        scope: [nightly]
  - test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1]
    products:
      - environment: [dev, lts]
        scope: [nightly]
  - test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1_sequence_parallel]
    products:
      - environment: [dev, lts]
        scope: [nightly]
  - test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1]
    products:
      - environment: [dev, lts]
        scope: [nightly]
  - test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_vp1_resume_torch]
    products:
      - environment: [dev, lts]
        scope: [nightly]
  - test_case: [t5_220m_nightly_dgx_a100_1N8G_mcore_tp2_pp1_vp1]
    products:
      - environment: [dev, lts]
        scope: [nightly]
  # Frozen-resume case: listed for dev/mr only. Its name plus "_dev" matches
  # the checkpoint directory declared under `spec.artifacts`.
  - test_case: [t5_220m_mr_mcore_te_tp2_pp2_frozen_resume_torch_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]