# gpt-nemo.yaml
# Recipe header: recipe type and format version, followed by the maintainer
# and logger lists (one entry each).
type: basic
format_version: 1
maintainers:
  - mcore
loggers:
  - stdout
# Shared job template. Brace placeholders such as {test_case} and
# {environment} are presumably filled in from the entries under `products`
# (TODO confirm against the recipe loader).
spec:
  name: '{test_case}_{environment}'
  model: gpt-nemo
  build: mcore-nemo
  nodes: 1
  gpus: 8
  platforms: dgx_a100
  # Unit is not stated in this file; presumably seconds (TODO confirm).
  time_limit: 1800
  # Intentionally null here; every entry under `products` sets its own scope.
  scope: null
  # Shell script for the job: lists the working directory, changes into
  # /opt/NeMo, collects KEY=VALUE settings in the ARGUMENTS array and passes
  # them to run_ci_test.sh. The array expansion is double-quoted so each
  # KEY=VALUE entry reaches the script as exactly one argument, even if a
  # substituted path contains whitespace or glob characters.
  # NOTE(review): the doubled braces in ${{ARGUMENTS[@]}} presumably escape the
  # recipe templating so the shell sees ${ARGUMENTS[@]}; confirm against the
  # recipe loader.
  script: |-
    ls
    cd /opt/NeMo

    ARGUMENTS=(
        "DATA_PATH='-'"
        "DATA_CACHE_PATH='-'"
        "OUTPUT_PATH={assets_dir}"
        "TENSORBOARD_PATH={assets_dir}/tensorboard"
        "CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints"
        "CHECKPOINT_LOAD_PATH=/workspace/checkpoints/{name}"
        "TRAINING_SCRIPT_PATH=/opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py"
        "TRAINING_PARAMS_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
        "GOLDEN_VALUES_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
        "N_REPEAT={n_repeat}"
    )

    bash /opt/megatron-lm/tests/functional_tests/shell_test_utils/run_ci_test.sh "${{ARGUMENTS[@]}}"

# Test matrix: each entry names one test case and pairs it with the `dev`
# environment and the `mr` scope.
products:
  - test_case: [gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
  - test_case: [gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]
  - test_case: [gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_8experts_tp2_ep2_pp1_dgx_a100_1N8G]
    products:
      - environment: [dev]
        scope: [mr]