# weekly-gpt.yaml

# Top-level recipe metadata.
type: basic
format_version: 1
maintainers:
  - shreyasm
loggers:
  - stdout
spec:
  # Job-name template. Each trailing backslash escapes its line break, so the
  # five physical lines below form a single string with no embedded whitespace.
  # The brace-delimited placeholders reference keys of this spec plus the
  # tp_size / pp_size axes that are only set under `products`.
  name: "{model}_{variant}_{scope}_{platforms}_{nodes}N{gpus}G_\
         {'mcore_' if use_mcore else ''}{'nondet_' if allow_nondeterministic else ''}\
         tp{tp_size}_pp{pp_size}{'_vp'+str(vp_size) if vp_size else ''}\
         {'_resume_'+str(ckpt_format) if ckpt_resume else ''}\
         {'_'+args_meta if args_meta else ''}"

  # Workload identity and target hardware.
  model: gpt3
  variant: 345m
  build: mcore-pyt
  scope: weekly
  nodes: 1
  gpus: 8
  platforms: dgx_h100

  # Values below look like defaults that individual `products` entries
  # override (use_mcore, precision, allow_nondeterministic, extra_args and
  # args_meta all reappear there) -- confirm against the runner.
  steps: 2000  # forwarded to the test script as MAX_STEPS
  use_mcore: True
  vp_size: null  # null: no "_vp" name suffix, VP_SIZE is passed as ""
  extra_args: null  # null: ADDITIONAL_PARAMS is passed as ""
  args_meta: null  # when set, appended to the job name as "_<args_meta>"
  micro_batch_size: 2  # MBS
  batch_size: 128  # GBS, JET schema requires 'batch_size'
  moe_grouped_gemm: 0
  allow_nondeterministic: False
  precision: bf16  # only the value "fp8" switches USE_FP8 to 1
  # NOTE(review): this was annotated "2.5 hours"; if the unit is seconds,
  # 10000 s is about 2 h 47 min (2.5 h would be 9000). Confirm the unit.
  time_limit: 10000
  # NOTE(review): ckpt_format / ckpt_resume only affect the job name here;
  # neither is forwarded to the script below.
  ckpt_format: torch
  ckpt_resume: 0
  artifacts:
    /workspace/data/gpt3_data: text/the_pile/shard00

  # Shell script run for the job. Placeholders are substituted into the text
  # unquoted, so values must not contain unquoted whitespace. {assets_dir} is
  # not defined in this file -- presumably supplied by the runner; confirm.
  script: |-
    ls
    cd /workspace/megatron-lm

    ./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh \
        DATA_PATH=/workspace/data/gpt3_data/my-gpt3_00_text_document \
        VOCAB_FILE=/workspace/data/gpt3_data/bpe/vocab.json \
        MERGE_FILE=/workspace/data/gpt3_data/bpe/merges.txt \
        CHECKPOINT_PATH=/workspace/checkpoints \
        TENSORBOARD_DIR={assets_dir} \
        DATA_CACHE=/workspace/data/index-cache \
        TP_SIZE={tp_size} \
        PP_SIZE={pp_size} \
        NUM_NODES={nodes} \
        MAX_STEPS={steps} \
        USE_CORE={"1" if use_mcore else "0"} \
        USE_FP8={"1" if precision == "fp8" else "0"} \
        VP_SIZE={vp_size if vp_size is not None else '""'} \
        MBS={micro_batch_size} \
        GBS={batch_size} \
        MOE_GROUPED_GEMM={moe_grouped_gemm} \
        ALLOW_NONDETERMINISTIC={"1" if allow_nondeterministic else "0"} \
        JOB_NAME={name} \
        ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'}
# Test matrix. Every key maps to a list of values; entries with more than one
# value on an axis (e.g. tp_size: [2, 4]) presumably expand to one job per
# combination -- confirm against the runner.
products:
  # bf16 reference run, no model parallelism.
  - use_mcore: [True]
    precision: [bf16]
    tp_size: [1]
    pp_size: [1]
    allow_nondeterministic: [False]
    args_meta: ["bf16_baseline"]

  # fp8, no model parallelism, both deterministic and non-deterministic.
  - use_mcore: [True]
    precision: [fp8]
    tp_size: [1]
    pp_size: [1]
    allow_nondeterministic: [False, True]
    args_meta: ["fp8_no_model_parallel"]

  # fp8 with pipeline parallelism only.
  - use_mcore: [True]
    precision: [fp8]
    tp_size: [1]
    pp_size: [2]
    allow_nondeterministic: [False]
    args_meta: ["fp8_pp"]

  # fp8 with tensor + pipeline parallelism.
  - use_mcore: [True]
    precision: [fp8]
    tp_size: [2, 4]
    pp_size: [2]
    allow_nondeterministic: [False]
    args_meta: ["fp8_tp_pp"]

  # fp8 with tensor + pipeline parallelism and --sequence-parallel.
  # extra_args has no leading space on purpose: spec.script substitutes it
  # unquoted right after "ADDITIONAL_PARAMS=", so a leading space would make
  # the shell pass an empty ADDITIONAL_PARAMS plus a stray separate argument.
  - use_mcore: [True]
    precision: [fp8]
    tp_size: [2]
    pp_size: [2]
    allow_nondeterministic: [False]
    extra_args: ["--sequence-parallel"]
    args_meta: ["fp8_tp_pp_sp"]