---
# Weekly GPT3 345M functional-test recipe.
# `spec` holds the default field values plus the name and launch-script
# templates; each `products` entry lists candidate values for those fields.
type: basic
format_version: 1
maintainers: [shreyasm]
loggers: [stdout]

spec:
  # Job name template. Each trailing backslash joins the next line without
  # inserting whitespace, so the rendered name is one unbroken token.
  name: "{model}_{variant}_{scope}_{platforms}_{nodes}N{gpus}G_\
    {'mcore_' if use_mcore else ''}{'nondet_' if allow_nondeterministic else ''}\
    tp{tp_size}_pp{pp_size}{'_vp'+str(vp_size) if vp_size else ''}\
    {'_resume_'+str(ckpt_format) if ckpt_resume else ''}\
    {'_'+args_meta if args_meta else ''}"
  model: gpt3
  variant: 345m
  build: mcore-pyt
  scope: weekly
  nodes: 1
  gpus: 8
  platforms: dgx_h100
  use_mcore: true
  vp_size: null
  extra_args: null
  args_meta: null
  micro_batch_size: 2  # MBS
  batch_size: 128  # GBS, JET schema requires 'batch_size'
  moe_grouped_gemm: 0
  allow_nondeterministic: false
  precision: bf16
  # Originally annotated "2.5 hours".
  # NOTE(review): if the unit is seconds, 10000 is ~2.8 hours -- confirm.
  time_limit: 10000
  ckpt_format: torch
  ckpt_resume: 0
  artifacts:
    /workspace/data/gpt3_data: text/the_pile/shard00
  # Launch command template; `{...}` fields are filled from the values in this
  # spec and in the product entry being run.
  # ADDITIONAL_PARAMS is wrapped in double quotes so that a value containing
  # spaces (e.g. " --sequence-parallel") reaches the test script as a single
  # KEY=VALUE word instead of being split by the shell. With no extra_args it
  # still renders as ADDITIONAL_PARAMS="".
  script: |-
    ls
    cd /workspace/megatron-lm
    ./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh \
        DATA_PATH=/workspace/data/gpt3_data/my-gpt3_00_text_document \
        VOCAB_FILE=/workspace/data/gpt3_data/bpe/vocab.json \
        MERGE_FILE=/workspace/data/gpt3_data/bpe/merges.txt \
        CHECKPOINT_PATH=/workspace/checkpoints \
        TENSORBOARD_DIR={assets_dir} \
        DATA_CACHE=/workspace/data/index-cache \
        TP_SIZE={tp_size} \
        PP_SIZE={pp_size} \
        NUM_NODES={nodes} \
        MAX_STEPS=2000 \
        USE_CORE={"1" if use_mcore else "0"} \
        USE_FP8={"1" if precision == "fp8" else "0"} \
        VP_SIZE={vp_size if vp_size is not None else '""'} \
        MBS={micro_batch_size} \
        GBS={batch_size} \
        MOE_GROUPED_GEMM={moe_grouped_gemm} \
        ALLOW_NONDETERMINISTIC={"1" if allow_nondeterministic else "0"} \
        JOB_NAME={name} \
        ADDITIONAL_PARAMS="{extra_args if extra_args is not None else ''}"

products:
  # bf16 baseline, no model parallelism.
  - use_mcore: [true]
    precision: [bf16]
    tp_size: [1]
    pp_size: [1]
    allow_nondeterministic: [false]
    args_meta: ["bf16_baseline"]
  # fp8, no model parallelism; lists both allow_nondeterministic settings.
  - use_mcore: [true]
    precision: [fp8]
    tp_size: [1]
    pp_size: [1]
    allow_nondeterministic: [false, true]
    args_meta: ["fp8_no_model_parallel"]
  # fp8 with pp_size 2 only.
  - use_mcore: [true]
    precision: [fp8]
    tp_size: [1]
    pp_size: [2]
    allow_nondeterministic: [false]
    args_meta: ["fp8_pp"]
  # fp8 with tp_size 2 or 4 combined with pp_size 2.
  - use_mcore: [true]
    precision: [fp8]
    tp_size: [2, 4]
    pp_size: [2]
    allow_nondeterministic: [false]
    args_meta: ["fp8_tp_pp"]
  # fp8 with tp_size 2, pp_size 2 and --sequence-parallel passed as extra args.
  - use_mcore: [true]
    precision: [fp8]
    tp_size: [2]
    pp_size: [2]
    allow_nondeterministic: [false]
    extra_args: [" --sequence-parallel"]
    args_meta: ["fp8_tp_pp_sp"]