# Functional-test recipe for the gpt3-nemo 126m model, launched through Slurm.
# `spec` holds the default parameters plus the job-name and script templates;
# `products` lists the per-run parameter values that fill those templates.
type: basic
format_version: 1
maintainers: [maanug]
loggers: [stdout]

launchers:
  # NOTE(review): the key is literally `type:slurm` (no space after the first
  # colon) -- presumably a launcher-type selector understood by the consumer;
  # confirm before reformatting it.
  type:slurm:
    # One task per GPU: the placeholder is filled from `spec.gpus`.
    ntasks_per_node: '{gpus}'
    # Kept as the string 'true' (quoted), not a YAML boolean.
    no_container_mount_home: 'true'

spec:
  # Job-name template assembled from the fields below. Every physical line
  # ends with a backslash so the double-quoted scalar is joined WITHOUT a
  # folded space; a space inside the name would split the unquoted
  # `JOB_NAME={name}` argument in the script further down.
  name: "{model}_{variant}_{scope}_\
         mbs{mbs}_gbs{gbs}_\
         {'mcore_' if use_mcore else ''}{'te_' if use_te else ''}\
         tp{tp_size}_pp{pp_size}{'_vp'+str(vp_size) if vp_size else ''}\
         {'_'+args_meta if args_meta else ''}\
         _{platforms}_{nodes}N{gpus}G"
  model: gpt3-nemo
  variant: 126m
  build: mcore-nemo
  scope: mr
  nodes: 1
  gpus: 8
  platforms: dgx_a100
  steps: 50
  # Defaults for the optional per-product overrides; null means "not set".
  extra_args: null
  args_meta: null
  precision: bf16
  time_limit: 1200
  use_mcore: true
  use_te: true
  vp_size: null
  # Shell snippet run for each product: changes into /opt/NeMo and invokes the
  # NeMo GPT-3 pretraining test script with KEY=VALUE arguments. VP_SIZE and
  # ADDITIONAL_PARAMS fall back to a literal "" when their value is null.
  script: |-
    cd /opt/NeMo

    /opt/megatron-lm/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_nemo_test.sh \
        TP_SIZE={tp_size} \
        PP_SIZE={pp_size} \
        NUM_NODES={nodes} \
        MAX_STEPS={steps} \
        VP_SIZE={vp_size if vp_size is not None else '""'} \
        MBS={mbs} \
        GBS={gbs} \
        JOB_NAME={name} \
        ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'}

# Parameter sets for individual runs. Each key maps to a list of values.
# NOTE(review): presumably the consumer expands these lists into concrete
# jobs -- confirm the exact expansion semantics.
products:
  # Baseline: no tensor/pipeline parallelism, no virtual pipeline.
  - tp_size: [1]
    pp_size: [1]
    mbs: [4]
    gbs: [64]
    vp_size: [null]
  # TP=2 / PP=4 with virtual pipeline size 3 and extra model overrides; the
  # inner double quotes are part of the value so the overrides reach the
  # script as a single shell word.
  - tp_size: [2]
    pp_size: [4]
    mbs: [1]
    gbs: [8]
    vp_size: [3]
    extra_args:
      - '"model.sequence_parallel=True model.overlap_p2p_comm=True model.batch_p2p_comm=False"'
    args_meta: ["seq_par_overlap_p2p"]