Commit 3237cc33 authored by Lawrence McAfee
Browse files

removed slurm commands; added 'num-experts'

parent c8e4cdae
#!/bin/bash #!/bin/bash
#SBATCH <SLURM OPTIONS> --nodes=1 --exclusive --ntasks-per-node=8 --job-name=megatron_gpt_switch # Runs a GPT model with switch MLP.
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p $DIR/logs
CHECKPOINT_DIR="<PATH TO CHECKPOINT DIR>" CHECKPOINT_DIR="<PATH TO CHECKPOINT DIR>"
TENSORBOARD_DIR="<PATH TO TENSORBOARD DIR>" TENSORBOARD_DIR="<PATH TO TENSORBOARD DIR>"
DATA_BLEND="<PATH TO DATA BLEND>" DATA_BLEND="<PATH TO DATA BLEND>"
BPE_DIR="<PATH TO BPE DIR>" BPE_DIR="<PATH TO BPE DIR>"
options=" \ python pretrain_gpt.py \
--exit-duration-in-mins 230 \ --num-experts 8 \
--tensor-model-parallel-size 1 \ --tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \ --pipeline-model-parallel-size 1 \
--num-layers 24 \ --num-layers 24 \
...@@ -48,13 +44,4 @@ options=" \ ...@@ -48,13 +44,4 @@ options=" \
--fp16 \ --fp16 \
--DDP-impl torch \ --DDP-impl torch \
--tensorboard-dir ${TENSORBOARD_DIR} \ --tensorboard-dir ${TENSORBOARD_DIR} \
--checkpoint-activations " --checkpoint-activations
run_cmd="cd $DIR && python pretrain_gpt.py ${options}"
srun -l \
--container-image "nvcr.io/nvidia/pytorch:20.12-py3" \
--container-mounts "<DIRECTORIES TO MOUNT>" \
--output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}"
set +x
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment