#!/bin/bash

export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3

MODEL_NAME=gpt2-4tp
DATA_OUTPUT_PATH=./
LOGS_PATH=$DATA_OUTPUT_PATH/logs
CHECKPOINT_PATH=output-module/$MODEL_NAME
DATA_PATH="my-gpt2_text_document"
TENSORBOARD_PATH=output_dir/tensorboard/$MODEL_NAME
CODECARBON_PATH=output_dir/codecarbon/$MODEL_NAME

N_GPUS=8
TP_SIZE=4 # always fixed to the size of a single node
PP_SIZE=2 #128 #96 # NLAYERS must be a multiple of PP_SIZE here

MICRO_BATCH_SIZE=2
GLOBAL_BATCH_SIZE=32 #256 #1536

NLAYERS=24
NHIDDEN=1024 #12480
NHEADS=16
SEQ_LEN=1024

SAVE_INTERVAL=200

#rampup-batch-size 16 16 5859375
OPTIMIZER_ARGS=" \
    --optimizer adam \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --adam-eps 1e-8 \
    --lr 6.0e-5 \
    --min-lr 6.0e-6 \
    --lr-decay-style cosine \
    --clip-grad 1.0 \
    --weight-decay 1e-1 \
    "

GPT_ARGS=" \
    --num-layers $NLAYERS \
    --hidden-size $NHIDDEN \
    --num-attention-heads $NHEADS \
    --seq-length $SEQ_LEN \
    --max-position-embeddings $SEQ_LEN \
    --micro-batch-size $MICRO_BATCH_SIZE \
    --global-batch-size $GLOBAL_BATCH_SIZE \
    --train-samples 3782590 \
    --loss-scale 12 \
    --vocab-file gpt2-vocab.json \
    --merge-file gpt2-merges.txt \
    --clip-grad 1.0 \
    --fp16 \
    --checkpoint-activations \
    --seed 42 \
    $OPTIMIZER_ARGS \
    "

OUTPUT_ARGS=" \
    --log-interval 1 \
    --save-interval $SAVE_INTERVAL \
    --eval-interval 1000 \
    --eval-iters 40 \
    --tensorboard-dir $TENSORBOARD_PATH \
    --tensorboard-queue-size 5 \
    --log-timers-to-tensorboard \
    --log-batch-size-to-tensorboard \
    --log-validation-ppl-to-tensorboard \
    "

DATA_ARGS=" \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH \
    --data-path $DATA_PATH \
    "

ZERO_STAGE=1

config_json="./${MODEL_NAME}_ds_config.json"

# Write the DeepSpeed config; note the heredoc must redirect *into* the file
# (`cat <<EOT > $config_json`), not read from it.
cat <<EOT > $config_json
{
  "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
  "train_batch_size": $GLOBAL_BATCH_SIZE,
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": $ZERO_STAGE
  },
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 500,
    "hysteresis": 2,
    "min_loss_scale": 1,
    "initial_scale_power": 12
  },
  "steps_per_print": 2000,
  "wall_clock_breakdown": false
}
EOT

DEEPSPEED_ARGS=" \
    --deepspeed \
    --deepspeed_config ${config_json} \
    --zero-stage ${ZERO_STAGE} \
    --deepspeed-activation-checkpointing \
    "

export CMD="pretrain_gpt.py \
    --tensor-model-parallel-size $TP_SIZE \
    --pipeline-model-parallel-size $PP_SIZE \
    $GPT_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --data-impl mmap \
    --split 949,50,1 \
    --distributed-backend nccl \
    $DEEPSPEED_ARGS \
    "

deepspeed --num_gpus $N_GPUS $CMD
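
# --- Optional pre-flight sanity checks: a minimal sketch, not part of the
# original launch script; if used, paste it above the `deepspeed` line.
# Megatron-DeepSpeed derives the data-parallel degree as
# N_GPUS / (TP_SIZE * PP_SIZE), and GLOBAL_BATCH_SIZE must be divisible by
# MICRO_BATCH_SIZE * DP_SIZE (gradient accumulation makes up the remainder).
# With the values above: DP = 8 / (4 * 2) = 1, giving 32 / (2 * 1) = 16
# accumulation steps per global batch.
DP_SIZE=$((N_GPUS / (TP_SIZE * PP_SIZE)))
if (( GLOBAL_BATCH_SIZE % (MICRO_BATCH_SIZE * DP_SIZE) != 0 )); then
    echo "GLOBAL_BATCH_SIZE=$GLOBAL_BATCH_SIZE is not divisible by" \
         "MICRO_BATCH_SIZE * DP_SIZE = $((MICRO_BATCH_SIZE * DP_SIZE))" >&2
    exit 1
fi

# The script's own comment requires NLAYERS to be a multiple of PP_SIZE.
if (( NLAYERS % PP_SIZE != 0 )); then
    echo "NLAYERS=$NLAYERS must be a multiple of PP_SIZE=$PP_SIZE" >&2
    exit 1
fi

# Quick syntax check of the generated DeepSpeed config (assumes a `python`
# binary on PATH); catches heredoc interpolation mistakes before launch.
python -c "import json; json.load(open('$config_json'))" || exit 1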