GPT-2 based on Megatron-DeepSpeed

#!/bin/bash
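# Merge the tensor-parallel partitions of the BERT 345M checkpoint into a single checkpoint.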
TENSOR_MODEL_PARALLEL_SIZE=2
VOCAB_FILE=bert-vocab.txt
CHECKPOINT_PATH=checkpoints/bert_345m
WORLD_SIZE=$TENSOR_MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \
--model-type BERT \
--tensor-model-parallel-size $TENSOR_MODEL_PARALLEL_SIZE \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file $VOCAB_FILE \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 512 \
--max-position-embeddings 512 \
--load $CHECKPOINT_PATH
#!/bin/bash
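# Pretrain BERT 345M on a single GPU.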
RANK=0
WORLD_SIZE=1
DATA_PATH=<Specify path and file prefix>_text_sentence
CHECKPOINT_PATH=<Specify path>
python pretrain_bert.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 4 \
--global-batch-size 8 \
--seq-length 512 \
--max-position-embeddings 512 \
--train-iters 2000000 \
--lr-decay-iters 990000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file bert-vocab.txt \
--data-impl mmap \
--split 949,50,1 \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-decay-style linear \
--lr-warmup-fraction .01 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#!/bin/bash
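# Pretrain BERT 345M with data parallelism (8 GPUs per node by default).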
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=<Specify path and file prefix>_text_sentence
CHECKPOINT_PATH=<Specify path>
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_bert.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 4 \
--global-batch-size 32 \
--seq-length 512 \
--max-position-embeddings 512 \
--train-iters 1000000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file bert-vocab.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.0001 \
--lr-decay-style linear \
--min-lr 1.0e-5 \
--lr-decay-iters 990000 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#!/bin/bash
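# Pretrain BERT 345M with 2-way tensor and 2-way pipeline model parallelism.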
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=<Specify path and file prefix>_text_sentence
VOCAB_FILE=<Specify path to vocab.txt>
CHECKPOINT_PATH=<Specify path>
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_bert.py \
--tensor-model-parallel-size 2 \
--pipeline-model-parallel-size 2 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 2 \
--global-batch-size 16 \
--seq-length 512 \
--max-position-embeddings 512 \
--train-iters 1000000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.0001 \
--lr-decay-style linear \
--min-lr 1.0e-5 \
--lr-decay-iters 990000 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#! /bin/bash
# Runs the "345M" parameter model
RANK=0
WORLD_SIZE=1
DATA_PATH=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>
deepspeed --num_gpus 1 pretrain_gpt.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 4 \
--global-batch-size 8 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file gpt2-vocab.json \
--merge-file gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--min-lr 1.0e-5 \
--lr-decay-style cosine \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#!/bin/bash
#SBATCH <SLURM OPTIONS> --nodes=128 --exclusive --ntasks-per-node=8 --job-name=megatron_gpt3_175b
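# Trains a 175B-parameter GPT-3-scale model (96 layers, hidden size 12288, 96 heads)
# with 8-way tensor and 16-way pipeline parallelism across 128 nodes.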
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p $DIR/logs
DATASET_1="<PATH TO THE FIRST DATASET>"
DATASET_2="<PATH TO THE SECOND DATASET>"
DATASET_3="<PATH TO THE THIRD DATASET>"
DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}"
options=" \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 16 \
--num-layers 96 \
--hidden-size 12288 \
--num-attention-heads 96 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 1 \
--global-batch-size 1536 \
--rampup-batch-size 16 16 5859375 \
--train-samples 146484375 \
--lr-decay-samples 126953125 \
--lr-warmup-samples 183105 \
--lr 6.0e-5 \
--min-lr 6.0e-6 \
--lr-decay-style cosine \
--log-interval 10 \
--eval-iters 40 \
--eval-interval 1000 \
--data-path ${DATASET} \
--vocab-file <PATH TO gpt-vocab.json> \
--merge-file <PATH TO gpt-merges.txt> \
--save-interval 1000 \
--save <PATH TO CHECKPOINTS DIRECTORY> \
--load <PATH TO CHECKPOINTS DIRECTORY> \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.006 \
--tensorboard-dir <TENSORBOARD DIRECTORY> \
--fp16 \
--checkpoint-activations "
run_cmd="python -u ${DIR}/pretrain_gpt.py $@ ${options}"
srun -l \
--container-image "nvcr.io/nvidia/pytorch:20.12-py3" \
--container-mounts "<DIRECTORIES TO MOUNT>" \
--output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}"
set +x
#! /bin/bash
# Runs the "345M" parameter model
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 8 \
--global-batch-size 64 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file gpt2-vocab.json \
--merge-file gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#! /bin/bash
# Runs the "345M" parameter model
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt.py \
--tensor-model-parallel-size 2 \
--pipeline-model-parallel-size 2 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 4 \
--global-batch-size 16 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file gpt2-vocab.json \
--merge-file gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#! /bin/bash
# Runs the "345M" parameter model
RANK=0
WORLD_SIZE=1
# paths to multilingual preprocessed datasets
DATA_PATH_EN=<Specify path and file prefix>_text_document
DATA_PATH_AR=<Specify path and file prefix>_text_document
DATA_PATH_KR=<Specify path and file prefix>_text_document
DATA_PATH_JP=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>
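# Weighted-split syntax: "NAME: WEIGHT START:END PATH, ..." where START:END selects a
# contiguous fraction of the dataset, e.g. 0:0.6 takes the first 60% of each corpus.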
deepspeed --num_gpus 1 pretrain_gpt.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 4 \
--global-batch-size 8 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--train-weighted-split-paths "TRAIN: 0.3 0:0.6 $DATA_PATH_EN 1 0:0.6 $DATA_PATH_AR 1 0:0.6 $DATA_PATH_KR 1 0:0.6 $DATA_PATH_JP" \
--valid-weighted-split-paths \
"VALID_EN: 1 0.6:0.8 $DATA_PATH_EN" \
"VALID_AR: 1 0.6:0.8 $DATA_PATH_AR" \
"VALID_JP: 1 0.6:0.8 $DATA_PATH_JP" \
"VALID_KR: 1 0.6:0.8 $DATA_PATH_KR" \
"VALID_EN-AR-JP-KR_BALANCED: 1 0.6:0.8 $DATA_PATH_EN, 1 0.6:0.8 $DATA_PATH_AR, 1 0.6:0.8 $DATA_PATH_JP, 1 0.6:0.8 $DATA_PATH_KR" \
--test-weighted-split-paths \
"TEST_EN: 1 0.8:1 $DATA_PATH_EN" \
"TEST_AR: 1 0.8:1 $DATA_PATH_AR" \
"TEST_JP: 1 0.8:1 $DATA_PATH_JP" \
"TEST_KR: 1 0.8:1 $DATA_PATH_KR" \
"TEST_EN-AR-JP-KR_BALANCED: 1 0.8:1 $DATA_PATH_EN, 1 0.8:1 $DATA_PATH_AR, 1 0.8:1 $DATA_PATH_JP, 1 0.8:1 $DATA_PATH_KR" \
--vocab-file gpt2-vocab.json \
--merge-file gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--min-lr 1.0e-5 \
--lr-decay-style cosine \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#!/bin/bash
# Adapted to use deepspeed on a single node
#
# Multi-node will require either a `hostfile` or switching to `torch.distributed.launch`
# adjust to the number of GPUs to use
N_GPUS=1
CHECKPOINT_PATH=checkpoints/gpt2
VOCAB_FILE=data/gpt2-vocab.json
MERGE_FILE=data/gpt2-merges.txt
DATA_PATH=data/meg-gpt2_text_document
GPT_ARGS=" \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--micro-batch-size 4 \
--global-batch-size 8 \
--lr-decay-iters 320000 \
--lr 0.00015 \
--min-lr 1.0e-5 \
--lr-decay-style cosine \
--train-iters 5000 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--fp16 \
"
OUTPUT_ARGS=" \
--log-interval 10 \
--save-interval 500 \
--eval-interval 100 \
--eval-iters 10 \
--checkpoint-activations \
"
DATA_ARGS=" \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
"
ALL_ARGS="$GPT_ARGS $OUTPUT_ARGS $DATA_ARGS"
LAUNCHER="deepspeed --num_gpus $N_GPUS"
CMD="$LAUNCHER pretrain_gpt.py $ALL_ARGS"
echo $CMD
$CMD
#! /bin/bash
# Runs the "345M" parameter model
RANK=0
WORLD_SIZE=1
DATA_PATH=GPT2/c4_en_partial_gpt2_text_document
CHECKPOINT_PATH=GPT2
deepspeed --num_gpus 1 pretrain_gpt.py \
--num-layers 2 \
--hidden-size 128 \
--num-attention-heads 4 \
--micro-batch-size 4 \
--global-batch-size 8 \
--seq-length 256 \
--max-position-embeddings 256 \
--train-iters 10000 \
--lr-decay-iters 5000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path t5-small \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--min-lr 1.0e-5 \
--lr-decay-style cosine \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16 \
--tensorboard-dir GPT2
# --vocab-file GPT2/gpt2-vocab.json \
# --merge-file GPT2/gpt2-merges.txt \
#! /bin/bash
# Runs the "217M" parameter biencoder model for ICT retriever
RANK=0
WORLD_SIZE=1
PRETRAINED_BERT_PATH=<Specify path of pretrained BERT model>
TEXT_DATA_PATH=<Specify path and file prefix of the text data>
TITLE_DATA_PATH=<Specify path and file prefix of the titles>
CHECKPOINT_PATH=<Specify path>
python pretrain_ict.py \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--tensor-model-parallel-size 1 \
--micro-batch-size 32 \
--seq-length 256 \
--max-position-embeddings 512 \
--train-iters 100000 \
--vocab-file bert-vocab.txt \
--tokenizer-type BertWordPieceLowerCase \
--DDP-impl torch \
--bert-load ${PRETRAINED_BERT_PATH} \
--log-interval 100 \
--eval-interval 1000 \
--eval-iters 10 \
--retriever-report-topk-accuracies 1 5 10 20 100 \
--retriever-score-scaling \
--load $CHECKPOINT_PATH \
--save $CHECKPOINT_PATH \
--data-path ${TEXT_DATA_PATH} \
--titles-data-path ${TITLE_DATA_PATH} \
--lr 0.0001 \
--lr-decay-style linear \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction 0.01 \
--save-interval 4000 \
--exit-interval 8000 \
--query-in-block-prob 0.1 \
--fp16
#!/bin/bash
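# Pretrain a T5-base-sized model (12 layers, hidden size 768) on a single GPU.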
RANK=0
WORLD_SIZE=1
DATA_PATH=<Specify path and file prefix>
VOCAB_FILE=<Specify path to vocab.txt>
CHECKPOINT_PATH=<Specify path>
python pretrain_t5.py \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--kv-channels 64 \
--ffn-hidden-size 3072 \
--encoder-seq-length 512 \
--decoder-seq-length 128 \
--micro-batch-size 16 \
--global-batch-size 2048 \
--max-position-embeddings 512 \
--train-iters 1000000 \
--lr-decay-iters 1000000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--data-impl mmap \
--split 949,50,1 \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-decay-style linear \
--lr-warmup-fraction .01 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#!/bin/bash
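# Pretrain a T5-base-sized model with data parallelism (8 GPUs per node by default).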
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=<Specify path and file prefix>
VOCAB_FILE=<Specify path to vocab.txt>
CHECKPOINT_PATH=<Specify path>
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_t5.py \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--kv-channels 64 \
--ffn-hidden-size 3072 \
--encoder-seq-length 512 \
--decoder-seq-length 128 \
--micro-batch-size 16 \
--global-batch-size 2048 \
--max-position-embeddings 512 \
--train-iters 1000000 \
--lr-decay-iters 1000000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--data-impl mmap \
--split 949,50,1 \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-decay-style linear \
--lr-warmup-fraction .01 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#!/bin/bash
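# Pretrain a T5-base-sized model with 2-way tensor model parallelism.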
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=<Specify path and file prefix>
CHECKPOINT_PATH=<Specify path>
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_t5.py \
--tensor-model-parallel-size 2 \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--kv-channels 64 \
--ffn-hidden-size 3072 \
--encoder-seq-length 512 \
--decoder-seq-length 128 \
--micro-batch-size 16 \
--global-batch-size 2048 \
--seq-length 512 \
--max-position-embeddings 512 \
--train-iters 1000000 \
--lr-decay-iters 1000000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file t5-vocab.txt \
--data-impl mmap \
--split 949,50,1 \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-decay-style linear \
--lr-warmup-fraction .01 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#!/bin/bash
# Run the lm-eval harness on a Megatron-DeepSpeed checkpoint (see the doc below for the full workflow).
CHECKPOINT_PATH=/gpfsscratch/rech/bbv/utw68ny/checkpoints/tr3m-1B3-pile/global_step296023/
PP_SIZE=1
TP_SIZE=1
VOCAB_FILE=gpt2-vocab.json
MERGE_FILE=gpt2-merges.txt
export HF_DATASETS_OFFLINE=1
#dummy arguments to make megatron happy.
MEGATRON_REQUIRED_ARGS="\
--num-layers -1\
--hidden-size -1\
--num-attention-heads -1\
--seq-length -1 \
--max-position-embeddings -1
"
CMD="./tasks/eval_harness/evaluate.py \
--load $CHECKPOINT_PATH\
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE\
--vocab-file $VOCAB_FILE\
--merge-file $MERGE_FILE\
--micro-batch-size 64\
--adaptive_seq_len\
--eval_fp32\
--task_list hellaswag,mrpc,piqa\
$MEGATRON_REQUIRED_ARGS\
"
N_GPUS=1
LAUNCHER="deepspeed --num_gpus $N_GPUS"
$LAUNCHER $CMD
# How to run lm-eval on a Megatron-DeepSpeed checkpoint using the original setup
This particular setup uses the normal DeepSpeed checkpoint and requires no conversion to Megatron-LM.
This doc assumes usage on JZ, so there are some peculiar requirements in places. Ignore these if you're not running this on JZ.
## Prerequisites
1. Install software
On a login console with external network access, get the lm-eval harness (https://github.com/EleutherAI/lm-evaluation-harness) and `best-download==0.0.7`, which is needed to download some tasks.
```
start-prod
pip install best-download==0.0.7
pip install git+https://github.com/EleutherAI/lm-evaluation-harness
```
2. Pre-download needed datasets
Create some symlinks to work around lm-harness's issues with the relative position of data:
```
mkdir data
ln -s `pwd`/data tasks/eval_harness/data
```
Also make sure `data` is not on one of the limited partitions like WORKSF.
Then download the datasets for the tasks:
```
python ./tasks/eval_harness/download.py --task_list \
arc_challenge,arc_easy,boolq,copa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc
```
and make sure that `export HF_DATASETS_OFFLINE=1` is set in the script.
If there are things like custom tokenizers, pre-download those too, e.g.:
```
python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('bigscience/oscar_13_languages_alpha_weight')"
```
and make sure that `export TRANSFORMERS_OFFLINE=1` is in the script.
You know there is a custom tokenizer if the training script had something like:
```
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path bigscience/oscar_13_languages_alpha_weight \
```
3. Prepare the slurm script
Prepare the run script, replacing `variant` with a unique identifier for the current eval (e.g. `tr9c-1B3-swiglu`) so that multiple evals can run in parallel without all logging into the same `results.json` file.
```
cp examples/run_evalharness_deepspeed.slurm run_evalharness-variant.slurm
```
Now edit `run_evalharness-variant.slurm`.
Note that the eval code knows to pull the original training args from the checkpoint, so we don't need to pass any of those; we only need to set up the evaluation args.
1. Edit:
```
PP_SIZE=1
TP_SIZE=1
```
to match the eval topology. If the model fits on one GPU, there is nothing to change.
The eval script will automatically reshape the model if it was trained with a different topology.
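For example, to shard the eval over two GPUs with tensor parallelism you would set:
```
PP_SIZE=1
TP_SIZE=2
```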
2. Adjust the following to fit the chosen GPU. As of the last check, the settings for a 1.3B model are one of:
```
EVAL_MICRO_BATCH_SIZE=6 # 16GB GPU 1.3B model
EVAL_MICRO_BATCH_SIZE=12 # 32GB GPU 1.3B model
```
If you get OOM, lower it further.
3. If you're not using the DeepSpeed path, disable it by removing:
```
--deepspeed \
--deepspeed_config ds_config.json \
```
If you didn't disable it and the program crashes on checkpoint loading, unable to find some key, disable DeepSpeed as explained above.
4. Additional flags
- To reduce the number of iterations for the stderr estimation, use e.g. `--bootstrap_iters 2`. This saves 1-2 minutes per dataset.
- To print intermediate results when running multiple tasks, use `--intermed_results`.
- To reduce the pipeline bubble when setting PP, use the flag `--micro_bs_multiplier`. Reducing `--micro-batch-size` may be needed when increasing the multiplier.
- Running the 176B model with PP=8, `--micro_bs_multiplier 8` and `--micro-batch-size 4` produced the fastest results for PiQA on 1 node (2min18s); these flags are combined in the sketch below.
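For instance, the additions to the eval command for the PP=8 run above might look like this (a sketch; adjust to your topology and memory):
```
--micro-batch-size 4 \
--micro_bs_multiplier 8 \
--bootstrap_iters 2 \
--intermed_results \
```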
## Eval
Currently a 1.3B model takes 2-3 hours on a 32GB GPU and 6-7 hours on a 16GB GPU, so a 20h slurm job should be enough.
When ready, launch:
```
sbatch ./run_evalharness-variant.slurm
```
To monitor progress:
```
tail -f $VARIANT-eval-harness.log
```
where the variant is what you set `$VARIANT` to in the slurm script.
The template is set up for a 16GB GPU since they are easier to come by. If you change to 32GB, adjust:
```
#SBATCH --constraint=v100-32g
...
EVAL_MICRO_BATCH_SIZE=12 # 32GB GPU 1.3B model
```
Note that the original ETA at the start of the run can be 10x longer than the actual run time. For example, it may suggest 18 hours but complete in 2 hours.
## Short eval
If you just want to quickly test that everything can run to the end, edit `tasks/eval_harness/evaluate.py`, e.g. to run only 10 batches:
```
- results = evaluator.evaluate(adaptor, task_dict, False, 0, None)
+ results = evaluator.evaluate(adaptor, task_dict, False, 0, 10)
```
(XXX: could be a cmd line option so that code won't need to be modified)
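A minimal sketch of what that option could look like, assuming a hypothetical `--eval_limit` flag added wherever `evaluate.py` registers its extra arguments (the names here are illustrative, not part of the repo):
```
+ parser.add_argument('--eval_limit', type=int, default=None,
+                     help='hypothetical: limit batches per task for smoke tests')
  ...
- results = evaluator.evaluate(adaptor, task_dict, False, 0, None)
+ results = evaluator.evaluate(adaptor, task_dict, False, 0, args.eval_limit)
```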
## Import into spreadsheet
https://docs.google.com/spreadsheets/d/1CI8Q9RCblLRzUOPJ6ViqBmo284-8ojluQ-CmaEuhuv0/edit?usp=sharing
Note that the spreadsheet format is quite different, so use this script:
```
./tasks/eval_harness/report-to-csv.py results.json
```
to reformat the JSON results into CSV while changing its shape to match the spreadsheet format.
Since some records might be missing or extraneous, here is the best way to do it:
1. Copy the data from the first 2 columns to some place under the main spreadsheet.
2. Put the pointer in the 3rd column, next to where the first 2 columns were copied.
3. Import `results.csv` using File -> Import -> File, with Import location: "Replace data at selected cell".
4. Now it should be easy to align the new records with the old ones: delete irrelevant records and use Insert -> Cells where data is missing until the first 2 columns match.
5. Now create 2 columns in the main table on top; it should then be safe to copy-and-paste the 2-column data range, without the task/metrics columns, into the newly created space.
#!/bin/bash
#SBATCH --job-name=eval-harness-deepspeed
#SBATCH --constraint=v100-16g
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
#SBATCH --cpus-per-task=40 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --gres=gpu:1 # number of gpus
#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=%x-%j.out # output file name
#SBATCH --account=six@gpu
set -x -e
source $six_ALL_CCFRWORK/start-prod
echo "START TIME: $(date)"
# a unique identifier for the current eval so that multiple evals could run in parallel and not all log into the same "results.json" file.
VARIANT="tr9c-1B3-swiglu"
CHECKPOINT_PATH=/gpfsdsstore/projects/rech/six/commun/checkpoints/tr3m-1B3-emb-norm-pile/global_step296023
MEGATRON_DEEPSPEED_REPO=/gpfsssd/worksf/projects/rech/six/commun/code/eval/Megatron-DeepSpeed
# you want these 2 on JZ, and pre-download/cache any datasets/tokenizers/models
# but comment these out if you're running on a node with Internet access
export HF_DATASETS_OFFLINE=1
export TRANSFORMERS_OFFLINE=1
cd $MEGATRON_DEEPSPEED_REPO
# eval topology
PP_SIZE=1
TP_SIZE=1
VOCAB_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-vocab.json
MERGE_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-merges.txt
SEQ_LEN=2048
# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS
# make as big as it can fit into gpu w/o OOM, but not too close to 100%
EVAL_MICRO_BATCH_SIZE=6 # 16GB GPU 1.3B model
#EVAL_MICRO_BATCH_SIZE=12 # 32GB GPU 1.3B model
#dummy arguments to make megatron happy.
MEGATRON_REQUIRED_ARGS=" \
--num-layers -1 \
--hidden-size -1 \
--num-attention-heads -1 \
--seq-length -1 \
--max-position-embeddings -1
"
ZERO_STAGE=0
config_json="./ds_config.json"
cat <<EOT > $config_json
{
"train_micro_batch_size_per_gpu": 1,
"train_batch_size": 1,
"zero_optimization": { "stage": $ZERO_STAGE },
"fp16": { "enabled": true },
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
EOT
CMD="./tasks/eval_harness/evaluate.py \
--load $CHECKPOINT_PATH \
--results_path $VARIANT-results.json \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--micro-batch-size $EVAL_MICRO_BATCH_SIZE \
--no-load-optim \
--no-load-rng \
--inference \
--deepspeed \
--deepspeed_config ds_config.json \
--seq-length $SEQ_LEN \
--adaptive_seq_len \
--eval_fp32 \
--task_list arc_challenge,arc_easy,boolq,copa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sst,webqs,wic,winogrande,wnli,wsc,triviaqa,sciq \
$MEGATRON_REQUIRED_ARGS \
"
N_GPUS=1
LAUNCHER="deepspeed --num_gpus $N_GPUS"
echo $LAUNCHER $CMD
export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO
$LAUNCHER $CMD 2>&1 | tee $VARIANT-eval-harness.log
#!/bin/bash
#SBATCH --job-name=run_evalharness-tr11-176b-ml
#SBATCH --partition=gpu_p5
#SBATCH --constraint=a100
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
#SBATCH --cpus-per-task=64 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --gres=gpu:8 # number of gpus
#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=%x-%j.out # output file name
#SBATCH --account=six@a100
set -x -e
source $six_ALL_CCFRWORK/start-py38-pt111
echo "START TIME: $(date)"
# a unique identifier for the current eval, ideally corresponding to the model name
VARIANT="tr11-176b-ml"
CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step50000
MEGATRON_DEEPSPEED_REPO=/gpfsssd/worksf/projects/rech/six/commun/code/eval/Megatron-DeepSpeed
export HF_DATASETS_OFFLINE=1
export TRANSFORMERS_OFFLINE=1
export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models
export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets
export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules
export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics
cd $MEGATRON_DEEPSPEED_REPO
TOKENIZER_NAME_OR_PATH=bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles
PP_SIZE=8
TP_SIZE=1
SEQ_LEN=2048
# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS
# make as big as it can fit into gpu w/o OOM, but not too close to 100%
EVAL_MICRO_BATCH_SIZE=1
#dummy arguments to make megatron happy.
MEGATRON_REQUIRED_ARGS=" \
--num-layers -1 \
--hidden-size -1 \
--num-attention-heads -1 \
--seq-length -1 \
--max-position-embeddings -1 \
"
ZERO_STAGE=0
config_json="./ds_config.json"
# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
cat <<EOT > $config_json
{
"train_micro_batch_size_per_gpu": 1,
"train_batch_size": 1,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"bf16": {
"enabled": true
},
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
EOT
CMD="./tasks/eval_harness/evaluate.py \
--load $CHECKPOINT_PATH \
--results_path $VARIANT-results.json \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \
--micro-batch-size $EVAL_MICRO_BATCH_SIZE \
--no-load-optim \
--no-load-rng \
--bf16 \
--inference \
--seq-length $SEQ_LEN \
--task_list arc_challenge,arc_easy,boolq,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc \
--deepspeed \
--deepspeed_config ds_config.json \
--bootstrap_iters 2 \
--intermed_results \
--adaptive_seq_len \
--micro_bs_multiplier 4 \
$MEGATRON_REQUIRED_ARGS \
"
GPUS_PER_NODE=8
NNODES=$SLURM_NNODES
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000
export LAUNCHER="python -u -m torch.distributed.run \
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
--rdzv_backend c10d \
--max_restarts 0 \
--tee 3 \
"
export CUDA_LAUNCH_BLOCKING=1
echo $LAUNCHER $CMD
export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO
$LAUNCHER $CMD 2>&1 | tee $VARIANT-eval-harness.log
"""Multitask Finetuning T0"""
import torch
from megatron import get_args, get_tokenizer, print_rank_0, mpu
from megatron.data.decoder_packed_mtf_dataset import build_train_valid_test_datasets, build_dataset_group
from megatron.enums import PositionEmbeddingType, AttnMaskType
from megatron.model import GPTModelPipe
from megatron.training import pretrain
from megatron.utils import get_ltor_masks_and_position_ids, get_packed_attention_mask
import deepspeed
from deepspeed.runtime.utils import see_memory_usage
try:
from torch.distributed.elastic.multiprocessing.errors import record
except ImportError:
# noop
def record(fn):
return fn
def model_provider(pre_process=True, post_process=True):
"""Build the model."""
print_rank_0("building GPT model ...")
see_memory_usage(f"Before Building Model", force=True)
args = get_args()
with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(),
remote_device=None if args.remote_device == "none" else args.remote_device,
config_dict_or_path=args.deepspeed_config,
enabled=args.zero_stage == 3,
mpu=mpu):
if args.deepspeed:
model = GPTModelPipe(
num_tokentypes=0,
parallel_output=True,
attn_mask_type=AttnMaskType.custom
)
# This is a hack to give us a reference to get_batch_pipe from within training.py
# We need to call model.set_batch_fn after deepspeed.initialize
model._megatron_batch_fn = get_batch_pipe
else:
raise NotImplementedError("DeepSpeed is required for T0")
see_memory_usage(f"After Building Model", force=True)
return model
def get_batch_pipe(data):
"""
Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator` & in packed fashion
data:
decoder_tokens = [[6, 7, 8, 3, 4, 5, 0]]
decoder_segment_ids = [[1, 1, 1, 2, 2, 2, 0]]
decoder_is_inputs = [[1, 1, 0, 1, 1, 0, 0]]
"""
args = get_args()
tokenizer = get_tokenizer()
# Broadcast data.
data_b = mpu.broadcast_data(["decoder_token_ids", "decoder_segment_ids"], data, torch.int64)
data_c = mpu.broadcast_data(["decoder_is_inputs"], data, torch.bool)
# Unpack.
tokens_ = data_b["decoder_token_ids"].long()
labels = tokens_[:, 1:].contiguous()
tokens = tokens_[:, :-1].contiguous()
segment_ids = data_b["decoder_segment_ids"].long()[:, :-1]
decoder_is_inputs = data_c["decoder_is_inputs"][:, :-1]
# Get the masks and position ids.
causal_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
tokens,
tokenizer.eod,
args.reset_position_ids,
args.reset_attention_mask,
args.eod_mask_loss,
prefix_indices=None,
loss_on_targets_only=False # This is done below
)
# Only compute loss over causal target tokens, i.e. ignore input_tokens & padding
loss_on_targets_only = ~data_c["decoder_is_inputs"][:, 1:]
loss_on_non_pad_only = (tokens != tokenizer.pad)
loss_mask *= loss_on_targets_only * loss_on_non_pad_only
attention_mask = get_packed_attention_mask(
# Run non-causal decoder
is_causal=False,
causal_mask=~(causal_mask.bool()),
decoder_is_inputs=decoder_is_inputs.bool(),
segment_ids=segment_ids.long(),
)
if args.position_embedding_type not in [PositionEmbeddingType.alibi, PositionEmbeddingType.rotary]:
raise NotImplementedError("absolute positional embeddings require us to reset position_ids accordingly.")
return (tokens, position_ids, attention_mask), (labels, loss_mask)
def train_valid_test_datasets_provider(train_val_test_num_samples):
"""Build train, valid, and test datasets."""
args = get_args()
train_ds, valid_ds, test_ds = None, None, None
tokenizer = get_tokenizer()
print_rank_0("> building train, validation, and test datasets for T0 ...")
# Option 1 of data loading using --data-path
if args.data_path:
# TODO: Not yet compatible with dataset weights (Will break at prefixes, weights = analyze_data_prefix(args.data_path))
train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
data_prefix=args.data_path,
data_impl=args.data_impl,
splits_string=args.split,
seq_length=args.seq_length + 1,
pad_token=tokenizer.pad,
eos_token=tokenizer.eos,
train_valid_test_num_samples=train_val_test_num_samples,
seed=args.seed,
skip_warmup=(not args.mmap_warmup)
)
# Option 2 of data loading using --(train|valid|test)-weighted-split-paths
elif args.train_weighted_split_paths:
assigned_train_valid_test = []
if args.train_weighted_split_paths is not None:
train_ds = []
assigned_train_valid_test.append("train")
if args.valid_weighted_split_paths is not None:
valid_ds = []
assigned_train_valid_test.append("valid")
if args.test_weighted_split_paths is not None:
test_ds = []
assigned_train_valid_test.append("test")
for s in assigned_train_valid_test:
data_groups = zip(eval(f"args.{s}_weighted_split_paths"),
eval(f"args.{s}_weighted_split_weights"),
eval(f"args.{s}_weighted_split_splits"),
eval(f"args.{s}_weighted_split_names"))
for paths, weights, splits, name in data_groups:
d = build_dataset_group(
dataset_group_name=name,
paths=paths,
weights=weights,
splits=splits,
data_impl=args.data_impl,
train_valid_test_num_samples=train_val_test_num_samples,
seq_length=args.seq_length + 1,
pad_token=tokenizer.pad,
eos_token=tokenizer.eos,
seed=args.seed,
skip_warmup=(not args.mmap_warmup),
train_valid_test=s
)
eval(f"{s}_ds").append(d)
else:
raise NotImplementedError("No dataloading argument passed")
print_rank_0("> finished creating T0 datasets ...")
return train_ds, valid_ds, test_ds
@record
def main():
pretrain(
train_valid_test_datasets_provider,
model_provider,
forward_step_func=None,
args_defaults={}
)
if __name__ == "__main__":
main()