Commit bc5c7fa7 by wxj: First test commit (parent 70fddd0f)
#!/bin/bash
# Runs the "345M" parameter model
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/gpt2-vocab.json
MERGE_FILE=<Specify path to file>/gpt2-merges.txt
DATA_PATH=<Specify path and file prefix>_text_document
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
GPT_ARGS="
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--micro-batch-size 8 \
--global-batch-size 64 \
--lr 0.00015 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--fp16
"
DATA_ARGS="
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--split 949,50,1
"
OUTPUT_ARGS="
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10
"
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
#!/bin/bash
# Runs the "345M" parameter model
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/gpt2-vocab.json
MERGE_FILE=<Specify path to file>/gpt2-merges.txt
DATA_PATH=<Specify path and file prefix>_text_document
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
GPT_ARGS="
--tensor-model-parallel-size 2 \
--pipeline-model-parallel-size 2 \
--sequence-parallel \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--micro-batch-size 4 \
--global-batch-size 16 \
--lr 0.00015 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--fp16
"
DATA_ARGS="
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--split 949,50,1
"
OUTPUT_ARGS="
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10
"
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
#! /bin/bash
# Runs the "217M" parameter biencoder model for ICT retriever
RANK=0
WORLD_SIZE=1
PRETRAINED_BERT_PATH=<Specify path of pretrained BERT model>
TEXT_DATA_PATH=<Specify path and file prefix of the text data>
TITLE_DATA_PATH=<Specify path and file prefix of the titles>
CHECKPOINT_PATH=<Specify path>
python pretrain_ict.py \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--tensor-model-parallel-size 1 \
--micro-batch-size 32 \
--seq-length 256 \
--max-position-embeddings 512 \
--train-iters 100000 \
--vocab-file bert-vocab.txt \
--tokenizer-type BertWordPieceLowerCase \
--DDP-impl torch \
--bert-load ${PRETRAINED_BERT_PATH} \
--log-interval 100 \
--eval-interval 1000 \
--eval-iters 10 \
--retriever-report-topk-accuracies 1 5 10 20 100 \
--retriever-score-scaling \
--load $CHECKPOINT_PATH \
--save $CHECKPOINT_PATH \
--data-path ${TEXT_DATA_PATH} \
--titles-data-path ${TITLE_DATA_PATH} \
--lr 0.0001 \
--lr-decay-style linear \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction 0.01 \
--save-interval 4000 \
--exit-interval 8000 \
--query-in-block-prob 0.1 \
--fp16
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/t5-vocab.txt
DATA_PATH=<Specify path and file prefix>_text_sentence
T5_ARGS="
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--kv-channels 64 \
--ffn-hidden-size 3072 \
--encoder-seq-length 512 \
--decoder-seq-length 128 \
--max-position-embeddings 512 \
--micro-batch-size 16 \
--global-batch-size 16 \
--lr 0.0001 \
--train-iters 1000000 \
--lr-decay-iters 1000000 \
--lr-decay-style linear \
--min-lr 0.00001 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--fp16 \
--vocab-extra-ids 100
"
DATA_ARGS="
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--split 949,50,1
"
OUTPUT_ARGS="
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10
"
torchrun pretrain_t5.py \
$T5_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/t5-vocab.txt
DATA_PATH=<Specify path and file prefix>_text_sentence
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
T5_ARGS="
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--kv-channels 64 \
--ffn-hidden-size 3072 \
--encoder-seq-length 512 \
--decoder-seq-length 128 \
--max-position-embeddings 512 \
--micro-batch-size 16 \
--global-batch-size 128 \
--lr 0.0001 \
--train-iters 1000000 \
--lr-decay-iters 1000000 \
--lr-decay-style linear \
--min-lr 0.00001 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--fp16 \
--vocab-extra-ids 100
"
DATA_ARGS="
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--split 949,50,1
"
OUTPUT_ARGS="
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10
"
torchrun $DISTRIBUTED_ARGS pretrain_t5_core.py \
$T5_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
CHECKPOINT_PATH=<Specify path>
VOCAB_FILE=<Specify path to file>/t5-vocab.txt
DATA_PATH=<Specify path and file prefix>_text_sentence
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
T5_ARGS="
--tensor-model-parallel-size 2 \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--kv-channels 64 \
--ffn-hidden-size 3072 \
--encoder-seq-length 512 \
--decoder-seq-length 128 \
--max-position-embeddings 512 \
--micro-batch-size 16 \
--global-batch-size 128 \
--lr 0.0001 \
--train-iters 1000000 \
--lr-decay-iters 1000000 \
--lr-decay-style linear \
--min-lr 0.00001 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--fp16 \
--vocab-extra-ids 100
"
DATA_ARGS="
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--split 949,50,1
"
OUTPUT_ARGS="
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10
"
torchrun $DISTRIBUTED_ARGS pretrain_t5.py \
$T5_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
#! /bin/bash
# Pre-trains a ViT-based image classification model
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_SL=1
# Training and validation paths should each point to a folder where each
# sub-folder contains a collection of images in jpg or png format
# e.g., if using ImageNet, one training image might be train_data/n01688243/n01688243_11301.JPEG
DATA_PATH_TRAIN=<Specify train data path>
DATA_PATH_VAL=<Specify validation data path>
CHECKPOINT_PATH=<Specify path>
CLASSIFIER_ARGS="
--tensor-model-parallel-size 1 \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--patch-dim 4 \
--seq-length 3136 \
--max-position-embeddings 3136 \
--img-h 224 \
--img-w 224 \
--mask-factor 1.0 \
--fp16 \
--train-iters 750000 \
--lr-decay-style cosine \
--micro-batch-size 4 \
--global-batch-size 1024 \
--lr 0.0005 \
--min-lr 0.00001 \
--attention-dropout 0.0 \
--weight-decay 0.05 \
--lr-warmup-iters 12500 \
--clip-grad 1.0 \
--no-gradient-accumulation-fusion \
--num-workers 4 \
--DDP-impl torch "
DATA_ARGS="
--tokenizer-type NullTokenizer \
--vocab-size 0 \
--data-path $DATA_PATH_TRAIN $DATA_PATH_VAL \
--no-data-sharding \
--split 949,50,1 \
"
OUTPUT_ARGS="
--log-interval 32 \
--save-interval 10000 \
--eval-interval 2500 \
--eval-iters 100 \
--tensorboard-dir ${CHECKPOINT_PATH} \
"
torchrun pretrain_vision_classification.py \
$CLASSIFIER_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
#! /bin/bash
# Pre-trains a DINO v1 model
# For model details: https://arxiv.org/abs/2104.14294
# For the original authors' implementation: https://github.com/facebookresearch/dino/tree/main
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_SL=1
# Training and validation paths should each point to a folder where each
# sub-folder contains a collection of images in jpg or png format
# e.g., if using ImageNet, one training image might be train_data/n01688243/n01688243_11301.JPEG
DATA_PATH_TRAIN=<Specify train data path>
DATA_PATH_VAL=<Specify validation data path>
CHECKPOINT_PATH=<Specify path>
DINO_ARGS="
--vision-pretraining-type dino \
--tensor-model-parallel-size 1 \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--patch-dim 4 \
--seq-length 3136 \
--max-position-embeddings 3136 \
--img-h 224 \
--img-w 224 \
--mask-factor 1.0 \
--fp16 \
--train-iters 750000 \
--lr-decay-style cosine \
--micro-batch-size 4 \
--global-batch-size 1024 \
--lr 0.0005 \
--min-lr 0.00001 \
--attention-dropout 0.0 \
--weight-decay 0.05 \
--lr-warmup-iters 12500 \
--clip-grad 1.0 \
--no-gradient-accumulation-fusion \
--num-workers 4 \
--DDP-impl torch "
DATA_ARGS="
--tokenizer-type NullTokenizer \
--vocab-size 0 \
--data-path $DATA_PATH_TRAIN $DATA_PATH_VAL \
--no-data-sharding \
--split 949,50,1 \
"
OUTPUT_ARGS="
--log-interval 32 \
--save-interval 10000 \
--eval-interval 2500 \
--eval-iters 100 \
--tensorboard-dir ${CHECKPOINT_PATH} \
"
torchrun pretrain_vision_dino.py \
$DINO_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
#! /bin/bash
# Pre-trains a ViT-based image inpainting model
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_SL=1
# Training and validation paths should each point to a folder where each
# sub-folder contains a collection of images in jpg or png format
# e.g., if using ImageNet, one training image might be train_data/n01688243/n01688243_11301.JPEG
DATA_PATH_TRAIN=<Specify train data path>
DATA_PATH_VAL=<Specify validation data path>
CHECKPOINT_PATH=<Specify path>
INPAINT_ARGS="
--vision-pretraining-type inpaint \
--tensor-model-parallel-size 1 \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--patch-dim 4 \
--seq-length 3136 \
--max-position-embeddings 3136 \
--img-h 224 \
--img-w 224 \
--mask-factor 1.0 \
--fp16 \
--train-iters 750000 \
--lr-decay-style cosine \
--micro-batch-size 4 \
--global-batch-size 1024 \
--lr 0.0005 \
--min-lr 0.00001 \
--attention-dropout 0.0 \
--weight-decay 0.05 \
--lr-warmup-iters 12500 \
--clip-grad 1.0 \
--no-gradient-accumulation-fusion \
--num-workers 4 \
--DDP-impl torch "
DATA_ARGS="
--tokenizer-type NullTokenizer \
--vocab-size 0 \
--data-path $DATA_PATH_TRAIN $DATA_PATH_VAL \
--no-data-sharding \
--split 949,50,1 \
"
OUTPUT_ARGS="
--log-interval 32 \
--save-interval 10000 \
--eval-interval 2500 \
--eval-iters 100 \
--tensorboard-dir ${CHECKPOINT_PATH} \
"
torchrun pretrain_vision_inpaint.py \
$INPAINT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
#!/bin/bash
# Train a vision language model.
# Default arguments here use a mock dataset. Please edit the arguments to your liking.
export CUDA_DEVICE_MAX_CONNECTIONS=1
# Check that the user has set an output path for model checkpoints.
if [[ -z $CHECKPOINT_PATH ]]; then
echo "Please set CHECKPOINT_PATH for storing your model checkpoints."
exit 1
fi
DISTRIBUTED_ARGS="
--nproc_per_node 8 \
"
# Note: the learning rate and other hyperparameters used here are just examples and not optimized in any way.
GPT_ARGS="
--num-layers 24 \
--hidden-size 512 \
--num-attention-heads 16 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--micro-batch-size 2 \
--global-batch-size 16 \
--lr 0.00015 \
--train-iters 10000 \
--lr-decay-iters 3200 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--fp16
"
IMG_ARGS="
--img-h 336 \
--img-w 336 \
--patch-dim 14
"
DATA_ARGS="
--split 949,50,1
--tokenizer-type NullTokenizer
--vocab-size=8192
"
OUTPUT_ARGS="
--log-interval 100 \
--save-interval 5000 \
--eval-interval 1000 \
--eval-iters 10
"
# Select one of the cases below.
# Multi GPU
# torchrun $DISTRIBUTED_ARGS \
# Single GPU
# CUDA_VISIBLE_DEVICES=0 python -u \
# Single GPU with a debugger
# CUDA_VISIBLE_DEVICES=0 python -u -m debugpy --listen 0.0.0.0:5678 --wait-for-client \
torchrun $DISTRIBUTED_ARGS \
pretrain_vlm.py \
$GPT_ARGS \
$IMG_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
# RETRO MODEL
## Table of contents
- [1. Training Setup](#1-training-setup)
- [2. Data Preprocessing](#2-data-preprocessing)
- [3. Configurations](#3-configurations)
## 1. Training setup
<a id="markdown-training-setup" name="training-setup"></a>
To run the model using a Docker container, run it as follows:
```
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:23.09-py3
CHECKPOINT_PATH="" #<Specify path>
TENSORBOARD_LOGS_PATH=""#<Specify path>
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
$PYTORCH_IMAGE \
bash examples/retro/train_retro_2b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH
```
NOTE: Depending on the environment you are running in, the above command might look slightly different.
NOTE: Because Retro preprocesses and caches elements of the pretraining dataset before training begins, some arguments are auto-loaded from the Retro preprocessing configuration. These loaded arguments include:
- `--data-path`
- `--data-cache-path`
- `--eval-interval`
- `--eval-iters`
- `--global-batch-size`
- `--tokenizer-type`
- `--tokenizer-model`
- `--vocab-file`
- `--merge-file`
- `--seed`
- `--seq-length`
- `--train-samples`
## 2. Data Preprocessing
<a id="markdown-data-preprocessing" name="data-preprocessing"></a>
Retro preprocesses and caches data prior to pretraining, to greatly speed up pretraining. During data preprocessing, the retrieval database is built, and neighbor IDs are queried for each sample within the pretraining dataset. Please see `preprocess_data.sh` for an example script to preprocess data for Retro. The reference documentation for data preprocessing can be found [here](tools/retro/README.md).
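As a rough sketch (assuming the `preprocess_data.sh` script included in this folder, which takes the task name as its single argument), the preprocessing stages are typically run one task at a time:
```
# Build the retrieval database, train and populate the index,
# then query neighbors for the pretraining samples.
bash preprocess_data.sh db-build
bash preprocess_data.sh index-train
bash preprocess_data.sh index-add
bash preprocess_data.sh query-neighbors
```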
## 3. Configurations
<a id="markdown-configurations" name="configurations"></a>
The example in this folder shows you how to run a 2B model. Below are a few other example configurations.
### 857M
```
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
### 4B
```
--num-layers 48 \
--hidden-size 2560 \
--num-attention-heads 32 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
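To try one of these variants, one option (a sketch only, not a verified recipe) is to substitute the corresponding values into the `RETRO_MODEL_ARGS` and `MODEL_PARALLEL_ARGS` arrays of `train_retro_2b_distributed.sh`, e.g. for the 857M model:
```
# Hypothetical 857M setup using the argument arrays from train_retro_2b_distributed.sh.
# --seq-length is auto-loaded from the Retro project directory, so it is not set here.
RETRO_MODEL_ARGS=(
    --num-layers 24
    --hidden-size 1024
    --num-attention-heads 16
)
MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 1
    --pipeline-model-parallel-size 1
)
```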
#!/bin/bash
set -u
unset NCCL_DEBUG
######## Megatron, Retro dirs. ########
REPO_DIR="<path/to/megatron/repo>"
RETRO_PROJECT_DIR="<path/to/retro/project/directory>"
######## Task (e.g., db, index, query). ########
# This script takes a single argument, which specifies the retro task to be
# performed. The available tasks are: db-build, index-train, index-add, and
# query-neighbors.
# ~~ Examples ~~
# RETRO_TASKS="db-build" # Build the retrieval database
# RETRO_TASKS="index-train" # Train the index
# RETRO_TASKS="index-add" # Add data to the index
# RETRO_TASKS="query-neighbors" # Perform query pretraining for neighbors
# You can also provide the task as a command-line argument when executing the
# script. Example: ./preprocess_data.sh index-add
RETRO_TASKS=$1
######## Data. ########
DATA_BLEND="<see --data-path in arguments.py>"
######## Index. ########
RETRO_INDEX_STR="OPQ32_64,IVF65536_HNSW8,PQ32"
RETRO_INDEX_NTRAIN=66625331
RETRO_INDEX_TRAIN_LOAD_FRACTION=0.97
RETRO_INDEX_ADD_LOAD_FRACTION=0.95
######## GPT. ########
RETRO_GPT_SEED=1234
RETRO_GPT_SPLIT="98,2,0"
RETRO_GPT_DATA_PATH=${DATA_BLEND}
RETRO_GPT_TRAIN_SAMPLES=200000
RETRO_GPT_EVAL_INTERVAL=2000
RETRO_GPT_EVAL_ITERS=50
RETRO_GPT_LR_DECAY_SAMPLES=175000
RETRO_GPT_LR_WARMUP_SAMPLES=10000
RETRO_GPT_SEQ_LENGTH=2048
RETRO_GPT_GLOBAL_BATCH_SIZE=256
RETRO_GPT_CHUNK_LENGTH=64
######## Query. ########
RETRO_QUERY_NUM_NEIGHBORS_QUERY=200
RETRO_QUERY_NUM_NEIGHBORS_SAVE=20
RETRO_QUERY_EF_SEARCH=32
RETRO_QUERY_NPROBE=4096
######## Args. ########
ARGS=" \
--distributed-timeout-minutes 600 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 1 \
--global-batch-size ${RETRO_GPT_GLOBAL_BATCH_SIZE} \
--seq-length 512 \
--max-position-embeddings 512 \
--load ${RETRO_PROJECT_DIR}/checkpoints/bert \
--exit-on-missing-checkpoint \
--no-load-optim \
--data-path [null] \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file ${RETRO_PROJECT_DIR}/tokenizer/bert-large-uncased-vocab.txt \
--split ${RETRO_GPT_SPLIT} \
--distributed-backend nccl \
--lr 0.0001 \
--lr-decay-style linear \
--min-lr 1.0e-5 \
--train-samples ${RETRO_GPT_TRAIN_SAMPLES} \
--lr-decay-samples ${RETRO_GPT_LR_DECAY_SAMPLES} \
--lr-warmup-samples ${RETRO_GPT_LR_WARMUP_SAMPLES} \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--eval-interval ${RETRO_GPT_EVAL_INTERVAL} \
--eval-iters ${RETRO_GPT_EVAL_ITERS} \
--bf16 \
--no-data-sharding \
--no-gradient-accumulation-fusion \
--no-async-tensor-model-parallel-allreduce \
--bert-embedder-type megatron \
--output-bert-embeddings \
\
--retro-project-dir ${RETRO_PROJECT_DIR} \
--retro-tasks ${RETRO_TASKS} \
--retro-bert-vocab-file tokenizer/bert-large-uncased-vocab.txt \
--retro-bert-tokenizer-type BertWordPieceLowerCase \
\
--retro-gpt-seed ${RETRO_GPT_SEED} \
--retro-gpt-tokenizer-type GPTSentencePieceTokenizer \
--retro-gpt-tokenizer-model /path/to/tokenizer/model \
--retro-gpt-seq-length ${RETRO_GPT_SEQ_LENGTH} \
--retro-gpt-chunk-length ${RETRO_GPT_CHUNK_LENGTH} \
--retro-gpt-global-batch-size ${RETRO_GPT_GLOBAL_BATCH_SIZE} \
--retro-gpt-eval-interval ${RETRO_GPT_EVAL_INTERVAL} \
--retro-gpt-eval-iters ${RETRO_GPT_EVAL_ITERS} \
--retro-gpt-split ${RETRO_GPT_SPLIT} \
--retro-gpt-data-path ${RETRO_GPT_DATA_PATH} \
--retro-gpt-train-samples ${RETRO_GPT_TRAIN_SAMPLES} \
\
--retro-index-str ${RETRO_INDEX_STR} \
--retro-index-ntrain ${RETRO_INDEX_NTRAIN} \
--retro-index-train-load-fraction ${RETRO_INDEX_TRAIN_LOAD_FRACTION} \
--retro-index-add-load-fraction ${RETRO_INDEX_ADD_LOAD_FRACTION} \
--no-retro-index-delete-training-embeddings \
--no-retro-index-delete-added-codes \
\
--retro-query-num-neighbors-query ${RETRO_QUERY_NUM_NEIGHBORS_QUERY} \
--retro-query-num-neighbors-save ${RETRO_QUERY_NUM_NEIGHBORS_SAVE} \
--retro-query-ef-search ${RETRO_QUERY_EF_SEARCH} \
--retro-query-nprobe ${RETRO_QUERY_NPROBE} \
"
######## Command. ########
NPROCS=8 # Number of GPUs.
CMD="\
cd ${REPO_DIR} && pwd && \
export PYTHONPATH=$PYTHONPATH:${REPO_DIR} && \
python -m torch.distributed.run \
--nproc_per_node ${NPROCS} \
--nnodes 1 \
--node_rank ${NODE_RANK} \
--master_addr ${MASTER_ADDR} \
--master_port 6000 \
tools/retro/preprocess_data.py ${ARGS} \
"
echo "~~~~~~~~~~~~~~~~~~~~~~~~~~"
echo "CMD = '$CMD'."
echo "~~~~~~~~~~~~~~~~~~~~~~~~~~"
eval $CMD
#!/bin/bash
# Runs the "307M" parameter Retro model.
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
CHECKPOINT_PATH=$1 #<Specify path>
TENSORBOARD_LOGS_PATH=$2 #<Specify path>
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
######## GPT or Retro? ########
# 0 : GPT.
# 1 : Retro
ADD_RETRIEVER=1
######## Megatron, Retro dirs. ########
RETRO_PROJECT_DIR="<path/to/retro/project/directory>"
######## Model, training args. ########
# ** Note: --seq-length auto loaded from Retro project dir.
RETRO_MODEL_ARGS=(
--num-layers 32
--hidden-size 2048
--num-attention-heads 32
)
# ** Note: --data-path, --tokenizer-type, and --tokenizer-model auto loaded from Retro project dir.
DATA_ARGS=(
--split 98,2,0
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 8
--pipeline-model-parallel-size 1
)
# ** Note: --eval-interval, --eval-iters auto loaded from Retro project dir.
EVAL_AND_LOGGING_ARGS=(
--log-interval 100
--save-interval 10000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
TRAINING_ARGS=" \
--retro-project-dir ${RETRO_PROJECT_DIR} \
--use-mcore-models \
--transformer-impl transformer_engine \
--num-workers 8 \
--micro-batch-size 4 \
--lr-decay-samples 166400000 \
--lr-warmup-samples 162761 \
--lr 6.0e-4 \
--min-lr 6.0e-5 \
--lr-decay-style cosine \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.023 \
--log-params-norm \
--log-num-zeros-in-grad \
--bf16 \
--no-data-sharding \
"
if [ "$ADD_RETRIEVER" = "1" ]; then
TRAINING_ARGS+=" --retro-add-retriever"
fi
######## Command. ########
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_retro.py \
${RETRO_MODEL_ARGS[@]} \
${TRAINING_ARGS} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]}
import os
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
from functools import partial
from pathlib import Path
from megatron.core import parallel_state
from megatron.core import dist_checkpointing
from megatron.core.pipeline_parallel.schedules import get_forward_backward_func
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.models.gpt.gpt_model import GPTModel
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
from megatron.core.datasets.utils import Split
from megatron.core.datasets.gpt_dataset import GPTDatasetConfig, MockGPTDataset
def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1):
    parallel_state.destroy_model_parallel()

    # Torch setup for distributed training
    rank = int(os.environ['LOCAL_RANK'])
    world_size = torch.cuda.device_count()
    torch.cuda.set_device(rank)
    torch.distributed.init_process_group(world_size=world_size, rank=rank)

    # Megatron core distributed training initialization
    parallel_state.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size)

def model_provider():
    """Build the model."""
    transformer_config = TransformerConfig(
        num_layers=2,
        hidden_size=12,
        num_attention_heads=4,
        use_cpu_initialization=True,
        pipeline_dtype=torch.float32)

    gpt_model = GPTModel(
        config=transformer_config,
        transformer_layer_spec=get_gpt_layer_local_spec(),
        vocab_size=100,
        max_sequence_length=64)

    return gpt_model

def get_train_data_iterator():
    config = GPTDatasetConfig(
        random_seed=0,
        sequence_length=64,
        blend=[],
        mock=True,
        reset_position_ids=False,
        reset_attention_mask=False,
        eod_mask_loss=False,
        tokenizer="dummy")

    training_data = MockGPTDataset(Split.train, config)
    train_dataloader = DataLoader(training_data, batch_size=8, shuffle=True)
    train_iterator = iter(train_dataloader)
    return train_iterator

def forward_step_func(data_iterator, model):

    def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor):
        losses = output_tensor.float()
        loss_mask = loss_mask.view(-1).float()
        loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
        # If you have data parallel reduce loss across data parallel groups.
        # If pipeline parallel, loss computation is done only in last stage.
        return loss, {'lm loss': loss}

    data = next(data_iterator)
    tokens = data['tokens'].to(device)
    attention_mask = data['attention_mask'].to(device)
    position_ids = data['position_ids'].to(device)
    labels = data['labels'].to(device)
    loss_mask = data['loss_mask'].to(device)

    output_tensor = model(tokens, position_ids, attention_mask, labels=labels)

    return output_tensor, partial(loss_func, loss_mask)

def save_distributed_checkpoint(checkpoint_path, gpt_model):
    sharded_state_dict = gpt_model.sharded_state_dict(prefix='')
    dist_checkpointing.save(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)

def load_distributed_checkpoint(checkpoint_path, gpt_model):
    sharded_state_dict = gpt_model.sharded_state_dict(prefix='')
    checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
    gpt_model.load_state_dict(checkpoint)
    return gpt_model

if __name__ == "__main__":
    initialize_distributed(tensor_model_parallel_size=2, pipeline_model_parallel_size=1)
    model_parallel_cuda_manual_seed(123)

    gpt_model = model_provider()
    device = torch.device("cuda")
    gpt_model.to(device)

    optim = Adam(gpt_model.parameters())

    train_iterator = get_train_data_iterator()

    forward_backward_func = get_forward_backward_func()

    # Running the model for 5 iterations
    for _ in range(5):
        optim.zero_grad()

        losses_reduced = forward_backward_func(
            forward_step_func=forward_step_func,
            data_iterator=train_iterator,
            model=gpt_model,
            num_microbatches=1,
            seq_length=64,
            micro_batch_size=8,
            decoder_seq_length=64,
            forward_only=False)

        optim.step()

        print(f'Losses reduced : {losses_reduced}')

    # Saving the model
    ckpt_path = os.getcwd() + '/ckpt'
    Path(ckpt_path).mkdir(exist_ok=True)
    save_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path)

    # Loading the model
    gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path)
    gpt_model.to(device)

    print('Successfully loaded the model')
#!/bin/bash
# This example will start serving the 345M model.
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT=<Path to checkpoint (e.g /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
export CUDA_DEVICE_MAX_CONNECTIONS=1
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--load ${CHECKPOINT} \
--num-attention-heads 16 \
--max-position-embeddings 1024 \
--tokenizer-type GPT2BPETokenizer \
--fp16 \
--micro-batch-size 1 \
--seq-length 1024 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--seed 42
#!/bin/bash
# This example will start serving the 345M model partitioned with 8-way tensor parallelism.
DISTRIBUTED_ARGS="--nproc_per_node 8 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT=<Path to checkpoint (e.g /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
pip install flask-restful
python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--load ${CHECKPOINT} \
--num-attention-heads 16 \
--max-position-embeddings 1024 \
--tokenizer-type GPT2BPETokenizer \
--fp16 \
--micro-batch-size 1 \
--seq-length 1024 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--seed 42
#!/bin/bash
# SLURM options.
export SLURM_PARTITION=<slurm partition, used to feed -p option in slurm>
export SLURM_ACCOUNT=<slurm account, used to feed -A option in slurm>
# Source code.
export MEGATRON_CODE_DIR=<megatron source code directory>
# This variable is used to mount the relevant part of the filesystem
# inside the docker container. Note that the `MEGATRON_CODE_DIR` and the
# launch directory already get mounted; this variable should be used to
# mount the directories that contain the data and tokenizer files.
export DOCKER_MOUNT_DIR=<megatron dataset and bpe tokenizer vocab path>
# Data and tokenizer files.
MEGATRON_DATA=<path to megatron processed data>
BPE_VOCAB_FILE=<path to bpe vocab file>
BPE_MERGE_FILE=<path to bpe merges file>
# Megatron input parameters.
# `MEGATRON_EXTRA_PARAMS` can be used to provide any extra parameters
# that are not listed here.
export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--micro-batch-size ${MBS} \
--global-batch-size ${GBS} \
--num-layers ${NLS} \
--hidden-size ${HS} \
--num-attention-heads ${NAH} \
--DDP-impl ${DDP} \
--data-path ${MEGATRON_DATA} \
--vocab-file ${BPE_VOCAB_FILE} \
--merge-file ${BPE_MERGE_FILE} \
--log-interval 5 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--train-iters 500 \
--lr-decay-iters 320 \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-decay-style cosine \
--lr-warmup-fraction 0.01 \
--split 969,30,1 \
--eval-iters 100 \
--eval-interval 1000 \
--clip-grad 1.0 \
--fp16 \
--loss-scale 8192 "
# Reproducing Figures in SC21 Paper
This directory contains some of the scripts that were used to produce the
results in the [Megatron paper](https://arxiv.org/pdf/2104.04473.pdf) that is
to appear at [SuperComputing 2021](https://sc21.supercomputing.org/). These
scripts use [Slurm](https://slurm.schedmd.com/documentation.html) with the
[pyxis plugin](https://github.com/NVIDIA/pyxis), but can be modified for other
schedulers as well.
## Git commit
To replicate these results, use Megatron-LM commit 6985e58938d40ad91ac07b0fddcfad8132e1447e.
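For example (assuming the public NVIDIA/Megatron-LM repository on GitHub):
```
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout 6985e58938d40ad91ac07b0fddcfad8132e1447e
```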
## Setup
All the cluster-dependent variables are in [`CONFIG.sh`](./CONFIG.sh). Please
update the unspecified values (in angle brackets `<...>`) before launching any
scripts.
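For illustration, a filled-in `CONFIG.sh` might begin as follows; all values below are placeholders, not recommended defaults:
```
# Placeholder values; replace with your cluster's actual partition, account, and paths.
export SLURM_PARTITION=batch
export SLURM_ACCOUNT=my_account
export MEGATRON_CODE_DIR=/path/to/Megatron-LM
export DOCKER_MOUNT_DIR=/path/to/datasets-and-tokenizer
```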
## Scripts
Below is a list of scripts that can be used to reproduce various figures in our
[paper](https://arxiv.org/pdf/2104.04473.pdf):
* [run_table_1.sh](./run_table_1.sh): Table 1 showing weak-scaling throughput
for GPT models ranging from 1 billion to 1 trillion parameters.
* [run_figure_11.sh](./run_figure_11.sh): Figure 11 showing the weak-scaling
performance of pipeline parallelism.
* [run_figure_12.sh](./run_figure_12.sh): Figure 12 showing the effect of
the interleaved schedule on a 175B GPT model.
* [run_figure_13.sh](./run_figure_13.sh): Figure 13 showing the effect of
different degrees of pipeline and tensor model parallelism on a model with
162.2 billion parameters.
* [run_figure_14.sh](./run_figure_14.sh): Figure 14 showing the effect of
different degrees of data and pipeline model parallelism on a model with
5.9 billion parameters.
* [run_figure_15.sh](./run_figure_15.sh): Figure 15 showing the effect of
different degrees of data and tensor model parallelism on a model with
5.9 billion parameters.
* [run_figure_16.sh](./run_figure_16.sh): Figure 16 showing the effect of
microbatch size.
* [run_figure_17.sh](./run_figure_17.sh): Figure 17 showing the effect of
activation recomputation.
* [run_figure_18.sh](./run_figure_18.sh): Figure 18 showing the effect of
the scatter-gather communication optimization.
#!/bin/bash
sbatch -p ${SLURM_PARTITION} \
-A ${SLURM_ACCOUNT} \
--job-name=${JOB_NAME} \
--nodes=${NNODES} \
--export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh
exit 0
#!/bin/bash
#SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8
THIS_DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p ${THIS_DIR}/logs
CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}"
srun -l \
--container-image "nvcr.io#nvidia/pytorch:20.12-py3" \
--container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \
--output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c "${CMD}"