Commit e4575be9 authored by huaerkl

v1.0
[tool.black]
line-length = 119
target-version = ['py35']
absl-py==1.4.0
aiohttp==3.8.5
aiosignal==1.3.1
#apex @ file:///public/software/apps/DeepLearning/whl/dtk-23.04/apex/torch-1.10/apex-0.1%2Bf49ddd4.abi0.dtk2304.torch1.10-cp38-cp38-manylinux2014_x86_64.whl#sha256=938b1ac70165ace45ee218d1181d409a172ef873e8e640c8ae794a9758260eae
appdirs==1.4.4
async-timeout==4.0.2
attrs==23.1.0
black==21.4b0
cachetools==5.3.1
certifi==2023.5.7
charset-normalizer==3.2.0
click==8.1.6
datasets==2.13.1
#deepspeed @ file:///public/software/apps/DeepLearning/whl/dtk-23.04/deepspeed/torch-1.10/deepspeed-0.9.2%2Bgit25d5540.abi0.dtk2304.torch1.10.0-cp38-cp38-manylinux2014_x86_64.whl#sha256=c400f0fc214e8bcac44dde763fb7459ee1f7461049962ef456e1801cf9610bbb
dill==0.3.6
einops==0.6.1
filelock==3.12.2
frozenlist==1.4.0
fsspec==2023.6.0
google-auth==2.22.0
google-auth-oauthlib==1.0.0
grpcio==1.56.2
hjson==3.1.0
huggingface-hub==0.16.4
idna==3.4
importlib-metadata==6.8.0
isort==5.12.0
joblib==1.3.1
Markdown==3.4.3
MarkupSafe==2.1.3
mpi4py==3.1.4
multidict==6.0.4
multiprocess==0.70.14
mypy-extensions==1.0.0
ninja==1.11.1
nltk==3.8.1
numpy==1.23.5
oauthlib==3.2.2
packaging==23.1
pandas==2.0.3
parameterized==0.9.0
pathspec==0.11.1
Pillow==10.0.0
protobuf==4.23.4
psutil==5.9.5
py-cpuinfo==9.0.0
pyarrow==12.0.1
pyasn1==0.5.0
pyasn1-modules==0.3.0
pybind11==2.11.1
pydantic==1.10.11
python-dateutil==2.8.2
pytz==2023.3
PyYAML==6.0.1
regex==2023.6.3
requests==2.31.0
requests-oauthlib==1.3.1
rsa==4.9
safetensors==0.3.1
six==1.16.0
some-package==0.1
tensorboard==2.13.0
tensorboard-data-server==0.7.1
tokenizers==0.13.3
toml==0.10.2
#torch @ file:///public/software/apps/DeepLearning/whl/dtk-23.04/pytorch/torch-1.10/py38/torch-1.10.0%2Bgite378c3c.abi0.dtk2304-cp38-cp38-manylinux2014_x86_64.whl#sha256=0633dfa4814c2bd460e0bcd76c22c50e4824d21adf61274625ea2110f0063aea
#torchaudio @ file:///public/software/apps/DeepLearning/whl/dtk-23.04/pytorch/torch-1.10/py38/torchaudio-0.10.0%2Ba9847c3.abi0.dtk2304.torch1.10.0-cp38-cp38-manylinux2014_x86_64.whl#sha256=a26117aafe8e7e331f3a6f31e58a4d2959a8cd860b8cd3e1586d774922d7ff3e
#torchvision @ file:///public/software/apps/DeepLearning/whl/dtk-23.04/pytorch/torch-1.10/py38/torchvision-0.10.0%2Bgit48e6bbb.abi0.dtk2304.torch1.10-cp38-cp38-manylinux2014_x86_64.whl#sha256=1e8a3baddb1afe5c0c2373d6129e58a3877af6f942f28337a84af4835dad332b
tqdm==4.65.0
transformers==4.31.0
typing_extensions==4.7.1
tzdata==2023.3
urllib3==1.26.16
Werkzeug==2.3.6
xxhash==3.2.0
yarl==1.9.2
zipp==3.16.2
#!/bin/bash
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
#mkdir -p $DIR/logs
#mkdir -p /tmp/logs
#DATASET_1="<PATH TO THE FIRST DATASET>"
#DATASET_2="<PATH TO THE SECOND DATASET>"
#DATASET_3="<PATH TO THE THIRD DATASET>"
#DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}"
BASE_DATA_PATH=/data/Megatron-LM/data
DATASET=${BASE_DATA_PATH}/indexed_datasets/megatron
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
#CONFIG_JSON="$script_dir/ds_config.json"
CONFIG_JSON="/tmp/ds_config.json"
USE_DEEPSPEED=1
ZERO_STAGE=0
# Debug
#TP=4
#PP=4
#LAYERS=8
#HIDDEN=512
#SEQ=1024
#GLOBAL_BATCH=128
#WORKER_STR="-i worker-0"
TP=1
PP=2
HIDDEN=1024
LAYERS=24
SEQ=1024
GLOBAL_BATCH=2
WORKER_STR=""
MICRO_BATCH=1
DTYPE="bf16"
LOG_DIR="/tmp/tensorboard/tp${TP}_pp${PP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_${DTYPE}_fix3"
mkdir -p $LOG_DIR
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
--no-deepspeed)
USE_DEEPSPEED=0;
shift
;;
-z|--zero-stage)
ZERO_STAGE=$2;
shift
;;
*)
echo "Unknown argument(s)"
usage
exit 1
shift
;;
esac
done
options=" \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--num-layers $LAYERS \
--hidden-size $HIDDEN \
--num-attention-heads 32 \
--seq-length $SEQ \
--loss-scale 12 \
--max-position-embeddings $SEQ \
--micro-batch-size $MICRO_BATCH \
--global-batch-size $GLOBAL_BATCH \
--train-iters 1000 \
--lr 6.0e-5 \
--min-lr 6.0e-6 \
--lr-decay-style cosine \
--log-interval 1 \
--eval-iters 40 \
--eval-interval 1000 \
--data-path ${DATASET} \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--save-interval 10000 \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.006 \
--${DTYPE} \
--checkpoint-activations \
--exit-interval 10000 \
--tensorboard-dir $LOG_DIR
"
if [[ ${USE_DEEPSPEED} -eq 1 ]]; then
echo "Using DeepSpeed"
options="${options} \
--deepspeed \
--deepspeed_config=${CONFIG_JSON} \
--zero-stage=${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
fi
cat <<EOT > $CONFIG_JSON
{
"train_batch_size" : $GLOBAL_BATCH,
"train_micro_batch_size_per_gpu": $MICRO_BATCH,
"steps_per_print": 1,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"bf16": {
"enabled": true
},
"fp16": {
"enabled": false,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"wall_clock_breakdown" : true
}
EOT
WORKER_STR="-i worker-0:0,1"
#run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}"
#run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py $@ ${options}"
run_cmd="deepspeed $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}"
echo ${run_cmd}
eval ${run_cmd}
set +x
#!/bin/bash
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
#mkdir -p $DIR/logs
#mkdir -p /tmp/logs
#DATASET_1="<PATH TO THE FIRST DATASET>"
#DATASET_2="<PATH TO THE SECOND DATASET>"
#DATASET_3="<PATH TO THE THIRD DATASET>"
#DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}"
#BASE_DATA_PATH=tests/data/gpt2
#DATASET=${BASE_DATA_PATH}/meg-gpt2-openwebtext_text_document
#VOCAB_PATH=${BASE_DATA_PATH}/gpt2-tiny-vocab.json
#MERGE_PATH=${BASE_DATA_PATH}/gpt2-tiny-merges.txt
BASE_DATA_PATH=/vc_data/Megatron-LM/data
DATASET=${BASE_DATA_PATH}/indexed_datasets/megatron
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
CONFIG_JSON="$script_dir/ds_config.json"
#CONFIG_JSON="/tmp/ds_config.json"
USE_DEEPSPEED=1
ZERO_STAGE=0
#TP=4
#PP=4
# Debug
DEBUG_MODE=0
if [[ $DEBUG_MODE == 1 ]]; then
LAYERS=4
HIDDEN=512
SEQ=512
EXIT_INTERVAL=3
else
HIDDEN=1024
LAYERS=24
SEQ=1024
EXIT_INTERVAL=10
fi
TP=2
PP=2
DP=4
WORLD_SIZE=$((TP*PP*DP))
GLOBAL_BATCH=4
MICRO_BATCH=1
TRAIN_ITERS=100000
CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP}
LOAD_CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP}
LR=6.0e-4
MIN_LR=6.0e-5
DTYPE="bf16"
EXP_DIR=${HOME}/experiments/results/ckpt_reshape
LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_cont"
mkdir -p $LOG_DIR
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
--no-deepspeed)
USE_DEEPSPEED=0;
shift
;;
-z|--zero-stage)
ZERO_STAGE=$2;
shift
;;
*)
echo "Unknown argument(s)"
usage
exit 1
shift
;;
esac
done
options=" \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--num-layers $LAYERS \
--hidden-size $HIDDEN \
--num-attention-heads 32 \
--seq-length $SEQ \
--loss-scale 12 \
--max-position-embeddings $SEQ \
--micro-batch-size $MICRO_BATCH \
--global-batch-size $GLOBAL_BATCH \
--train-iters $TRAIN_ITERS \
--lr $LR \
--min-lr $MIN_LR \
--lr-decay-style cosine \
--log-interval 1 \
--eval-iters 40 \
--eval-interval 10 \
--data-path ${DATASET} \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--save-interval 1000 \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.006 \
--${DTYPE} \
--checkpoint-activations \
--exit-interval ${EXIT_INTERVAL} \
--save ${CHECKPOINT_PATH} \
--load ${LOAD_CHECKPOINT_PATH} \
--position-embedding-type alibi \
--override-lr-scheduler \
--embed-layernorm \
--tensorboard-dir $LOG_DIR
"
if [[ ${USE_DEEPSPEED} -eq 1 ]]; then
echo "Using DeepSpeed"
options="${options} \
--deepspeed \
--deepspeed_config=${CONFIG_JSON} \
--zero-stage=${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
fi
cat <<EOT > $CONFIG_JSON
{
"train_batch_size" : $GLOBAL_BATCH,
"train_micro_batch_size_per_gpu": $MICRO_BATCH,
"steps_per_print": 1,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"bf16": {
"enabled": true
},
"fp16": {
"enabled": false,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"wall_clock_breakdown" : true
}
EOT
#WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE"
#WORKER_STR="-i worker-0:0,1,2,3"
#run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}"
#run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py $@ ${options}"
run_cmd="deepspeed --master_port 29700 $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}"
echo ${run_cmd}
eval ${run_cmd}
set +x
#!/bin/bash
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
#mkdir -p $DIR/logs
#mkdir -p /tmp/logs
#DATASET_1="<PATH TO THE FIRST DATASET>"
#DATASET_2="<PATH TO THE SECOND DATASET>"
#DATASET_3="<PATH TO THE THIRD DATASET>"
#DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}"
BASE_DATA_PATH=/data/Megatron-LM/data
DATASET=${BASE_DATA_PATH}/indexed_datasets/megatron
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
#CONFIG_JSON="$script_dir/ds_config.json"
CONFIG_JSON="/tmp/ds_config.json"
USE_DEEPSPEED=1
ZERO_STAGE=0
# Debug
#TP=4
#PP=4
#LAYERS=8
#HIDDEN=512
#SEQ=1024
#GLOBAL_BATCH=128
#WORKER_STR="-i worker-0"
TP=1
PP=1
DP=2
WORLD_SIZE=$((TP*PP*DP))
HIDDEN=1024
LAYERS=24
SEQ=1024
GLOBAL_BATCH=1
WORKER_STR=""
MICRO_BATCH=1
LR=6.0e-4
MIN_LR=6.0e-5
DTYPE="fp16"
EXP_DIR=${HOME}/experiments/results/bf16
LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_fix3"
mkdir -p $LOG_DIR
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
--no-deepspeed)
USE_DEEPSPEED=0;
shift
;;
-z|--zero-stage)
ZERO_STAGE=$2;
shift
;;
*)
echo "Unknown argument(s)"
usage
exit 1
shift
;;
esac
done
options=" \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--num-layers $LAYERS \
--hidden-size $HIDDEN \
--num-attention-heads 32 \
--seq-length $SEQ \
--loss-scale 12 \
--max-position-embeddings $SEQ \
--micro-batch-size $MICRO_BATCH \
--global-batch-size $GLOBAL_BATCH \
--train-iters 1000 \
--lr $LR \
--min-lr $MIN_LR \
--lr-decay-style cosine \
--log-interval 1 \
--eval-iters 40 \
--eval-interval 10 \
--data-path ${DATASET} \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--save-interval 10000 \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.006 \
--${DTYPE} \
--checkpoint-activations \
--exit-interval 10000 \
--tensorboard-dir $LOG_DIR
"
if [[ ${USE_DEEPSPEED} -eq 1 ]]; then
echo "Using DeepSpeed"
options="${options} \
--deepspeed \
--deepspeed_config=${CONFIG_JSON} \
--zero-stage=${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
fi
cat <<EOT > $CONFIG_JSON
{
"train_batch_size" : $GLOBAL_BATCH,
"train_micro_batch_size_per_gpu": $MICRO_BATCH,
"steps_per_print": 1,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"bf16": {
"enabled": false
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 8
},
"wall_clock_breakdown" : true
}
EOT
WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE"
#WORKER_STR="-i worker-0:0,1,2,3"
#run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}"
#run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py $@ ${options}"
run_cmd="deepspeed --master_port 29600 $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}"
echo ${run_cmd}
eval ${run_cmd}
set +x
#!/bin/bash
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
#mkdir -p $DIR/logs
#mkdir -p /tmp/logs
#DATASET_1="<PATH TO THE FIRST DATASET>"
#DATASET_2="<PATH TO THE SECOND DATASET>"
#DATASET_3="<PATH TO THE THIRD DATASET>"
#DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}"
#BASE_DATA_PATH=tests/data/gpt2
#DATASET=${BASE_DATA_PATH}/meg-gpt2-openwebtext_text_document
#VOCAB_PATH=${BASE_DATA_PATH}/gpt2-tiny-vocab.json
#MERGE_PATH=${BASE_DATA_PATH}/gpt2-tiny-merges.txt
BASE_DATA_PATH=/vc_data/Megatron-LM/data
DATASET=${BASE_DATA_PATH}/indexed_datasets/megatron
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
CONFIG_JSON="$script_dir/ds_config.json"
#CONFIG_JSON="/tmp/ds_config.json"
USE_DEEPSPEED=1
ZERO_STAGE=0
#TP=4
#PP=4
# Debug
DEBUG_MODE=0
if [[ $DEBUG_MODE == 1 ]]; then
LAYERS=4
HIDDEN=512
SEQ=512
EXIT_INTERVAL=3
else
HIDDEN=1024
LAYERS=24
SEQ=1024
EXIT_INTERVAL=10
fi
TP=2
PP=2
DP=4
WORLD_SIZE=$((TP*PP*DP))
GLOBAL_BATCH=4
MICRO_BATCH=1
TRAIN_ITERS=100000
CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP}
LOAD_CHECKPOINT_PATH=checkpoints/gpt2/tp2_pp2_dp4
LR=6.0e-4
MIN_LR=6.0e-5
DTYPE="bf16"
EXP_DIR=${HOME}/experiments/results/ckpt_reshape
LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_uni"
mkdir -p $LOG_DIR
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
--no-deepspeed)
USE_DEEPSPEED=0;
shift
;;
-z|--zero-stage)
ZERO_STAGE=$2;
shift
;;
*)
echo "Unknown argument(s)"
usage
exit 1
shift
;;
esac
done
options=" \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--num-layers $LAYERS \
--hidden-size $HIDDEN \
--num-attention-heads 32 \
--seq-length $SEQ \
--loss-scale 12 \
--max-position-embeddings $SEQ \
--micro-batch-size $MICRO_BATCH \
--global-batch-size $GLOBAL_BATCH \
--train-iters $TRAIN_ITERS \
--lr $LR \
--min-lr $MIN_LR \
--lr-decay-style cosine \
--log-interval 1 \
--eval-iters 40 \
--eval-interval 10 \
--data-path ${DATASET} \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--save-interval 1000 \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.006 \
--${DTYPE} \
--checkpoint-activations \
--exit-interval ${EXIT_INTERVAL} \
--save ${CHECKPOINT_PATH} \
--load ${LOAD_CHECKPOINT_PATH} \
--universal-checkpoint \
--position-embedding-type alibi \
--override-lr-scheduler \
--embed-layernorm \
--tensorboard-dir $LOG_DIR
"
if [[ ${USE_DEEPSPEED} -eq 1 ]]; then
echo "Using DeepSpeed"
options="${options} \
--deepspeed \
--deepspeed_config=${CONFIG_JSON} \
--zero-stage=${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
fi
cat <<EOT > $CONFIG_JSON
{
"train_batch_size" : $GLOBAL_BATCH,
"train_micro_batch_size_per_gpu": $MICRO_BATCH,
"steps_per_print": 1,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"bf16": {
"enabled": true
},
"fp16": {
"enabled": false,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"wall_clock_breakdown" : true
}
EOT
#WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE"
#WORKER_STR="-i worker-0:0,1,2,3"
#run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}"
#run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py $@ ${options}"
run_cmd="deepspeed --master_port 29700 $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}"
echo ${run_cmd}
eval ${run_cmd}
set +x
# Scripts
This directory should be organized with sub-folders for the different kinds of scripts.
# Inference scripts for BLOOM
Moved to https://github.com/huggingface/transformers-bloom-inference/tree/main/bloom-inference-scripts
## Inference solutions for BLOOM 176B
Moved to https://github.com/huggingface/transformers-bloom-inference/tree/main/bloom-inference-server
import json
import argparse
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--dir',
type=str,
required=True,
help='directory to save data'
)
args = parser.parse_args()
return args
def main():
args = get_args()
for i in range(10):
row_limit = 1000
rows_to_save = [{'text': ''.join([str(i)+'-*']*128)}]
with open('{}/dataset_{}.json'.format(args.dir, i), 'w') as f:
f.write(
'\n'.join(json.dumps(_i) for _i in rows_to_save*row_limit)
)
if __name__ == '__main__':
main()
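# Example usage (this is how the sampling setup script further below invokes it):
#   python create_dummy_dataset.py --dir /tmp
# Each resulting dataset_i.json then holds 1000 identical JSON lines whose "text"
# field repeats the digit i in 128 "i-*" chunks, e.g. a line of dataset_3.json:
#   {"text": "3-*3-*3-* ... 3-*"}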
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Processing data for pretraining."""
import argparse
import json
import multiprocessing
import os
import sys
from megatron.data.indexed_dataset import best_fitting_dtype
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
os.path.pardir)))
import time
import torch
try:
import nltk
nltk_available = True
except ImportError:
nltk_available = False
from megatron.tokenizer import build_tokenizer
from megatron.data import indexed_dataset
# https://stackoverflow.com/questions/33139531/preserve-empty-lines-with-nltks-punkt-tokenizer
class CustomLanguageVars(nltk.tokenize.punkt.PunktLanguageVars):
_period_context_fmt = r"""
\S* # some word material
%(SentEndChars)s # a potential sentence ending
\s* # <-- THIS is what I changed
(?=(?P<after_tok>
%(NonWord)s # either other punctuation
|
(?P<next_tok>\S+) # <-- Normally you would have \s+ here
))"""
class IdentitySplitter(object):
def tokenize(self, *text):
return text
class Encoder(object):
def __init__(self, args):
self.args = args
def initializer(self):
# Use Encoder class as a container for global data
Encoder.tokenizer = build_tokenizer(self.args)
if self.args.split_sentences:
if not nltk_available:
print("NLTK is not available to split sentences.")
exit()
splitter = nltk.load("tokenizers/punkt/english.pickle")
if self.args.keep_newlines:
# this prevents punkt from eating newlines after sentences
Encoder.splitter = nltk.tokenize.punkt.PunktSentenceTokenizer(
train_text = splitter._params,
lang_vars = CustomLanguageVars())
else:
Encoder.splitter = splitter
else:
Encoder.splitter = IdentitySplitter()
def encode(self, json_line):
data = json.loads(json_line)
ids = {}
for key in self.args.json_keys:
text = data[key]
doc_ids = []
for sentence in Encoder.splitter.tokenize(text):
sentence_ids = Encoder.tokenizer.tokenize(sentence)
if len(sentence_ids) > 0:
doc_ids.append(sentence_ids)
if len(doc_ids) > 0 and self.args.append_eod:
doc_ids[-1].append(Encoder.tokenizer.eod)
ids[key] = doc_ids
return ids, len(json_line)
def get_args():
parser = argparse.ArgumentParser()
group = parser.add_argument_group(title='input data')
group.add_argument('--input', type=str,
help='Path to input JSON')
group.add_argument('--datasets', nargs='+', default=None,
help='Paths to one or more input datasets to merge')
group.add_argument('--json-keys', nargs='+', default=['text'],
help='space-separated list of keys to extract from the JSON')
group.add_argument('--split-sentences', action='store_true',
help='Split documents into sentences.')
group.add_argument('--keep-newlines', action='store_true',
help='Keep newlines between sentences when splitting.')
group = parser.add_argument_group(title='tokenizer')
group.add_argument('--tokenizer-type', type=str, required=True,
choices=['BertWordPieceLowerCase','BertWordPieceCase',
'GPT2BPETokenizer', 'PretrainedFromHF'],
help='What type of tokenizer to use.')
group.add_argument('--vocab-file', type=str, default=None,
help='Path to the vocab file')
group.add_argument('--merge-file', type=str, default=None,
help='Path to the BPE merge file (if necessary).')
group.add_argument('--append-eod', action='store_true',
help='Append an <eod> token to the end of a document.')
group.add_argument("--tokenizer-name-or-path", type=str, default=None,
help="Name or path of the huggingface tokenizer.")
group = parser.add_argument_group(title='output data')
group.add_argument('--output-prefix', type=str, required=True,
help='Path to binary output file without suffix')
group.add_argument('--dataset-impl', type=str, default='mmap',
choices=['lazy', 'cached', 'mmap'])
group = parser.add_argument_group(title='runtime')
group.add_argument('--workers', type=int, default=1,
help='Number of worker processes to launch')
group.add_argument('--log-interval', type=int, default=100,
help='Interval between progress updates')
args = parser.parse_args()
args.keep_empty = False
if args.tokenizer_type.lower().startswith('bert'):
if not args.split_sentences:
print("Bert tokenizer detected, are you sure you don't want to split sentences?")
# some default/dummy values for the tokenizer
args.rank = 0
args.make_vocab_size_divisible_by = 128
args.tensor_model_parallel_size = 1
args.vocab_extra_ids = 0
return args
def main():
args = get_args()
startup_start = time.time()
print("Opening", args.input)
fin = open(args.input, 'r', encoding='utf-8')
if nltk_available and args.split_sentences:
nltk.download("punkt", quiet=True)
encoder = Encoder(args)
tokenizer = build_tokenizer(args)
pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer)
encoded_docs = pool.imap(encoder.encode, fin, 25)
#encoded_docs = map(encoder.encode, fin)
level = "document"
if args.split_sentences:
level = "sentence"
print(f"Vocab size: {tokenizer.vocab_size}")
print(f"Output prefix: {args.output_prefix}")
output_bin_files = {}
output_idx_files = {}
builders = {}
for key in args.json_keys:
output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix,
key, level)
output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix,
key, level)
builders[key] = indexed_dataset.make_builder(output_bin_files[key],
impl=args.dataset_impl,
dtype=best_fitting_dtype(tokenizer.vocab_size))
startup_end = time.time()
proc_start = time.time()
total_bytes_processed = 0
print("Time to startup:", startup_end - startup_start)
for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1):
total_bytes_processed += bytes_processed
for key, sentences in doc.items():
if len(sentences) == 0:
continue
for sentence in sentences:
builders[key].add_item(torch.IntTensor(sentence))
builders[key].end_document()
if i % args.log_interval == 0:
current = time.time()
elapsed = current - proc_start
mbs = total_bytes_processed/elapsed/1024/1024
print(f"Processed {i} documents",
f"({i/elapsed} docs/s, {mbs} MB/s).",
file=sys.stderr)
for key in args.json_keys:
builders[key].finalize(output_idx_files[key])
if __name__ == '__main__':
main()
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pretrain GPT"""
import torch
from functools import partial
from megatron import get_args
from megatron import print_rank_0
from megatron import get_timers
from megatron import get_tokenizer
from megatron import mpu
from megatron.data.gpt_dataset import build_train_valid_test_datasets
from megatron.model import GPTModel, GPTModelPipe
from megatron.training import pretrain
from megatron.utils import get_ltor_masks_and_position_ids
from megatron.utils import average_losses_across_data_parallel_group
import deepspeed
from deepspeed.runtime.utils import see_memory_usage
import os
import subprocess
collected_sample = {}
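# The mapping below relies on the dummy data from create_dummy_dataset.py: every
# document in dataset_i repeats the digit str(i), and with the standard gpt2 BPE
# vocab the single-character digit tokens '0'..'9' have ids 15..24, so the first
# digit token found in a sample identifies which dataset it was drawn from (see
# the counting loop in forward_step below).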
token_dict = {
15: 0,
16: 1,
17: 2,
18: 3,
19: 4,
20: 5,
21: 6,
22: 7,
23: 8,
24: 9,
}
def model_provider(pre_process=True, post_process=True):
"""Build the model."""
print_rank_0('building GPT model ...')
see_memory_usage(f"Before Building Model", force=True)
args = get_args()
with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(),
remote_device=None if args.remote_device=='none' else args.remote_device,
config=args.deepspeed_config,
enabled=args.zero_stage==3):
if args.deepspeed and mpu.get_pipeline_model_parallel_world_size() > 1:
model = GPTModelPipe(
num_tokentypes=0,
parallel_output=True
)
# This is a hack to give us a reference to get_batch_pipe from within training.py
# We need to call model.set_batch_fn after deepspeed.initialize
model._megatron_batch_fn = get_batch_pipe
# Precompute the attention mask and store it in args. This avoids having to
# pipeline it as an activation during training. The mask is constant, and thus
# we can reuse it.
attention_mask = torch.tril(torch.ones(
(1, args.seq_length, args.seq_length), device=torch.cuda.current_device())).view(
1, 1, args.seq_length, args.seq_length)
# Convert attention mask to binary:
attention_mask = (attention_mask < 0.5)
if args.fp16:
attention_mask = attention_mask.half()
elif args.bf16:
attention_mask = attention_mask.bfloat16()
args.attn_mask = attention_mask
else:
model = GPTModel(
num_tokentypes=0,
parallel_output=True,
pre_process=pre_process,
post_process=post_process
)
see_memory_usage(f"After Building Model", force=True)
return model
def get_batch(data_iterator):
"""Generate a batch"""
args = get_args()
tokenizer = get_tokenizer()
# Items and their type.
keys = ['text']
datatype = torch.int64
# Broadcast data.
if data_iterator is not None:
data = next(data_iterator)
else:
data = None
data_b = mpu.broadcast_data(keys, data, datatype)
# Unpack.
# tokens_ = data_b['text'].long()
tokens_ = data_b[keys[0]].long()
labels = tokens_[:, 1:].contiguous()
tokens = tokens_[:, :-1].contiguous()
# Get the masks and position ids.
attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
tokens,
tokenizer.eod,
args.reset_position_ids,
args.reset_attention_mask,
args.eod_mask_loss)
return tokens, labels, loss_mask, attention_mask, position_ids
def get_batch_pipe(data):
"""Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`"""
args = get_args()
tokenizer = get_tokenizer()
# Items and their type.
keys = ['text']
datatype = torch.int64
# Broadcast data.
data_b = mpu.broadcast_data(keys, data, datatype)
# Unpack.
tokens_ = data_b['text'].long()
labels = tokens_[:, 1:].contiguous()
tokens = tokens_[:, :-1].contiguous()
# Get the masks and position ids.
attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
tokens,
tokenizer.eod,
args.reset_position_ids,
args.reset_attention_mask,
args.eod_mask_loss)
return (tokens, position_ids, attention_mask), (labels, loss_mask)
def loss_func(loss_mask, output_tensor):
losses = output_tensor.float()
loss_mask = loss_mask.view(-1).float()
loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
# Reduce loss for logging.
averaged_loss = average_losses_across_data_parallel_group([loss])
return loss, {'lm loss': averaged_loss[0]}
def forward_step(data_iterator, model):
"""Forward step."""
args = get_args()
timers = get_timers()
# Get the batch.
timers('batch-generator').start()
tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
data_iterator)
timers('batch-generator').stop()
for row in tokens.detach().cpu():
token_idx = [i for i in row.tolist() if i in list(token_dict.keys())][0]
num = "dataset_{}".format(token_dict[token_idx])
if num in collected_sample:
collected_sample[num] += 1
else:
collected_sample[num] = 1
print_rank_0(collected_sample)
output_tensor = model(tokens, position_ids, attention_mask,
labels=labels)
return output_tensor, partial(loss_func, loss_mask)
def train_valid_test_datasets_provider(train_val_test_num_samples):
"""Build train, valid, and test datasets."""
args = get_args()
print_rank_0('> building train, validation, and test datasets '
'for GPT ...')
train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
data_prefix=args.data_path,
data_impl=args.data_impl,
splits_string=args.split,
train_valid_test_num_samples=train_val_test_num_samples,
seq_length=args.seq_length,
seed=args.seed,
skip_warmup=(not args.mmap_warmup))
print_rank_0("> finished creating GPT datasets ...")
return train_ds, valid_ds, test_ds
def command_exists(cmd):
result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True)
return result.wait() == 0
def git_ds_info():
from deepspeed.env_report import main as ds_report
ds_report()
# Write out version/git info
git_hash_cmd = "git rev-parse --short HEAD"
git_branch_cmd = "git rev-parse --abbrev-ref HEAD"
if command_exists('git'):
try:
result = subprocess.check_output(git_hash_cmd, shell=True)
git_hash = result.decode('utf-8').strip()
result = subprocess.check_output(git_branch_cmd, shell=True)
git_branch = result.decode('utf-8').strip()
except subprocess.CalledProcessError:
git_hash = "unknown"
git_branch = "unknown"
else:
git_hash = "unknown"
git_branch = "unknown"
print(f'**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****')
if __name__ == "__main__":
git_ds_info()
pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})
#!/bin/bash
BASE_DATA_PATH=/tmp
INPUT_PATH=/tmp
OUTPUT_PATH=/tmp
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json -P ${BASE_DATA_PATH}
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -P ${BASE_DATA_PATH}
python create_dummy_dataset.py --dir ${INPUT_PATH}
python preprocess_data.py \
--input ${INPUT_PATH}/dataset_0.json \
--output-prefix ${OUTPUT_PATH}/dataset-0 \
--vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
--append-eod
python preprocess_data.py \
--input ${INPUT_PATH}/dataset_1.json \
--output-prefix ${OUTPUT_PATH}/dataset-1 \
--vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
--append-eod
python preprocess_data.py \
--input ${INPUT_PATH}/dataset_2.json \
--output-prefix ${OUTPUT_PATH}/dataset-2 \
--vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
--append-eod
python preprocess_data.py \
--input ${INPUT_PATH}/dataset_3.json \
--output-prefix ${OUTPUT_PATH}/dataset-3 \
--vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
--append-eod
python preprocess_data.py \
--input ${INPUT_PATH}/dataset_4.json \
--output-prefix ${OUTPUT_PATH}/dataset-4 \
--vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
--append-eod
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p ${BASE_DATA_PATH}/logs
DATASET_0="${OUTPUT_PATH}/dataset-0_text_document"
DATASET_1="${OUTPUT_PATH}/dataset-1_text_document"
DATASET_2="${OUTPUT_PATH}/dataset-2_text_document"
DATASET_3="${OUTPUT_PATH}/dataset-3_text_document"
DATASET_4="${OUTPUT_PATH}/dataset-4_text_document"
DATASET="0.1 ${DATASET_0} 0.25 ${DATASET_1} 0.2 ${DATASET_2} 0.15 ${DATASET_3} 0.3 ${DATASET_4}"
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
CONFIG_JSON="$script_dir/ds_config.json"
USE_DEEPSPEED=1
ZERO_STAGE=0
# Debug
#TP=4
#PP=4
#LAYERS=8
#HIDDEN=512
#SEQ=1024
#GLOBAL_BATCH=128
#WORKER_STR="-i worker-0"
# 52B
TP=4
PP=16
HIDDEN=1024
LAYERS=24
SEQ=128
GLOBAL_BATCH=16
WORKER_STR=""
MICRO_BATCH=8
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
--no-deepspeed)
USE_DEEPSPEED=0;
shift
;;
-z|--zero-stage)
ZERO_STAGE=$2;
shift
;;
*)
echo "Unknown argument(s)"
usage
exit 1
shift
;;
esac
done
options=" \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--num-layers $LAYERS \
--hidden-size $HIDDEN \
--num-attention-heads 32 \
--seq-length $SEQ \
--loss-scale 12 \
--max-position-embeddings $SEQ \
--micro-batch-size $MICRO_BATCH \
--global-batch-size $GLOBAL_BATCH \
--train-iters 1000 \
--lr 6.0e-5 \
--min-lr 6.0e-6 \
--lr-decay-style cosine \
--log-interval 1 \
--eval-iters 40 \
--eval-interval 1000 \
--data-path ${DATASET} \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--save-interval 1000 \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.006 \
--fp16 \
--checkpoint-activations
"
if [[ ${USE_DEEPSPEED} -eq 1 ]]; then
echo "Using DeepSpeed"
options="${options} \
--deepspeed \
--deepspeed_config=${CONFIG_JSON} \
--zero-stage=${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
fi
cat <<EOT > $CONFIG_JSON
{
"train_batch_size" : $GLOBAL_BATCH,
"train_micro_batch_size_per_gpu": $MICRO_BATCH,
"steps_per_print": 1,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"gradient_clipping": 1.0,
"prescale_gradients": true,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"wall_clock_breakdown" : true
}
EOT
# run_cmd="deepspeed $WORKER_STR ${DIR}/test_sampling.py $@ ${options}"
run_cmd="deepspeed $WORKER_STR test_sampling.py $@ ${options}"
echo ${run_cmd}
eval ${run_cmd}
set +x
[isort]
default_section = FIRSTPARTY
ensure_newline_before_comments = True
force_grid_wrap = 0
include_trailing_comma = True
known_first_party = megatron
known_third_party =
apex
codecarbon
datasets
deepspeed
git
nltk
numpy
pytest
tensorboard
torch
tqdm
transformers
line_length = 119
lines_after_imports = 2
multi_line_output = 3
use_parentheses = True
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Setup for pip package."""
import os
import sys
import setuptools
if sys.version_info < (3,):
raise Exception("Python 2 is not supported by Megatron.")
from megatron.package_info import (
__description__,
__contact_names__,
__url__,
__download_url__,
__keywords__,
__license__,
__package_name__,
__version__,
)
with open("README.md", "r") as fh:
long_description = fh.read()
###############################################################################
# Dependency Loading #
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% #
def req_file(filename):
with open(filename) as f:
content = f.readlines()
return [x.strip() for x in content]
install_requires = req_file("requirements.txt")
setuptools.setup(
name=__package_name__,
# Versions should comply with PEP440. For a discussion on single-sourcing
# the version across setup.py and the project code, see
# https://packaging.python.org/en/latest/single_source_version.html
version=__version__,
description=__description__,
long_description=long_description,
long_description_content_type="text/markdown",
# The project's main homepage.
url=__url__,
author=__contact_names__,
maintainer=__contact_names__,
# The licence under which the project is released
license=__license__,
classifiers=[
'Intended Audience :: Developers',
'Intended Audience :: Science/Research',
'Intended Audience :: Information Technology',
# Indicate what your project relates to
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Software Development :: Libraries :: Python Modules',
# Supported python versions
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
# Additional Setting
'Environment :: Console',
'Natural Language :: English',
'Operating System :: OS Independent',
],
python_requires='>=3.6',
packages=setuptools.find_packages(),
install_requires=install_requires,
# Add in any packaged data.
include_package_data=True,
zip_safe=False,
# PyPI package information.
keywords=__keywords__
)
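# A minimal usage sketch (standard pip commands, run from the repository root):
#   pip install .      # regular install
#   pip install -e .   # editable install for development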
# Fast Setup instructions
This quick setup guide covers 3 steps:
1. installing software
2. preparing data
3. running the script
This is useful if you need to ask someone to reproduce problems with `Megatron-DeepSpeed`.
## 1. Software
Please follow this exact order.
0. Create a new conda env if need be or activate an existing environment.
1. Install `pytorch`. Choose the desired version from the install instructions [here](https://pytorch.org/get-started/locally/); for conda it'd be:
```
conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch
```
2. Install system-wide `cuda` if you don't have it already. [NVIDIA instructions](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html). Of course ideally use [the premade packages for your distro](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#package-manager-installation).
Use the same major version as pytorch's cuda build. To check use:
```
python -c 'import torch; print(f"pt={torch.__version__}, cuda={torch.version.cuda}")'
```
The minor versions don't actually have to match, but then you will need to hack the `apex` installer to ignore minor version changes; see below.
3. Install `apex`
```
git clone https://github.com/NVIDIA/apex
cd apex
pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check . 2>&1 | tee build.log
cd -
```
If the pytorch and system-wide cuda minor versions don't match, it's not a problem; you just need to bypass `apex`'s version check by applying this patch before building:
```
diff --git a/setup.py b/setup.py
index d76e998..f224dae 100644
--- a/setup.py
+++ b/setup.py
@@ -31,6 +31,8 @@ def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
print(raw_output + "from " + cuda_dir + "/bin\n")
if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor):
+ # allow minor diffs
+ if bare_metal_minor != torch_binary_minor: return
raise RuntimeError(
"Cuda extensions are being compiled with a version of Cuda that does "
"not match the version used to compile Pytorch binaries. "
```
4. Check out `Megatron-DeepSpeed` and install its requirements
```
git clone https://github.com/bigscience-workshop/Megatron-DeepSpeed
cd Megatron-DeepSpeed
pip install -r requirements.txt
```
## 2. Data
We will work inside the `Megatron-DeepSpeed` clone:
```
cd Megatron-DeepSpeed
```
Prepare data for preprocessing
```
mkdir -p data
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json -O data/gpt2-vocab.json
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -O data/gpt2-merges.txt
python -c 'from datasets import load_dataset; ds = load_dataset("stas/oscar-en-10k", split="train", keep_in_memory=False); ds.to_json(f"data/oscar-en-10k.jsonl", orient="records", lines=True, force_ascii=False)'
```
Pre-process a small dataset to be used for training
```
python tools/preprocess_data.py \
--input data/oscar-en-10k.jsonl \
--output-prefix data/meg-gpt2-oscar-en-10k \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file data/gpt2-merges.txt \
--vocab data/gpt2-vocab.json \
--append-eod \
--workers 4
```
You now have `data/meg-gpt2-oscar-en-10k`, plus the vocab and merges files, to pass as arguments to training; the next section shows how to use them.
Note that Megatron expects the `data/meg-gpt2-oscar-en-10k_text_document` prefix later in `--data-path`.
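For reference, here is a minimal sketch of what the preprocessing step leaves on disk and how the prefix is referenced later (the listing is the expected layout, not output captured from a real run):
```
ls data/meg-gpt2-oscar-en-10k*
# data/meg-gpt2-oscar-en-10k_text_document.bin
# data/meg-gpt2-oscar-en-10k_text_document.idx

DATA_PATH=data/meg-gpt2-oscar-en-10k_text_document   # prefix only, without the .bin/.idx suffix
```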
## 3. Train
Here is a tiny model training setup configured for 2 GPUs, training on the data we prepared in step 2.
Put it in a script or run it directly.
If you have only 1 GPU, change these 2 lines below to:
```
N_GPUS=1
TP_SIZE=1
```
The script:
```
CHECKPOINT_PATH=checkpoints/gpt2
VOCAB_FILE=data/gpt2-vocab.json
MERGE_FILE=data/gpt2-merges.txt
DATA_PATH=data/meg-gpt2-oscar-en-10k_text_document
TENSORBOARD_PATH=output_dir/tensorboard
N_GPUS=2
MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=16
TP_SIZE=2
PP_SIZE=1
NLAYERS=2
NHIDDEN=8
NHEADS=2
SEQ_LEN=512
VOCAB_SIZE=50257
SAVE_INTERVAL=50
TRAIN_SAMPLES=10_000
GPT_ARGS=" \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--num-attention-heads $NHEADS \
--seq-length $SEQ_LEN \
--max-position-embeddings $SEQ_LEN \
--micro-batch-size $MICRO_BATCH_SIZE \
--rampup-batch-size 2 2 1_000 \
--global-batch-size $GLOBAL_BATCH_SIZE \
--train-samples $TRAIN_SAMPLES \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--adam-eps 1e-8 \
--lr 1e-4 \
--lr-warmup-samples 5 \
--min-lr 1e-6 \
--lr-decay-style cosine \
--lr-decay-samples 12 \
--clip-grad 1.0 \
--weight-decay 1e-1 \
--embed-layernorm \
--fp16 \
--partition-activations \
--seed 42 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
"
OUTPUT_ARGS=" \
--exit-interval 100 \
--log-interval 10 \
--save-interval $SAVE_INTERVAL \
--eval-interval 100 \
--eval-iters 10 \
--checkpoint-activations \
"
DATA_ARGS=" \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--tensorboard-dir $TENSORBOARD_PATH \
--tensorboard-queue-size 5 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--kill-switch-path /tmp/kill-switch \
"
ZERO_STAGE=1
config_json="./ds_config.json"
# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
cat <<EOT > $config_json
{
"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
"train_batch_size": $GLOBAL_BATCH_SIZE,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
EOT
DEEPSPEED_ARGS=" \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
ALL_ARGS="$GPT_ARGS $OUTPUT_ARGS $DATA_ARGS $DEEPSPEED_ARGS"
MASTER_ADDR=localhost
MASTER_PORT=6777
export LAUNCHER="python -u -m torch.distributed.run \
--nproc_per_node $N_GPUS \
--nnodes 1 \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
--rdzv_backend c10d \
--max_restarts 0 \
--tee 3 \
"
export CMD=" \
$LAUNCHER pretrain_gpt.py \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
--distributed-backend nccl \
$ALL_ARGS \
"
echo $CMD
$CMD
```
You can, of course, run this as a slurm script, but here is [a full slurm script example](https://github.com/bigscience-workshop/bigscience/blob/d57b76bb592832bb4d2054cd5cbf132796be2d83/train/tr11-176B-ml/setup-test-n2.slurm), which has some tweaks to get `MASTER_ADDR` and a few other bits right under the SLURM environment on JeanZay, which may or may not be needed if you run it elsewhere.
Remember to wipe out `$CHECKPOINT_PATH` if you change the model shape and a checkpoint with the old shapes has already been saved.
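For example, with the paths used in the script above, that amounts to:
```
rm -rf checkpoints/gpt2
```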
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tasks data utility."""
import re
import numpy as np
def clean_text(text):
"""Remove new lines and multiple spaces and adjust end of sentence dot."""
text = text.replace("\n", " ")
text = re.sub(r'\s+', ' ', text)
for _ in range(3):
text = text.replace(' . ', '. ')
return text
def build_sample(ids, types, paddings, label, unique_id):
"""Convert to numpy and return a sample consumed by the batch producer."""
ids_np = np.array(ids, dtype=np.int64)
types_np = np.array(types, dtype=np.int64)
paddings_np = np.array(paddings, dtype=np.int64)
sample = ({'text': ids_np,
'types': types_np,
'padding_mask': paddings_np,
'label': int(label),
'uid': int(unique_id)})
return sample
def build_tokens_types_paddings_from_text(text_a, text_b,
tokenizer, max_seq_length):
"""Build token types and paddings, trim if needed, and pad if needed."""
text_a_ids = tokenizer.tokenize(text_a)
text_b_ids = None
if text_b is not None:
text_b_ids = tokenizer.tokenize(text_b)
return build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids,
max_seq_length, tokenizer.cls,
tokenizer.sep, tokenizer.pad)
def build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, max_seq_length,
cls_id, sep_id, pad_id):
"""Build token types and paddings, trim if needed, and pad if needed."""
ids = []
types = []
paddings = []
# [CLS].
ids.append(cls_id)
types.append(0)
paddings.append(1)
# A.
len_text_a = len(text_a_ids)
ids.extend(text_a_ids)
types.extend([0] * len_text_a)
paddings.extend([1] * len_text_a)
# [SEP].
ids.append(sep_id)
types.append(0)
paddings.append(1)
# B.
if text_b_ids is not None:
len_text_b = len(text_b_ids)
ids.extend(text_b_ids)
types.extend([1] * len_text_b)
paddings.extend([1] * len_text_b)
# Cap the size.
trimmed = False
if len(ids) >= max_seq_length:
max_seq_length_m1 = max_seq_length - 1
ids = ids[0:max_seq_length_m1]
types = types[0:max_seq_length_m1]
paddings = paddings[0:max_seq_length_m1]
trimmed = True
# [SEP].
if (text_b_ids is not None) or trimmed:
ids.append(sep_id)
if text_b_ids is None:
types.append(0)
else:
types.append(1)
paddings.append(1)
# Padding.
padding_length = max_seq_length - len(ids)
if padding_length > 0:
ids.extend([pad_id] * padding_length)
types.extend([pad_id] * padding_length)
paddings.extend([0] * padding_length)
return ids, types, paddings
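# Worked example (hypothetical ids; max_seq_length=8, cls_id=101, sep_id=102, pad_id=0):
#   build_tokens_types_paddings_from_ids([7, 8], [9], 8, 101, 102, 0)
#   returns ids      = [101, 7, 8, 102, 9, 102, 0, 0]
#           types    = [0, 0, 0, 0, 1, 1, 0, 0]
#           paddings = [1, 1, 1, 1, 1, 1, 0, 0]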
import os
import argparse
import collections
import numpy as np
import torch
def process_files(args):
all_predictions = collections.OrderedDict()
all_labels = collections.OrderedDict()
all_uid = collections.OrderedDict()
for path in args.paths:
path = os.path.join(path, args.prediction_name)
try:
data = torch.load(path)
for dataset in data:
name, d = dataset
predictions, labels, uid = d
if name not in all_predictions:
all_predictions[name] = np.array(predictions)
if args.labels is None:
args.labels = [i for i in range(all_predictions[name].shape[1])]
if args.eval:
all_labels[name] = np.array(labels)
all_uid[name] = np.array(uid)
else:
all_predictions[name] += np.array(predictions)
assert np.allclose(all_uid[name], np.array(uid))
except Exception as e:
print(e)
continue
return all_predictions, all_labels, all_uid
def get_threshold(all_predictions, all_labels, one_threshold=False):
if one_threshold:
all_predictions = {'combined': np.concatenate(list(all_predictions.values()))}
all_labels = {'combined': np.concatenate(list(all_labels.values()))}
out_thresh = []
for dataset in all_predictions:
preds = all_predictions[dataset]
labels = all_labels[dataset]
out_thresh.append(calc_threshold(preds, labels))
return out_thresh
def calc_threshold(p, l):
trials = [(i) * (1. / 100.) for i in range(100)]
best_acc = float('-inf')
best_thresh = 0
for t in trials:
acc = ((apply_threshold(p, t).argmax(-1) == l).astype(float)).mean()
if acc > best_acc:
best_acc = acc
best_thresh = t
return best_thresh
def apply_threshold(preds, t):
assert (np.allclose(preds.sum(-1), np.ones(preds.shape[0])))
prob = preds[:, -1]
thresholded = (prob >= t).astype(int)
preds = np.zeros_like(preds)
preds[np.arange(len(thresholded)), thresholded.reshape(-1)] = 1
return preds
def threshold_predictions(all_predictions, threshold):
if len(threshold) != len(all_predictions):
threshold = threshold + [threshold[-1]] * (len(all_predictions) - len(threshold))
for i, dataset in enumerate(all_predictions):
thresh = threshold[i]
preds = all_predictions[dataset]
all_predictions[dataset] = apply_threshold(preds, thresh)
return all_predictions
def postprocess_predictions(all_predictions, all_labels, args):
for d in all_predictions:
all_predictions[d] = all_predictions[d] / len(args.paths)
if args.calc_threshold:
args.threshold = get_threshold(all_predictions, all_labels, args.one_threshold)
print('threshold', args.threshold)
if args.threshold is not None:
all_predictions = threshold_predictions(all_predictions, args.threshold)
return all_predictions, all_labels
def write_predictions(all_predictions, all_labels, all_uid, args):
all_correct = 0
count = 0
for dataset in all_predictions:
preds = all_predictions[dataset]
preds = np.argmax(preds, -1)
if args.eval:
correct = (preds == all_labels[dataset]).sum()
num = len(all_labels[dataset])
accuracy = correct / num
count += num
all_correct += correct
accuracy = (preds == all_labels[dataset]).mean()
print(accuracy)
if not os.path.exists(os.path.join(args.outdir, dataset)):
os.makedirs(os.path.join(args.outdir, dataset))
outpath = os.path.join(
args.outdir, dataset, os.path.splitext(
args.prediction_name)[0] + '.tsv')
with open(outpath, 'w') as f:
f.write('id\tlabel\n')
f.write('\n'.join(str(uid) + '\t' + str(args.labels[p])
for uid, p in zip(all_uid[dataset], preds.tolist())))
if args.eval:
print(all_correct / count)
def ensemble_predictions(args):
all_predictions, all_labels, all_uid = process_files(args)
all_predictions, all_labels = postprocess_predictions(all_predictions, all_labels, args)
write_predictions(all_predictions, all_labels, all_uid, args)
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--paths', required=True, nargs='+',
help='paths to checkpoint directories used in ensemble')
parser.add_argument('--eval', action='store_true',
help='compute accuracy metrics against labels (dev set)')
parser.add_argument('--outdir',
help='directory to place ensembled predictions in')
parser.add_argument('--prediction-name', default='test_predictions.pt',
help='name of predictions in checkpoint directories')
parser.add_argument('--calc-threshold', action='store_true',
help='calculate threshold classification')
parser.add_argument('--one-threshold', action='store_true',
help='use one threshold for all subdatasets')
parser.add_argument('--threshold', nargs='+', default=None, type=float,
help='user supplied threshold for classification')
parser.add_argument('--labels', nargs='+', default=None,
help='whitespace separated list of label names')
args = parser.parse_args()
ensemble_predictions(args)
if __name__ == '__main__':
main()
# Downloads the specified tasks in the evaluation harness.
# This is particularly useful when running in environments where the GPU nodes
# do not have internet access. This way we can pre-download them and use the cached dataset during evaluation.
from lm_eval import tasks
from lm_eval.tasks import ALL_TASKS
import argparse
import os
parser = argparse.ArgumentParser(description='Download evaluation harness', allow_abbrev=False)
parser.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks to download.')
args = parser.parse_args()
def main():
task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',')
tasks.get_task_dict(task_list)
if __name__ == '__main__':
main()
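# Example usage (the file name is assumed here; adjust it to wherever this script lives):
#   python download_evaluation_harness_tasks.py --task_list all
#   python download_evaluation_harness_tasks.py --task_list lambada,hellaswag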