Commit 8ec5d678 authored by hepj987's avatar hepj987
Browse files

GPT2 base on megatron-deepspeed

parents
#!/bin/bash
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
#mkdir -p $DIR/logs
#mkdir -p /tmp/logs
#DATASET_1="<PATH TO THE FIRST DATASET>"
#DATASET_2="<PATH TO THE SECOND DATASET>"
#DATASET_3="<PATH TO THE THIRD DATASET>"
#DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}"
BASE_DATA_PATH=/data/Megatron-LM/data
DATASET=${BASE_DATA_PATH}/indexed_datasets/megatron
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
#CONFIG_JSON="$script_dir/ds_config.json"
CONFIG_JSON="/tmp/ds_config.json"
USE_DEEPSPEED=1
ZERO_STAGE=0
# Debug
#TP=4
#PP=4
#LAYERS=8
#HIDDEN=512
#SEQ=1024
#GLOBAL_BATCH=128
#WORKER_STR="-i worker-0"
TP=1
PP=1
DP=2
WORLD_SIZE=$((TP*PP*DP))
HIDDEN=1024
LAYERS=24
SEQ=1024
GLOBAL_BATCH=1
WORKER_STR=""
MICRO_BATCH=1
LR=6.0e-4
MIN_LR=6.0e-5
DTYPE="fp16"
EXP_DIR=${HOME}/experiments/results/bf16
LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_fix3"
mkdir -p $LOG_DIR
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
--no-deepspeed)
USE_DEEPSPEED=0;
shift
;;
-z|--zero-stage)
ZERO_STAGE=$2;
shift
;;
*)
echo "Unknown argument(s)"
usage
exit 1
shift
;;
esac
done
options=" \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--num-layers $LAYERS \
--hidden-size $HIDDEN \
--num-attention-heads 32 \
--seq-length $SEQ \
--loss-scale 12 \
--max-position-embeddings $SEQ \
--micro-batch-size $MICRO_BATCH \
--global-batch-size $GLOBAL_BATCH \
--train-iters 1000 \
--lr $LR \
--min-lr $MIN_LR \
--lr-decay-style cosine \
--log-interval 1 \
--eval-iters 40 \
--eval-interval 10 \
--data-path ${DATASET} \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--save-interval 10000 \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.006 \
--${DTYPE} \
--checkpoint-activations \
--exit-interval 10000 \
--tensorboard-dir $LOG_DIR
"
if [[ ${USE_DEEPSPEED} -eq 1 ]]; then
echo "Using DeepSpeed"
options="${options} \
--deepspeed \
--deepspeed_config=${CONFIG_JSON} \
--zero-stage=${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
fi
cat <<EOT > $CONFIG_JSON
{
"train_batch_size" : $GLOBAL_BATCH,
"train_micro_batch_size_per_gpu": $MICRO_BATCH,
"steps_per_print": 1,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"bf16": {
"enabled": false
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 8
},
"wall_clock_breakdown" : true
}
EOT
WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE"
#WORKER_STR="-i worker-0:0,1,2,3"
#run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}"
#run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py $@ ${options}"
run_cmd="deepspeed --master_port 29600 $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}"
echo ${run_cmd}
eval ${run_cmd}
set +x
#!/bin/bash
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
#mkdir -p $DIR/logs
#mkdir -p /tmp/logs
#DATASET_1="<PATH TO THE FIRST DATASET>"
#DATASET_2="<PATH TO THE SECOND DATASET>"
#DATASET_3="<PATH TO THE THIRD DATASET>"
#DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}"
#BASE_DATA_PATH=tests/data/gpt2
#DATASET=${BASE_DATA_PATH}/meg-gpt2-openwebtext_text_document
#VOCAB_PATH=${BASE_DATA_PATH}/gpt2-tiny-vocab.json
#MERGE_PATH=${BASE_DATA_PATH}/gpt2-tiny-merges.txt
BASE_DATA_PATH=/vc_data/Megatron-LM/data
DATASET=${BASE_DATA_PATH}/indexed_datasets/megatron
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
script_path=$(realpath $0)
script_dir=$(dirname $script_path)
CONFIG_JSON="$script_dir/ds_config.json"
#CONFIG_JSON="/tmp/ds_config.json"
USE_DEEPSPEED=1
ZERO_STAGE=0
#TP=4
#PP=4
# Debug
DEBUG_MODE=0
if [[ $DEBUG_MODE == 1 ]]; then
LAYERS=4
HIDDEN=512
SEQ=512
EXIT_INTERVAL=3
else
HIDDEN=1024
LAYERS=24
SEQ=1024
EXIT_INTERVAL=10
fi
TP=2
PP=2
DP=4
WORLD_SIZE=$((TP*PP*DP))
GLOBAL_BATCH=4
MICRO_BATCH=1
TRAIN_ITERS=100000
CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP}
LOAD_CHECKPOINT_PATH=checkpoints/gpt2/tp2_pp2_dp4
LR=6.0e-4
MIN_LR=6.0e-5
DTYPE="bf16"
EXP_DIR=${HOME}/experiments/results/ckpt_reshape
LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_uni"
mkdir -p $LOG_DIR
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
--no-deepspeed)
USE_DEEPSPEED=0;
shift
;;
-z|--zero-stage)
ZERO_STAGE=$2;
shift
;;
*)
echo "Unknown argument(s)"
usage
exit 1
shift
;;
esac
done
options=" \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--num-layers $LAYERS \
--hidden-size $HIDDEN \
--num-attention-heads 32 \
--seq-length $SEQ \
--loss-scale 12 \
--max-position-embeddings $SEQ \
--micro-batch-size $MICRO_BATCH \
--global-batch-size $GLOBAL_BATCH \
--train-iters $TRAIN_ITERS \
--lr $LR \
--min-lr $MIN_LR \
--lr-decay-style cosine \
--log-interval 1 \
--eval-iters 40 \
--eval-interval 10 \
--data-path ${DATASET} \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--save-interval 1000 \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.006 \
--${DTYPE} \
--checkpoint-activations \
--exit-interval ${EXIT_INTERVAL} \
--save ${CHECKPOINT_PATH} \
--load ${LOAD_CHECKPOINT_PATH} \
--universal-checkpoint \
--position-embedding-type alibi \
--override-lr-scheduler \
--embed-layernorm \
--tensorboard-dir $LOG_DIR
"
if [[ ${USE_DEEPSPEED} -eq 1 ]]; then
echo "Using DeepSpeed"
options="${options} \
--deepspeed \
--deepspeed_config=${CONFIG_JSON} \
--zero-stage=${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
fi
cat <<EOT > $CONFIG_JSON
{
"train_batch_size" : $GLOBAL_BATCH,
"train_micro_batch_size_per_gpu": $MICRO_BATCH,
"steps_per_print": 1,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"bf16": {
"enabled": true
},
"fp16": {
"enabled": false,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"wall_clock_breakdown" : true
}
EOT
#WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE"
#WORKER_STR="-i worker-0:0,1,2,3"
#run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}"
#run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py $@ ${options}"
run_cmd="deepspeed --master_port 29700 $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}"
echo ${run_cmd}
eval ${run_cmd}
set +x
# Scripts
This section should be organized with sub-folders for different things.
# Inference scripts for BLOOM
Moved to https://github.com/huggingface/transformers-bloom-inference/tree/main/bloom-inference-scripts
## Inference solutions for BLOOM 176B
Moved to https://github.com/huggingface/transformers-bloom-inference/tree/main/bloom-inference-server
import json
import argparse
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--dir',
type=str,
required=True,
help='directory to save data'
)
args = parser.parse_args()
return args
def main():
args = get_args()
for i in range(10):
row_limit = 1000
rows_to_save = [{'text': ''.join([str(i)+'-*']*128)}]
with open('{}/dataset_{}.json'.format(args.dir, i), 'w') as f:
f.write(
'\n'.join(json.dumps(_i) for _i in rows_to_save*row_limit)
)
if __name__ == '__main__':
main()
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Processing data for pretraining."""
import argparse
import json
import multiprocessing
import os
import sys
from megatron.data.indexed_dataset import best_fitting_dtype
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
os.path.pardir)))
import time
import torch
try:
import nltk
nltk_available = True
except ImportError:
nltk_available = False
from megatron.tokenizer import build_tokenizer
from megatron.data import indexed_dataset
# https://stackoverflow.com/questions/33139531/preserve-empty-lines-with-nltks-punkt-tokenizer
class CustomLanguageVars(nltk.tokenize.punkt.PunktLanguageVars):
_period_context_fmt = r"""
\S* # some word material
%(SentEndChars)s # a potential sentence ending
\s* # <-- THIS is what I changed
(?=(?P<after_tok>
%(NonWord)s # either other punctuation
|
(?P<next_tok>\S+) # <-- Normally you would have \s+ here
))"""
class IdentitySplitter(object):
def tokenize(self, *text):
return text
class Encoder(object):
def __init__(self, args):
self.args = args
def initializer(self):
# Use Encoder class as a container for global data
Encoder.tokenizer = build_tokenizer(self.args)
if self.args.split_sentences:
if not nltk_available:
print("NLTK is not available to split sentences.")
exit()
splitter = nltk.load("tokenizers/punkt/english.pickle")
if self.args.keep_newlines:
# this prevents punkt from eating newlines after sentences
Encoder.splitter = nltk.tokenize.punkt.PunktSentenceTokenizer(
train_text = splitter._params,
lang_vars = CustomLanguageVars())
else:
Encoder.splitter = splitter
else:
Encoder.splitter = IdentitySplitter()
def encode(self, json_line):
data = json.loads(json_line)
ids = {}
for key in self.args.json_keys:
text = data[key]
doc_ids = []
for sentence in Encoder.splitter.tokenize(text):
sentence_ids = Encoder.tokenizer.tokenize(sentence)
if len(sentence_ids) > 0:
doc_ids.append(sentence_ids)
if len(doc_ids) > 0 and self.args.append_eod:
doc_ids[-1].append(Encoder.tokenizer.eod)
ids[key] = doc_ids
return ids, len(json_line)
def get_args():
parser = argparse.ArgumentParser()
group = parser.add_argument_group(title='input data')
group.add_argument('--input', type=str,
help='Path to input JSON')
group.add_argument('--datasets', nargs='+', default=None,
help='Paths to one or more input datasets to merge')
group.add_argument('--json-keys', nargs='+', default=['text'],
help='space separate listed of keys to extract from json')
group.add_argument('--split-sentences', action='store_true',
help='Split documents into sentences.')
group.add_argument('--keep-newlines', action='store_true',
help='Keep newlines between sentences when splitting.')
group = parser.add_argument_group(title='tokenizer')
group.add_argument('--tokenizer-type', type=str, required=True,
choices=['BertWordPieceLowerCase','BertWordPieceCase',
'GPT2BPETokenizer', 'PretrainedFromHF'],
help='What type of tokenizer to use.')
group.add_argument('--vocab-file', type=str, default=None,
help='Path to the vocab file')
group.add_argument('--merge-file', type=str, default=None,
help='Path to the BPE merge file (if necessary).')
group.add_argument('--append-eod', action='store_true',
help='Append an <eod> token to the end of a document.')
group.add_argument("--tokenizer-name-or-path", type=str, default=None,
help="Name or path of the huggingface tokenizer.")
group = parser.add_argument_group(title='output data')
group.add_argument('--output-prefix', type=str, required=True,
help='Path to binary output file without suffix')
group.add_argument('--dataset-impl', type=str, default='mmap',
choices=['lazy', 'cached', 'mmap'])
group = parser.add_argument_group(title='runtime')
group.add_argument('--workers', type=int, default=1,
help='Number of worker processes to launch')
group.add_argument('--log-interval', type=int, default=100,
help='Interval between progress updates')
args = parser.parse_args()
args.keep_empty = False
if args.tokenizer_type.lower().startswith('bert'):
if not args.split_sentences:
print("Bert tokenizer detected, are you sure you don't want to split sentences?")
# some default/dummy values for the tokenizer
args.rank = 0
args.make_vocab_size_divisible_by = 128
args.tensor_model_parallel_size = 1
args.vocab_extra_ids = 0
return args
def main():
args = get_args()
startup_start = time.time()
print("Opening", args.input)
fin = open(args.input, 'r', encoding='utf-8')
if nltk_available and args.split_sentences:
nltk.download("punkt", quiet=True)
encoder = Encoder(args)
tokenizer = build_tokenizer(args)
pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer)
encoded_docs = pool.imap(encoder.encode, fin, 25)
#encoded_docs = map(encoder.encode, fin)
level = "document"
if args.split_sentences:
level = "sentence"
print(f"Vocab size: {tokenizer.vocab_size}")
print(f"Output prefix: {args.output_prefix}")
output_bin_files = {}
output_idx_files = {}
builders = {}
for key in args.json_keys:
output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix,
key, level)
output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix,
key, level)
builders[key] = indexed_dataset.make_builder(output_bin_files[key],
impl=args.dataset_impl,
dtype=best_fitting_dtype(tokenizer.vocab_size))
startup_end = time.time()
proc_start = time.time()
total_bytes_processed = 0
print("Time to startup:", startup_end - startup_start)
for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1):
total_bytes_processed += bytes_processed
for key, sentences in doc.items():
if len(sentences) == 0:
continue
for sentence in sentences:
builders[key].add_item(torch.IntTensor(sentence))
builders[key].end_document()
if i % args.log_interval == 0:
current = time.time()
elapsed = current - proc_start
mbs = total_bytes_processed/elapsed/1024/1024
print(f"Processed {i} documents",
f"({i/elapsed} docs/s, {mbs} MB/s).",
file=sys.stderr)
for key in args.json_keys:
builders[key].finalize(output_idx_files[key])
if __name__ == '__main__':
main()
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pretrain GPT"""
import torch
from functools import partial
from megatron import get_args
from megatron import print_rank_0
from megatron import get_timers
from megatron import get_tokenizer
from megatron import mpu
from megatron.data.gpt_dataset import build_train_valid_test_datasets
from megatron.model import GPTModel, GPTModelPipe
from megatron.training import pretrain
from megatron.utils import get_ltor_masks_and_position_ids
from megatron.utils import average_losses_across_data_parallel_group
import deepspeed
from deepspeed.runtime.utils import see_memory_usage
import os
import subprocess
collected_sample = {}
token_dict = {
15: 0,
16: 1,
17: 2,
18: 3,
19: 4,
20: 5,
21: 6,
22: 7,
23: 8,
24: 9,
}
def model_provider(pre_process=True, post_process=True):
"""Build the model."""
print_rank_0('building GPT model ...')
see_memory_usage(f"Before Building Model", force=True)
args = get_args()
with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(),
remote_device=None if args.remote_device=='none' else args.remote_device,
config=args.deepspeed_config,
enabled=args.zero_stage==3):
if args.deepspeed and mpu.get_pipeline_model_parallel_world_size() > 1:
model = GPTModelPipe(
num_tokentypes=0,
parallel_output=True
)
# This is a hack to give us a reference to get_batch_pipe from within training.py
# We need to call model.set_batch_fn after deepspeed.initialize
model._megatron_batch_fn = get_batch_pipe
# Predompute the attention mask and store it in args. This avoids having to
# pipeline it as an activation during training. The mask is constant, and thus
# we can reuse it.
attention_mask = torch.tril(torch.ones(
(1, args.seq_length, args.seq_length), device=torch.cuda.current_device())).view(
1, 1, args.seq_length, args.seq_length)
# Convert attention mask to binary:
attention_mask = (attention_mask < 0.5)
if args.fp16:
attention_mask = attention_mask.half()
elif args.bf16:
attention_mask = attention_mask.bfloat16()
args.attn_mask = attention_mask
else:
model = GPTModel(
num_tokentypes=0,
parallel_output=True,
pre_process=pre_process,
post_process=post_process
)
see_memory_usage(f"After Building Model", force=True)
return model
def get_batch(data_iterator):
"""Generate a batch"""
args = get_args()
tokenizer = get_tokenizer()
# Items and their type.
keys = ['text']
datatype = torch.int64
# Broadcast data.
if data_iterator is not None:
data = next(data_iterator)
else:
data = None
data_b = mpu.broadcast_data(keys, data, datatype)
# Unpack.
# tokens_ = data_b['text'].long()
tokens_ = data_b[keys[0]].long()
labels = tokens_[:, 1:].contiguous()
tokens = tokens_[:, :-1].contiguous()
# Get the masks and postition ids.
attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
tokens,
tokenizer.eod,
args.reset_position_ids,
args.reset_attention_mask,
args.eod_mask_loss)
return tokens, labels, loss_mask, attention_mask, position_ids
def get_batch_pipe(data):
"""Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`"""
args = get_args()
tokenizer = get_tokenizer()
# Items and their type.
keys = ['text']
datatype = torch.int64
# Broadcast data.
data_b = mpu.broadcast_data(keys, data, datatype)
# Unpack.
tokens_ = data_b['text'].long()
labels = tokens_[:, 1:].contiguous()
tokens = tokens_[:, :-1].contiguous()
# Get the masks and postition ids.
attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
tokens,
tokenizer.eod,
args.reset_position_ids,
args.reset_attention_mask,
args.eod_mask_loss)
return (tokens, position_ids, attention_mask), (labels, loss_mask)
def loss_func(loss_mask, output_tensor):
losses = output_tensor.float()
loss_mask = loss_mask.view(-1).float()
loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
# Reduce loss for logging.
averaged_loss = average_losses_across_data_parallel_group([loss])
return loss, {'lm loss': averaged_loss[0]}
def forward_step(data_iterator, model):
"""Forward step."""
args = get_args()
timers = get_timers()
# Get the batch.
timers('batch-generator').start()
tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
data_iterator)
timers('batch-generator').stop()
for row in tokens.detach().cpu():
token_idx = [i for i in row.tolist() if i in list(token_dict.keys())][0]
num = "dataset_{}".format(token_dict[token_idx])
if num in collected_sample:
collected_sample[num] += 1
else:
collected_sample[num] = 1
print_rank_0(collected_sample)
output_tensor = model(tokens, position_ids, attention_mask,
labels=labels)
return output_tensor, partial(loss_func, loss_mask)
def train_valid_test_datasets_provider(train_val_test_num_samples):
"""Build train, valid, and test datasets."""
args = get_args()
print_rank_0('> building train, validation, and test datasets '
'for GPT ...')
train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
data_prefix=args.data_path,
data_impl=args.data_impl,
splits_string=args.split,
train_valid_test_num_samples=train_val_test_num_samples,
seq_length=args.seq_length,
seed=args.seed,
skip_warmup=(not args.mmap_warmup))
print_rank_0("> finished creating GPT datasets ...")
return train_ds, valid_ds, test_ds
def command_exists(cmd):
result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True)
return result.wait() == 0
def git_ds_info():
from deepspeed.env_report import main as ds_report
ds_report()
# Write out version/git info
git_hash_cmd = "git rev-parse --short HEAD"
git_branch_cmd = "git rev-parse --abbrev-ref HEAD"
if command_exists('git'):
try:
result = subprocess.check_output(git_hash_cmd, shell=True)
git_hash = result.decode('utf-8').strip()
result = subprocess.check_output(git_branch_cmd, shell=True)
git_branch = result.decode('utf-8').strip()
except subprocess.CalledProcessError:
git_hash = "unknown"
git_branch = "unknown"
else:
git_hash = "unknown"
git_branch = "unknown"
print(f'**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****')
if __name__ == "__main__":
git_ds_info()
pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})
#!/bin/bash
BASE_DATA_PATH=/tmp
INPUT_PATH=/tmp
OUTPUT_PATH=/tmp
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json -P ${BASE_DATA_PATH}
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -P ${BASE_DATA_PATH}
python create_dummy_dataset.py --dir ${INPUT_PATH}
python preprocess_data.py \
--input ${INPUT_PATH}/dataset_0.json \
--output-prefix ${OUTPUT_PATH}/dataset-0 \
--vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
--append-eod
python preprocess_data.py \
--input ${INPUT_PATH}/dataset_1.json \
--output-prefix ${OUTPUT_PATH}/dataset-1 \
--vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
--append-eod
python preprocess_data.py \
--input ${INPUT_PATH}/dataset_2.json \
--output-prefix ${OUTPUT_PATH}/dataset-2 \
--vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
--append-eod
python preprocess_data.py \
--input ${INPUT_PATH}/dataset_3.json \
--output-prefix ${OUTPUT_PATH}/dataset-3 \
--vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
--append-eod
python preprocess_data.py \
--input ${INPUT_PATH}/dataset_4.json \
--output-prefix ${OUTPUT_PATH}/dataset-4 \
--vocab ${BASE_DATA_PATH}/gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file ${BASE_DATA_PATH}/gpt2-merges.txt \
--append-eod
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p ${BASE_DATA_PATH}/logs
DATASET_0="${OUTPUT_PATH}/dataset-0_text_document"
DATASET_1="${OUTPUT_PATH}/dataset-1_text_document"
DATASET_2="${OUTPUT_PATH}/dataset-2_text_document"
DATASET_3="${OUTPUT_PATH}/dataset-3_text_document"
DATASET_4="${OUTPUT_PATH}/dataset-4_text_document"
DATASET="0.1 ${DATASET_0} 0.25 ${DATASET_1} 0.2 ${DATASET_2} 0.15 ${DATASET_3} 0.3 ${DATASET_4}"
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
#script_path=$(realpath $0)
#script_dir=$(dirname $script_path)
#CONFIG_JSON="$script_dir/ds_config.json"
USE_DEEPSPEED=1
ZERO_STAGE=0
# Debug
#TP=4
#PP=4
#LAYERS=8
#HIDDEN=512
#SEQ=1024
#GLOBAL_BATCH=128
#WORKER_STR="-i worker-0"
# 52B
TP=4
PP=16
HIDDEN=1024
LAYERS=24
SEQ=128
GLOBAL_BATCH=16
WORKER_STR=""
MICRO_BATCH=8
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
--no-deepspeed)
USE_DEEPSPEED=0;
shift
;;
-z|--zero-stage)
ZERO_STAGE=$2;
shift
;;
*)
echo "Unknown argument(s)"
usage
exit 1
shift
;;
esac
done
options=" \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--num-layers $LAYERS \
--hidden-size $HIDDEN \
--num-attention-heads 32 \
--seq-length $SEQ \
--loss-scale 12 \
--max-position-embeddings $SEQ \
--micro-batch-size $MICRO_BATCH \
--global-batch-size $GLOBAL_BATCH \
--train-iters 1000 \
--lr 6.0e-5 \
--min-lr 6.0e-6 \
--lr-decay-style cosine \
--log-interval 1 \
--eval-iters 40 \
--eval-interval 1000 \
--data-path ${DATASET} \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--save-interval 1000 \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.006 \
--fp16 \
--checkpoint-activations
"
if [[ ${USE_DEEPSPEED} -eq 1 ]]; then
echo "Using DeepSpeed"
options="${options} \
--deepspeed \
--deepspeed_config=${CONFIG_JSON} \
--zero-stage=${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
fi
cat <<EOT > $CONFIG_JSON
{
"train_batch_size" : $GLOBAL_BATCH,
"train_micro_batch_size_per_gpu": $MICRO_BATCH,
"steps_per_print": 1,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"gradient_clipping": 1.0,
"prescale_gradients": true,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"wall_clock_breakdown" : true
}
EOT
# run_cmd="deepspeed $WORKER_STR ${DIR}/test_sampling.py $@ ${options}"
run_cmd="deepspeed $WORKER_STR test_sampling.py $@ ${options}"
echo ${run_cmd}
eval ${run_cmd}
set +x
[isort]
default_section = FIRSTPARTY
ensure_newline_before_comments = True
force_grid_wrap = 0
include_trailing_comma = True
known_first_party = megatron
known_third_party =
apex
codecarbon
datasets
deepspeed
git
nltk
numpy
pytest
tensorboard
torch
tqdm
transformers
line_length = 119
lines_after_imports = 2
multi_line_output = 3
use_parentheses = True
\ No newline at end of file
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Setup for pip package."""
import os
import sys
import setuptools
if sys.version_info < (3,):
raise Exception("Python 2 is not supported by Megatron.")
from megatron.package_info import (
__description__,
__contact_names__,
__url__,
__download_url__,
__keywords__,
__license__,
__package_name__,
__version__,
)
with open("README.md", "r") as fh:
long_description = fh.read()
###############################################################################
# Dependency Loading #
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% #
def req_file(filename):
with open(filename) as f:
content = f.readlines()
return [x.strip() for x in content]
install_requires = req_file("requirements.txt")
setuptools.setup(
name=__package_name__,
# Versions should comply with PEP440. For a discussion on single-sourcing
# the version across setup.py and the project code, see
# https://packaging.python.org/en/latest/single_source_version.html
version=__version__,
description=__description__,
long_description=long_description,
long_description_content_type="text/markdown",
# The project's main homepage.
url=__url__,
author=__contact_names__,
maintainer=__contact_names__,
# The licence under which the project is released
license=__license__,
classifiers=[
'Intended Audience :: Developers',
'Intended Audience :: Science/Research',
'Intended Audience :: Information Technology',
# Indicate what your project relates to
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Software Development :: Libraries :: Python Modules',
# Supported python versions
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
# Additional Setting
'Environment :: Console',
'Natural Language :: English',
'Operating System :: OS Independent',
],
python_requires='>=3.6',
packages=setuptools.find_packages(),
install_requires=install_requires,
# Add in any packaged data.
include_package_data=True,
zip_safe=False,
# PyPI package information.
keywords=__keywords__
)
#!/bin/bash
export NCCL_SOCKET_IFNAME=ib0
export NCCL_IB_HCA=mlx5
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
MODEL_NAME=gpt2-oscar_16B-4tp
DATA_OUTPUT_PATH=./
LOGS_PATH=$DATA_OUTPUT_PATH/logs
CHECKPOINT_PATH=output-module/$MODEL_NAME
DATA_PATH="my-gpt2_text_document"
TENSORBOARD_PATH=output_dir/tensorboard/$MODEL_NAME
CODECARBON_PATH=output_dir/codecarbon/$MODEL_NAME
TP_SIZE=4 # always fixed to the size of a single node
PP_SIZE=4 # NLAYERS must be a multiple of PP_SIZE here
MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=128
NLAYERS=40
NHIDDEN=5760
NHEADS=24
SEQ_LEN=2048
SAVE_INTERVAL=1000
OPTIMIZER_ARGS=" \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--adam-eps 1e-8 \
--lr 6.0e-5 \
--min-lr 6.0e-6 \
--lr-decay-style cosine \
--clip-grad 1.0 \
--weight-decay 1e-1 \
"
GPT_ARGS=" \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--num-attention-heads $NHEADS \
--seq-length $SEQ_LEN \
--max-position-embeddings $SEQ_LEN \
--micro-batch-size $MICRO_BATCH_SIZE \
--global-batch-size $GLOBAL_BATCH_SIZE \
--train-samples 3782590 \
--loss-scale 12 \
--vocab-file gpt2-vocab.json \
--merge-file gpt2-merges.txt \
--clip-grad 1.0 \
--fp16 \
--checkpoint-activations \
--seed 42
$OPTIMIZER_ARGS \
"
OUTPUT_ARGS=" \
--log-interval 1 \
--save-interval $SAVE_INTERVAL \
--eval-interval 1000 \
--eval-iters 40 \
--tensorboard-dir $TENSORBOARD_PATH \
--tensorboard-queue-size 5 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
"
DATA_ARGS=" \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
"
ZERO_STAGE=1
config_json="./${MODEL_NAME}_ds_config.json"
cat <<EOT > $config_json
{
"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
"train_batch_size": $GLOBAL_BATCH_SIZE,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
EOT
DEEPSPEED_ARGS=" \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
export CMD=" \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
$DEEPSPEED_ARGS \
"
APP="python3 -u `pwd`/pretrain_gpt.py \
--rank ${RANK} \
--world_size ${WORLD_SIZE} \
--dist_url tcp://${1}:34566 \
--num-workers 2 \
${CMD} \
"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
# Fast Setup instructions
This quick instructions document contains 3 steps:
1. installing software
2. preparing data
3. running the script
This is useful if you need to ask someone to reproduce problems with `Megatron-Deepspeed`
## 1. Software
Please follow this exact order.
0. Create a new conda env if need be or activate an existing environment.
1. Install `pytorch`. Choose the desired version install instructions [here](https://pytorch.org/get-started/locally/), but for conda it'd be:
```
conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch
```
2. Install system-wide `cuda` if you don't have it already. [NVIDIA instructions](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html). Of course ideally use [the premade packages for your distro](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#package-manager-installation).
Use the same major version as pytorch's cuda build. To check use:
```
python -c 'import torch; print(f"pt={torch.__version__}, cuda={torch.version.cuda}")'
```
The minor versions don't actually have to match, but then you will need to hack `apex` installer to ignore minor version changes, see below.
3. Install `apex`
```
git clone https://github.com/NVIDIA/apex
cd apex
pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check . 2>&1 | tee build.log
cd -
```
If the pytorch and system-wide cuda minor versions mismatch, it's not a problem, you just need to hack `apex`'s build to bypass the check by applying this patch first and then build it.
```
diff --git a/setup.py b/setup.py
index d76e998..f224dae 100644
--- a/setup.py
+++ b/setup.py
@@ -31,6 +31,8 @@ def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
print(raw_output + "from " + cuda_dir + "/bin\n")
if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor):
+ # allow minor diffs
+ if bare_metal_minor != torch_binary_minor: return
raise RuntimeError(
"Cuda extensions are being compiled with a version of Cuda that does "
"not match the version used to compile Pytorch binaries. "
```
4. Checkout and prepare `Megatron-DeepSpeed` and install its requirements
```
git clone https://github.com/bigscience-workshop/Megatron-DeepSpeed
cd Megatron-DeepSpeed
pip install -r requirements.txt
```
## 2. Data
Will work under the `Megatron-DeepSpeed` clone
```
cd Megatron-DeepSpeed
```
Prepare data for preprocessing
```
mkdir -p data
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json -O data/gpt2-vocab.json
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -O data/gpt2-merges.txt
python -c 'from datasets import load_dataset; ds = load_dataset("stas/oscar-en-10k", split="train", keep_in_memory=False); ds.to_json(f"data/oscar-en-10k.jsonl", orient="records", lines=True, force_ascii=False)'
```
Pre-process a small dataset to be used for training
```
python tools/preprocess_data.py \
--input data/oscar-en-10k.jsonl \
--output-prefix data/meg-gpt2-oscar-en-10k \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file data/gpt2-merges.txt \
--vocab data/gpt2-vocab.json \
--append-eod \
--workers 4
```
now you have data/meg-gpt2-oscar-en-10k, vocab and merges files to pass as arguments to training, the next section shows how to use them.
Note that Megatron wants `data/meg-gpt2-oscar-en-10k_text_document` prefix later in `--data-path`
## 3. Train
Here is a tiny model training setup configured over 2 gpus to train on the data we prepared in step 2.
Put it in a script or run it directly.
If you have only 1 gpu, change these 2 lines below to:
```
N_GPUS=1
TP_SIZE=1
```
The script:
```
CHECKPOINT_PATH=checkpoints/gpt2
VOCAB_FILE=data/gpt2-vocab.json
MERGE_FILE=data/gpt2-merges.txt
DATA_PATH=data/meg-gpt2-oscar-en-10k_text_document
TENSORBOARD_PATH=output_dir/tensorboard
N_GPUS=2
MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=16
TP_SIZE=2
PP_SIZE=1
NLAYERS=2
NHIDDEN=8
NHEADS=2
SEQ_LEN=512
VOCAB_SIZE=50257
SAVE_INTERVAL=50
TRAIN_SAMPLES=10_000
GPT_ARGS=" \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--num-attention-heads $NHEADS \
--seq-length $SEQ_LEN \
--max-position-embeddings $SEQ_LEN \
--micro-batch-size $MICRO_BATCH_SIZE \
--rampup-batch-size 2 2 1_000 \
--global-batch-size $GLOBAL_BATCH_SIZE \
--train-samples $TRAIN_SAMPLES \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--adam-eps 1e-8 \
--lr 1e-4 \
--lr-warmup-samples 5 \
--min-lr 1e-6 \
--lr-decay-style cosine \
--lr-decay-samples 12 \
--clip-grad 1.0 \
--weight-decay 1e-1 \
--embed-layernorm \
--fp16 \
--partition-activations \
--seed 42 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
"
OUTPUT_ARGS=" \
--exit-interval 100 \
--log-interval 10 \
--save-interval $SAVE_INTERVAL \
--eval-interval 100 \
--eval-iters 10 \
--checkpoint-activations \
"
DATA_ARGS=" \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--tensorboard-dir $TENSORBOARD_PATH \
--tensorboard-queue-size 5 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--kill-switch-path /tmp/kill-switch \
"
ZERO_STAGE=1
config_json="./ds_config.json"
# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
cat <<EOT > $config_json
{
"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
"train_batch_size": $GLOBAL_BATCH_SIZE,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
EOT
DEEPSPEED_ARGS=" \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
ALL_ARGS="$GPT_ARGS $OUTPUT_ARGS $DATA_ARGS $DEEPSPEED_ARGS"
MASTER_ADDR=localhost
MASTER_PORT=6777
export LAUNCHER="python -u -m torch.distributed.run \
--nproc_per_node $N_GPUS \
--nnodes 1 \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
--rdzv_backend c10d \
--max_restarts 0 \
--tee 3 \
"
export CMD=" \
$LAUNCHER pretrain_gpt.py \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
--distributed-backend nccl \
$ALL_ARGS \
"
echo $CMD
$CMD
```
You can, of course, run this as a slurm script, but here is [a full slurm script example](https://github.com/bigscience-workshop/bigscience/blob/d57b76bb592832bb4d2054cd5cbf132796be2d83/train/tr11-176B-ml/setup-test-n2.slurm), which has some tweaks to get `MASTER_ADDR` and a few other bits right under the SLURM environment on JeanZay, which may or may not be needed if you run it elsewhere.
Remember to wipe out `$CHECKPOINT_PATH`, if you change the model shape and there is a checkpoint with the old shapes saved already.
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tasks data utility."""
import re
import numpy as np
def clean_text(text):
"""Remove new lines and multiple spaces and adjust end of sentence dot."""
text = text.replace("\n", " ")
text = re.sub(r'\s+', ' ', text)
for _ in range(3):
text = text.replace(' . ', '. ')
return text
def build_sample(ids, types, paddings, label, unique_id):
"""Convert to numpy and return a sample consumed by the batch producer."""
ids_np = np.array(ids, dtype=np.int64)
types_np = np.array(types, dtype=np.int64)
paddings_np = np.array(paddings, dtype=np.int64)
sample = ({'text': ids_np,
'types': types_np,
'padding_mask': paddings_np,
'label': int(label),
'uid': int(unique_id)})
return sample
def build_tokens_types_paddings_from_text(text_a, text_b,
tokenizer, max_seq_length):
"""Build token types and paddings, trim if needed, and pad if needed."""
text_a_ids = tokenizer.tokenize(text_a)
text_b_ids = None
if text_b is not None:
text_b_ids = tokenizer.tokenize(text_b)
return build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids,
max_seq_length, tokenizer.cls,
tokenizer.sep, tokenizer.pad)
def build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, max_seq_length,
cls_id, sep_id, pad_id):
"""Build token types and paddings, trim if needed, and pad if needed."""
ids = []
types = []
paddings = []
# [CLS].
ids.append(cls_id)
types.append(0)
paddings.append(1)
# A.
len_text_a = len(text_a_ids)
ids.extend(text_a_ids)
types.extend([0] * len_text_a)
paddings.extend([1] * len_text_a)
# [SEP].
ids.append(sep_id)
types.append(0)
paddings.append(1)
# B.
if text_b_ids is not None:
len_text_b = len(text_b_ids)
ids.extend(text_b_ids)
types.extend([1] * len_text_b)
paddings.extend([1] * len_text_b)
# Cap the size.
trimmed = False
if len(ids) >= max_seq_length:
max_seq_length_m1 = max_seq_length - 1
ids = ids[0:max_seq_length_m1]
types = types[0:max_seq_length_m1]
paddings = paddings[0:max_seq_length_m1]
trimmed = True
# [SEP].
if (text_b_ids is not None) or trimmed:
ids.append(sep_id)
if text_b_ids is None:
types.append(0)
else:
types.append(1)
paddings.append(1)
# Padding.
padding_length = max_seq_length - len(ids)
if padding_length > 0:
ids.extend([pad_id] * padding_length)
types.extend([pad_id] * padding_length)
paddings.extend([0] * padding_length)
return ids, types, paddings
import os
import argparse
import collections
import numpy as np
import torch
def process_files(args):
all_predictions = collections.OrderedDict()
all_labels = collections.OrderedDict()
all_uid = collections.OrderedDict()
for path in args.paths:
path = os.path.join(path, args.prediction_name)
try:
data = torch.load(path)
for dataset in data:
name, d = dataset
predictions, labels, uid = d
if name not in all_predictions:
all_predictions[name] = np.array(predictions)
if args.labels is None:
args.labels = [i for i in range(all_predictions[name].shape[1])]
if args.eval:
all_labels[name] = np.array(labels)
all_uid[name] = np.array(uid)
else:
all_predictions[name] += np.array(predictions)
assert np.allclose(all_uid[name], np.array(uid))
except Exception as e:
print(e)
continue
return all_predictions, all_labels, all_uid
def get_threshold(all_predictions, all_labels, one_threshold=False):
if one_threshold:
all_predictons = {'combined': np.concatenate(list(all_predictions.values()))}
all_labels = {'combined': np.concatenate(list(all_predictions.labels()))}
out_thresh = []
for dataset in all_predictions:
preds = all_predictions[dataset]
labels = all_labels[dataset]
out_thresh.append(calc_threshold(preds, labels))
return out_thresh
def calc_threshold(p, l):
trials = [(i) * (1. / 100.) for i in range(100)]
best_acc = float('-inf')
best_thresh = 0
for t in trials:
acc = ((apply_threshold(p, t).argmax(-1) == l).astype(float)).mean()
if acc > best_acc:
best_acc = acc
best_thresh = t
return best_thresh
def apply_threshold(preds, t):
assert (np.allclose(preds.sum(-1), np.ones(preds.shape[0])))
prob = preds[:, -1]
thresholded = (prob >= t).astype(int)
preds = np.zeros_like(preds)
preds[np.arange(len(thresholded)), thresholded.reshape(-1)] = 1
return preds
def threshold_predictions(all_predictions, threshold):
if len(threshold) != len(all_predictions):
threshold = [threshold[-1]] * (len(all_predictions) - len(threshold))
for i, dataset in enumerate(all_predictions):
thresh = threshold[i]
preds = all_predictions[dataset]
all_predictions[dataset] = apply_threshold(preds, thresh)
return all_predictions
def postprocess_predictions(all_predictions, all_labels, args):
for d in all_predictions:
all_predictions[d] = all_predictions[d] / len(args.paths)
if args.calc_threshold:
args.threshold = get_threshold(all_predictions, all_labels, args.one_threshold)
print('threshold', args.threshold)
if args.threshold is not None:
all_predictions = threshold_predictions(all_predictions, args.threshold)
return all_predictions, all_labels
def write_predictions(all_predictions, all_labels, all_uid, args):
all_correct = 0
count = 0
for dataset in all_predictions:
preds = all_predictions[dataset]
preds = np.argmax(preds, -1)
if args.eval:
correct = (preds == all_labels[dataset]).sum()
num = len(all_labels[dataset])
accuracy = correct / num
count += num
all_correct += correct
accuracy = (preds == all_labels[dataset]).mean()
print(accuracy)
if not os.path.exists(os.path.join(args.outdir, dataset)):
os.makedirs(os.path.join(args.outdir, dataset))
outpath = os.path.join(
args.outdir, dataset, os.path.splitext(
args.prediction_name)[0] + '.tsv')
with open(outpath, 'w') as f:
f.write('id\tlabel\n')
f.write('\n'.join(str(uid) + '\t' + str(args.labels[p])
for uid, p in zip(all_uid[dataset], preds.tolist())))
if args.eval:
print(all_correct / count)
def ensemble_predictions(args):
all_predictions, all_labels, all_uid = process_files(args)
all_predictions, all_labels = postprocess_predictions(all_predictions, all_labels, args)
write_predictions(all_predictions, all_labels, all_uid, args)
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--paths', required=True, nargs='+',
help='paths to checkpoint directories used in ensemble')
parser.add_argument('--eval', action='store_true',
help='compute accuracy metrics against labels (dev set)')
parser.add_argument('--outdir',
help='directory to place ensembled predictions in')
parser.add_argument('--prediction-name', default='test_predictions.pt',
help='name of predictions in checkpoint directories')
parser.add_argument('--calc-threshold', action='store_true',
help='calculate threshold classification')
parser.add_argument('--one-threshold', action='store_true',
help='use on threshold for all subdatasets')
parser.add_argument('--threshold', nargs='+', default=None, type=float,
help='user supplied threshold for classification')
parser.add_argument('--labels', nargs='+', default=None,
help='whitespace separated list of label names')
args = parser.parse_args()
ensemble_predictions(args)
if __name__ == '__main__':
main()
# Downloads the specified taks in the evaluation harness
# This is particularly useful when running in environments where the GPU nodes
# do not have internet access. This way we can pre-download them and use the cached data-set during evaluation.
from lm_eval import tasks
from lm_eval.tasks import ALL_TASKS
import argparse
import os
parser = argparse.ArgumentParser(description='Download evaluation harness', allow_abbrev=False)
parser.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks to download.')
args = parser.parse_args()
def main():
task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',')
tasks.get_task_dict(task_list)
if __name__ == '__main__':
main()
\ No newline at end of file
from functools import reduce
from logging import logMultiprocessing
import os
import sys
import datetime
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
os.path.pardir,os.path.pardir)))
from lm_eval.models.gpt2 import GPT2LM
from lm_eval import evaluator, tasks, utils
from lm_eval.base import CacheHook
from tqdm import tqdm
import torch.nn.functional as F
from lm_eval.tasks import ALL_TASKS
from pretrain_gpt import model_provider
import numpy as np
import torch
from megatron import get_args
from megatron import print_rank_0
from megatron import get_tokenizer
from megatron import mpu
from megatron.training import setup_model_and_optimizer, get_model
from megatron.mpu.mappings import gather_from_tensor_model_parallel_region
from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model
from megatron.p2p_communication import recv_forward, send_forward
import pickle
import json
from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
from megatron.model.distributed import DistributedDataParallel as LocalDDP
from megatron.model.module import Float16Module
from deepspeed.runtime.pipe import schedule
class EvalHarnessAdaptor(GPT2LM):
def __init__(self, model, tokenizer):
args = get_args()
self.args = args
self.model = model
self.tokenizer = tokenizer
self.VOCAB_SIZE = tokenizer.vocab_size
self.EOT_TOKEN_ID = tokenizer.eod
self._max_length = args.seq_length
# For ds we split into mini batches and then micro batches to keep pipelining api happy.
# With Megatron we just go to micro_batches directly
self._batch_size = args.micro_batch_size * args.micro_bs_multiplier
self.cache_hook = CacheHook(None)
self.is_main = args.rank == 0
self.is_local_main = args.local_rank == 0
self._device = torch.cuda.current_device()
self.is_model_parallel = mpu.get_tensor_model_parallel_world_size() > 1
self.is_pipe_parallel = mpu.get_pipeline_model_parallel_world_size() > 1
self.is_data_parallel = mpu.get_data_parallel_world_size() > 1
self.adaptive_seq_len = args.adaptive_seq_len
if self.is_data_parallel:
raise NotImplementedError("Data parallelism is currently not supported for evaluation")
self.is_last_stage = True if not self.is_pipe_parallel else mpu.is_pipeline_last_stage() # only the last stage of the pipeline model will receive the logits
@property
def max_length(self):
return self._max_length
@property
def batch_size(self):
return self._batch_size
@property
def device(self):
return self._device
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in requests:
if context == "":
# end of text as context
context_enc = [self.EOT_TOKEN_ID]
else:
context_enc = self.tokenizer_encode(context)
continuation_enc = self.tokenizer_encode(continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def loglikelihood_rolling(self, requests):
# TODO: Implement caching once we've confirmed the perplexity implementation
# TODO: automatic batch size detection for vectorization
loglikelihoods = []
with torch.no_grad():
for string, in tqdm(requests):
rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows(
token_list=self.tokenizer_encode(string),
prefix_token=self.EOT_TOKEN_ID,
max_seq_len=self.max_length,
context_len=1,
)))
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
# TODO: extract out this call so it only gets called once and also somehow figure out partial caching for that
string_nll = self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True)
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
return loglikelihoods
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
disable_tqdm = disable_tqdm if self.is_main else True
res = []
res_len = 0 # storing the result length for later
self.model.eval()
with torch.no_grad():
def _collate(x):
toks = x[1] + x[2]
return (-len(toks), tuple(toks))
reord = utils.Reorderer(requests, _collate)
for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size):
inps, contlens, inplens, padding_length = [], [], [], None
for _, context_enc, continuation_enc in chunk:
# when too long to fit in context, truncate from the left
inp = torch.tensor(
(context_enc + continuation_enc)[-(self.max_length + 1):][:-1]
, dtype=torch.long).to(self.device)
inplen, = inp.shape
cont = continuation_enc
# since in _collate we make sure length is descending, the longest is always the first one.
padding_length = padding_length if padding_length is not None else inplen
if not self.adaptive_seq_len:
padding_length = self.max_length
# pad to length
inp = torch.cat([
inp, # [seq]
torch.zeros(padding_length - inplen, dtype=torch.long).to(inp.device) # [padding_length - seq]
], dim=0)
inps.append(inp.unsqueeze(0))
contlens.append(cont)
inplens.append(inplen)
logits = self._model_call(torch.cat(inps, dim=0))
res_len += len(chunk)
if logits is not None:
multi_logits = F.log_softmax(logits, dim=-1).cpu() # [batch, seq, vocab]
for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(chunk, multi_logits, inps, inplens, contlens):
contlen = len(cont_toks)
logits = logits[inplen - contlen:inplen].unsqueeze(0) # [1, seq, vocab]
greedy_tokens = logits.argmax(dim=-1)
# cont_toks :: [1, seq]
cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0)
max_equal = (greedy_tokens == cont_toks).all()
# last_token_slice = logits[:, -1, :].squeeze(0).tolist()
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [1, seq]
answer = (float(logits.sum()), bool(max_equal))
# partial caching
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
res.append(answer)
if not mpu.is_pipeline_last_stage():
# @HACK: To make the eval harness happy on threads that don't have access to the results.
# We just randomly generate some data.
res = [(np.random.rand(), np.random.rand()>0.5) for _ in requests]
return reord.get_original(res)
def create_model_inputs(self, tokens):
args = get_args()
attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
tokens,
self.EOT_TOKEN_ID,
args.reset_position_ids,
args.reset_attention_mask,
args.eod_mask_loss,
prefix_indices=None,
loss_on_targets_only=False)
return (tokens, position_ids, attention_mask), (tokens, loss_mask)
def _model_call(self, inps):
args = get_args()
if args.deepspeed:
self.model.set_batch_fn(self.create_model_inputs)
# round up to multiple of micro_batch_size
new_size = ((len(inps) + args.micro_batch_size-1) // args.micro_batch_size) * args.micro_batch_size
padded = F.pad(inps, (0, 0, 0, new_size-len(inps)), value = 0)
# dummy data iterator for pipelining.
data_iterator = list((torch.stack(inp) for inp in utils.chunks(padded, args.micro_batch_size)))
self.model.micro_batches = len(data_iterator)
if self.adaptive_seq_len:
# Allow different shapes than the default seq_len to be communicated across pipes
# Without this Deepspeed will hang when trying to receive activations
self.model.reset_activation_shape()
output = self.model.eval_batch(iter(data_iterator), compute_loss = False, reduce_output = None)
if output is not None:
output = torch.cat(output, 0)[:len(inps)]
else:
output = None
# hack #2 for adaptive_seq_len to work as total_loss gets appended to and shapes aren't the same
if args.adaptive_seq_len:
self.model.total_loss = None
else:
# Since the shape of the micro-batch will change
# We need set the correct shapes here
# So that latter pipeline stages knows which shapes to expect.
# Otherwise we will deadlock.
args.micro_batch_size = len(inps)
args.seq_length = len(inps[0])
args.max_position_embeddings = args.seq_length
input_tensor = recv_forward()
# Forward pass through the model.
unwrapped_model = unwrap_model(self.model, (torchDDP, LocalDDP, Float16Module))
unwrapped_model.set_input_tensor(input_tensor)
output = self.model(*self.create_model_inputs(inps)[0])
send_forward(output)
if mpu.is_pipeline_last_stage():
return gather_from_tensor_model_parallel_region(output)[..., :self.tokenizer.vocab_size]
else:
return None
def tokenizer_encode(self, text):
"""Tokenize text *without* adding special tokens."""
# Splitting this into its own method in case we need to handle special cases for different tokenizers
from megatron.tokenizer.gpt2_tokenization import GPT2Tokenizer
if isinstance(self.tokenizer.tokenizer, GPT2Tokenizer):
return self.tokenizer.tokenizer.encode(text)
else:
return self.tokenizer.tokenizer.encode(text, add_special_tokens=False)
from megatron.initialize import initialize_megatron
import megatron
from tools.convert_checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint
from tools.convert_checkpoint.deepspeed_to_megatron import _create_rank_checkpoint
def override_args(args, override_args, skip_keys, skip_if_specified_keys):
for k, v in vars(override_args).items():
if k in skip_keys:
continue
if k in skip_if_specified_keys and getattr(args, k) is not None:
continue
setattr(args, k, v)
# Note(Hesslow):
# The model loading is a bit convoluted.
# We want to parse out the model arguments from the checkpoint and use those to initialize megatron-ds.
#
# However megatron-ds expects its arguments on the command line.
# And at that point we don't know them.
#
# Instead we use Jasons way: we load the arguments form the checkpoint and then override _parse_args to return whatever args we want.
#
# If the checkpoint is old, some new arguments may have been introduced and the code will expect these arguments to exist.
# In order to support this we _first_ parse the arguments normally, and then override them with the arguments from the checkpoint.
# Keeping the default-value of newer arguments.
#
# We then use the megatron deepspeed converter to load the deepspeed checkpoints as if they we're megatron checkpoints.
def load_ds_checkpoint_and_setup_megatron(args):
_print_args = megatron.arguments._print_args
megatron.arguments._print_args = lambda *_args, **kwarg: None
if not os.path.exists(args.load):
raise ValueError(f"checkpoint path {args.load} doesn't exit")
ds_checkpoint = DeepSpeedCheckpoint(args.load,
tp_degree=args.tensor_model_parallel_size,
pp_degree=args.pipeline_model_parallel_size)
cp_args = ds_checkpoint.get_args()
# Merge the current args with the checkpoint args.
skip_keys = [
'abort_on_unmet_fused_kernel_constraints',
'batch_size',
'data_parallel_size',
'deepspeed',
'deepspeed_config',
'device_count',
'global_batch_size',
'inference',
'iteration',
'load',
'local_rank',
'micro_batch_size',
'pipeline_model_parallel_size',
'rampup_batch_size',
'rank',
'tensor_model_parallel_size',
'tensorboard_dir',
'world_size',
]
skip_if_specified = ['merge_file', 'vocab_file']
if args.eval_fp32:
cp_args.fp16 = False
cp_args.bf16 = False
cp_args.params_dtype = torch.float32
override_args(args, cp_args, skip_keys, skip_if_specified)
# stop megatron from reparsing the arguments.
megatron.global_vars._parse_args = lambda *_args, **kwarg: args
megatron.global_vars._GLOBAL_ARGS = args
initialize_megatron()
torch.distributed.barrier()
# Initializing megatron will update eg. tokenizer size. Override again.
override_args(args, cp_args, skip_keys, skip_if_specified)
# print final arguments.
_print_args(args)
if args.deepspeed:
# Hack #3:
# Loading pipelined models in deepspeed with different TP than it was originally trained on fails
# due to a sanity check, that makes sure that all state_dicts that we merge contains attention layers.
# This, however, is not true for pipelining when we will merge the state_dict for the embeddings which
# which does not contain these attention-specific keys.
#
# Deepspeed does however manage to load the model if we just turn off this sanity check.
import deepspeed
deepspeed.runtime.state_dict_factory.MegatronSDLoader.sanity_check = lambda self, ckpt_file_name: None
cp_path = args.load
args.load = None
model, _, _ = setup_model_and_optimizer(model_provider)
model = model[0]
zero_enabled = model._config.zero_enabled
model._config.zero_enabled = False
_, _ = model.load_checkpoint(cp_path, tag = '.', load_optimizer_states=False, load_lr_scheduler_states=False, load_module_only=True)
model._config.zero_enabled = zero_enabled
else:
model = get_model(model_provider)[0]
# Initialize megatron model using the parsed state dict.
sd = _create_rank_checkpoint(ds_checkpoint, None, mpu.get_tensor_model_parallel_rank(), mpu.get_pipeline_model_parallel_rank(), True)
model.load_state_dict(sd['model'], strict=True)
if args.eval_fp32:
model = model.float()
torch.distributed.barrier()
return model
def tasks_args(parser):
results_path_default = f"results-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.json"
"""Provide extra arguments required for tasks."""
group = parser.add_argument_group(title='Evaluation options')
group.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks.')
group.add_argument('--results_path', type=str, default = results_path_default, help='Path to where the results will be stored.')
group.add_argument('--adaptive_seq_len', default = False, action='store_true',
help='Should the sequence length be adapted to the batch during evaluation, if in fp16 the results will be slightly different due to numerical errors but greatly speed up evaluation.')
group.add_argument('--eval_fp32', default = False, action='store_true', help='Should the evaluation run in fp32')
group.add_argument('--intermed_results', default = False, action='store_true', help='Whether to print & write intermediate results for each task')
group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr estimation')
group.add_argument('--micro_bs_multiplier', type=int, default=1, help='Increase the global batch size to remove bubble when pipeline parallel')
return parser
from megatron.global_vars import _parse_args
def main():
# parse the megatron args. But wait with initalizing megatron.
# avoid printing the arguments, since they will later be overridden.
args = _parse_args(tasks_args)
load_path = args.load
model = load_ds_checkpoint_and_setup_megatron(args)
args = get_args()
if args.deepspeed and args.adaptive_seq_len:
# adaptive_seq_len hack #1:
# CL automatically enables reset_activation_shape() which allows us to change input shapes
# and it also reshapes the attenion scores in attention_mask_func
args.curriculum_learning = 1
task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',')
task_dict = tasks.get_task_dict(task_list)
model.module.activation_checkpoint_interval = 0
model._compute_loss = False
model.fwd_outputs = []
tokenizer = get_tokenizer()
adaptor = EvalHarnessAdaptor(model, tokenizer)
if args.intermed_results:
global_results = {"results": {}, "versions": {}}
timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
iteration_id = load_path.split("/")[-1].replace("/", "")
results_path = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}.json")
# Backup file in case of interruption during writing
results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_backup.json")
for task_name, task in task_dict.items():
results = evaluator.evaluate(adaptor, {task_name: task}, False, 0, None, bootstrap_iters=args.bootstrap_iters)
global_results["results"] = {**global_results["results"], **results["results"]}
global_results["versions"] = {**global_results["versions"], **results["versions"]}
if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0:
print(json.dumps(results, indent=2))
with open(results_path, 'w') as outfile:
json.dump(global_results, outfile, indent=4)
with open(results_path_backup, 'w') as outfile:
json.dump(global_results, outfile, indent=4)
else:
global_results = evaluator.evaluate(adaptor, task_dict, False, 0, None, bootstrap_iters=args.bootstrap_iters)
if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0:
print(json.dumps(global_results, indent=2))
with open(args.results_path, 'w') as outfile:
json.dump(global_results, outfile, indent=4)
if __name__ == '__main__':
main()
#!/usr/bin/env python
# this script converts results.json:
#
# "results": {
# "arc_challenge": {
# "acc": 0.24232081911262798,
# "acc_stderr": 0.01252159329580012,
# "acc_norm": 0.2764505119453925,
# "acc_norm_stderr": 0.013069662474252425
# },
#
# into a format expected by a spreadsheet, which is:
#
# task metric value err
# arc_challenge acc xxx yyy
# arc_challenge acc_norm xxx yyy
# arc_challenge f1 xxx yyy
#
# usage:
# report-to-csv.py results.json
import sys
import json
import io
import csv
results_file = sys.argv[1]
csv_file = results_file.replace("json", "csv")
print(f"Converting {results_file} to {csv_file}")
with io.open(results_file, 'r', encoding='utf-8') as f:
results = json.load(f)
with io.open(csv_file, 'w', encoding='utf-8') as f:
writer = csv.writer(f)
writer.writerow(["task", "metric", "value", "err", "version"])
versions = results["versions"]
for k,v in sorted(results["results"].items()):
if k not in versions:
versions[k] = -1
if "acc" in v:
writer.writerow([k, "acc", v["acc"], v["acc_stderr"], versions[k]])
if "acc_norm" in v:
writer.writerow([k, "acc_norm", v["acc_norm"], v["acc_norm_stderr"], versions[k]])
if "f1" in v:
writer.writerow([k, "f1", v["f1"], v["f1_stderr"] if "f1_stderr" in v else "", versions[k]])
# if "ppl" in v:
# writer.writerow([k, "ppl", v["ppl"], v["ppl_stderr"], versions[k]])
# if "em" in v:
# writer.writerow([k, "em", v["em"], v["em_stderr"] if "em_stderr" in v else "", versions[k]])
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation utilities."""
import os
import time
from functools import partial
import torch
from megatron import get_args
from megatron import print_rank_last, is_last_rank
from megatron import mpu
from megatron.schedules import get_forward_backward_func
from tasks.finetune_utils import build_data_loader
from tasks.finetune_utils import process_batch
def accuracy_func_provider(single_dataset_provider):
"""Provide function that calculates accuracies."""
args = get_args()
# Build dataloaders.
datapaths = args.valid_data
dataloaders = []
for datapath in datapaths:
dataset = single_dataset_provider(datapath)
dataloader = build_data_loader(
dataset, args.orig_micro_batch_size, num_workers=args.num_workers,
drop_last=(mpu.get_data_parallel_world_size() > 1))
dataloaders.append((dataset.dataset_name, dataloader))
def metrics_func(model, epoch, output_predictions=False):
print_rank_last('calculating metrics ...')
correct = 0
total = 0
if output_predictions:
assert mpu.get_data_parallel_world_size() == 1
named_predictions = []
names = 'predictions'
for name, dataloader in dataloaders:
output = calculate_correct_answers(name, model, dataloader,
epoch, output_predictions)
if not output_predictions:
correct_ans, total_count = output
else:
correct_ans, total_count, predictions = output
named_predictions.append((name, predictions))
names += '_' + name
correct += correct_ans
total += total_count
if is_last_rank():
percent = float(correct) * 100.0 / float(total)
print(' >> |epoch: {}| overall: correct / total = {} / {} = '
'{:.4f} %'.format(epoch, correct, total, percent))
if output_predictions and is_last_rank():
assert args.load is not None
filename = os.path.join(args.load, names + '.pt')
torch.save(named_predictions, filename)
return metrics_func
def calculate_correct_answers(name, model, dataloader,
epoch, output_predictions):
"""Calculate correct over total answers and return prediction if the
`output_predictions` is true."""
args = get_args()
forward_backward_func = get_forward_backward_func()
start_time = time.time()
for m in model:
m.eval()
saved_micro_batch_size = args.micro_batch_size
saved_global_batch_size = args.global_batch_size
ds = dataloader.dataset
if hasattr(ds, 'sample_multiplier'):
# If our dataset as a sample_multiplier attribute that means
# each "sample" from the dataset actually has multiple samples
# that will collapse into the batch dimension (for example in
# the RACE dataset that has several options), we need to
# account for that when setting the micro batch size.
sample_multiplier = ds.sample_multiplier
else:
sample_multiplier = 1
micro_batch_size_times_data_parallel = args.orig_micro_batch_size * args.data_parallel_size
num_micro_batches = args.orig_global_batch_size // micro_batch_size_times_data_parallel
def loss_func(output_predictions, labels, output_tensor):
logits = output_tensor
loss_dict = {}
# Add output predictions.
if output_predictions:
assert False
loss_dict['softmaxes'] = torch.nn.Softmax(dim=-1)(
logits.float()).data.cpu().numpy().tolist()
loss_dict['labels'] = labels.data.cpu().numpy().tolist()
loss_dict['ids'] = batch['uid'].cpu().numpy().tolist()
# Compute the correct answers.
predicted = torch.argmax(logits, dim=-1)
corrects = (predicted == labels)
# Add to the counters.
loss_dict['total'] = labels.size(0)
loss_dict['correct'] = corrects.sum().item()
return 0, loss_dict
# defined inside to capture output_predictions
def correct_answers_forward_step(batch, model):
try:
batch_ = next(batch)
except BaseException:
batch_ = batch
tokens, types, labels, attention_mask = process_batch(batch_)
# Forward model.
args = get_args()
output_tensor = model(tokens, attention_mask, tokentype_ids=types)
return output_tensor, partial(loss_func, output_predictions, labels)
with torch.no_grad():
# For all the batches in the dataset.
total = 0
correct = 0
if output_predictions:
# This option is only possible when data parallel size is 1.
assert mpu.get_data_parallel_world_size() == 1
softmaxes = []
labels = []
ids = []
for _, batch in enumerate(dataloader):
# For evaluation only mode we use drop_last = False to get all the
# samples, which means we might not have a full batch, so we
# adjust batch_size here to actual batch size of data
actual_batch_size = len(batch['label'])
# ... applying sample_multiplier if necessary
args.micro_batch_size = actual_batch_size * sample_multiplier
args.global_batch_size = actual_batch_size * sample_multiplier * num_micro_batches
loss_dicts = forward_backward_func(correct_answers_forward_step, batch, model,
optimizer=None, timers=None, forward_only=True)
for loss_dict in loss_dicts:
if output_predictions:
softmaxes.extend(loss_dict['softmaxes'])
labels.extend(loss_dict['labels'])
ids.extend(loss_dict['ids'])
total += loss_dict['total']
correct += loss_dict['correct']
for m in model:
m.train()
args.micro_batch_size = saved_micro_batch_size
args.global_batch_size = saved_global_batch_size
# Reduce.
if mpu.is_pipeline_last_stage():
unreduced = torch.cuda.LongTensor([correct, total])
torch.distributed.all_reduce(unreduced,
group=mpu.get_data_parallel_group())
# Print on screen.
correct_ans = unreduced[0].item()
total_count = unreduced[1].item()
percent = float(correct_ans) * 100.0 / float(total_count)
elapsed_time = time.time() - start_time
print_rank_last(' > |epoch: {}| metrics for {}: correct / total '
'= {} / {} = {:.4f} %, elapsed time (sec): {:.3f}'.format(
epoch, name, correct_ans, total_count,
percent, elapsed_time))
if output_predictions:
return correct_ans, total_count, (softmaxes, labels, ids)
return correct_ans, total_count
if output_predictions:
return 0, 0, ()
return 0, 0
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetune utilities."""
from functools import partial
import torch
from megatron import get_args
from megatron import print_rank_0
from megatron import get_timers
from megatron import mpu
from megatron.checkpointing import load_checkpoint
from megatron.checkpointing import save_checkpoint
from megatron.training import evaluate_and_print_results
from megatron.training import setup_model_and_optimizer
from megatron.training import train_step
from megatron.training import training_log
from megatron.utils import average_losses_across_data_parallel_group
from megatron.utils import calc_params_l2_norm
from megatron.utils import check_adlr_autoresume_termination
def process_batch(batch):
"""Process batch and produce inputs for the model."""
args = get_args()
tokens = batch['text'].long().cuda().contiguous()
types = batch['types'].long().cuda().contiguous()
labels = batch['label'].long().cuda().contiguous()
attention_mask = batch['padding_mask'].float().cuda().contiguous()
if args.fp16:
attention_mask = attention_mask.half()
return tokens, types, labels, attention_mask
def cross_entropy_loss_func(labels, output_tensor):
logits = output_tensor
# Cross-entropy loss.
loss_func = torch.nn.CrossEntropyLoss()
loss = loss_func(logits.contiguous().float(), labels)
# Reduce loss for logging.
averaged_loss = average_losses_across_data_parallel_group([loss])
return loss, {'lm loss': averaged_loss[0]}
def _cross_entropy_forward_step(batch, model):
"""Simple forward step with cross-entropy loss."""
timers = get_timers()
# Get the batch.
timers('batch-generator').start()
try:
batch_ = next(batch)
except BaseException:
batch_ = batch
tokens, types, labels, attention_mask = process_batch(batch_)
timers('batch-generator').stop()
# Forward model.
output_tensor = model(tokens, attention_mask, tokentype_ids=types)
return output_tensor, partial(cross_entropy_loss_func, labels)
def build_data_loader(dataset, micro_batch_size, num_workers, drop_last):
"""Data loader. Note that batch-size is the local (per GPU) batch-size."""
# Sampler.
world_size = mpu.get_data_parallel_world_size()
rank = mpu.get_data_parallel_rank()
sampler = torch.utils.data.distributed.DistributedSampler(
dataset, num_replicas=world_size, rank=rank)
# Data loader. Note that batch size is the per GPU batch size.
data_loader = torch.utils.data.DataLoader(dataset,
batch_size=micro_batch_size,
sampler=sampler,
shuffle=False,
num_workers=num_workers,
drop_last=drop_last,
pin_memory=True)
return data_loader
def _build_infinite_size_dataloader(dataloader):
"""Build a looped dataloader with infinite size."""
iterator = dataloader.__iter__()
while True:
try:
yield iterator.__next__()
except StopIteration:
iterator = dataloader.__iter__()
def _build_train_valid_dataloaders(train_dataset, valid_dataset):
"""Traing and validation dataloaders."""
args = get_args()
print_rank_0('building train and validation dataloaders ...')
# Training dataset.
train_dataloader = build_data_loader(train_dataset, args.micro_batch_size,
args.num_workers, not args.keep_last)
# Set the training iterations.
args.train_iters_per_epoch = len(train_dataloader)
args.train_iters = args.epochs * args.train_iters_per_epoch
# Validation dataset. For this dataset, we do not need to set up
# shuffling so we can just use a simple infinite loop.
valid_dataloader_ = build_data_loader(valid_dataset, args.micro_batch_size,
args.num_workers, not args.keep_last)
valid_dataloader = _build_infinite_size_dataloader(valid_dataloader_)
# Now that we've built the data loaders, set batch_size arguments
# to the actual batch size the model will see for this dataset.
# This is necessary so pipeline transfers know what size they are
# and the LR schedule, which is based on samples seen, gets set
# correctly.
args.orig_micro_batch_size = args.micro_batch_size
args.orig_global_batch_size = args.global_batch_size
if hasattr(train_dataset, 'sample_multiplier'):
# If our dataset as a sample_multiplier attribute that means
# each "sample" from the dataset actually has multiple samples
# that will collapse into the batch dimension (for example in
# the RACE dataset that has several options), we need to
# account for that when setting the micro batch size.
args.micro_batch_size *= train_dataset.sample_multiplier
args.global_batch_size *= train_dataset.sample_multiplier
return train_dataloader, valid_dataloader
def _train(model, optimizer, lr_scheduler, forward_step,
train_dataloader, valid_dataloader, end_of_epoch_callback):
"""Train the model."""
args = get_args()
timers = get_timers()
# Turn on training mode which enables dropout.
for m in model:
m.train()
# Tracking loss.
losses_dict_sum = {}
# Starting epoch and iteration
start_epoch = args.iteration // args.train_iters_per_epoch
start_iteration = args.iteration % args.train_iters_per_epoch
iteration = args.iteration
# Memory reporting flag.
report_memory_flag = True
# For each remaining epoch
timers('interval-time').start()
for epoch in range(start_epoch, args.epochs):
print_rank_0('working on epoch {} ...'.format(epoch + 1))
# Set the data loader epoch to shuffle the index iterator.
train_dataloader.sampler.set_epoch(args.seed + epoch)
# For all the batches in the dataset.
for iteration_, batch in enumerate(train_dataloader):
# Ignore the iterations before starting value
if iteration_ < start_iteration:
continue
# Set to zero so the next epoch does not skip any batches.
start_iteration = 0
# Train for one step.
out = train_step(forward_step, batch, model, optimizer, lr_scheduler)
losses_dict, skipped_iter, grad_norm, num_zeros_in_grad = out
iteration += 1
# Logging.
params_norm = None
if args.log_params_norm:
params_norm = calc_params_l2_norm(model)
report_memory_flag = training_log(losses_dict, losses_dict_sum,
optimizer.param_groups[0]['lr'],
iteration,
optimizer.get_loss_scale().item(),
report_memory_flag, skipped_iter,
grad_norm, params_norm, num_zeros_in_grad)
# Autoresume
if args.adlr_autoresume and \
(iteration % args.adlr_autoresume_interval == 0):
check_adlr_autoresume_termination(iteration, model,
optimizer, lr_scheduler)
# Checkpointing
if args.save and args.save_interval and \
iteration % args.save_interval == 0:
save_checkpoint(iteration, model, optimizer, lr_scheduler)
# Evaluation
if args.eval_interval and iteration % args.eval_interval == 0:
prefix = 'iteration {}'.format(iteration)
evaluate_and_print_results(prefix, forward_step,
valid_dataloader, model,
iteration, False)
# Checkpointing at the end of each epoch.
if args.save:
save_checkpoint(iteration, model, optimizer, lr_scheduler)
# Callback at the end of each epoch.
if end_of_epoch_callback is not None:
end_of_epoch_callback(model, epoch)
def finetune(train_valid_datasets_provider, model_provider,
forward_step=_cross_entropy_forward_step,
end_of_epoch_callback_provider=None):
"""Main finetune function used across all tasks."""
args = get_args()
timers = get_timers()
assert args.rampup_batch_size is None, \
'batch size scaling is not supported for finetuning'
# Train and validation data loaders.
timers('train/valid/test dataset/dataloder').start()
if args.epochs > 0:
train_dataset, valid_dataset = train_valid_datasets_provider()
train_dataloader, valid_dataloader = _build_train_valid_dataloaders(
train_dataset, valid_dataset)
else:
args.train_iters = 0
timers('train/valid/test dataset/dataloder').stop()
# Build calback function.
timers('callback function').start()
end_of_epoch_callback = None
if end_of_epoch_callback_provider is not None:
end_of_epoch_callback = end_of_epoch_callback_provider()
timers('callback function').stop()
# Build model, optimizer and learning rate scheduler.
timers('model and optimizer').start()
model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider)
timers('model and optimizer').stop()
# If pretrained checkpoint is provided and we have not trained for
# any iteration (i.e., iteration is zero), then load the pretrained
# checkpoint.
timers('pretrained checkpoint').start()
if args.iteration == 0 and args.pretrained_checkpoint is not None:
original_load = args.load
args.load = args.pretrained_checkpoint
_ = load_checkpoint(model, None, None)
args.load = original_load
# This is critical when only model is loaded. We should make sure
# main parameters are also updated.
optimizer.reload_model_params()
timers('pretrained checkpoint').stop()
# Print setup timing.
print_rank_0('done with setups ...')
timers.log(['train/valid/test dataset/dataloder', 'callback function',
'model and optimizer', 'pretrained checkpoint'])
print_rank_0('training ...')
# Finetune the model.
if args.epochs > 0:
_train(model, optimizer, lr_scheduler, forward_step,
train_dataloader, valid_dataloader, end_of_epoch_callback)
# Or just evaluate.
else:
if end_of_epoch_callback is not None:
print_rank_0('evaluation only mode, setting epoch to -1')
end_of_epoch_callback(model, epoch=-1, output_predictions=True)
print_rank_0('done :-)')
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment