Commit 22c0e300 authored by Jared Casper's avatar Jared Casper
Browse files

Merge branch 'staging' into 'master'

April 2020 Release

See merge request ADLR/megatron-lm!69
parents 70174ae3 acf8780b
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. The following applies to all files unless otherwise noted:
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
# #
# Redistribution and use in source and binary forms, with or without # Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions # modification, are permitted provided that the following conditions
...@@ -24,8 +26,15 @@ ...@@ -24,8 +26,15 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--
This repository also contains code from Hugging Face Inc., Google Research,
and Facebook (from their Fairseq project). Files from these
organizations have notices at the top of each file. Below are licenses
used in those files, as indicated.
------------- LICENSE FOR huggingface(transformer) repository -------------- ------------- LICENSE FOR huggingface and Google Research code --------------
Apache License Apache License
...@@ -229,3 +238,27 @@ ...@@ -229,3 +238,27 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
------------- LICENSE FOR Facebook Fairseq code --------------
MIT License
Copyright (c) Facebook, Inc. and its affiliates.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
This diff is collapsed.
import re
def ptb_detokenizer(string):
string = string.replace(" '", "'")
string = string.replace(" \n", "\n")
string = string.replace("\n ", "\n")
string = string.replace(" n't", "n't")
string = string.replace(" N ","1 ")
string = string.replace("$ 1", "$1")
string = string.replace("# 1", "#1")
return string
def wikitext_detokenizer(string):
#contractions
string = string.replace("s '", "s'")
string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
# number separators
string = string.replace(" @-@ ", "-")
string = string.replace(" @,@ ", ",")
string = string.replace(" @.@ ", ".")
#punctuation
string = string.replace(" : ", ": ")
string = string.replace(" ; ", "; ")
string = string.replace(" . ", ". ")
string = string.replace(" ! ", "! ")
string = string.replace(" ? ", "? ")
string = string.replace(" , ", ", ")
# double brackets
string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
# miscellaneous
string = string.replace("= = = =", "====")
string = string.replace("= = =", "===")
string = string.replace("= =", "==")
string = string.replace(" "+chr(176)+" ", chr(176))
string = string.replace(" \n", "\n")
string = string.replace("\n ", "\n")
string = string.replace(" N ", " 1 ")
string = string.replace(" 's", "'s")
return string
def lambada_detokenizer(string):
return string
def get_detokenizer(path):
for key in DETOKENIZERS.keys():
if key in path:
print(key)
return DETOKENIZERS[key]
DETOKENIZERS = {
'ptb': ptb_detokenizer,
'wikitext': wikitext_detokenizer,
'lambada': lambada_detokenizer,
}
# ===========
# base images
# ===========
FROM nvcr.io/nvidia/pytorch:19.09-py3
# ===============
# system packages
# ===============
RUN apt-get update && apt-get install -y \
bash-completion \
emacs \
git \
graphviz \
htop \
libopenexr-dev \
rsync \
wget \
&& rm -rf /var/lib/apt/lists/*
# ============
# pip packages
# ============
RUN pip install --upgrade pip && \
pip install --upgrade setuptools
COPY requirements.txt /tmp/
RUN pip install --upgrade --ignore-installed -r /tmp/requirements.txt
boto3
google-cloud-language
inflect
nltk
numpy
pandas
requests
sentencepiece
tensorflow
tqdm
This diff is collapsed.
#!/bin/bash
WORLD_SIZE=8
DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
TASK="LAMBADA"
VALID_DATA=<lambada path>
VOCAB_FILE=gpt2-vocab.json
MERGE_FILE=gpt2-merges.txt
CHECKPOINT=checkpoints/gpt2_345m
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
--task $TASK \
--valid-data $VALID_DATA \
--tokenizer-type GPT2BPETokenizer \
--strict-lambada
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--load $CHECKPOINT \
--model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 8 \
--checkpoint-activations \
--seq-length 1024 \
--max-position-embeddings 1024 \
--log-interval 10 \
--fp16 \
--no-load-optim \
--no-load-rng
#!/bin/bash
WORLD_SIZE=8
DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
TRAIN_DATA="data/glue_data/MNLI/train.tsv"
VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \
data/glue_data/MNLI/dev_mismatched.tsv"
PRETRAINED_CHECKPOINT=checkpoints/bert_345m
VOCAB_FILE=bert-vocab.txt
CHECKPOINT_PATH=checkpoints/bert_345m_mnli
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
--task MNLI \
--seed 1234 \
--train-data $TRAIN_DATA \
--valid-data $VALID_DATA \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file $VOCAB_FILE \
--epochs 5 \
--pretrained-checkpoint $PRETRAINED_CHECKPOINT \
--model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 8 \
--checkpoint-activations \
--lr 5.0e-5 \
--lr-decay-style linear \
--warmup 0.065 \
--seq-length 512 \
--max-position-embeddings 512 \
--save-interval 500000 \
--save $CHECKPOINT_PATH \
--log-interval 10 \
--eval-interval 100 \
--eval-iters 50 \
--weight-decay 1.0e-1 \
--fp16
#!/bin/bash
WORLD_SIZE=8
DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
TRAIN_DATA="data/RACE/train/middle"
VALID_DATA="data/RACE/dev/middle \
data/RACE/dev/high"
VOCAB_FILE=bert-vocab.txt
PRETRAINED_CHECKPOINT=checkpoints/bert_345m
CHECKPOINT_PATH=checkpoints/bert_345m_race
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
--task RACE \
--seed 1234 \
--train-data $TRAIN_DATA \
--valid-data $VALID_DATA \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file $VOCAB_FILE \
--epochs 3 \
--pretrained-checkpoint $PRETRAINED_CHECKPOINT \
--model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 4 \
--checkpoint-activations \
--lr 1.0e-5 \
--lr-decay-style linear \
--warmup 0.06 \
--seq-length 512 \
--max-position-embeddings 512 \
--save-interval 100000 \
--save $CHECKPOINT_PATH \
--log-interval 10 \
--eval-interval 100 \
--eval-iters 50 \
--weight-decay 1.0e-1 \
--clip-grad 1.0 \
--hidden-dropout 0.1 \
--attention-dropout 0.1 \
--fp16
#!/bin/bash
CHECKPOINT_PATH=checkpoints/gpt2_345m
VOCAB_FILE=gpt2-vocab.json
MERGE_FILE=gpt2-merges.txt
python tools/generate_samples_gpt2.py \
--model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--load $CHECKPOINT_PATH \
--num-attention-heads 16 \
--max-position-embeddings 1024 \
--tokenizer-type GPT2BPETokenizer \
--fp16 \
--batch-size 2 \
--seq-length 1024 \
--out-seq-length 1024 \
--temperature 1.0 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--genfile unconditional_samples.json \
--num-samples 2 \
--top_p 0.9 \
--recompute
#!/bin/bash
MODEL_PARALLEL_SIZE=2
VOCAB_FILE=bert-vocab.txt
CHECKPOINT_PATH=checkpoints/bert_345m
WORLD_SIZE=$MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \
--model-type BERT \
--model-parallel-size $MODEL_PARALLEL_SIZE \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file $VOCAB_FILE \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 512 \
--max-position-embeddings 512 \
--load $CHECKPOINT_PATH
...@@ -2,6 +2,8 @@ ...@@ -2,6 +2,8 @@
RANK=0 RANK=0
WORLD_SIZE=1 WORLD_SIZE=1
DATA_PATH=<Specify path and file prefix>_text_sentence
CHECKPOINT_PATH=<Specify path>
python pretrain_bert.py \ python pretrain_bert.py \
--num-layers 24 \ --num-layers 24 \
...@@ -9,26 +11,25 @@ python pretrain_bert.py \ ...@@ -9,26 +11,25 @@ python pretrain_bert.py \
--num-attention-heads 16 \ --num-attention-heads 16 \
--batch-size 4 \ --batch-size 4 \
--seq-length 512 \ --seq-length 512 \
--max-preds-per-seq 80 \
--max-position-embeddings 512 \ --max-position-embeddings 512 \
--train-iters 1000000 \ --train-iters 2000000 \
--save checkpoints/bert_345m \ --save $CHECKPOINT_PATH \
--load checkpoints/bert_345m \ --load $CHECKPOINT_PATH \
--resume-dataloader \ --data-path $DATA_PATH \
--train-data wikipedia \ --vocab-file bert-vocab.txt \
--lazy-loader \ --data-impl mmap \
--tokenizer-type BertWordPieceTokenizer \
--tokenizer-model-type bert-large-uncased \
--presplit-sentences \
--cache-dir cache \
--split 949,50,1 \ --split 949,50,1 \
--distributed-backend nccl \ --distributed-backend nccl \
--lr 0.0001 \ --lr 0.0001 \
--min-lr 0.00001 \
--lr-decay-style linear \ --lr-decay-style linear \
--lr-decay-iters 990000 \ --lr-decay-iters 990000 \
--weight-decay 1e-2 \ --weight-decay 1e-2 \
--clip-grad 1.0 \ --clip-grad 1.0 \
--warmup .01 \ --warmup .01 \
--fp16 \ --log-interval 100 \
--fp32-layernorm \ --save-interval 10000 \
--fp32-embedding --eval-interval 1000 \
--eval-iters 10 \
--fp16
...@@ -8,36 +8,37 @@ NNODES=1 ...@@ -8,36 +8,37 @@ NNODES=1
NODE_RANK=0 NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=<Specify path and file prefix>_text_sentence
CHECKPOINT_PATH=<Specify path>
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_bert.py \ pretrain_bert.py \
--model-parallel-size 1 \
--num-layers 24 \ --num-layers 24 \
--hidden-size 1024 \ --hidden-size 1024 \
--num-attention-heads 16 \ --num-attention-heads 16 \
--batch-size 4 \ --batch-size 4 \
--seq-length 512 \ --seq-length 512 \
--max-preds-per-seq 80 \
--max-position-embeddings 512 \ --max-position-embeddings 512 \
--train-iters 1000000 \ --train-iters 1000000 \
--save checkpoints/bert_345m \ --save $CHECKPOINT_PATH \
--load checkpoints/bert_345m \ --load $CHECKPOINT_PATH \
--resume-dataloader \ --data-path $DATA_PATH \
--train-data wikipedia \ --vocab-file bert-vocab.txt \
--lazy-loader \ --data-impl mmap \
--tokenizer-type BertWordPieceTokenizer \
--tokenizer-model-type bert-large-uncased \
--presplit-sentences \
--cache-dir cache \
--split 949,50,1 \ --split 949,50,1 \
--distributed-backend nccl \ --distributed-backend nccl \
--lr 0.0001 \ --lr 0.0001 \
--lr-decay-style linear \ --lr-decay-style linear \
--min-lr 1.0e-5 \
--lr-decay-iters 990000 \ --lr-decay-iters 990000 \
--weight-decay 1e-2 \ --weight-decay 1e-2 \
--clip-grad 1.0 \ --clip-grad 1.0 \
--warmup .01 \ --warmup .01 \
--fp16 \ --log-interval 100 \
--fp32-layernorm \ --save-interval 10000 \
--fp32-embedding --eval-interval 1000 \
--eval-iters 10 \
--fp16
...@@ -5,6 +5,10 @@ ...@@ -5,6 +5,10 @@
RANK=0 RANK=0
WORLD_SIZE=1 WORLD_SIZE=1
DATA_PATH=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>
python pretrain_gpt2.py \ python pretrain_gpt2.py \
--num-layers 24 \ --num-layers 24 \
--hidden-size 1024 \ --hidden-size 1024 \
...@@ -12,22 +16,27 @@ python pretrain_gpt2.py \ ...@@ -12,22 +16,27 @@ python pretrain_gpt2.py \
--batch-size 8 \ --batch-size 8 \
--seq-length 1024 \ --seq-length 1024 \
--max-position-embeddings 1024 \ --max-position-embeddings 1024 \
--train-iters 320000 \ --train-iters 500000 \
--save checkpoints/gpt2_345m \ --lr-decay-iters 320000 \
--load checkpoints/gpt2_345m \ --save $CHECKPOINT_PATH \
--resume-dataloader \ --load $CHECKPOINT_PATH \
--train-data wikipedia \ --data-path $DATA_PATH \
--lazy-loader \ --vocab-file gpt2-vocab.json \
--tokenizer-type GPT2BPETokenizer \ --merge-file gpt2-merges.txt \
--cache-dir cache \ --data-impl mmap \
--split 949,50,1 \ --split 949,50,1 \
--distributed-backend nccl \ --distributed-backend nccl \
--lr 0.00015 \ --lr 0.00015 \
--min-lr 1.0e-5 \
--lr-decay-style cosine \ --lr-decay-style cosine \
--weight-decay 1e-2 \ --weight-decay 1e-2 \
--clip-grad 1.0 \ --clip-grad 1.0 \
--warmup .01 \ --warmup .01 \
--checkpoint-activations \ --checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16 --fp16
......
...@@ -10,33 +10,43 @@ NNODES=1 ...@@ -10,33 +10,43 @@ NNODES=1
NODE_RANK=0 NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt2.py \ pretrain_gpt2.py \
--model-parallel-size 1 \
--num-layers 24 \ --num-layers 24 \
--hidden-size 1024 \ --hidden-size 1024 \
--num-attention-heads 16 \ --num-attention-heads 16 \
--batch-size 8 \ --batch-size 8 \
--seq-length 1024 \ --seq-length 1024 \
--max-position-embeddings 1024 \ --max-position-embeddings 1024 \
--train-iters 320000 \ --train-iters 500000 \
--save checkpoints/gpt2_345m \ --lr-decay-iters 320000 \
--load checkpoints/gpt2_345m \ --save $CHECKPOINT_PATH \
--resume-dataloader \ --load $CHECKPOINT_PATH \
--train-data wikipedia \ --data-path $DATA_PATH \
--lazy-loader \ --vocab-file gpt2-vocab.json \
--tokenizer-type GPT2BPETokenizer \ --merge-file gpt2-merges.txt \
--cache-dir cache \ --data-impl mmap \
--split 949,50,1 \ --split 949,50,1 \
--distributed-backend nccl \ --distributed-backend nccl \
--lr 0.00015 \ --lr 0.00015 \
--lr-decay-style cosine \ --lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \ --weight-decay 1e-2 \
--clip-grad 1.0 \ --clip-grad 1.0 \
--warmup .01 \ --warmup .01 \
--checkpoint-activations \ --checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16 --fp16
set +x set +x
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import numpy as np
import torch
from torch.multiprocessing import Lock
from torch.utils.data import Dataset
from megatron import mpu
from megatron.data_utils.samplers import DistributedBatchSampler
from megatron.data_utils.tokenization_gpt2 import GPT2Tokenizer
def make_gpt2_dataloaders(args):
# Input parameters.
input_data_sizes_file = args.input_data_sizes_file
seq_length = args.seq_length
initial_seed = args.seed
# Data parallel arguments.
world_size = mpu.get_data_parallel_world_size()
rank = mpu.get_data_parallel_rank()
global_batch_size = args.batch_size * world_size
num_workers = args.num_workers
def make_data_loader_(data_path):
# Build the dataset.
dataset = GPT2Dataset(data_path, input_data_sizes_file,
seq_length, initial_seed)
# Use a simple sampler with distributed batch sampler.
sampler = torch.utils.data.SequentialSampler(dataset)
batch_sampler = DistributedBatchSampler(sampler=sampler,
batch_size=global_batch_size,
drop_last=True,
rank=rank,
world_size=world_size)
# Torch dataloader.
return torch.utils.data.DataLoader(dataset,
batch_sampler=batch_sampler,
num_workers=num_workers,
pin_memory=True)
train = make_data_loader_(args.train_data)
valid = make_data_loader_(args.valid_data)
test = make_data_loader_(args.test_data)
args.do_train = False
args.do_valid = False
args.do_test = False
if train is not None:
args.do_train = True
if valid is not None:
args.do_valid = True
if test is not None:
args.do_test = True
# Tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=args.cache_dir)
eod_token = tokenizer.encoder['<|endoftext|>']
num_tokens = eod_token + 1
return (train, valid, test), num_tokens, eod_token
class GPT2Dataset(Dataset):
def __init__(self, data_path, sizes_filename, seq_length,
initial_seed, max_epochs=100):
# Input parameters.
self.data_path = data_path
self.sizes_filename = sizes_filename
self.seq_length = seq_length
self.initial_seed = initial_seed
self.max_epochs = max_epochs
# Lock for building the dataset.
self.lock = Lock()
# Shard stuff.
# Dictionary from shard nameto its size (number of element).
self.master_shard_size_dict = None
# Dictionary from shard name to modified size so it is
# divisible by self.seq_length.
self.shard_size_dict = None
# Long array (self.max_epochs * num-shards) populated
# randomly with shard names.
self.shards_name = None
# Start index of the data for a shard.
self.shards_start_index = None
self.build_shard_mappings_()
self.data_length = self.shards_start_index[-1]
# Data.
self.shards_data = [None]*self.shards_name.size
self.shards_sample_index = [None]*self.shards_name.size
def __len__(self):
return self.data_length
def __getitem__(self, idx):
# Find which shard we need.
shard_index = np.searchsorted(self.shards_start_index,
idx, side='right') - 1
# data index in the shard.
data_idx = idx - self.shards_start_index[shard_index]
# Load the shard if it is not in memory.
#self.lock.acquire()
if self.shards_data[shard_index] is None:
print('global rank {} is building data for shard index {} ...'.
format(torch.distributed.get_rank(), shard_index))
self.build_dataset_(shard_index)
#assert self.shards_data[shard_index] is not None
#self.lock.release()
# Start index.
start_index = self.shards_sample_index[shard_index][data_idx]
# Add one for label shift.
end_index = start_index + self.seq_length + 1
data = self.shards_data[shard_index][start_index:end_index]
return {'text': np.array(data, dtype=np.int64)}
def build_dataset_(self, shard_index):
# Garbage collect so we don't use a lot of memory.
# Leave the last one in case other threads have not catche up yet.
#for i in range(shard_index - 1):
for i in range(shard_index):
self.shards_data[i] = None
self.shards_sample_index[i] = None
# Read the shard.
filename = os.path.join(self.data_path, self.shards_name[shard_index])
print('loading {}'.format(filename))
data = np.load(filename, allow_pickle=True)
# Shuffle the data
rng = np.random.RandomState(self.initial_seed + shard_index)
rng.shuffle(data)
# Flatten.
data = np.hstack(data)
size = (data.shape[0] - 1) // self.seq_length
last_index = size * self.seq_length + 1
data = data[0:last_index]
self.shards_data[shard_index] = data
indices = np.arange(size) * self.seq_length
rng.shuffle(indices)
self.shards_sample_index[shard_index] = indices
def build_shard_mappings_(self):
# Load the sizes file.
sizes_filename = os.path.join(self.data_path, self.sizes_filename)
if torch.distributed.get_rank() == 0:
print(' > loading sizes from {}'.format(sizes_filename))
with open(sizes_filename, 'r') as f:
self.master_shard_size_dict = json.load(f)
if torch.distributed.get_rank() == 0:
print(' found {} shards'.format(len(self.master_shard_size_dict)))
# Adjust sizes to be a multiple of seq_length.
self.shard_size_dict = self.master_shard_size_dict.copy()
total_samples = 0
for shard in self.shard_size_dict:
size = self.shard_size_dict[shard]
size = ((size - 1) // self.seq_length) * self.seq_length
total_samples += size // self.seq_length
self.shard_size_dict[shard] = size
if torch.distributed.get_rank() == 0:
print(' found {} samples in the dataset'.format(total_samples))
# Build a list of shards.
shards_ = np.sort(np.array(list(self.shard_size_dict.keys())))
rng = np.random.RandomState(self.initial_seed)
self.shards_name = np.copy(shards_)
rng.shuffle(self.shards_name)
for i in range(1, self.max_epochs):
shards_c = np.copy(shards_)
rng.shuffle(shards_c)
self.shards_name = np.append(self.shards_name, shards_c)
# Build the global indexing.
self.shards_start_index = np.zeros(self.shards_name.size, dtype=np.int)
self.shards_start_index[0] = 0
for i in range(1, self.shards_name.size):
shard = str(self.shards_name[i-1])
size = self.shard_size_dict[shard]
self.shards_start_index[i] = self.shards_start_index[i-1] + \
size // self.seq_length
'''
if __name__ == '__main__':
print('gpt2 data loader ...')
path = '/raid/mshoeybi/data/gpt2/adlr/reddit_all_ftfy_lg200/npys'
dataset = GPT2Dataset(path, 'sizes.txt', 1024, 1234, 100)
print('dataset contains {} samples'.format(dataset.data_length))
for i in range(len(dataset)):
if i % 512000 == 0:
print(i)
data = dataset[i]
'''
# coding=utf-8 # coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -13,24 +13,19 @@ ...@@ -13,24 +13,19 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import sys import torch
sys.path.append('..')
from megatron.data_utils.tokenization_gpt2 import GPT2Tokenizer from .global_vars import get_args
from .global_vars import get_tokenizer
from .global_vars import get_tensorboard_writer
from .global_vars import get_adlr_autoresume
from .global_vars import get_timers
class Tokenizer: def print_rank_0(message):
"""If distributed is initialized print only on rank 0."""
def __init__(self, cache_dir=None): if torch.distributed.is_initialized():
self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2', if torch.distributed.get_rank() == 0:
cache_dir=cache_dir) print(message, flush=True)
self.tokenizer.max_len = int(1e12) else:
self.eod_token = self.tokenizer.encoder['<|endoftext|>'] print(message, flush=True)
assert self.eod_token < 65535, 'vocab size will not fit in uint16'
print('> GPT2 tokenizer with {} vocab size and eod token {} ...'.format(
len(self.tokenizer.encoder), self.eod_token))
def tokenize_document(self, document):
tokens = self.tokenizer.encode(document)
tokens.append(self.eod_token)
return tokens
This diff is collapsed.
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Input/output checkpointing."""
import os
import random
import sys
import numpy as np
import torch
from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
from megatron import mpu
from megatron import get_args
from megatron import print_rank_0
def check_checkpoint_args(checkpoint_args):
"""Ensure fixed arguments for a model are the same for the input
arguments and the one retreived frm checkpoint."""
args = get_args()
def _compare(arg_name):
checkpoint_value = getattr(checkpoint_args, arg_name)
args_value = getattr(args, arg_name)
error_message = '{} value from checkpoint ({}) is not equal to the ' \
'input argument value ({}).'.format(
arg_name, checkpoint_value, args_value)
assert checkpoint_value == args_value, error_message
_compare('num_layers')
_compare('hidden_size')
_compare('num_attention_heads')
_compare('max_position_embeddings')
_compare('make_vocab_size_divisible_by')
_compare('padded_vocab_size')
_compare('tokenizer_type')
_compare('model_parallel_size')
def ensure_directory_exists(filename):
"""Build filename's path if it does not already exists."""
dirname = os.path.dirname(filename)
if not os.path.exists(dirname):
os.makedirs(dirname)
def get_checkpoint_name(checkpoints_path, iteration,
release=False, mp_rank=None):
"""A unified checkpoint name."""
if release:
directory = 'release'
else:
directory = 'iter_{:07d}'.format(iteration)
return os.path.join(checkpoints_path, directory,
'mp_rank_{:02d}'.format(
mpu.get_model_parallel_rank() if mp_rank is None
else mp_rank),
'model_optim_rng.pt')
def get_checkpoint_tracker_filename(checkpoints_path):
"""Tracker file rescords the latest chckpoint during
training to restart from."""
return os.path.join(checkpoints_path, 'latest_checkpointed_iteration.txt')
def save_checkpoint(iteration, model, optimizer, lr_scheduler):
"""Save a model checkpoint."""
args = get_args()
# Only rank zero of the data parallel writes to the disk.
if isinstance(model, torchDDP):
model = model.module
if mpu.get_data_parallel_rank() == 0:
# Arguments, iteration, and model.
state_dict = {}
state_dict['args'] = args
state_dict['iteration'] = iteration
state_dict['model'] = model.state_dict_for_save_checkpoint()
# Optimizer stuff.
if not args.no_save_optim:
if optimizer is not None:
state_dict['optimizer'] = optimizer.state_dict()
if lr_scheduler is not None:
state_dict['lr_scheduler'] = lr_scheduler.state_dict()
# RNG states.
if not args.no_save_rng:
state_dict['random_rng_state'] = random.getstate()
state_dict['np_rng_state'] = np.random.get_state()
state_dict['torch_rng_state'] = torch.get_rng_state()
state_dict['cuda_rng_state'] = torch.cuda.get_rng_state()
state_dict['rng_tracker_states'] \
= mpu.get_cuda_rng_tracker().get_states()
# Save.
checkpoint_name = get_checkpoint_name(args.save, iteration)
print('global rank {} is saving checkpoint at iteration {:7d} to {}'.
format(torch.distributed.get_rank(), iteration, checkpoint_name))
ensure_directory_exists(checkpoint_name)
torch.save(state_dict, checkpoint_name)
print(' successfully saved {}'.format(checkpoint_name))
# Wait so everyone is done (necessary)
torch.distributed.barrier()
# And update the latest iteration
if torch.distributed.get_rank() == 0:
tracker_filename = get_checkpoint_tracker_filename(args.save)
with open(tracker_filename, 'w') as f:
f.write(str(iteration))
# Wait so everyone is done (not necessary)
torch.distributed.barrier()
def load_checkpoint(model, optimizer, lr_scheduler):
"""Load a model checkpoint and return the iteration."""
args = get_args()
if isinstance(model, torchDDP):
model = model.module
# Read the tracker file and set the iteration.
tracker_filename = get_checkpoint_tracker_filename(args.load)
# If no tracker file, return iretation zero.
if not os.path.isfile(tracker_filename):
print_rank_0('WARNING: could not find the metadata file {} '.format(
tracker_filename))
print_rank_0(' will not load any checkpoints and will start from '
'random')
return 0
# Otherwise, read the tracker file and either set the iteration or
# mark it as a release checkpoint.
iteration = 0
release = False
with open(tracker_filename, 'r') as f:
metastring = f.read().strip()
try:
iteration = int(metastring)
except ValueError:
release = metastring == 'release'
if not release:
print_rank_0('ERROR: Invalid metadata file {}. Exiting'.format(
tracker_filename))
sys.exit()
assert iteration > 0 or release, 'error parsing metadata file {}'.format(
tracker_filename)
# Checkpoint.
checkpoint_name = get_checkpoint_name(args.load, iteration, release)
if mpu.get_data_parallel_rank() == 0:
print('global rank {} is loading checkpoint {}'.format(
torch.distributed.get_rank(), checkpoint_name))
# Load the checkpoint.
try:
state_dict = torch.load(checkpoint_name, map_location='cpu')
except ModuleNotFoundError:
# For backward compatibility.
print_rank_0(' > deserializing using the old code structure ...')
sys.modules['fp16.loss_scaler'] = sys.modules[
'megatron.fp16.loss_scaler']
state_dict = torch.load(checkpoint_name, map_location='cpu')
sys.modules.pop('fp16.loss_scaler', None)
except BaseException:
print_rank_0('could not load the checkpoint')
sys.exit()
# Set iteration.
if args.finetune or release:
iteration = 0
else:
try:
iteration = state_dict['iteration']
except KeyError:
try: # Backward compatible with older checkpoints
iteration = state_dict['total_iters']
except KeyError:
print_rank_0('A metadata file exists but unable to load '
'iteration from checkpoint {}, exiting'.format(
checkpoint_name))
sys.exit()
# Check arguments.
if 'args' in state_dict:
checkpoint_args = state_dict['args']
check_checkpoint_args(checkpoint_args)
else:
print_rank_0('could not find arguments in the checkpoint ...')
# Model.
model.load_state_dict(state_dict['model'])
# Optimizer.
if not release and not args.finetune and not args.no_load_optim:
try:
if optimizer is not None:
optimizer.load_state_dict(state_dict['optimizer'])
if lr_scheduler is not None:
lr_scheduler.load_state_dict(state_dict['lr_scheduler'])
except KeyError:
print_rank_0('Unable to load optimizer from checkpoint {}. '
'Specify --no-load-optim or --finetune to prevent '
'attempting to load the optimizer state, '
'exiting ...'.format(checkpoint_name))
sys.exit()
# rng states.
if not release and not args.finetune and not args.no_load_rng:
try:
random.setstate(state_dict['random_rng_state'])
np.random.set_state(state_dict['np_rng_state'])
torch.set_rng_state(state_dict['torch_rng_state'])
torch.cuda.set_rng_state(state_dict['cuda_rng_state'])
mpu.get_cuda_rng_tracker().set_states(
state_dict['rng_tracker_states'])
except KeyError:
print_rank_0('Unable to load optimizer from checkpoint {}. '
'Specify --no-load-rng or --finetune to prevent '
'attempting to load the optimizer state, '
'exiting ...'.format(checkpoint_name))
sys.exit()
torch.distributed.barrier()
if mpu.get_data_parallel_rank() == 0:
print(' successfully loaded {}'.format(checkpoint_name))
return iteration
from . import indexed_dataset from . import indexed_dataset
from .bert_tokenization import FullTokenizer as FullBertTokenizer
from .albert_dataset import AlbertDataset
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment