Merge branch 'staging' into 'master'

April 2020 Release See merge request ADLR/megatron-lm!69

Merge branch 'staging' into 'master'
April 2020 Release See merge request ADLR/megatron-lm!69
22c0e300 · Jared Casper · 70174ae3 · acf8780b · 22c0e300 · 22c0e300
Commit 22c0e300 authored Apr 16, 2020 by Jared Casper
20 changed files
--- a/LICENSE
+++ b/LICENSE
-# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+The following applies to all files unless otherwise noted:
+
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -24,8 +26,15 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+--
+
+This repository also contains code from Hugging Face Inc., Google Research,
+and Facebook (from their Fairseq project). Files from these
+organizations have notices at the top of each file. Below are licenses
+used in those files, as indicated.
+

-------------- LICENSE FOR huggingface(transformer) repository --------------
+------------- LICENSE FOR huggingface and Google Research code  --------------


                                 Apache License
@@ -229,3 +238,27 @@
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
+
+------------- LICENSE FOR Facebook Fairseq code --------------
+
+MIT License
+
+Copyright (c) Facebook, Inc. and its affiliates.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/README.md
+++ b/README.md
--- a/detokenizer.py
+++ b/detokenizer.py
-import re
-
-def ptb_detokenizer(string):
-	string = string.replace(" '", "'")
-	string = string.replace(" \n", "\n")
-	string = string.replace("\n ", "\n")
-	string = string.replace(" n't", "n't")
-	string = string.replace(" N ","1 ")
-	string = string.replace("$ 1", "$1")
-	string = string.replace("# 1", "#1")
-	return string
-
-
-def wikitext_detokenizer(string):
-	#contractions
-	string = string.replace("s '", "s'")
-	string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
-	# number separators
-	string = string.replace(" @-@ ", "-")
-	string = string.replace(" @,@ ", ",")
-	string = string.replace(" @.@ ", ".")
-	#punctuation
-	string = string.replace(" : ", ": ")
-	string = string.replace(" ; ", "; ")
-	string = string.replace(" . ", ". ")
-	string = string.replace(" ! ", "! ")
-	string = string.replace(" ? ", "? ")
-	string = string.replace(" , ", ", ")
-	# double brackets
-	string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
-	string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
-	string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
-	string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
-	string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
-	# miscellaneous
-	string = string.replace("= = = =", "====")
-	string = string.replace("= = =", "===")
-	string = string.replace("= =", "==")
-	string = string.replace(" "+chr(176)+" ", chr(176))
-	string = string.replace(" \n", "\n")
-	string = string.replace("\n ", "\n")
-	string = string.replace(" N ", " 1 ")
-	string = string.replace(" 's", "'s")
-
-	return string
-
-def lambada_detokenizer(string):
-	return string
-
-def get_detokenizer(path):
-	for key in DETOKENIZERS.keys():
-		if key in path:
-			print(key)
-			return DETOKENIZERS[key]
-
-DETOKENIZERS = {
-	'ptb': ptb_detokenizer,
-	'wikitext': wikitext_detokenizer,
-	'lambada': lambada_detokenizer,
-}
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
-# ===========
-# base images
-# ===========
-FROM nvcr.io/nvidia/pytorch:19.09-py3
-
-
-# ===============
-# system packages
-# ===============
-RUN apt-get update && apt-get install -y \
-    bash-completion \
-    emacs \
-    git \
-    graphviz \
-    htop \
-    libopenexr-dev \
-    rsync \
-    wget \
-&& rm -rf /var/lib/apt/lists/*
-
-
-# ============
-# pip packages
-# ============
-RUN pip install --upgrade pip && \
-    pip install --upgrade setuptools
-COPY requirements.txt /tmp/
-RUN pip install --upgrade --ignore-installed -r /tmp/requirements.txt
-
--- a/docker/requirements.txt
+++ b/docker/requirements.txt
-boto3
-google-cloud-language
-inflect
-nltk
-numpy
-pandas
-requests
-sentencepiece
-tensorflow
-tqdm
--- a/evaluate_gpt2.py
+++ b/evaluate_gpt2.py
--- a/examples/evaluate_zeroshot_gpt2.sh
+++ b/examples/evaluate_zeroshot_gpt2.sh
+#!/bin/bash
+
+# Zero-shot LAMBADA evaluation of a GPT-2 checkpoint, launched through
+# torch.distributed.launch with WORLD_SIZE processes on a single node.
+
+WORLD_SIZE=8
+
+DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+
+TASK="LAMBADA"
+
+# Placeholder: replace with the path to the LAMBADA data before running.
+VALID_DATA=<lambada path>
+VOCAB_FILE=gpt2-vocab.json
+MERGE_FILE=gpt2-merges.txt
+CHECKPOINT=checkpoints/gpt2_345m
+
+
+# Every option line except the last must end with a backslash; otherwise the
+# shell ends the command there and the remaining options are never passed.
+python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
+               --task $TASK \
+               --valid-data $VALID_DATA \
+               --tokenizer-type GPT2BPETokenizer \
+               --strict-lambada \
+               --vocab-file $VOCAB_FILE \
+               --merge-file $MERGE_FILE \
+               --load $CHECKPOINT \
+               --model-parallel-size 1 \
+               --num-layers 24 \
+               --hidden-size 1024 \
+               --num-attention-heads 16 \
+               --batch-size 8 \
+               --checkpoint-activations \
+               --seq-length 1024 \
+               --max-position-embeddings 1024 \
+               --log-interval 10 \
+               --fp16 \
+               --no-load-optim \
+               --no-load-rng
--- a/examples/finetune_mnli_distributed.sh
+++ b/examples/finetune_mnli_distributed.sh
+#!/bin/bash
+
+WORLD_SIZE=8
+
+DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+
+TRAIN_DATA="data/glue_data/MNLI/train.tsv"
+VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \
+            data/glue_data/MNLI/dev_mismatched.tsv"
+PRETRAINED_CHECKPOINT=checkpoints/bert_345m
+VOCAB_FILE=bert-vocab.txt
+CHECKPOINT_PATH=checkpoints/bert_345m_mnli
+
+python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
+               --task MNLI \
+               --seed 1234 \
+               --train-data $TRAIN_DATA \
+               --valid-data $VALID_DATA \
+               --tokenizer-type BertWordPieceLowerCase \
+               --vocab-file $VOCAB_FILE \
+               --epochs 5 \
+               --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
+               --model-parallel-size 1 \
+               --num-layers 24 \
+               --hidden-size 1024 \
+               --num-attention-heads 16 \
+               --batch-size 8 \
+               --checkpoint-activations \
+               --lr 5.0e-5 \
+               --lr-decay-style linear \
+               --warmup 0.065 \
+               --seq-length 512 \
+               --max-position-embeddings 512 \
+               --save-interval 500000 \
+               --save $CHECKPOINT_PATH \
+               --log-interval 10 \
+               --eval-interval 100 \
+               --eval-iters 50 \
+               --weight-decay 1.0e-1 \
+               --fp16
--- a/examples/finetune_race_distributed.sh
+++ b/examples/finetune_race_distributed.sh
+#!/bin/bash
+
+WORLD_SIZE=8
+
+DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+
+TRAIN_DATA="data/RACE/train/middle"
+VALID_DATA="data/RACE/dev/middle \
+            data/RACE/dev/high"
+VOCAB_FILE=bert-vocab.txt
+PRETRAINED_CHECKPOINT=checkpoints/bert_345m
+CHECKPOINT_PATH=checkpoints/bert_345m_race
+
+python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
+               --task RACE \
+               --seed 1234 \
+               --train-data $TRAIN_DATA \
+               --valid-data $VALID_DATA \
+               --tokenizer-type BertWordPieceLowerCase \
+               --vocab-file $VOCAB_FILE \
+               --epochs 3 \
+               --pretrained-checkpoint $PRETRAINED_CHECKPOINT \
+               --model-parallel-size 1 \
+               --num-layers 24 \
+               --hidden-size 1024 \
+               --num-attention-heads 16 \
+               --batch-size 4 \
+               --checkpoint-activations \
+               --lr 1.0e-5 \
+               --lr-decay-style linear \
+               --warmup 0.06 \
+               --seq-length 512 \
+               --max-position-embeddings 512 \
+               --save-interval 100000 \
+               --save $CHECKPOINT_PATH \
+               --log-interval 10 \
+               --eval-interval 100 \
+               --eval-iters 50 \
+               --weight-decay 1.0e-1 \
+               --clip-grad 1.0 \
+               --hidden-dropout 0.1 \
+               --attention-dropout 0.1 \
+               --fp16
--- a/examples/generate_text.sh
+++ b/examples/generate_text.sh
+#!/bin/bash
+
+CHECKPOINT_PATH=checkpoints/gpt2_345m
+VOCAB_FILE=gpt2-vocab.json
+MERGE_FILE=gpt2-merges.txt
+
+python tools/generate_samples_gpt2.py \
+       --model-parallel-size 1 \
+       --num-layers 24 \
+       --hidden-size 1024 \
+       --load $CHECKPOINT_PATH \
+       --num-attention-heads 16 \
+       --max-position-embeddings 1024 \
+       --tokenizer-type GPT2BPETokenizer \
+       --fp16 \
+       --batch-size 2 \
+       --seq-length 1024 \
+       --out-seq-length 1024 \
+       --temperature 1.0 \
+       --vocab-file $VOCAB_FILE \
+       --merge-file $MERGE_FILE \
+       --genfile unconditional_samples.json \
+       --num-samples 2 \
+       --top_p 0.9 \
+       --recompute
--- a/examples/merge_mp_bert.sh
+++ b/examples/merge_mp_bert.sh
+#!/bin/bash
+
+MODEL_PARALLEL_SIZE=2
+
+VOCAB_FILE=bert-vocab.txt
+CHECKPOINT_PATH=checkpoints/bert_345m
+
+WORLD_SIZE=$MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \
+                                --model-type BERT \
+                                --model-parallel-size $MODEL_PARALLEL_SIZE \
+                                --tokenizer-type BertWordPieceLowerCase \
+                                --vocab-file $VOCAB_FILE \
+                                --num-layers 24 \
+                                --hidden-size 1024 \
+                                --num-attention-heads 16 \
+                                --seq-length 512 \
+                                --max-position-embeddings 512 \
+                                --load $CHECKPOINT_PATH
--- a/scripts/pretrain_bert.sh
+++ b/scripts/pretrain_bert.sh
@@ -2,6 +2,8 @@

 RANK=0
 WORLD_SIZE=1
+DATA_PATH=<Specify path and file prefix>_text_sentence
+CHECKPOINT_PATH=<Specify path>

 python pretrain_bert.py \
       --num-layers 24 \
@@ -9,26 +11,25 @@ python pretrain_bert.py \
       --num-attention-heads 16 \
       --batch-size 4 \
       --seq-length 512 \
-       --max-preds-per-seq 80 \
       --max-position-embeddings 512 \
-       --train-iters 1000000 \
-       --save checkpoints/bert_345m \
-       --load checkpoints/bert_345m \
-       --resume-dataloader \
-       --train-data wikipedia \
-       --lazy-loader \
-       --tokenizer-type BertWordPieceTokenizer \
-       --tokenizer-model-type bert-large-uncased \
-       --presplit-sentences \
-       --cache-dir cache \
+       --train-iters 2000000 \
+       --save $CHECKPOINT_PATH \
+       --load $CHECKPOINT_PATH \
+       --data-path $DATA_PATH \
+       --vocab-file bert-vocab.txt \
+       --data-impl mmap \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.0001 \
+       --min-lr 0.00001 \
       --lr-decay-style linear \
       --lr-decay-iters 990000 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup .01 \
-       --fp16 \
-       --fp32-layernorm \
-       --fp32-embedding
+       --log-interval 100 \
+       --save-interval 10000 \
+       --eval-interval 1000 \
+       --eval-iters 10 \
+       --fp16
+
--- a/scripts/pretrain_bert_distributed.sh
+++ b/scripts/pretrain_bert_distributed.sh
@@ -8,36 +8,37 @@ NNODES=1
 NODE_RANK=0
 WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

+DATA_PATH=<Specify path and file prefix>_text_sentence
+CHECKPOINT_PATH=<Specify path>
+
 DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

 python -m torch.distributed.launch $DISTRIBUTED_ARGS \
       pretrain_bert.py \
+       --model-parallel-size 1 \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --batch-size 4 \
       --seq-length 512 \
-       --max-preds-per-seq 80 \
       --max-position-embeddings 512 \
       --train-iters 1000000 \
-       --save checkpoints/bert_345m \
-       --load checkpoints/bert_345m \
-       --resume-dataloader \
-       --train-data wikipedia \
-       --lazy-loader \
-       --tokenizer-type BertWordPieceTokenizer \
-       --tokenizer-model-type bert-large-uncased \
-       --presplit-sentences \
-       --cache-dir cache \
+       --save $CHECKPOINT_PATH \
+       --load $CHECKPOINT_PATH \
+       --data-path $DATA_PATH \
+       --vocab-file bert-vocab.txt \
+       --data-impl mmap \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.0001 \
       --lr-decay-style linear \
+       --min-lr 1.0e-5 \
       --lr-decay-iters 990000 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup .01 \
-       --fp16 \
-       --fp32-layernorm \
-       --fp32-embedding
-
+       --log-interval 100 \
+       --save-interval 10000 \
+       --eval-interval 1000 \
+       --eval-iters 10 \
+       --fp16
--- a/scripts/pretrain_gpt2.sh
+++ b/scripts/pretrain_gpt2.sh
@@ -5,6 +5,10 @@
 RANK=0
 WORLD_SIZE=1

+DATA_PATH=<Specify path and file prefix>_text_document
+CHECKPOINT_PATH=<Specify path>
+
+
 python pretrain_gpt2.py \
       --num-layers 24 \
       --hidden-size 1024 \
@@ -12,22 +16,27 @@ python pretrain_gpt2.py \
       --batch-size 8 \
       --seq-length 1024 \
       --max-position-embeddings 1024 \
-       --train-iters 320000 \
-       --save checkpoints/gpt2_345m \
-       --load checkpoints/gpt2_345m \
-       --resume-dataloader \
-       --train-data wikipedia \
-       --lazy-loader \
-       --tokenizer-type GPT2BPETokenizer \
-       --cache-dir cache \
+       --train-iters 500000 \
+       --lr-decay-iters 320000 \
+       --save $CHECKPOINT_PATH \
+       --load $CHECKPOINT_PATH \
+       --data-path $DATA_PATH \
+       --vocab-file gpt2-vocab.json \
+       --merge-file gpt2-merges.txt \
+       --data-impl mmap \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.00015 \
+       --min-lr 1.0e-5 \
       --lr-decay-style cosine \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup .01 \
       --checkpoint-activations \
+       --log-interval 100 \
+       --save-interval 10000 \
+       --eval-interval 1000 \
+       --eval-iters 10 \
       --fp16



--- a/scripts/pretrain_gpt2_distributed.sh
+++ b/scripts/pretrain_gpt2_distributed.sh
@@ -10,33 +10,43 @@ NNODES=1
 NODE_RANK=0
 WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

+DATA_PATH=<Specify path and file prefix>_text_document
+CHECKPOINT_PATH=<Specify path>
+
 DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

 python -m torch.distributed.launch $DISTRIBUTED_ARGS \
       pretrain_gpt2.py \
+       --model-parallel-size 1 \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --batch-size 8 \
       --seq-length 1024 \
       --max-position-embeddings 1024 \
-       --train-iters 320000 \
-       --save checkpoints/gpt2_345m \
-       --load checkpoints/gpt2_345m \
-       --resume-dataloader \
-       --train-data wikipedia \
-       --lazy-loader \
-       --tokenizer-type GPT2BPETokenizer \
-       --cache-dir cache \
+       --train-iters 500000 \
+       --lr-decay-iters 320000 \
+       --save $CHECKPOINT_PATH \
+       --load $CHECKPOINT_PATH \
+       --data-path $DATA_PATH \
+       --vocab-file gpt2-vocab.json \
+       --merge-file gpt2-merges.txt \
+       --data-impl mmap \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.00015 \
       --lr-decay-style cosine \
+       --min-lr 1.0e-5 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup .01 \
       --checkpoint-activations \
+       --log-interval 100 \
+       --save-interval 10000 \
+       --eval-interval 1000 \
+       --eval-iters 10 \
       --fp16


+
 set +x
--- a/gpt2_data_loader.py
+++ b/gpt2_data_loader.py
-# coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-import os
-
-import numpy as np
-import torch
-from torch.multiprocessing import Lock
-from torch.utils.data import Dataset
-
-from megatron import mpu
-from megatron.data_utils.samplers import DistributedBatchSampler
-from megatron.data_utils.tokenization_gpt2 import GPT2Tokenizer
-
-
-def make_gpt2_dataloaders(args):
-
-    # Input parameters.
-    input_data_sizes_file = args.input_data_sizes_file
-    seq_length = args.seq_length
-    initial_seed = args.seed
-
-    # Data parallel arguments.
-    world_size = mpu.get_data_parallel_world_size()
-    rank = mpu.get_data_parallel_rank()
-    global_batch_size = args.batch_size * world_size
-    num_workers = args.num_workers
-
-    def make_data_loader_(data_path):
-        # Build the dataset.
-        dataset = GPT2Dataset(data_path, input_data_sizes_file,
-                              seq_length, initial_seed)
-        # Use a simple sampler with distributed batch sampler.
-        sampler = torch.utils.data.SequentialSampler(dataset)
-        batch_sampler = DistributedBatchSampler(sampler=sampler,
-                                                batch_size=global_batch_size,
-                                                drop_last=True,
-                                                rank=rank,
-                                                world_size=world_size)
-        # Torch dataloader.
-        return torch.utils.data.DataLoader(dataset,
-                                           batch_sampler=batch_sampler,
-                                           num_workers=num_workers,
-                                           pin_memory=True)
-
-    train = make_data_loader_(args.train_data)
-    valid = make_data_loader_(args.valid_data)
-    test = make_data_loader_(args.test_data)
-
-    args.do_train = False
-    args.do_valid = False
-    args.do_test = False
-
-    if train is not None:
-        args.do_train = True
-    if valid is not None:
-        args.do_valid = True
-    if test is not None:
-        args.do_test = True
-
-    # Tokenizer.
-    tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=args.cache_dir)
-    eod_token = tokenizer.encoder['<|endoftext|>']
-    num_tokens = eod_token + 1
-
-    return (train, valid, test), num_tokens, eod_token
-
-
-class GPT2Dataset(Dataset):
-
-    def __init__(self, data_path, sizes_filename, seq_length,
-                 initial_seed, max_epochs=100):
-        # Input parameters.
-        self.data_path = data_path
-        self.sizes_filename = sizes_filename
-        self.seq_length = seq_length
-        self.initial_seed = initial_seed
-        self.max_epochs = max_epochs
-        # Lock for building the dataset.
-        self.lock = Lock()
-
-        # Shard stuff.
-        # Dictionary from shard nameto its size (number of element).
-        self.master_shard_size_dict = None
-        # Dictionary from shard name to modified size so it is
-        # divisible by self.seq_length.
-        self.shard_size_dict = None
-        # Long array (self.max_epochs * num-shards) populated
-        # randomly with shard names.
-        self.shards_name = None
-        # Start index of the data for a shard.
-        self.shards_start_index = None
-        self.build_shard_mappings_()
-        self.data_length = self.shards_start_index[-1]
-
-        # Data.
-        self.shards_data = [None]*self.shards_name.size
-        self.shards_sample_index = [None]*self.shards_name.size
-
-    def __len__(self):
-        return self.data_length
-
-    def __getitem__(self, idx):
-        # Find which shard we need.
-        shard_index = np.searchsorted(self.shards_start_index,
-                                      idx, side='right') - 1
-        # data index in the shard.
-        data_idx = idx - self.shards_start_index[shard_index]
-        # Load the shard if it is not in memory.
-        #self.lock.acquire()
-        if self.shards_data[shard_index] is None:
-            print('global rank {} is building data for shard index {} ...'.
-                  format(torch.distributed.get_rank(), shard_index))
-            self.build_dataset_(shard_index)
-        #assert self.shards_data[shard_index] is not None
-        #self.lock.release()
-        # Start index.
-        start_index = self.shards_sample_index[shard_index][data_idx]
-        # Add one for label shift.
-        end_index = start_index + self.seq_length + 1
-        data = self.shards_data[shard_index][start_index:end_index]
-        return {'text': np.array(data, dtype=np.int64)}
-
-    def build_dataset_(self, shard_index):
-        # Garbage collect so we don't use a lot of memory.
-        # Leave the last one in case other threads have not catche up yet.
-        #for i in range(shard_index - 1):
-        for i in range(shard_index):
-            self.shards_data[i] = None
-            self.shards_sample_index[i] = None
-        # Read the shard.
-        filename = os.path.join(self.data_path, self.shards_name[shard_index])
-        print('loading {}'.format(filename))
-        data = np.load(filename, allow_pickle=True)
-        # Shuffle the data
-        rng = np.random.RandomState(self.initial_seed + shard_index)
-        rng.shuffle(data)
-        # Flatten.
-        data = np.hstack(data)
-        size = (data.shape[0] - 1) // self.seq_length
-        last_index = size * self.seq_length + 1
-        data = data[0:last_index]
-        self.shards_data[shard_index] = data
-        indices = np.arange(size) * self.seq_length
-        rng.shuffle(indices)
-        self.shards_sample_index[shard_index] = indices
-
-    def build_shard_mappings_(self):
-        # Load the sizes file.
-        sizes_filename = os.path.join(self.data_path, self.sizes_filename)
-        if torch.distributed.get_rank() == 0:
-            print(' > loading sizes from {}'.format(sizes_filename))
-        with open(sizes_filename, 'r') as f:
-            self.master_shard_size_dict = json.load(f)
-        if torch.distributed.get_rank() == 0:
-            print('   found {} shards'.format(len(self.master_shard_size_dict)))
-        # Adjust sizes to be a multiple of seq_length.
-        self.shard_size_dict = self.master_shard_size_dict.copy()
-        total_samples = 0
-        for shard in self.shard_size_dict:
-            size = self.shard_size_dict[shard]
-            size = ((size - 1) // self.seq_length) * self.seq_length
-            total_samples += size // self.seq_length
-            self.shard_size_dict[shard] = size
-        if torch.distributed.get_rank() == 0:
-            print('   found {} samples in the dataset'.format(total_samples))
-        # Build a list of shards.
-        shards_ = np.sort(np.array(list(self.shard_size_dict.keys())))
-        rng = np.random.RandomState(self.initial_seed)
-        self.shards_name = np.copy(shards_)
-        rng.shuffle(self.shards_name)
-        for i in range(1, self.max_epochs):
-            shards_c = np.copy(shards_)
-            rng.shuffle(shards_c)
-            self.shards_name = np.append(self.shards_name, shards_c)
-        # Build the global indexing.
-        self.shards_start_index = np.zeros(self.shards_name.size, dtype=np.int)
-        self.shards_start_index[0] = 0
-        for i in range(1, self.shards_name.size):
-            shard = str(self.shards_name[i-1])
-            size = self.shard_size_dict[shard]
-            self.shards_start_index[i] = self.shards_start_index[i-1] + \
-                                         size // self.seq_length
-
-'''
-if __name__ == '__main__':
-
-    print('gpt2 data loader ...')
-    path = '/raid/mshoeybi/data/gpt2/adlr/reddit_all_ftfy_lg200/npys'
-
-    dataset = GPT2Dataset(path, 'sizes.txt', 1024, 1234, 100)
-    print('dataset contains {} samples'.format(dataset.data_length))
-
-    for i in range(len(dataset)):
-        if i % 512000 == 0:
-            print(i)
-        data = dataset[i]
-'''
--- a/openwebtext/tokenizer.py
+++ b/openwebtext/tokenizer.py
 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,24 +13,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-sys.path.append('..')
+import torch

-from megatron.data_utils.tokenization_gpt2 import GPT2Tokenizer
+from .global_vars import get_args
+from .global_vars import get_tokenizer
+from .global_vars import get_tensorboard_writer
+from .global_vars import get_adlr_autoresume
+from .global_vars import get_timers


-class Tokenizer:
-
-    def __init__(self, cache_dir=None):
-        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2',
-                                                       cache_dir=cache_dir)
-        self.tokenizer.max_len = int(1e12)
-        self.eod_token = self.tokenizer.encoder['<|endoftext|>']
-        assert self.eod_token < 65535, 'vocab size will not fit in uint16'
-        print('> GPT2 tokenizer with {} vocab size and eod token {} ...'.format(
-            len(self.tokenizer.encoder), self.eod_token))
-
-    def tokenize_document(self, document):
-        tokens = self.tokenizer.encode(document)
-        tokens.append(self.eod_token)
-        return tokens
+def print_rank_0(message):
+    """Print `message` from rank 0 only; when torch.distributed is not
+    initialized, every calling process prints."""
+    not_initialized = not torch.distributed.is_initialized()
+    # Short-circuit keeps get_rank() from running before initialization.
+    if not_initialized or torch.distributed.get_rank() == 0:
+        print(message, flush=True)
--- a/arguments.py
+++ b/arguments.py
--- a/megatron/checkpointing.py
+++ b/megatron/checkpointing.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Input/output checkpointing."""
+
+import os
+import random
+import sys
+import numpy as np
+
+import torch
+from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
+
+from megatron import mpu
+from megatron import get_args
+from megatron import print_rank_0
+
+
+def check_checkpoint_args(checkpoint_args):
+    """Check that the arguments which are fixed for a model are identical
+    in the input arguments and in the ones retrieved from a checkpoint.
+
+    Each name listed below is read from both `checkpoint_args` and the
+    global arguments; the first mismatch fails an assertion whose message
+    gives the argument name and both values."""
+    args = get_args()
+
+    fixed_arg_names = (
+        'num_layers',
+        'hidden_size',
+        'num_attention_heads',
+        'max_position_embeddings',
+        'make_vocab_size_divisible_by',
+        'padded_vocab_size',
+        'tokenizer_type',
+        'model_parallel_size',
+    )
+    for name in fixed_arg_names:
+        from_checkpoint = getattr(checkpoint_args, name)
+        from_input = getattr(args, name)
+        mismatch_message = '{} value from checkpoint ({}) is not equal to the ' \
+                           'input argument value ({}).'.format(
+                               name, from_checkpoint, from_input)
+        assert from_checkpoint == from_input, mismatch_message
+
+
+def ensure_directory_exists(filename):
+    """Create the directory that will contain `filename` if it is missing.
+
+    Arguments:
+        filename: path of a file; only its directory part
+            (os.path.dirname(filename)) is created, never the file itself.
+    """
+    dirname = os.path.dirname(filename)
+    # A bare file name has an empty directory part: it refers to the
+    # current directory and os.makedirs('') would raise FileNotFoundError.
+    if dirname:
+        # exist_ok removes the check-then-create race when several
+        # processes build the same directory at the same time.
+        os.makedirs(dirname, exist_ok=True)
+
+
+def get_checkpoint_name(checkpoints_path, iteration,
+                        release=False, mp_rank=None):
+    """Build the unified checkpoint file name
+    <checkpoints_path>/<directory>/mp_rank_XX/model_optim_rng.pt.
+
+    <directory> is 'release' when `release` is set, otherwise 'iter_'
+    followed by the iteration zero-padded to 7 digits. When `mp_rank` is
+    None, the value of mpu.get_model_parallel_rank() is used."""
+    directory = 'release' if release else 'iter_{:07d}'.format(iteration)
+    if mp_rank is None:
+        mp_rank = mpu.get_model_parallel_rank()
+    rank_directory = 'mp_rank_{:02d}'.format(mp_rank)
+    return os.path.join(checkpoints_path, directory, rank_directory,
+                        'model_optim_rng.pt')
+
+
+def get_checkpoint_tracker_filename(checkpoints_path):
+    """Return the path of the tracker file, which records the latest
+    checkpoint written during training so a run can restart from it."""
+    tracker_basename = 'latest_checkpointed_iteration.txt'
+    return os.path.join(checkpoints_path, tracker_basename)
+
+
+def save_checkpoint(iteration, model, optimizer, lr_scheduler):
+    """Write a checkpoint for `iteration` and record it in the tracker file.
+
+    Processes with data parallel rank zero save the arguments, the
+    iteration and the model state, plus the optimizer / lr scheduler
+    state (unless args.no_save_optim) and the rng states (unless
+    args.no_save_rng). Global rank zero then writes `iteration` to the
+    tracker file under args.save."""
+    args = get_args()
+
+    # Checkpoint the wrapped module, not the torchDDP wrapper.
+    if isinstance(model, torchDDP):
+        model = model.module
+
+    # Only rank zero of the data parallel group writes to the disk.
+    if mpu.get_data_parallel_rank() == 0:
+
+        # Arguments, iteration, and model.
+        payload = {
+            'args': args,
+            'iteration': iteration,
+            'model': model.state_dict_for_save_checkpoint(),
+        }
+
+        # Optimizer and learning rate scheduler.
+        save_optim = not args.no_save_optim
+        if save_optim and optimizer is not None:
+            payload['optimizer'] = optimizer.state_dict()
+        if save_optim and lr_scheduler is not None:
+            payload['lr_scheduler'] = lr_scheduler.state_dict()
+
+        # RNG states.
+        if not args.no_save_rng:
+            payload.update({
+                'random_rng_state': random.getstate(),
+                'np_rng_state': np.random.get_state(),
+                'torch_rng_state': torch.get_rng_state(),
+                'cuda_rng_state': torch.cuda.get_rng_state(),
+                'rng_tracker_states':
+                    mpu.get_cuda_rng_tracker().get_states(),
+            })
+
+        # Save.
+        target = get_checkpoint_name(args.save, iteration)
+        global_rank = torch.distributed.get_rank()
+        print('global rank {} is saving checkpoint at iteration {:7d} to {}'.
+              format(global_rank, iteration, target))
+        ensure_directory_exists(target)
+        torch.save(payload, target)
+        print('  successfully saved {}'.format(target))
+
+    # Every rank waits here so that all checkpoint files are written
+    # before the tracker file is updated (necessary).
+    torch.distributed.barrier()
+    # Global rank zero records the latest iteration.
+    if torch.distributed.get_rank() == 0:
+        tracker_path = get_checkpoint_tracker_filename(args.save)
+        with open(tracker_path, 'w') as tracker_file:
+            tracker_file.write(str(iteration))
+    # Wait so everyone is done (not necessary)
+    torch.distributed.barrier()
+
+
+def load_checkpoint(model, optimizer, lr_scheduler):
+    """Load a model checkpoint and return the iteration.
+
+    The checkpoint to load is named by the tracker file under args.load,
+    which holds either an iteration number or the string 'release'.
+
+    Arguments:
+        model: model to load the weights into; a torchDDP wrapper is
+            unwrapped to its `.module` first.
+        optimizer: optimizer whose state is restored, or None to skip it.
+        lr_scheduler: learning rate scheduler whose state is restored, or
+            None to skip it.
+
+    Returns:
+        0 when there is no tracker file (nothing is loaded), when
+        args.finetune is set, or when a release checkpoint is loaded;
+        otherwise the iteration stored in the checkpoint.
+
+    Optimizer and rng states are only restored for non-release,
+    non-finetune loads and can be disabled with args.no_load_optim and
+    args.no_load_rng. Unrecoverable problems (invalid tracker file,
+    unreadable checkpoint, missing iteration, optimizer or rng entries)
+    terminate the process through sys.exit().
+    """
+    args = get_args()
+
+    if isinstance(model, torchDDP):
+        model = model.module
+    # Read the tracker file and set the iteration.
+    tracker_filename = get_checkpoint_tracker_filename(args.load)
+
+    # If no tracker file, return iteration zero.
+    if not os.path.isfile(tracker_filename):
+        print_rank_0('WARNING: could not find the metadata file {} '.format(
+            tracker_filename))
+        print_rank_0('    will not load any checkpoints and will start from '
+                     'random')
+        return 0
+
+    # Otherwise, read the tracker file and either set the iteration or
+    # mark it as a release checkpoint.
+    iteration = 0
+    release = False
+    with open(tracker_filename, 'r') as f:
+        metastring = f.read().strip()
+        try:
+            iteration = int(metastring)
+        except ValueError:
+            release = metastring == 'release'
+            if not release:
+                print_rank_0('ERROR: Invalid metadata file {}. Exiting'.format(
+                    tracker_filename))
+                sys.exit()
+
+    assert iteration > 0 or release, 'error parsing metadata file {}'.format(
+        tracker_filename)
+
+    # Checkpoint.
+    checkpoint_name = get_checkpoint_name(args.load, iteration, release)
+    if mpu.get_data_parallel_rank() == 0:
+        print('global rank {} is loading checkpoint {}'.format(
+            torch.distributed.get_rank(), checkpoint_name))
+
+    # Load the checkpoint.
+    try:
+        state_dict = torch.load(checkpoint_name, map_location='cpu')
+    except ModuleNotFoundError:
+        # For backward compatibility: temporarily alias the old module
+        # path 'fp16.loss_scaler' to its current location so that
+        # unpickling can resolve it, then remove the alias again.
+        print_rank_0(' > deserializing using the old code structure ...')
+        sys.modules['fp16.loss_scaler'] = sys.modules[
+            'megatron.fp16.loss_scaler']
+        state_dict = torch.load(checkpoint_name, map_location='cpu')
+        sys.modules.pop('fp16.loss_scaler', None)
+    except Exception:
+        # Exception rather than BaseException, so that KeyboardInterrupt
+        # and SystemExit are not reported as a failed load.
+        print_rank_0('could not load the checkpoint')
+        sys.exit()
+
+    # Set iteration.
+    if args.finetune or release:
+        iteration = 0
+    else:
+        try:
+            iteration = state_dict['iteration']
+        except KeyError:
+            try:  # Backward compatible with older checkpoints
+                iteration = state_dict['total_iters']
+            except KeyError:
+                print_rank_0('A metadata file exists but unable to load '
+                             'iteration from checkpoint {}, exiting'.format(
+                                 checkpoint_name))
+                sys.exit()
+
+    # Check arguments.
+    if 'args' in state_dict:
+        checkpoint_args = state_dict['args']
+        check_checkpoint_args(checkpoint_args)
+    else:
+        print_rank_0('could not find arguments in the checkpoint ...')
+
+    # Model.
+    model.load_state_dict(state_dict['model'])
+
+    # Optimizer.
+    if not release and not args.finetune and not args.no_load_optim:
+        try:
+            if optimizer is not None:
+                optimizer.load_state_dict(state_dict['optimizer'])
+            if lr_scheduler is not None:
+                lr_scheduler.load_state_dict(state_dict['lr_scheduler'])
+        except KeyError:
+            print_rank_0('Unable to load optimizer from checkpoint {}. '
+                         'Specify --no-load-optim or --finetune to prevent '
+                         'attempting to load the optimizer state, '
+                         'exiting ...'.format(checkpoint_name))
+            sys.exit()
+
+    # rng states.
+    if not release and not args.finetune and not args.no_load_rng:
+        try:
+            random.setstate(state_dict['random_rng_state'])
+            np.random.set_state(state_dict['np_rng_state'])
+            torch.set_rng_state(state_dict['torch_rng_state'])
+            torch.cuda.set_rng_state(state_dict['cuda_rng_state'])
+            mpu.get_cuda_rng_tracker().set_states(
+                state_dict['rng_tracker_states'])
+        except KeyError:
+            # This message names the rng state, matching the
+            # --no-load-rng flag it recommends.
+            print_rank_0('Unable to load rng state from checkpoint {}. '
+                         'Specify --no-load-rng or --finetune to prevent '
+                         'attempting to load the rng state, '
+                         'exiting ...'.format(checkpoint_name))
+            sys.exit()
+
+    torch.distributed.barrier()
+    if mpu.get_data_parallel_rank() == 0:
+        print('  successfully loaded {}'.format(checkpoint_name))
+
+    return iteration
--- a/megatron/data/__init__.py
+++ b/megatron/data/__init__.py
 from . import indexed_dataset
-from .bert_tokenization import FullTokenizer as FullBertTokenizer
-from .albert_dataset import AlbertDataset