Commit 75bd9b54 authored by Jared Casper

Merge branch 'megatron_sampler' into 'main'

Simplified sampler (will be needed later for batch size increase) and removed deprecated data stuff

See merge request ADLR/megatron-lm!177
parents ea81d62f fac6718a
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for using and training tokenizers (char, wordpiece, sentencepiece)"""
from collections import namedtuple
import random
import os
import csv
import torch
import nltk
from nltk import tokenize as nltk_tokenize
import sentencepiece as spm
from .wordpiece import BertTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
from .tokenization_gpt2 import GPT2Tokenizer
import regex as re
def make_tokenizer(tokenizer_type, corpus, model_path=None, vocab_size=None, model_type='bpe',
pad_token=0, character_coverage=1.0, command_tokens=None, type_tokens=None, **kwargs):
"""
Helper function to instantiate a tokenizer given common combinations of options.
"""
tokenizer_class = tokenizer_type
if isinstance(tokenizer_class, str):
tokenizer_class = eval(tokenizer_class)
if tokenizer_class is BertWordPieceTokenizer:
return BertWordPieceTokenizer(model_type, **kwargs)
elif tokenizer_class is GPT2BPETokenizer:
return GPT2BPETokenizer(**kwargs)
text_tokenizer = tokenizer_class(corpus=corpus, vocab_size=vocab_size, model_path=model_path, model_type=model_type,
pad_token=pad_token, character_coverage=character_coverage)
return Tokenizer(text_tokenizer, command_tokens, type_tokens)
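# Illustrative sketch, not part of the original module: typical ways to call
# make_tokenizer for the tokenizer types handled above. The demo function name
# is hypothetical, and the GPT-2 path downloads its pretrained vocab/merges
# files on first use.
def _example_make_tokenizer():
    # Pretrained GPT-2 BPE: corpus/model_path/vocab_size are ignored.
    gpt2_tok = make_tokenizer('GPT2BPETokenizer', None)
    # Character-level tokenizer wrapped with the default command/type tokens.
    char_tok = make_tokenizer('CharacterLevelTokenizer', None)
    return gpt2_tok, char_tok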
class Tokenization(object):
"""
Tokenization object to hold the tokenization, (processed) text, and original
text. Can hold the tokenization as Ids or tokens.
It also holds command tokens (pad, unk, etc.) for the tokenization.
This allows functions to pad/operate on tokenizations without having
access to the full tokenizer, just the tokenization.
Several standard array operations are implemented (insert, append, extend).
"""
def __init__(self, tokenization, text=None, original_text=None,
command_tokens=None, asIds=True):
self.tokenization = tokenization
self.text = text
if self.text is None:
self.text = self.tokenization
self.original_text = original_text
if self.original_text is None:
self.original_text = self.text
self.command_tokens = command_tokens
self.asIds = asIds
self.parse_command_tokens()
def set_command_tokens(self, command_tokens):
self.command_tokens = command_tokens
return self.parse_command_tokens()
def parse_command_tokens(self):
if self.command_tokens is None:
return
for command_token in self.command_tokens:
if self.asIds:
setattr(self, command_token.name, command_token.Id)
else:
setattr(self, command_token.name, command_token.token)
def __getitem__(self, index):
return self.tokenization[index]
def __len__(self):
return len(self.tokenization)
def insert(self, idx, other):
if isinstance(other, (CommandToken, TypeToken)):
self.tokenization.insert(idx, other.Id)
if idx == 0:
self.text = other.token + self.text
self.original_text = other.token + self.original_text
elif idx == len(self.tokenization) - 1:
self.text += other.token
self.original_text += other.token
elif isinstance(other, Tokenization):
self.tokenization = self.tokenization[:idx] + \
other.tokenization + self.tokenization[idx:]
else:
self.tokenization = self.tokenization[:idx] + \
other + self.tokenization[idx:]
def append(self, other):
if isinstance(other, (CommandToken, TypeToken)):
self.tokenization.append(other.Id)
self.text += other.token
self.original_text += other.token
elif isinstance(other, Tokenization):
self.tokenization.extend(other.tokenization)
self.text += other.text
self.original_text += other.original_text
else:
self.tokenization.append(other)
return self
def extend(self, other):
if isinstance(other, (CommandToken, TypeToken)):
self.tokenization.append(other.Id)
self.text += other.token
self.original_text += other.token
elif isinstance(other, list) and isinstance(other[0], (CommandToken, TypeToken)):
self.tokenization.extend([o.Id for o in other])
self.text += ''.join(o.token for o in other)
self.original_text += ''.join(o.token for o in other)
elif isinstance(other, Tokenization):
self.tokenization.extend(other.tokenization)
self.text += other.text
self.original_text += other.original_text
else:
self.tokenization.extend(other)
return self
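# Illustrative sketch, not part of the original module: Tokenization acts like a
# list of ids but also carries the (processed) text and any command tokens, so
# helpers can pad or add special tokens without the full tokenizer. The demo
# function name and the '<pad>' id below are made up for illustration.
def _example_tokenization():
    pad = CommandToken('pad', '<pad>', 0)
    tok = Tokenization([5, 6, 7], text='abc', command_tokens=[pad])
    tok.append(pad)      # ids -> [5, 6, 7, 0], text -> 'abc<pad>'
    tok.insert(0, pad)   # ids -> [0, 5, 6, 7, 0], text -> '<pad>abc<pad>'
    return len(tok), tok.pad   # (5, 0); command ids are exposed as attributes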
"""define some default command tokens for the tokenizer to use"""
token_format = "<{0}>"
COMMAND_TUPLE = namedtuple('CommandToken', ('name', 'token', 'Id'))
def prep_command_tokens(tokenlist, token_format=token_format):
return [CommandToken(tok[0], token_format.format(tok[0]), tok[1]) for tok in tokenlist]
class CommandToken(object):
def __init__(self, name, token, Id):
self.name = name
self.token = token
self.Id = Id
def __str__(self):
return str(COMMAND_TUPLE(self.name, self.token, self.Id))
DEFAULT_COMMAND_TOKENS = [
('pad', 0),
('eos', 1),
('bos', 2),
('unk', 3),
('sep', 4),
('L2R', 5),
('ENC', 6),
('MASK', 7),
]
DEFAULT_COMMAND_TOKENS = prep_command_tokens(DEFAULT_COMMAND_TOKENS)
"""define some default type tokens for bert training"""
TYPE_TUPLE = namedtuple('TypeToken', ('name', 'token', 'Id'))
def prep_type_tokens(tokenlist, token_format=token_format):
return [TypeToken(tok[0], token_format.format(tok[0]), tok[1]) for tok in tokenlist]
class TypeToken(object):
def __init__(self, name, token, Id):
self.name = name
self.token = token
self.Id = Id
def __str__(self):
return str(TYPE_TUPLE(self.name, self.token, self.Id))
DEFAULT_TYPE_TOKENS = [
('function', 0),
('command', 1),
('str0', 2),
('str1', 3),
('str2', 4),
('embedding0', 5),
('embedding1', 6),
('embedding2', 7),
('arg0', 8),
('arg1', 9),
('arg2', 10),
]
DEFAULT_TYPE_TOKENS = prep_type_tokens(DEFAULT_TYPE_TOKENS)
class Tokenizer(object):
"""
Tokenizer object that handles text tokenization, command tokens, and type tokens.
Command tokens and text tokens are stored together in one mapping of size
`len(text_tokenizer) + len(command_tokens)`. Command tokens occupy the first
`len(command_tokens)` ids, and text token `idx` is stored at `idx + len(command_tokens)`.
Token types are stored in a separate mapping of size `len(type_tokens)`.
"""
def __init__(self, text_tokenizer, command_tokens=None, type_tokens=None):
# set text tokenizer
self.text_tokenizer = text_tokenizer
if not hasattr(self, 'num_text_tokens'):
self.num_text_tokens = len(self.text_tokenizer)
# set command tokens
if command_tokens is None:
command_tokens = DEFAULT_COMMAND_TOKENS
self._command_tokens = command_tokens
self.command_name_map = {tok.name: tok for tok in self._command_tokens}
self.command_token_map = {tok.token: tok for tok in self._command_tokens}
self.command_id_map = {tok.Id: tok for tok in self._command_tokens}
if not hasattr(self, 'num_command_tokens'):
self.num_command_tokens = len(self._command_tokens)
if not hasattr(self, 'num_tokens'):
self.num_tokens = self.num_command_tokens + self.num_text_tokens
# set type tokens
if type_tokens is None:
type_tokens = DEFAULT_TYPE_TOKENS
self.type_tokens = type_tokens
self.type_name_map = {tok.name: tok for tok in self.type_tokens}
self.type_token_map = {tok.token: tok for tok in self.type_tokens}
self.type_id_map = {tok.Id: tok for tok in self.type_tokens}
if not hasattr(self, 'num_type_tokens'):
self.num_type_tokens = len(self.type_tokens)
# parse tokens and vocabs from tokenizer
self._tokens = list(self.command_token_map.keys()) + list(self.text_tokenizer.tokens)
self._vocab = {t: Id for Id, t in self.command_id_map.items()}
self._vocab.update({t: Id + self.num_command_tokens for t,
Id in self.text_tokenizer.vocab.items()})
self._text_tokens = list(self.text_tokenizer.tokens)
self._text_token_vocab = {
t: Id + self.num_command_tokens for t,
Id in self.text_tokenizer.vocab.items()}
self._command_token_tokens = list(self.command_token_map.keys())
self._command_token_vocab = {t: Id for Id, t in self.command_id_map.items()}
self._token_types = list(self.type_token_map.keys())
self._token_type_vocab = {t: Id for Id, t in self.type_id_map.items()}
def __call__(self, text, process_fn=None):
"""run preprocessing and encode text as Ids"""
return self.EncodeAsIds(text, process_fn=process_fn)
def __len__(self):
"""total number of tokens"""
return self.num_tokens
def get_command(self, name):
"""get command token corresponding to `name`"""
return self.command_name_map[name]
def get_type(self, name):
"""get type token corresponding to `name`"""
return self.type_name_map[name]
@property
def tokens(self):
"""list (or iterable) of all tokens for tokenizer"""
return self._tokens
@property
def vocab(self):
"""dictionary mapping tokens to ids for tokenizer"""
return self._vocab
@property
def token_types(self):
"""list (or iterable) of all token types for tokenizer"""
return self._token_types
@property
def token_type_vocab(self):
"""dictionary mapping token types to ids for tokenizer"""
return self._token_type_vocab
@property
def command_tokens(self):
"""list (or iterable) of all command tokens for tokenizer"""
return self._command_token_tokens
@property
def command_token_vocab(self):
"""dictionary mapping command tokens to ids for tokenizer"""
return self._command_token_vocab
@property
def text_tokens(self):
"""list (or iterable) of text tokens for text tokenizer"""
return self._text_tokens
@property
def text_token_vocab(self):
"""dictionary mapping text tokens to ids for text tokenizer"""
return self._text_token_vocab
def EncodeAsIds(self, text, process_fn=None):
"""
encode text using text tokenizer and shift Id values for command tokens
"""
tokenization = self.text_tokenizer.EncodeAsIds(text, process_fn=process_fn)
tokenization.tokenization = [t + self.num_command_tokens for t in tokenization.tokenization]
tokenization.set_command_tokens(self._command_tokens)
return tokenization
def EncodeAsTokens(self, text, process_fn=None):
"""
encode text as tokens using text tokenizer
"""
tokenization = self.text_tokenizer.EncodeAsTokens(text, process_fn=process_fn)
tokenization.set_command_tokens(self._command_tokens)
return tokenization
def IdToToken(self, Id, type_token=False):
"""convert Id to token accounting for command and type tokens"""
if isinstance(Id, (TypeToken, CommandToken)):
return Id.token
if type_token:
return self.type_id_map[Id].token
if Id < self.num_command_tokens:
return self.command_id_map[Id].token
return self.text_tokenizer.IdToToken(Id - self.num_command_tokens)
def TokenToId(self, token, type_token=False):
"""convert token to Id accounting for command and type tokens"""
if isinstance(token, (TypeToken, CommandToken)):
return token.Id
if type_token:
return self.type_token_map[token].Id
if token in self.command_token_map:
return self.command_token_map[token].Id
return self.text_tokenizer.TokenToId(token) + self.num_command_tokens
def DecodeIds(self, Ids, type_token=False):
"""
convert Ids to tokens accounting for command and type tokens, tokens
are joined and returned as a string.
"""
if type_token:
return ' '.join(Id.token if isinstance(Id, TypeToken)
else self.type_id_map[Id].token for Id in Ids)
rtn_strs = []
current_str = []
if isinstance(Ids, Tokenization):
Ids = Ids.tokenization
for Id in Ids:
if isinstance(Id, CommandToken):
rtn_strs.append(self.text_tokenizer.DecodeIds(current_str))
current_str = []
rtn_strs.append(Id.token)
elif Id < self.num_command_tokens:
rtn_strs.append(self.text_tokenizer.DecodeIds(current_str))
current_str = []
rtn_strs.append(self.command_id_map[Id].token)
else:
current_str.append(Id - self.num_command_tokens)
if current_str != []:
rtn_strs.append(self.text_tokenizer.DecodeIds(current_str))
return ' '.join(rtn_strs)
def DecodeTokens(self, Tokens, type_token=False):
"""
convert tokens to a string accounting for command and type tokens.
"""
if type_token:
return ' '.join(t.token if isinstance(t, TypeToken) else t for t in Tokens)
rtn_strs = []
current_str = []
if isinstance(Tokens, Tokenization):
Tokens = Tokens.tokenization
for t in Tokens:
if isinstance(t, CommandToken):
rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str))
current_str = []
rtn_strs.append(t.token)
elif t in self.command_token_map:
rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str))
current_str = []
rtn_strs.append(t)
else:
current_str.append(t)
if current_str != []:
rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str))
return ' '.join(rtn_strs)
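# Illustrative sketch, not part of the original module: how the combined id space
# described in the class docstring behaves. Command tokens occupy ids
# 0..num_command_tokens-1 and text-token ids are shifted up by that amount.
# The demo function name is hypothetical.
def _example_id_offsets():
    tok = Tokenizer(CharacterLevelTokenizer())   # 8 default command tokens + 256 chars
    ids = tok.EncodeAsIds('hi').tokenization     # [ord('h') + 8, ord('i') + 8]
    assert tok.IdToToken(ids[0]) == 'h'
    assert tok.IdToToken(tok.get_command('pad').Id) == '<pad>'
    return ids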
class TextTokenizer(object):
"""
Interface for text tokenizer
"""
def __init__(self):
if not hasattr(self, 'num_text_tokens'):
self.num_text_tokens = 0
if not hasattr(self, 'num_tokens'):
self.num_tokens = self.num_text_tokens
def __call__(self, text, process_fn=None):
return self.EncodeAsIds(text, process_fn)
def __len__(self):
return self.num_text_tokens
@property
def tokens(self):
"""list (or iterable) of text tokens for text tokenizer"""
raise NotImplementedError('TextTokenizer tokens property not implemented')
@property
def vocab(self):
"""dictionary mapping tokens to ids"""
raise NotImplementedError('TextTokenizer vocab property not implemented')
@staticmethod
def exists(model_path):
"""check if the filepath for a text tokenizer exists"""
raise NotImplementedError('TextTokenizer exists method not implemented')
def Train(self, corpus):
"""train a tokenizer on a data corpus and save model for future use"""
raise NotImplementedError('TextTokenizer Train not implemented')
def EncodeAsIds(self, text, process_fn=None):
"""
Preprocess text and encode as ids. Return a tokenization object with
original text, processed text, and id tokenization.
"""
raise NotImplementedError('TextTokenizer EncodeAsIds not implemented')
def EncodeAsTokens(self, text, process_fn=None):
"""
Preprocess text and encode as tokens. Return a tokenization object with
original text, processed text, and token tokenization.
"""
raise NotImplementedError('TextTokenizer EncodeAsTokens not implemented')
def IdToToken(self, Id):
"""Convert an Id to Token. Reverse lookup of self.vocab"""
raise NotImplementedError('TextTokenizer IdToToken not implemented')
def TokenToId(self, token):
"""Convert a Token to Id. Lookup of self.vocab"""
raise NotImplementedError('TextTokenizer TokenToId not implemented')
def DecodeIds(self, Ids):
"""Convert a list or tokenization object of Ids to a text string"""
raise NotImplementedError('TextTokenizer DecodeIds not implemented')
def DecodeTokens(self, Tokens):
"""Convert a list or tokenization object of tokens to a text string"""
raise NotImplementedError('TextTokenizer DecodeTokens not implemented')
class CharacterLevelTokenizer(TextTokenizer):
"""
Text tokenizer for ASCII-256 Character Level Tokenization.
"""
def __init__(self, **kwargs):
self.num_text_tokens = 256
super(CharacterLevelTokenizer, self).__init__()
self._tokens = [self.IdToToken(Id) for Id in range(self.num_text_tokens)]
self._vocab = {t: i for i, t in enumerate(self._tokens)}
def __len__(self):
return 256
@staticmethod
def exists(model_path):
return True
def Train(self, corpus):
pass
@property
def tokens(self):
return self._tokens
@property
def vocab(self):
return self._vocab
def EncodeAsIds(self, text, process_fn=None):
"""convert text to ascii 256 Ids"""
processed_text = text
if process_fn is not None:
processed_text = process_fn(processed_text)
processed_text = str(processed_text)
tokens = [self.TokenToId(c) for c in processed_text]
return Tokenization(tokens, processed_text, text)
def EncodeAsTokens(self, text, process_fn=None):
"""convert text to ascii 256 characters"""
processed_text = text
if process_fn is not None:
processed_text = process_fn(processed_text)
processed_text = str(processed_text)
tokens = [c for c in processed_text]
return Tokenization(tokens, processed_text, text, asIds=False)
def IdToToken(self, Id):
"""ascii index to character"""
return chr(Id)
def TokenToId(self, token):
"""ascii character to index"""
return ord(token)
def DecodeIds(self, Ids):
"""converts ascii ids to tokens before joining them into text"""
if isinstance(Ids, Tokenization):
Ids = Ids.tokenization
return ''.join([self.IdToToken(tok) for tok in Ids])
def DecodeTokens(self, Tokens):
"""just concatenates ascii tokens into text"""
if isinstance(Tokens, Tokenization):
Tokens = Tokens.tokenization
return ''.join(Tokens)
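# Illustrative sketch, not part of the original module: the character tokenizer
# is a plain chr/ord mapping, so encode/decode round-trips any string whose
# characters fit the 256-id vocabulary.
def _example_char_round_trip():
    tok = CharacterLevelTokenizer()
    ids = tok.EncodeAsIds('abc').tokenization    # [97, 98, 99]
    assert tok.DecodeIds(ids) == 'abc'
    assert tok.DecodeTokens(tok.EncodeAsTokens('abc')) == 'abc'
    return ids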
MAX_SENTENCEPIECE_SENTENCES = 100000000
def get_corpus_freq(dataset, filepath, filetype='tsv'):
"""
Take corpus, split it into sentences, and extract word frequencies.
Write frequencies to `filepath` as a tsv. Only write the first
MAX_SENTENCEPIECE_SENTENCES most common words to the file.
"""
nltk.download('punkt', download_dir="./nltk")
if filetype == 'tsv':
delimiter = '\t'
else:
delimiter = ','
print("compute corpus frequency\n", flush=True)
total_sentence_count = 0
maxlen = 0
freqs = {}
for entry in dataset:
if isinstance(entry, dict):
entry = entry['text']
lines = entry.strip().split('\n')
for line in lines:
sentences = nltk_tokenize.sent_tokenize(line)
total_sentence_count += len(sentences)
for sentence in sentences:
maxlen = max(len(line), maxlen)
for word in sentence.split():
if word not in freqs:
freqs[word] = 0
freqs[word] += 1
print("length of freqs before truncating " + str(len(freqs)), flush=True)
print("file path for freq " + str(filepath), flush=True)
freqs_sorted = {}
counter = 0
for word, count in sorted(freqs.items(), key=lambda x: x[1], reverse=True):
if counter >= MAX_SENTENCEPIECE_SENTENCES:
break
counter += 1
freqs_sorted[word] = count
print("length of freqs after trancating " + str(len(freqs_sorted)), flush=True)
with open(filepath, 'w') as f:
writer = csv.writer(f, delimiter=delimiter)
for k, v in freqs_sorted.items():
writer.writerow([str(k), str(v)])
return total_sentence_count, maxlen
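# Illustrative sketch, not part of the original module: get_corpus_freq emits one
# "word<TAB>count" row per word, most frequent first, which is the
# --input_format=tsv frequency file used for sentencepiece training below.
# The output path and dataset are hypothetical examples, and the call downloads
# the nltk 'punkt' sentence splitter.
def _example_corpus_freq(output_path='./freqs.tsv'):
    dataset = ['the cat sat\nthe cat ran', {'text': 'the dog sat'}]
    sentence_count, maxlen = get_corpus_freq(dataset, output_path)
    # output_path now starts with the most frequent word, e.g. a row like "the\t3"
    return sentence_count, maxlen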
class SentencePieceTokenizer(TextTokenizer):
"""Trains and uses sentencepiece for text tokenization"""
def __init__(self, model_type='bpe', vocab_size=None, corpus=None,
model_path=None, character_coverage=1.0, **kwargs):
self.character_coverage = character_coverage
self.model_type = model_type.lower()
self.spm_model = model_path
self.num_text_tokens = vocab_size
make_train = not SentencePieceTokenizer.exists(self.spm_model)
if make_train:
assert corpus is not None and self.num_text_tokens is not None
self.Train(corpus, self.num_text_tokens)
self._tokens = []
self._vocab = {}
self.load_spm_model()
super(SentencePieceTokenizer, self).__init__()
def __len__(self):
return self.num_text_tokens
@property
def tokens(self):
return self._tokens
@property
def vocab(self):
return self._vocab
@staticmethod
def exists(model_path):
if model_path is None:
return False
# check if path exists
dne = not os.path.exists(model_path)
# check if path.model exists
if dne and not model_path.endswith('.model'):
dne = not os.path.exists(model_path + '.model')
return not dne
def load_spm_model(self):
"""load sentencepiece model and parse vocab"""
if not os.path.exists(self.spm_model) and not self.spm_model.endswith('.model'):
self.spm_model = self.spm_model + '.model'
self.sp = spm.SentencePieceProcessor()
self.sp.Load(self.spm_model)
self.vocab_size = self.num_text_tokens = len(self.sp)
self._tokens = [self.IdToToken(t) for t in range(self.vocab_size)]
self._vocab = {t: i for i, t in enumerate(self._tokens)}
def Train(self, corpus, num_text_tokens):
"""train sentencepiece model on corpus using word frequencies"""
self.num_text_tokens = num_text_tokens
use_model_path = self.spm_model
random_hash = str(random.randint(0, 2147483647))
if use_model_path is None:
use_model_path = random_hash
if use_model_path.endswith('.model'):
use_model_path = use_model_path[:use_model_path.rfind('.model')]
input_path = use_model_path + '.tsv.' + random_hash
line_count, maxlenline = get_corpus_freq(corpus, input_path)
line_count = min(line_count, MAX_SENTENCEPIECE_SENTENCES)
print('line count used as input_sentence_size ', line_count, flush=True)
print('training sentencepiece model', flush=True)
train_string = '--input={file_path} --model_prefix={model_prefix} --vocab_size={vocab_size}' \
+ ' --model_type={model_type} --character_coverage={character_coverage} ' \
+ '--input_sentence_size={input_sentence_size} ' \
+ '--input_format=tsv'
train_string = train_string.format(file_path=input_path, model_prefix=use_model_path, vocab_size=num_text_tokens,
model_type=self.model_type, character_coverage=self.character_coverage,
input_sentence_size=int(line_count))
print("calling spm.SentencePieceTrainer.Train(%s)" % (train_string), flush=True)
spm.SentencePieceTrainer.Train(train_string)
os.remove(input_path)
self.spm_model = use_model_path + '.model'
print('sentencepiece model written to ' + self.spm_model, flush=True)
def EncodeAsIds(self, text, process_fn=None):
"""convert text to sentencepiece Ids"""
processed_text = text
if process_fn is not None:
processed_text = process_fn(processed_text)
tokens = self.sp.EncodeAsIds(processed_text)
return Tokenization(tokens, processed_text, text)
def EncodeAsTokens(self, text, process_fn=None):
"""convert text to sentencepiece tokens"""
processed_text = text
if process_fn is not None:
processed_text = process_fn(processed_text)
tokens = self.sp.EncodeAsTokens(processed_text)
return Tokenization(tokens, processed_text, text, asIds=False)
def IdToToken(self, Id):
"""convert Id to sentencpiece token"""
return self.sp.IdToPiece(Id)
def TokenToId(self, token):
"""convert sentencpiece token to Id"""
return self.sp.PieceToId(token)
def DecodeIds(self, Ids):
"""converts ids to a text string"""
if isinstance(Ids, Tokenization):
Ids = Ids.tokenization
return self.sp.DecodeIds(Ids)
def DecodeTokens(self, Tokens):
"""converts sentencepiece tokens to a text string"""
if isinstance(Tokens, Tokenization):
Tokens = Tokens.tokenization
return self.sp.DecodeTokens(Tokens)
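# Illustrative sketch, not part of the original module: exists() accepts either a
# model prefix or a full '.model' path, which is how the constructor decides
# between loading an existing sentencepiece model and training a new one.
# The prefix, corpus, and vocab size below are hypothetical; the training branch
# runs real sentencepiece training and needs the nltk 'punkt' model.
def _example_sentencepiece(model_prefix='./spm_demo'):
    if SentencePieceTokenizer.exists(model_prefix):
        return SentencePieceTokenizer(model_path=model_prefix)
    corpus = ['some plain text lines', 'used to build word frequencies']
    return SentencePieceTokenizer(corpus=corpus, vocab_size=1000,
                                  model_path=model_prefix, model_type='bpe')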
class BertWordPieceTokenizer(Tokenizer):
"""
Loads a pretrained WordPiece tokenizer from `cache_dir` for tokenization
in BERT training. Defaults to the bert-large-uncased tokenizer.
"""
def __init__(self, tokenizer_model_type=None, cache_dir=None, **kwargs):
# default to bert-large-uncased tokenizer
if tokenizer_model_type not in PRETRAINED_VOCAB_ARCHIVE_MAP:
tokenizer_model_type = 'bert-large-uncased'
if torch.distributed.get_rank() == 0:
print(
'loading BertWordPieceTokenizer (',
tokenizer_model_type,
') from cache_dir ',
cache_dir)
do_lower_case = not ('-cased' in tokenizer_model_type or 'chinese' in tokenizer_model_type)
self.text_tokenizer = BertTokenizer.from_pretrained(
tokenizer_model_type, do_lower_case=do_lower_case, cache_dir=cache_dir)
if torch.distributed.get_rank() == 0:
print('loaded', tokenizer_model_type)
# disable max len warnings by increasing max len
self.text_tokenizer.max_len = int(1e12)
# set command tokens from wordpiece tokenizer values
self.num_command_tokens = 5
self.num_tokens = len(self.text_tokenizer.vocab)
self.num_text_tokens = self.num_tokens - 5
self.num_type_tokens = 2
self._command_tokens = [
CommandToken('pad', '[PAD]', self.text_tokenizer.vocab['[PAD]']),
CommandToken('ENC', '[CLS]', self.text_tokenizer.vocab['[CLS]']),
CommandToken('MASK', '[MASK]', self.text_tokenizer.vocab['[MASK]']),
CommandToken('unk', '[UNK]', self.text_tokenizer.vocab['[UNK]']),
CommandToken('sep', '[SEP]', self.text_tokenizer.vocab['[SEP]']),
]
self.command_name_map = {tok.name: tok for tok in self._command_tokens}
self.command_token_map = {tok.token: tok for tok in self._command_tokens}
self.command_id_map = {tok.Id: tok for tok in self._command_tokens}
# set type tokens
self.type_tokens = [
TypeToken('str0', '<str0>', 0),
TypeToken('str1', '<str1>', 1),
]
self.type_name_map = {tok.name: tok for tok in self.type_tokens}
self.type_token_map = {tok.token: tok for tok in self.type_tokens}
self.type_id_map = {tok.Id: tok for tok in self.type_tokens}
# parse tokens and vocabs from tokenizer
self._tokens = list(self.text_tokenizer.vocab.keys())
self._vocab = {k: v for k, v in self.text_tokenizer.vocab.items()}
self._text_tokens = list(self._tokens)
self._text_token_vocab = {k: v for k, v in self.text_tokenizer.vocab.items()}
self._command_token_tokens = list(self.command_token_map.keys())
self._command_token_vocab = {t: Id for Id, t in self.command_id_map.items()}
self._token_types = list(self.type_token_map.keys())
self._token_type_vocab = {t: Id for Id, t in self.type_id_map.items()}
def EncodeAsIds(self, text, process_fn=None):
"""convert text to wordpiece Ids"""
processed_text = text
if process_fn is not None:
processed_text = process_fn(processed_text)
tokens = self.text_tokenizer.tokenize(processed_text)
Ids = self.text_tokenizer.convert_tokens_to_ids(tokens)
return Tokenization(Ids, processed_text, text)
def EncodeAsTokens(self, text, process_fn=None):
"""convert wordpiece token to Id"""
processed_text = text
if process_fn is not None:
processed_text = process_fn(processed_text)
tokens = self.text_tokenizer.tokenize(processed_text)
return Tokenization(tokens, processed_text, text, asIds=False)
def IdToToken(self, Id, type_token=False):
"""convert Id to sentencpiece token"""
if isinstance(Id, (TypeToken, CommandToken)):
return Id.token
if type_token:
return self.type_id_map[Id].token
return self.text_tokenizer.ids_to_tokens[Id]
def TokenToId(self, token, type_token=False):
"""convert sentencpiece token to Id"""
if isinstance(token, (TypeToken, CommandToken)):
return token.Id
if type_token:
return self.type_token_map[token].Id
return self.text_tokenizer.vocab[token]
def DecodeIds(self, Ids, type_token=False):
"""converts ids to wordpiece tokens and joins them as a text string"""
if type_token:
return ' '.join(Id.token if isinstance(Id, TypeToken)
else self.type_id_map[Id].token for Id in Ids)
if isinstance(Ids, Tokenization):
Ids = Ids.tokenization
Tokens = []
for Id in Ids:
Tokens.append(self.text_tokenizer.ids_to_tokens[Id] if Id != -1 else '-1')
return ' '.join(Tokens)
def DecodeTokens(self, Tokens, type_token=False):
"""converts wordpiece tokens to a text string"""
if type_token:
return ' '.join(t.token if isinstance(t, TypeToken) else t for t in Tokens)
if isinstance(Tokens, Tokenization):
Tokens = Tokens.tokenization
return ' '.join(Tokens)
class GPT2BPETokenizer(Tokenizer):
def __init__(self, cache_dir=None, **kwargs):
self.text_tokenizer = GPT2Tokenizer.from_pretrained('gpt2',
cache_dir=cache_dir)
# disable max len warnings by increasing max len
self.text_tokenizer.max_len = int(1e12)
self.num_command_tokens = 2
self.num_tokens = len(self.text_tokenizer.encoder)
self.num_text_tokens = self.num_tokens - 1
self.num_type_tokens = 2
self._command_tokens = [
CommandToken('pad', '<|endoftext|>', self.text_tokenizer.encoder['<|endoftext|>']),
CommandToken('eos', '<|endoftext|>', self.text_tokenizer.encoder['<|endoftext|>']),
]
self.command_name_map = {tok.name: tok for tok in self._command_tokens}
self.command_token_map = {tok.token: tok for tok in self._command_tokens}
self.command_id_map = {tok.Id: tok for tok in self._command_tokens}
self.type_tokens = [
TypeToken('str0', '<str0>', 0),
TypeToken('str1', '<str1>', 1),
]
self.type_name_map = {tok.name: tok for tok in self.type_tokens}
self.type_token_map = {tok.token: tok for tok in self.type_tokens}
self.type_id_map = {tok.Id: tok for tok in self.type_tokens}
self._tokens = list(self.text_tokenizer.encoder.keys())
self._vocab = {k: v for k, v in self.text_tokenizer.encoder.items()}
self._text_tokens = list(self._tokens)
self._text_token_vocab = {k: v for k, v in self.text_tokenizer.encoder.items()}
self._command_token_tokens = list(self.command_token_map.keys())
self._command_token_vocab = {t: Id for Id, t in self.command_id_map.items()}
self._token_types = list(self.type_token_map.keys())
self._token_type_vocab = {t: Id for Id, t in self.type_id_map.items()}
def EncodeAsIds(self, text, process_fn=None):
processed_text = text
if process_fn is not None:
processed_text = process_fn(processed_text)
Ids = self.text_tokenizer.encode(processed_text)
tokenization = Tokenization(Ids, processed_text, text)
tokenization.set_command_tokens(self._command_tokens)
return tokenization
def EncodeAsTokens(self, text, process_fn=None):
processed_text = text
if process_fn is not None:
processed_text = process_fn(processed_text)
tokens = []
for token in re.findall(self.text_tokenizer.pat, processed_text):
token = ''.join(self.text_tokenizer.byte_encoder[b] for b in token.encode('utf-8'))
tokens.extend(bpe_token for bpe_token in self.text_tokenizer.bpe(token).split(' '))
tokenization = Tokenization(tokens, processed_text, text, asIds=False)
tokenization.set_command_tokens(self._command_tokens)
return tokenization
def IdToToken(self, Id, type_token=False):
if isinstance(Id, (TypeToken, CommandToken)):
return Id.token
if type_token:
return self.type_id_map[Id].token
return self.text_tokenizer.decoder[Id]
def TokenToId(self, token, type_token=False):
if isinstance(token, (TypeToken, CommandToken)):
return token.Id
if type_token:
return self.type_token_map[token].Id
return self.text_tokenizer.encoder[token]
def DecodeIds(self, Ids, type_token=False):
if type_token:
return ' '.join(Id.token if isinstance(Id, TypeToken)
else self.type_id_map[Id].token for Id in Ids)
if isinstance(Ids, Tokenization):
Ids = Ids.tokenization
return self.text_tokenizer.decode(Ids)
def DecodeTokens(self, Tokens, type_token=False):
if type_token:
return ' '.join(t.token if isinstance(t, TypeToken) else t for t in Tokens)
if isinstance(Tokens, Tokenization):
Tokens = Tokens.tokenization
return self.text_tokenizer.decode([self.TokenToId(tok) for tok in Tokens])
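# Illustrative sketch, not part of the original module: the GPT-2 wrapper
# downloads (or reads from cache_dir) the pretrained vocab and merges files, and
# maps both the 'pad' and 'eos' command tokens to the single <|endoftext|> id.
# The demo function name is hypothetical.
def _example_gpt2_bpe(cache_dir=None):
    tok = GPT2BPETokenizer(cache_dir=cache_dir)
    ids = tok.EncodeAsIds('Hello world').tokenization
    assert tok.DecodeIds(ids) == 'Hello world'
    assert tok.get_command('eos').Id == tok.get_command('pad').Id
    return ids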
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import sys
import json
import logging
import os
import regex as re
from io import open
try:
from functools import lru_cache
except ImportError:
# Just a dummy decorator to get the checks to run on python2
# because honestly I don't want to support a byte-level unicode BPE
# tokenizer on python 2 right now.
def lru_cache():
return lambda func: func
from .file_utils import cached_path
logger = logging.getLogger(__name__)
PRETRAINED_VOCAB_ARCHIVE_MAP = {
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
}
PRETRAINED_MERGES_ARCHIVE_MAP = {
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
}
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
'gpt2': 1024,
}
VOCAB_NAME = 'vocab.json'
MERGES_NAME = 'merges.txt'
SPECIAL_TOKENS_NAME = 'special_tokens.txt'
@lru_cache()
def bytes_to_unicode():
"""
Returns a mapping from utf-8 bytes to unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings,
and we avoid mapping to whitespace/control characters the bpe code barfs on.
"""
_chr = unichr if sys.version_info[0] == 2 else chr
bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + \
list(range(ord("®"), ord("ÿ") + 1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [_chr(n) for n in cs]
return dict(zip(bs, cs))
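# Illustrative sketch, not part of the original module: printable bytes map to
# themselves while bytes the BPE code would choke on (space, control bytes, ...)
# are shifted to unused code points starting at 256, e.g. space becomes 'Ġ'.
def _example_bytes_to_unicode():
    b2u = bytes_to_unicode()
    assert len(b2u) == 256 and len(set(b2u.values())) == 256   # bijective
    assert b2u[ord('A')] == 'A'        # printable ASCII is unchanged
    assert b2u[ord(' ')] == '\u0120'   # space is remapped ('Ġ')
    return b2u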
def get_pairs(word):
"""Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
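# Illustrative sketch, not part of the original module: get_pairs lists the
# adjacent symbol pairs that bpe() below scores against bpe_ranks on every merge
# step; after a merge the word shrinks and the pairs are recomputed.
def _example_get_pairs():
    assert get_pairs(('l', 'o', 'w')) == {('l', 'o'), ('o', 'w')}
    assert get_pairs(('lo', 'w')) == {('lo', 'w')}
    return True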
class GPT2Tokenizer(object):
"""
GPT-2 BPE tokenizer. Peculiarities:
- Byte-level BPE
"""
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
"""
Instantiate a PreTrainedBertModel from a pre-trained model file.
Download and cache the pre-trained model file if needed.
"""
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
special_tokens_file = None
else:
vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)
if not os.path.exists(special_tokens_file):
special_tokens_file = None
else:
logger.info("loading special tokens file {}".format(special_tokens_file))
# redirect to the cache, if necessary
try:
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find files {} and {} "
"at this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
pretrained_model_name_or_path,
vocab_file, merges_file))
return None
if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
logger.info("loading vocabulary file {}".format(vocab_file))
logger.info("loading merges file {}".format(merges_file))
else:
logger.info("loading vocabulary file {} from cache at {}".format(
vocab_file, resolved_vocab_file))
logger.info("loading merges file {} from cache at {}".format(
merges_file, resolved_merges_file))
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
# if we're using a pretrained model, ensure the tokenizer won't index sequences longer
# than the number of positional embeddings
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
# Instantiate tokenizer.
if special_tokens_file and 'special_tokens' not in kwargs:
special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
else:
special_tokens = kwargs.pop('special_tokens', [])
tokenizer = cls(
resolved_vocab_file,
resolved_merges_file,
special_tokens=special_tokens,
*inputs,
**kwargs)
return tokenizer
def __init__(self, vocab_file, merges_file, errors='replace',
special_tokens=None, max_len=None):
self.max_len = max_len if max_len is not None else int(1e12)
self.encoder = json.load(open(vocab_file))
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_data]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
# Should have added re.IGNORECASE so BPE merges can happen for
# capitalized versions of contractions
self.pat = re.compile(
r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
self.special_tokens = {}
self.special_tokens_decoder = {}
self.set_special_tokens(special_tokens)
def __len__(self):
return len(self.encoder) + len(self.special_tokens)
def set_special_tokens(self, special_tokens):
""" Add a list of additional tokens to the encoder.
The additional tokens are indexed starting from the last index of the
current vocabulary in the order of the `special_tokens` list.
"""
if not special_tokens:
self.special_tokens = {}
self.special_tokens_decoder = {}
return
self.special_tokens = dict((tok, len(self.encoder) + i)
for i, tok in enumerate(special_tokens))
self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()}
logger.info("Special tokens {}".format(self.special_tokens))
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except ValueError:
new_word.extend(word[i:])
break
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = ' '.join(word)
self.cache[token] = word
return word
def tokenize(self, text):
""" Tokenize a string. """
bpe_tokens = []
for token in re.findall(self.pat, text):
if sys.version_info[0] == 2:
token = ''.join(self.byte_encoder[ord(b)] for b in token)
else:
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
return bpe_tokens
def convert_tokens_to_ids(self, tokens):
""" Converts a sequence of tokens into ids using the vocab. """
ids = []
if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
if tokens in self.special_tokens:
return self.special_tokens[tokens]
else:
return self.encoder.get(tokens, 0)
for token in tokens:
if token in self.special_tokens:
ids.append(self.special_tokens[token])
else:
ids.append(self.encoder.get(token, 0))
if len(ids) > self.max_len:
logger.warning(
"Token indices sequence length is longer than the specified maximum "
" sequence length for this OpenAI GPT model ({} > {}). Running this"
" sequence through the model will result in indexing errors".format(
len(ids), self.max_len)
)
return ids
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
"""Converts a sequence of ids in BPE tokens using the vocab."""
tokens = []
for i in ids:
if i in self.special_tokens_decoder:
if not skip_special_tokens:
tokens.append(self.special_tokens_decoder[i])
else:
tokens.append(self.decoder[i])
return tokens
def encode(self, text):
return self.convert_tokens_to_ids(self.tokenize(text))
def decode(self, tokens):
text = ''.join([self.decoder[token] for token in tokens])
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
return text
def save_vocabulary(self, vocab_path):
"""Save the tokenizer vocabulary and merge files to a directory."""
if not os.path.isdir(vocab_path):
logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
return
vocab_file = os.path.join(vocab_path, VOCAB_NAME)
merge_file = os.path.join(vocab_path, MERGES_NAME)
special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
with open(vocab_file, 'w', encoding='utf-8') as f:
f.write(json.dumps(self.encoder, ensure_ascii=False))
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write(u'#version: 0.2\n')
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!".format(merge_file))
index = token_index
writer.write(' '.join(bpe_tokens) + u'\n')
index += 1
index = len(self.encoder)
with open(special_tokens_file, 'w', encoding='utf-8') as writer:
for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
" Please check that the tokenizer is not corrupted!".format(special_tokens_file))
index = token_index
writer.write(token + u'\n')
index += 1
return vocab_file, merge_file, special_tokens_file
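# Illustrative sketch, not part of the original module: encode() is tokenize()
# plus a vocab lookup, and decode() reverses both the BPE split and the
# byte-to-unicode mapping, so ordinary text round-trips exactly. Loading 'gpt2'
# downloads (or reads from cache_dir) the vocab and merges files.
def _example_gpt2_roundtrip(cache_dir=None):
    tok = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=cache_dir)
    text = 'Hello world!'
    ids = tok.encode(text)
    assert tok.decode(ids) == text
    return ids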
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes. Provided as is from https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/tokenization.py"""
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import logging
import os
import unicodedata
from io import open
from .file_utils import cached_path
logger = logging.getLogger(__name__)
PRETRAINED_VOCAB_ARCHIVE_MAP = {
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
}
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
'bert-base-uncased': 512,
'bert-large-uncased': 512,
'bert-base-cased': 512,
'bert-large-cased': 512,
'bert-base-multilingual-uncased': 512,
'bert-base-multilingual-cased': 512,
'bert-base-chinese': 512,
}
VOCAB_NAME = 'vocab.txt'
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
index = 0
with open(vocab_file, "r", encoding="utf-8") as reader:
while True:
token = reader.readline()
if not token:
break
token = token.strip()
vocab[token] = index
index += 1
return vocab
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
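# Illustrative sketch, not part of the original module: vocab.txt holds one
# wordpiece per line and load_vocab assigns ids by line order, which is what
# convert_tokens_to_ids/convert_ids_to_tokens below rely on. The file path is a
# hypothetical example.
def _example_load_vocab(path='./vocab_demo.txt'):
    with open(path, 'w', encoding='utf-8') as f:
        f.write('[PAD]\n[UNK]\nhello\n##ing\n')
    vocab = load_vocab(path)
    assert vocab['[PAD]'] == 0 and vocab['##ing'] == 3
    assert whitespace_tokenize('  hello world \n') == ['hello', 'world']
    return vocab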
class BertTokenizer(object):
"""Runs end-to-end tokenization: punctuation splitting + wordpiece"""
def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True,
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
"""Constructs a BertTokenizer.
Args:
vocab_file: Path to a one-wordpiece-per-line vocabulary file
do_lower_case: Whether to lower case the input
Only has an effect when do_wordpiece_only=False
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
max_len: An artificial maximum length to truncate tokenized sequences to;
Effective maximum length is always the minimum of this
value (if specified) and the underlying BERT model's
sequence length.
never_split: List of tokens which will never be split during tokenization.
Only has an effect when do_wordpiece_only=False
"""
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
"model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict(
[(ids, tok) for tok, ids in self.vocab.items()])
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
never_split=never_split)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
self.max_len = max_len if max_len is not None else int(1e12)
def tokenize(self, text):
if self.do_basic_tokenize:
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
else:
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens
def convert_tokens_to_ids(self, tokens):
"""Converts a sequence of tokens into ids using the vocab."""
ids = []
for token in tokens:
ids.append(self.vocab[token])
if len(ids) > self.max_len:
logger.warning(
"Token indices sequence length is longer than the specified maximum "
" sequence length for this BERT model ({} > {}). Running this"
" sequence through BERT will result in indexing errors".format(
len(ids), self.max_len)
)
return ids
def convert_ids_to_tokens(self, ids):
"""Converts a sequence of ids in wordpiece tokens using the vocab."""
tokens = []
for i in ids:
tokens.append(self.ids_to_tokens[i])
return tokens
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
"""
Instantiate a PreTrainedBertModel from a pre-trained model file.
Download and cache the pre-trained model file if needed.
"""
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
else:
vocab_file = pretrained_model_name_or_path
if os.path.isdir(vocab_file):
vocab_file = os.path.join(vocab_file, VOCAB_NAME)
# redirect to the cache, if necessary
try:
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
vocab_file))
return None
if resolved_vocab_file == vocab_file:
logger.info("loading vocabulary file {}".format(vocab_file))
else:
logger.info("loading vocabulary file {} from cache at {}".format(
vocab_file, resolved_vocab_file))
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
# if we're using a pretrained model, ensure the tokenizer won't index sequences longer
# than the number of positional embeddings
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
# Instantiate tokenizer.
tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
return tokenizer
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self,
do_lower_case=True,
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
self.never_split = never_split
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case and token not in self.never_split:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
if text in self.never_split:
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
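# Illustrative sketch, not part of the original module: greedy
# longest-match-first wordpiece lookup with a tiny made-up vocabulary,
# mirroring the "unaffable" example in the docstring above.
def _example_wordpiece_tokenizer():
    vocab = {'un': 0, '##aff': 1, '##able': 2, '[UNK]': 3}
    wp = WordpieceTokenizer(vocab=vocab)
    assert wp.tokenize('unaffable') == ['un', '##aff', '##able']
    assert wp.tokenize('xyzq') == ['[UNK]']   # no greedy cover -> unk_token
    return wp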
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
...@@ -19,77 +19,101 @@ import math ...@@ -19,77 +19,101 @@ import math
from megatron import print_rank_0 from megatron import print_rank_0
class AnnealingLR(object):
    """Anneals the learning rate."""

-    def __init__(self, optimizer, start_lr,
-                 warmup_iter, total_iters,
-                 decay_style, last_iter, min_lr=0.0,
+    def __init__(self, optimizer, max_lr, min_lr,
+                 warmup_steps, decay_steps,
+                 decay_style, num_steps,
                  use_checkpoint_lr_scheduler=True,
                  override_lr_scheduler=False):
        # Class values.
        self.optimizer = optimizer
-        self.start_lr = start_lr
+        self.max_lr = float(max_lr)
        self.min_lr = min_lr
-        self.warmup_iter = warmup_iter
-        self.num_iters = last_iter
-        self.end_iter = total_iters
-        assert self.end_iter > 0
+        assert self.min_lr >= 0.0
+        assert self.max_lr >= self.min_lr
+        self.warmup_steps = warmup_steps
+        self.num_steps = num_steps
+        self.decay_steps = decay_steps
+        assert self.decay_steps > 0
+        assert self.warmup_steps < self.decay_steps
        self.decay_style = decay_style
        self.override_lr_scheduler = override_lr_scheduler
        self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler
        if self.override_lr_scheduler:
            assert not self.use_checkpoint_lr_scheduler, 'both override and '\
                'use-checkpoint are set.'
        # Set the learning rate
-        self.step(self.num_iters)
+        self.step(step_num=self.num_steps)
        print_rank_0('> learning rate decay style: {}'.format(self.decay_style))

    def get_lr(self):
        """Learning rate decay functions from:
              https://openreview.net/pdf?id=BJYwwY9ll pg. 4"""
-        num_iters_ = min(self.num_iters, self.end_iter - self.warmup_iter)
-        # Warmup.
-        if self.warmup_iter > 0 and self.num_iters <= self.warmup_iter:
-            return float(self.start_lr) * num_iters_ / self.warmup_iter
-        num_iters_ = num_iters_ - self.warmup_iter
+        # Use linear warmup for the initial part.
+        if self.warmup_steps > 0 and self.num_steps <= self.warmup_steps:
+            return self.max_lr * float(self.num_steps) / \
+                float(self.warmup_steps)
+        # If the learning rate is constant, just return the initial value.
+        if self.decay_style == 'constant':
+            return self.max_lr
+        # For any steps larger than `self.decay_steps`, use `self.min_lr`.
+        if self.num_steps > self.decay_steps:
+            return self.min_lr
+        # If we are done with the warmup period, use the decay style.
+        num_steps_ = self.num_steps - self.warmup_steps
+        decay_steps_ = self.decay_steps - self.warmup_steps
+        decay_ratio = float(num_steps_) / float(decay_steps_)
+        assert decay_ratio >= 0.0
+        assert decay_ratio <= 1.0
+        delta_lr = self.max_lr - self.min_lr
        if self.decay_style == 'linear':
-            lr = self.start_lr * (self.end_iter - num_iters_) / self.end_iter
+            coeff = (1.0 - decay_ratio)
        elif self.decay_style == 'cosine':
-            lr = self.start_lr / 2.0 * (math.cos(
-                math.pi * num_iters_ / self.end_iter) + 1)
-        elif self.decay_style == 'exponential':
-            # exp(-0.693) = 1/2
-            lr = self.start_lr * math.exp(-0.693 * num_iters_ / self.end_iter)
+            coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0)
        else:
-            lr = self.start_lr
-        return max(lr, self.min_lr)
+            raise Exception('{} decay style is not supported.'.format(
+                self.decay_style))
+        return self.min_lr + coeff * delta_lr

-    def step(self, step_num=None):
+    def step(self, increment=1, step_num=None):
        """Set lr for all parameters groups."""
        if step_num is None:
-            step_num = self.num_iters + 1
-        self.num_iters = step_num
+            step_num = self.num_steps + increment
+        self.num_steps = step_num
        new_lr = self.get_lr()
        for group in self.optimizer.param_groups:
            group['lr'] = new_lr

    def state_dict(self):
        state_dict = {
-            'start_lr': self.start_lr,
-            'warmup_iter': self.warmup_iter,
-            'num_iters': self.num_iters,
+            'max_lr': self.max_lr,
+            'warmup_steps': self.warmup_steps,
+            'num_steps': self.num_steps,
            'decay_style': self.decay_style,
-            'end_iter': self.end_iter,
+            'decay_steps': self.decay_steps,
            'min_lr': self.min_lr
        }
        return state_dict

    def _check_and_set(self, cls_value, sd_value, name):
        """Auxiliary function for checking the values in the checkpoint and
        setting them."""
@@ -104,20 +128,39 @@ class AnnealingLR(object):
                            name))
        return sd_value

    def load_state_dict(self, sd):
-        self.start_lr = self._check_and_set(self.start_lr, sd['start_lr'],
-                                            'learning rate')
+        if 'start_lr' in sd:
+            max_lr_ = sd['start_lr']
+        else:
+            max_lr_ = sd['max_lr']
+        self.max_lr = self._check_and_set(self.max_lr, max_lr_,
+                                          'learning rate')
        self.min_lr = self._check_and_set(self.min_lr, sd['min_lr'],
                                          'minimum learning rate')
-        self.warmup_iter = self._check_and_set(self.warmup_iter,
-                                               sd['warmup_iter'],
-                                               'warmup iterations')
-        self.end_iter = self._check_and_set(self.end_iter, sd['end_iter'],
-                                            'total number of iterations')
+        if 'warmup_iter' in sd:
+            warmup_steps_ = sd['warmup_iter']
+        else:
+            warmup_steps_ = sd['warmup_steps']
+        self.warmup_steps = self._check_and_set(self.warmup_steps,
+                                                warmup_steps_,
+                                                'warmup iterations')
+        if 'end_iter' in sd:
+            decay_steps_ = sd['end_iter']
+        else:
+            decay_steps_ = sd['decay_steps']
+        self.decay_steps = self._check_and_set(self.decay_steps, decay_steps_,
+                                               'total number of iterations')
        self.decay_style = self._check_and_set(self.decay_style,
                                               sd['decay_style'],
                                               'decay style')
-        self.num_iters = sd['num_iters']
-        self.step(self.num_iters)
+        if 'num_iters' in sd:
+            self.num_steps = sd['num_iters']
+        else:
+            self.num_steps = sd['num_steps']
+        self.step(step_num=self.num_steps)
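
The rewritten scheduler folds warmup and decay into one closed form: linear warmup to max_lr over warmup_steps, then a linear or cosine interpolation from max_lr down to min_lr over the remaining decay_steps. As a standalone sketch of that formula (the hyperparameter values below are illustrative and not taken from this merge request):

import math

def pretraining_lr(step, max_lr=1.5e-4, min_lr=1e-5,
                   warmup_steps=2000, decay_steps=100000,
                   decay_style='cosine'):
    """Standalone mirror of the schedule logic above, for one step."""
    # Linear warmup: ramp from 0 to max_lr over warmup_steps.
    if warmup_steps > 0 and step <= warmup_steps:
        return max_lr * step / warmup_steps
    if decay_style == 'constant':
        return max_lr
    # Past the decay horizon the rate is clamped to min_lr.
    if step > decay_steps:
        return min_lr
    decay_ratio = (step - warmup_steps) / (decay_steps - warmup_steps)
    if decay_style == 'linear':
        coeff = 1.0 - decay_ratio
    elif decay_style == 'cosine':
        coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0)
    else:
        raise ValueError('unsupported decay style: {}'.format(decay_style))
    return min_lr + coeff * (max_lr - min_lr)

# A few sample points: end of warmup, cosine midpoint, and past the horizon.
print(pretraining_lr(2000))    # 1.5e-04
print(pretraining_lr(51000))   # 8.0e-05  (min_lr + 0.5 * (max_lr - min_lr))
print(pretraining_lr(200000))  # 1.0e-05

Compared with the old code, 'constant' is now an explicit style, the 'exponential' style is gone, and an unknown style raises instead of silently returning the base rate.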
@@ -37,7 +37,7 @@ from megatron.model import DistributedDataParallel as LocalDDP
from megatron.model import get_params_for_weight_decay_optimization
from megatron.model.realm_model import ICTBertModel
from megatron.utils import check_adlr_autoresume_termination
-from megatron.utils import make_data_loader
+from megatron.data.data_loaders import build_pretraining_data_loader
from megatron.utils import report_memory

@@ -194,12 +194,12 @@ def get_learning_rate_scheduler(optimizer):
    warmup_iter = args.warmup * num_iters
    lr_scheduler = AnnealingLR(
        optimizer,
-        start_lr=args.lr,
-        warmup_iter=warmup_iter,
-        total_iters=num_iters,
-        decay_style=args.lr_decay_style,
-        last_iter=init_step,
+        max_lr=args.lr,
        min_lr=args.min_lr,
+        warmup_steps=warmup_iter,
+        decay_steps=num_iters,
+        decay_style=args.lr_decay_style,
+        num_steps=init_step,
        use_checkpoint_lr_scheduler=args.use_checkpoint_lr_scheduler,
        override_lr_scheduler=args.override_lr_scheduler)

@@ -224,7 +224,8 @@ def setup_model_and_optimizer(model_provider_func):
    while hasattr(unwrapped_model, 'module'):
        unwrapped_model = unwrapped_model.module
-    if args.iteration == 0 and hasattr(unwrapped_model, 'init_state_dict_from_bert'):
+    if args.iteration == 0 and hasattr(unwrapped_model,
+                                       'init_state_dict_from_bert'):
        print("Initializing ICT from pretrained BERT model", flush=True)
        unwrapped_model.init_state_dict_from_bert()

@@ -414,6 +415,8 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
                                       optimizer,
                                       lr_scheduler)
        iteration += 1
+        args.consumed_train_samples += mpu.get_data_parallel_world_size() * \
+                                       args.batch_size
        # Logging.
        loss_scale = None

@@ -472,6 +475,8 @@ def evaluate(forward_step_func, data_iterator, model, verbose=False):
                                args.eval_iters))
            # Forward evaluation.
            _, loss_dict = forward_step_func(data_iterator, model)
+            args.consumed_valid_samples += mpu.get_data_parallel_world_size() \
+                                           * args.batch_size
            # Reduce across processes.
            for key in loss_dict:
                total_loss_dict[key] = total_loss_dict.get(key, 0.) + \

@@ -517,11 +522,19 @@ def build_train_valid_test_data_iterators(
    (train_dataloader, valid_dataloader, test_dataloader) = (None, None, None)
    print_rank_0('> building train, validation, and test datasets ...')
+    # Rank and global batch size.
+    data_parallel_size = mpu.get_data_parallel_world_size()
+    global_batch_size = args.batch_size * data_parallel_size
+    # Backward compatibility, assume fixed batch size.
+    if args.iteration > 0 and args.consumed_train_samples == 0:
+        args.consumed_train_samples = args.iteration * global_batch_size
+    if args.iteration > 0 and args.consumed_valid_samples == 0:
+        args.consumed_valid_samples = (args.iteration // args.eval_interval) * \
+            args.eval_iters * global_batch_size
    # Data loader only on rank 0 of each model parallel group.
    if mpu.get_model_parallel_rank() == 0:
-        # Rank, size, and global batch size.
-        data_parallel_size = mpu.get_data_parallel_world_size()
-        global_batch_size = args.batch_size * data_parallel_size
        # Number of train/valid/test samples.
        train_iters = args.train_iters

@@ -540,9 +553,11 @@ def build_train_valid_test_data_iterators(
                                train_val_test_num_samples)
        # Build dataloders.
-        train_dataloader = make_data_loader(train_ds)
-        valid_dataloader = make_data_loader(valid_ds)
-        test_dataloader = make_data_loader(test_ds)
+        train_dataloader = build_pretraining_data_loader(
+            train_ds, args.consumed_train_samples)
+        valid_dataloader = build_pretraining_data_loader(
+            valid_ds, args.consumed_valid_samples)
+        test_dataloader = build_pretraining_data_loader(test_ds, 0)
        # Flags to know if we need to do training/validation/testing.
        do_train = train_dataloader is not None and args.train_iters > 0

@@ -561,21 +576,7 @@ def build_train_valid_test_data_iterators(
    args.do_train = flags[0].item()
    args.do_valid = flags[1].item()
    args.do_test = flags[2].item()
-    # Shift the start iterations.
-    if train_dataloader is not None:
-        train_dataloader.batch_sampler.start_iter = args.iteration % \
-            len(train_dataloader)
-        print_rank_0('setting training data start iteration to {}'.
-                     format(train_dataloader.batch_sampler.start_iter))
-    if valid_dataloader is not None:
-        start_iter_val = (args.iteration // args.eval_interval) * \
-            args.eval_iters
-        valid_dataloader.batch_sampler.start_iter = start_iter_val % \
-            len(valid_dataloader)
-        print_rank_0('setting validation data start iteration to {}'.
-                     format(valid_dataloader.batch_sampler.start_iter))
    # Build iterators.
    if train_dataloader is not None:
        train_data_iterator = iter(train_dataloader)
...
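
The training changes above switch restart bookkeeping from iteration offsets to consumed-sample counters: each training step adds one global batch to args.consumed_train_samples, and each evaluation step does the same for args.consumed_valid_samples. For checkpoints written before these counters existed, they are reconstructed from the iteration count under a fixed-batch-size assumption. A small worked example of the arithmetic, with made-up numbers (data-parallel size 8, per-rank batch size 4, resuming at iteration 10000, eval every 1000 iterations for 100 eval iterations):

# Illustrative values only; none of them come from the merge request.
data_parallel_size = 8
batch_size = 4                                        # per data-parallel rank
global_batch_size = batch_size * data_parallel_size   # 32 samples per step

iteration = 10000
eval_interval = 1000
eval_iters = 100

# Backward compatibility: an old checkpoint only stores the iteration count,
# so the consumed-sample counters are rebuilt assuming a fixed batch size.
consumed_train_samples = iteration * global_batch_size
consumed_valid_samples = (iteration // eval_interval) * eval_iters * \
    global_batch_size

print(consumed_train_samples)   # 320000
print(consumed_valid_samples)   # 32000

These counters are then handed to build_pretraining_data_loader, so the samplers resume exactly where the previous run stopped instead of recomputing a start_iter offset per dataloader.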
@@ -24,7 +24,6 @@ from megatron import print_rank_0
from megatron import get_adlr_autoresume
from megatron import mpu
from megatron.checkpointing import save_checkpoint
-from megatron.data.samplers import DistributedBatchSampler
from megatron.fp16 import FP16_Optimizer

@@ -89,32 +88,6 @@ def check_adlr_autoresume_termination(iteration, model,
        sys.exit(0)

-def make_data_loader(dataset):
-    """Buld dataloader given an input dataset."""
-    if dataset is None:
-        return None
-    args = get_args()
-    # Data parallel arguments.
-    world_size = mpu.get_data_parallel_world_size()
-    rank = mpu.get_data_parallel_rank()
-    global_batch_size = args.batch_size * world_size
-    num_workers = args.num_workers
-    # Use a simple sampler with distributed batch sampler.
-    sampler = torch.utils.data.SequentialSampler(dataset)
-    batch_sampler = DistributedBatchSampler(sampler=sampler,
-                                            batch_size=global_batch_size,
-                                            drop_last=True,
-                                            rank=rank,
-                                            world_size=world_size)
-    # Torch dataloader.
-    return torch.utils.data.DataLoader(dataset,
-                                       batch_sampler=batch_sampler,
-                                       num_workers=num_workers,
-                                       pin_memory=True)

def get_ltor_masks_and_position_ids(data,
                                    eod_token,
                                    reset_position_ids,
...
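
The deleted make_data_loader above wrapped a SequentialSampler in a DistributedBatchSampler and relied on the now-removed start_iter shifting to resume. Its replacement, build_pretraining_data_loader in megatron.data.data_loaders, is not part of this excerpt, so the sketch below is only a rough guess at the general idea (a sampler driven by an absolute consumed-sample count); every name and detail is hypothetical rather than the actual Megatron implementation:

import torch

class ResumableSequentialSampler(torch.utils.data.Sampler):
    """Hypothetical sketch: yield per-rank index batches starting from an
    absolute offset of already-consumed samples."""

    def __init__(self, total_samples, consumed_samples, batch_size,
                 rank, world_size):
        self.total_samples = total_samples
        self.consumed_samples = consumed_samples
        self.batch_size = batch_size          # per data-parallel rank
        self.rank = rank
        self.world_size = world_size

    def __iter__(self):
        batch = []
        # Resume from wherever previous runs left off (no epoch wrap here).
        for idx in range(self.consumed_samples, self.total_samples):
            batch.append(idx)
            if len(batch) == self.batch_size * self.world_size:
                # Each rank keeps a contiguous slice of the global batch.
                start = self.rank * self.batch_size
                yield batch[start:start + self.batch_size]
                batch = []

def build_loader(dataset, consumed_samples, batch_size, rank, world_size,
                 num_workers=2):
    """Hypothetical stand-in for build_pretraining_data_loader."""
    if dataset is None:
        return None
    sampler = ResumableSequentialSampler(len(dataset), consumed_samples,
                                         batch_size, rank, world_size)
    return torch.utils.data.DataLoader(dataset,
                                       batch_sampler=sampler,
                                       num_workers=num_workers,
                                       pin_memory=True)

Keying the resume point to consumed samples rather than iterations keeps it valid even if the batch size changes between runs.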