Commit 75bd9b54 authored by Jared Casper

Merge branch 'megatron_sampler' into 'main'

Simplified sampler (will be needed later for batch size increase) and removed deprecated data stuff

See merge request ADLR/megatron-lm!177
parents ea81d62f fac6718a
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for using and training tokenizers (char, wordpiece, sentencepiece)"""
from collections import namedtuple
import random
import os
import csv
import torch
import nltk
from nltk import tokenize as nltk_tokenize
import sentencepiece as spm
from .wordpiece import BertTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
from .tokenization_gpt2 import GPT2Tokenizer
import regex as re
def make_tokenizer(tokenizer_type, corpus, model_path=None, vocab_size=None, model_type='bpe',
pad_token=0, character_coverage=1.0, command_tokens=None, type_tokens=None, **kwargs):
"""
Helper function to instantiate a tokenizer given common combinations of options.
"""
tokenizer_class = tokenizer_type
if isinstance(tokenizer_class, str):
tokenizer_class = eval(tokenizer_class)
if tokenizer_class is BertWordPieceTokenizer:
return BertWordPieceTokenizer(model_type, **kwargs)
elif tokenizer_class is GPT2BPETokenizer:
return GPT2BPETokenizer(**kwargs)
text_tokenizer = tokenizer_class(corpus=corpus, vocab_size=vocab_size, model_path=model_path, model_type=model_type,
pad_token=pad_token, character_coverage=character_coverage)
return Tokenizer(text_tokenizer, command_tokens, type_tokens)
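# Illustrative sketch, not part of the original module: typical ways to call
# make_tokenizer for the tokenizer types handled above. The demo function name
# is hypothetical, and the GPT-2 path downloads its pretrained vocab/merges
# files on first use.
def _example_make_tokenizer():
    # Pretrained GPT-2 BPE: corpus/model_path/vocab_size are ignored.
    gpt2_tok = make_tokenizer('GPT2BPETokenizer', None)
    # Character-level tokenizer wrapped with the default command/type tokens.
    char_tok = make_tokenizer('CharacterLevelTokenizer', None)
    return gpt2_tok, char_tok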
class Tokenization(object):
"""
Tokenization object to hold the tokenization, (processed) text, and original
text. Can hold the tokenization as Ids or tokens.
It also holds command tokens (pad, unk, etc.) for the tokenization.
This allows functions to pad/operate on tokenizations without having
access to the full tokenizer, just the tokenization.
Several standard array operations are implemented (insert, append, extend).
"""
def __init__(self, tokenization, text=None, original_text=None,
command_tokens=None, asIds=True):
self.tokenization = tokenization
self.text = text
if self.text is None:
self.text = self.tokenization
self.original_text = original_text
if self.original_text is None:
self.original_text = self.text
self.command_tokens = command_tokens
self.asIds = asIds
self.parse_command_tokens()
def set_command_tokens(self, command_tokens):
self.command_tokens = command_tokens
return self.parse_command_tokens()
def parse_command_tokens(self):
if self.command_tokens is None:
return
for command_token in self.command_tokens:
if self.asIds:
setattr(self, command_token.name, command_token.Id)
else:
setattr(self, command_token.name, command_token.token)
def __getitem__(self, index):
return self.tokenization[index]
def __len__(self):
return len(self.tokenization)
def insert(self, idx, other):
if isinstance(other, (CommandToken, TypeToken)):
self.tokenization.insert(idx, other.Id)
if idx == 0:
self.text = other.token + self.text
self.original_text = other.token + self.original_text
elif idx == len(self.tokenization) - 1:
self.text += other.token
self.original_text += other.token
elif isinstance(other, Tokenization):
self.tokenization = self.tokenization[:idx] + \
other.tokenization + self.tokenization[idx:]
else:
self.tokenization = self.tokenization[:idx] + \
other + self.tokenization[idx:]
def append(self, other):
if isinstance(other, (CommandToken, TypeToken)):
self.tokenization.append(other.Id)
self.text += other.token
self.original_text += other.token
elif isinstance(other, Tokenization):
self.tokenization.extend(other.tokenization)
self.text += other.text
self.original_text += other.original_text
else:
self.tokenization.append(other)
return self
def extend(self, other):
if isinstance(other, (CommandToken, TypeToken)):
self.tokenization.append(other.Id)
self.text += other.token
self.original_text += other.token
elif isinstance(other, list) and isinstance(other[0], (CommandToken, TypeToken)):
self.tokenization.extend([o.Id for o in other])
self.text += ''.join(o.token for o in other)
self.original_text += ''.join(o.token for o in other)
elif isinstance(other, Tokenization):
self.tokenization.extend(other.tokenization)
self.text += other.text
self.original_text += other.original_text
else:
self.tokenization.extend(other)
return self
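# Illustrative sketch, not part of the original module: Tokenization acts like a
# list of ids but also carries the (processed) text and any command tokens, so
# helpers can pad or add special tokens without the full tokenizer. The demo
# function name and the '<pad>' id below are made up for illustration.
def _example_tokenization():
    pad = CommandToken('pad', '<pad>', 0)
    tok = Tokenization([5, 6, 7], text='abc', command_tokens=[pad])
    tok.append(pad)      # ids -> [5, 6, 7, 0], text -> 'abc<pad>'
    tok.insert(0, pad)   # ids -> [0, 5, 6, 7, 0], text -> '<pad>abc<pad>'
    return len(tok), tok.pad   # (5, 0); command ids are exposed as attributes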
"""define some default command tokens for the tokenizer to use"""
token_format = "<{0}>"
COMMAND_TUPLE = namedtuple('CommandToken', ('name', 'token', 'Id'))
def prep_command_tokens(tokenlist, token_format=token_format):
return [CommandToken(tok[0], token_format.format(tok[0]), tok[1]) for tok in tokenlist]
class CommandToken(object):
def __init__(self, name, token, Id):
self.name = name
self.token = token
self.Id = Id
def __str__(self):
return str(COMMAND_TUPLE(self.name, self.token, self.Id))
DEFAULT_COMMAND_TOKENS = [
('pad', 0),
('eos', 1),
('bos', 2),
('unk', 3),
('sep', 4),
('L2R', 5),
('ENC', 6),
('MASK', 7),
]
DEFAULT_COMMAND_TOKENS = prep_command_tokens(DEFAULT_COMMAND_TOKENS)
"""define some default type tokens for bert training"""
TYPE_TUPLE = namedtuple('TypeToken', ('name', 'token', 'Id'))
def prep_type_tokens(tokenlist, token_format=token_format):
return [TypeToken(tok[0], token_format.format(tok[0]), tok[1]) for tok in tokenlist]
class TypeToken(object):
def __init__(self, name, token, Id):
self.name = name
self.token = token
self.Id = Id
def __str__(self):
return str(TYPE_TUPLE(self.name, self.token, self.Id))
DEFAULT_TYPE_TOKENS = [
('function', 0),
('command', 1),
('str0', 2),
('str1', 3),
('str2', 4),
('embedding0', 5),
('embedding1', 6),
('embedding2', 7),
('arg0', 8),
('arg1', 9),
('arg2', 10),
]
DEFAULT_TYPE_TOKENS = prep_type_tokens(DEFAULT_TYPE_TOKENS)
class Tokenizer(object):
"""
Tokenizer object that handles text tokenization, command tokens, and type tokens.
Command tokens and text tokens are stored together in one mapping of size
`len(text_tokenizer) + len(command_tokens)`. Command tokens occupy the first
`len(command_tokens)` ids, and text token `idx` is stored at `idx + len(command_tokens)`.
Token types are stored in a separate mapping of size `len(type_tokens)`.
"""
def __init__(self, text_tokenizer, command_tokens=None, type_tokens=None):
# set text tokenizer
self.text_tokenizer = text_tokenizer
if not hasattr(self, 'num_text_tokens'):
self.num_text_tokens = len(self.text_tokenizer)
# set command tokens
if command_tokens is None:
command_tokens = DEFAULT_COMMAND_TOKENS
self._command_tokens = command_tokens
self.command_name_map = {tok.name: tok for tok in self._command_tokens}
self.command_token_map = {tok.token: tok for tok in self._command_tokens}
self.command_id_map = {tok.Id: tok for tok in self._command_tokens}
if not hasattr(self, 'num_command_tokens'):
self.num_command_tokens = len(self._command_tokens)
if not hasattr(self, 'num_tokens'):
self.num_tokens = self.num_command_tokens + self.num_text_tokens
# set type tokens
if type_tokens is None:
type_tokens = DEFAULT_TYPE_TOKENS
self.type_tokens = type_tokens
self.type_name_map = {tok.name: tok for tok in self.type_tokens}
self.type_token_map = {tok.token: tok for tok in self.type_tokens}
self.type_id_map = {tok.Id: tok for tok in self.type_tokens}
if not hasattr(self, 'num_type_tokens'):
self.num_type_tokens = len(self.type_tokens)
# parse tokens and vocabs from tokenizer
self._tokens = list(self.command_token_map.keys()) + list(self.text_tokenizer.tokens)
self._vocab = {t: Id for Id, t in self.command_id_map.items()}
self._vocab.update({t: Id + self.num_command_tokens for t,
Id in self.text_tokenizer.vocab.items()})
self._text_tokens = list(self.text_tokenizer.tokens)
self._text_token_vocab = {
t: Id + self.num_command_tokens for t,
Id in self.text_tokenizer.vocab.items()}
self._command_token_tokens = list(self.command_token_map.keys())
self._command_token_vocab = {t: Id for Id, t in self.command_id_map.items()}
self._token_types = list(self.type_token_map.keys())
self._token_type_vocab = {t: Id for Id, t in self.type_id_map.items()}
def __call__(self, text, process_fn=None):
"""run preprocessing and encode text as Ids"""
return self.EncodeAsIds(text, process_fn=process_fn)
def __len__(self):
"""total number of tokens"""
return self.num_tokens
def get_command(self, name):
"""get command token corresponding to `name`"""
return self.command_name_map[name]
def get_type(self, name):
"""get type token corresponding to `name`"""
return self.type_name_map[name]
@property
def tokens(self):
"""list (or iterable) of all tokens for tokenizer"""
return self._tokens
@property
def vocab(self):
"""dictionary mapping tokens to ids for tokenizer"""
return self._vocab
@property
def token_types(self):
"""list (or iterable) of all token types for tokenizer"""
return self._token_types
@property
def token_type_vocab(self):
"""dictionary mapping token types to ids for tokenizer"""
return self._token_type_vocab
@property
def command_tokens(self):
"""list (or iterable) of all command tokens for tokenizer"""
return self._command_token_tokens
@property
def command_token_vocab(self):
"""dictionary mapping command tokens to ids for tokenizer"""
return self._command_token_vocab
@property
def text_tokens(self):
"""list (or iterable) of text tokens for text tokenizer"""
return self._text_tokens
@property
def text_token_vocab(self):
"""dictionary mapping text tokens to ids for text tokenizer"""
return self._text_token_vocab
def EncodeAsIds(self, text, process_fn=None):
"""
encode text using text tokenizer and shift Id values for command tokens
"""
tokenization = self.text_tokenizer.EncodeAsIds(text, process_fn=process_fn)
tokenization.tokenization = [t + self.num_command_tokens for t in tokenization.tokenization]
tokenization.set_command_tokens(self._command_tokens)
return tokenization
def EncodeAsTokens(self, text, process_fn=None):
"""
encode text as tokens using text tokenizer
"""
tokenization = self.text_tokenizer.EncodeAsTokens(text, process_fn=process_fn)
tokenization.set_command_tokens(self._command_tokens)
return tokenization
def IdToToken(self, Id, type_token=False):
"""convert Id to token accounting for command and type tokens"""
if isinstance(Id, (TypeToken, CommandToken)):
return Id.token
if type_token:
return self.type_id_map[Id].token
if Id < self.num_command_tokens:
return self.command_id_map[Id].token
return self.text_tokenizer.IdToToken(Id - self.num_command_tokens)
def TokenToId(self, token, type_token=False):
"""convert token to Id accounting for command and type tokens"""
if isinstance(token, (TypeToken, CommandToken)):
return token.Id
if type_token:
return self.type_token_map[token].Id
if token in self.command_token_map:
return self.command_token_map[token].Id
return self.text_tokenizer.TokenToId(token) + self.num_command_tokens
def DecodeIds(self, Ids, type_token=False):
"""
convert Ids to tokens accounting for command and type tokens, tokens
are joined and returned as a string.
"""
if type_token:
return ' '.join(Id.token if isinstance(Id, TypeToken)
else self.type_id_map[Id].token for Id in Ids)
rtn_strs = []
current_str = []
if isinstance(Ids, Tokenization):
Ids = Ids.tokenization
for Id in Ids:
if isinstance(Id, CommandToken):
rtn_strs.append(self.text_tokenizer.DecodeIds(current_str))
current_str = []
rtn_strs.append(Id.token)
elif Id < self.num_command_tokens:
rtn_strs.append(self.text_tokenizer.DecodeIds(current_str))
current_str = []
rtn_strs.append(self.command_id_map[Id].token)
else:
current_str.append(Id - self.num_command_tokens)
if current_str != []:
rtn_strs.append(self.text_tokenizer.DecodeIds(current_str))
return ' '.join(rtn_strs)
def DecodeTokens(self, Tokens, type_token=False):
"""
convert tokens to a string accounting for command and type tokens.
"""
if type_token:
return ' '.join(t.token if isinstance(t, TypeToken) else t for t in Tokens)
rtn_strs = []
current_str = []
if isinstance(Tokens, Tokenization):
Tokens = Tokens.tokenization
for t in Tokens:
if isinstance(t, CommandToken):
rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str))
current_str = []
rtn_strs.append(t.token)
elif t in self.command_token_map:
rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str))
current_str = []
rtn_strs.append(t)
else:
current_str.append(t)
if current_str != []:
rtn_strs.append(self.text_tokenizer.DecodeTokens(current_str))
return ' '.join(rtn_strs)
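# Illustrative sketch, not part of the original module: how the combined id space
# described in the class docstring behaves. Command tokens occupy ids
# 0..num_command_tokens-1 and text-token ids are shifted up by that amount.
# The demo function name is hypothetical.
def _example_id_offsets():
    tok = Tokenizer(CharacterLevelTokenizer())   # 8 default command tokens + 256 chars
    ids = tok.EncodeAsIds('hi').tokenization     # [ord('h') + 8, ord('i') + 8]
    assert tok.IdToToken(ids[0]) == 'h'
    assert tok.IdToToken(tok.get_command('pad').Id) == '<pad>'
    return ids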
class TextTokenizer(object):
"""
Interface for text tokenizer
"""
def __init__(self):
if not hasattr(self, 'num_text_tokens'):
self.num_text_tokens = 0
if not hasattr(self, 'num_tokens'):
self.num_tokens = self.num_text_tokens
def __call__(self, text, process_fn=None):
return self.EncodeAsIds(text, process_fn)
def __len__(self):
return self.num_text_tokens
@property
def tokens(self):
"""list (or iterable) of text tokens for text tokenizer"""
raise NotImplementedError('TextTokenizer tokens property not implemented')
@property
def vocab(self):
"""dictionary mapping tokens to ids"""
raise NotImplementedError('TextTokenizer vocab property not implemented')
@staticmethod
def exists(model_path):
"""check if the filepath for a text tokenizer exists"""
raise NotImplementedError('TextTokenizer exists method not implemented')
def Train(self, corpus):
"""train a tokenizer on a data corpus and save model for future use"""
raise NotImplementedError('TextTokenizer Train not implemented')
def EncodeAsIds(self, text, process_fn=None):
"""
Preprocess text and encode as ids. Return a tokenization object with
original text, processed text, and id tokenization.
"""
raise NotImplementedError('TextTokenizer EncodeAsIds not implemented')
def EncodeAsTokens(self, text, process_fn=None):
"""
Preprocess text and encode as tokens. Return a tokenization object with
original text, processed text, and token tokenization.
"""
raise NotImplementedError('TextTokenizer EncodeAsTokens not implemented')
def IdToToken(self, Id):
"""Convert an Id to Token. Reverse lookup of self.vocab"""
raise NotImplementedError('TextTokenizer IdToToken not implemented')
def TokenToId(self, token):
"""Convert a Token to Id. Lookup of self.vocab"""
raise NotImplementedError('TextTokenizer TokenToId not implemented')
def DecodeIds(self, Ids):
"""Convert a list or tokenization object of Ids to a text string"""
raise NotImplementedError('TextTokenizer DecodeIds not implemented')
def DecodeTokens(self, Tokens):
"""Convert a list or tokenization object of tokens to a text string"""
raise NotImplementedError('TextTokenizer DecodeTokens not implemented')
class CharacterLevelTokenizer(TextTokenizer):
"""
Text tokenizer for ASCII-256 Character Level Tokenization.
"""
def __init__(self, **kwargs):
self.num_text_tokens = 256
super(CharacterLevelTokenizer, self).__init__()
self._tokens = [self.IdToToken(Id) for Id in range(self.num_text_tokens)]
self._vocab = {t: i for i, t in enumerate(self._tokens)}
def __len__(self):
return 256
@staticmethod
def exists(model_path):
return True
def Train(self, corpus):
pass
@property
def tokens(self):
return self._tokens
@property
def vocab(self):
return self._vocab
def EncodeAsIds(self, text, process_fn=None):
"""convert text to ascii 256 Ids"""
processed_text = text
if process_fn is not None:
processed_text = process_fn(processed_text)
processed_text = str(processed_text)
tokens = [self.TokenToId(c) for c in processed_text]
return Tokenization(tokens, processed_text, text)
def EncodeAsTokens(self, text, process_fn=None):
"""convert text to ascii 256 characters"""
processed_text = text
if process_fn is not None:
processed_text = process_fn(processed_text)
processed_text = str(processed_text)
tokens = [c for c in processed_text]
return Tokenization(tokens, processed_text, text, asIds=False)
def IdToToken(self, Id):
"""ascii index to character"""
return chr(Id)
def TokenToId(self, token):
"""ascii character to index"""
return ord(token)
def DecodeIds(self, Ids):
"""converts ascii ids to tokens before joining them into text"""
if isinstance(Ids, Tokenization):
Ids = Ids.tokenization
return ''.join([self.IdToToken(tok) for tok in Ids])
def DecodeTokens(self, Tokens):
"""just concatenates ascii tokens into text"""
if isinstance(Tokens, Tokenization):
Tokens = Tokens.tokenization
return ''.join(Tokens)
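# Illustrative sketch, not part of the original module: the character tokenizer
# is a plain chr/ord mapping, so encode/decode round-trips any string whose
# characters fit the 256-id vocabulary.
def _example_char_round_trip():
    tok = CharacterLevelTokenizer()
    ids = tok.EncodeAsIds('abc').tokenization    # [97, 98, 99]
    assert tok.DecodeIds(ids) == 'abc'
    assert tok.DecodeTokens(tok.EncodeAsTokens('abc')) == 'abc'
    return ids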
MAX_SENTENCEPIECE_SENTENCES = 100000000
def get_corpus_freq(dataset, filepath, filetype='tsv'):
"""
Take corpus, split it into sentences, and extract word frequencies.
Write frequencies to `filepath` as a tsv. Only write the first
MAX_SENTENCEPIECE_SENTENCES most common words to the file.
"""
nltk.download('punkt', download_dir="./nltk")
if filetype == 'tsv':
delimiter = '\t'
else:
delimiter = ','
print("compute corpus frequency\n", flush=True)
total_sentence_count = 0
maxlen = 0
freqs = {}
for entry in dataset:
if isinstance(entry, dict):
entry = entry['text']
lines = entry.strip().split('\n')
for line in lines:
sentences = nltk_tokenize.sent_tokenize(line)
total_sentence_count += len(sentences)
for sentence in sentences:
maxlen = max(len(line), maxlen)
for word in sentence.split():
if word not in freqs:
freqs[word] = 0
freqs[word] += 1
print("length of freqs before truncating " + str(len(freqs)), flush=True)
print("file path for freq " + str(filepath), flush=True)
freqs_sorted = {}
counter = 0
for word, count in sorted(freqs.items(), key=lambda x: x[1], reverse=True):
if counter >= MAX_SENTENCEPIECE_SENTENCES:
break
counter += 1
freqs_sorted[word] = count
print("length of freqs after trancating " + str(len(freqs_sorted)), flush=True)
with open(filepath, 'w') as f:
writer = csv.writer(f, delimiter=delimiter)
for k, v in freqs_sorted.items():
writer.writerow([str(k), str(v)])
return total_sentence_count, maxlen
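# Illustrative sketch, not part of the original module: get_corpus_freq emits one
# "word<TAB>count" row per word, most frequent first, which is the
# --input_format=tsv frequency file used for sentencepiece training below.
# The output path and dataset are hypothetical examples, and the call downloads
# the nltk 'punkt' sentence splitter.
def _example_corpus_freq(output_path='./freqs.tsv'):
    dataset = ['the cat sat\nthe cat ran', {'text': 'the dog sat'}]
    sentence_count, maxlen = get_corpus_freq(dataset, output_path)
    # output_path now starts with the most frequent word, e.g. a row like "the\t3"
    return sentence_count, maxlen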
class SentencePieceTokenizer(TextTokenizer):
"""Trains and uses sentencepiece for text tokenization"""
def __init__(self, model_type='bpe', vocab_size=None, corpus=None,
model_path=None, character_coverage=1.0, **kwargs):
self.character_coverage = character_coverage
self.model_type = model_type.lower()
self.spm_model = model_path
self.num_text_tokens = vocab_size
make_train = not SentencePieceTokenizer.exists(self.spm_model)
if make_train:
assert corpus is not None and self.num_text_tokens is not None
self.Train(corpus, self.num_text_tokens)
self._tokens = []
self._vocab = {}
self.load_spm_model()
super(SentencePieceTokenizer, self).__init__()
def __len__(self):
return self.num_text_tokens
@property
def tokens(self):
return self._tokens
@property
def vocab(self):
return self._vocab
@staticmethod
def exists(model_path):
if model_path is None:
return False
# check if path exists
dne = not os.path.exists(model_path)
# check if path.model exists
if dne and not model_path.endswith('.model'):
dne = not os.path.exists(model_path + '.model')
return not dne
def load_spm_model(self):
"""load sentencepiece model and parse vocab"""
if not os.path.exists(self.spm_model) and not self.spm_model.endswith('.model'):
self.spm_model = self.spm_model + '.model'
self.sp = spm.SentencePieceProcessor()
self.sp.Load(self.spm_model)
self.vocab_size = self.num_text_tokens = len(self.sp)
self._tokens = [self.IdToToken(t) for t in range(self.vocab_size)]
self._vocab = {t: i for i, t in enumerate(self._tokens)}
def Train(self, corpus, num_text_tokens):
"""train sentencepiece model on corpus using word frequencies"""
self.num_text_tokens = num_text_tokens
use_model_path = self.spm_model
random_hash = str(random.randint(0, 2147483647))
if use_model_path is None:
use_model_path = random_hash
if use_model_path.endswith('.model'):
use_model_path = use_model_path[:use_model_path.rfind('.model')]
input_path = use_model_path + '.tsv.' + random_hash
line_count, maxlenline = get_corpus_freq(corpus, input_path)
line_count = min(line_count, MAX_SENTENCEPIECE_SENTENCES)
print('line count used as input_sentence_size ', line_count, flush=True)
print('training sentencepiece model', flush=True)
train_string = '--input={file_path} --model_prefix={model_prefix} --vocab_size={vocab_size}' \
+ ' --model_type={model_type} --character_coverage={character_coverage} ' \
+ '--input_sentence_size={input_sentence_size} ' \
+ '--input_format=tsv'
train_string = train_string.format(file_path=input_path, model_prefix=use_model_path, vocab_size=num_text_tokens,
model_type=self.model_type, character_coverage=self.character_coverage,
input_sentence_size=int(line_count))
print("calling spm.SentencePieceTrainer.Train(%s)" % (train_string), flush=True)
spm.SentencePieceTrainer.Train(train_string)
os.remove(input_path)
self.spm_model = use_model_path + '.model'
print('sentencepiece model written to ' + self.spm_model, flush=True)
def EncodeAsIds(self, text, process_fn=None):
"""convert text to sentencepiece Ids"""
processed_text = text
if process_fn is not None:
processed_text = process_fn(processed_text)
tokens = self.sp.EncodeAsIds(processed_text)
return Tokenization(tokens, processed_text, text)
def EncodeAsTokens(self, text, process_fn=None):
"""convert text to sentencepiece tokens"""
processed_text = text
if process_fn is not None:
processed_text = process_fn(processed_text)
tokens = self.sp.EncodeAsTokens(processed_text)
return Tokenization(tokens, processed_text, text, asIds=False)
def IdToToken(self, Id):
"""convert Id to sentencpiece token"""
return self.sp.IdToPiece(Id)
def TokenToId(self, token):
"""convert sentencpiece token to Id"""
return self.sp.PieceToId(token)
def DecodeIds(self, Ids):
"""converts ids to a text string"""
if isinstance(Ids, Tokenization):
Ids = Ids.tokenization
return self.sp.DecodeIds(Ids)
def DecodeTokens(self, Tokens):
"""converts sentencepiece tokens to a text string"""
if isinstance(Tokens, Tokenization):
Tokens = Tokens.tokenization
return self.sp.DecodeTokens(Tokens)
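# Illustrative sketch, not part of the original module: exists() accepts either a
# model prefix or a full '.model' path, which is how the constructor decides
# between loading an existing sentencepiece model and training a new one.
# The prefix, corpus, and vocab size below are hypothetical; the training branch
# runs real sentencepiece training and needs the nltk 'punkt' model.
def _example_sentencepiece(model_prefix='./spm_demo'):
    if SentencePieceTokenizer.exists(model_prefix):
        return SentencePieceTokenizer(model_path=model_prefix)
    corpus = ['some plain text lines', 'used to build word frequencies']
    return SentencePieceTokenizer(corpus=corpus, vocab_size=1000,
                                  model_path=model_prefix, model_type='bpe')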
class BertWordPieceTokenizer(Tokenizer):
"""
Loads a pretrained WordPiece tokenizer from `cache_dir` for tokenization
in BERT training. Defaults to the bert-large-uncased tokenizer.
"""
def __init__(self, tokenizer_model_type=None, cache_dir=None, **kwargs):
# default to bert-large-uncased tokenizer
if tokenizer_model_type not in PRETRAINED_VOCAB_ARCHIVE_MAP:
tokenizer_model_type = 'bert-large-uncased'
if torch.distributed.get_rank() == 0:
print(
'loading BertWordPieceTokenizer (',
tokenizer_model_type,
') from cache_dir ',
cache_dir)
do_lower_case = not ('-cased' in tokenizer_model_type or 'chinese' in tokenizer_model_type)
self.text_tokenizer = BertTokenizer.from_pretrained(
tokenizer_model_type, do_lower_case=do_lower_case, cache_dir=cache_dir)
if torch.distributed.get_rank() == 0:
print('loaded', tokenizer_model_type)
# disable max len warnings by increasing max len
self.text_tokenizer.max_len = int(1e12)
# set command tokens from wordpiece tokenizer values
self.num_command_tokens = 5
self.num_tokens = len(self.text_tokenizer.vocab)
self.num_text_tokens = self.num_tokens - 5
self.num_type_tokens = 2
self._command_tokens = [
CommandToken('pad', '[PAD]', self.text_tokenizer.vocab['[PAD]']),
CommandToken('ENC', '[CLS]', self.text_tokenizer.vocab['[CLS]']),
CommandToken('MASK', '[MASK]', self.text_tokenizer.vocab['[MASK]']),
CommandToken('unk', '[UNK]', self.text_tokenizer.vocab['[UNK]']),
CommandToken('sep', '[SEP]', self.text_tokenizer.vocab['[SEP]']),
]
self.command_name_map = {tok.name: tok for tok in self._command_tokens}
self.command_token_map = {tok.token: tok for tok in self._command_tokens}
self.command_id_map = {tok.Id: tok for tok in self._command_tokens}
# set type tokens
self.type_tokens = [
TypeToken('str0', '<str0>', 0),
TypeToken('str1', '<str1>', 1),
]
self.type_name_map = {tok.name: tok for tok in self.type_tokens}
self.type_token_map = {tok.token: tok for tok in self.type_tokens}
self.type_id_map = {tok.Id: tok for tok in self.type_tokens}
# parse tokens and vocabs from tokenizer
self._tokens = list(self.text_tokenizer.vocab.keys())
self._vocab = {k: v for k, v in self.text_tokenizer.vocab.items()}
self._text_tokens = list(self._tokens)
self._text_token_vocab = {k: v for k, v in self.text_tokenizer.vocab.items()}
self._command_token_tokens = list(self.command_token_map.keys())
self._command_token_vocab = {t: Id for Id, t in self.command_id_map.items()}
self._token_types = list(self.type_token_map.keys())
self._token_type_vocab = {t: Id for Id, t in self.type_id_map.items()}
def EncodeAsIds(self, text, process_fn=None):
"""convert text to wordpiece Ids"""
processed_text = text
if process_fn is not None:
processed_text = process_fn(processed_text)
tokens = self.text_tokenizer.tokenize(processed_text)
Ids = self.text_tokenizer.convert_tokens_to_ids(tokens)
return Tokenization(Ids, processed_text, text)
def EncodeAsTokens(self, text, process_fn=None):
"""convert wordpiece token to Id"""
processed_text = text
if process_fn is not None:
processed_text = process_fn(processed_text)
tokens = self.text_tokenizer.tokenize(processed_text)
return Tokenization(tokens, processed_text, text, asIds=False)
def IdToToken(self, Id, type_token=False):
"""convert Id to sentencpiece token"""
if isinstance(Id, (TypeToken, CommandToken)):
return Id.token
if type_token:
return self.type_id_map[Id].token
return self.text_tokenizer.ids_to_tokens[Id]
def TokenToId(self, token, type_token=False):
"""convert sentencpiece token to Id"""
if isinstance(token, (TypeToken, CommandToken)):
return token.Id
if type_token:
return self.type_token_map[token].Id
return self.text_tokenizer.vocab[token]
def DecodeIds(self, Ids, type_token=False):
"""converts ids to wordpiece tokens and joins them as a text string"""
if type_token:
return ' '.join(Id.token if isinstance(Id, TypeToken)
else self.type_id_map[Id].token for Id in Ids)
if isinstance(Ids, Tokenization):
Ids = Ids.tokenization
Tokens = []
for Id in Ids:
Tokens.append(self.text_tokenizer.ids_to_tokens[Id] if Id != -1 else '-1')
return ' '.join(Tokens)
def DecodeTokens(self, Tokens, type_token=False):
"""converts wordpiece tokens to a text string"""
if type_token:
return ' '.join(t.token if isinstance(t, TypeToken) else t for t in Tokens)
if isinstance(Tokens, Tokenization):
Tokens = Tokens.tokenization
return ' '.join(Tokens)
class GPT2BPETokenizer(Tokenizer):
def __init__(self, cache_dir=None, **kwargs):
self.text_tokenizer = GPT2Tokenizer.from_pretrained('gpt2',
cache_dir=cache_dir)
# disable max len warnings by increasing max len
self.text_tokenizer.max_len = int(1e12)
self.num_command_tokens = 2
self.num_tokens = len(self.text_tokenizer.encoder)
self.num_text_tokens = self.num_tokens - 1
self.num_type_tokens = 2
self._command_tokens = [
CommandToken('pad', '<|endoftext|>', self.text_tokenizer.encoder['<|endoftext|>']),
CommandToken('eos', '<|endoftext|>', self.text_tokenizer.encoder['<|endoftext|>']),
]
self.command_name_map = {tok.name: tok for tok in self._command_tokens}
self.command_token_map = {tok.token: tok for tok in self._command_tokens}
self.command_id_map = {tok.Id: tok for tok in self._command_tokens}
self.type_tokens = [
TypeToken('str0', '<str0>', 0),
TypeToken('str1', '<str1>', 1),
]
self.type_name_map = {tok.name: tok for tok in self.type_tokens}
self.type_token_map = {tok.token: tok for tok in self.type_tokens}
self.type_id_map = {tok.Id: tok for tok in self.type_tokens}
self._tokens = list(self.text_tokenizer.encoder.keys())
self._vocab = {k: v for k, v in self.text_tokenizer.encoder.items()}
self._text_tokens = list(self._tokens)
self._text_token_vocab = {k: v for k, v in self.text_tokenizer.encoder.items()}
self._command_token_tokens = list(self.command_token_map.keys())
self._command_token_vocab = {t: Id for Id, t in self.command_id_map.items()}
self._token_types = list(self.type_token_map.keys())
self._token_type_vocab = {t: Id for Id, t in self.type_id_map.items()}
def EncodeAsIds(self, text, process_fn=None):
processed_text = text
if process_fn is not None:
processed_text = process_fn(processed_text)
Ids = self.text_tokenizer.encode(processed_text)
tokenization = Tokenization(Ids, processed_text, text)
tokenization.set_command_tokens(self._command_tokens)
return tokenization
def EncodeAsTokens(self, text, process_fn=None):
processed_text = text
if process_fn is not None:
processed_text = process_fn(processed_text)
tokens = []
for token in re.findall(self.text_tokenizer.pat, processed_text):
token = ''.join(self.text_tokenizer.byte_encoder[b] for b in token.encode('utf-8'))
tokens.extend(bpe_token for bpe_token in self.text_tokenizer.bpe(token).split(' '))
tokenization = Tokenization(tokens, processed_text, text, asIds=False)
tokenization.set_command_tokens(self._command_tokens)
return tokenization
def IdToToken(self, Id, type_token=False):
if isinstance(Id, (TypeToken, CommandToken)):
return Id.token
if type_token:
return self.type_id_map[Id].token
return self.text_tokenizer.decoder[Id]
def TokenToId(self, token, type_token=False):
if isinstance(token, (TypeToken, CommandToken)):
return token.Id
if type_token:
return self.type_token_map[token].Id
return self.text_tokenizer.encoder[token]
def DecodeIds(self, Ids, type_token=False):
if type_token:
return ' '.join(Id.token if isinstance(Id, TypeToken)
else self.type_id_map[Id].token for Id in Ids)
if isinstance(Ids, Tokenization):
Ids = Ids.tokenization
return self.text_tokenizer.decode(Ids)
def DecodeTokens(self, Tokens, type_token=False):
if type_token:
return ' '.join(t.token if isinstance(t, TypeToken) else t for t in Tokens)
if isinstance(Tokens, Tokenization):
Tokens = Tokens.tokenization
return self.text_tokenizer.decode([self.TokenToId(tok) for tok in Tokens])
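# Illustrative sketch, not part of the original module: the GPT-2 wrapper
# downloads (or reads from cache_dir) the pretrained vocab and merges files, and
# maps both the 'pad' and 'eos' command tokens to the single <|endoftext|> id.
# The demo function name is hypothetical.
def _example_gpt2_bpe(cache_dir=None):
    tok = GPT2BPETokenizer(cache_dir=cache_dir)
    ids = tok.EncodeAsIds('Hello world').tokenization
    assert tok.DecodeIds(ids) == 'Hello world'
    assert tok.get_command('eos').Id == tok.get_command('pad').Id
    return ids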
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import sys
import json
import logging
import os
import regex as re
from io import open
try:
from functools import lru_cache
except ImportError:
# Just a dummy decorator to get the checks to run on python2
# because honestly I don't want to support a byte-level unicode BPE
# tokenizer on python 2 right now.
def lru_cache():
return lambda func: func
from .file_utils import cached_path
logger = logging.getLogger(__name__)
PRETRAINED_VOCAB_ARCHIVE_MAP = {
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
}
PRETRAINED_MERGES_ARCHIVE_MAP = {
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
}
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
'gpt2': 1024,
}
VOCAB_NAME = 'vocab.json'
MERGES_NAME = 'merges.txt'
SPECIAL_TOKENS_NAME = 'special_tokens.txt'
@lru_cache()
def bytes_to_unicode():
"""
Returns a mapping from utf-8 bytes to unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings,
and we avoid mapping to whitespace/control characters the bpe code barfs on.
"""
_chr = unichr if sys.version_info[0] == 2 else chr
bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + \
list(range(ord("®"), ord("ÿ") + 1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [_chr(n) for n in cs]
return dict(zip(bs, cs))
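# Illustrative sketch, not part of the original module: printable bytes map to
# themselves while bytes the BPE code would choke on (space, control bytes, ...)
# are shifted to unused code points starting at 256, e.g. space becomes 'Ġ'.
def _example_bytes_to_unicode():
    b2u = bytes_to_unicode()
    assert len(b2u) == 256 and len(set(b2u.values())) == 256   # bijective
    assert b2u[ord('A')] == 'A'        # printable ASCII is unchanged
    assert b2u[ord(' ')] == '\u0120'   # space is remapped ('Ġ')
    return b2u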
def get_pairs(word):
"""Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
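# Illustrative sketch, not part of the original module: get_pairs lists the
# adjacent symbol pairs that bpe() below scores against bpe_ranks on every merge
# step; after a merge the word shrinks and the pairs are recomputed.
def _example_get_pairs():
    assert get_pairs(('l', 'o', 'w')) == {('l', 'o'), ('o', 'w')}
    assert get_pairs(('lo', 'w')) == {('lo', 'w')}
    return True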
class GPT2Tokenizer(object):
"""
GPT-2 BPE tokenizer. Peculiarities:
- Byte-level BPE
"""
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
"""
Instantiate a PreTrainedBertModel from a pre-trained model file.
Download and cache the pre-trained model file if needed.
"""
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
special_tokens_file = None
else:
vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)
if not os.path.exists(special_tokens_file):
special_tokens_file = None
else:
logger.info("loading special tokens file {}".format(special_tokens_file))
# redirect to the cache, if necessary
try:
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find files {} and {} "
"at this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
pretrained_model_name_or_path,
vocab_file, merges_file))
return None
if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
logger.info("loading vocabulary file {}".format(vocab_file))
logger.info("loading merges file {}".format(merges_file))
else:
logger.info("loading vocabulary file {} from cache at {}".format(
vocab_file, resolved_vocab_file))
logger.info("loading merges file {} from cache at {}".format(
merges_file, resolved_merges_file))
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
# if we're using a pretrained model, ensure the tokenizer won't index sequences longer
# than the number of positional embeddings
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
# Instantiate tokenizer.
if special_tokens_file and 'special_tokens' not in kwargs:
special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
else:
special_tokens = kwargs.pop('special_tokens', [])
tokenizer = cls(
resolved_vocab_file,
resolved_merges_file,
special_tokens=special_tokens,
*inputs,
**kwargs)
return tokenizer
def __init__(self, vocab_file, merges_file, errors='replace',
special_tokens=None, max_len=None):
self.max_len = max_len if max_len is not None else int(1e12)
self.encoder = json.load(open(vocab_file))
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_data]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
# Should have added re.IGNORECASE so BPE merges can happen for
# capitalized versions of contractions
self.pat = re.compile(
r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
self.special_tokens = {}
self.special_tokens_decoder = {}
self.set_special_tokens(special_tokens)
def __len__(self):
return len(self.encoder) + len(self.special_tokens)
def set_special_tokens(self, special_tokens):
""" Add a list of additional tokens to the encoder.
The additional tokens are indexed starting from the last index of the
current vocabulary in the order of the `special_tokens` list.
"""
if not special_tokens:
self.special_tokens = {}
self.special_tokens_decoder = {}
return
self.special_tokens = dict((tok, len(self.encoder) + i)
for i, tok in enumerate(special_tokens))
self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()}
logger.info("Special tokens {}".format(self.special_tokens))
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except ValueError:
new_word.extend(word[i:])
break
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = ' '.join(word)
self.cache[token] = word
return word
def tokenize(self, text):
""" Tokenize a string. """
bpe_tokens = []
for token in re.findall(self.pat, text):
if sys.version_info[0] == 2:
token = ''.join(self.byte_encoder[ord(b)] for b in token)
else:
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
return bpe_tokens
def convert_tokens_to_ids(self, tokens):
""" Converts a sequence of tokens into ids using the vocab. """
ids = []
if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
if tokens in self.special_tokens:
return self.special_tokens[tokens]
else:
return self.encoder.get(tokens, 0)
for token in tokens:
if token in self.special_tokens:
ids.append(self.special_tokens[token])
else:
ids.append(self.encoder.get(token, 0))
if len(ids) > self.max_len:
logger.warning(
"Token indices sequence length is longer than the specified maximum "
" sequence length for this OpenAI GPT model ({} > {}). Running this"
" sequence through the model will result in indexing errors".format(
len(ids), self.max_len)
)
return ids
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
"""Converts a sequence of ids in BPE tokens using the vocab."""
tokens = []
for i in ids:
if i in self.special_tokens_decoder:
if not skip_special_tokens:
tokens.append(self.special_tokens_decoder[i])
else:
tokens.append(self.decoder[i])
return tokens
def encode(self, text):
return self.convert_tokens_to_ids(self.tokenize(text))
def decode(self, tokens):
text = ''.join([self.decoder[token] for token in tokens])
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
return text
def save_vocabulary(self, vocab_path):
"""Save the tokenizer vocabulary and merge files to a directory."""
if not os.path.isdir(vocab_path):
logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
return
vocab_file = os.path.join(vocab_path, VOCAB_NAME)
merge_file = os.path.join(vocab_path, MERGES_NAME)
special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
with open(vocab_file, 'w', encoding='utf-8') as f:
f.write(json.dumps(self.encoder, ensure_ascii=False))
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write(u'#version: 0.2\n')
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!".format(merge_file))
index = token_index
writer.write(' '.join(bpe_tokens) + u'\n')
index += 1
index = len(self.encoder)
with open(special_tokens_file, 'w', encoding='utf-8') as writer:
for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
" Please check that the tokenizer is not corrupted!".format(special_tokens_file))
index = token_index
writer.write(token + u'\n')
index += 1
return vocab_file, merge_file, special_tokens_file
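# Illustrative sketch, not part of the original module: encode() is tokenize()
# plus a vocab lookup, and decode() reverses both the BPE split and the
# byte-to-unicode mapping, so ordinary text round-trips exactly. Loading 'gpt2'
# downloads (or reads from cache_dir) the vocab and merges files.
def _example_gpt2_roundtrip(cache_dir=None):
    tok = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=cache_dir)
    text = 'Hello world!'
    ids = tok.encode(text)
    assert tok.decode(ids) == text
    return ids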
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes. Provided as is from https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/tokenization.py"""
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import logging
import os
import unicodedata
from io import open
from .file_utils import cached_path
logger = logging.getLogger(__name__)
PRETRAINED_VOCAB_ARCHIVE_MAP = {
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
}
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
'bert-base-uncased': 512,
'bert-large-uncased': 512,
'bert-base-cased': 512,
'bert-large-cased': 512,
'bert-base-multilingual-uncased': 512,
'bert-base-multilingual-cased': 512,
'bert-base-chinese': 512,
}
VOCAB_NAME = 'vocab.txt'
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
index = 0
with open(vocab_file, "r", encoding="utf-8") as reader:
while True:
token = reader.readline()
if not token:
break
token = token.strip()
vocab[token] = index
index += 1
return vocab
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
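# Illustrative sketch, not part of the original module: vocab.txt holds one
# wordpiece per line and load_vocab assigns ids by line order, which is what
# convert_tokens_to_ids/convert_ids_to_tokens below rely on. The file path is a
# hypothetical example.
def _example_load_vocab(path='./vocab_demo.txt'):
    with open(path, 'w', encoding='utf-8') as f:
        f.write('[PAD]\n[UNK]\nhello\n##ing\n')
    vocab = load_vocab(path)
    assert vocab['[PAD]'] == 0 and vocab['##ing'] == 3
    assert whitespace_tokenize('  hello world \n') == ['hello', 'world']
    return vocab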
class BertTokenizer(object):
"""Runs end-to-end tokenization: punctuation splitting + wordpiece"""
def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True,
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
"""Constructs a BertTokenizer.
Args:
vocab_file: Path to a one-wordpiece-per-line vocabulary file
do_lower_case: Whether to lower case the input
Only has an effect when do_wordpiece_only=False
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
max_len: An artificial maximum length to truncate tokenized sequences to;
Effective maximum length is always the minimum of this
value (if specified) and the underlying BERT model's
sequence length.
never_split: List of tokens which will never be split during tokenization.
Only has an effect when do_wordpiece_only=False
"""
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
"model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict(
[(ids, tok) for tok, ids in self.vocab.items()])
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
never_split=never_split)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
self.max_len = max_len if max_len is not None else int(1e12)
def tokenize(self, text):
if self.do_basic_tokenize:
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
else:
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens
def convert_tokens_to_ids(self, tokens):
"""Converts a sequence of tokens into ids using the vocab."""
ids = []
for token in tokens:
ids.append(self.vocab[token])
if len(ids) > self.max_len:
logger.warning(
"Token indices sequence length is longer than the specified maximum "
" sequence length for this BERT model ({} > {}). Running this"
" sequence through BERT will result in indexing errors".format(
len(ids), self.max_len)
)
return ids
def convert_ids_to_tokens(self, ids):
"""Converts a sequence of ids in wordpiece tokens using the vocab."""
tokens = []
for i in ids:
tokens.append(self.ids_to_tokens[i])
return tokens
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
"""
Instantiate a PreTrainedBertModel from a pre-trained model file.
Download and cache the pre-trained model file if needed.
"""
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
else:
vocab_file = pretrained_model_name_or_path
if os.path.isdir(vocab_file):
vocab_file = os.path.join(vocab_file, VOCAB_NAME)
# redirect to the cache, if necessary
try:
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
vocab_file))
return None
if resolved_vocab_file == vocab_file:
logger.info("loading vocabulary file {}".format(vocab_file))
else:
logger.info("loading vocabulary file {} from cache at {}".format(
vocab_file, resolved_vocab_file))
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
# if we're using a pretrained model, ensure the tokenizer won't index sequences longer
# than the number of positional embeddings
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
# Instantiate tokenizer.
tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
return tokenizer
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self,
do_lower_case=True,
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
self.never_split = never_split
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case and token not in self.never_split:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
if text in self.never_split:
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
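# Illustrative sketch, not part of the original module: greedy
# longest-match-first wordpiece lookup with a tiny made-up vocabulary,
# mirroring the "unaffable" example in the docstring above.
def _example_wordpiece_tokenizer():
    vocab = {'un': 0, '##aff': 1, '##able': 2, '[UNK]': 3}
    wp = WordpieceTokenizer(vocab=vocab)
    assert wp.tokenize('unaffable') == ['un', '##aff', '##able']
    assert wp.tokenize('xyzq') == ['[UNK]']   # no greedy cover -> unk_token
    return wp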
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
...@@ -19,77 +19,101 @@ import math ...@@ -19,77 +19,101 @@ import math
from megatron import print_rank_0 from megatron import print_rank_0
class AnnealingLR(object):
    """Anneals the learning rate."""

-    def __init__(self, optimizer, start_lr,
-                 warmup_iter, total_iters,
-                 decay_style, last_iter, min_lr=0.0,
+    def __init__(self, optimizer, max_lr, min_lr,
+                 warmup_steps, decay_steps,
+                 decay_style, num_steps,
                  use_checkpoint_lr_scheduler=True,
                  override_lr_scheduler=False):
        # Class values.
        self.optimizer = optimizer
-        self.start_lr = start_lr
+        self.max_lr = float(max_lr)
        self.min_lr = min_lr
-        self.warmup_iter = warmup_iter
-        self.num_iters = last_iter
-        self.end_iter = total_iters
-        assert self.end_iter > 0
+        assert self.min_lr >= 0.0
+        assert self.max_lr >= self.min_lr
+        self.warmup_steps = warmup_steps
+        self.num_steps = num_steps
+        self.decay_steps = decay_steps
+        assert self.decay_steps > 0
+        assert self.warmup_steps < self.decay_steps
        self.decay_style = decay_style
        self.override_lr_scheduler = override_lr_scheduler
        self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler
        if self.override_lr_scheduler:
            assert not self.use_checkpoint_lr_scheduler, 'both override and '\
                'use-checkpoint are set.'
        # Set the learning rate
-        self.step(self.num_iters)
+        self.step(step_num=self.num_steps)
        print_rank_0('> learning rate decay style: {}'.format(self.decay_style))

    def get_lr(self):
        """Learning rate decay functions from:
              https://openreview.net/pdf?id=BJYwwY9ll pg. 4"""
-        num_iters_ = min(self.num_iters, self.end_iter - self.warmup_iter)
-        # Warmup.
-        if self.warmup_iter > 0 and self.num_iters <= self.warmup_iter:
-            return float(self.start_lr) * num_iters_ / self.warmup_iter
-        num_iters_ = num_iters_ - self.warmup_iter
+        # Use linear warmup for the initial part.
+        if self.warmup_steps > 0 and self.num_steps <= self.warmup_steps:
+            return self.max_lr * float(self.num_steps) / \
+                float(self.warmup_steps)
+        # If the learning rate is constant, just return the initial value.
+        if self.decay_style == 'constant':
+            return self.max_lr
+        # For any steps larger than `self.decay_steps`, use `self.min_lr`.
+        if self.num_steps > self.decay_steps:
+            return self.min_lr
+        # If we are done with the warmup period, use the decay style.
+        num_steps_ = self.num_steps - self.warmup_steps
+        decay_steps_ = self.decay_steps - self.warmup_steps
+        decay_ratio = float(num_steps_) / float(decay_steps_)
+        assert decay_ratio >= 0.0
+        assert decay_ratio <= 1.0
+        delta_lr = self.max_lr - self.min_lr
        if self.decay_style == 'linear':
-            lr = self.start_lr * (self.end_iter - num_iters_) / self.end_iter
+            coeff = (1.0 - decay_ratio)
        elif self.decay_style == 'cosine':
-            lr = self.start_lr / 2.0 * (math.cos(
-                math.pi * num_iters_ / self.end_iter) + 1)
-        elif self.decay_style == 'exponential':
-            # exp(-0.693) = 1/2
-            lr = self.start_lr * math.exp(-0.693 * num_iters_ / self.end_iter)
+            coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0)
        else:
-            lr = self.start_lr
-        return max(lr, self.min_lr)
+            raise Exception('{} decay style is not supported.'.format(
+                self.decay_style))
+        return self.min_lr + coeff * delta_lr

-    def step(self, step_num=None):
+    def step(self, increment=1, step_num=None):
        """Set lr for all parameters groups."""
        if step_num is None:
-            step_num = self.num_iters + 1
-        self.num_iters = step_num
+            step_num = self.num_steps + increment
+        self.num_steps = step_num
        new_lr = self.get_lr()
        for group in self.optimizer.param_groups:
            group['lr'] = new_lr

    def state_dict(self):
        state_dict = {
-            'start_lr': self.start_lr,
-            'warmup_iter': self.warmup_iter,
-            'num_iters': self.num_iters,
+            'max_lr': self.max_lr,
+            'warmup_steps': self.warmup_steps,
+            'num_steps': self.num_steps,
            'decay_style': self.decay_style,
-            'end_iter': self.end_iter,
+            'decay_steps': self.decay_steps,
            'min_lr': self.min_lr
        }
        return state_dict

    def _check_and_set(self, cls_value, sd_value, name):
        """Auxiliary function for checking the values in the checkpoint and
        setting them."""
@@ -104,20 +128,39 @@ class AnnealingLR(object):
                            name))
        return sd_value

    def load_state_dict(self, sd):
-        self.start_lr = self._check_and_set(self.start_lr, sd['start_lr'],
-                                            'learning rate')
+        if 'start_lr' in sd:
+            max_lr_ = sd['start_lr']
+        else:
+            max_lr_ = sd['max_lr']
+        self.max_lr = self._check_and_set(self.max_lr, max_lr_,
+                                          'learning rate')
        self.min_lr = self._check_and_set(self.min_lr, sd['min_lr'],
                                          'minimum learning rate')
-        self.warmup_iter = self._check_and_set(self.warmup_iter,
-                                               sd['warmup_iter'],
-                                               'warmup iterations')
-        self.end_iter = self._check_and_set(self.end_iter, sd['end_iter'],
-                                            'total number of iterations')
+        if 'warmup_iter' in sd:
+            warmup_steps_ = sd['warmup_iter']
+        else:
+            warmup_steps_ = sd['warmup_steps']
+        self.warmup_steps = self._check_and_set(self.warmup_steps,
+                                                warmup_steps_,
+                                                'warmup iterations')
+        if 'end_iter' in sd:
+            decay_steps_ = sd['end_iter']
+        else:
+            decay_steps_ = sd['decay_steps']
+        self.decay_steps = self._check_and_set(self.decay_steps, decay_steps_,
+                                               'total number of iterations')
        self.decay_style = self._check_and_set(self.decay_style,
                                               sd['decay_style'],
                                               'decay style')
-        self.num_iters = sd['num_iters']
-        self.step(self.num_iters)
+        if 'num_iters' in sd:
+            self.num_steps = sd['num_iters']
+        else:
+            self.num_steps = sd['num_steps']
+        self.step(step_num=self.num_steps)
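
The rewritten scheduler folds warmup and decay into one closed form: linear warmup to max_lr over warmup_steps, then a linear or cosine interpolation from max_lr down to min_lr over the remaining decay_steps. As a standalone sketch of that formula (the hyperparameter values below are illustrative and not taken from this merge request):

import math

def pretraining_lr(step, max_lr=1.5e-4, min_lr=1e-5,
                   warmup_steps=2000, decay_steps=100000,
                   decay_style='cosine'):
    """Standalone mirror of the schedule logic above, for one step."""
    # Linear warmup: ramp from 0 to max_lr over warmup_steps.
    if warmup_steps > 0 and step <= warmup_steps:
        return max_lr * step / warmup_steps
    if decay_style == 'constant':
        return max_lr
    # Past the decay horizon the rate is clamped to min_lr.
    if step > decay_steps:
        return min_lr
    decay_ratio = (step - warmup_steps) / (decay_steps - warmup_steps)
    if decay_style == 'linear':
        coeff = 1.0 - decay_ratio
    elif decay_style == 'cosine':
        coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0)
    else:
        raise ValueError('unsupported decay style: {}'.format(decay_style))
    return min_lr + coeff * (max_lr - min_lr)

# A few sample points: end of warmup, cosine midpoint, and past the horizon.
print(pretraining_lr(2000))    # 1.5e-04
print(pretraining_lr(51000))   # 8.0e-05  (min_lr + 0.5 * (max_lr - min_lr))
print(pretraining_lr(200000))  # 1.0e-05

Compared with the old code, 'constant' is now an explicit style, the 'exponential' style is gone, and an unknown style raises instead of silently returning the base rate.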
@@ -37,7 +37,7 @@ from megatron.model import DistributedDataParallel as LocalDDP
from megatron.model import get_params_for_weight_decay_optimization
from megatron.model.realm_model import ICTBertModel
from megatron.utils import check_adlr_autoresume_termination
-from megatron.utils import make_data_loader
+from megatron.data.data_loaders import build_pretraining_data_loader
from megatron.utils import report_memory

@@ -194,12 +194,12 @@ def get_learning_rate_scheduler(optimizer):
    warmup_iter = args.warmup * num_iters
    lr_scheduler = AnnealingLR(
        optimizer,
-        start_lr=args.lr,
-        warmup_iter=warmup_iter,
-        total_iters=num_iters,
-        decay_style=args.lr_decay_style,
-        last_iter=init_step,
+        max_lr=args.lr,
        min_lr=args.min_lr,
+        warmup_steps=warmup_iter,
+        decay_steps=num_iters,
+        decay_style=args.lr_decay_style,
+        num_steps=init_step,
        use_checkpoint_lr_scheduler=args.use_checkpoint_lr_scheduler,
        override_lr_scheduler=args.override_lr_scheduler)

@@ -224,7 +224,8 @@ def setup_model_and_optimizer(model_provider_func):
    while hasattr(unwrapped_model, 'module'):
        unwrapped_model = unwrapped_model.module
-    if args.iteration == 0 and hasattr(unwrapped_model, 'init_state_dict_from_bert'):
+    if args.iteration == 0 and hasattr(unwrapped_model,
+                                       'init_state_dict_from_bert'):
        print("Initializing ICT from pretrained BERT model", flush=True)
        unwrapped_model.init_state_dict_from_bert()

@@ -414,6 +415,8 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
                                       optimizer,
                                       lr_scheduler)
        iteration += 1
+        args.consumed_train_samples += mpu.get_data_parallel_world_size() * \
+                                       args.batch_size
        # Logging.
        loss_scale = None

@@ -472,6 +475,8 @@ def evaluate(forward_step_func, data_iterator, model, verbose=False):
                                args.eval_iters))
            # Forward evaluation.
            _, loss_dict = forward_step_func(data_iterator, model)
+            args.consumed_valid_samples += mpu.get_data_parallel_world_size() \
+                                           * args.batch_size
            # Reduce across processes.
            for key in loss_dict:
                total_loss_dict[key] = total_loss_dict.get(key, 0.) + \

@@ -517,11 +522,19 @@ def build_train_valid_test_data_iterators(
    (train_dataloader, valid_dataloader, test_dataloader) = (None, None, None)
    print_rank_0('> building train, validation, and test datasets ...')
+    # Rank and global batch size.
+    data_parallel_size = mpu.get_data_parallel_world_size()
+    global_batch_size = args.batch_size * data_parallel_size
+    # Backward compatibility, assume fixed batch size.
+    if args.iteration > 0 and args.consumed_train_samples == 0:
+        args.consumed_train_samples = args.iteration * global_batch_size
+    if args.iteration > 0 and args.consumed_valid_samples == 0:
+        args.consumed_valid_samples = (args.iteration // args.eval_interval) * \
+            args.eval_iters * global_batch_size
    # Data loader only on rank 0 of each model parallel group.
    if mpu.get_model_parallel_rank() == 0:
-        # Rank, size, and global batch size.
-        data_parallel_size = mpu.get_data_parallel_world_size()
-        global_batch_size = args.batch_size * data_parallel_size
        # Number of train/valid/test samples.
        train_iters = args.train_iters

@@ -540,9 +553,11 @@ def build_train_valid_test_data_iterators(
                                train_val_test_num_samples)
        # Build dataloders.
-        train_dataloader = make_data_loader(train_ds)
-        valid_dataloader = make_data_loader(valid_ds)
-        test_dataloader = make_data_loader(test_ds)
+        train_dataloader = build_pretraining_data_loader(
+            train_ds, args.consumed_train_samples)
+        valid_dataloader = build_pretraining_data_loader(
+            valid_ds, args.consumed_valid_samples)
+        test_dataloader = build_pretraining_data_loader(test_ds, 0)
        # Flags to know if we need to do training/validation/testing.
        do_train = train_dataloader is not None and args.train_iters > 0

@@ -561,21 +576,7 @@ def build_train_valid_test_data_iterators(
    args.do_train = flags[0].item()
    args.do_valid = flags[1].item()
    args.do_test = flags[2].item()
-    # Shift the start iterations.
-    if train_dataloader is not None:
-        train_dataloader.batch_sampler.start_iter = args.iteration % \
-            len(train_dataloader)
-        print_rank_0('setting training data start iteration to {}'.
-                     format(train_dataloader.batch_sampler.start_iter))
-    if valid_dataloader is not None:
-        start_iter_val = (args.iteration // args.eval_interval) * \
-            args.eval_iters
-        valid_dataloader.batch_sampler.start_iter = start_iter_val % \
-            len(valid_dataloader)
-        print_rank_0('setting validation data start iteration to {}'.
-                     format(valid_dataloader.batch_sampler.start_iter))
    # Build iterators.
    if train_dataloader is not None:
        train_data_iterator = iter(train_dataloader)
...
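
The training changes above switch restart bookkeeping from iteration offsets to consumed-sample counters: each training step adds one global batch to args.consumed_train_samples, and each evaluation step does the same for args.consumed_valid_samples. For checkpoints written before these counters existed, they are reconstructed from the iteration count under a fixed-batch-size assumption. A small worked example of the arithmetic, with made-up numbers (data-parallel size 8, per-rank batch size 4, resuming at iteration 10000, eval every 1000 iterations for 100 eval iterations):

# Illustrative values only; none of them come from the merge request.
data_parallel_size = 8
batch_size = 4                                        # per data-parallel rank
global_batch_size = batch_size * data_parallel_size   # 32 samples per step

iteration = 10000
eval_interval = 1000
eval_iters = 100

# Backward compatibility: an old checkpoint only stores the iteration count,
# so the consumed-sample counters are rebuilt assuming a fixed batch size.
consumed_train_samples = iteration * global_batch_size
consumed_valid_samples = (iteration // eval_interval) * eval_iters * \
    global_batch_size

print(consumed_train_samples)   # 320000
print(consumed_valid_samples)   # 32000

These counters are then handed to build_pretraining_data_loader, so the samplers resume exactly where the previous run stopped instead of recomputing a start_iter offset per dataloader.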
@@ -24,7 +24,6 @@ from megatron import print_rank_0
from megatron import get_adlr_autoresume
from megatron import mpu
from megatron.checkpointing import save_checkpoint
-from megatron.data.samplers import DistributedBatchSampler
from megatron.fp16 import FP16_Optimizer

@@ -89,32 +88,6 @@ def check_adlr_autoresume_termination(iteration, model,
        sys.exit(0)

-def make_data_loader(dataset):
-    """Buld dataloader given an input dataset."""
-    if dataset is None:
-        return None
-    args = get_args()
-    # Data parallel arguments.
-    world_size = mpu.get_data_parallel_world_size()
-    rank = mpu.get_data_parallel_rank()
-    global_batch_size = args.batch_size * world_size
-    num_workers = args.num_workers
-    # Use a simple sampler with distributed batch sampler.
-    sampler = torch.utils.data.SequentialSampler(dataset)
-    batch_sampler = DistributedBatchSampler(sampler=sampler,
-                                            batch_size=global_batch_size,
-                                            drop_last=True,
-                                            rank=rank,
-                                            world_size=world_size)
-    # Torch dataloader.
-    return torch.utils.data.DataLoader(dataset,
-                                       batch_sampler=batch_sampler,
-                                       num_workers=num_workers,
-                                       pin_memory=True)

def get_ltor_masks_and_position_ids(data,
                                    eod_token,
                                    reset_position_ids,
...
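
The deleted make_data_loader above wrapped a SequentialSampler in a DistributedBatchSampler and relied on the now-removed start_iter shifting to resume. Its replacement, build_pretraining_data_loader in megatron.data.data_loaders, is not part of this excerpt, so the sketch below is only a rough guess at the general idea (a sampler driven by an absolute consumed-sample count); every name and detail is hypothetical rather than the actual Megatron implementation:

import torch

class ResumableSequentialSampler(torch.utils.data.Sampler):
    """Hypothetical sketch: yield per-rank index batches starting from an
    absolute offset of already-consumed samples."""

    def __init__(self, total_samples, consumed_samples, batch_size,
                 rank, world_size):
        self.total_samples = total_samples
        self.consumed_samples = consumed_samples
        self.batch_size = batch_size          # per data-parallel rank
        self.rank = rank
        self.world_size = world_size

    def __iter__(self):
        batch = []
        # Resume from wherever previous runs left off (no epoch wrap here).
        for idx in range(self.consumed_samples, self.total_samples):
            batch.append(idx)
            if len(batch) == self.batch_size * self.world_size:
                # Each rank keeps a contiguous slice of the global batch.
                start = self.rank * self.batch_size
                yield batch[start:start + self.batch_size]
                batch = []

def build_loader(dataset, consumed_samples, batch_size, rank, world_size,
                 num_workers=2):
    """Hypothetical stand-in for build_pretraining_data_loader."""
    if dataset is None:
        return None
    sampler = ResumableSequentialSampler(len(dataset), consumed_samples,
                                         batch_size, rank, world_size)
    return torch.utils.data.DataLoader(dataset,
                                       batch_sampler=sampler,
                                       num_workers=num_workers,
                                       pin_memory=True)

Keying the resume point to consumed samples rather than iterations keeps it valid even if the batch size changes between runs.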