Commit 982f181a authored by erenup

Merge remote-tracking branch 'origin/master' into run_multiple_choice_add_doc

parents 603b470a 84b9d1c4
@@ -64,13 +64,14 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
@lru_cache()
def bytes_to_unicode():
"""
Returns a list of utf-8 bytes and a mapping to unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
_chr = unichr if sys.version_info[0] == 2 else chr
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
@@ -99,7 +100,10 @@ def get_pairs(word):
class GPT2Tokenizer(PreTrainedTokenizer):
"""
GPT-2 BPE tokenizer. Peculiarities:
- Byte-level Byte-Pair-Encoding
- Requires a space to start the input string => will add a space if there isn't one.
As a consequence, this tokenizer's `encode` and `decode` methods will not conserve
the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) == " Hello"`
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
@@ -111,11 +115,11 @@ class GPT2Tokenizer(PreTrainedTokenizer):
self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_data]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
@@ -171,12 +175,13 @@ class GPT2Tokenizer(PreTrainedTokenizer):
def _tokenize(self, text):
""" Tokenize a string. """
text = ' ' + text # GPT-2 (and RoBERTa) tokenizers need at least one space to begin the sentence with.
bpe_tokens = []
for token in re.findall(self.pat, text):
if sys.version_info[0] == 2:
token = ''.join(self.byte_encoder[ord(b)] for b in token) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
else:
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
return bpe_tokens
@@ -216,4 +221,4 @@ class GPT2Tokenizer(PreTrainedTokenizer):
writer.write(' '.join(bpe_tokens) + u'\n')
index += 1
return vocab_file, merge_file
\ No newline at end of file
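The docstring change above makes the space-prefix behaviour explicit; here is a minimal usage sketch of what it implies for an encode/decode round trip (the 'gpt2' pretrained shortcut is assumed to be available):

```python
from pytorch_transformers import GPT2Tokenizer

# 'gpt2' shortcut name assumed; any GPT-2 vocab/merges pair would do.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

ids = tokenizer.encode("Hello")   # a leading space is prepended before byte-level BPE
text = tokenizer.decode(ids)      # bytes are mapped back through byte_decoder
print(repr(text))                 # per the docstring above: ' Hello'
```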
@@ -23,8 +23,7 @@ import os
import regex as re
from io import open
from .tokenization_gpt2 import GPT2Tokenizer
try:
from functools import lru_cache
@@ -63,9 +62,13 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
}
class RobertaTokenizer(GPT2Tokenizer):
"""
RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:
- Byte-level Byte-Pair-Encoding
- Requires a space to start the input string => will add a space if there isn't one.
As a consequence, this tokenizer's `encode` and `decode` methods will not conserve
the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) == " Hello"`
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
@@ -73,132 +76,23 @@ class RobertaTokenizer(PreTrainedTokenizer):
def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>",
cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs):
super(RobertaTokenizer, self).__init__(vocab_file=vocab_file, merges_file=merges_file, errors=errors,
bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
mask_token=mask_token, **kwargs)
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_data]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
@property
def vocab_size(self):
return len(self.encoder)
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except:
new_word.extend(word[i:])
break
if word[i] == first and i < len(word)-1 and word[i+1] == second:
new_word.append(first+second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = ' '.join(word)
self.cache[token] = word
return word
def _tokenize(self, text):
""" Tokenize a string. """
bpe_tokens = []
for token in re.findall(self.pat, text):
if sys.version_info[0] == 2:
token = ''.join(self.byte_encoder[ord(b)] for b in token)
else:
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
return bpe_tokens
def _convert_token_to_id(self, token):
""" Converts a token (str/unicode) in an id using the vocab. """
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
return self.decoder.get(index)
def convert_tokens_to_string(self, tokens):
""" Converts a sequence of tokens (string) in a single string. """
text = ''.join(tokens)
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
return text
def add_special_tokens_single_sentence(self, token_ids):
"""
Adds special tokens to a sequence for sequence classification tasks.
A RoBERTa sequence has the following format: <s> X </s>
"""
return [self.cls_token_id] + token_ids + [self.sep_token_id]
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
"""
Adds special tokens to a sequence pair for sequence classification tasks.
A RoBERTa sequence pair has the following format: <s> A </s></s> B </s>
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def save_vocabulary(self, save_directory):
"""Save the tokenizer vocabulary and merge files to a directory."""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
with open(vocab_file, 'w', encoding='utf-8') as f:
f.write(json.dumps(self.encoder, ensure_ascii=False))
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write(u'#version: 0.2\n')
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!".format(merge_file))
index = token_index
writer.write(' '.join(bpe_tokens) + u'\n')
index += 1
return vocab_file, merge_file
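Since RobertaTokenizer now inherits everything but the special-token logic from GPT2Tokenizer, here is a short sketch of the resulting pair format (the 'roberta-base' shortcut name is assumed):

```python
from pytorch_transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')  # assumed shortcut name

ids_a = tokenizer.encode("Hello world")
ids_b = tokenizer.encode("How are you?")

# <s> A </s></s> B </s>, built from cls_token_id / sep_token_id as in the methods above
pair_ids = tokenizer.add_special_tokens_sentences_pair(ids_a, ids_b)
print(tokenizer.convert_ids_to_tokens(pair_ids))
```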
@@ -95,7 +95,8 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
# in a library like ours, at all.
vocab_dict = torch.load(pretrained_vocab_file)
for key, value in vocab_dict.items():
if key not in self.__dict__:
self.__dict__[key] = value
if vocab_file is not None:
self.build_vocab()
......
@@ -20,6 +20,7 @@ import logging
import os
import json
import six
import copy
from io import open
from .file_utils import cached_path
@@ -28,6 +29,7 @@ logger = logging.getLogger(__name__)
SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json'
ADDED_TOKENS_FILE = 'added_tokens.json'
TOKENIZER_CONFIG_FILE = 'tokenizer_config.json'
class PreTrainedTokenizer(object):
""" Base class for all tokenizers.
@@ -40,27 +42,29 @@ class PreTrainedTokenizer(object):
- ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string).
- ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file.
- ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size.
- ``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, a dictionary of specific arguments to pass to the ``__init__`` method of the tokenizer class for this pretrained model when loading the tokenizer with the ``from_pretrained()`` method.
Parameters:
- ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token`` and ``self.bos_token_id``
- ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token`` and ``self.eos_token_id``
- ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token`` and ``self.unk_token_id``
- ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token`` and ``self.sep_token_id``
- ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token`` and ``self.pad_token_id``
- ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token`` and ``self.cls_token_id``
- ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token`` and ``self.mask_token_id``
- ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensures they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids``
"""
vocab_files_names = {}
pretrained_vocab_files_map = {}
pretrained_init_configuration = {}
max_model_input_sizes = {}
SPECIAL_TOKENS_ATTRIBUTES = ["bos_token", "eos_token", "unk_token", "sep_token",
@@ -155,6 +159,46 @@ class PreTrainedTokenizer(object):
def additional_special_tokens(self, value):
self._additional_special_tokens = value
@property
def bos_token_id(self):
""" Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set. """
return self.convert_tokens_to_ids(self.bos_token)
@property
def eos_token_id(self):
""" Id of the end of sentence token in the vocabulary. Log an error if used while not having been set. """
return self.convert_tokens_to_ids(self.eos_token)
@property
def unk_token_id(self):
""" Id of the unknown token in the vocabulary. Log an error if used while not having been set. """
return self.convert_tokens_to_ids(self.unk_token)
@property
def sep_token_id(self):
""" Id of the separation token in the vocabulary. E.g. separate context and query in an input sequence. Log an error if used while not having been set. """
return self.convert_tokens_to_ids(self.sep_token)
@property
def pad_token_id(self):
""" Id of the padding token in the vocabulary. Log an error if used while not having been set. """
return self.convert_tokens_to_ids(self.pad_token)
@property
def cls_token_id(self):
""" Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """
return self.convert_tokens_to_ids(self.cls_token)
@property
def mask_token_id(self):
""" Id of the mask token in the vocabulary. E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """
return self.convert_tokens_to_ids(self.mask_token)
@property
def additional_special_tokens_ids(self):
""" Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. """
return self.convert_tokens_to_ids(self.additional_special_tokens)
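A quick illustration of these new convenience properties (a BERT shortcut such as 'bert-base-uncased' is assumed); each property simply runs the corresponding special token through convert_tokens_to_ids:

```python
from pytorch_transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # assumed shortcut name

print(tokenizer.cls_token, tokenizer.cls_token_id)  # '[CLS]' and its vocabulary id
print(tokenizer.sep_token, tokenizer.sep_token_id)  # '[SEP]' and its vocabulary id
assert tokenizer.cls_token_id == tokenizer.convert_tokens_to_ids(tokenizer.cls_token)
```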
def __init__(self, max_len=None, **kwargs):
self._bos_token = None
self._eos_token = None
@@ -166,12 +210,15 @@ class PreTrainedTokenizer(object):
self._additional_special_tokens = []
self.max_len = max_len if max_len is not None else int(1e12)
self.max_len_single_sentence = self.max_len
self.max_len_sentences_pair = self.max_len
# Added tokens
self.added_tokens_encoder = {}
self.added_tokens_decoder = {}
# inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
self.init_inputs = ()
self.init_kwargs = {}
for key, value in kwargs.items():
if key in self.SPECIAL_TOKENS_ATTRIBUTES:
if key == 'additional_special_tokens':
@@ -231,17 +278,20 @@ class PreTrainedTokenizer(object):
@classmethod
def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
cache_dir = kwargs.pop('cache_dir', None)
force_download = kwargs.pop('force_download', False)
proxies = kwargs.pop('proxies', None)
s3_models = list(cls.max_model_input_sizes.keys())
vocab_files = {}
init_configuration = {}
if pretrained_model_name_or_path in s3_models:
# Get the vocabulary from AWS S3 bucket
for file_id, map_list in cls.pretrained_vocab_files_map.items():
vocab_files[file_id] = map_list[pretrained_model_name_or_path]
if cls.pretrained_init_configuration and pretrained_model_name_or_path in cls.pretrained_init_configuration:
init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path]
else:
# Get the vocabulary from local files
logger.info(
@@ -264,15 +314,17 @@ class PreTrainedTokenizer(object):
vocab_files[file_id] = full_file_name
# Look for the additional tokens files
additional_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE,
'tokenizer_config_file': TOKENIZER_CONFIG_FILE,
}
# If a path to a file was provided, get the parent directory
saved_directory = pretrained_model_name_or_path
if os.path.exists(saved_directory) and not os.path.isdir(saved_directory):
saved_directory = os.path.dirname(saved_directory)
for file_id, file_name in additional_files_names.items():
full_file_name = os.path.join(saved_directory, file_name)
if not os.path.exists(full_file_name):
logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
@@ -315,28 +367,46 @@ class PreTrainedTokenizer(object):
logger.info("loading file {} from cache at {}".format(
file_path, resolved_vocab_files[file_id]))
# Prepare tokenizer initialization kwargs
# Did we save some inputs and kwargs to reload?
tokenizer_config_file = resolved_vocab_files.pop('tokenizer_config_file', None)
if tokenizer_config_file is not None:
init_kwargs = json.load(open(tokenizer_config_file, encoding="utf-8"))
saved_init_inputs = init_kwargs.pop('init_inputs', ())
if not init_inputs:
init_inputs = saved_init_inputs
else:
init_kwargs = init_configuration
# Update with newly provided kwargs
init_kwargs.update(kwargs)
# Set max length if needed
if pretrained_model_name_or_path in cls.max_model_input_sizes:
# if we're using a pretrained model, ensure the tokenizer
# won't index sequences longer than the number of positional embeddings
max_len = cls.max_model_input_sizes[pretrained_model_name_or_path]
if max_len is not None and isinstance(max_len, (int, float)):
init_kwargs['max_len'] = min(init_kwargs.get('max_len', int(1e12)), max_len)
# Merge resolved_vocab_files arguments in init_kwargs.
added_tokens_file = resolved_vocab_files.pop('added_tokens_file', None)
special_tokens_map_file = resolved_vocab_files.pop('special_tokens_map_file', None)
for args_name, file_path in resolved_vocab_files.items():
if args_name not in init_kwargs:
init_kwargs[args_name] = file_path
if special_tokens_map_file is not None:
special_tokens_map = json.load(open(special_tokens_map_file, encoding="utf-8"))
for key, value in special_tokens_map.items():
if key not in init_kwargs:
init_kwargs[key] = value
# Instantiate tokenizer.
tokenizer = cls(*init_inputs, **init_kwargs)
# Save inputs and kwargs for saving and re-loading with ``save_pretrained``
tokenizer.init_inputs = init_inputs
tokenizer.init_kwargs = init_kwargs
# Add supplementary tokens.
if added_tokens_file is not None:
@@ -349,8 +419,13 @@ class PreTrainedTokenizer(object):
def save_pretrained(self, save_directory):
""" Save the tokenizer vocabulary files together with:
- added tokens,
- special-tokens-to-class-attributes-mapping,
- tokenizer instantiation positional and keyword inputs (e.g. do_lower_case for Bert).
This won't save modifications (other than added tokens and special tokens mapping) you may have
applied to the tokenizer after the instantiation (e.g. modifying tokenizer.do_lower_case after creation).
This method makes sure the full tokenizer can then be re-loaded using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
"""
@@ -360,6 +435,15 @@ class PreTrainedTokenizer(object):
special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE)
added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE)
tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE)
tokenizer_config = copy.deepcopy(self.init_kwargs)
tokenizer_config['init_inputs'] = copy.deepcopy(self.init_inputs)
for file_id in self.vocab_files_names.keys():
tokenizer_config.pop(file_id, None)
with open(tokenizer_config_file, 'w', encoding='utf-8') as f:
f.write(json.dumps(tokenizer_config, ensure_ascii=False))
with open(special_tokens_map_file, 'w', encoding='utf-8') as f:
f.write(json.dumps(self.special_tokens_map, ensure_ascii=False))
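A sketch of the round trip this enables: the instantiation kwargs (e.g. do_lower_case) are now written to tokenizer_config.json and restored by from_pretrained; the shortcut name and target directory below are illustrative:

```python
import os
from pytorch_transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

save_dir = './my_tokenizer'          # illustrative path
os.makedirs(save_dir, exist_ok=True)
tokenizer.save_pretrained(save_dir)  # writes vocab, special_tokens_map.json,
                                     # added_tokens.json and tokenizer_config.json

reloaded = BertTokenizer.from_pretrained(save_dir)
print(reloaded.init_kwargs.get('do_lower_case'))  # False, restored from tokenizer_config.json
```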
@@ -441,6 +525,13 @@ class PreTrainedTokenizer(object):
to class attributes. If special tokens are NOT in the vocabulary, they are added
to it (indexed starting from the last index of the current vocabulary).
Using `add_special_tokens` will ensure your special tokens can be used in several ways:
- special tokens are carefully handled by the tokenizer (they are never split)
- you can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This makes it easy to develop model-agnostic training and fine-tuning scripts.
When possible, special tokens are already registered for provided pretrained models (ex: BertTokenizer cls_token is already registered to be '[CLS]' and XLM's one is also registered to be '</s>')
Args:
special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes:
[``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``,
@@ -546,6 +637,9 @@ class PreTrainedTokenizer(object):
""" Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id
(resp. a sequence of ids), using the vocabulary.
"""
if tokens is None:
return None
if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)):
return self._convert_token_to_id_with_added_voc(tokens)
@@ -559,6 +653,9 @@ class PreTrainedTokenizer(object):
return ids
def _convert_token_to_id_with_added_voc(self, token):
if token is None:
return None
if token in self.added_tokens_encoder:
return self.added_tokens_encoder[token]
return self._convert_token_to_id(token)
@@ -566,7 +663,7 @@ class PreTrainedTokenizer(object):
def _convert_token_to_id(self, token):
raise NotImplementedError
def encode(self, text, text_pair=None, add_special_tokens=False, **kwargs):
""" """
Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary. Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
...@@ -577,15 +674,16 @@ class PreTrainedTokenizer(object): ...@@ -577,15 +674,16 @@ class PreTrainedTokenizer(object):
text_pair: Optional second sequence to be encoded. text_pair: Optional second sequence to be encoded.
add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
to their model. to their model.
**kwargs: passed to the `self.tokenize()` method
""" """
if text_pair is None: if text_pair is None:
if add_special_tokens: if add_special_tokens:
return self.add_special_tokens_single_sentence(self.convert_tokens_to_ids(self.tokenize(text))) return self.add_special_tokens_single_sentence(self.convert_tokens_to_ids(self.tokenize(text, **kwargs)))
else: else:
return self.convert_tokens_to_ids(self.tokenize(text)) return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text)] first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text, **kwargs)]
second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair)] second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair, **kwargs)]
if add_special_tokens: if add_special_tokens:
return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens) return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens)
...@@ -614,7 +712,7 @@ class PreTrainedTokenizer(object): ...@@ -614,7 +712,7 @@ class PreTrainedTokenizer(object):
return self._convert_id_to_token(ids)
tokens = []
for index in ids:
if skip_special_tokens and index in self.all_special_ids:
continue
if index in self.added_tokens_decoder:
tokens.append(self.added_tokens_decoder[index])
@@ -639,7 +737,25 @@ class PreTrainedTokenizer(object):
Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
"""
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
# To avoid mixing byte-level and unicode for byte-level BPE
# we need to build the string separately for added tokens and byte-level tokens
# cf. https://github.com/huggingface/pytorch-transformers/issues/1133
sub_texts = []
current_sub_text = []
for token in filtered_tokens:
if skip_special_tokens and token in self.all_special_ids:
continue
if token in self.added_tokens_encoder:
if current_sub_text:
sub_texts.append(self.convert_tokens_to_string(current_sub_text))
current_sub_text = []
sub_texts.append(" " + token)
else:
current_sub_text.append(token)
if current_sub_text:
sub_texts.append(self.convert_tokens_to_string(current_sub_text))
text = ''.join(sub_texts)
if self._sep_token is not None and self._sep_token in text:
text = text.replace(self._cls_token, self._sep_token)
@@ -676,7 +792,7 @@ class PreTrainedTokenizer(object):
all_toks = []
set_attr = self.special_tokens_map
for attr_value in set_attr.values():
all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value])
all_toks = list(set(all_toks))
return all_toks
......
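The new decode logic keeps added tokens and byte-level BPE spans in separate sub-strings (cf. issue #1133 referenced above). A hypothetical sketch of the behaviour, using a purely illustrative added token:

```python
from pytorch_transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')  # assumed shortcut name
tokenizer.add_tokens(['<new_token>'])              # '<new_token>' is illustrative

ids = tokenizer.encode("Hello <new_token> world")
# Added tokens are decoded as plain strings, byte-level tokens through byte_decoder,
# and the two kinds are concatenated without mixing the two alphabets.
print(tokenizer.decode(ids))
```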
@@ -20,8 +20,12 @@ import json
import logging
import os
import re
import sys
import unicodedata
from io import open
import sacremoses as sm
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_bert import BasicTokenizer
@@ -43,6 +47,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-vocab.json",
'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-vocab.json",
'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-vocab.json",
'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json",
'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-vocab.json",
},
'merges_file':
{
@@ -54,6 +60,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-merges.txt",
'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt",
'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt",
'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-merges.txt",
'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-merges.txt",
},
}
@@ -66,6 +74,342 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'xlm-mlm-xnli15-1024': 512,
'xlm-clm-enfr-1024': 512,
'xlm-clm-ende-1024': 512,
'xlm-mlm-17-1280': 512,
'xlm-mlm-100-1280': 512,
}
PRETRAINED_INIT_CONFIGURATION = {
'xlm-mlm-en-2048': {"do_lowercase_and_remove_accent": True},
'xlm-mlm-ende-1024': { "do_lowercase_and_remove_accent": True,
"id2lang": { "0": "de",
"1": "en"},
"lang2id": { "de": 0,
"en": 1 }},
'xlm-mlm-enfr-1024': { "do_lowercase_and_remove_accent": True,
"id2lang": { "0": "en",
"1": "fr"},
"lang2id": { "en": 0,
"fr": 1 }},
'xlm-mlm-enro-1024': { "do_lowercase_and_remove_accent": True,
"id2lang": { "0": "en",
"1": "ro"},
"lang2id": { "en": 0,
"ro": 1 }},
'xlm-mlm-tlm-xnli15-1024': { "do_lowercase_and_remove_accent": True,
"id2lang": { "0": "ar",
"1": "bg",
"2": "de",
"3": "el",
"4": "en",
"5": "es",
"6": "fr",
"7": "hi",
"8": "ru",
"9": "sw",
"10": "th",
"11": "tr",
"12": "ur",
"13": "vi",
"14": "zh"},
"lang2id": { "ar": 0,
"bg": 1,
"de": 2,
"el": 3,
"en": 4,
"es": 5,
"fr": 6,
"hi": 7,
"ru": 8,
"sw": 9,
"th": 10,
"tr": 11,
"ur": 12,
"vi": 13,
"zh": 14 }},
'xlm-mlm-xnli15-1024': { "do_lowercase_and_remove_accent": True,
"id2lang": { "0": "ar",
"1": "bg",
"2": "de",
"3": "el",
"4": "en",
"5": "es",
"6": "fr",
"7": "hi",
"8": "ru",
"9": "sw",
"10": "th",
"11": "tr",
"12": "ur",
"13": "vi",
"14": "zh"},
"lang2id": { "ar": 0,
"bg": 1,
"de": 2,
"el": 3,
"en": 4,
"es": 5,
"fr": 6,
"hi": 7,
"ru": 8,
"sw": 9,
"th": 10,
"tr": 11,
"ur": 12,
"vi": 13,
"zh": 14 }},
'xlm-clm-enfr-1024': { "do_lowercase_and_remove_accent": True,
"id2lang": { "0": "en",
"1": "fr"},
"lang2id": { "en": 0,
"fr": 1 }},
'xlm-clm-ende-1024': { "do_lowercase_and_remove_accent": True,
"id2lang": { "0": "de",
"1": "en"},
"lang2id": { "de": 0,
"en": 1 }},
'xlm-mlm-17-1280': {"do_lowercase_and_remove_accent": False,
"id2lang": {
"0": "ar",
"1": "de",
"2": "en",
"3": "es",
"4": "fr",
"5": "hi",
"6": "it",
"7": "ja",
"8": "ko",
"9": "nl",
"10": "pl",
"11": "pt",
"12": "ru",
"13": "sv",
"14": "tr",
"15": "vi",
"16": "zh"
},
"lang2id": {
"ar": 0,
"de": 1,
"en": 2,
"es": 3,
"fr": 4,
"hi": 5,
"it": 6,
"ja": 7,
"ko": 8,
"nl": 9,
"pl": 10,
"pt": 11,
"ru": 12,
"sv": 13,
"tr": 14,
"vi": 15,
"zh": 16}},
'xlm-mlm-100-1280': {"do_lowercase_and_remove_accent": False,
"id2lang": {
"0": "af",
"1": "als",
"2": "am",
"3": "an",
"4": "ang",
"5": "ar",
"6": "arz",
"7": "ast",
"8": "az",
"9": "bar",
"10": "be",
"11": "bg",
"12": "bn",
"13": "br",
"14": "bs",
"15": "ca",
"16": "ceb",
"17": "ckb",
"18": "cs",
"19": "cy",
"20": "da",
"21": "de",
"22": "el",
"23": "en",
"24": "eo",
"25": "es",
"26": "et",
"27": "eu",
"28": "fa",
"29": "fi",
"30": "fr",
"31": "fy",
"32": "ga",
"33": "gan",
"34": "gl",
"35": "gu",
"36": "he",
"37": "hi",
"38": "hr",
"39": "hu",
"40": "hy",
"41": "ia",
"42": "id",
"43": "is",
"44": "it",
"45": "ja",
"46": "jv",
"47": "ka",
"48": "kk",
"49": "kn",
"50": "ko",
"51": "ku",
"52": "la",
"53": "lb",
"54": "lt",
"55": "lv",
"56": "mk",
"57": "ml",
"58": "mn",
"59": "mr",
"60": "ms",
"61": "my",
"62": "nds",
"63": "ne",
"64": "nl",
"65": "nn",
"66": "no",
"67": "oc",
"68": "pl",
"69": "pt",
"70": "ro",
"71": "ru",
"72": "scn",
"73": "sco",
"74": "sh",
"75": "si",
"76": "simple",
"77": "sk",
"78": "sl",
"79": "sq",
"80": "sr",
"81": "sv",
"82": "sw",
"83": "ta",
"84": "te",
"85": "th",
"86": "tl",
"87": "tr",
"88": "tt",
"89": "uk",
"90": "ur",
"91": "uz",
"92": "vi",
"93": "war",
"94": "wuu",
"95": "yi",
"96": "zh",
"97": "zh_classical",
"98": "zh_min_nan",
"99": "zh_yue"
},
"lang2id": {
"af": 0,
"als": 1,
"am": 2,
"an": 3,
"ang": 4,
"ar": 5,
"arz": 6,
"ast": 7,
"az": 8,
"bar": 9,
"be": 10,
"bg": 11,
"bn": 12,
"br": 13,
"bs": 14,
"ca": 15,
"ceb": 16,
"ckb": 17,
"cs": 18,
"cy": 19,
"da": 20,
"de": 21,
"el": 22,
"en": 23,
"eo": 24,
"es": 25,
"et": 26,
"eu": 27,
"fa": 28,
"fi": 29,
"fr": 30,
"fy": 31,
"ga": 32,
"gan": 33,
"gl": 34,
"gu": 35,
"he": 36,
"hi": 37,
"hr": 38,
"hu": 39,
"hy": 40,
"ia": 41,
"id": 42,
"is": 43,
"it": 44,
"ja": 45,
"jv": 46,
"ka": 47,
"kk": 48,
"kn": 49,
"ko": 50,
"ku": 51,
"la": 52,
"lb": 53,
"lt": 54,
"lv": 55,
"mk": 56,
"ml": 57,
"mn": 58,
"mr": 59,
"ms": 60,
"my": 61,
"nds": 62,
"ne": 63,
"nl": 64,
"nn": 65,
"no": 66,
"oc": 67,
"pl": 68,
"pt": 69,
"ro": 70,
"ru": 71,
"scn": 72,
"sco": 73,
"sh": 74,
"si": 75,
"simple": 76,
"sk": 77,
"sl": 78,
"sq": 79,
"sr": 80,
"sv": 81,
"sw": 82,
"ta": 83,
"te": 84,
"th": 85,
"tl": 86,
"tr": 87,
"tt": 88,
"uk": 89,
"ur": 90,
"uz": 91,
"vi": 92,
"war": 93,
"wuu": 94,
"yi": 95,
"zh": 96,
"zh_classical": 97,
"zh_min_nan": 98,
"zh_yue": 99
}},
}
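These per-checkpoint settings are what the new `pretrained_init_configuration` mechanism feeds into `from_pretrained`; a brief sketch using shortcut names from the table above:

```python
from pytorch_transformers import XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024')

# Filled automatically from PRETRAINED_INIT_CONFIGURATION above
print(tokenizer.lang2id)                         # {'en': 0, 'fr': 1}
print(tokenizer.do_lowercase_and_remove_accent)  # True for this checkpoint
```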
def get_pairs(word):
@@ -80,62 +424,145 @@ def get_pairs(word):
prev_char = char
return pairs
def lowercase_and_remove_accent(text):
"""
Lowercases and strips accents from a piece of text based on
https://github.com/facebookresearch/XLM/blob/master/tools/lowercase_and_remove_accent.py
"""
text = ' '.join(text)
text = text.lower()
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output).lower().split(' ')
def replace_unicode_punct(text):
'''
Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
'''
text = text.replace(',', ',')
text = re.sub(r'。\s*', '. ', text)
text = text.replace('、', ',')
text = text.replace('”', '"')
text = text.replace('“', '"')
text = text.replace('∶', ':')
text = text.replace(':', ':')
text = text.replace('?', '?')
text = text.replace('《', '"')
text = text.replace('》', '"')
text = text.replace(')', ')')
text = text.replace('!', '!')
text = text.replace('(', '(')
text = text.replace(';', ';')
text = text.replace('1', '"')
text = text.replace('」', '"')
text = text.replace('「', '"')
text = text.replace('0', '0')
text = text.replace('3', '3')
text = text.replace('2', '2')
text = text.replace('5', '5')
text = text.replace('6', '6')
text = text.replace('9', '9')
text = text.replace('7', '7')
text = text.replace('8', '8')
text = text.replace('4', '4')
text = re.sub(r'.\s*', '. ', text)
text = text.replace('~', '~')
text = text.replace('’', '\'')
text = text.replace('…', '...')
text = text.replace('━', '-')
text = text.replace('〈', '<')
text = text.replace('〉', '>')
text = text.replace('【', '[')
text = text.replace('】', ']')
text = text.replace('%', '%')
return text
def remove_non_printing_char(text):
'''
Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
'''
output = []
for char in text:
cat = unicodedata.category(char)
if cat.startswith('C'):
continue
output.append(char)
return "".join(output)
def romanian_preprocessing(text):
'''Sennrich's WMT16 scripts for Romanian preprocessing, used by model `xlm-mlm-enro-1024`'''
# https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/normalise-romanian.py
text = text.replace("\u015e", "\u0218").replace("\u015f", "\u0219")
text = text.replace("\u0162", "\u021a").replace("\u0163", "\u021b")
# https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/remove-diacritics.py
text = text.replace("\u0218", "S").replace("\u0219", "s") #s-comma
text = text.replace("\u021a", "T").replace("\u021b", "t") #t-comma
text = text.replace("\u0102", "A").replace("\u0103", "a")
text = text.replace("\u00C2", "A").replace("\u00E2", "a")
text = text.replace("\u00CE", "I").replace("\u00EE", "i")
return text
class XLMTokenizer(PreTrainedTokenizer):
"""
BPE tokenizer for XLM
- Moses preprocessing & tokenization for most supported languages
- Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP)
- (optionally) lower case & normalize all input text
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
(ex: "__classify__") to a vocabulary
- `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies)
- `id2lang` attribute does the reverse mapping if provided (automatically set for pretrained vocabularies)
- `do_lowercase_and_remove_accent` controls lower casing and accent removal (automatically set for pretrained vocabularies)
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, vocab_file, merges_file, unk_token="<unk>", bos_token="<s>",
sep_token="</s>", pad_token="<pad>", cls_token="</s>",
mask_token="<special1>", additional_special_tokens=["<special0>",
"<special1>", "<special2>", "<special3>", "<special4>", "<special5>",
"<special6>", "<special7>", "<special8>", "<special9>"],
lang2id=None, id2lang=None, do_lowercase_and_remove_accent=True,
**kwargs):
super(XLMTokenizer, self).__init__(unk_token=unk_token, bos_token=bos_token,
sep_token=sep_token, pad_token=pad_token,
cls_token=cls_token, mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
**kwargs)
# cache of sm.MosesPunctNormalizer instance
self.cache_moses_punct_normalizer = dict()
# cache of sm.MosesTokenizer instance
self.cache_moses_tokenizer = dict()
self.lang_with_custom_tokenizer = set(['zh', 'th', 'ja'])
# True for current supported model (v1.2.0), False for XLM-17 & 100
self.do_lowercase_and_remove_accent = do_lowercase_and_remove_accent
self.lang2id = lang2id
self.id2lang = id2lang
if lang2id is not None and id2lang is not None:
assert len(lang2id) == len(id2lang)
self.ja_word_tokenizer = None
self.zh_word_tokenizer = None
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
self.decoder = {v:k for k,v in self.encoder.items()}
@@ -144,6 +571,43 @@ class XLMTokenizer(PreTrainedTokenizer):
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
def moses_punct_norm(self, text, lang):
if lang not in self.cache_moses_punct_normalizer:
punct_normalizer = sm.MosesPunctNormalizer(lang=lang)
self.cache_moses_punct_normalizer[lang] = punct_normalizer
else:
punct_normalizer = self.cache_moses_punct_normalizer[lang]
return punct_normalizer.normalize(text)
def moses_tokenize(self, text, lang):
if lang not in self.cache_moses_tokenizer:
moses_tokenizer = sm.MosesTokenizer(lang=lang)
self.cache_moses_tokenizer[lang] = moses_tokenizer
else:
moses_tokenizer = self.cache_moses_tokenizer[lang]
return moses_tokenizer.tokenize(text, return_str=False, escape=False)
def moses_pipeline(self, text, lang):
text = replace_unicode_punct(text)
text = self.moses_punct_norm(text, lang)
text = remove_non_printing_char(text)
return text
def ja_tokenize(self, text):
if self.ja_word_tokenizer is None:
try:
import Mykytea
self.ja_word_tokenizer = Mykytea.Mykytea('-model %s/local/share/kytea/model.bin' % os.path.expanduser('~'))
except (AttributeError, ImportError) as e:
logger.error("Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper (https://github.com/chezou/Mykytea-python) with the following steps")
logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea")
logger.error("2. autoreconf -i")
logger.error("3. ./configure --prefix=$HOME/local")
logger.error("4. make && make install")
logger.error("5. pip install kytea")
raise e
return list(self.ja_word_tokenizer.getWS(text))
@property
def vocab_size(self):
return len(self.encoder)
@@ -191,19 +655,90 @@ class XLMTokenizer(PreTrainedTokenizer):
self.cache[token] = word
return word
def _tokenize(self, text, lang='en', bypass_tokenizer=False):
"""
Tokenize a string given a language code. For Chinese, Japanese and Thai, we use a language specific tokenizer. Otherwise, we use Moses.
Details of tokenization:
- [sacremoses](https://github.com/alvations/sacremoses): port of Moses
- Install with `pip install sacremoses`
- [pythainlp](https://github.com/PyThaiNLP/pythainlp): Thai tokenizer
- Install with `pip install pythainlp`
- [kytea](https://github.com/chezou/Mykytea-python): Japanese tokenizer, wrapper of [KyTea](https://github.com/neubig/kytea)
- Install with the following steps:
```
git clone git@github.com:neubig/kytea.git && cd kytea
autoreconf -i
./configure --prefix=$HOME/local
make && make install
pip install kytea
```
- [jieba](https://github.com/fxsjy/jieba): Chinese tokenizer *
- Install with `pip install jieba`
\* The original XLM used [Stanford Segmenter](https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip).
However, the wrapper (`nltk.tokenize.stanford_segmenter`) is slow due to JVM overhead, and it will be deprecated.
Jieba is a lot faster and pip-installable. Note there is some mismatch with the Stanford Segmenter. It should be fine
if you fine-tune the model with Chinese supervision. If you want the same exact behaviour, use the original XLM
[preprocessing script](https://github.com/facebookresearch/XLM/tree/master/tools) to tokenize the sentence externally,
and set `bypass_tokenizer=True` to bypass the tokenizer.
Args:
- lang: ISO language code (default = 'en') (string). The language should be one of the languages supported by the model. However, we don't enforce it.
- bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) (bool). If True, we only apply BPE.
Returns:
List of tokens.
"""
if lang and self.lang2id and lang not in self.lang2id:
logger.error("Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model.")
if bypass_tokenizer:
text = text.split()
elif lang not in self.lang_with_custom_tokenizer:
text = self.moses_pipeline(text, lang=lang)
# TODO: make sure we are using `xlm-mlm-enro-1024`, since XLM-100 doesn't have this step
if lang == 'ro':
text = romanian_preprocessing(text)
text = self.moses_tokenize(text, lang=lang)
elif lang == 'th':
text = self.moses_pipeline(text, lang=lang)
try:
if 'pythainlp' not in sys.modules:
from pythainlp.tokenize import word_tokenize as th_word_tokenize
else:
th_word_tokenize = sys.modules['pythainlp'].word_tokenize
except (AttributeError, ImportError) as e:
logger.error("Make sure you install PyThaiNLP (https://github.com/PyThaiNLP/pythainlp) with the following steps")
logger.error("1. pip install pythainlp")
raise e
text = th_word_tokenize(text)
elif lang == 'zh':
try:
if 'jieba' not in sys.modules:
import jieba
else:
jieba = sys.modules['jieba']
except (AttributeError, ImportError) as e:
logger.error("Make sure you install Jieba (https://github.com/fxsjy/jieba) with the following steps")
logger.error("1. pip install jieba")
raise e
text = ' '.join(jieba.cut(text))
text = self.moses_pipeline(text, lang=lang)
text = text.split()
elif lang == 'ja':
text = self.moses_pipeline(text, lang=lang)
text = self.ja_tokenize(text)
else:
raise ValueError('It should not reach here')
if self.do_lowercase_and_remove_accent and not bypass_tokenizer:
text = lowercase_and_remove_accent(text)
split_tokens = []
for token in text:
if token:
split_tokens.extend([t for t in self.bpe(token).split(' ')])
return split_tokens
def _convert_token_to_id(self, token):
@@ -224,15 +759,15 @@ class XLMTokenizer(PreTrainedTokenizer):
Adds special tokens to a sequence for sequence classification tasks.
An XLM sequence has the following format: [CLS] X [SEP]
"""
return [self.cls_token_id] + token_ids + [self.sep_token_id]
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
"""
Adds special tokens to a sequence pair for sequence classification tasks.
An XLM sequence pair has the following format: [CLS] A [SEP] B [SEP]
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
def save_vocabulary(self, save_directory):
......
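Putting the language-aware `_tokenize` together with the `**kwargs` pass-through added to `encode`, a usage sketch (sacremoses must be installed; the `lang` values should match the checkpoint's lang2id mapping):

```python
from pytorch_transformers import XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024')

# `lang` is forwarded from encode() through tokenize() down to _tokenize()
en_ids = tokenizer.encode("Hello, how are you?", lang='en')
fr_ids = tokenizer.encode("Bonjour, comment allez-vous ?", lang='fr')

# Externally pre-tokenized input can skip Moses entirely and only apply BPE
pre_ids = tokenizer.encode("bonjour comment allez - vous", lang='fr', bypass_tokenizer=True)
```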
@@ -61,7 +61,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, vocab_file,
do_lower_case=False, remove_space=True, keep_accents=False,
bos_token="<s>", eos_token="</s>", unk_token="<unk>", sep_token="<sep>",
pad_token="<pad>", cls_token="<cls>", mask_token="<mask>",
@@ -186,8 +186,8 @@ class XLNetTokenizer(PreTrainedTokenizer):
Adds special tokens to a sequence pair for sequence classification tasks.
An XLNet sequence pair has the following format: A [SEP] B [SEP][CLS]
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
return token_ids + sep + cls
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
@@ -195,8 +195,8 @@ class XLNetTokenizer(PreTrainedTokenizer):
Adds special tokens to a sequence for sequence classification tasks.
An XLNet sequence has the following format: X [SEP][CLS]
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
return token_ids_0 + sep + token_ids_1 + sep + cls
def save_vocabulary(self, save_directory):
......
@@ -9,4 +9,6 @@ requests
# For OpenAI GPT
regex
# For XLNet
sentencepiece
# For XLM
sacremoses
\ No newline at end of file
@@ -38,7 +38,7 @@ from setuptools import find_packages, setup
setup(
name="pytorch_transformers",
version="1.2.0",
author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors",
author_email="thomas@huggingface.co",
description="Repository of pre-trained NLP Transformer models: BERT & RoBERTa, GPT & GPT-2, Transformer-XL, XLNet and XLM",
@@ -55,7 +55,8 @@ setup(
'requests',
'tqdm',
'regex',
'sentencepiece',
'sacremoses'],
entry_points={
'console_scripts': [
"pytorch_transformers=pytorch_transformers.__main__:main",
......