Unverified commit 50e615f4 authored by Thomas Wolf, committed by GitHub

Merge branch 'master' into improved_testing

parents f8aace6b f7978490
@@ -222,6 +222,9 @@ class PreTrainedTokenizer(object):
         self._additional_special_tokens = []
 
         self.max_len = max_len if max_len is not None else int(1e12)
+        self.max_len_single_sentence = self.max_len
+        self.max_len_sentences_pair = self.max_len
+
         self.added_tokens_encoder = {}
         self.added_tokens_decoder = {}
@@ -349,7 +352,7 @@ class PreTrainedTokenizer(object):
                     resolved_vocab_files[file_id] = None
                 else:
                     resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
-        except EnvironmentError:
+        except EnvironmentError as e:
             if pretrained_model_name_or_path in s3_models:
                 logger.error("Couldn't reach server to download vocabulary.")
             else:
@@ -359,7 +362,7 @@ class PreTrainedTokenizer(object):
                     "at this path or url.".format(
                         pretrained_model_name_or_path, ', '.join(s3_models),
                         pretrained_model_name_or_path, str(vocab_files.keys())))
-            return None
+            raise e
 
         for file_id, file_path in vocab_files.items():
             if file_path == resolved_vocab_files[file_id]:
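With the two hunks above, a failed download or an unknown identifier now surfaces from `from_pretrained` as the original `EnvironmentError` instead of a silent `None` return. A minimal usage sketch of the new behavior; the `BertTokenizer` import and the `'bert-base-uncased'` shortcut name are illustrative, not part of this diff:

```python
# Sketch, assuming the pytorch_transformers package and the 'bert-base-uncased'
# shortcut name: with this change a failed lookup raises EnvironmentError
# rather than returning None.
from pytorch_transformers import BertTokenizer

try:
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
except EnvironmentError:
    # e.g. no network access and no cached vocabulary files for this name
    tokenizer = None
```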
@@ -653,10 +656,12 @@ class PreTrainedTokenizer(object):
         return first_sentence_tokens, second_sentence_tokens
 
     def add_special_tokens_single_sentence(self, token_ids):
-        raise NotImplementedError
+        logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
+        return token_ids
 
     def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
-        raise NotImplementedError
+        logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
+        return token_ids_0 + token_ids_1
 
     def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
         """ Converts a single index or a sequence of indices (integers) in a token "
@@ -699,9 +704,9 @@ class PreTrainedTokenizer(object):
         filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
         text = self.convert_tokens_to_string(filtered_tokens)
 
-        if self.sep_token is not None and self.sep_token in text:
-            text = text.replace(self.cls_token, self.sep_token)
-            split_text = list(filter(lambda sentence: len(sentence) > 0, text.split(self.sep_token)))
+        if self._sep_token is not None and self._sep_token in text:
+            text = text.replace(self._cls_token, self._sep_token)
+            split_text = list(filter(lambda sentence: len(sentence) > 0, text.split(self._sep_token)))
             if clean_up_tokenization_spaces:
                 clean_text = [self.clean_up_tokenization(text) for text in split_text]
                 return clean_text
......
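For tokenizers that define no special tokens, the base-class `add_special_tokens_single_sentence` and `add_special_tokens_sentences_pair` now log a warning and return the ids unchanged (or simply concatenated) instead of raising `NotImplementedError`. A small sketch of that fallback; using GPT-2 here is an assumption (a tokenizer that does not override the base-class methods), and the `'gpt2'` shortcut name is illustrative:

```python
# Sketch, assuming GPT2Tokenizer relies on the base-class fallbacks above.
from pytorch_transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
ids_a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Hello world"))
ids_b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("How are you?"))

# Logs a warning and returns the sequence unmodified.
single = tokenizer.add_special_tokens_single_sentence(ids_a)
assert single == ids_a

# Logs a warning and returns the plain concatenation of the two sequences.
pair = tokenizer.add_special_tokens_sentences_pair(ids_a, ids_b)
assert pair == ids_a + ids_b
```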
@@ -122,6 +122,10 @@ class XLMTokenizer(PreTrainedTokenizer):
                                            cls_token=cls_token, mask_token=mask_token,
                                            additional_special_tokens=additional_special_tokens,
                                            **kwargs)
+
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
+
         try:
             import ftfy
             from spacy.lang.en import English
......
@@ -71,6 +71,10 @@ class XLNetTokenizer(PreTrainedTokenizer):
                                              pad_token=pad_token, cls_token=cls_token,
                                              mask_token=mask_token, additional_special_tokens=
                                              additional_special_tokens, **kwargs)
+
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
+
         try:
             import sentencepiece as spm
         except ImportError:
......
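The new `max_len_single_sentence` and `max_len_sentences_pair` attributes record how many positions the model-specific special tokens consume: for XLM and XLNet, two tokens wrap a single sequence and three a sequence pair, so the usable text length shrinks accordingly. A sketch of what the XLNet values mean in practice; the `'xlnet-base-cased'` shortcut name is illustrative:

```python
# Sketch, assuming the 'xlnet-base-cased' shortcut name is available.
from pytorch_transformers import XLNetTokenizer

tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# Longest inputs that still fit once the special tokens are appended:
assert tokenizer.max_len_single_sentence == tokenizer.max_len - 2  # A <sep> <cls>
assert tokenizer.max_len_sentences_pair == tokenizer.max_len - 3   # A <sep> B <sep> <cls>
```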