Commit 47d68534 authored by thomwolf

adding max_lengths for single sentences and sentences pairs

parent 90dcd8c0
@@ -139,6 +139,14 @@ class BertTokenizer(PreTrainedTokenizer):
                                               tokenize_chinese_chars=tokenize_chinese_chars)
         self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
 
+    @property
+    def max_len_single_sentence(self):
+        return self.max_len - 2  # take into account special tokens
+
+    @property
+    def max_len_sentences_pair(self):
+        return self.max_len - 3  # take into account special tokens
+
     @property
     def vocab_size(self):
         return len(self.vocab)
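The BERT offsets come from how its inputs are wrapped: a single sequence becomes [CLS] A [SEP] (2 special tokens) and a pair becomes [CLS] A [SEP] B [SEP] (3). A minimal usage sketch, assuming the pytorch_transformers package name of this era and an invented input string:

from pytorch_transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

text = "a very long document " * 200  # hypothetical input
tokens = tokenizer.tokenize(text)

# [CLS] + tokens + [SEP] must fit within tokenizer.max_len (512 for bert-base),
# so at most max_len - 2 = 510 content tokens can be kept.
tokens = tokens[:tokenizer.max_len_single_sentence]
assert len(tokens) + 2 <= tokenizer.max_len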
@@ -160,6 +160,14 @@ class RobertaTokenizer(PreTrainedTokenizer):
         text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
         return text
 
+    @property
+    def max_len_single_sentence(self):
+        return self.max_len - 2  # take into account special tokens
+
+    @property
+    def max_len_sentences_pair(self):
+        return self.max_len - 4  # take into account special tokens
+
     def add_special_tokens_single_sentence(self, token_ids):
         """
         Adds special tokens to a sequence for sequence classification tasks.
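RoBERTa reserves one more slot for pairs than BERT: a single sequence is <s> A </s> (2 special tokens), but a pair is <s> A </s></s> B </s> with a doubled separator (4 special tokens), hence max_len - 4. A sketch of the resulting budget; the pair_budget helper below is illustrative, not part of the library:

# Illustrative only: how many content tokens a RoBERTa pair can hold.
def pair_budget(max_len, num_special_tokens=4):
    content = max_len - num_special_tokens  # tokens left for both segments
    return content // 2, content - content // 2  # naive even split

len_a, len_b = pair_budget(512)  # RoBERTa's usual max_len
print(len_a, len_b)  # 254 254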
@@ -67,6 +67,14 @@ class PreTrainedTokenizer(object):
                                  "pad_token", "cls_token", "mask_token",
                                  "additional_special_tokens"]
 
+    @property
+    def max_len_single_sentence(self):
+        return self.max_len  # Default to max_len but can be smaller in specific tokenizers to take into account special tokens
+
+    @property
+    def max_len_sentences_pair(self):
+        return self.max_len  # Default to max_len but can be smaller in specific tokenizers to take into account special tokens
+
     @property
     def bos_token(self):
         """ Beginning of sentence token (string). Log an error if used while not having been set. """
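The base class returns the full max_len because PreTrainedTokenizer cannot know which special tokens a concrete model adds; each subclass overrides the two properties with its own offsets, as the other hunks in this commit do. A hypothetical subclass sketch (MyTokenizer and its token scheme are invented for illustration):

from pytorch_transformers import PreTrainedTokenizer

class MyTokenizer(PreTrainedTokenizer):
    # Imagined scheme: <bos> A <eos> for one sequence, <bos> A <sep> B <eos> for a pair.
    @property
    def max_len_single_sentence(self):
        return self.max_len - 2  # <bos> + <eos>

    @property
    def max_len_sentences_pair(self):
        return self.max_len - 3  # <bos> + <sep> + <eos>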
@@ -215,6 +215,14 @@ class XLMTokenizer(PreTrainedTokenizer):
         out_string = ''.join(tokens).replace('</w>', ' ').strip()
         return out_string
 
+    @property
+    def max_len_single_sentence(self):
+        return self.max_len - 2  # take into account special tokens
+
+    @property
+    def max_len_sentences_pair(self):
+        return self.max_len - 3  # take into account special tokens
+
     def add_special_tokens_single_sentence(self, token_ids):
         """
         Adds special tokens to a sequence for sequence classification tasks.
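XLM uses the same arithmetic as BERT with its own markers: <s> A </s> for a single sequence (2 special tokens) and <s> A </s> B </s> for a pair (3). A quick consistency check, assuming the xlm-mlm-en-2048 checkpoint is available:

from pytorch_transformers import XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("hello world"))
with_special = tokenizer.add_special_tokens_single_sentence(ids)

# Exactly 2 tokens were added, matching the reserved offset.
assert len(with_special) - len(ids) == tokenizer.max_len - tokenizer.max_len_single_sentence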
@@ -177,6 +177,14 @@ class XLNetTokenizer(PreTrainedTokenizer):
         out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
         return out_string
 
+    @property
+    def max_len_single_sentence(self):
+        return self.max_len - 2  # take into account special tokens
+
+    @property
+    def max_len_sentences_pair(self):
+        return self.max_len - 3  # take into account special tokens
+
     def add_special_tokens_single_sentence(self, token_ids):
         """
         Adds special tokens to a sequence pair for sequence classification tasks.
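XLNet reserves the same counts (2 and 3) but appends its markers at the end of the sequence: A <sep> <cls> for a single input and A <sep> B <sep> <cls> for a pair. A hedged sketch of pair truncation under that budget; the trim-the-longer-segment loop is illustrative, not library code:

# Shrink the longer segment first until both fit alongside XLNet's 3 special tokens.
def truncate_pair(tokens_a, tokens_b, max_len_sentences_pair):
    while len(tokens_a) + len(tokens_b) > max_len_sentences_pair:
        longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        longer.pop()
    return tokens_a, tokens_b

a, b = truncate_pair(list(range(400)), list(range(200)), 509)  # 512 - 3
assert len(a) + len(b) == 509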