"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "bfeaae2235a8437ff1e80c9fc8c51b25ed21dcf5"
Commit 47d68534 authored by thomwolf

adding max_lengths for single sentences and sentence pairs

parent 90dcd8c0
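
The intended use of these new properties: truncate token lists to the per-tokenizer budget before adding special tokens, so the final sequence never exceeds max_len. A hedged sketch, not part of this commit; the import path and checkpoint name reflect the library at the time and are assumptions, while add_special_tokens_single_sentence is the helper visible in the diff below:

from pytorch_transformers import RobertaTokenizer  # assumed package name for this era

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
tokens = tokenizer.tokenize("some long input text")
tokens = tokens[:tokenizer.max_len_single_sentence]       # leave room for the special tokens
ids = tokenizer.convert_tokens_to_ids(tokens)
ids = tokenizer.add_special_tokens_single_sentence(ids)   # adds the reserved special tokens
assert len(ids) <= tokenizer.max_len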
@@ -139,6 +139,14 @@ class BertTokenizer(PreTrainedTokenizer):
                                                   tokenize_chinese_chars=tokenize_chinese_chars)
         self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
 
+    @property
+    def max_len_single_sentence(self):
+        return self.max_len - 2  # take into account special tokens
+
+    @property
+    def max_len_sentences_pair(self):
+        return self.max_len - 3  # take into account special tokens
+
     @property
     def vocab_size(self):
         return len(self.vocab)
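
Why -2 and -3 for BERT: a minimal, self-contained sketch (not part of this commit; token lists are hypothetical) counting the special tokens each input layout consumes.

# BERT formats inputs as [CLS] A [SEP] and [CLS] A [SEP] B [SEP].
a = ['hello', 'world']    # hypothetical wordpiece tokens for sentence A
b = ['good', 'morning']   # hypothetical wordpiece tokens for sentence B
single = ['[CLS]'] + a + ['[SEP]']
pair = ['[CLS]'] + a + ['[SEP]'] + b + ['[SEP]']
assert len(single) == len(a) + 2           # hence max_len_single_sentence = max_len - 2
assert len(pair) == len(a) + len(b) + 3    # hence max_len_sentences_pair = max_len - 3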
@@ -160,6 +160,14 @@ class RobertaTokenizer(PreTrainedTokenizer):
         text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
         return text
 
+    @property
+    def max_len_single_sentence(self):
+        return self.max_len - 2  # take into account special tokens
+
+    @property
+    def max_len_sentences_pair(self):
+        return self.max_len - 4  # take into account special tokens
+
     def add_special_tokens_single_sentence(self, token_ids):
         """
         Adds special tokens to a sequence for sequence classification tasks.
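
RoBERTa reserves one more token for pairs than BERT because its pair layout separates the two segments with a doubled </s></s>. A sketch under that assumption (hypothetical token lists):

# RoBERTa-style layouts: <s> A </s> and <s> A </s></s> B </s>.
a = ['hello', 'world']
b = ['good', 'morning']
single = ['<s>'] + a + ['</s>']                       # 2 special tokens
pair = ['<s>'] + a + ['</s>', '</s>'] + b + ['</s>']  # 4 special tokens
assert len(single) == len(a) + 2          # max_len_single_sentence = max_len - 2
assert len(pair) == len(a) + len(b) + 4   # max_len_sentences_pair = max_len - 4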
@@ -67,6 +67,14 @@ class PreTrainedTokenizer(object):
                                  "pad_token", "cls_token", "mask_token",
                                  "additional_special_tokens"]
 
+    @property
+    def max_len_single_sentence(self):
+        return self.max_len  # Default to max_len but can be smaller in specific tokenizers to take into account special tokens
+
+    @property
+    def max_len_sentences_pair(self):
+        return self.max_len  # Default to max_len but can be smaller in specific tokenizers to take into account special tokens
+
     @property
     def bos_token(self):
         """ Beginning of sentence token (string). Log an error if used while not having been set. """
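
The base class deliberately returns max_len unchanged, so tokenizers that add no special tokens need no override. A toy sketch of the override pattern this hunk establishes (illustrative class names, not library code):

class Tokenizer:                        # stand-in for PreTrainedTokenizer
    max_len = 512
    @property
    def max_len_single_sentence(self):
        return self.max_len             # default: no tokens reserved

class BertLike(Tokenizer):              # stand-in for BertTokenizer
    @property
    def max_len_single_sentence(self):
        return self.max_len - 2         # reserve room for [CLS] and [SEP]

assert Tokenizer().max_len_single_sentence == 512
assert BertLike().max_len_single_sentence == 510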
@@ -215,6 +215,14 @@ class XLMTokenizer(PreTrainedTokenizer):
         out_string = ''.join(tokens).replace('</w>', ' ').strip()
         return out_string
 
+    @property
+    def max_len_single_sentence(self):
+        return self.max_len - 2  # take into account special tokens
+
+    @property
+    def max_len_sentences_pair(self):
+        return self.max_len - 3  # take into account special tokens
+
     def add_special_tokens_single_sentence(self, token_ids):
         """
         Adds special tokens to a sequence for sequence classification tasks.
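
XLM's budget matches BERT's counts but with its own delimiters; a sketch assuming the <s> A </s> and <s> A </s> B </s> layouts (hypothetical token lists):

a = ['hello', 'world']
b = ['good', 'morning']
single = ['<s>'] + a + ['</s>']               # 2 special tokens
pair = ['<s>'] + a + ['</s>'] + b + ['</s>']  # 3 special tokens
assert len(single) == len(a) + 2          # max_len_single_sentence = max_len - 2
assert len(pair) == len(a) + len(b) + 3   # max_len_sentences_pair = max_len - 3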
@@ -177,6 +177,14 @@ class XLNetTokenizer(PreTrainedTokenizer):
         out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
         return out_string
 
+    @property
+    def max_len_single_sentence(self):
+        return self.max_len - 2  # take into account special tokens
+
+    @property
+    def max_len_sentences_pair(self):
+        return self.max_len - 3  # take into account special tokens
+
     def add_special_tokens_single_sentence(self, token_ids):
         """
         Adds special tokens to a sequence pair for sequence classification tasks.
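
XLNet appends its special tokens at the end of the sequence rather than wrapping it, but the arithmetic is the same as BERT's; a sketch assuming the A <sep> <cls> and A <sep> B <sep> <cls> layouts (hypothetical token lists):

a = ['hello', 'world']
b = ['good', 'morning']
single = a + ['<sep>', '<cls>']               # 2 special tokens
pair = a + ['<sep>'] + b + ['<sep>', '<cls>'] # 3 special tokens
assert len(single) == len(a) + 2          # max_len_single_sentence = max_len - 2
assert len(pair) == len(a) + len(b) + 3   # max_len_sentences_pair = max_len - 3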