Commit 47d68534 authored by thomwolf

adding max_lengths for single sentences and sentences pairs

parent 90dcd8c0
@@ -139,6 +139,14 @@ class BertTokenizer(PreTrainedTokenizer):
                                               tokenize_chinese_chars=tokenize_chinese_chars)
         self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
 
+    @property
+    def max_len_single_sentence(self):
+        return self.max_len - 2  # take into account special tokens
+
+    @property
+    def max_len_sentences_pair(self):
+        return self.max_len - 3  # take into account special tokens
+
     @property
     def vocab_size(self):
         return len(self.vocab)
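The BERT offsets come from how its inputs are wrapped: a single sequence becomes [CLS] A [SEP] (2 special tokens) and a pair becomes [CLS] A [SEP] B [SEP] (3). A minimal usage sketch, assuming the pytorch_transformers package name of this era and an invented input string:

from pytorch_transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

text = "a very long document " * 200  # hypothetical input
tokens = tokenizer.tokenize(text)

# [CLS] + tokens + [SEP] must fit within tokenizer.max_len (512 for bert-base),
# so at most max_len - 2 = 510 content tokens can be kept.
tokens = tokens[:tokenizer.max_len_single_sentence]
assert len(tokens) + 2 <= tokenizer.max_len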
@@ -160,6 +160,14 @@ class RobertaTokenizer(PreTrainedTokenizer):
         text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
         return text
 
+    @property
+    def max_len_single_sentence(self):
+        return self.max_len - 2  # take into account special tokens
+
+    @property
+    def max_len_sentences_pair(self):
+        return self.max_len - 4  # take into account special tokens
+
     def add_special_tokens_single_sentence(self, token_ids):
         """
         Adds special tokens to a sequence for sequence classification tasks.
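RoBERTa reserves one more slot for pairs than BERT: a single sequence is <s> A </s> (2 special tokens), but a pair is <s> A </s></s> B </s> with a doubled separator (4 special tokens), hence max_len - 4. A sketch of the resulting budget; the pair_budget helper below is illustrative, not part of the library:

# Illustrative only: how many content tokens a RoBERTa pair can hold.
def pair_budget(max_len, num_special_tokens=4):
    content = max_len - num_special_tokens  # tokens left for both segments
    return content // 2, content - content // 2  # naive even split

len_a, len_b = pair_budget(512)  # RoBERTa's usual max_len
print(len_a, len_b)  # 254 254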
@@ -67,6 +67,14 @@ class PreTrainedTokenizer(object):
                                  "pad_token", "cls_token", "mask_token",
                                  "additional_special_tokens"]
 
+    @property
+    def max_len_single_sentence(self):
+        return self.max_len  # Default to max_len but can be smaller in specific tokenizers to take into account special tokens
+
+    @property
+    def max_len_sentences_pair(self):
+        return self.max_len  # Default to max_len but can be smaller in specific tokenizers to take into account special tokens
+
     @property
     def bos_token(self):
         """ Beginning of sentence token (string). Log an error if used while not having been set. """
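The base class returns the full max_len because PreTrainedTokenizer cannot know which special tokens a concrete model adds; each subclass overrides the two properties with its own offsets, as the other hunks in this commit do. A hypothetical subclass sketch (MyTokenizer and its token scheme are invented for illustration):

from pytorch_transformers import PreTrainedTokenizer

class MyTokenizer(PreTrainedTokenizer):
    # Imagined scheme: <bos> A <eos> for one sequence, <bos> A <sep> B <eos> for a pair.
    @property
    def max_len_single_sentence(self):
        return self.max_len - 2  # <bos> + <eos>

    @property
    def max_len_sentences_pair(self):
        return self.max_len - 3  # <bos> + <sep> + <eos>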
@@ -215,6 +215,14 @@ class XLMTokenizer(PreTrainedTokenizer):
         out_string = ''.join(tokens).replace('</w>', ' ').strip()
         return out_string
 
+    @property
+    def max_len_single_sentence(self):
+        return self.max_len - 2  # take into account special tokens
+
+    @property
+    def max_len_sentences_pair(self):
+        return self.max_len - 3  # take into account special tokens
+
     def add_special_tokens_single_sentence(self, token_ids):
         """
         Adds special tokens to a sequence for sequence classification tasks.
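XLM uses the same arithmetic as BERT with its own markers: <s> A </s> for a single sequence (2 special tokens) and <s> A </s> B </s> for a pair (3). A quick consistency check, assuming the xlm-mlm-en-2048 checkpoint is available:

from pytorch_transformers import XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("hello world"))
with_special = tokenizer.add_special_tokens_single_sentence(ids)

# Exactly 2 tokens were added, matching the reserved offset.
assert len(with_special) - len(ids) == tokenizer.max_len - tokenizer.max_len_single_sentence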
@@ -177,6 +177,14 @@ class XLNetTokenizer(PreTrainedTokenizer):
         out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
         return out_string
 
+    @property
+    def max_len_single_sentence(self):
+        return self.max_len - 2  # take into account special tokens
+
+    @property
+    def max_len_sentences_pair(self):
+        return self.max_len - 3  # take into account special tokens
+
     def add_special_tokens_single_sentence(self, token_ids):
         """
         Adds special tokens to a sequence pair for sequence classification tasks.
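XLNet reserves the same counts (2 and 3) but appends its markers at the end of the sequence: A <sep> <cls> for a single input and A <sep> B <sep> <cls> for a pair. A hedged sketch of pair truncation under that budget; the trim-the-longer-segment loop is illustrative, not library code:

# Shrink the longer segment first until both fit alongside XLNet's 3 special tokens.
def truncate_pair(tokens_a, tokens_b, max_len_sentences_pair):
    while len(tokens_a) + len(tokens_b) > max_len_sentences_pair:
        longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        longer.pop()
    return tokens_a, tokens_b

a, b = truncate_pair(list(range(400)), list(range(200)), 509)  # 512 - 3
assert len(a) + len(b) == 509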