chenpangpang / transformers · Commit a67e7478

Authored Nov 14, 2019 by Lysandre

Reorganized max_len warning

Parent: e18f786c
Showing 1 changed file with 5 additions and 4 deletions.

transformers/tokenization_utils.py (+5, -4)
@@ -671,10 +671,6 @@ class PreTrainedTokenizer(object):
         ids = []
         for token in tokens:
             ids.append(self._convert_token_to_id_with_added_voc(token))
-        if len(ids) > self.max_len:
-            logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
-                           "for this model ({} > {}). Running this sequence through the model will result in "
-                           "indexing errors".format(len(ids), self.max_len))
         return ids
 
     def _convert_token_to_id_with_added_voc(self, token):
@@ -877,6 +873,11 @@ class PreTrainedTokenizer(object):
             encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
             encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]
 
+        if max_length is None and len(encoded_inputs["input_ids"]) > self.max_len:
+            logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
+                           "for this model ({} > {}). Running this sequence through the model will result in "
+                           "indexing errors".format(len(ids), self.max_len))
+
         return encoded_inputs
 
     def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):
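With this change the length check no longer runs in convert_tokens_to_ids; it is performed while inputs are prepared for the model, and only when no explicit max_length is passed. Below is a minimal sketch of the resulting behavior, not taken from the commit itself. It assumes a transformers version from around this commit and that the bert-base-uncased vocabulary (whose max_len is 512) can be loaded; the model name and sample text are placeholders.

from transformers import BertTokenizer

# Assumes the pretrained vocab is available locally or downloadable.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # tokenizer.max_len == 512

long_text = "hello " * 600  # tokenizes to far more than 512 tokens

# No warning here anymore: convert_tokens_to_ids no longer checks max_len.
ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(long_text))

# The warning now fires here, because max_length is None and the encoded
# sequence is longer than tokenizer.max_len (checked while preparing inputs).
ids = tokenizer.encode(long_text)

# No warning when max_length is given: the sequence is truncated instead.
ids = tokenizer.encode(long_text, max_length=tokenizer.max_len)

The warning text itself is unchanged; only the place it is emitted from moves.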