overflowing_tokens does not really make sense here; let's just return a number

Co-Authored-By: Lysandre Debut <lysandre.debut@reseau.eseo.fr>

overflowing_tokens does not really make sense here; let's just return a number
Co-Authored-By: Lysandre Debut <lysandre.debut@reseau.eseo.fr>
b3506629 · Julien Chaumond · f5bcde0b · b3506629 · b3506629
Commit b3506629 authored Sep 30, 2019 by Julien Chaumond
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 3 additions and 2 deletions

examples/utils_multiple_choice.py examples/utils_multiple_choice.py +1 -1

transformers/tokenization_utils.py transformers/tokenization_utils.py +2 -1

No files found.
--- a/examples/utils_multiple_choice.py
+++ b/examples/utils_multiple_choice.py
@@ -338,7 +338,7 @@ def convert_examples_to_features(
                max_length=max_length,
                truncate_both_sequences=True
            )
-            if 'overflowing_tokens' in inputs and len(inputs['overflowing_tokens']) > 0:
+            if 'num_truncated_tokens' in inputs and inputs['num_truncated_tokens'] > 0:
                logger.info('Attention! you are cropping tokens (swag task is ok). '
                        'If you are training ARC and RACE and you are poping question + options,'
                        'you need to try to use a bigger max seq length!')

--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -837,7 +837,8 @@ class PreTrainedTokenizer(object):
                        copy.deepcopy(pair_ids),
                        max_length=max_length - n_added_tokens
                    )
-                    encoded_inputs["overflowing_tokens"] = ids[- (len_ids - len(tokens_a)):] + pair_ids[- (len_pair_ids - len(tokens_b)):]
+                    truncated_tokens = ids[- (len_ids - len(tokens_a)):] + pair_ids[- (len_pair_ids - len(tokens_b)):]
+                    encoded_inputs["num_truncated_tokens"] = len(truncated_tokens)
                    ids = tokens_a
                    pair_ids = tokens_b
                elif pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length: