Correct truncation for RoBERTa in 2-input GLUE

d8923270 · Jason Phang · Lysandre Debut · 7e7fc53d · d8923270
Commit d8923270, authored Aug 16, 2019 by Jason Phang; committed by Lysandre Debut on Aug 16, 2019
Show whitespace changes
Inline Side-by-side

Showing with 3 additions and 2 deletions

examples/utils_glue.py examples/utils_glue.py +3 -2

No files found.
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -422,8 +422,9 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
-            # Account for [CLS], [SEP], [SEP] with "- 3"
-            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+            # Account for [CLS], [SEP], [SEP] with "- 3", or with "- 4" for RoBERTa.
+            special_tokens_count = 4 if sep_token_extra else 3
+            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
        else:
            # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
            special_tokens_count = 3 if sep_token_extra else 2