"tests/test_tokenization_phobert.py" did not exist on "5dd7b677adbd2a228328e42b79583143c16b8dff"
Commit d8923270, authored by Jason Phang, committed by Lysandre Debut
Browse files

Correct truncation for RoBERTa in 2-input GLUE

parent 7e7fc53d
......@@ -422,8 +422,9 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
tokens_b = tokenizer.tokenize(example.text_b)
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
# Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa.
special_tokens_count = 4 if sep_token_extra else 3
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
else:
# Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
special_tokens_count = 3 if sep_token_extra else 2
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment