Commit bac51fba authored by Maksym Del, committed by Lysandre Debut

Fix token_type_ids for XLM-R

parent babd41e7
@@ -176,10 +176,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
     def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
-        A RoBERTa sequence pair mask has the following format:
-        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
-        | first sequence    | second sequence
+        RoBERTa does not make use of token type ids, therefore a list of zeros is returned.
         if token_ids_1 is None, only returns the first portion of the mask (0's).
         """
         sep = [self.sep_token_id]
@@ -187,7 +184,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
         if token_ids_1 is None:
             return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
+        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

     @property
     def vocab_size(self):
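For context, the patched method keeps the XLM-R pair layout <s> A </s></s> B </s> but now returns a zero for every position, since XLM-R does not use token type ids; previously the second segment was marked with 1s. A minimal standalone sketch of the patched logic, assuming placeholder sep/cls ids (the real tokenizer takes them from self.sep_token_id and self.cls_token_id):

# Sketch of the fixed behaviour, not the library code itself.
# sep_id/cls_id are hypothetical placeholders for illustration only.
def token_type_ids(token_ids_0, token_ids_1=None, sep_id=2, cls_id=0):
    sep = [sep_id]
    cls = [cls_id]
    if token_ids_1 is None:
        # Single sequence: <s> A </s> -> all zeros
        return len(cls + token_ids_0 + sep) * [0]
    # Pair: <s> A </s></s> B </s> -> still all zeros (before the fix, the B segment got 1s)
    return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

print(token_type_ids([10, 11, 12]))            # [0, 0, 0, 0, 0]
print(token_type_ids([10, 11, 12], [20, 21]))  # [0, 0, 0, 0, 0, 0, 0, 0, 0]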