Commit 2d042274 authored by Lysandre
Browse files

Sequence special token handling for BERT and RoBERTa

parent a690edab
......@@ -71,7 +71,13 @@ class TextDataset(Dataset):
text = f.read()
tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
tokenized_text = tokenizer.add_special_tokens_single_sentence(tokenized_text)
while len(tokenized_text) >= block_size: # Truncate in block of block_size
if isinstance(tokenizer, (BertTokenizer, RobertaTokenizer)):
self.examples.append(tokenizer.add_special_tokens_single_sentence(tokenized_text[:block_size - 2]))
tokenized_text = tokenized_text[block_size - 2:]
else:
self.examples.append(tokenized_text[:block_size])
tokenized_text = tokenized_text[block_size:]
# Note that we are losing the last truncated example here for the sake of simplicity (no padding)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment