Sequence special token handling for BERT and RoBERTa

2d042274 · Lysandre · a690edab · 2d042274
Commit 2d042274 authored Aug 20, 2019 by Lysandre
Show whitespace changes
Inline Side-by-side

Showing with 8 additions and 2 deletions

examples/run_lm_finetuning.py examples/run_lm_finetuning.py +8 -2

No files found.
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -71,7 +71,13 @@ class TextDataset(Dataset):
                text = f.read()

            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
+
+            tokenized_text = tokenizer.add_special_tokens_single_sentence(tokenized_text)
            while len(tokenized_text) >= block_size:  # Truncate in block of block_size
+                if isinstance(tokenizer, (BertTokenizer, RobertaTokenizer)):
+                    self.examples.append(tokenizer.add_special_tokens_single_sentence(tokenized_text[:block_size - 2]))
+                    tokenized_text = tokenized_text[block_size - 2:]
+                else:
                    self.examples.append(tokenized_text[:block_size])
                    tokenized_text = tokenized_text[block_size:]
            # Note that we are losing the last truncated example here for the sake of simplicity (no padding)