Merge pull request #1337 from mgrankin/fastdataset

faster dataset building

Merge pull request #1337 from mgrankin/fastdataset
faster dataset building
d83d2957 · Thomas Wolf · GitHub · f6de0003 · f71a4577 · d83d2957
Unverified Commit d83d2957 authored Sep 27, 2019 by Thomas Wolf Committed by GitHub Sep 27, 2019
Show whitespace changes
Inline Side-by-side

Showing with 2 additions and 3 deletions

examples/run_lm_finetuning.py examples/run_lm_finetuning.py +2 -3

No files found.
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -74,9 +74,8 @@ class TextDataset(Dataset):
            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
-            while len(tokenized_text) >= block_size:  # Truncate in block of block_size
+            for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
-                self.examples.append(tokenizer.add_special_tokens_single_sequence(tokenized_text[:block_size]))
+                self.examples.append(tokenizer.add_special_tokens_single_sentence(tokenized_text[i:i+block_size]))
-                tokenized_text = tokenized_text[block_size:]
            # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
            # If your dataset is small, first you should look for a bigger one :-) and second you
            # can change this behavior by adding (model specific) padding.