Fix a few issues regarding the language modeling script

569897ce · Lysandre · Julien Chaumond · 21da8950 · 569897ce
Commit 569897ce, authored Feb 10, 2020 by Lysandre; committed by Julien Chaumond on Feb 12, 2020
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 4 additions and 4 deletions

examples/run_language_modeling.py examples/run_language_modeling.py +4 -4

No files found.
--- a/examples/run_language_modeling.py
+++ b/examples/run_language_modeling.py
@@ -130,9 +130,9 @@ class LineByLineTextDataset(Dataset):
        logger.info("Creating features from dataset file at %s", file_path)

        with open(file_path, encoding="utf-8") as f:
-            lines = [line for line in f.read().splitlines() if len(line) > 0]
+            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

-        self.examples = tokenizer.batch_encode_plus(lines, max_length=block_size)["input_ids"]
+        self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)["input_ids"]

    def __len__(self):
        return len(self.examples)
@@ -704,10 +704,10 @@ def main():
        )

    if args.block_size <= 0:
-        args.block_size = tokenizer.max_len_single_sentence
+        args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
-        args.block_size = min(args.block_size, tokenizer.max_len_single_sentence)
+        args.block_size = min(args.block_size, tokenizer.max_len)

    if args.model_name_or_path:
        model = model_class.from_pretrained(