"build_tools/git@developer.sourcefind.cn:hehl2/torchaudio.git" did not exist on "66f4cdf9a95af341da23645f00b308d2caf9a905"
Commit 1798e98e authored by Matthew Carrigan

Added final TODOs

parent c64c2fc4
@@ -54,7 +54,7 @@ def convert_example_to_features(example, tokenizer, max_seq_length):
 class PregeneratedDataset(Dataset):
     def __init__(self, training_path, epoch, tokenizer, num_data_epochs):
-        # TODO Add an option to memmap the training data
+        # TODO Add an option to memmap and shuffle the training data if needed (see note in pregenerate_training_data)
         self.vocab = tokenizer.vocab
         self.tokenizer = tokenizer
         self.epoch = epoch
@@ -101,10 +101,6 @@ class PregeneratedDataset(Dataset):
                 torch.tensor(self.is_nexts[item].astype(np.int64)))
-# TODO 2: Test it's all working
-# TODO 3: Add a README (can you do that with subfolders?)
 def main():
     parser = ArgumentParser()
     parser.add_argument('--pregenerated_data', type=Path, required=True)
...
@@ -220,6 +220,13 @@ def main():
     args = parser.parse_args()
+    # TODO Add a low-memory / multiprocessing path for very large datasets
+    # In this path documents would be stored in a shelf after being tokenized, and multiple processes would convert
+    # those docs into training examples that would be written out on the fly. This would avoid the need to keep
+    # the whole training set in memory and would speed up dataset creation at the cost of code complexity.
+    # In addition, the finetuning script would need to be modified to store the training epochs as memmapped arrays,
+    # and to shuffle them by writing to the rows of the array in a random order.
     tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
     vocab_list = list(tokenizer.vocab.keys())
     with args.corpus_path.open() as f:
@@ -232,7 +239,6 @@ def main():
                     doc = []
                 else:
                     tokens = tokenizer.tokenize(line)
-                    # TODO If the sentence is longer than max_len, do we split it in the middle? That's probably a bad idea
                     doc.append(tokens)
     args.save_dir.mkdir(exist_ok=True)
...
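
The low-memory note added above proposes storing each pre-generated training epoch as a memmapped array and shuffling it by writing rows in a random order. Below is a minimal sketch of that idea, assuming a single `input_ids` feature array; the function names, the `path` argument, and the `examples` iterable are hypothetical, and the real scripts would handle the other feature arrays (masks, segment ids, LM labels, is-next flags) the same way.

```python
# Illustrative only: a memmap-backed epoch with shuffled row order, per the
# TODO above. Names, shapes, and dtypes are assumptions, not part of the commit.
import numpy as np

def write_epoch_memmap(examples, num_samples, max_seq_length, path):
    """Write pre-generated examples into a disk-backed array, filling rows in a
    random order so the epoch is already shuffled when it is read back."""
    input_ids = np.memmap(path, dtype=np.int32, mode="w+",
                          shape=(num_samples, max_seq_length))
    row_order = np.random.permutation(num_samples)   # shuffle at creation time
    for row, example in zip(row_order, examples):
        input_ids[row, :len(example)] = example      # new memmap files are zero-filled, so the rest stays padding
    input_ids.flush()

def load_epoch_memmap(num_samples, max_seq_length, path):
    """Open the same file read-only; rows are paged in lazily, so the whole
    epoch never has to sit in RAM."""
    return np.memmap(path, dtype=np.int32, mode="r",
                     shape=(num_samples, max_seq_length))
```

A `PregeneratedDataset` could then wrap `load_epoch_memmap` and index rows directly in `__getitem__`, which is what the memmap TODO in the first hunk points at.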