"build_tools/git@developer.sourcefind.cn:hehl2/torchaudio.git" did not exist on "66f4cdf9a95af341da23645f00b308d2caf9a905"
Commit 1798e98e authored by Matthew Carrigan

Added final TODOs

parent c64c2fc4
@@ -54,7 +54,7 @@ def convert_example_to_features(example, tokenizer, max_seq_length):
 class PregeneratedDataset(Dataset):
     def __init__(self, training_path, epoch, tokenizer, num_data_epochs):
-        # TODO Add an option to memmap the training data
+        # TODO Add an option to memmap and shuffle the training data if needed (see note in pregenerate_training_data)
         self.vocab = tokenizer.vocab
         self.tokenizer = tokenizer
         self.epoch = epoch
@@ -101,10 +101,6 @@ class PregeneratedDataset(Dataset):
                 torch.tensor(self.is_nexts[item].astype(np.int64)))
-# TODO 2: Test it's all working
-# TODO 3: Add a README (can you do that with subfolders?)
 def main():
     parser = ArgumentParser()
     parser.add_argument('--pregenerated_data', type=Path, required=True)
...
@@ -220,6 +220,13 @@ def main():
     args = parser.parse_args()
+    # TODO Add a low-memory / multiprocessing path for very large datasets
+    # In this path documents would be stored in a shelf after being tokenized, and multiple processes would convert
+    # those docs into training examples that would be written out on the fly. This would avoid the need to keep
+    # the whole training set in memory and would speed up dataset creation at the cost of code complexity.
+    # In addition, the finetuning script would need to be modified to store the training epochs as memmapped arrays,
+    # and to shuffle them by writing to the rows of the array in a random order.
     tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
     vocab_list = list(tokenizer.vocab.keys())
     with args.corpus_path.open() as f:
@@ -232,7 +239,6 @@ def main():
                     doc = []
                 else:
                     tokens = tokenizer.tokenize(line)
-                    # TODO If the sentence is longer than max_len, do we split it in the middle? That's probably a bad idea
                     doc.append(tokens)
     args.save_dir.mkdir(exist_ok=True)
...
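
The low-memory note added above proposes storing each pre-generated training epoch as a memmapped array and shuffling it by writing rows in a random order. Below is a minimal sketch of that idea, assuming a single `input_ids` feature array; the function names, the `path` argument, and the `examples` iterable are hypothetical, and the real scripts would handle the other feature arrays (masks, segment ids, LM labels, is-next flags) the same way.

```python
# Illustrative only: a memmap-backed epoch with shuffled row order, per the
# TODO above. Names, shapes, and dtypes are assumptions, not part of the commit.
import numpy as np

def write_epoch_memmap(examples, num_samples, max_seq_length, path):
    """Write pre-generated examples into a disk-backed array, filling rows in a
    random order so the epoch is already shuffled when it is read back."""
    input_ids = np.memmap(path, dtype=np.int32, mode="w+",
                          shape=(num_samples, max_seq_length))
    row_order = np.random.permutation(num_samples)   # shuffle at creation time
    for row, example in zip(row_order, examples):
        input_ids[row, :len(example)] = example      # new memmap files are zero-filled, so the rest stays padding
    input_ids.flush()

def load_epoch_memmap(num_samples, max_seq_length, path):
    """Open the same file read-only; rows are paged in lazily, so the whole
    epoch never has to sit in RAM."""
    return np.memmap(path, dtype=np.int32, mode="r",
                     shape=(num_samples, max_seq_length))
```

A `PregeneratedDataset` could then wrap `load_epoch_memmap` and index rows directly in `__getitem__`, which is what the memmap TODO in the first hunk points at.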