"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "4e3f745ba4e754e415c184d53c874031101d263b"
Commit a8a577ba authored by Matthew Carrigan's avatar Matthew Carrigan
Browse files

Greatly reduced memory usage when pregenerating the data by writing it
out on the fly without shuffling - the Sampler in the finetuning script
will shuffle for us.
parent 0ae59e66
...@@ -74,8 +74,6 @@ class PregeneratedDataset(Dataset): ...@@ -74,8 +74,6 @@ class PregeneratedDataset(Dataset):
with data_file.open() as f: with data_file.open() as f:
for i, line in enumerate(tqdm(f, total=num_samples, desc="Training examples")): for i, line in enumerate(tqdm(f, total=num_samples, desc="Training examples")):
line = line.strip() line = line.strip()
if not line:
continue # Skip trailing blank lines etc.
example = json.loads(line) example = json.loads(line)
features = convert_example_to_features(example, tokenizer, seq_len) features = convert_example_to_features(example, tokenizer, seq_len)
input_ids[i] = features.input_ids input_ids[i] = features.input_ids
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment