"examples/git@developer.sourcefind.cn:hehl2/torchaudio.git" did not exist on "f75497308bfe59e68ba657d61d41a0c88705d7f9"
Unverified commit 4302ace5 authored by Sam Shleifer, committed by GitHub

[pack_dataset] don't sort before packing, only pack train (#5954)

parent c8bdf7f4
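For context on what the change does: pack_examples greedily concatenates consecutive source/target pairs until a token budget (max_tokens) is reached, and after this commit the pairs keep their original order instead of being sorted by source length first. A minimal sketch of that packing idea; the names (pack_pairs, n_tokens) and the toy whitespace tokenizer are illustrative, not the actual pack_dataset.py code:

from typing import Callable, List, Tuple


def pack_pairs(
    src: List[str],
    tgt: List[str],
    n_tokens: Callable[[str], int],
    max_tokens: int = 1024,
) -> Tuple[List[str], List[str]]:
    """Greedily merge adjacent (src, tgt) pairs while both sides stay under max_tokens."""
    packed_src, packed_tgt = [], []
    cur_src, cur_tgt = src[0], tgt[0]
    for s, t in zip(src[1:], tgt[1:]):
        cand_src, cand_tgt = cur_src + " " + s, cur_tgt + " " + t
        if n_tokens(cand_src) <= max_tokens and n_tokens(cand_tgt) <= max_tokens:
            # Still under budget: keep growing the current packed pair.
            cur_src, cur_tgt = cand_src, cand_tgt
        else:
            # Budget exceeded: emit the current pair and start a new one.
            packed_src.append(cur_src)
            packed_tgt.append(cur_tgt)
            cur_src, cur_tgt = s, t
    packed_src.append(cur_src)
    packed_tgt.append(cur_tgt)
    return packed_src, packed_tgt


if __name__ == "__main__":
    # Toy "tokenizer": whitespace word count stands in for a real subword tokenizer.
    src, tgt = pack_pairs(["a b", "c d e", "f"], ["x", "y z", "w"], lambda s: len(s.split()), max_tokens=4)
    print(src, tgt)  # ['a b', 'c d e f'] ['x', 'y z w']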
@@ -13,3 +13,4 @@ streamlit
 elasticsearch
 pandas
 nlp
+fire
from pathlib import Path

import fire


def minify(src_dir: str, dest_dir: str, n: int):
    """Write first n lines of each file f in src_dir to dest_dir/f"""
    src_dir = Path(src_dir)
    dest_dir = Path(dest_dir)
    dest_dir.mkdir(exist_ok=True)
    for path in src_dir.iterdir():
        new = [x.rstrip() for x in list(path.open().readlines())][:n]
        dest_path = dest_dir.joinpath(path.name)
        print(dest_path)
        dest_path.open("w").write("\n".join(new))


if __name__ == "__main__":
    fire.Fire(minify)
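Because the script hands minify to fire.Fire, its parameters become positional command-line arguments. A hedged usage sketch (the script's filename is not shown in this view, minify_dataset.py and the directory names are assumptions): writing the first 100 lines of every file in full_data/ to mini_data/ would look like

    python minify_dataset.py full_data mini_data 100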
@@ -6,6 +6,7 @@
 """
 import argparse
+import shutil
 from pathlib import Path
 from tqdm import tqdm
@@ -17,7 +18,7 @@ def pack_examples(tok, src_examples, tgt_examples, max_tokens=1024):
     finished_src, finished_tgt = [], []
-    sorted_examples = list(sorted(zip(src_examples, tgt_examples), key=lambda x: len(x[0])))
+    sorted_examples = list(zip(src_examples, tgt_examples))
     new_src, new_tgt = sorted_examples[0]
     def is_too_big(strang):
@@ -42,20 +43,10 @@ def pack_examples(tok, src_examples, tgt_examples, max_tokens=1024):
     return finished_src, finished_tgt
-def minify(src_dir: Path, dest_dir: Path, n: int):
-    """Write first n lines of each file f in src_dir to dest_dir/f"""
-    dest_dir.mkdir(exist_ok=True)
-    for path in src_dir.iterdir():
-        new = [x.rstrip() for x in list(path.open().readlines())][:n]
-        dest_path = dest_dir.joinpath(path.name)
-        print(dest_path)
-        dest_path.open("w").write("\n".join(new))
 def pack_data_dir(tok, data_dir: Path, max_tokens, save_path):
     save_path = Path(save_path)
     save_path.mkdir(exist_ok=True)
-    for split in ["val", "test", "train"]:
+    for split in ["train"]:
         src_path, tgt_path = data_dir / f"{split}.source", data_dir / f"{split}.target"
         src_docs = [x.rstrip() for x in Path(src_path).open().readlines()]
         tgt_docs = [x.rstrip() for x in Path(tgt_path).open().readlines()]
@@ -63,6 +54,10 @@ def pack_data_dir(tok, data_dir: Path, max_tokens, save_path):
         print(f"packed {split} split from {len(src_docs)} examples -> {len(packed_src)}.")
         Path(save_path / f"{split}.source").open("w").write("\n".join(packed_src))
         Path(save_path / f"{split}.target").open("w").write("\n".join(packed_tgt))
+    for split in ["val", "test"]:
+        src_path, tgt_path = data_dir / f"{split}.source", data_dir / f"{split}.target"
+        shutil.copyfile(src_path, save_path / f"{split}.source")
+        shutil.copyfile(tgt_path, save_path / f"{split}.target")
 def packer_cli():
...
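A hedged usage sketch of the updated pack_data_dir: only the train split is packed, while the val and test files are copied through unchanged. The module name, tokenizer checkpoint, and paths below are assumptions, not taken from this diff; the call signature matches the one shown above.

from pathlib import Path

from transformers import AutoTokenizer

from pack_dataset import pack_data_dir  # module name assumed from the commit title

tok = AutoTokenizer.from_pretrained("facebook/bart-large")  # any seq2seq tokenizer
pack_data_dir(tok, Path("cnn_dm"), max_tokens=1024, save_path="cnn_dm_packed_1024")
# Result: cnn_dm_packed_1024/train.source and train.target hold the packed examples,
# while the val.* and test.* files are verbatim copies of the originals.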