Commit 599529ec authored by Lengyue's avatar Lengyue Committed by zjsun
Browse files

Add compress tool

parent 94a54a14
import tarfile
from pathlib import Path
from tqdm import tqdm
import io
import random
from multiprocessing import Process
def chunked_tarring(rank, file_list, base_folder, output_folder, chunk_size=1024**3):
chunk_count = 1
total_size = 0
saved_count = 0
buffer = io.BytesIO()
tar = tarfile.open(fileobj=buffer, mode="w")
for audio_file in file_list:
txt_file = audio_file.with_suffix(".txt")
if not txt_file.exists():
continue
file_size = audio_file.stat().st_size + txt_file.stat().st_size
if total_size + file_size > chunk_size:
tar.close()
# write the buffer to disk
buffer.seek(0)
with open(output_folder / f"chunk-{rank:03d}-{chunk_count:04d}.tar", "wb") as f:
f.write(buffer.read())
chunk_count += 1
total_size = 0
buffer = io.BytesIO()
tar = tarfile.open(fileobj=buffer, mode="w")
tar.add(audio_file, arcname=audio_file.relative_to(base_folder))
tar.add(txt_file, arcname=txt_file.relative_to(base_folder))
total_size += file_size
if saved_count % 1000 == 0:
print(f"Rank {rank}: {saved_count}/{len(file_list)}")
saved_count += 1
tar.close()
buffer.seek(0)
with open(output_folder / f"chunk-{rank:03d}-{chunk_count:04d}.tar", "wb") as f:
f.write(buffer.read())
print(f"Rank {rank}: {saved_count}/{len(file_list)}")
if __name__ == "__main__":
base_folder = Path("/mnt/nvme1/multi-modal-test/WenetSpeech/cleaned")
output_folder = Path("/mnt/nvme1/multi-modal-test/WenetSpeech/compressed")
output_folder.mkdir(exist_ok=True, parents=True)
num_workers = 50
file_list = list(tqdm(base_folder.rglob("*.flac")))
random.shuffle(file_list)
print(f"Total files: {len(file_list)}")
chunk_size = len(file_list) // num_workers
processes = []
for i in range(num_workers):
start = i * chunk_size
end = (i + 1) * chunk_size
if i == num_workers - 1:
end = len(file_list)
p = Process(target=chunked_tarring, args=(i, file_list[start:end], base_folder, output_folder))
p.start()
processes.append(p)
for p in processes:
p.join()
print("Done")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment