#!/usr/bin/env python3
"""Parse a directory containing the LibriSpeech dataset.

Recursively search for "*.trans.txt" files in the given directory and print out

`ID\\tAUDIO_PATH\\tTRANSCRIPTION`

example: python parse_librispeech.py LibriSpeech/test-clean

    1089-134691-0000\t/LibriSpeech/test-clean/1089/134691/1089-134691-0000.flac\tHE COULD WAIT NO LONGER
    ...

Dataset can be obtained from https://www.openslr.org/12
"""
import argparse
from pathlib import Path
from typing import Iterator, Tuple


def _parse_args() -> argparse.Namespace:
    """Parse the command-line arguments.

    The module docstring is reused verbatim as the ``--help`` description;
    ``RawTextHelpFormatter`` keeps its line breaks intact.

    Returns:
        The parsed namespace; ``input_dir`` holds the directory to search
        as a ``Path``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        'input_dir',
        type=Path,
        help='Directory where `*.trans.txt` files are searched.'
    )
    return parser.parse_args()


def _parse_transcript(path) -> Iterator[Tuple[str, str]]:
    """Yield ``(utterance_id, transcription)`` pairs from a transcript file.

    Each non-blank line is split at its first space: the text before it is
    the utterance ID and everything after it is the transcription. Blank
    lines are skipped.

    Args:
        path: Path of the ``*.trans.txt`` file to read.

    Yields:
        A 2-tuple ``(id, transcription)``. The transcription is an empty
        string when the line holds only an ID.
    """
    # Decode explicitly so the result does not depend on the platform's
    # locale encoding.
    with open(path, encoding='utf-8') as trans_fileobj:
        for line in trans_fileobj:
            line = line.strip()
            if not line:
                continue
            # ``partition`` always returns three parts, so a line without a
            # transcription still produces a well-formed pair instead of a
            # one-element result that breaks unpacking in the caller.
            id_, _, transcription = line.partition(' ')
            yield id_, transcription


def _parse_directory(root_dir: Path) -> Iterator[Tuple[str, Path, str]]:
    """Yield one entry per utterance found under ``root_dir``.

    Every ``*.trans.txt`` file below ``root_dir`` is parsed. The audio path
    of an utterance is built as ``<ID>.flac`` in the directory holding the
    transcript file; the file's existence is not checked.

    Args:
        root_dir: Directory that is searched recursively.

    Yields:
        A 3-tuple ``(id, audio_path, transcription)``.
    """
    # ``glob`` yields matches in arbitrary order; sort them so the listing is
    # reproducible across runs and filesystems.
    for trans_file in sorted(root_dir.glob('**/*.trans.txt')):
        trans_dir = trans_file.parent
        for id_, transcription in _parse_transcript(trans_file):
            audio_path = trans_dir / f'{id_}.flac'
            yield id_, audio_path, transcription


def _main() -> None:
    """Print one tab-separated ``ID, audio path, transcription`` line per utterance."""
    args = _parse_args()
    for id_, path, transcription in _parse_directory(args.input_dir):
        print(f'{id_}\t{path}\t{transcription}')


if __name__ == '__main__':
    _main()