Commit 2b991225 authored by hwangjeff's avatar hwangjeff Committed by Facebook GitHub Bot
Browse files

Fix bugs from Emformer RNN-T recipes merge (#2217)

Summary:
- Removes 100-batch truncation in TEDLIUM3 recipe.
- Reinstates `train_spm.py` for TEDLIUM3.

Pull Request resolved: https://github.com/pytorch/audio/pull/2217

Reviewed By: nateanl

Differential Revision: D34171525

Pulled By: hwangjeff

fbshipit-source-id: 54698e5e1b094c26c28eec9b8b1722223077876c
parent 33bcb7b0
@@ -37,7 +37,7 @@ class CustomDataset(torch.utils.data.Dataset):
         assert max_token_limit >= idx_target_lengths[-1][1]

-        self.batches = batch_by_token_count(idx_target_lengths, max_token_limit)[:100]
+        self.batches = batch_by_token_count(idx_target_lengths, max_token_limit)

     def _target_length(self, fileid, line):
         transcript_path = os.path.join(self.base_dataset._path, "stm", fileid)
"""Train the SentencePiece model by using the transcripts of TED-LIUM release 3 training set.
Example:
python train_spm.py --tedlium-path /home/datasets/
"""
import logging
import os
import pathlib
from argparse import ArgumentParser, RawTextHelpFormatter
import sentencepiece as spm
logger = logging.getLogger(__name__)
def _parse_args():
parser = ArgumentParser(description=__doc__, formatter_class=RawTextHelpFormatter)
parser.add_argument(
"--tedlium-path",
required=True,
type=pathlib.Path,
help="Path to TED-LIUM release 3 dataset.",
)
parser.add_argument(
"--output-dir",
default=pathlib.Path("./"),
type=pathlib.Path,
help="File to save feature statistics to. (Default: './')",
)
parser.add_argument("--debug", action="store_true", help="whether to use debug level for logging")
return parser.parse_args()
def _extract_train_text(tedlium_path, output_dir):
stm_path = tedlium_path / "TEDLIUM_release-3/data/stm/"
transcripts = []
for file in sorted(os.listdir(stm_path)):
if file.endswith(".stm"):
file = os.path.join(stm_path, file)
with open(file) as f:
for line in f.readlines():
talk_id, _, speaker_id, start_time, end_time, identifier, transcript = line.split(" ", 6)
if transcript == "ignore_time_segment_in_scoring\n":
continue
else:
transcript = transcript.lower().replace("<unk>", "<garbage>")
transcripts.append(transcript)
with open(output_dir / "text_train.txt", "w") as f:
f.writelines(transcripts)
def _init_logger(debug):
fmt = "%(asctime)s %(message)s" if debug else "%(message)s"
level = logging.DEBUG if debug else logging.INFO
logging.basicConfig(format=fmt, level=level, datefmt="%Y-%m-%d %H:%M:%S")
def cli_main():
    """Entry point: extract TED-LIUM training text, then train a BPE SentencePiece model."""
    args = _parse_args()
    _init_logger(args.debug)
    _extract_train_text(args.tedlium_path, args.output_dir)
    # Train a 500-piece BPE model on the extracted transcripts; special-token
    # ids (bos/pad/eos/unk) and the <garbage> symbol match the recipe's vocab.
    trainer_config = {
        "input": args.output_dir / "text_train.txt",
        "vocab_size": 500,
        "model_prefix": "spm_bpe_500",
        "model_type": "bpe",
        "input_sentence_size": 100000000,
        "character_coverage": 1.0,
        "user_defined_symbols": ["<garbage>"],
        "bos_id": 0,
        "pad_id": 1,
        "eos_id": 2,
        "unk_id": 3,
    }
    spm.SentencePieceTrainer.train(**trainer_config)
    logger.info("Successfully trained the sentencepiece model")
# Script entry point: run the full extract-and-train pipeline.
if __name__ == "__main__":
    cli_main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment