Remove obsolete examples (#2655)

Summary:
Pull Request resolved: https://github.com/pytorch/audio/pull/2655

Removed obsolete example and the corresponding test

Reviewed By: mthrok

Differential Revision: D39260253

fbshipit-source-id: 0bde71ffd75dd0c94a5cc4a9940f4648a5d61bd7

Remove obsolete examples (#2655)
Summary:
Pull Request resolved: https://github.com/pytorch/audio/pull/2655

Removed obsolete example and the corresponding test

Reviewed By: mthrok

Differential Revision: D39260253

fbshipit-source-id: 0bde71ffd75dd0c94a5cc4a9940f4648a5d61bd7
4a20c412 · Peter Albert · Facebook GitHub Bot · 95eada24 · 95eada24 · 95eada24
Commit 4a20c412 authored Sep 06, 2022 by Peter Albert Committed by Facebook GitHub Bot Sep 06, 2022
8 changed files
--- a/examples/interactive_asr/README.md
+++ b/examples/interactive_asr/README.md
-⚠️⚠️⚠️ THIS EXAMPLE IS NOT MAINTAINED ⚠️⚠️⚠️
-# asr-demo
-To run this demo, you need the following libraries
- [python3](https://www.python.org/download/releases/3.0/)
- [pyaudio](https://people.csail.mit.edu/hubert/pyaudio/)
- [torchaudio](https://github.com/pytorch/audio/tree/master/torchaudio)
- [pytorch](https://pytorch.org/)
- [librosa](https://librosa.github.io/librosa/)
- [fairseq](https://github.com/pytorch/fairseq) (clone the github repository)
-and the following models
- [dictionary](https://download.pytorch.org/models/audio/dict.txt)
- [sentence piece model](https://download.pytorch.org/models/audio/spm.model)
- [model](https://download.pytorch.org/models/audio/checkpoint_avg_60_80.pt)
-## Installation
-We recommend that you use [conda](https://docs.conda.io/en/latest/miniconda.html) to install the dependencies when available.
-```bash
-# Assume that all commands are from the examples folder
-cd examples
-# Install dependencies
-conda install -c pytorch torchaudio
-conda install -c conda-forge librosa
-conda install pyaudio
-pip install sentencepiece
-# Install fairseq from source
-git clone https://github.com/pytorch/fairseq interactive_asr/fairseq
-pushd interactive_asr/fairseq
-export CFLAGS='-stdlib=libc++'  # For Mac only
-pip install --editable .
-popd
-# Install dictionary, sentence piece model, and model
-wget -O interactive_asr/data/dict.txt https://download.pytorch.org/models/audio/dict.txt
-wget -O interactive_asr/data/spm.model https://download.pytorch.org/models/audio/spm.model
-wget -O interactive_asr/data/model.pt https://download.pytorch.org/models/audio/checkpoint_avg_60_80.pt
-```
-## Run
-On a file
-```bash
-INPUT_FILE=interactive_asr/data/sample.wav
-python -m interactive_asr.asr interactive_asr/data --input_file $INPUT_FILE --max-tokens 10000000 --nbest 1 \
-  --path interactive_asr/data/model.pt --beam 40 --task speech_recognition \
-  --user-dir interactive_asr/fairseq/examples/speech_recognition
-```
-As a microphone
-```bash
-python -m interactive_asr.asr interactive_asr/data --max-tokens 10000000 --nbest 1 \
-  --path interactive_asr/data/model.pt --beam 40 --task speech_recognition \
-  --user-dir interactive_asr/fairseq/examples/speech_recognition
-```
-To run the testcase associated with this example
-```bash
-ASR_MODEL_PATH=interactive_asr/data/model.pt \
-ASR_INPUT_FILE=interactive_asr/data/sample.wav \
-ASR_DATA_PATH=interactive_asr/data \
-ASR_USER_DIR=interactive_asr/fairseq/examples/speech_recognition \
-python -m unittest test/test_interactive_asr.py
-```
--- a/examples/interactive_asr/__init__.py
+++ b/examples/interactive_asr/__init__.py
-from . import utils, vad
-__all__ = ["utils", "vad"]
--- a/examples/interactive_asr/asr.py
+++ b/examples/interactive_asr/asr.py
-#!/usr/bin/env python3
-# Copyright (c) 2017-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the LICENSE file in
-# the root directory of this source tree. An additional grant of patent rights
-# can be found in the PATENTS file in the same directory.
-"""
-Run inference for pre-processed data with a trained model.
-"""
-import datetime as dt
-import logging
-from fairseq import options
-from interactive_asr.utils import add_asr_eval_argument, get_microphone_transcription, setup_asr, transcribe_file
-def main(args):
-    logger = logging.getLogger(__name__)
-    logger.setLevel(logging.INFO)
-    task, generator, models, sp, tgt_dict = setup_asr(args, logger)
-    print("READY!")
-    if args.input_file:
-        transcription_time, transcription = transcribe_file(args, task, generator, models, sp, tgt_dict)
-        print("transcription:", transcription)
-        print("transcription_time:", transcription_time)
-    else:
-        for transcription in get_microphone_transcription(args, task, generator, models, sp, tgt_dict):
-            print("{}: {}".format(dt.datetime.now().strftime("%H:%M:%S"), transcription[0][0]))
-def cli_main():
-    parser = options.get_generation_parser()
-    parser = add_asr_eval_argument(parser)
-    args = options.parse_args_and_arch(parser)
-    main(args)
-if __name__ == "__main__":
-    cli_main()
--- a/examples/interactive_asr/data/sample.wav
+++ b/examples/interactive_asr/data/sample.wav
--- a/examples/interactive_asr/utils.py
+++ b/examples/interactive_asr/utils.py
-#!/usr/bin/env python3
-# Copyright (c) 2017-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the LICENSE file in
-# the root directory of this source tree. An additional grant of patent rights
-# can be found in the PATENTS file in the same directory.
-import os
-import sys
-import time
-import sentencepiece as spm
-import torch
-import torchaudio
-from fairseq import tasks
-from fairseq.utils import import_user_module, load_ensemble_for_inference
-from interactive_asr.vad import get_microphone_chunks
-def add_asr_eval_argument(parser):
-    parser.add_argument("--input_file", help="input file")
-    parser.add_argument("--ctc", action="store_true", help="decode a ctc model")
-    parser.add_argument("--rnnt", default=False, help="decode a rnnt model")
-    parser.add_argument("--kspmodel", default=None, help="sentence piece model")
-    parser.add_argument("--wfstlm", default=None, help="wfstlm on dictonary output units")
-    parser.add_argument(
-        "--rnnt_decoding_type",
-        default="greedy",
-        help="wfstlm on dictonary output units",
-    )
-    parser.add_argument(
-        "--lm_weight",
-        default=0.2,
-        help="weight for wfstlm while interpolating with neural score",
-    )
-    parser.add_argument("--rnnt_len_penalty", default=-0.5, help="rnnt length penalty on word level")
-    return parser
-def check_args(args):
-    assert args.path is not None, "--path required for generation!"
-    assert not args.sampling or args.nbest == args.beam, "--sampling requires --nbest to be equal to --beam"
-    assert args.replace_unk is None or args.raw_text, "--replace-unk requires a raw text dataset (--raw-text)"
-def process_predictions(args, hypos, sp, tgt_dict):
-    res = []
-    device = torch.device("cuda:0" if torch.cuda.is_available() and not args.cpu else "cpu")
-    for hypo in hypos[: min(len(hypos), args.nbest)]:
-        hyp_pieces = tgt_dict.string(hypo["tokens"].int().to(device))
-        hyp_words = sp.DecodePieces(hyp_pieces.split())
-        res.append(hyp_words)
-    return res
-def optimize_models(args, use_cuda, models):
-    """Optimize ensemble for generation"""
-    for model in models:
-        model.make_generation_fast_(
-            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
-            need_attn=args.print_alignment,
-        )
-        if args.fp16:
-            model.half()
-        if use_cuda:
-            model.cuda()
-def calc_mean_invstddev(feature):
-    if len(feature.shape) != 2:
-        raise ValueError("We expect the input feature to be 2-D tensor")
-    mean = torch.mean(feature, dim=0)
-    var = torch.var(feature, dim=0)
-    # avoid division by ~zero
-    if (var < sys.float_info.epsilon).any():
-        return mean, 1.0 / (torch.sqrt(var) + sys.float_info.epsilon)
-    return mean, 1.0 / torch.sqrt(var)
-def calcMN(features):
-    mean, invstddev = calc_mean_invstddev(features)
-    res = (features - mean) * invstddev
-    return res
-def transcribe(waveform, args, task, generator, models, sp, tgt_dict):
-    num_features = 80
-    output = torchaudio.compliance.kaldi.fbank(waveform, num_mel_bins=num_features)
-    device = torch.device("cuda:0" if torch.cuda.is_available() and not args.cpu else "cpu")
-    output_cmvn = calcMN(output.to(device).detach())
-    # size (m, n)
-    source = output_cmvn
-    frames_lengths = torch.LongTensor([source.size(0)])
-    # size (1, m, n). In general, if source is (x, m, n), then hypos is (x, ...)
-    source.unsqueeze_(0)
-    sample = {"net_input": {"src_tokens": source, "src_lengths": frames_lengths}}
-    hypos = task.inference_step(generator, models, sample)
-    assert len(hypos) == 1
-    transcription = []
-    for i in range(len(hypos)):
-        # Process top predictions
-        hyp_words = process_predictions(args, hypos[i], sp, tgt_dict)
-        transcription.append(hyp_words)
-    return transcription
-def setup_asr(args, logger):
-    check_args(args)
-    import_user_module(args)
-    if args.max_tokens is None and args.batch_size is None:
-        args.max_tokens = 30000
-    logger.info(args)
-    use_cuda = torch.cuda.is_available() and not args.cpu
-    # Load dataset splits
-    task = tasks.setup_task(args)
-    # Set dictionary
-    tgt_dict = task.target_dictionary
-    if args.ctc or args.rnnt:
-        tgt_dict.add_symbol("<ctc_blank>")
-        if args.ctc:
-            logger.info("| decoding a ctc model")
-        if args.rnnt:
-            logger.info("| decoding a rnnt model")
-    # Load ensemble
-    logger.info("| loading model(s) from {}".format(args.path))
-    models, _model_args = load_ensemble_for_inference(
-        args.path.split(":"),
-        task,
-        model_arg_overrides=eval(args.model_overrides),  # noqa
-    )
-    optimize_models(args, use_cuda, models)
-    # Initialize generator
-    generator = task.build_generator(models, args)
-    sp = spm.SentencePieceProcessor()
-    sp.Load(os.path.join(args.data, "spm.model"))
-    return task, generator, models, sp, tgt_dict
-def transcribe_file(args, task, generator, models, sp, tgt_dict):
-    path = args.input_file
-    if not os.path.exists(path):
-        raise FileNotFoundError("Audio file not found: {}".format(path))
-    waveform, sample_rate = torchaudio.load_wav(path)
-    waveform = waveform.mean(0, True)
-    waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
-    start = time.time()
-    transcription = transcribe(waveform, args, task, generator, models, sp, tgt_dict)
-    transcription_time = time.time() - start
-    return transcription_time, transcription
-def get_microphone_transcription(args, task, generator, models, sp, tgt_dict):
-    for (waveform, sample_rate) in get_microphone_chunks():
-        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform.reshape(1, -1))
-        transcription = transcribe(waveform, args, task, generator, models, sp, tgt_dict)
-        yield transcription
--- a/examples/interactive_asr/vad.py
+++ b/examples/interactive_asr/vad.py
-#!/usr/bin/env python3
-# Copyright (c) 2017-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the LICENSE file in
-# the root directory of this source tree. An additional grant of patent rights
-# can be found in the PATENTS file in the same directory.
-"""
-Following `a simple but efficient real-time voice activity detection algorithm
-<https://www.eurasip.org/Proceedings/Eusipco/Eusipco2009/contents/papers/1569192958.pdf>`__.
-There are three criteria to decide if a frame contains speech: energy, most
-dominant frequency, and spectral flatness. If any two of those are higher than
-a minimum plus a threshold, then the frame contains speech.  In the offline
-case, the list of frames is postprocessed to remove too short silence and
-speech sequences. In the online case here, inertia is added before switching
-from speech to silence or vice versa.
-"""
-import queue
-from collections import deque
-import librosa
-import numpy as np
-import pyaudio
-import torch
-import torchaudio
-def compute_spectral_flatness(frame, epsilon=0.01):
-    # epsilon protects against log(0)
-    geometric_mean = torch.exp((frame + epsilon).log().mean(-1)) - epsilon
-    arithmetic_mean = frame.mean(-1)
-    return -10 * torch.log10(epsilon + geometric_mean / arithmetic_mean)
-class VoiceActivityDetection:
-    def __init__(
-        self,
-        num_init_frames=30,
-        ignore_silent_count=4,
-        ignore_speech_count=1,
-        energy_prim_thresh=60,
-        frequency_prim_thresh=10,
-        spectral_flatness_prim_thresh=3,
-        verbose=False,
-    ):
-        self.num_init_frames = num_init_frames
-        self.ignore_silent_count = ignore_silent_count
-        self.ignore_speech_count = ignore_speech_count
-        self.energy_prim_thresh = energy_prim_thresh
-        self.frequency_prim_thresh = frequency_prim_thresh
-        self.spectral_flatness_prim_thresh = spectral_flatness_prim_thresh
-        self.verbose = verbose
-        self.speech_mark = True
-        self.silence_mark = False
-        self.silent_count = 0
-        self.speech_count = 0
-        self.n = 0
-        if self.verbose:
-            self.energy_list = []
-            self.frequency_list = []
-            self.spectral_flatness_list = []
-    def iter(self, frame):
-        frame_fft = torch.rfft(frame, 1)
-        amplitudes = torchaudio.functional.complex_norm(frame_fft)
-        # Compute frame energy
-        energy = frame.pow(2).sum(-1)
-        # Most dominant frequency component
-        frequency = amplitudes.argmax()
-        # Spectral flatness measure
-        spectral_flatness = compute_spectral_flatness(amplitudes)
-        if self.verbose:
-            self.energy_list.append(energy)
-            self.frequency_list.append(frequency)
-            self.spectral_flatness_list.append(spectral_flatness)
-        if self.n == 0:
-            self.min_energy = energy
-            self.min_frequency = frequency
-            self.min_spectral_flatness = spectral_flatness
-        elif self.n < self.num_init_frames:
-            self.min_energy = min(energy, self.min_energy)
-            self.min_frequency = min(frequency, self.min_frequency)
-            self.min_spectral_flatness = min(spectral_flatness, self.min_spectral_flatness)
-        self.n += 1
-        # Add 1. to avoid log(0)
-        thresh_energy = self.energy_prim_thresh * torch.log(1.0 + self.min_energy)
-        thresh_frequency = self.frequency_prim_thresh
-        thresh_spectral_flatness = self.spectral_flatness_prim_thresh
-        # Check all three conditions
-        counter = 0
-        if energy - self.min_energy >= thresh_energy:
-            counter += 1
-        if frequency - self.min_frequency >= thresh_frequency:
-            counter += 1
-        if spectral_flatness - self.min_spectral_flatness >= thresh_spectral_flatness:
-            counter += 1
-        # Detection
-        if counter > 1:
-            # Speech detected
-            self.speech_count += 1
-            # Inertia against switching
-            if self.n >= self.num_init_frames and self.speech_count <= self.ignore_speech_count:
-                # Too soon to change
-                return self.silence_mark
-            else:
-                self.silent_count = 0
-                return self.speech_mark
-        else:
-            # Silence detected
-            self.min_energy = ((self.silent_count * self.min_energy) + energy) / (self.silent_count + 1)
-            self.silent_count += 1
-            # Inertia against switching
-            if self.n >= self.num_init_frames and self.silent_count <= self.ignore_silent_count:
-                # Too soon to change
-                return self.speech_mark
-            else:
-                self.speech_count = 0
-                return self.silence_mark
-class MicrophoneStream:
-    """Opens a recording stream as a generator yielding the audio chunks."""
-    def __init__(self, device=None, rate=22050, chunk=2205):
-        """
-        The 22050 is the librosa default, which is what our models were
-        trained on.  The ratio of [chunk / rate] is the amount of time between
-        audio samples - for example, with these defaults,
-        an audio fragment will be processed every tenth of a second.
-        """
-        self._rate = rate
-        self._chunk = chunk
-        self._device = device
-        # Create a thread-safe buffer of audio data
-        self._buff = queue.Queue()
-        self.closed = True
-    def __enter__(self):
-        self._audio_interface = pyaudio.PyAudio()
-        self._audio_stream = self._audio_interface.open(
-            # format=pyaudio.paInt16,
-            format=pyaudio.paFloat32,
-            # The API currently only supports 1-channel (mono) audio
-            # https://goo.gl/z757pE
-            channels=1,
-            rate=self._rate,
-            input=True,
-            frames_per_buffer=self._chunk,
-            input_device_index=self._device,
-            # Run the audio stream asynchronously to fill the buffer object.
-            # This is necessary so that the input device's buffer doesn't
-            # overflow while the calling thread makes network requests, etc.
-            stream_callback=self._fill_buffer,
-        )
-        self.closed = False
-        return self
-    def __exit__(self, type, value, traceback):
-        self._audio_stream.stop_stream()
-        self._audio_stream.close()
-        self.closed = True
-        # Signal the generator to terminate so that the client's
-        # streaming_recognize method will not block the process termination.
-        self._buff.put(None)
-        self._audio_interface.terminate()
-    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
-        """Continuously collect data from the audio stream, into the buffer."""
-        self._buff.put(in_data)
-        return None, pyaudio.paContinue
-    def generator(self):
-        while not self.closed:
-            # Use a blocking get() to ensure there's at least one chunk of
-            # data, and stop iteration if the chunk is None, indicating the
-            # end of the audio stream.
-            chunk = self._buff.get()
-            if chunk is None:
-                return
-            data = [chunk]
-            # Now consume whatever other data's still buffered.
-            while True:
-                try:
-                    chunk = self._buff.get(block=False)
-                    if chunk is None:
-                        return
-                    data.append(chunk)
-                except queue.Empty:
-                    break
-            ans = np.fromstring(b"".join(data), dtype=np.float32)
-            # yield uniform-sized chunks
-            ans = np.split(ans, np.shape(ans)[0] / self._chunk)
-            # Resample the audio to 22050, librosa default
-            for chunk in ans:
-                yield librosa.core.resample(chunk, self._rate, 22050)
-def get_microphone_chunks(
-    min_to_cumulate=5,  # 0.5 seconds
-    max_to_cumulate=100,  # 10 seconds
-    precumulate=5,
-    max_to_visualize=100,
-):
-    vad = VoiceActivityDetection()
-    cumulated = []
-    precumulated = deque(maxlen=precumulate)
-    with MicrophoneStream() as stream:
-        audio_generator = stream.generator()
-        chunk_length = stream._chunk
-        waveform = torch.zeros(max_to_visualize * chunk_length)
-        for chunk in audio_generator:
-            # Is speech?
-            chunk = torch.tensor(chunk)
-            is_speech = vad.iter(chunk)
-            # Cumulate speech
-            if is_speech or cumulated:
-                cumulated.append(chunk)
-            else:
-                precumulated.append(chunk)
-            if (not is_speech and len(cumulated) >= min_to_cumulate) or (len(cumulated) > max_to_cumulate):
-                waveform = torch.cat(list(precumulated) + cumulated, -1)
-                yield (waveform * stream._rate, stream._rate)
-                cumulated = []
-                precumulated = deque(maxlen=precumulate)
--- a/examples/test/__init__.py
+++ b/examples/test/__init__.py
--- a/examples/test/test_interactive_asr.py
+++ b/examples/test/test_interactive_asr.py
-import argparse
-import logging
-import os
-import unittest
-from interactive_asr.utils import setup_asr, transcribe_file
-class ASRTest(unittest.TestCase):
-    logger = logging.getLogger(__name__)
-    logger.setLevel(logging.INFO)
-    arguments_dict = {
-        "path": "/scratch/jamarshon/downloads/model.pt",
-        "input_file": "/scratch/jamarshon/audio/examples/interactive_asr/data/sample.wav",
-        "data": "/scratch/jamarshon/downloads",
-        "user_dir": "/scratch/jamarshon/fairseq-py/examples/speech_recognition",
-        "no_progress_bar": False,
-        "log_interval": 1000,
-        "log_format": None,
-        "tensorboard_logdir": "",
-        "tbmf_wrapper": False,
-        "seed": 1,
-        "cpu": True,
-        "fp16": False,
-        "memory_efficient_fp16": False,
-        "fp16_init_scale": 128,
-        "fp16_scale_window": None,
-        "fp16_scale_tolerance": 0.0,
-        "min_loss_scale": 0.0001,
-        "threshold_loss_scale": None,
-        "criterion": "cross_entropy",
-        "tokenizer": None,
-        "bpe": None,
-        "optimizer": "nag",
-        "lr_scheduler": "fixed",
-        "task": "speech_recognition",
-        "num_workers": 0,
-        "skip_invalid_size_inputs_valid_test": False,
-        "max_tokens": 10000000,
-        "max_sentences": None,
-        "required_batch_size_multiple": 8,
-        "dataset_impl": None,
-        "gen_subset": "test",
-        "num_shards": 1,
-        "shard_id": 0,
-        "remove_bpe": None,
-        "quiet": False,
-        "model_overrides": "{}",
-        "results_path": None,
-        "beam": 40,
-        "nbest": 1,
-        "max_len_a": 0,
-        "max_len_b": 200,
-        "min_len": 1,
-        "match_source_len": False,
-        "no_early_stop": False,
-        "unnormalized": False,
-        "no_beamable_mm": False,
-        "lenpen": 1,
-        "unkpen": 0,
-        "replace_unk": None,
-        "sacrebleu": False,
-        "score_reference": False,
-        "prefix_size": 0,
-        "no_repeat_ngram_size": 0,
-        "sampling": False,
-        "sampling_topk": -1,
-        "sampling_topp": -1.0,
-        "temperature": 1.0,
-        "diverse_beam_groups": -1,
-        "diverse_beam_strength": 0.5,
-        "print_alignment": False,
-        "ctc": False,
-        "rnnt": False,
-        "kspmodel": None,
-        "wfstlm": None,
-        "rnnt_decoding_type": "greedy",
-        "lm_weight": 0.2,
-        "rnnt_len_penalty": -0.5,
-        "momentum": 0.99,
-        "weight_decay": 0.0,
-        "force_anneal": None,
-        "lr_shrink": 0.1,
-        "warmup_updates": 0,
-    }
-    arguments_dict["path"] = os.environ.get("ASR_MODEL_PATH", None)
-    arguments_dict["input_file"] = os.environ.get("ASR_INPUT_FILE", None)
-    arguments_dict["data"] = os.environ.get("ASR_DATA_PATH", None)
-    arguments_dict["user_dir"] = os.environ.get("ASR_USER_DIR", None)
-    args = argparse.Namespace(**arguments_dict)
-    def test_transcribe_file(self):
-        task, generator, models, sp, tgt_dict = setup_asr(self.args, self.logger)
-        _, transcription = transcribe_file(self.args, task, generator, models, sp, tgt_dict)
-        expected_transcription = [["THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG"]]
-        self.assertEqual(transcription, expected_transcription, msg=str(transcription))
-if __name__ == "__main__":
-    unittest.main()