Commit 9dcc7a15 authored by flyingdown

init v0.10.0

parent db2b0b79
torchaudio.datasets
====================
All datasets are subclasses of :class:`torch.utils.data.Dataset`
and have ``__getitem__`` and ``__len__`` methods implemented.
Hence, they can all be passed to a :class:`torch.utils.data.DataLoader`
which can load multiple samples in parallel using ``torch.multiprocessing`` workers.
For example: ::
yesno_data = torchaudio.datasets.YESNO('.', download=True)
data_loader = torch.utils.data.DataLoader(yesno_data,
batch_size=1,
shuffle=True,
num_workers=args.nThreads)
The following datasets are available:
.. contents:: Datasets
:local:
All the datasets share a nearly identical API. They all have two common arguments:
``transform`` and ``target_transform``, which transform the input and the target, respectively.
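For example, indexing the ``YESNO`` dataset returns a tuple of waveform, sample rate, and labels (a sketch; the exact fields returned by ``__getitem__`` vary per dataset): ::

    import torchaudio

    yesno_data = torchaudio.datasets.YESNO('.', download=True)
    waveform, sample_rate, labels = yesno_data[0]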
.. currentmodule:: torchaudio.datasets
CMUARCTIC
~~~~~~~~~
.. autoclass:: CMUARCTIC
:members:
:special-members: __getitem__
CMUDict
~~~~~~~~~
.. autoclass:: CMUDict
:members:
:special-members: __getitem__
COMMONVOICE
~~~~~~~~~~~
.. autoclass:: COMMONVOICE
:members:
:special-members: __getitem__
GTZAN
~~~~~
.. autoclass:: GTZAN
:members:
:special-members: __getitem__
LIBRISPEECH
~~~~~~~~~~~
.. autoclass:: LIBRISPEECH
:members:
:special-members: __getitem__
LIBRITTS
~~~~~~~~
.. autoclass:: LIBRITTS
:members:
:special-members: __getitem__
LJSPEECH
~~~~~~~~
.. autoclass:: LJSPEECH
:members:
:special-members: __getitem__
SPEECHCOMMANDS
~~~~~~~~~~~~~~
.. autoclass:: SPEECHCOMMANDS
:members:
:special-members: __getitem__
TEDLIUM
~~~~~~~~~~~~~~
.. autoclass:: TEDLIUM
:members:
:special-members: __getitem__
VCTK
~~~~
.. autoclass:: VCTK
:members:
:special-members: __getitem__
VCTK_092
~~~~~~~~
.. autoclass:: VCTK_092
:members:
:special-members: __getitem__
YESNO
~~~~~
.. autoclass:: YESNO
:members:
:special-members: __getitem__
.. role:: hidden
:class: hidden-section
torchaudio.functional
=====================
.. currentmodule:: torchaudio.functional
Functions to perform common audio operations.
:hidden:`Utility`
~~~~~~~~~~~~~~~~~
amplitude_to_DB
---------------
.. autofunction:: amplitude_to_DB
DB_to_amplitude
---------------
.. autofunction:: DB_to_amplitude
create_fb_matrix
----------------
.. autofunction:: create_fb_matrix
melscale_fbanks
---------------
.. autofunction:: melscale_fbanks
linear_fbanks
-------------
.. autofunction:: linear_fbanks
create_dct
----------
.. autofunction:: create_dct
mask_along_axis
---------------
.. autofunction:: mask_along_axis
mask_along_axis_iid
-------------------
.. autofunction:: mask_along_axis_iid
mu_law_encoding
---------------
.. autofunction:: mu_law_encoding
mu_law_decoding
---------------
.. autofunction:: mu_law_decoding
apply_codec
-----------
.. autofunction:: apply_codec
resample
--------
.. autofunction:: resample
:hidden:`Complex Utility`
~~~~~~~~~~~~~~~~~~~~~~~~~
Utilities for pseudo-complex tensors. These functions do not operate on the native complex dtypes (such as ``torch.complex64``); they expect real-valued tensors with an extra dimension of size 2 at the end holding the real and imaginary parts.
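For example, a spectrogram in this pseudo-complex layout and its magnitude (a sketch; the shapes are illustrative): ::

    import torch
    import torchaudio

    # Real-valued tensor whose last dimension holds the (real, imaginary) parts.
    specgram = torch.randn(1, 257, 100, 2)
    magnitude = torchaudio.functional.complex_norm(specgram)  # shape: (1, 257, 100)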
angle
-----
.. autofunction:: angle
complex_norm
------------
.. autofunction:: complex_norm
magphase
--------
.. autofunction:: magphase
:hidden:`Filtering`
~~~~~~~~~~~~~~~~~~~
allpass_biquad
--------------
.. autofunction:: allpass_biquad
band_biquad
-----------
.. autofunction:: band_biquad
bandpass_biquad
---------------
.. autofunction:: bandpass_biquad
bandreject_biquad
-----------------
.. autofunction:: bandreject_biquad
bass_biquad
-----------
.. autofunction:: bass_biquad
biquad
------
.. autofunction:: biquad
contrast
--------
.. autofunction:: contrast
dcshift
-------
.. autofunction:: dcshift
deemph_biquad
-------------
.. autofunction:: deemph_biquad
dither
------
.. autofunction:: dither
equalizer_biquad
----------------
.. autofunction:: equalizer_biquad
filtfilt
--------
.. autofunction:: filtfilt
flanger
-------
.. autofunction:: flanger
gain
----
.. autofunction:: gain
highpass_biquad
---------------
.. autofunction:: highpass_biquad
lfilter
-------
.. autofunction:: lfilter
lowpass_biquad
--------------
.. autofunction:: lowpass_biquad
overdrive
---------
.. autofunction:: overdrive
phaser
------
.. autofunction:: phaser
riaa_biquad
-----------
.. autofunction:: riaa_biquad
treble_biquad
-------------
.. autofunction:: treble_biquad
:hidden:`Feature Extractions`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
:hidden:`vad`
-------------
.. autofunction:: vad
:hidden:`spectrogram`
---------------------
.. autofunction:: spectrogram
:hidden:`inverse_spectrogram`
-----------------------------
.. autofunction:: inverse_spectrogram
:hidden:`griffinlim`
--------------------
.. autofunction:: griffinlim
:hidden:`phase_vocoder`
-----------------------
.. autofunction:: phase_vocoder
:hidden:`pitch_shift`
---------------------
.. autofunction:: pitch_shift
:hidden:`compute_deltas`
------------------------
.. autofunction:: compute_deltas
:hidden:`detect_pitch_frequency`
--------------------------------
.. autofunction:: detect_pitch_frequency
:hidden:`sliding_window_cmn`
----------------------------
.. autofunction:: sliding_window_cmn
:hidden:`compute_kaldi_pitch`
-----------------------------
.. autofunction:: compute_kaldi_pitch
:hidden:`spectral_centroid`
---------------------------
.. autofunction:: spectral_centroid
:hidden:`Loss`
~~~~~~~~~~~~~~
rnnt_loss
---------
.. autofunction:: rnnt_loss
:hidden:`Metric`
~~~~~~~~~~~~~~~~
edit_distance
-------------
.. autofunction:: edit_distance
References
~~~~~~~~~~
.. footbibliography::
torchaudio
==========
This library is part of the `PyTorch
<http://pytorch.org/>`_ project. PyTorch is an open source
machine learning framework.
Features described in this documentation are classified by release status:
*Stable:* These features will be maintained long-term and there should generally
be no major performance limitations or gaps in documentation.
We also expect to maintain backwards compatibility (although
breaking changes can happen and notice will be given one release ahead
of time).
*Beta:* Features are tagged as Beta because the API may change based on
user feedback, because the performance needs to improve, or because
coverage across operators is not yet complete. For Beta features, we are
committing to seeing the feature through to the Stable classification.
We are not, however, committing to backwards compatibility.
*Prototype:* These features are typically not available as part of
binary distributions like PyPI or Conda, except sometimes behind run-time
flags, and are at an early stage for feedback and testing.
The :mod:`torchaudio` package consists of I/O, popular datasets and common audio transformations.
.. toctree::
:maxdepth: 2
:caption: Package Reference
torchaudio
backend
functional
transforms
datasets
models
pipelines
sox_effects
compliance.kaldi
kaldi_io
utils
.. toctree::
:maxdepth: 1
:caption: PyTorch Libraries
PyTorch <https://pytorch.org/docs>
torchaudio <https://pytorch.org/audio>
torchtext <https://pytorch.org/text>
torchvision <https://pytorch.org/vision>
TorchElastic <https://pytorch.org/elastic/>
TorchServe <https://pytorch.org/serve>
PyTorch on XLA Devices <http://pytorch.org/xla/>
.. role:: hidden
:class: hidden-section
torchaudio.kaldi_io
======================
.. currentmodule:: torchaudio.kaldi_io
To use this module, the dependency kaldi_io_ needs to be installed.
This is a light wrapper around ``kaldi_io`` that returns :class:`torch.Tensor`.
.. _kaldi_io: https://github.com/vesis84/kaldi-io-for-python
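A minimal usage sketch (``'feats.ark'`` is a placeholder path to an existing Kaldi archive): ::

    import torchaudio.kaldi_io as kaldi_io

    # Iterate over (utterance_id, torch.Tensor) pairs stored in the archive.
    for key, matrix in kaldi_io.read_mat_ark('feats.ark'):
        print(key, matrix.shape)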
Vectors
-------
:hidden:`read_vec_int_ark`
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: read_vec_int_ark
:hidden:`read_vec_flt_scp`
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: read_vec_flt_scp
:hidden:`read_vec_flt_ark`
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: read_vec_flt_ark
Matrices
--------
:hidden:`read_mat_scp`
~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: read_mat_scp
:hidden:`read_mat_ark`
~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: read_mat_ark
.. role:: hidden
:class: hidden-section
torchaudio.models
=================
.. currentmodule:: torchaudio.models
The models subpackage contains definitions of models for addressing common audio tasks.
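For example, models can be instantiated directly and used like any :class:`torch.nn.Module` (a sketch; the constructor arguments and shapes below are illustrative): ::

    import torch
    import torchaudio

    model = torchaudio.models.Wav2Letter(num_classes=40)
    waveform = torch.randn(1, 1, 16000)   # (batch, channel, time)
    logits = model(waveform)              # (batch, num_classes, frames)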
ConvTasNet
~~~~~~~~~~
.. autoclass:: ConvTasNet
.. automethod:: forward
DeepSpeech
~~~~~~~~~~
.. autoclass:: DeepSpeech
.. automethod:: forward
Tacotron2
~~~~~~~~~
.. autoclass:: Tacotron2
.. automethod:: forward
.. automethod:: infer
Wav2Letter
~~~~~~~~~~
.. autoclass:: Wav2Letter
.. automethod:: forward
Wav2Vec2.0 / HuBERT
~~~~~~~~~~~~~~~~~~~
Model
-----
Wav2Vec2Model
^^^^^^^^^^^^^
.. autoclass:: Wav2Vec2Model
.. automethod:: extract_features
.. automethod:: forward
Factory Functions
-----------------
wav2vec2_model
^^^^^^^^^^^^^^
.. autofunction:: wav2vec2_model
wav2vec2_base
^^^^^^^^^^^^^
.. autofunction:: wav2vec2_base
wav2vec2_large
^^^^^^^^^^^^^^
.. autofunction:: wav2vec2_large
wav2vec2_large_lv60k
^^^^^^^^^^^^^^^^^^^^
.. autofunction:: wav2vec2_large_lv60k
hubert_base
^^^^^^^^^^^
.. autofunction:: hubert_base
hubert_large
^^^^^^^^^^^^
.. autofunction:: hubert_large
hubert_xlarge
^^^^^^^^^^^^^
.. autofunction:: hubert_xlarge
Utility Functions
-----------------
.. currentmodule:: torchaudio.models.wav2vec2.utils
import_huggingface_model
^^^^^^^^^^^^^^^^^^^^^^^^
.. autofunction:: import_huggingface_model
import_fairseq_model
^^^^^^^^^^^^^^^^^^^^
.. autofunction:: import_fairseq_model
.. currentmodule:: torchaudio.models
WaveRNN
~~~~~~~
.. autoclass:: WaveRNN
.. automethod:: forward
.. automethod:: infer
References
~~~~~~~~~~
.. footbibliography::
torchaudio.pipelines
====================
.. currentmodule:: torchaudio.pipelines
The pipelines subpackage contains APIs for accessing models with pretrained weights, along with information and helper functions associated with those pretrained weights.
wav2vec 2.0 / HuBERT - Representation Learning
----------------------------------------------
.. autoclass:: Wav2Vec2Bundle
:members: sample_rate
.. automethod:: get_model
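A usage sketch (``HUBERT_BASE`` is one of the bundles listed below; the pretrained weights are downloaded on first use): ::

    import torch
    import torchaudio

    bundle = torchaudio.pipelines.HUBERT_BASE
    model = bundle.get_model()
    waveform = torch.randn(1, int(bundle.sample_rate))  # one second of dummy audio
    features, _ = model.extract_features(waveform)      # list of per-layer feature tensors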
WAV2VEC2_BASE
~~~~~~~~~~~~~
.. container:: py attribute
.. autodata:: WAV2VEC2_BASE
:no-value:
WAV2VEC2_LARGE
~~~~~~~~~~~~~~
.. container:: py attribute
.. autodata:: WAV2VEC2_LARGE
:no-value:
WAV2VEC2_LARGE_LV60K
~~~~~~~~~~~~~~~~~~~~
.. container:: py attribute
.. autodata:: WAV2VEC2_LARGE_LV60K
:no-value:
WAV2VEC2_XLSR53
~~~~~~~~~~~~~~~
.. container:: py attribute
.. autodata:: WAV2VEC2_XLSR53
:no-value:
HUBERT_BASE
~~~~~~~~~~~
.. container:: py attribute
.. autodata:: HUBERT_BASE
:no-value:
HUBERT_LARGE
~~~~~~~~~~~~
.. container:: py attribute
.. autodata:: HUBERT_LARGE
:no-value:
HUBERT_XLARGE
~~~~~~~~~~~~~
.. container:: py attribute
.. autodata:: HUBERT_XLARGE
:no-value:
wav2vec 2.0 / HuBERT - Fine-tuned ASR
-------------------------------------
.. autoclass:: Wav2Vec2ASRBundle
:members: sample_rate
.. automethod:: get_model
.. automethod:: get_labels
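A transcription sketch with greedy CTC decoding (simplified; ``'speech.wav'`` is a placeholder path, and post-processing of CTC blanks and repeats is omitted): ::

    import torchaudio

    bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
    model = bundle.get_model()
    labels = bundle.get_labels()

    waveform, sr = torchaudio.load('speech.wav')
    if sr != bundle.sample_rate:
        waveform = torchaudio.functional.resample(waveform, sr, bundle.sample_rate)

    emission, _ = model(waveform)            # (batch, frames, num_labels)
    best = emission[0].argmax(dim=-1)        # frame-wise label indices
    raw = ''.join(labels[i] for i in best)   # still contains CTC blanks and repeats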
WAV2VEC2_ASR_BASE_10M
~~~~~~~~~~~~~~~~~~~~~
.. container:: py attribute
.. autodata:: WAV2VEC2_ASR_BASE_10M
:no-value:
WAV2VEC2_ASR_BASE_100H
~~~~~~~~~~~~~~~~~~~~~~
.. container:: py attribute
.. autodata:: WAV2VEC2_ASR_BASE_100H
:no-value:
WAV2VEC2_ASR_BASE_960H
~~~~~~~~~~~~~~~~~~~~~~
.. container:: py attribute
.. autodata:: WAV2VEC2_ASR_BASE_960H
:no-value:
WAV2VEC2_ASR_LARGE_10M
~~~~~~~~~~~~~~~~~~~~~~
.. container:: py attribute
.. autodata:: WAV2VEC2_ASR_LARGE_10M
:no-value:
WAV2VEC2_ASR_LARGE_100H
~~~~~~~~~~~~~~~~~~~~~~~
.. container:: py attribute
.. autodata:: WAV2VEC2_ASR_LARGE_100H
:no-value:
WAV2VEC2_ASR_LARGE_960H
~~~~~~~~~~~~~~~~~~~~~~~
.. container:: py attribute
.. autodata:: WAV2VEC2_ASR_LARGE_960H
:no-value:
WAV2VEC2_ASR_LARGE_LV60K_10M
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. container:: py attribute
.. autodata:: WAV2VEC2_ASR_LARGE_LV60K_10M
:no-value:
WAV2VEC2_ASR_LARGE_LV60K_100H
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. container:: py attribute
.. autodata:: WAV2VEC2_ASR_LARGE_LV60K_100H
:no-value:
WAV2VEC2_ASR_LARGE_LV60K_960H
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. container:: py attribute
.. autodata:: WAV2VEC2_ASR_LARGE_LV60K_960H
:no-value:
HUBERT_ASR_LARGE
~~~~~~~~~~~~~~~~
.. container:: py attribute
.. autodata:: HUBERT_ASR_LARGE
:no-value:
HUBERT_ASR_XLARGE
~~~~~~~~~~~~~~~~~
.. container:: py attribute
.. autodata:: HUBERT_ASR_XLARGE
:no-value:
Tacotron2 Text-To-Speech
------------------------
Tacotron2TTSBundle
~~~~~~~~~~~~~~~~~~
.. autoclass:: Tacotron2TTSBundle
.. automethod:: get_text_processor
.. automethod:: get_tacotron2
.. automethod:: get_vocoder
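An end-to-end synthesis sketch (the bundle name is one of those listed below): ::

    import torchaudio

    bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
    processor = bundle.get_text_processor()
    tacotron2 = bundle.get_tacotron2()
    vocoder = bundle.get_vocoder()

    tokens, lengths = processor("Hello world!")
    spec, spec_lengths, _ = tacotron2.infer(tokens, lengths)
    waveforms, wave_lengths = vocoder(spec, spec_lengths)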
Tacotron2TTSBundle - TextProcessor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: torchaudio.pipelines::Tacotron2TTSBundle.TextProcessor
:members: tokens
:special-members: __call__
Tacotron2TTSBundle - Vocoder
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: torchaudio.pipelines::Tacotron2TTSBundle.Vocoder
:members: sample_rate
:special-members: __call__
TACOTRON2_WAVERNN_PHONE_LJSPEECH
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. container:: py attribute
.. autodata:: TACOTRON2_WAVERNN_PHONE_LJSPEECH
:no-value:
TACOTRON2_WAVERNN_CHAR_LJSPEECH
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. container:: py attribute
.. autodata:: TACOTRON2_WAVERNN_CHAR_LJSPEECH
:no-value:
TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. container:: py attribute
.. autodata:: TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH
:no-value:
TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. container:: py attribute
.. autodata:: TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH
:no-value:
References
----------
.. footbibliography::
@article{specaugment,
title={SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition},
url={http://dx.doi.org/10.21437/Interspeech.2019-2680},
DOI={10.21437/interspeech.2019-2680},
journal={Interspeech 2019},
publisher={ISCA},
author={Park, Daniel S. and Chan, William and Zhang, Yu and Chiu, Chung-Cheng and Zoph, Barret and Cubuk, Ekin D. and Le, Quoc V.},
year={2019},
month={Sep}
}
@misc{ljspeech17,
author = {Keith Ito and Linda Johnson},
title = {The LJ Speech Dataset},
howpublished = {\url{https://keithito.com/LJ-Speech-Dataset/}},
year = {2017}
}
@misc{conneau2020unsupervised,
title={Unsupervised Cross-lingual Representation Learning for Speech Recognition},
author={Alexis Conneau and Alexei Baevski and Ronan Collobert and Abdelrahman Mohamed and Michael Auli},
year={2020},
eprint={2006.13979},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@inproceedings{Gales2014SpeechRA,
title={Speech recognition and keyword spotting for low-resource languages: Babel project research at CUED},
author={Mark John Francis Gales and Kate Knill and Anton Ragni and Shakti Prasad Rath},
booktitle={SLTU},
year={2014}
}
@misc{ardila2020common,
title={Common Voice: A Massively-Multilingual Speech Corpus},
author={Rosana Ardila and Megan Branson and Kelly Davis and Michael Henretty and Michael Kohler and Josh Meyer and Reuben Morais and Lindsay Saunders and Francis M. Tyers and Gregor Weber},
year={2020},
eprint={1912.06670},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@article{Pratap_2020,
title={MLS: A Large-Scale Multilingual Dataset for Speech Research},
url={http://dx.doi.org/10.21437/Interspeech.2020-2826},
DOI={10.21437/interspeech.2020-2826},
journal={Interspeech 2020},
publisher={ISCA},
author={Pratap, Vineel and Xu, Qiantong and Sriram, Anuroop and Synnaeve, Gabriel and Collobert, Ronan},
year={2020},
month={Oct}
}
@INPROCEEDINGS{librilight,
author={J. {Kahn} and M. {Rivière} and W. {Zheng} and E. {Kharitonov} and Q. {Xu} and P. E. {Mazaré} and J. {Karadayi} and V. {Liptchinsky} and R. {Collobert} and C. {Fuegen} and T. {Likhomanenko} and G. {Synnaeve} and A. {Joulin} and A. {Mohamed} and E. {Dupoux}},
booktitle={ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
title={Libri-Light: A Benchmark for ASR with Limited or No Supervision},
year={2020},
pages={7669-7673},
note = {\url{https://github.com/facebookresearch/libri-light}},
}
@INPROCEEDINGS{7178964,
author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},
booktitle={2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
title={Librispeech: An ASR corpus based on public domain audio books},
year={2015},
volume={},
number={},
pages={5206-5210},
doi={10.1109/ICASSP.2015.7178964}
}
@inproceedings{ott2019fairseq,
title = {fairseq: A Fast, Extensible Toolkit for Sequence Modeling},
author = {Myle Ott and Sergey Edunov and Alexei Baevski and Angela Fan and Sam Gross and Nathan Ng and David Grangier and Michael Auli},
booktitle = {Proceedings of NAACL-HLT 2019: Demonstrations},
year = {2019},
}
@misc{baevski2020wav2vec,
title={wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations},
author={Alexei Baevski and Henry Zhou and Abdelrahman Mohamed and Michael Auli},
year={2020},
eprint={2006.11477},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{hsu2021hubert,
title={HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units},
author={Wei-Ning Hsu and Benjamin Bolte and Yao-Hung Hubert Tsai and Kushal Lakhotia and Ruslan Salakhutdinov and Abdelrahman Mohamed},
year={2021},
eprint={2106.07447},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{hannun2014deep,
title={Deep Speech: Scaling up end-to-end speech recognition},
author={Awni Hannun and Carl Case and Jared Casper and Bryan Catanzaro and Greg Diamos and Erich Elsen and Ryan Prenger and Sanjeev Satheesh and Shubho Sengupta and Adam Coates and Andrew Y. Ng},
year={2014},
eprint={1412.5567},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{graves2012sequence,
title={Sequence Transduction with Recurrent Neural Networks},
author={Alex Graves},
year={2012},
eprint={1211.3711},
archivePrefix={arXiv},
primaryClass={cs.NE}
}
@misc{collobert2016wav2letter,
title={Wav2Letter: an End-to-End ConvNet-based Speech Recognition System},
author={Ronan Collobert and Christian Puhrsch and Gabriel Synnaeve},
year={2016},
eprint={1609.03193},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{kalchbrenner2018efficient,
title={Efficient Neural Audio Synthesis},
author={Nal Kalchbrenner and Erich Elsen and Karen Simonyan and Seb Noury and Norman Casagrande and Edward Lockhart and Florian Stimberg and Aaron van den Oord and Sander Dieleman and Koray Kavukcuoglu},
year={2018},
eprint={1802.08435},
archivePrefix={arXiv},
primaryClass={cs.SD}
}
@article{Luo_2019,
title={Conv-TasNet: Surpassing Ideal Time–Frequency Magnitude Masking for Speech Separation},
volume={27},
ISSN={2329-9304},
url={http://dx.doi.org/10.1109/TASLP.2019.2915167},
DOI={10.1109/taslp.2019.2915167},
number={8},
journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
publisher={Institute of Electrical and Electronics Engineers (IEEE)},
author={Luo, Yi and Mesgarani, Nima},
year={2019},
month={Aug},
pages={1256–1266}
}
@InProceedings{ brian_mcfee-proc-scipy-2015,
author = { {B}rian {M}c{F}ee and {C}olin {R}affel and {D}awen {L}iang and {D}aniel {P}.{W}. {E}llis and {M}att {M}c{V}icar and {E}ric {B}attenberg and {O}riol {N}ieto },
title = { librosa: {A}udio and {M}usic {S}ignal {A}nalysis in {P}ython },
booktitle = { {P}roceedings of the 14th {P}ython in {S}cience {C}onference },
pages = { 18 - 24 },
year = { 2015 },
editor = { {K}athryn {H}uff and {J}ames {B}ergstra },
doi = { 10.25080/Majora-7b98e3ed-003 }
}
@INPROCEEDINGS{6701851,
author={Perraudin, Nathanaël and Balazs, Peter and Søndergaard, Peter L.},
booktitle={2013 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics},
title={A fast Griffin-Lim algorithm},
year={2013},
volume={},
number={},
pages={1-4},
doi={10.1109/WASPAA.2013.6701851}}
@INPROCEEDINGS{1172092,
author={Griffin, D. and Jae Lim},
booktitle={ICASSP '83. IEEE International Conference on Acoustics, Speech, and Signal Processing},
title={Signal estimation from modified short-time Fourier transform},
year={1983},
volume={8},
number={},
pages={804-807},
doi={10.1109/ICASSP.1983.1172092}}
@INPROCEEDINGS{6854049,
author={Ghahremani, Pegah and BabaAli, Bagher and Povey, Daniel and Riedhammer, Korbinian and Trmal, Jan and Khudanpur, Sanjeev},
booktitle={2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
title={A pitch extraction algorithm tuned for automatic speech recognition},
year={2014},
volume={},
number={},
pages={2494-2498},
doi={10.1109/ICASSP.2014.6854049}}
@inproceedings{shen2018natural,
title={Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions},
author={Shen, Jonathan and Pang, Ruoming and Weiss, Ron J and Schuster, Mike and Jaitly, Navdeep and Yang, Zongheng and Chen, Zhifeng and Zhang, Yu and Wang, Yuxuan and Skerrv-Ryan, Rj and others},
booktitle={2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages={4779--4783},
year={2018},
organization={IEEE}
}
@inproceedings{souden2009optimal,
title={On optimal frequency-domain multichannel linear filtering for noise reduction},
author={Souden, Mehrez and Benesty, Jacob and Affes, Sofiene},
booktitle={IEEE Transactions on audio, speech, and language processing},
volume={18},
number={2},
pages={260--276},
year={2009},
publisher={IEEE}
}
@inproceedings{higuchi2016robust,
title={Robust MVDR beamforming using time-frequency masks for online/offline ASR in noise},
author={Higuchi, Takuya and Ito, Nobutaka and Yoshioka, Takuya and Nakatani, Tomohiro},
booktitle={2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages={5210--5214},
year={2016},
organization={IEEE}
}
@article{mises1929praktische,
title={Praktische Verfahren der Gleichungsaufl{\"o}sung.},
author={Mises, RV and Pollaczek-Geiringer, Hilda},
journal={ZAMM-Journal of Applied Mathematics and Mechanics/Zeitschrift f{\"u}r Angewandte Mathematik und Mechanik},
volume={9},
number={1},
pages={58--77},
year={1929},
publisher={Wiley Online Library}
}
@article{higuchi2017online,
title={Online MVDR beamformer based on complex Gaussian mixture model with spatial prior for noise robust ASR},
author={Higuchi, Takuya and Ito, Nobutaka and Araki, Shoko and Yoshioka, Takuya and Delcroix, Marc and Nakatani, Tomohiro},
journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
volume={25},
number={4},
pages={780--793},
year={2017},
publisher={IEEE}
}
.. _sox_effects:
torchaudio.sox_effects
======================
.. currentmodule:: torchaudio.sox_effects
Resource initialization / shutdown
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: init_sox_effects
.. autofunction:: shutdown_sox_effects
Listing supported effects
~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: effect_names
Applying effects
~~~~~~~~~~~~~~~~
Apply a SoX effects chain to a :class:`torch.Tensor`, or apply it to a file and load the result as a :class:`torch.Tensor`.
Applying effects on Tensor
--------------------------
.. autofunction:: apply_effects_tensor
Applying effects on file
------------------------
.. autofunction:: apply_effects_file
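For example, applying an effects chain to a file (a sketch; ``'input.wav'`` is a placeholder path and the effect names are standard SoX effects): ::

    import torchaudio

    effects = [
        ['gain', '-n'],        # normalize the gain
        ['rate', '16000'],     # resample to 16 kHz
    ]
    waveform, sample_rate = torchaudio.sox_effects.apply_effects_file('input.wav', effects)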
torchaudio
==========
I/O functionalities
~~~~~~~~~~~~~~~~~~~
Audio I/O functions are implemented in the :ref:`torchaudio.backend<backend>` module, but for ease of use, the following functions are also made available on the :mod:`torchaudio` module. Multiple backends are available, and you can switch between them with :func:`set_audio_backend`.
Refer to :ref:`backend` for details.
.. function:: torchaudio.info(filepath: str, ...)
Fetch metadata of an audio file. Refer to :ref:`backend` for details.
.. function:: torchaudio.load(filepath: str, ...)
Load an audio file into a :class:`torch.Tensor`. Refer to :ref:`backend` for details.
.. function:: torchaudio.save(filepath: str, src: torch.Tensor, sample_rate: int, ...)
Save a :class:`torch.Tensor` to an audio file. Refer to :ref:`backend` for details.
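A minimal I/O sketch (``'foo.wav'`` is a placeholder path): ::

    import torchaudio

    metadata = torchaudio.info('foo.wav')
    waveform, sample_rate = torchaudio.load('foo.wav')
    torchaudio.save('foo_copy.wav', waveform, sample_rate)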
.. currentmodule:: torchaudio
Backend Utilities
~~~~~~~~~~~~~~~~~
.. autofunction:: list_audio_backends
.. autofunction:: get_audio_backend
.. autofunction:: set_audio_backend
.. role:: hidden
:class: hidden-section
torchaudio.transforms
======================
.. currentmodule:: torchaudio.transforms
Transforms are common audio transformations. They can be chained together using :class:`torch.nn.Sequential`.
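For example (a sketch; the parameter values are illustrative): ::

    import torch
    import torchaudio

    transform = torch.nn.Sequential(
        torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=80),
        torchaudio.transforms.AmplitudeToDB(),
    )
    waveform = torch.randn(1, 16000)
    mel_db = transform(waveform)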
:hidden:`Utility`
~~~~~~~~~~~~~~~~~~
:hidden:`AmplitudeToDB`
-----------------------
.. autoclass:: AmplitudeToDB
.. automethod:: forward
:hidden:`MelScale`
------------------
.. autoclass:: MelScale
.. automethod:: forward
:hidden:`InverseMelScale`
-------------------------
.. autoclass:: InverseMelScale
.. automethod:: forward
:hidden:`MuLawEncoding`
-----------------------
.. autoclass:: MuLawEncoding
.. automethod:: forward
:hidden:`MuLawDecoding`
-----------------------
.. autoclass:: MuLawDecoding
.. automethod:: forward
:hidden:`Resample`
------------------
.. autoclass:: Resample
.. automethod:: forward
:hidden:`FrequencyMasking`
--------------------------
.. autoclass:: FrequencyMasking
.. automethod:: forward
:hidden:`TimeMasking`
---------------------
.. autoclass:: TimeMasking
.. automethod:: forward
:hidden:`TimeStretch`
---------------------
.. autoclass:: TimeStretch
.. automethod:: forward
:hidden:`Fade`
--------------
.. autoclass:: Fade
.. automethod:: forward
:hidden:`Vol`
-------------
.. autoclass:: Vol
.. automethod:: forward
:hidden:`Complex Utility`
~~~~~~~~~~~~~~~~~~~~~~~~~
:hidden:`ComplexNorm`
---------------------
.. autoclass:: ComplexNorm
.. automethod:: forward
:hidden:`Feature Extractions`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
:hidden:`Spectrogram`
---------------------
.. autoclass:: Spectrogram
.. automethod:: forward
:hidden:`InverseSpectrogram`
----------------------------
.. autoclass:: InverseSpectrogram
.. automethod:: forward
:hidden:`MelSpectrogram`
------------------------
.. autoclass:: MelSpectrogram
.. automethod:: forward
:hidden:`GriffinLim`
--------------------
.. autoclass:: GriffinLim
.. automethod:: forward
:hidden:`MFCC`
--------------
.. autoclass:: MFCC
.. automethod:: forward
:hidden:`LFCC`
--------------
.. autoclass:: LFCC
.. automethod:: forward
:hidden:`ComputeDeltas`
-----------------------
.. autoclass:: ComputeDeltas
.. automethod:: forward
:hidden:`PitchShift`
--------------------
.. autoclass:: PitchShift
.. automethod:: forward
:hidden:`SlidingWindowCmn`
--------------------------
.. autoclass:: SlidingWindowCmn
.. automethod:: forward
:hidden:`SpectralCentroid`
--------------------------
.. autoclass:: SpectralCentroid
.. automethod:: forward
:hidden:`Vad`
-------------
.. autoclass:: Vad
.. automethod:: forward
:hidden:`Loss`
~~~~~~~~~~~~~~
:hidden:`RNNTLoss`
------------------
.. autoclass:: RNNTLoss
.. automethod:: forward
:hidden:`Multi-channel`
~~~~~~~~~~~~~~~~~~~~~~~
:hidden:`PSD`
-------------
.. autoclass:: PSD
.. automethod:: forward
:hidden:`MVDR`
--------------
.. autoclass:: MVDR
.. automethod:: forward
References
~~~~~~~~~~
.. footbibliography::
torchaudio.utils
================
torchaudio.utils.sox_utils
~~~~~~~~~~~~~~~~~~~~~~~~~~
Utility module to configure libsox.
This affects functionality in the :ref:`Sox IO backend<sox_io_backend>` and :ref:`Sox Effects<sox_effects>`.
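A usage sketch (assuming the configuration helpers documented below, such as ``set_verbosity`` and ``list_effects``): ::

    from torchaudio.utils import sox_utils

    print(sox_utils.list_effects())   # names of effects available to sox_effects
    sox_utils.set_verbosity(1)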
.. automodule:: torchaudio.utils.sox_utils
:members:
{
"nbformat": 4,
"nbformat_minor": 2,
"metadata": {
"colab": {
"name": "Copy of Copy of torchaudio_MVDR_tutorial.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3.9.6 64-bit ('dev': conda)"
},
"language_info": {
"name": "python",
"version": "3.9.6",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"interpreter": {
"hash": "6a702c257b9a40163843ba760790c17a6ddd2abeef8febce55475eea4b92c28c"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"<a href=\"https://colab.research.google.com/github/nateanl/audio/blob/mvdr/examples/beamforming/MVDR_tutorial.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
],
"metadata": {
"id": "xheYDPUcYGbp"
}
},
{
"cell_type": "markdown",
"source": [
"This is a tutorial on how to apply MVDR beamforming by using [torchaudio](https://github.com/pytorch/audio)\n",
"-----------\n",
"\n",
"The multi-channel audio example is selected from [ConferencingSpeech](https://github.com/ConferencingSpeech/ConferencingSpeech2021) dataset. \n",
"\n",
"```\n",
"original filename: SSB07200001\\#noise-sound-bible-0038\\#7.86_6.16_3.00_3.14_4.84_134.5285_191.7899_0.4735\\#15217\\#25.16333303751458\\#0.2101221178590021.wav\n",
"```\n",
"\n",
"Note:\n",
"- You need to use the nightly torchaudio in order to use the MVDR and InverseSpectrogram modules.\n",
"\n",
"\n",
"Steps\n",
"\n",
"- Ideal Ratio Mask (IRM) is generated by dividing the clean/noise magnitude by the mixture magnitude.\n",
"- We test all three solutions (``ref_channel``, ``stv_evd``, ``stv_power``) of torchaudio's MVDR module.\n",
"- We test the single-channel and multi-channel masks for MVDR beamforming. The multi-channel mask is averaged along channel dimension when computing the covariance matrices of speech and noise, respectively."
],
"metadata": {
"id": "L6R0MXe5Wr19"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"!pip install --pre torchaudio -f https://download.pytorch.org/whl/nightly/torch_nightly.html --force"
],
"outputs": [],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "juO6PE9XLctD",
"outputId": "8777ba14-da99-4c18-d80f-b070ad9861af"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"import torch\n",
"import torchaudio\n",
"import IPython.display as ipd"
],
"outputs": [],
"metadata": {
"id": "T4u4unhFMMBG"
}
},
{
"cell_type": "markdown",
"source": [
"### Load audios of mixture, reverberated clean speech, and dry clean speech."
],
"metadata": {
"id": "bDILVXkeg2s3"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"!curl -LJO https://github.com/nateanl/torchaudio_mvdr_tutorial/raw/main/wavs/mix.wav\n",
"!curl -LJO https://github.com/nateanl/torchaudio_mvdr_tutorial/raw/main/wavs/reverb_clean.wav\n",
"!curl -LJO https://github.com/nateanl/torchaudio_mvdr_tutorial/raw/main/wavs/clean.wav"
],
"outputs": [],
"metadata": {
"id": "2XIyMa_VKv0c",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "404f46a6-e70c-4f80-af8d-d356408a9f18"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"mix, sr = torchaudio.load('mix.wav')\n",
"reverb_clean, sr2 = torchaudio.load('reverb_clean.wav')\n",
"clean, sr3 = torchaudio.load('clean.wav')\n",
"assert sr == sr2\n",
"noise = mix - reverb_clean"
],
"outputs": [],
"metadata": {
"id": "iErB6UhQPtD3"
}
},
{
"cell_type": "markdown",
"source": [
"## Note: The MVDR Module requires ``torch.cdouble`` dtype for noisy STFT. We need to convert the dtype of the waveforms to ``torch.double``"
],
"metadata": {
"id": "Aq-x_fo5VkwL"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"mix = mix.to(torch.double)\n",
"noise = noise.to(torch.double)\n",
"clean = clean.to(torch.double)\n",
"reverb_clean = reverb_clean.to(torch.double)"
],
"outputs": [],
"metadata": {
"id": "5c66pHcQV0P9"
}
},
{
"cell_type": "markdown",
"source": [
"### Initilize the Spectrogram and InverseSpectrogram modules"
],
"metadata": {
"id": "05D26we0V4P-"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"stft = torchaudio.transforms.Spectrogram(n_fft=1024, hop_length=256, return_complex=True, power=None)\n",
"istft = torchaudio.transforms.InverseSpectrogram(n_fft=1024, hop_length=256)"
],
"outputs": [],
"metadata": {
"id": "NcGhD7_TUKd1"
}
},
{
"cell_type": "markdown",
"source": [
"### Compute the complex-valued STFT of mixture, clean speech, and noise"
],
"metadata": {
"id": "-dlJcuSNUCgA"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"spec_mix = stft(mix)\n",
"spec_clean = stft(clean)\n",
"spec_reverb_clean = stft(reverb_clean)\n",
"spec_noise = stft(noise)"
],
"outputs": [],
"metadata": {
"id": "w1vO7w1BUKt4"
}
},
{
"cell_type": "markdown",
"source": [
"### Generate the Ideal Ratio Mask (IRM)\n",
"Note: we found using the mask directly peforms better than using the square root of it. This is slightly different from the definition of IRM."
],
"metadata": {
"id": "8SBchrDhURK1"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"def get_irms(spec_clean, spec_noise, spec_mix):\n",
" mag_mix = spec_mix.abs() ** 2\n",
" mag_clean = spec_clean.abs() ** 2\n",
" mag_noise = spec_noise.abs() ** 2\n",
" irm_speech = mag_clean / (mag_clean + mag_noise)\n",
" irm_noise = mag_noise / (mag_clean + mag_noise)\n",
"\n",
" return irm_speech, irm_noise"
],
"outputs": [],
"metadata": {
"id": "2gB63BoWUmHZ"
}
},
{
"cell_type": "markdown",
"source": [
"## Note: We use reverberant clean speech as the target here, you can also set it to dry clean speech"
],
"metadata": {
"id": "reGMDyNCaE7L"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"irm_speech, irm_noise = get_irms(spec_reverb_clean, spec_noise, spec_mix)"
],
"outputs": [],
"metadata": {
"id": "HSTCGy_5Uqzx"
}
},
{
"cell_type": "markdown",
"source": [
"### Apply MVDR beamforming by using multi-channel masks"
],
"metadata": {
"id": "1R5I_TmSUbS0"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"results_multi = {}\n",
"for solution in ['ref_channel', 'stv_evd', 'stv_power']:\n",
" mvdr = torchaudio.transforms.MVDR(ref_channel=0, solution=solution, multi_mask=True)\n",
" stft_est = mvdr(spec_mix, irm_speech, irm_noise)\n",
" est = istft(stft_est, length=mix.shape[-1])\n",
" results_multi[solution] = est"
],
"outputs": [],
"metadata": {
"id": "SiWFZgCbadz7"
}
},
{
"cell_type": "markdown",
"source": [
"### Apply MVDR beamforming by using single-channel masks \n",
"(We use the 1st channel as an example. The channel selection may depend on the design of the microphone array)"
],
"metadata": {
"id": "Ukez6_lcUfna"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"results_single = {}\n",
"for solution in ['ref_channel', 'stv_evd', 'stv_power']:\n",
" mvdr = torchaudio.transforms.MVDR(ref_channel=0, solution=solution, multi_mask=False)\n",
" stft_est = mvdr(spec_mix, irm_speech[0], irm_noise[0])\n",
" est = istft(stft_est, length=mix.shape[-1])\n",
" results_single[solution] = est"
],
"outputs": [],
"metadata": {
"id": "kLeNKsk-VLm5"
}
},
{
"cell_type": "markdown",
"source": [
"### Compute Si-SDR scores"
],
"metadata": {
"id": "uJjJNdYiUnf0"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"def si_sdr(estimate, reference, epsilon=1e-8):\n",
" estimate = estimate - estimate.mean()\n",
" reference = reference - reference.mean()\n",
" reference_pow = reference.pow(2).mean(axis=1, keepdim=True)\n",
" mix_pow = (estimate * reference).mean(axis=1, keepdim=True)\n",
" scale = mix_pow / (reference_pow + epsilon)\n",
"\n",
" reference = scale * reference\n",
" error = estimate - reference\n",
"\n",
" reference_pow = reference.pow(2)\n",
" error_pow = error.pow(2)\n",
"\n",
" reference_pow = reference_pow.mean(axis=1)\n",
" error_pow = error_pow.mean(axis=1)\n",
"\n",
" sisdr = 10 * torch.log10(reference_pow) - 10 * torch.log10(error_pow)\n",
" return sisdr.item()"
],
"outputs": [],
"metadata": {
"id": "MgmAJcyiU-FU"
}
},
{
"cell_type": "markdown",
"source": [
"### Single-channel mask results"
],
"metadata": {
"id": "3TCJEwTOUxci"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"for solution in results_single:\n",
" print(solution+\": \", si_sdr(results_single[solution][None,...], reverb_clean[0:1]))"
],
"outputs": [],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "NrUXXj98VVY7",
"outputId": "bc113347-70e3-47a9-8479-8aeeeca80abf"
}
},
{
"cell_type": "markdown",
"source": [
"### Multi-channel mask results"
],
"metadata": {
"id": "-7AnjM-gU3c8"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"for solution in results_multi:\n",
" print(solution+\": \", si_sdr(results_multi[solution][None,...], reverb_clean[0:1]))"
],
"outputs": [],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "S_VINTnlXobM",
"outputId": "234b5615-63e7-44d8-f816-a6cc05999e52"
}
},
{
"cell_type": "markdown",
"source": [
"### Display the mixture audio"
],
"metadata": {
"id": "_vOK8vgmU_UP"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"print(\"Mixture speech\")\n",
"ipd.Audio(mix[0], rate=16000)"
],
"outputs": [],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 92
},
"id": "QaKauQIHYctE",
"outputId": "674c7f9b-62a3-4298-81ac-d3ab1ee43cd7"
}
},
{
"cell_type": "markdown",
"source": [
"### Display the noise"
],
"metadata": {
"id": "R-QGGm87VFQI"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"print(\"Noise\")\n",
"ipd.Audio(noise[0], rate=16000)"
],
"outputs": [],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 92
},
"id": "l1WgzxIZYhlk",
"outputId": "7b100679-b4a0-47ff-b30b-9f4cb9dca3d1"
}
},
{
"cell_type": "markdown",
"source": [
"### Display the clean speech"
],
"metadata": {
"id": "P3kB-jzpVKKu"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"print(\"Clean speech\")\n",
"ipd.Audio(clean[0], rate=16000)"
],
"outputs": [],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 92
},
"id": "pwAWvlRAVJkT",
"outputId": "5e173a1b-2ba8-4797-8f3a-e41cbf05ac2b"
}
},
{
"cell_type": "markdown",
"source": [
"### Display the enhanced audios¶"
],
"metadata": {
"id": "RIlyzL1wVTnr"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"print(\"multi-channel mask, ref_channel solution\")\n",
"ipd.Audio(results_multi['ref_channel'], rate=16000)"
],
"outputs": [],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 92
},
"id": "M3YQsledVIQ5",
"outputId": "43d9ee34-6933-401b-baf9-e4cdb7d79b63"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"print(\"multi-channel mask, stv_evd solution\")\n",
"ipd.Audio(results_multi['stv_evd'], rate=16000)"
],
"outputs": [],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 92
},
"id": "UhYOHLvCVWBN",
"outputId": "761468ec-ebf9-4b31-ad71-bfa2e15fed37"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"print(\"multi-channel mask, stv_power solution\")\n",
"ipd.Audio(results_multi['stv_power'], rate=16000)"
],
"outputs": [],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 92
},
"id": "9dv8VDtCVXzd",
"outputId": "1ae61ea3-d3c4-479f-faad-7439f942aac1"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"print(\"single-channel mask, ref_channel solution\")\n",
"ipd.Audio(results_single['ref_channel'], rate=16000)"
],
"outputs": [],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 92
},
"id": "jCFUN890VZdh",
"outputId": "c0d2a928-5dd0-4584-b277-7838ac4a9e6b"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"print(\"single-channel mask, stv_evd solution\")\n",
"ipd.Audio(results_single['stv_evd'], rate=16000)"
],
"outputs": [],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 92
},
"id": "hzlzagsKVbAv",
"outputId": "96af9e37-82ca-4544-9c08-421fe222bde4"
}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"print(\"single-channel mask, stv_power solution\")\n",
"ipd.Audio(results_single['stv_power'], rate=16000)"
],
"outputs": [],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 92
},
"id": "A4igQpTnVctG",
"outputId": "cf968089-9274-4c1c-a1a5-32b220de0bf9"
}
}
]
}
# asr-demo
To run this demo, you need the following libraries:
- [python3](https://www.python.org/download/releases/3.0/)
- [pyaudio](https://people.csail.mit.edu/hubert/pyaudio/)
- [torchaudio](https://github.com/pytorch/audio/tree/master/torchaudio)
- [pytorch](https://pytorch.org/)
- [librosa](https://librosa.github.io/librosa/)
- [fairseq](https://github.com/pytorch/fairseq) (clone the github repository)
and the following models:
- [dictionary](https://download.pytorch.org/models/audio/dict.txt)
- [sentence piece model](https://download.pytorch.org/models/audio/spm.model)
- [model](https://download.pytorch.org/models/audio/checkpoint_avg_60_80.pt)
## Installation
We recommend that you use [conda](https://docs.conda.io/en/latest/miniconda.html) to install the dependencies when available.
```bash
# Assume that all commands are from the examples folder
cd examples
# Install dependencies
conda install -c pytorch torchaudio
conda install -c conda-forge librosa
conda install pyaudio
pip install sentencepiece
# Install fairseq from source
git clone https://github.com/pytorch/fairseq interactive_asr/fairseq
pushd interactive_asr/fairseq
export CFLAGS='-stdlib=libc++' # For Mac only
pip install --editable .
popd
# Install dictionary, sentence piece model, and model
wget -O interactive_asr/data/dict.txt https://download.pytorch.org/models/audio/dict.txt
wget -O interactive_asr/data/spm.model https://download.pytorch.org/models/audio/spm.model
wget -O interactive_asr/data/model.pt https://download.pytorch.org/models/audio/checkpoint_avg_60_80.pt
```
## Run
On a file
```bash
INPUT_FILE=interactive_asr/data/sample.wav
python -m interactive_asr.asr interactive_asr/data --input_file $INPUT_FILE --max-tokens 10000000 --nbest 1 \
--path interactive_asr/data/model.pt --beam 40 --task speech_recognition \
--user-dir interactive_asr/fairseq/examples/speech_recognition
```
Using a microphone
```bash
python -m interactive_asr.asr interactive_asr/data --max-tokens 10000000 --nbest 1 \
--path interactive_asr/data/model.pt --beam 40 --task speech_recognition \
--user-dir interactive_asr/fairseq/examples/speech_recognition
```
To run the test case associated with this example
```bash
ASR_MODEL_PATH=interactive_asr/data/model.pt \
ASR_INPUT_FILE=interactive_asr/data/sample.wav \
ASR_DATA_PATH=interactive_asr/data \
ASR_USER_DIR=interactive_asr/fairseq/examples/speech_recognition \
python -m unittest test/test_interactive_asr.py
```
from . import utils, vad
__all__ = ['utils', 'vad']
#!/usr/bin/env python3
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
"""
Run inference for pre-processed data with a trained model.
"""
import datetime as dt
import logging
from fairseq import options
from interactive_asr.utils import add_asr_eval_argument, setup_asr, get_microphone_transcription, transcribe_file
def main(args):
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
task, generator, models, sp, tgt_dict = setup_asr(args, logger)
print("READY!")
if args.input_file:
transcription_time, transcription = transcribe_file(args, task, generator, models, sp, tgt_dict)
print("transcription:", transcription)
print("transcription_time:", transcription_time)
else:
for transcription in get_microphone_transcription(args, task, generator, models, sp, tgt_dict):
print(
"{}: {}".format(
dt.datetime.now().strftime("%H:%M:%S"), transcription[0][0]
)
)
def cli_main():
parser = options.get_generation_parser()
parser = add_asr_eval_argument(parser)
args = options.parse_args_and_arch(parser)
main(args)
if __name__ == "__main__":
cli_main()
#!/usr/bin/env python3
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import os
import sys
import time
import torch
import torchaudio
import sentencepiece as spm
from fairseq import tasks
from fairseq.utils import load_ensemble_for_inference, import_user_module
from interactive_asr.vad import get_microphone_chunks
def add_asr_eval_argument(parser):
parser.add_argument("--input_file", help="input file")
parser.add_argument("--ctc", action="store_true", help="decode a ctc model")
parser.add_argument("--rnnt", default=False, help="decode a rnnt model")
parser.add_argument("--kspmodel", default=None, help="sentence piece model")
parser.add_argument(
"--wfstlm", default=None, help="wfstlm on dictonary output units"
)
parser.add_argument(
"--rnnt_decoding_type",
default="greedy",
help="wfstlm on dictonary output units",
)
parser.add_argument(
"--lm_weight",
default=0.2,
help="weight for wfstlm while interpolating with neural score",
)
parser.add_argument(
"--rnnt_len_penalty", default=-0.5, help="rnnt length penalty on word level"
)
return parser
def check_args(args):
assert args.path is not None, "--path required for generation!"
assert (
not args.sampling or args.nbest == args.beam
), "--sampling requires --nbest to be equal to --beam"
assert (
args.replace_unk is None or args.raw_text
), "--replace-unk requires a raw text dataset (--raw-text)"
def process_predictions(args, hypos, sp, tgt_dict):
res = []
device = torch.device("cuda:0" if torch.cuda.is_available() and not args.cpu else "cpu")
for hypo in hypos[: min(len(hypos), args.nbest)]:
hyp_pieces = tgt_dict.string(hypo["tokens"].int().to(device))
hyp_words = sp.DecodePieces(hyp_pieces.split())
res.append(hyp_words)
return res
def optimize_models(args, use_cuda, models):
"""Optimize ensemble for generation
"""
for model in models:
model.make_generation_fast_(
beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
need_attn=args.print_alignment,
)
if args.fp16:
model.half()
if use_cuda:
model.cuda()
def calc_mean_invstddev(feature):
if len(feature.shape) != 2:
raise ValueError("We expect the input feature to be 2-D tensor")
mean = torch.mean(feature, dim=0)
var = torch.var(feature, dim=0)
# avoid division by ~zero
if (var < sys.float_info.epsilon).any():
return mean, 1.0 / (torch.sqrt(var) + sys.float_info.epsilon)
return mean, 1.0 / torch.sqrt(var)
def calcMN(features):
mean, invstddev = calc_mean_invstddev(features)
res = (features - mean) * invstddev
return res
def transcribe(waveform, args, task, generator, models, sp, tgt_dict):
num_features = 80
output = torchaudio.compliance.kaldi.fbank(waveform, num_mel_bins=num_features)
device = torch.device("cuda:0" if torch.cuda.is_available() and not args.cpu else "cpu")
output_cmvn = calcMN(output.to(device).detach())
# size (m, n)
source = output_cmvn
frames_lengths = torch.LongTensor([source.size(0)])
# size (1, m, n). In general, if source is (x, m, n), then hypos is (x, ...)
source.unsqueeze_(0)
sample = {"net_input": {"src_tokens": source, "src_lengths": frames_lengths}}
hypos = task.inference_step(generator, models, sample)
assert len(hypos) == 1
transcription = []
for i in range(len(hypos)):
# Process top predictions
hyp_words = process_predictions(args, hypos[i], sp, tgt_dict)
transcription.append(hyp_words)
return transcription
def setup_asr(args, logger):
check_args(args)
import_user_module(args)
if args.max_tokens is None and args.batch_size is None:
args.max_tokens = 30000
logger.info(args)
use_cuda = torch.cuda.is_available() and not args.cpu
# Load dataset splits
task = tasks.setup_task(args)
# Set dictionary
tgt_dict = task.target_dictionary
if args.ctc or args.rnnt:
tgt_dict.add_symbol("<ctc_blank>")
if args.ctc:
logger.info("| decoding a ctc model")
if args.rnnt:
logger.info("| decoding a rnnt model")
# Load ensemble
logger.info("| loading model(s) from {}".format(args.path))
models, _model_args = load_ensemble_for_inference(
args.path.split(":"),
task,
model_arg_overrides=eval(args.model_overrides), # noqa
)
optimize_models(args, use_cuda, models)
# Initialize generator
generator = task.build_generator(models, args)
sp = spm.SentencePieceProcessor()
sp.Load(os.path.join(args.data, "spm.model"))
return task, generator, models, sp, tgt_dict
def transcribe_file(args, task, generator, models, sp, tgt_dict):
path = args.input_file
if not os.path.exists(path):
raise FileNotFoundError("Audio file not found: {}".format(path))
waveform, sample_rate = torchaudio.load_wav(path)
waveform = waveform.mean(0, True)
waveform = torchaudio.transforms.Resample(
orig_freq=sample_rate, new_freq=16000
)(waveform)
start = time.time()
transcription = transcribe(
waveform, args, task, generator, models, sp, tgt_dict
)
transcription_time = time.time() - start
return transcription_time, transcription
def get_microphone_transcription(args, task, generator, models, sp, tgt_dict):
for (waveform, sample_rate) in get_microphone_chunks():
waveform = torchaudio.transforms.Resample(
orig_freq=sample_rate, new_freq=16000
)(waveform.reshape(1, -1))
transcription = transcribe(
waveform, args, task, generator, models, sp, tgt_dict
)
yield transcription
#!/usr/bin/env python3
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
"""
Following `a simple but efficient real-time voice activity detection algorithm
<https://www.eurasip.org/Proceedings/Eusipco/Eusipco2009/contents/papers/1569192958.pdf>`__.
There are three criteria to decide if a frame contains speech: energy, most
dominant frequency, and spectral flatness. If any two of those are higher than
a minimum plus a threshold, then the frame contains speech. In the offline
case, the list of frames is postprocessed to remove too short silence and
speech sequences. In the online case here, inertia is added before switching
from speech to silence or vice versa.
"""
from collections import deque
import numpy as np
import torch
import queue
import librosa
import pyaudio
import torchaudio
def compute_spectral_flatness(frame, epsilon=0.01):
# epsilon protects against log(0)
geometric_mean = torch.exp((frame + epsilon).log().mean(-1)) - epsilon
arithmetic_mean = frame.mean(-1)
return -10 * torch.log10(epsilon + geometric_mean / arithmetic_mean)
class VoiceActivityDetection:
def __init__(
self,
num_init_frames=30,
ignore_silent_count=4,
ignore_speech_count=1,
energy_prim_thresh=60,
frequency_prim_thresh=10,
spectral_flatness_prim_thresh=3,
verbose=False,
):
self.num_init_frames = num_init_frames
self.ignore_silent_count = ignore_silent_count
self.ignore_speech_count = ignore_speech_count
self.energy_prim_thresh = energy_prim_thresh
self.frequency_prim_thresh = frequency_prim_thresh
self.spectral_flatness_prim_thresh = spectral_flatness_prim_thresh
self.verbose = verbose
self.speech_mark = True
self.silence_mark = False
self.silent_count = 0
self.speech_count = 0
self.n = 0
if self.verbose:
self.energy_list = []
self.frequency_list = []
self.spectral_flatness_list = []
def iter(self, frame):
# torch.rfft was removed in newer PyTorch; torch.fft.rfft returns a complex tensor,
# so take magnitudes directly instead of using complex_norm.
frame_fft = torch.fft.rfft(frame)
amplitudes = frame_fft.abs()
# Compute frame energy
energy = frame.pow(2).sum(-1)
# Most dominant frequency component
frequency = amplitudes.argmax()
# Spectral flatness measure
spectral_flatness = compute_spectral_flatness(amplitudes)
if self.verbose:
self.energy_list.append(energy)
self.frequency_list.append(frequency)
self.spectral_flatness_list.append(spectral_flatness)
if self.n == 0:
self.min_energy = energy
self.min_frequency = frequency
self.min_spectral_flatness = spectral_flatness
elif self.n < self.num_init_frames:
self.min_energy = min(energy, self.min_energy)
self.min_frequency = min(frequency, self.min_frequency)
self.min_spectral_flatness = min(
spectral_flatness, self.min_spectral_flatness
)
self.n += 1
# Add 1. to avoid log(0)
thresh_energy = self.energy_prim_thresh * torch.log(1.0 + self.min_energy)
thresh_frequency = self.frequency_prim_thresh
thresh_spectral_flatness = self.spectral_flatness_prim_thresh
# Check all three conditions
counter = 0
if energy - self.min_energy >= thresh_energy:
counter += 1
if frequency - self.min_frequency >= thresh_frequency:
counter += 1
if spectral_flatness - self.min_spectral_flatness >= thresh_spectral_flatness:
counter += 1
# Detection
if counter > 1:
# Speech detected
self.speech_count += 1
# Inertia against switching
if (
self.n >= self.num_init_frames
and self.speech_count <= self.ignore_speech_count
):
# Too soon to change
return self.silence_mark
else:
self.silent_count = 0
return self.speech_mark
else:
# Silence detected
self.min_energy = ((self.silent_count * self.min_energy) + energy) / (
self.silent_count + 1
)
self.silent_count += 1
# Inertia against switching
if (
self.n >= self.num_init_frames
and self.silent_count <= self.ignore_silent_count
):
# Too soon to change
return self.speech_mark
else:
self.speech_count = 0
return self.silence_mark
class MicrophoneStream:
"""Opens a recording stream as a generator yielding the audio chunks."""
def __init__(self, device=None, rate=22050, chunk=2205):
"""
The 22050 is the librosa default, which is what our models were
trained on. The ratio of [chunk / rate] is the amount of time between
audio samples - for example, with these defaults,
an audio fragment will be processed every tenth of a second.
"""
self._rate = rate
self._chunk = chunk
self._device = device
# Create a thread-safe buffer of audio data
self._buff = queue.Queue()
self.closed = True
def __enter__(self):
self._audio_interface = pyaudio.PyAudio()
self._audio_stream = self._audio_interface.open(
# format=pyaudio.paInt16,
format=pyaudio.paFloat32,
# The API currently only supports 1-channel (mono) audio
# https://goo.gl/z757pE
channels=1,
rate=self._rate,
input=True,
frames_per_buffer=self._chunk,
input_device_index=self._device,
# Run the audio stream asynchronously to fill the buffer object.
# This is necessary so that the input device's buffer doesn't
# overflow while the calling thread makes network requests, etc.
stream_callback=self._fill_buffer,
)
self.closed = False
return self
def __exit__(self, type, value, traceback):
self._audio_stream.stop_stream()
self._audio_stream.close()
self.closed = True
# Signal the generator to terminate so that the client's
# streaming_recognize method will not block the process termination.
self._buff.put(None)
self._audio_interface.terminate()
def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
"""Continuously collect data from the audio stream, into the buffer."""
self._buff.put(in_data)
return None, pyaudio.paContinue
def generator(self):
while not self.closed:
# Use a blocking get() to ensure there's at least one chunk of
# data, and stop iteration if the chunk is None, indicating the
# end of the audio stream.
chunk = self._buff.get()
if chunk is None:
return
data = [chunk]
# Now consume whatever other data's still buffered.
while True:
try:
chunk = self._buff.get(block=False)
if chunk is None:
return
data.append(chunk)
except queue.Empty:
break
ans = np.frombuffer(b"".join(data), dtype=np.float32)
# yield uniform-sized chunks
ans = np.split(ans, np.shape(ans)[0] / self._chunk)
# Resample the audio to 22050, librosa default
for chunk in ans:
yield librosa.core.resample(chunk, self._rate, 22050)
def get_microphone_chunks(
min_to_cumulate=5, # 0.5 seconds
max_to_cumulate=100, # 10 seconds
precumulate=5,
max_to_visualize=100,
):
vad = VoiceActivityDetection()
cumulated = []
precumulated = deque(maxlen=precumulate)
with MicrophoneStream() as stream:
audio_generator = stream.generator()
chunk_length = stream._chunk
waveform = torch.zeros(max_to_visualize * chunk_length)
for chunk in audio_generator:
# Is speech?
chunk = torch.tensor(chunk)
is_speech = vad.iter(chunk)
# Cumulate speech
if is_speech or cumulated:
cumulated.append(chunk)
else:
precumulated.append(chunk)
if (not is_speech and len(cumulated) >= min_to_cumulate) or (
len(cumulated) > max_to_cumulate
):
waveform = torch.cat(list(precumulated) + cumulated, -1)
yield (waveform * stream._rate, stream._rate)
cumulated = []
precumulated = deque(maxlen=precumulate)
build
data/output.wav
*.zip
output
cmake_minimum_required(VERSION 3.5)
project(libtorchaudio-cpp-example)
SET(BUILD_SOX ON CACHE BOOL "Build libsox into libtorchaudio")
SET(BUILD_KALDI OFF CACHE BOOL "Build Kaldi into libtorchaudio")
SET(BUILD_RNNT ON CACHE BOOL "Build RNN transducer into libtorchaudio")
SET(BUILD_TORCHAUDIO_PYTHON_EXTENSION OFF CACHE BOOL "Build Python binding")
find_package(Torch REQUIRED)
message("libtorchaudio CMakeLists: ${TORCH_CXX_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
add_subdirectory(../.. libtorchaudio)
add_subdirectory(augmentation)
add_subdirectory(speech_recognition)