Add SpecAugment figure/citation (#1887)

9e3778d2 · moto · GitHub · e885204e · 9e3778d2 · 9e3778d2
Unverified Commit 9e3778d2 authored Oct 16, 2021 by moto Committed by GitHub Oct 16, 2021
Show whitespace changes
Inline Side-by-side

Showing with 63 additions and 0 deletions

docs/source/refs.bib docs/source/refs.bib +10 -0

torchaudio/transforms.py torchaudio/transforms.py +53 -0

No files found.
--- a/docs/source/refs.bib
+++ b/docs/source/refs.bib
+@article{specaugment,
+   title={SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition},
+   url={http://dx.doi.org/10.21437/Interspeech.2019-2680},
+   DOI={10.21437/interspeech.2019-2680},
+   journal={Interspeech 2019},
+   publisher={ISCA},
+   author={Park, Daniel S. and Chan, William and Zhang, Yu and Chiu, Chung-Cheng and Zoph, Barret and Cubuk, Ekin D. and Le, Quoc V.},
+   year={2019},
+   month={Sep}
+}
 @misc{ljspeech17,
  author       = {Keith Ito and Linda Johnson},
  title        = {The LJ Speech Dataset},

--- a/torchaudio/transforms.py
+++ b/torchaudio/transforms.py
@@ -947,11 +947,34 @@ class ComputeDeltas(torch.nn.Module):
 class TimeStretch(torch.nn.Module):
    r"""Stretch stft in time without modifying pitch for a given rate.

+    Proposed in *SpecAugment* [:footcite:`specaugment`].
+
    Args:
        hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
        n_freq (int, optional): number of filter banks from stft. (Default: ``201``)
        fixed_rate (float or None, optional): rate to speed up or slow down by.
            If None is provided, rate must be passed to the forward method. (Default: ``None``)
+
+    Example
+        >>> spectrogram = torchaudio.transforms.Spectrogram(power=None)
+        >>> stretch = torchaudio.transforms.TimeStretch()
+        >>>
+        >>> original = spectrogram(waveform)
+        >>> stretched_1_2 = stretch(original, 1.2)
+        >>> stretched_0_9 = stretch(original, 0.9)
+
+        .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_stretch_1.png
+           :width: 600
+           :alt: Spectrogram stretched by 1.2
+
+        .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_stretch_2.png
+           :width: 600
+           :alt: The original spectrogram
+
+        .. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_stretch_3.png
+           :width: 600
+           :alt: Spectrogram stretched by 0.9
+
    """
    __constants__ = ['fixed_rate']

@@ -1111,12 +1134,27 @@ class _AxisMasking(torch.nn.Module):
 class FrequencyMasking(_AxisMasking):
    r"""Apply masking to a spectrogram in the frequency domain.

+    Proposed in *SpecAugment* [:footcite:`specaugment`].
+
    Args:
        freq_mask_param (int): maximum possible length of the mask.
            Indices uniformly sampled from [0, freq_mask_param).
        iid_masks (bool, optional): whether to apply different masks to each
            example/channel in the batch. (Default: ``False``)
            This option is applicable only when the input tensor is 4D.
+
+    Example
+        >>> spectrogram = torchaudio.transforms.Spectrogram()
+        >>> masking = torchaudio.transforms.FrequencyMasking(freq_mask_param=80)
+        >>>
+        >>> original = spectrogram(waveform)
+        >>> masked = masking(original)
+
+        .. image::  https://download.pytorch.org/torchaudio/doc-assets/specaugment_freq_masking1.png
+           :alt: The original spectrogram
+
+        .. image::  https://download.pytorch.org/torchaudio/doc-assets/specaugment_freq_masking2.png
+           :alt: The spectrogram masked along frequency axis
    """

    def __init__(self, freq_mask_param: int, iid_masks: bool = False) -> None:
@@ -1126,12 +1164,27 @@ class FrequencyMasking(_AxisMasking):
 class TimeMasking(_AxisMasking):
    r"""Apply masking to a spectrogram in the time domain.

+    Proposed in *SpecAugment* [:footcite:`specaugment`].
+
    Args:
        time_mask_param (int): maximum possible length of the mask.
            Indices uniformly sampled from [0, time_mask_param).
        iid_masks (bool, optional): whether to apply different masks to each
            example/channel in the batch. (Default: ``False``)
            This option is applicable only when the input tensor is 4D.
+
+    Example
+        >>> spectrogram = torchaudio.transforms.Spectrogram()
+        >>> masking = torchaudio.transforms.TimeMasking(time_mask_param=80)
+        >>>
+        >>> original = spectrogram(waveform)
+        >>> masked = masking(original)
+
+        .. image::  https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_masking1.png
+           :alt: The original spectrogram
+
+        .. image::  https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_masking2.png
+           :alt: The spectrogram masked along time axis
    """

    def __init__(self, time_mask_param: int, iid_masks: bool = False) -> None: