Unverified Commit 9e3778d2 authored by moto's avatar moto Committed by GitHub
Browse files

Add SpecAugment figure/citation (#1887)

parent e885204e
@article{specaugment,
title={SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition},
url={http://dx.doi.org/10.21437/Interspeech.2019-2680},
DOI={10.21437/interspeech.2019-2680},
journal={Interspeech 2019},
publisher={ISCA},
author={Park, Daniel S. and Chan, William and Zhang, Yu and Chiu, Chung-Cheng and Zoph, Barret and Cubuk, Ekin D. and Le, Quoc V.},
year={2019},
month={Sep}
}
@misc{ljspeech17,
author = {Keith Ito and Linda Johnson},
title = {The LJ Speech Dataset},
......
......@@ -947,11 +947,34 @@ class ComputeDeltas(torch.nn.Module):
class TimeStretch(torch.nn.Module):
r"""Stretch stft in time without modifying pitch for a given rate.
Proposed in *SpecAugment* [:footcite:`specaugment`].
Args:
hop_length (int or None, optional): Length of hop between STFT windows. (Default: ``win_length // 2``)
n_freq (int, optional): number of filter banks from stft. (Default: ``201``)
fixed_rate (float or None, optional): rate to speed up or slow down by.
If None is provided, rate must be passed to the forward method. (Default: ``None``)
Example
>>> spectrogram = torchaudio.transforms.Spectrogram()
>>> stretch = torchaudio.transforms.TimeStretch()
>>>
>>> original = spectrogram(waveform)
>>> stretched_1_2 = stretch(original, 1.2)
>>> stretched_0_9 = stretch(original, 0.9)
.. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_stretch_1.png
:width: 600
:alt: Spectrogram stretched by 1.2
.. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_stretch_2.png
:width: 600
:alt: The original spectrogram
.. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_stretch_3.png
:width: 600
:alt: Spectrogram stretched by 0.9
"""
__constants__ = ['fixed_rate']
......@@ -1111,12 +1134,27 @@ class _AxisMasking(torch.nn.Module):
class FrequencyMasking(_AxisMasking):
r"""Apply masking to a spectrogram in the frequency domain.
Proposed in *SpecAugment* [:footcite:`specaugment`].
Args:
freq_mask_param (int): maximum possible length of the mask.
Indices uniformly sampled from [0, freq_mask_param).
iid_masks (bool, optional): whether to apply different masks to each
example/channel in the batch. (Default: ``False``)
This option is applicable only when the input tensor is 4D.
Example
>>> spectrogram = torchaudio.transforms.Spectrogram()
>>> masking = torchaudio.transforms.FrequencyMasking(freq_mask_param=80)
>>>
>>> original = spectrogram(waveform)
>>> masked = masking(original)
.. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_freq_masking1.png
:alt: The original spectrogram
.. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_freq_masking2.png
:alt: The spectrogram masked along frequency axis
"""
def __init__(self, freq_mask_param: int, iid_masks: bool = False) -> None:
......@@ -1126,12 +1164,27 @@ class FrequencyMasking(_AxisMasking):
class TimeMasking(_AxisMasking):
r"""Apply masking to a spectrogram in the time domain.
Proposed in *SpecAugment* [:footcite:`specaugment`].
Args:
time_mask_param (int): maximum possible length of the mask.
Indices uniformly sampled from [0, time_mask_param).
iid_masks (bool, optional): whether to apply different masks to each
example/channel in the batch. (Default: ``False``)
This option is applicable only when the input tensor is 4D.
Example
>>> spectrogram = torchaudio.transforms.Spectrogram()
>>> masking = torchaudio.transforms.TimeMasking(time_mask_param=80)
>>>
>>> original = spectrogram(waveform)
>>> masked = masking(original)
.. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_masking1.png
:alt: The original spectrogram
.. image:: https://download.pytorch.org/torchaudio/doc-assets/specaugment_time_masking2.png
:alt: The spectrogram masked along time axis
"""
def __init__(self, time_mask_param: int, iid_masks: bool = False) -> None:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment