Unverified commit e39ece66 authored by Vincent QB, committed by GitHub

add name of paper before reference. (#1575)

parent c5d0390c
@@ -155,7 +155,8 @@ def griffinlim(
     r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation.

     Implementation ported from
-    :footcite:`brian_mcfee-proc-scipy-2015`, :footcite:`6701851` and :footcite:`1172092`.
+    *librosa* [:footcite:`brian_mcfee-proc-scipy-2015`], *A fast Griffin-Lim algorithm* [:footcite:`6701851`]
+    and *Signal estimation from modified short-time Fourier transform* [:footcite:`1172092`].

     Args:
         specgram (Tensor): A magnitude-only STFT spectrogram of dimension (..., freq, frames)
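
For context, `griffinlim` consumes a magnitude-only spectrogram of shape `(..., freq, frames)` and iteratively re-estimates the missing phase. A minimal sketch, assuming the signature of this era in which the window and STFT parameters are passed explicitly (the exact parameter list has shifted across releases, so treat the call as illustrative, not canonical):

    import torch
    import torchaudio.functional as F

    n_fft, hop_length, win_length = 400, 200, 400
    window = torch.hann_window(win_length)
    waveform = torch.randn(1, 16000)  # dummy mono signal

    # Magnitude-only spectrogram, shape (..., freq, frames), freq = n_fft // 2 + 1
    specgram = torch.stft(
        waveform, n_fft, hop_length, win_length, window, return_complex=True
    ).abs()

    # Phase is initialized randomly and refined over n_iter iterations.
    restored = F.griffinlim(
        specgram, window, n_fft, hop_length, win_length,
        power=1.0, n_iter=32, momentum=0.99, length=None, rand_init=True,
    )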
@@ -1207,7 +1208,8 @@ def compute_kaldi_pitch(
     recompute_frame: int = 500,
     snip_edges: bool = True,
 ) -> torch.Tensor:
-    """Extract pitch based on method described in :footcite:`6854049`.
+    """Extract pitch based on method described in *A pitch extraction algorithm tuned
+    for automatic speech recognition* [:footcite:`6854049`].

     This function computes the equivalent of `compute-kaldi-pitch-feats` from Kaldi.
...
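
A usage sketch for the pitch extractor (assuming the `torchaudio.functional` entry point of this era; the feature was beta at the time and newer releases may not ship it):

    import torch
    import torchaudio.functional as F

    sample_rate = 16000
    waveform = torch.randn(1, sample_rate)  # dummy one-second mono signal

    # Equivalent of Kaldi's `compute-kaldi-pitch-feats`: a pitch estimate and
    # an NCCF value per frame, packed into shape (..., frames, 2).
    pitch_feature = F.compute_kaldi_pitch(waveform, sample_rate)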
@@ -164,7 +164,9 @@ class MaskGenerator(torch.nn.Module):

 class ConvTasNet(torch.nn.Module):
-    """Conv-TasNet: a fully-convolutional time-domain audio separation network :footcite:`Luo_2019`.
+    """Conv-TasNet: a fully-convolutional time-domain audio separation network
+    *Conv-TasNet: Surpassing Ideal Time–Frequency Magnitude Masking for Speech Separation*
+    [:footcite:`Luo_2019`].

     Args:
         num_sources (int): The number of sources to split.
...
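
A minimal instantiation sketch for the separation model (hyperparameters other than `num_sources` left at their defaults; the mixture below is synthetic):

    import torch
    from torchaudio.models import ConvTasNet

    model = ConvTasNet(num_sources=2)
    mixture = torch.randn(3, 1, 32000)   # (batch, channel == 1, frames)
    separated = model(mixture)           # (batch, num_sources, frames)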
@@ -31,7 +31,8 @@ class FullyConnected(torch.nn.Module):

 class DeepSpeech(torch.nn.Module):
     """
-    DeepSpeech model architecture from :footcite:`hannun2014deep`.
+    DeepSpeech model architecture from *Deep Speech: Scaling up end-to-end speech recognition*
+    [:footcite:`hannun2014deep`].

     Args:
         n_feature: Number of input features
...
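
A shape-level sketch of the model's contract (the feature and class counts below are illustrative choices, not values prescribed by the paper):

    import torch
    from torchaudio.models import DeepSpeech

    model = DeepSpeech(n_feature=40, n_class=29)
    features = torch.randn(4, 1, 100, 40)   # (batch, channel, time, feature)
    scores = model(features)                # (batch, time, n_class)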
@@ -7,7 +7,8 @@ __all__ = [

 class Wav2Letter(nn.Module):
-    r"""Wav2Letter model architecture from :footcite:`collobert2016wav2letter`.
+    r"""Wav2Letter model architecture from *Wav2Letter: an End-to-End ConvNet-based Speech
+    Recognition System* [:footcite:`collobert2016wav2letter`].

     :math:`\text{padding} = \frac{\text{ceil}(\text{kernel} - \text{stride})}{2}`
...
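
A minimal sketch of the end-to-end interface (the class count and input length are illustrative):

    import torch
    from torchaudio.models import Wav2Letter

    model = Wav2Letter(num_classes=40, input_type="waveform", num_features=1)
    waveform = torch.randn(2, 1, 16000)   # (batch, num_features, input_length)
    out = model(waveform)                 # (batch, num_classes, output_length)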
@@ -7,7 +7,7 @@ from . import components

 class Wav2Vec2Model(Module):
-    """Encoder model used in [:footcite:`baevski2020wav2vec`].
+    """Encoder model used in *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].

     Note:
         To build the model, please use one of the factory functions.
@@ -122,7 +122,7 @@ def _get_model(

 def wav2vec2_base(num_out: int) -> Wav2Vec2Model:
-    """Build wav2vec2.0 model with "Base" configuration from [:footcite:`baevski2020wav2vec`].
+    """Build wav2vec2.0 model with "Base" configuration from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].

     Args:
         num_out: int
@@ -164,7 +164,7 @@ def wav2vec2_base(num_out: int) -> Wav2Vec2Model:

 def wav2vec2_large(num_out: int) -> Wav2Vec2Model:
-    """Build wav2vec2.0 model with "Large" configuration from [:footcite:`baevski2020wav2vec`].
+    """Build wav2vec2.0 model with "Large" configuration from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].

     Args:
         num_out: int
@@ -206,7 +206,7 @@ def wav2vec2_large(num_out: int) -> Wav2Vec2Model:

 def wav2vec2_large_lv60k(num_out: int) -> Wav2Vec2Model:
-    """Build wav2vec2.0 model with "Large LV-60k" configuration from [:footcite:`baevski2020wav2vec`].
+    """Build wav2vec2.0 model with "Large LV-60k" configuration from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].

     Args:
         num_out: int
...
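
A sketch against the factory signature shown in this diff, where `num_out` is the number of output labels (later releases reworked these factories, so the call is era-specific; the label count is illustrative):

    import torch
    from torchaudio.models import wav2vec2_base

    model = wav2vec2_base(num_out=32)
    waveforms = torch.randn(2, 16000)   # (batch, frames)
    logits, lengths = model(waveforms)  # logits: (batch, time, num_out);
                                        # lengths is None when none are supplied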
@@ -14,7 +14,7 @@ __all__ = [

 class ResBlock(nn.Module):
-    r"""ResNet block based on :footcite:`kalchbrenner2018efficient`.
+    r"""ResNet block based on *Efficient Neural Audio Synthesis* [:footcite:`kalchbrenner2018efficient`].

     Args:
         n_freq: the number of bins in a spectrogram. (Default: ``128``)
@@ -202,9 +202,9 @@ class UpsampleNetwork(nn.Module):

 class WaveRNN(nn.Module):
     r"""WaveRNN model based on the implementation from `fatchord <https://github.com/fatchord/WaveRNN>`_.
-    The original implementation was introduced in :footcite:`kalchbrenner2018efficient`.
-    The input channels of waveform and spectrogram have to be 1. The product of
-    `upsample_scales` must equal `hop_length`.
+    The original implementation was introduced in *Efficient Neural Audio Synthesis*
+    [:footcite:`kalchbrenner2018efficient`]. The input channels of waveform and spectrogram have to be 1.
+    The product of `upsample_scales` must equal `hop_length`.

     Args:
         upsample_scales: the list of upsample scales.
...
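
A sketch of the constraint the docstring states: the product of `upsample_scales` (here 5 * 5 * 8 = 200) must equal `hop_length`. The waveform length below follows the `(n_time - kernel_size + 1) * hop_length` relation from the class docstring, assuming the default `kernel_size` of 5:

    import torch
    from torchaudio.models import WaveRNN

    model = WaveRNN(upsample_scales=[5, 5, 8], n_classes=512, hop_length=200)

    n_time = 32
    specgram = torch.randn(1, 1, 128, n_time)             # (batch, 1, n_freq, n_time)
    waveform = torch.randn(1, 1, (n_time - 5 + 1) * 200)  # channel must be 1
    output = model(waveform, specgram)                    # (batch, 1, time, n_classes)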
@@ -17,7 +17,8 @@ def rnnt_loss(
     fused_log_softmax: bool = True,
     reuse_logits_for_grads: bool = True,
 ):
-    """Compute the RNN Transducer loss from :footcite:`graves2012sequence`.
+    """Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks*
+    [:footcite:`graves2012sequence`].

     The RNN Transducer loss extends the CTC loss by defining a distribution over output
     sequences of all lengths, and by jointly modelling both input-output and output-output
@@ -57,7 +58,8 @@ def rnnt_loss(

 class RNNTLoss(torch.nn.Module):
-    """Compute the RNN Transducer loss from :footcite:`graves2012sequence`.
+    """Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks*
+    [:footcite:`graves2012sequence`].

     The RNN Transducer loss extends the CTC loss by defining a distribution over output
     sequences of all lengths, and by jointly modelling both input-output and output-output
...
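
A shape-level sketch of the loss (imported here from `torchaudio.functional`, where the API later stabilized; this diff itself targets a prototype module). The joint network scores every (time, target) alignment point, and the blank label defaults to the last class:

    import torch
    from torchaudio.functional import rnnt_loss

    batch, max_time, max_target, num_classes = 2, 10, 5, 20

    # (batch, max_time, max_target + 1, num_classes): one score per alignment point
    logits = torch.randn(batch, max_time, max_target + 1, num_classes,
                         requires_grad=True)
    targets = torch.randint(0, num_classes - 1, (batch, max_target),
                            dtype=torch.int32)           # avoid the blank label
    logit_lengths = torch.full((batch,), max_time, dtype=torch.int32)
    target_lengths = torch.full((batch,), max_target, dtype=torch.int32)

    loss = rnnt_loss(logits, targets, logit_lengths, target_lengths)
    loss.backward()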
@@ -129,7 +129,8 @@ class GriffinLim(torch.nn.Module):
     r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation.

     Implementation ported from
-    :footcite:`brian_mcfee-proc-scipy-2015`, :footcite:`6701851` and :footcite:`1172092`.
+    *librosa* [:footcite:`brian_mcfee-proc-scipy-2015`], *A fast Griffin-Lim algorithm* [:footcite:`6701851`]
+    and *Signal estimation from modified short-time Fourier transform* [:footcite:`1172092`].

     Args:
         n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
...
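
A round-trip sketch using the transform interface, whose defaults (`n_fft=400`, `power=2.0`) line up with `Spectrogram`'s:

    import torch
    import torchaudio.transforms as T

    waveform = torch.randn(1, 16000)        # dummy mono signal

    spec = T.Spectrogram(n_fft=400)         # power spectrogram by default
    griffin_lim = T.GriffinLim(n_fft=400)   # power=2.0 default matches Spectrogram
    restored = griffin_lim(spec(waveform))  # (1, time), phase re-estimated iteratively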