OpenDAS / Torchaudio / Commits / e39ece66

Unverified commit e39ece66, authored Jun 14, 2021 by Vincent QB, committed by GitHub Jun 14, 2021

add name of paper before reference. (#1575)

parent c5d0390c
Showing 8 changed files with 25 additions and 16 deletions
torchaudio/functional/functional.py   +4 -2
torchaudio/models/conv_tasnet.py      +3 -1
torchaudio/models/deepspeech.py       +2 -1
torchaudio/models/wav2letter.py       +2 -1
torchaudio/models/wav2vec2/model.py   +4 -4
torchaudio/models/wavernn.py          +4 -4
torchaudio/prototype/rnnt_loss.py     +4 -2
torchaudio/transforms.py              +2 -1
torchaudio/functional/functional.py

@@ -155,7 +155,8 @@ def griffinlim(
     r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation.
     Implementation ported from
-    :footcite:`brian_mcfee-proc-scipy-2015`, :footcite:`6701851` and :footcite:`1172092`.
+    *librosa* [:footcite:`brian_mcfee-proc-scipy-2015`], *A fast Griffin-Lim algorithm* [:footcite:`6701851`]
+    and *Signal estimation from modified short-time Fourier transform* [:footcite:`1172092`].
     Args:
         specgram (Tensor): A magnitude-only STFT spectrogram of dimension (..., freq, frames)

@@ -1207,7 +1208,8 @@ def compute_kaldi_pitch(
     recompute_frame: int = 500,
     snip_edges: bool = True,
 ) -> torch.Tensor:
-    """Extract pitch based on method described in :footcite:`6854049`.
+    """Extract pitch based on method described in *A pitch extraction algorithm tuned
+    for automatic speech recognition* [:footcite:`6854049`].
     This function computes the equivalent of `compute-kaldi-pitch-feats` from Kaldi.
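For context, a minimal usage sketch of the pitch extractor touched above. The input waveform and sample rate are illustrative, and the ordering of the two features in the last dimension is spelled out in the full docstring:

    import torch
    import torchaudio.functional as F

    # Illustrative input: one second of random "audio" at 16 kHz.
    waveform = torch.rand(1, 16000)
    pitch_feature = F.compute_kaldi_pitch(waveform, sample_rate=16000)
    # Result has shape (..., frames, 2); the last dimension packs the
    # NCCF and pitch values (see the docstring for the exact ordering).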
torchaudio/models/conv_tasnet.py

@@ -164,7 +164,9 @@ class MaskGenerator(torch.nn.Module):
 class ConvTasNet(torch.nn.Module):
-    """Conv-TasNet: a fully-convolutional time-domain audio separation network :footcite:`Luo_2019`.
+    """Conv-TasNet: a fully-convolutional time-domain audio separation network
+    *Conv-TasNet: Surpassing Ideal Time–Frequency Magnitude Masking for Speech Separation*
+    [:footcite:`Luo_2019`].
     Args:
         num_sources (int): The number of sources to split.
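A minimal usage sketch of the class documented above, assuming the default hyperparameters; the batch size and frame count are illustrative:

    import torch
    from torchaudio.models import ConvTasNet

    model = ConvTasNet(num_sources=2)
    mixture = torch.rand(3, 1, 32000)   # (batch, channel, frames); channel must be 1
    separated = model(mixture)          # (batch, num_sources, frames)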
torchaudio/models/deepspeech.py

@@ -31,7 +31,8 @@ class FullyConnected(torch.nn.Module):
 class DeepSpeech(torch.nn.Module):
     """
-    DeepSpeech model architecture from :footcite:`hannun2014deep`.
+    DeepSpeech model architecture from *Deep Speech: Scaling up end-to-end speech recognition*
+    [:footcite:`hannun2014deep`].
     Args:
         n_feature: Number of input features
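A minimal usage sketch; the feature, hidden, and class sizes here are illustrative values, not settings mandated by the paper:

    import torch
    from torchaudio.models import DeepSpeech

    model = DeepSpeech(n_feature=64, n_hidden=2048, n_class=40)
    x = torch.rand(3, 1, 100, 64)   # (batch, channel, time, feature)
    out = model(x)                  # (batch, time, n_class)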
torchaudio/models/wav2letter.py

@@ -7,7 +7,8 @@ __all__ = [
 class Wav2Letter(nn.Module):
-    r"""Wav2Letter model architecture from :footcite:`collobert2016wav2letter`.
+    r"""Wav2Letter model architecture from *Wav2Letter: an End-to-End ConvNet-based Speech
+    Recognition System* [:footcite:`collobert2016wav2letter`].
     :math:`\text{padding} = \frac{\text{ceil}(\text{kernel} - \text{stride})}{2}`
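A minimal usage sketch with the raw-waveform front end; the class count and input length are illustrative:

    import torch
    from torchaudio.models import Wav2Letter

    model = Wav2Letter(num_classes=40, input_type="waveform", num_features=1)
    x = torch.rand(3, 1, 16000)   # (batch, num_features, input_length)
    out = model(x)                # (batch, num_classes, output_length)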
torchaudio/models/wav2vec2/model.py

@@ -7,7 +7,7 @@ from . import components
 class Wav2Vec2Model(Module):
-    """Encoder model used in [:footcite:`baevski2020wav2vec`].
+    """Encoder model used in *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
     Note:
         To build the model, please use one of the factory functions.

@@ -122,7 +122,7 @@ def _get_model(
 def wav2vec2_base(num_out: int) -> Wav2Vec2Model:
-    """Build wav2vec2.0 model with "Base" configuration from [:footcite:`baevski2020wav2vec`].
+    """Build wav2vec2.0 model with "Base" configuration from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
     Args:
         num_out: int

@@ -164,7 +164,7 @@ def wav2vec2_base(num_out: int) -> Wav2Vec2Model:
 def wav2vec2_large(num_out: int) -> Wav2Vec2Model:
-    """Build wav2vec2.0 model with "Large" configuration from [:footcite:`baevski2020wav2vec`].
+    """Build wav2vec2.0 model with "Large" configuration from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
     Args:
         num_out: int

@@ -206,7 +206,7 @@ def wav2vec2_large(num_out: int) -> Wav2Vec2Model:
 def wav2vec2_large_lv60k(num_out: int) -> Wav2Vec2Model:
-    """Build wav2vec2.0 model with "Large LV-60k" configuration from [:footcite:`baevski2020wav2vec`].
+    """Build wav2vec2.0 model with "Large LV-60k" configuration from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
     Args:
         num_out: int
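A minimal usage sketch of the "Base" factory at this commit; num_out=32 is an illustrative class count, and the import path simply mirrors the file location shown above:

    import torch
    from torchaudio.models.wav2vec2.model import wav2vec2_base

    model = wav2vec2_base(num_out=32)
    waveforms = torch.randn(2, 16000)    # (batch, frames)
    logits, lengths = model(waveforms)   # logits: (batch, time, num_out)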
torchaudio/models/wavernn.py

@@ -14,7 +14,7 @@ __all__ = [
 class ResBlock(nn.Module):
-    r"""ResNet block based on :footcite:`kalchbrenner2018efficient`.
+    r"""ResNet block based on *Efficient Neural Audio Synthesis* [:footcite:`kalchbrenner2018efficient`].
     Args:
         n_freq: the number of bins in a spectrogram. (Default: ``128``)

@@ -202,9 +202,9 @@ class UpsampleNetwork(nn.Module):
 class WaveRNN(nn.Module):
     r"""WaveRNN model based on the implementation from `fatchord <https://github.com/fatchord/WaveRNN>`_.
-    The original implementation was introduced in
-    :footcite:`kalchbrenner2018efficient`.
+    The original implementation was introduced in *Efficient Neural Audio Synthesis*
+    [:footcite:`kalchbrenner2018efficient`].
     The input channels of waveform and spectrogram have to be 1. The product of
     `upsample_scales` must equal `hop_length`.
     Args:
         upsample_scales: the list of upsample scales.
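A minimal usage sketch of the model documented above, honoring the docstring's constraints that both inputs are single-channel and that the product of upsample_scales equals hop_length (5 * 5 * 8 = 200); the time length is illustrative:

    import torch
    from torchaudio.models import WaveRNN

    model = WaveRNN(upsample_scales=[5, 5, 8], n_classes=512, hop_length=200)
    n_time, kernel_size = 100, 5
    specgram = torch.rand(1, 1, 128, n_time)   # (batch, 1, n_freq, n_time)
    waveform = torch.rand(1, 1, (n_time - kernel_size + 1) * 200)
    output = model(waveform, specgram)         # (batch, 1, time, n_classes)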
torchaudio/prototype/rnnt_loss.py

@@ -17,7 +17,8 @@ def rnnt_loss(
     fused_log_softmax: bool = True,
     reuse_logits_for_grads: bool = True,
 ):
-    """Compute the RNN Transducer loss from :footcite:`graves2012sequence`.
+    """Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks*
+    [:footcite:`graves2012sequence`].
     The RNN Transducer loss extends the CTC loss by defining a distribution over output
     sequences of all lengths, and by jointly modelling both input-output and output-output

@@ -57,7 +58,8 @@ def rnnt_loss(
 class RNNTLoss(torch.nn.Module):
-    """Compute the RNN Transducer loss from :footcite:`graves2012sequence`.
+    """Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks*
+    [:footcite:`graves2012sequence`].
     The RNN Transducer loss extends the CTC loss by defining a distribution over output
     sequences of all lengths, and by jointly modelling both input-output and output-output
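A minimal sketch of the module form. The shapes follow the usual RNN-T convention of (batch, input frames, target length + 1, classes) for the joint-network logits, and the argument order matches the later stable torchaudio API; since this module lives under prototype, the exact signature may differ between versions:

    import torch
    from torchaudio.prototype.rnnt_loss import RNNTLoss

    criterion = RNNTLoss()
    logits = torch.randn(2, 10, 6, 20, requires_grad=True)     # joint-network output
    targets = torch.randint(0, 19, (2, 5), dtype=torch.int32)  # label sequences
    logit_lengths = torch.tensor([10, 8], dtype=torch.int32)
    target_lengths = torch.tensor([5, 3], dtype=torch.int32)
    loss = criterion(logits, targets, logit_lengths, target_lengths)
    loss.backward()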
torchaudio/transforms.py

@@ -129,7 +129,8 @@ class GriffinLim(torch.nn.Module):
     r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation.
     Implementation ported from
-    :footcite:`brian_mcfee-proc-scipy-2015`, :footcite:`6701851` and :footcite:`1172092`.
+    *librosa* [:footcite:`brian_mcfee-proc-scipy-2015`], *A fast Griffin-Lim algorithm* [:footcite:`6701851`]
+    and *Signal estimation from modified short-time Fourier transform* [:footcite:`1172092`].
     Args:
         n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
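A minimal round-trip sketch with the transform documented above; the companion Spectrogram transform and the sample length are illustrative, and n_fft=400 matches the default noted in the docstring:

    import torch
    import torchaudio.transforms as T

    n_fft = 400
    spec = T.Spectrogram(n_fft=n_fft, power=2.0)
    griffin_lim = T.GriffinLim(n_fft=n_fft)
    waveform = torch.rand(1, 16000)
    restored = griffin_lim(spec(waveform))   # approximate waveform reconstruction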