Commit 48707255 authored by jamarshon's avatar jamarshon Committed by cpuhrsch
Browse files

Add Kaldi docs (#136)

parent 56e2835e
.. role:: hidden
    :class: hidden-section
torchaudio.compliance.kaldi
============================
.. currentmodule:: torchaudio.compliance.kaldi
Useful processing operations from kaldi_ can be performed with torchaudio.
The functions below take the same parameters as their Kaldi counterparts so
that torchaudio can produce similar outputs.
.. _kaldi: https://github.com/kaldi-asr/kaldi
Functions
---------
:hidden:`fbank`
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: fbank
:hidden:`spectrogram`
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: spectrogram
...@@ -9,6 +9,7 @@ The :mod:`torchaudio` package consists of I/O, popular datasets and common audio ...@@ -9,6 +9,7 @@ The :mod:`torchaudio` package consists of I/O, popular datasets and common audio
sox_effects sox_effects
datasets datasets
compliance.kaldi
kaldi_io kaldi_io
transforms transforms
legacy legacy
......
...@@ -184,10 +184,10 @@ def spectrogram( ...@@ -184,10 +184,10 @@ def spectrogram(
preemphasis_coefficient=0.97, raw_energy=True, remove_dc_offset=True, preemphasis_coefficient=0.97, raw_energy=True, remove_dc_offset=True,
round_to_power_of_two=True, sample_frequency=16000.0, snip_edges=True, round_to_power_of_two=True, sample_frequency=16000.0, snip_edges=True,
subtract_mean=False, window_type=POVEY): subtract_mean=False, window_type=POVEY):
"""Create a spectrogram from a raw audio signal. This matches the input/output of Kaldi's r"""Create a spectrogram from a raw audio signal. This matches the input/output of Kaldi's
compute-spectrogram-feats. compute-spectrogram-feats.
Inputs: Args:
sig (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2) sig (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2)
blackman_coeff (float): Constant coefficient for generalized Blackman window. (default = 0.42) blackman_coeff (float): Constant coefficient for generalized Blackman window. (default = 0.42)
channel (int): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (default = -1) channel (int): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (default = -1)
...@@ -213,9 +213,9 @@ def spectrogram( ...@@ -213,9 +213,9 @@ def spectrogram(
it this way. (default = False) it this way. (default = False)
window_type (str): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman') (default = 'povey') window_type (str): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman') (default = 'povey')
Outputs: Returns:
Tensor: a spectrogram identical to what Kaldi would output. The shape is (m, `padded_window_size` // 2 + 1) Tensor: a spectrogram identical to what Kaldi would output. The shape is
where m is calculated in _get_strided (m, `padded_window_size` // 2 + 1) where m is calculated in _get_strided
""" """
waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties( waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
sig, channel, sample_frequency, frame_shift, frame_length, round_to_power_of_two, preemphasis_coefficient) sig, channel, sample_frequency, frame_shift, frame_length, round_to_power_of_two, preemphasis_coefficient)
...@@ -421,10 +421,10 @@ def fbank( ...@@ -421,10 +421,10 @@ def fbank(
remove_dc_offset=True, round_to_power_of_two=True, sample_frequency=16000.0, remove_dc_offset=True, round_to_power_of_two=True, sample_frequency=16000.0,
snip_edges=True, subtract_mean=False, use_energy=False, use_log_fbank=True, use_power=True, snip_edges=True, subtract_mean=False, use_energy=False, use_log_fbank=True, use_power=True,
vtln_high=-500.0, vtln_low=100.0, vtln_warp=1.0, window_type='povey'): vtln_high=-500.0, vtln_low=100.0, vtln_warp=1.0, window_type='povey'):
"""Create a fbank from a raw audio signal. This matches the input/output of Kaldi's r"""Create a fbank from a raw audio signal. This matches the input/output of Kaldi's
compute-fbank-feats. compute-fbank-feats.
Inputs: Args:
sig (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2) sig (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2)
blackman_coeff (float): Constant coefficient for generalized Blackman window. (default = 0.42) blackman_coeff (float): Constant coefficient for generalized Blackman window. (default = 0.42)
channel (int): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (default = -1) channel (int): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (default = -1)
...@@ -462,9 +462,9 @@ def fbank( ...@@ -462,9 +462,9 @@ def fbank(
vtln_warp (float): Vtln warp factor (only applicable if vtln_map not specified) (default = 1.0) vtln_warp (float): Vtln warp factor (only applicable if vtln_map not specified) (default = 1.0)
window_type (str): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman') (default = 'povey') window_type (str): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman') (default = 'povey')
Outputs: Returns:
Tensor: a fbank identical to what Kaldi would output. The shape is (m, `num_mel_bins` + `use_energy`) Tensor: a fbank identical to what Kaldi would output. The shape is (m, `num_mel_bins` + `use_energy`)
where m is calculated in _get_strided where m is calculated in _get_strided
""" """
waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties( waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
sig, channel, sample_frequency, frame_shift, frame_length, round_to_power_of_two, preemphasis_coefficient) sig, channel, sample_frequency, frame_shift, frame_length, round_to_power_of_two, preemphasis_coefficient)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment