Add Kaldi docs (#136)

48707255 · jamarshon · cpuhrsch · 56e2835e · 48707255 · 48707255
Commit 48707255 authored Jul 11, 2019 by jamarshon Committed by cpuhrsch Jul 11, 2019
Showing 3 changed files with 36 additions and 9 deletions

docs/source/compliance.kaldi.rst +26 -0

docs/source/index.rst +1 -0

torchaudio/compliance/kaldi.py +9 -9

No files found.
--- a/docs/source/compliance.kaldi.rst
+++ b/docs/source/compliance.kaldi.rst
+.. role:: hidden
+    :class: hidden-section
+
+torchaudio.compliance.kaldi
+============================
+
+.. currentmodule:: torchaudio.compliance.kaldi
+
+The useful processing operations of kaldi_ can be performed with torchaudio.
+Various functions with identical parameters are given so that torchaudio can
+produce similar outputs.
+
+.. _kaldi: https://github.com/kaldi-asr/kaldi
+
+Functions
+---------
+
+:hidden:`fbank`
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: fbank
+
+:hidden:`spectrogram`
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: spectrogram
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -9,6 +9,7 @@ The :mod:`torchaudio` package consists of I/O, popular datasets and common audio

   sox_effects
   datasets
+   compliance.kaldi
   kaldi_io
   transforms
   legacy

--- a/torchaudio/compliance/kaldi.py
+++ b/torchaudio/compliance/kaldi.py
@@ -184,10 +184,10 @@ def spectrogram(
        preemphasis_coefficient=0.97, raw_energy=True, remove_dc_offset=True,
        round_to_power_of_two=True, sample_frequency=16000.0, snip_edges=True,
        subtract_mean=False, window_type=POVEY):
-    """Create a spectrogram from a raw audio signal. This matches the input/output of Kaldi's
+    r"""Create a spectrogram from a raw audio signal. This matches the input/output of Kaldi's
    compute-spectrogram-feats.

-    Inputs:
+    Args:
        sig (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2)
        blackman_coeff (float): Constant coefficient for generalized Blackman window. (default = 0.42)
        channel (int): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (default = -1)
@@ -213,9 +213,9 @@ def spectrogram(
            it this way.  (default = False)
        window_type (str): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman') (default = 'povey')

-    Outputs:
-        Tensor: a spectrogram identical to what Kaldi would output. The shape is (m, `padded_window_size` // 2 + 1)
-            where m is calculated in _get_strided
+    Returns:
+        Tensor: a spectrogram identical to what Kaldi would output. The shape is
+        (m, `padded_window_size` // 2 + 1) where m is calculated in _get_strided
    """
    waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
        sig, channel, sample_frequency, frame_shift, frame_length, round_to_power_of_two, preemphasis_coefficient)
@@ -421,10 +421,10 @@ def fbank(
        remove_dc_offset=True, round_to_power_of_two=True, sample_frequency=16000.0,
        snip_edges=True, subtract_mean=False, use_energy=False, use_log_fbank=True, use_power=True,
        vtln_high=-500.0, vtln_low=100.0, vtln_warp=1.0, window_type='povey'):
-    """Create a fbank from a raw audio signal. This matches the input/output of Kaldi's
+    r"""Create a fbank from a raw audio signal. This matches the input/output of Kaldi's
    compute-fbank-feats.

-    Inputs:
+    Args:
        sig (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2)
        blackman_coeff (float): Constant coefficient for generalized Blackman window. (default = 0.42)
        channel (int): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (default = -1)
@@ -462,9 +462,9 @@ def fbank(
        vtln_warp (float): Vtln warp factor (only applicable if vtln_map not specified) (float, default = 1.0)
        window_type (str): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman') (default = 'povey')

-    Outputs:
+    Returns:
        Tensor: a fbank identical to what Kaldi would output. The shape is (m, `num_mel_bins` + `use_energy`)
-            where m is calculated in _get_strided
+        where m is calculated in _get_strided
    """
    waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
        sig, channel, sample_frequency, frame_shift, frame_length, round_to_power_of_two, preemphasis_coefficient)