[BC-Breaking] Remove compute_kaldi_pitch (#3368)

Summary: This commit removes compute_kaldi_pitch function and the underlying Kaldi integration from torchaudio. Kaldi pitch function was added in a short period of time by integrating the original Kaldi implementation, instead of reimplementing it in PyTorch. The Kaldi integration employed a hack which replaces the base vector/matrix implementation of Kaldi with PyTorch Tensor so that there is only one blas library within torchaudio. Recently, we are making torchaudio more lean, and we don't see a wide adoption of kaldi_pitch feature, so we decided to remove them. See some of the discussion https://github.com/pytorch/audio/issues/1269 Pull Request resolved: https://github.com/pytorch/audio/pull/3368 Differential Revision: D46406176 Pulled By: mthrok fbshipit-source-id: ee5e24d825188f379979ddccd680c7323b119b1e

[BC-Breaking] Remove compute_kaldi_pitch (#3368)
Summary: This commit removes compute_kaldi_pitch function and the underlying Kaldi integration from torchaudio. Kaldi pitch function was added in a short period of time by integrating the original Kaldi implementation, instead of reimplementing it in PyTorch. The Kaldi integration employed a hack which replaces the base vector/matrix implementation of Kaldi with PyTorch Tensor so that there is only one blas library within torchaudio. Recently, we are making torchaudio more lean, and we don't see a wide adoption of kaldi_pitch feature, so we decided to remove them. See some of the discussion https://github.com/pytorch/audio/issues/1269 Pull Request resolved: https://github.com/pytorch/audio/pull/3368 Differential Revision: D46406176 Pulled By: mthrok fbshipit-source-id: ee5e24d825188f379979ddccd680c7323b119b1e
5bbbb1d5 · moto · Facebook GitHub Bot · 2ba36b47 · 5bbbb1d5 · 5bbbb1d5
Commit 5bbbb1d5 authored Jun 02, 2023 by moto Committed by Facebook GitHub Bot Jun 02, 2023
4 changed files
--- a/torchaudio/_extension/__init__.py
+++ b/torchaudio/_extension/__init__.py
@@ -14,12 +14,10 @@ _LG = logging.getLogger(__name__)
 # Builder uses it for debugging purpose, so we export it.
 # https://github.com/pytorch/builder/blob/e2e4542b8eb0bdf491214451a1a4128bd606cce2/test/smoke_test/smoke_test.py#L80
 __all__ = [
-    "fail_if_no_kaldi",
    "fail_if_no_sox",
    "fail_if_no_ffmpeg",
    "_check_cuda_version",
    "_IS_TORCHAUDIO_EXT_AVAILABLE",
-    "_IS_KALDI_AVAILABLE",
    "_IS_RIR_AVAILABLE",
    "_SOX_INITIALIZED",
    "_FFMPEG_INITIALIZED",
@@ -34,11 +32,10 @@ if os.name == "nt" and (3, 8) <= sys.version_info < (3, 9):
 # In case of an error, we do not catch the failure as it suggests there is something
 # wrong with the installation.
 _IS_TORCHAUDIO_EXT_AVAILABLE = is_module_available("torchaudio.lib._torchaudio")
-# Kaldi and RIR features are implemented in _torchaudio extension, but they can be individually
+# RIR features are implemented in _torchaudio extension, but they can be individually
 # turned on/off at build time. Available means that _torchaudio is loaded properly, and
-# Kaldi or RIR features are found there.
+# RIR features are found there.
 _IS_RIR_AVAILABLE = False
-_IS_KALDI_AVAILABLE = False
 _IS_ALIGN_AVAILABLE = False
 if _IS_TORCHAUDIO_EXT_AVAILABLE:
    _load_lib("libtorchaudio")
@@ -47,7 +44,6 @@ if _IS_TORCHAUDIO_EXT_AVAILABLE:

    _check_cuda_version()
    _IS_RIR_AVAILABLE = torchaudio.lib._torchaudio.is_rir_available()
-    _IS_KALDI_AVAILABLE = torchaudio.lib._torchaudio.is_kaldi_available()
    _IS_ALIGN_AVAILABLE = torchaudio.lib._torchaudio.is_align_available()


@@ -77,13 +73,6 @@ if is_module_available("torchaudio.lib._torchaudio_ffmpeg"):
        _LG.debug("Failed to initialize ffmpeg bindings", exc_info=True)


-fail_if_no_kaldi = (
-    no_op
-    if _IS_KALDI_AVAILABLE
-    else fail_with_message(
-        "requires kaldi extension, but TorchAudio is not compiled with it. Please build TorchAudio with kaldi support."
-    )
-)
 fail_if_no_sox = (
    no_op
    if _SOX_INITIALIZED

--- a/torchaudio/csrc/CMakeLists.txt
+++ b/torchaudio/csrc/CMakeLists.txt
@@ -76,12 +76,6 @@ if(USE_CUDA)
    )
 endif()

-if(BUILD_KALDI)
-  list(APPEND additional_libs kaldi)
-  list(APPEND sources kaldi.cpp)
-  list(APPEND compile_definitions INCLUDE_KALDI)
-endif()
-
 if(OpenMP_CXX_FOUND)
  list(
    APPEND

--- a/torchaudio/functional/__init__.py
+++ b/torchaudio/functional/__init__.py
@@ -28,7 +28,6 @@ from .functional import (
    apply_beamforming,
    apply_codec,
    compute_deltas,
-    compute_kaldi_pitch,
    convolve,
    create_dct,
    DB_to_amplitude,
@@ -65,7 +64,6 @@ from .functional import (
 __all__ = [
    "amplitude_to_DB",
    "compute_deltas",
-    "compute_kaldi_pitch",
    "create_dct",
    "melscale_fbanks",
    "linear_fbanks",

--- a/torchaudio/functional/functional.py
+++ b/torchaudio/functional/functional.py
@@ -21,7 +21,6 @@ __all__ = [
    "amplitude_to_DB",
    "DB_to_amplitude",
    "compute_deltas",
-    "compute_kaldi_pitch",
    "melscale_fbanks",
    "linear_fbanks",
    "create_dct",
@@ -1340,120 +1339,6 @@ def apply_codec(
    return augmented


-@torchaudio._extension.fail_if_no_kaldi
-def compute_kaldi_pitch(
-    waveform: torch.Tensor,
-    sample_rate: float,
-    frame_length: float = 25.0,
-    frame_shift: float = 10.0,
-    min_f0: float = 50,
-    max_f0: float = 400,
-    soft_min_f0: float = 10.0,
-    penalty_factor: float = 0.1,
-    lowpass_cutoff: float = 1000,
-    resample_frequency: float = 4000,
-    delta_pitch: float = 0.005,
-    nccf_ballast: float = 7000,
-    lowpass_filter_width: int = 1,
-    upsample_filter_width: int = 5,
-    max_frames_latency: int = 0,
-    frames_per_chunk: int = 0,
-    simulate_first_pass_online: bool = False,
-    recompute_frame: int = 500,
-    snip_edges: bool = True,
-) -> torch.Tensor:
-    """Extract pitch based on method described in *A pitch extraction algorithm tuned
-    for automatic speech recognition* :cite:`6854049`.
-
-    .. devices:: CPU
-
-    .. properties:: TorchScript
-
-    This function computes the equivalent of `compute-kaldi-pitch-feats` from Kaldi.
-
-    Args:
-        waveform (Tensor):
-            The input waveform of shape `(..., time)`.
-        sample_rate (float):
-            Sample rate of `waveform`.
-        frame_length (float, optional):
-            Frame length in milliseconds. (default: 25.0)
-        frame_shift (float, optional):
-            Frame shift in milliseconds. (default: 10.0)
-        min_f0 (float, optional):
-            Minimum F0 to search for (Hz)  (default: 50.0)
-        max_f0 (float, optional):
-            Maximum F0 to search for (Hz)  (default: 400.0)
-        soft_min_f0 (float, optional):
-            Minimum f0, applied in soft way, must not exceed min-f0  (default: 10.0)
-        penalty_factor (float, optional):
-            Cost factor for FO change.  (default: 0.1)
-        lowpass_cutoff (float, optional):
-            Cutoff frequency for LowPass filter (Hz) (default: 1000)
-        resample_frequency (float, optional):
-            Frequency that we down-sample the signal to. Must be more than twice lowpass-cutoff.
-            (default: 4000)
-        delta_pitch( float, optional):
-            Smallest relative change in pitch that our algorithm measures. (default: 0.005)
-        nccf_ballast (float, optional):
-            Increasing this factor reduces NCCF for quiet frames (default: 7000)
-        lowpass_filter_width (int, optional):
-            Integer that determines filter width of lowpass filter, more gives sharper filter.
-            (default: 1)
-        upsample_filter_width (int, optional):
-            Integer that determines filter width when upsampling NCCF. (default: 5)
-        max_frames_latency (int, optional):
-            Maximum number of frames of latency that we allow pitch tracking to introduce into
-            the feature processing (affects output only if ``frames_per_chunk > 0`` and
-            ``simulate_first_pass_online=True``) (default: 0)
-        frames_per_chunk (int, optional):
-            The number of frames used for energy normalization. (default: 0)
-        simulate_first_pass_online (bool, optional):
-            If true, the function will output features that correspond to what an online decoder
-            would see in the first pass of decoding -- not the final version of the features,
-            which is the default. (default: False)
-            Relevant if ``frames_per_chunk > 0``.
-        recompute_frame (int, optional):
-            Only relevant for compatibility with online pitch extraction.
-            A non-critical parameter; the frame at which we recompute some of the forward pointers,
-            after revising our estimate of the signal energy.
-            Relevant if ``frames_per_chunk > 0``. (default: 500)
-        snip_edges (bool, optional):
-            If this is set to false, the incomplete frames near the ending edge won't be snipped,
-            so that the number of frames is the file size divided by the frame-shift.
-            This makes different types of features give the same number of frames. (default: True)
-
-    Returns:
-       Tensor: Pitch feature. Shape: `(batch, frames 2)` where the last dimension
-       corresponds to pitch and NCCF.
-    """
-    shape = waveform.shape
-    waveform = waveform.reshape(-1, shape[-1])
-    result = torch.ops.torchaudio.kaldi_ComputeKaldiPitch(
-        waveform,
-        sample_rate,
-        frame_length,
-        frame_shift,
-        min_f0,
-        max_f0,
-        soft_min_f0,
-        penalty_factor,
-        lowpass_cutoff,
-        resample_frequency,
-        delta_pitch,
-        nccf_ballast,
-        lowpass_filter_width,
-        upsample_filter_width,
-        max_frames_latency,
-        frames_per_chunk,
-        simulate_first_pass_online,
-        recompute_frame,
-        snip_edges,
-    )
-    result = result.reshape(shape[:-1] + result.shape[-2:])
-    return result
-
-
 def _get_sinc_resample_kernel(
    orig_freq: int,
    new_freq: int,