Commit 5bbbb1d5 authored by moto's avatar moto Committed by Facebook GitHub Bot
Browse files

[BC-Breaking] Remove compute_kaldi_pitch (#3368)

Summary:
This commit removes the compute_kaldi_pitch function and the underlying Kaldi integration from torchaudio.

The Kaldi pitch function was added on a short timeline by integrating the original Kaldi implementation, instead of reimplementing it in PyTorch.

The Kaldi integration employed a hack that replaces Kaldi's base vector/matrix implementation with PyTorch Tensors, so that there is only one BLAS library within torchaudio.

Recently, we have been making torchaudio leaner, and we have not seen wide adoption of the kaldi_pitch feature, so we decided to remove it together with the Kaldi integration.

See some of the discussion at https://github.com/pytorch/audio/issues/1269

Pull Request resolved: https://github.com/pytorch/audio/pull/3368

Differential Revision: D46406176

Pulled By: mthrok

fbshipit-source-id: ee5e24d825188f379979ddccd680c7323b119b1e
parent 2ba36b47
......@@ -14,12 +14,10 @@ _LG = logging.getLogger(__name__)
# Builder uses it for debugging purpose, so we export it.
# https://github.com/pytorch/builder/blob/e2e4542b8eb0bdf491214451a1a4128bd606cce2/test/smoke_test/smoke_test.py#L80
__all__ = [
"fail_if_no_kaldi",
"fail_if_no_sox",
"fail_if_no_ffmpeg",
"_check_cuda_version",
"_IS_TORCHAUDIO_EXT_AVAILABLE",
"_IS_KALDI_AVAILABLE",
"_IS_RIR_AVAILABLE",
"_SOX_INITIALIZED",
"_FFMPEG_INITIALIZED",
......@@ -34,11 +32,10 @@ if os.name == "nt" and (3, 8) <= sys.version_info < (3, 9):
# In case of an error, we do not catch the failure as it suggests there is something
# wrong with the installation.
_IS_TORCHAUDIO_EXT_AVAILABLE = is_module_available("torchaudio.lib._torchaudio")
# Kaldi and RIR features are implemented in _torchaudio extension, but they can be individually
# RIR features are implemented in _torchaudio extension, but they can be individually
# turned on/off at build time. Available means that _torchaudio is loaded properly, and
# Kaldi or RIR features are found there.
# RIR features are found there.
_IS_RIR_AVAILABLE = False
_IS_KALDI_AVAILABLE = False
_IS_ALIGN_AVAILABLE = False
if _IS_TORCHAUDIO_EXT_AVAILABLE:
_load_lib("libtorchaudio")
......@@ -47,7 +44,6 @@ if _IS_TORCHAUDIO_EXT_AVAILABLE:
_check_cuda_version()
_IS_RIR_AVAILABLE = torchaudio.lib._torchaudio.is_rir_available()
_IS_KALDI_AVAILABLE = torchaudio.lib._torchaudio.is_kaldi_available()
_IS_ALIGN_AVAILABLE = torchaudio.lib._torchaudio.is_align_available()
......@@ -77,13 +73,6 @@ if is_module_available("torchaudio.lib._torchaudio_ffmpeg"):
_LG.debug("Failed to initialize ffmpeg bindings", exc_info=True)
# Guard used as a decorator on Kaldi-backed functions.
# When the Kaldi feature was detected in the extension, the guard is `no_op`;
# otherwise it is whatever `fail_with_message` builds from the message below
# (presumably a decorator that raises when the wrapped function is called --
# confirm against `fail_with_message`).
if _IS_KALDI_AVAILABLE:
    fail_if_no_kaldi = no_op
else:
    fail_if_no_kaldi = fail_with_message(
        "requires kaldi extension, but TorchAudio is not compiled with it. Please build TorchAudio with kaldi support."
    )
fail_if_no_sox = (
no_op
if _SOX_INITIALIZED
......
......@@ -76,12 +76,6 @@ if(USE_CUDA)
)
endif()
# Optional Kaldi integration: only when BUILD_KALDI is enabled, link the
# `kaldi` library, compile the `kaldi.cpp` binding source, and add the
# INCLUDE_KALDI compile definition (presumably used by the C++ sources to
# gate Kaldi-specific code -- confirm in the extension sources).
if(BUILD_KALDI)
# Extra library to link against.
list(APPEND additional_libs kaldi)
# Extra source file for the extension.
list(APPEND sources kaldi.cpp)
# Extra preprocessor definition.
list(APPEND compile_definitions INCLUDE_KALDI)
endif()
if(OpenMP_CXX_FOUND)
list(
APPEND
......
......@@ -28,7 +28,6 @@ from .functional import (
apply_beamforming,
apply_codec,
compute_deltas,
compute_kaldi_pitch,
convolve,
create_dct,
DB_to_amplitude,
......@@ -65,7 +64,6 @@ from .functional import (
__all__ = [
"amplitude_to_DB",
"compute_deltas",
"compute_kaldi_pitch",
"create_dct",
"melscale_fbanks",
"linear_fbanks",
......
......@@ -21,7 +21,6 @@ __all__ = [
"amplitude_to_DB",
"DB_to_amplitude",
"compute_deltas",
"compute_kaldi_pitch",
"melscale_fbanks",
"linear_fbanks",
"create_dct",
......@@ -1340,120 +1339,6 @@ def apply_codec(
return augmented
@torchaudio._extension.fail_if_no_kaldi
def compute_kaldi_pitch(
    waveform: torch.Tensor,
    sample_rate: float,
    frame_length: float = 25.0,
    frame_shift: float = 10.0,
    min_f0: float = 50,
    max_f0: float = 400,
    soft_min_f0: float = 10.0,
    penalty_factor: float = 0.1,
    lowpass_cutoff: float = 1000,
    resample_frequency: float = 4000,
    delta_pitch: float = 0.005,
    nccf_ballast: float = 7000,
    lowpass_filter_width: int = 1,
    upsample_filter_width: int = 5,
    max_frames_latency: int = 0,
    frames_per_chunk: int = 0,
    simulate_first_pass_online: bool = False,
    recompute_frame: int = 500,
    snip_edges: bool = True,
) -> torch.Tensor:
    """Extract pitch based on method described in *A pitch extraction algorithm tuned
    for automatic speech recognition* :cite:`6854049`.

    .. devices:: CPU

    .. properties:: TorchScript

    This function computes the equivalent of `compute-kaldi-pitch-feats` from Kaldi.

    Args:
        waveform (Tensor):
            The input waveform of shape `(..., time)`.
        sample_rate (float):
            Sample rate of `waveform`.
        frame_length (float, optional):
            Frame length in milliseconds. (default: 25.0)
        frame_shift (float, optional):
            Frame shift in milliseconds. (default: 10.0)
        min_f0 (float, optional):
            Minimum F0 to search for (Hz) (default: 50.0)
        max_f0 (float, optional):
            Maximum F0 to search for (Hz) (default: 400.0)
        soft_min_f0 (float, optional):
            Minimum F0, applied in a soft way; must not exceed ``min_f0``. (default: 10.0)
        penalty_factor (float, optional):
            Cost factor for F0 change. (default: 0.1)
        lowpass_cutoff (float, optional):
            Cutoff frequency for LowPass filter (Hz) (default: 1000)
        resample_frequency (float, optional):
            Frequency that we down-sample the signal to. Must be more than twice
            ``lowpass_cutoff``. (default: 4000)
        delta_pitch (float, optional):
            Smallest relative change in pitch that our algorithm measures. (default: 0.005)
        nccf_ballast (float, optional):
            Increasing this factor reduces NCCF for quiet frames (default: 7000)
        lowpass_filter_width (int, optional):
            Integer that determines filter width of lowpass filter, more gives sharper filter.
            (default: 1)
        upsample_filter_width (int, optional):
            Integer that determines filter width when upsampling NCCF. (default: 5)
        max_frames_latency (int, optional):
            Maximum number of frames of latency that we allow pitch tracking to introduce into
            the feature processing (affects output only if ``frames_per_chunk > 0`` and
            ``simulate_first_pass_online=True``) (default: 0)
        frames_per_chunk (int, optional):
            The number of frames used for energy normalization. (default: 0)
        simulate_first_pass_online (bool, optional):
            If true, the function will output features that correspond to what an online decoder
            would see in the first pass of decoding -- not the final version of the features,
            which is the default. (default: False)
            Relevant if ``frames_per_chunk > 0``.
        recompute_frame (int, optional):
            Only relevant for compatibility with online pitch extraction.
            A non-critical parameter; the frame at which we recompute some of the forward pointers,
            after revising our estimate of the signal energy.
            Relevant if ``frames_per_chunk > 0``. (default: 500)
        snip_edges (bool, optional):
            If this is set to false, the incomplete frames near the ending edge won't be snipped,
            so that the number of frames is the file size divided by the frame-shift.
            This makes different types of features give the same number of frames. (default: True)

    Returns:
        Tensor: Pitch feature. Shape: `(..., frames, 2)` where the last dimension
        corresponds to pitch and NCCF.
    """
    shape = waveform.shape
    # Collapse all leading dimensions into one so the op receives a 2D
    # `(N, time)` tensor (presumably what the Kaldi op expects -- confirm
    # against the op schema).
    waveform = waveform.reshape(-1, shape[-1])
    result = torch.ops.torchaudio.kaldi_ComputeKaldiPitch(
        waveform,
        sample_rate,
        frame_length,
        frame_shift,
        min_f0,
        max_f0,
        soft_min_f0,
        penalty_factor,
        lowpass_cutoff,
        resample_frequency,
        delta_pitch,
        nccf_ballast,
        lowpass_filter_width,
        upsample_filter_width,
        max_frames_latency,
        frames_per_chunk,
        simulate_first_pass_online,
        recompute_frame,
        snip_edges,
    )
    # Restore the caller's leading dimensions in front of the op's last two
    # output dimensions.
    result = result.reshape(shape[:-1] + result.shape[-2:])
    return result
def _get_sinc_resample_kernel(
orig_freq: int,
new_freq: int,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment