Add Torchscript sox effects (#760)

* Add sox_utils module * Make init/shutdown thread safe * Add sox effects implementation * Add test for sox effects * Update docstrings and add examples

Add Torchscript sox effects (#760)
* Add sox_utils module * Make init/shutdown thread safe * Add sox effects implementation * Add test for sox effects * Update docstrings and add examples
60a8e23d · moto · GitHub · db8f2bf3 · 60a8e23d · 60a8e23d
Unverified Commit 60a8e23d authored Jul 15, 2020 by moto Committed by GitHub Jul 15, 2020
7 changed files
--- a/torchaudio/csrc/sox_io.cpp
+++ b/torchaudio/csrc/sox_io.cpp
@@ -125,14 +125,12 @@ void save_audio_file(
    const c10::intrusive_ptr<TensorSignal>& signal,
    const double compression) {
  const auto tensor = signal->getTensor();
-  const auto sample_rate = signal->getSampleRate();
  const auto channels_first = signal->getChannelsFirst();

  validate_input_tensor(tensor);

  const auto filetype = get_filetype(file_name);
-  const auto signal_info =
-      get_signalinfo(tensor, sample_rate, channels_first, filetype);
+  const auto signal_info = get_signalinfo(signal.get(), filetype);
  const auto encoding_info =
      get_encodinginfo(filetype, tensor.dtype(), compression);


--- a/torchaudio/csrc/sox_utils.cpp
+++ b/torchaudio/csrc/sox_utils.cpp
@@ -5,6 +5,49 @@
 namespace torchaudio {
 namespace sox_utils {

+void set_seed(const int64_t seed) {
+  sox_get_globals()->ranqd1 = static_cast<sox_int32_t>(seed);
+}
+
+void set_verbosity(const int64_t verbosity) {
+  sox_get_globals()->verbosity = static_cast<unsigned>(verbosity);
+}
+
+void set_use_threads(const bool use_threads) {
+  sox_get_globals()->use_threads = static_cast<sox_bool>(use_threads);
+}
+
+void set_buffer_size(const int64_t buffer_size) {
+  sox_get_globals()->bufsiz = static_cast<size_t>(buffer_size);
+}
+
+std::vector<std::vector<std::string>> list_effects() {
+  std::vector<std::vector<std::string>> effects;
+  for (const sox_effect_fn_t* fns = sox_get_effect_fns(); *fns; ++fns) {
+    const sox_effect_handler_t* handler = (*fns)();
+    if (handler && handler->name) {
+      if (UNSUPPORTED_EFFECTS.find(handler->name) ==
+          UNSUPPORTED_EFFECTS.end()) {
+        effects.emplace_back(std::vector<std::string>{
+            handler->name,
+            handler->usage ? std::string(handler->usage) : std::string("")});
+      }
+    }
+  }
+  return effects;
+}
+
+std::vector<std::string> list_formats() {
+  std::vector<std::string> formats;
+  for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) {
+    for (const char* const* names = fns->fn()->names; *names; ++names) {
+      if (!strchr(*names, '/'))
+        formats.emplace_back(*names);
+    }
+  }
+  return formats;
+}
+
 TensorSignal::TensorSignal(
    torch::Tensor tensor_,
    int64_t sample_rate_,
@@ -205,13 +248,13 @@ unsigned get_precision(
 }

 sox_signalinfo_t get_signalinfo(
-    const torch::Tensor& tensor,
-    const int64_t sample_rate,
-    const bool channels_first,
+    const TensorSignal* signal,
    const std::string filetype) {
+  auto tensor = signal->getTensor();
  return sox_signalinfo_t{
-      /*rate=*/static_cast<sox_rate_t>(sample_rate),
-      /*channels=*/static_cast<unsigned>(tensor.size(channels_first ? 0 : 1)),
+      /*rate=*/static_cast<sox_rate_t>(signal->getSampleRate()),
+      /*channels=*/
+      static_cast<unsigned>(tensor.size(signal->getChannelsFirst() ? 0 : 1)),
      /*precision=*/get_precision(filetype, tensor.dtype()),
      /*length=*/static_cast<uint64_t>(tensor.numel())};
 }

--- a/torchaudio/csrc/sox_utils.h
+++ b/torchaudio/csrc/sox_utils.h
@@ -7,6 +7,25 @@
 namespace torchaudio {
 namespace sox_utils {

+////////////////////////////////////////////////////////////////////////////////
+// APIs for Python interaction
+////////////////////////////////////////////////////////////////////////////////
+
+/// Set sox global options
+void set_seed(const int64_t seed);
+
+void set_verbosity(const int64_t verbosity);
+
+void set_use_threads(const bool use_threads);
+
+void set_buffer_size(const int64_t buffer_size);
+
+std::vector<std::vector<std::string>> list_effects();
+
+std::vector<std::string> list_formats();
+
+/// Class for exchanging signal information (tensor + meta data) between
+/// C++ and Python for read/write operation.
 struct TensorSignal : torch::CustomClassHolder {
  torch::Tensor tensor;
  int64_t sample_rate;
@@ -22,6 +41,13 @@ struct TensorSignal : torch::CustomClassHolder {
  bool getChannelsFirst() const;
 };

+////////////////////////////////////////////////////////////////////////////////
+// Utilities for sox_io / sox_effects implementations
+////////////////////////////////////////////////////////////////////////////////
+
+const std::unordered_set<std::string> UNSUPPORTED_EFFECTS =
+    {"input", "output", "spectrogram", "noiseprof", "noisered", "splice"};
+
 /// helper class to automatically close sox_format_t*
 struct SoxFormat {
  explicit SoxFormat(sox_format_t* fd) noexcept;
@@ -84,9 +110,7 @@ const std::string get_filetype(const std::string path);

 /// Get sox_signalinfo_t for passing a torch::Tensor object.
 sox_signalinfo_t get_signalinfo(
-    const torch::Tensor& tensor,
-    const int64_t sample_rate,
-    const bool channels_first,
+    const TensorSignal* signal,
    const std::string filetype);

 /// Get sox_encofinginfo_t for saving audoi file

--- a/torchaudio/sox_effects/__init__.py
+++ b/torchaudio/sox_effects/__init__.py
@@ -3,6 +3,8 @@ from .sox_effects import (
    init_sox_effects,
    shutdown_sox_effects,
    effect_names,
+    apply_effects_tensor,
+    apply_effects_file,
    SoxEffect,
    SoxEffectsChain,
 )

--- a/torchaudio/sox_effects/sox_effects.py
+++ b/torchaudio/sox_effects/sox_effects.py
@@ -7,6 +7,8 @@ from torchaudio._internal import (
    module_utils as _mod_utils,
    misc_ops as _misc_ops,
 )
+from torchaudio.utils.sox_utils import list_effects
+

 if _mod_utils.is_module_available('torchaudio._torchaudio'):
    from torchaudio import _torchaudio
@@ -52,7 +54,128 @@ def effect_names() -> List[str]:
    Example
        >>> EFFECT_NAMES = torchaudio.sox_effects.effect_names()
    """
-    return torch.ops.torchaudio.sox_effects_list_effects()
+    return list(list_effects().keys())
+
+
+@_mod_utils.requires_module('torchaudio._torchaudio')
+def apply_effects_tensor(
+        tensor: torch.Tensor,
+        sample_rate: int,
+        effects: List[List[str]],
+        channels_first: bool = True,
+) -> Tuple[torch.Tensor, int]:
+    """Apply sox effects to given Tensor
+
+    Args:
+        tensor (torch.Tensor): Input 2D Tensor.
+        sample_rate (int): Sample rate
+        effects (List[List[str]]): List of effects.
+        channels_first (bool): Indicates if the input Tensor's dimension is
+            ``[channels, time]`` or ``[time, channels]``
+
+    Returns:
+        Tuple[torch.Tensor, int]: Resulting Tensor and sample rate.
+        The resulting Tensor has the same ``dtype`` as the input Tensor, and
+        the same channels order. The shape of the Tensor can be different based on the
+        effects applied. Sample rate can also be different based on the effects applied.
+
+    Notes:
+        This function works in a way very similar to the ``sox`` command; however, there are slight
+        differences. For example, the ``sox`` command adds certain effects automatically (such as
+        the ``rate`` effect after ``speed``, ``pitch`` and other effects), but this function
+        only applies the given effects. (Therefore, to actually apply the ``speed`` effect, you also
+        need to give the ``rate`` effect with the desired sampling rate.)
+
+    Examples:
+        >>> # Defines the effects to apply
+        >>> effects = [
+        ...     ['gain', '-n'],  # normalises to 0dB
+        ...     ['pitch', '5'],  # 5 cent pitch shift
+        ...     ['rate', '8000'],  # resample to 8000 Hz
+        ... ]
+        >>> # Generate pseudo wave:
+        >>> # normalized, channels first, 2ch, sampling rate 16000, 1 second
+        >>> sample_rate = 16000
+        >>> waveform = 2 * torch.rand([2, sample_rate * 1]) - 1
+        >>> waveform.shape
+        torch.Size([2, 16000])
+        >>> waveform
+        tensor([[ 0.3138,  0.7620, -0.9019,  ..., -0.7495, -0.4935,  0.5442],
+                [-0.0832,  0.0061,  0.8233,  ..., -0.5176, -0.9140, -0.2434]])
+        >>> # Apply effects
+        >>> waveform, sample_rate = apply_effects_tensor(
+        ...     waveform, sample_rate, effects, channels_first=True)
+        >>> # The new waveform is sampling rate 8000, 1 second.
+        >>> # normalization and channel order are preserved
+        >>> waveform.shape
+        torch.Size([2, 8000])
+        >>> waveform
+        tensor([[ 0.5054, -0.5518, -0.4800,  ..., -0.0076,  0.0096, -0.0110],
+                [ 0.1331,  0.0436, -0.3783,  ..., -0.0035,  0.0012,  0.0008]])
+        >>> sample_rate
+        8000
+    """
+    in_signal = torch.classes.torchaudio.TensorSignal(tensor, sample_rate, channels_first)
+    out_signal = torch.ops.torchaudio.sox_effects_apply_effects_tensor(in_signal, effects)
+    return out_signal.get_tensor(), out_signal.get_sample_rate()
+
+
+@_mod_utils.requires_module('torchaudio._torchaudio')
+def apply_effects_file(
+        path: str,
+        effects: List[List[str]],
+        normalize: bool = True,
+        channels_first: bool = True,
+) -> Tuple[torch.Tensor, int]:
+    """Apply sox effects to the audio file and load the resulting data as Tensor
+
+    Args:
+        path (str): Path to the audio file.
+        effects (List[List[str]]): List of effects.
+        normalize (bool): When ``True``, this function always returns ``float32``, and sample values are
+            normalized to ``[-1.0, 1.0]``. If the input file is integer WAV, giving ``False`` will change
+            the resulting Tensor type to integer type. This argument has no effect for formats other
+            than integer WAV type.
+        channels_first (bool): When True, the returned Tensor has dimension ``[channel, time]``.
+            Otherwise, the returned Tensor's dimension is ``[time, channel]``.
+
+    Returns:
+        Tuple[torch.Tensor, int]: Resulting Tensor and sample rate.
+        If ``normalize=True``, the resulting Tensor is always ``float32`` type.
+        If ``normalize=False`` and the input audio file is of integer WAV file, then the
+        resulting Tensor has corresponding integer type. (Note 24 bit integer type is not supported)
+        If ``channels_first=True``, the resulting Tensor has dimension ``[channel, time]``,
+        otherwise ``[time, channel]``.
+
+    Notes:
+        This function works in a way very similar to the ``sox`` command; however, there are slight
+        differences. For example, the ``sox`` command adds certain effects automatically (such as
+        the ``rate`` effect after ``speed``, ``pitch`` etc), but this function only applies the given
+        effects. Therefore, to actually apply the ``speed`` effect, you also need to give the ``rate``
+        effect with the desired sampling rate, because internally, the ``speed`` effect only alters
+        the sampling rate and leaves the samples untouched.
+
+    Examples:
+        >>> # Defines the effects to apply
+        >>> effects = [
+        ...     ['gain', '-n'],  # normalises to 0dB
+        ...     ['pitch', '5'],  # 5 cent pitch shift
+        ...     ['rate', '8000'],  # resample to 8000 Hz
+        ... ]
+        >>> # Apply effects and load data with channels_first=True
+        >>> waveform, sample_rate = apply_effects_file("data.wav", effects, channels_first=True)
+        >>> waveform.shape
+        torch.Size([2, 8000])
+        >>> waveform
+        tensor([[ 5.1151e-03,  1.8073e-02,  2.2188e-02,  ...,  1.0431e-07,
+                 -1.4761e-07,  1.8114e-07],
+                [-2.6924e-03,  2.1860e-03,  1.0650e-02,  ...,  6.4122e-07,
+                 -5.6159e-07,  4.8103e-07]])
+        >>> sample_rate
+        8000
+    """
+    signal = torch.ops.torchaudio.sox_effects_apply_effects_file(path, effects, normalize, channels_first)
+    return signal.get_tensor(), signal.get_sample_rate()


 @_mod_utils.requires_module('torchaudio._torchaudio')

--- a/torchaudio/utils/__init__.py
+++ b/torchaudio/utils/__init__.py
+from . import (
+    sox_utils,
+)
+
+from torchaudio._internal import module_utils as _mod_utils
+
+
+if _mod_utils.is_module_available('torchaudio._torchaudio'):
+    sox_utils.set_verbosity(1)
--- a/torchaudio/utils/sox_utils.py
+++ b/torchaudio/utils/sox_utils.py
+from typing import List, Dict
+
+import torch
+
+from torchaudio._internal import (
+    module_utils as _mod_utils,
+)
+
+
+@_mod_utils.requires_module('torchaudio._torchaudio')
+def set_seed(seed: int):
+    """Set the seed for libsox's PRNG
+
+    Args:
+        seed (int): seed value. valid range is int32.
+
+    See Also:
+        http://sox.sourceforge.net/sox.html
+    """
+    torch.ops.torchaudio.sox_utils_set_seed(seed)
+
+
+@_mod_utils.requires_module('torchaudio._torchaudio')
+def set_verbosity(verbosity: int):
+    """Set libsox's verbosity
+
+    Args:
+        verbosity (int): Set verbosity level of libsox.
+            1: failure messages
+            2: warnings
+            3: details of processing
+            4-6: increasing levels of debug messages
+
+    See Also:
+        http://sox.sourceforge.net/sox.html
+    """
+    torch.ops.torchaudio.sox_utils_set_verbosity(verbosity)
+
+
+@_mod_utils.requires_module('torchaudio._torchaudio')
+def set_buffer_size(buffer_size: int):
+    """Set buffer size for sox effect chain
+
+    Args:
+        buffer_size (int): Set the size in bytes of the buffers used for processing audio.
+
+    See Also:
+        http://sox.sourceforge.net/sox.html
+    """
+    torch.ops.torchaudio.sox_utils_set_buffer_size(buffer_size)
+
+
+@_mod_utils.requires_module('torchaudio._torchaudio')
+def set_use_threads(use_threads: bool):
+    """Set multithread option for sox effect chain
+
+    Args:
+        use_threads (bool): When True, enables libsox's parallel effects channels processing.
+            To use multithreading, the underlying libsox has to be compiled with OpenMP support.
+
+    See Also:
+        http://sox.sourceforge.net/sox.html
+    """
+    torch.ops.torchaudio.sox_utils_set_use_threads(use_threads)
+
+
+@_mod_utils.requires_module('torchaudio._torchaudio')
+def list_effects() -> Dict[str, str]:
+    """List the available sox effects with their usage strings
+
+    Returns:
+        Dict[str, str]: Mapping from "effect name" to "usage"
+    """
+    return dict(torch.ops.torchaudio.sox_utils_list_effects())
+
+
+@_mod_utils.requires_module('torchaudio._torchaudio')
+def list_formats() -> List[str]:
+    """List the supported audio formats
+
+    Returns:
+        List[str]: List of supported audio formats
+    """
+    return torch.ops.torchaudio.sox_utils_list_formats()