Unverified Commit 60a8e23d authored by moto's avatar moto Committed by GitHub
Browse files

Add Torchscript sox effects (#760)

* Add sox_utils module

* Make init/shutdown thread safe

* Add sox effects implementation

* Add test for sox effects

* Update docstrings and add examples
parent db8f2bf3
......@@ -125,14 +125,12 @@ void save_audio_file(
const c10::intrusive_ptr<TensorSignal>& signal,
const double compression) {
const auto tensor = signal->getTensor();
const auto sample_rate = signal->getSampleRate();
const auto channels_first = signal->getChannelsFirst();
validate_input_tensor(tensor);
const auto filetype = get_filetype(file_name);
const auto signal_info =
get_signalinfo(tensor, sample_rate, channels_first, filetype);
const auto signal_info = get_signalinfo(signal.get(), filetype);
const auto encoding_info =
get_encodinginfo(filetype, tensor.dtype(), compression);
......
......@@ -5,6 +5,49 @@
namespace torchaudio {
namespace sox_utils {
/// Seed libsox's pseudo-random number generator.
///
/// The 64-bit value is converted to `sox_int32_t` and stored in the
/// `ranqd1` field of the sox globals.
void set_seed(const int64_t seed) {
  auto* const globals = sox_get_globals();
  globals->ranqd1 = static_cast<sox_int32_t>(seed);
}
/// Set the libsox verbosity level.
///
/// The value is converted to `unsigned` and stored in the `verbosity`
/// field of the sox globals.
void set_verbosity(const int64_t verbosity) {
  auto* const globals = sox_get_globals();
  globals->verbosity = static_cast<unsigned>(verbosity);
}
/// Turn the libsox `use_threads` global option on or off.
void set_use_threads(const bool use_threads) {
  auto* const globals = sox_get_globals();
  globals->use_threads = static_cast<sox_bool>(use_threads);
}
/// Set the libsox processing buffer size.
///
/// The value is converted to `size_t` and stored in the `bufsiz` field of
/// the sox globals.
void set_buffer_size(const int64_t buffer_size) {
  auto* const globals = sox_get_globals();
  globals->bufsiz = static_cast<size_t>(buffer_size);
}
std::vector<std::vector<std::string>> list_effects() {
std::vector<std::vector<std::string>> effects;
for (const sox_effect_fn_t* fns = sox_get_effect_fns(); *fns; ++fns) {
const sox_effect_handler_t* handler = (*fns)();
if (handler && handler->name) {
if (UNSUPPORTED_EFFECTS.find(handler->name) ==
UNSUPPORTED_EFFECTS.end()) {
effects.emplace_back(std::vector<std::string>{
handler->name,
handler->usage ? std::string(handler->usage) : std::string("")});
}
}
}
return effects;
}
std::vector<std::string> list_formats() {
std::vector<std::string> formats;
for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) {
for (const char* const* names = fns->fn()->names; *names; ++names) {
if (!strchr(*names, '/'))
formats.emplace_back(*names);
}
}
return formats;
}
TensorSignal::TensorSignal(
torch::Tensor tensor_,
int64_t sample_rate_,
......@@ -205,13 +248,13 @@ unsigned get_precision(
}
/// Build the `sox_signalinfo_t` that describes a signal for a given file type.
///
/// The fields are filled from the sample rate, the channel count (the tensor
/// dimension selected by the channels-first flag: 0 when true, 1 otherwise),
/// the precision reported by `get_precision(filetype, dtype)`, and the total
/// number of tensor elements as the length.
///
/// NOTE(review): this span appears to be a rendered diff hunk without +/-
/// markers, so the old form (separate tensor / sample_rate / channels_first
/// parameters) and the new form (a single `TensorSignal*`) are interleaved
/// here. Confirm against the committed file before editing.
sox_signalinfo_t get_signalinfo(
const torch::Tensor& tensor,
const int64_t sample_rate,
const bool channels_first,
const TensorSignal* signal,
const std::string filetype) {
auto tensor = signal->getTensor();
return sox_signalinfo_t{
/*rate=*/static_cast<sox_rate_t>(sample_rate),
/*channels=*/static_cast<unsigned>(tensor.size(channels_first ? 0 : 1)),
/*rate=*/static_cast<sox_rate_t>(signal->getSampleRate()),
/*channels=*/
static_cast<unsigned>(tensor.size(signal->getChannelsFirst() ? 0 : 1)),
/*precision=*/get_precision(filetype, tensor.dtype()),
/*length=*/static_cast<uint64_t>(tensor.numel())};
}
......
......@@ -7,6 +7,25 @@
namespace torchaudio {
namespace sox_utils {
////////////////////////////////////////////////////////////////////////////////
// APIs for Python interaction
////////////////////////////////////////////////////////////////////////////////
/// Set sox global options

/// Seed libsox's PRNG; the value is narrowed to `sox_int32_t`.
void set_seed(const int64_t seed);
/// Set libsox's verbosity level; the value is converted to `unsigned`.
void set_verbosity(const int64_t verbosity);
/// Enable or disable libsox's `use_threads` global option.
void set_use_threads(const bool use_threads);
/// Set libsox's buffer size (`bufsiz`); the value is converted to `size_t`.
void set_buffer_size(const int64_t buffer_size);
/// List the sox effects as `{name, usage}` pairs, excluding the entries in
/// `UNSUPPORTED_EFFECTS`.
std::vector<std::vector<std::string>> list_effects();
/// List the format names registered with libsox, excluding names that
/// contain a '/' character.
std::vector<std::string> list_formats();
/// Class for exchanging signal information (tensor + metadata) between
/// C++ and Python for read/write operations.
struct TensorSignal : torch::CustomClassHolder {
torch::Tensor tensor;
int64_t sample_rate;
......@@ -22,6 +41,13 @@ struct TensorSignal : torch::CustomClassHolder {
bool getChannelsFirst() const;
};
////////////////////////////////////////////////////////////////////////////////
// Utilities for sox_io / sox_effects implementations
////////////////////////////////////////////////////////////////////////////////
/// Names of sox effects that `list_effects()` filters out of its result.
const std::unordered_set<std::string> UNSUPPORTED_EFFECTS{
    "input",
    "output",
    "spectrogram",
    "noiseprof",
    "noisered",
    "splice",
};
/// helper class to automatically close sox_format_t*
struct SoxFormat {
explicit SoxFormat(sox_format_t* fd) noexcept;
......@@ -84,9 +110,7 @@ const std::string get_filetype(const std::string path);
/// Get the sox_signalinfo_t describing a signal's tensor for the given file
/// type.
/// NOTE(review): this span appears to be a rendered diff hunk without +/-
/// markers; the old parameters (tensor, sample_rate, channels_first) and the
/// new one (signal) are shown together. Confirm against the committed header.
sox_signalinfo_t get_signalinfo(
const torch::Tensor& tensor,
const int64_t sample_rate,
const bool channels_first,
const TensorSignal* signal,
const std::string filetype);
/// Get sox_encodinginfo_t for saving an audio file
......
......@@ -3,6 +3,8 @@ from .sox_effects import (
init_sox_effects,
shutdown_sox_effects,
effect_names,
apply_effects_tensor,
apply_effects_file,
SoxEffect,
SoxEffectsChain,
)
......
......@@ -7,6 +7,8 @@ from torchaudio._internal import (
module_utils as _mod_utils,
misc_ops as _misc_ops,
)
from torchaudio.utils.sox_utils import list_effects
if _mod_utils.is_module_available('torchaudio._torchaudio'):
from torchaudio import _torchaudio
......@@ -52,7 +54,128 @@ def effect_names() -> List[str]:
Example
>>> EFFECT_NAMES = torchaudio.sox_effects.effect_names()
"""
return torch.ops.torchaudio.sox_effects_list_effects()
return list(list_effects().keys())
@_mod_utils.requires_module('torchaudio._torchaudio')
def apply_effects_tensor(
        tensor: torch.Tensor,
        sample_rate: int,
        effects: List[List[str]],
        channels_first: bool = True,
) -> Tuple[torch.Tensor, int]:
    """Run a chain of sox effects over an in-memory waveform.

    Args:
        tensor (torch.Tensor): Input 2D Tensor.
        sample_rate (int): Sample rate of the input.
        effects (List[List[str]]): Effects to apply. Each inner list is an
            effect name followed by its arguments.
        channels_first (bool): Whether the input Tensor is laid out as
            ``[channels, time]`` (``True``) or ``[time, channels]`` (``False``).

    Returns:
        Tuple[torch.Tensor, int]: The processed Tensor and its sample rate.
        The output keeps the ``dtype`` and the channel order of the input.
        Both the shape and the sample rate may differ from the input,
        depending on the effects applied.

    Notes:
        This function behaves much like the ``sox`` command, with some
        differences. For example, the ``sox`` command inserts certain effects
        automatically (such as a ``rate`` effect after ``speed``, ``pitch``
        and other effects), whereas this function applies only the effects it
        is given. Therefore, to actually apply the ``speed`` effect you also
        need to pass a ``rate`` effect with the desired sampling rate.

    Examples:
        >>> # Defines the effects to apply
        >>> effects = [
        ...     ['gain', '-n'],  # normalises to 0dB
        ...     ['pitch', '5'],  # 5 cent pitch shift
        ...     ['rate', '8000'],  # resample to 8000 Hz
        ... ]
        >>> # Generate pseudo wave:
        >>> # normalized, channels first, 2ch, sampling rate 16000, 1 second
        >>> sample_rate = 16000
        >>> waveform = 2 * torch.rand([2, sample_rate * 1]) - 1
        >>> waveform.shape
        torch.Size([2, 16000])
        >>> waveform
        tensor([[ 0.3138,  0.7620, -0.9019,  ..., -0.7495, -0.4935,  0.5442],
                [-0.0832,  0.0061,  0.8233,  ..., -0.5176, -0.9140, -0.2434]])
        >>> # Apply effects
        >>> waveform, sample_rate = apply_effects_tensor(
        ...     waveform, sample_rate, effects, channels_first=True)
        >>> # The new waveform is sampling rate 8000, 1 second.
        >>> # normalization and channel order are preserved
        >>> waveform.shape
        torch.Size([2, 8000])
        >>> waveform
        tensor([[ 0.5054, -0.5518, -0.4800,  ..., -0.0076,  0.0096, -0.0110],
                [ 0.1331,  0.0436, -0.3783,  ..., -0.0035,  0.0012,  0.0008]])
        >>> sample_rate
        8000
    """
    # Wrap the inputs in the custom class the C++ op takes as its argument.
    source = torch.classes.torchaudio.TensorSignal(tensor, sample_rate, channels_first)
    processed = torch.ops.torchaudio.sox_effects_apply_effects_tensor(source, effects)
    out_tensor = processed.get_tensor()
    out_sample_rate = processed.get_sample_rate()
    return out_tensor, out_sample_rate
@_mod_utils.requires_module('torchaudio._torchaudio')
def apply_effects_file(
        path: str,
        effects: List[List[str]],
        normalize: bool = True,
        channels_first: bool = True,
) -> Tuple[torch.Tensor, int]:
    """Load an audio file through a chain of sox effects and return the result as a Tensor.

    Args:
        path (str): Path to the audio file.
        effects (List[List[str]]): Effects to apply. Each inner list is an
            effect name followed by its arguments.
        normalize (bool): When ``True``, the result is always ``float32`` with
            sample values normalized to ``[-1.0, 1.0]``. When the input file is
            an integer WAV, passing ``False`` makes the resulting Tensor use an
            integer type instead. This argument has no effect for formats other
            than integer WAV.
        channels_first (bool): When ``True``, the returned Tensor has dimension
            ``[channel, time]``; otherwise ``[time, channel]``.

    Returns:
        Tuple[torch.Tensor, int]: The resulting Tensor and its sample rate.
        With ``normalize=True`` the Tensor is always ``float32``.
        With ``normalize=False`` and an integer WAV input, the Tensor has the
        corresponding integer type. (Note that 24 bit integer type is not
        supported.)
        With ``channels_first=True`` the Tensor has dimension
        ``[channel, time]``, otherwise ``[time, channel]``.

    Notes:
        This function behaves much like the ``sox`` command, with some
        differences. For example, the ``sox`` command inserts certain effects
        automatically (such as a ``rate`` effect after ``speed``, ``pitch``
        etc.), whereas this function applies only the effects it is given.
        Therefore, to actually apply the ``speed`` effect you also need to pass
        a ``rate`` effect with the desired sampling rate, because internally
        the ``speed`` effect only alters the sampling rate and leaves the
        samples untouched.

    Examples:
        >>> # Defines the effects to apply
        >>> effects = [
        ...     ['gain', '-n'],  # normalises to 0dB
        ...     ['pitch', '5'],  # 5 cent pitch shift
        ...     ['rate', '8000'],  # resample to 8000 Hz
        ... ]
        >>> # Apply effects and load data with channels_first=True
        >>> waveform, sample_rate = apply_effects_file("data.wav", effects, channels_first=True)
        >>> waveform.shape
        torch.Size([2, 8000])
        >>> waveform
        tensor([[ 5.1151e-03,  1.8073e-02,  2.2188e-02,  ...,  1.0431e-07,
                 -1.4761e-07,  1.8114e-07],
                [-2.6924e-03,  2.1860e-03,  1.0650e-02,  ...,  6.4122e-07,
                 -5.6159e-07,  4.8103e-07]])
        >>> sample_rate
        8000
    """
    loaded = torch.ops.torchaudio.sox_effects_apply_effects_file(
        path, effects, normalize, channels_first)
    out_tensor = loaded.get_tensor()
    out_sample_rate = loaded.get_sample_rate()
    return out_tensor, out_sample_rate
@_mod_utils.requires_module('torchaudio._torchaudio')
......
from . import (
sox_utils,
)
from torchaudio._internal import module_utils as _mod_utils
# When the compiled extension is importable, set libsox's verbosity to
# level 1 as soon as this package is imported.
if _mod_utils.is_module_available('torchaudio._torchaudio'):
    sox_utils.set_verbosity(1)
from typing import List, Dict
import torch
from torchaudio._internal import (
module_utils as _mod_utils,
)
@_mod_utils.requires_module('torchaudio._torchaudio')
def set_seed(seed: int):
    """Seed the pseudo-random number generator used by libsox.

    Args:
        seed (int): Seed value. The valid range is that of int32.

    See Also:
        http://sox.sourceforge.net/sox.html
    """
    torch.ops.torchaudio.sox_utils_set_seed(seed)
@_mod_utils.requires_module('torchaudio._torchaudio')
def set_verbosity(verbosity: int):
    """Choose how much diagnostic output libsox produces.

    Args:
        verbosity (int): Verbosity level of libsox.

            * ``1``: failure messages
            * ``2``: warnings
            * ``3``: details of processing
            * ``4``-``6``: increasing levels of debug messages

    See Also:
        http://sox.sourceforge.net/sox.html
    """
    torch.ops.torchaudio.sox_utils_set_verbosity(verbosity)
@_mod_utils.requires_module('torchaudio._torchaudio')
def set_buffer_size(buffer_size: int):
    """Configure the buffer size used by the sox effect chain.

    Args:
        buffer_size (int): Size, in bytes, of the buffers used for processing audio.

    See Also:
        http://sox.sourceforge.net/sox.html
    """
    torch.ops.torchaudio.sox_utils_set_buffer_size(buffer_size)
@_mod_utils.requires_module('torchaudio._torchaudio')
def set_use_threads(use_threads: bool):
    """Switch the multithreading option of the sox effect chain on or off.

    Args:
        use_threads (bool): When ``True``, enables libsox's parallel effects
            channels processing. To use multithreading, the underlying libsox
            has to be compiled with OpenMP support.

    See Also:
        http://sox.sourceforge.net/sox.html
    """
    torch.ops.torchaudio.sox_utils_set_use_threads(use_threads)
@_mod_utils.requires_module('torchaudio._torchaudio')
def list_effects() -> Dict[str, str]:
    """Return the available sox effects together with their usage strings.

    Returns:
        Dict[str, str]: Mapping from "effect name" to "usage"
    """
    # The op yields a sequence of two-item entries; turn it into a mapping.
    entries = torch.ops.torchaudio.sox_utils_list_effects()
    return {name: usage for name, usage in entries}
@_mod_utils.requires_module('torchaudio._torchaudio')
def list_formats() -> List[str]:
    """Return the audio formats supported by the underlying libsox.

    Returns:
        List[str]: List of supported audio formats
    """
    formats = torch.ops.torchaudio.sox_utils_list_formats()
    return formats
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment