sox_effects.py 11.4 KB
Newer Older
1
2
import os
from pathlib import Path
3
from typing import List, Tuple, Optional
David Pollack's avatar
David Pollack committed
4

5
import torch
David Pollack's avatar
David Pollack committed
6

7
import torchaudio
moto's avatar
moto committed
8
from torchaudio._internal import module_utils as _mod_utils
moto's avatar
moto committed
9
10
from torchaudio.utils.sox_utils import list_effects

Vincent QB's avatar
Vincent QB committed
11

12
@_mod_utils.requires_module('torchaudio._torchaudio')
def init_sox_effects():
    """Initialize resources required to use sox effects.

    Note:
        You do not need to call this function manually. It is called automatically.

    Once initialized, you do not need to call this function again across multiple uses of
    sox effects, though it is safe to do so as long as :func:`shutdown_sox_effects` has not
    been called yet.
    Once :func:`shutdown_sox_effects` is called, you can no longer use SoX effects and
    initializing again will result in an error.
    """
    # Delegates to the operator registered by the compiled torchaudio extension.
    torch.ops.torchaudio.sox_effects_initialize_sox_effects()
25
26
27


@_mod_utils.requires_module("torchaudio._torchaudio")
def shutdown_sox_effects():
    """Clean up resources required to use sox effects.

    Note:
        You do not need to call this function manually. It is called automatically.

    It is safe to call this function multiple times.
    Once :func:`shutdown_sox_effects` is called, you can no longer use SoX effects and
    initializing again will result in an error.
    """
    # Delegates to the operator registered by the compiled torchaudio extension.
    torch.ops.torchaudio.sox_effects_shutdown_sox_effects()
39
40


41
@_mod_utils.requires_module('torchaudio._torchaudio')
def effect_names() -> List[str]:
    """Gets list of valid sox effect names

    Returns:
        List[str]: list of available effect names.

    Example:
        >>> torchaudio.sox_effects.effect_names()
        ['allpass', 'band', 'bandpass', ... ]
    """
    # ``list_effects`` returns a mapping; only its keys (the effect names) are exposed here.
    return list(list_effects().keys())


@_mod_utils.requires_module('torchaudio._torchaudio')
def apply_effects_tensor(
        tensor: torch.Tensor,
        sample_rate: int,
        effects: List[List[str]],
        channels_first: bool = True,
) -> Tuple[torch.Tensor, int]:
    """Apply sox effects to given Tensor

    Note:
        This function works in the way very similar to ``sox`` command, however there are slight
        differences. For example, ``sox`` command adds certain effects automatically (such as
        ``rate`` effect after ``speed`` and ``pitch`` and other effects), but this function
        only applies the given effects. (Therefore, to actually apply ``speed`` effect, you also
        need to give ``rate`` effect with desired sampling rate.)

    Args:
        tensor (torch.Tensor): Input 2D Tensor.
        sample_rate (int): Sample rate
        effects (List[List[str]]): List of effects.
        channels_first (bool): Indicates if the input Tensor's dimension is
            ``[channels, time]`` or ``[time, channels]``

    Returns:
        Tuple[torch.Tensor, int]: Resulting Tensor and sample rate.
        The resulting Tensor has the same ``dtype`` as the input Tensor, and
        the same channels order. The shape of the Tensor can be different based on the
        effects applied. Sample rate can also be different based on the effects applied.

    Example - Basic usage
        >>>
        >>> # Defines the effects to apply
        >>> effects = [
        ...     ['gain', '-n'],  # normalises to 0dB
        ...     ['pitch', '5'],  # 5 cent pitch shift
        ...     ['rate', '8000'],  # resample to 8000 Hz
        ... ]
        >>>
        >>> # Generate pseudo wave:
        >>> # normalized, channels first, 2ch, sampling rate 16000, 1 second
        >>> sample_rate = 16000
        >>> waveform = 2 * torch.rand([2, sample_rate * 1]) - 1
        >>> waveform.shape
        torch.Size([2, 16000])
        >>> waveform
        tensor([[ 0.3138,  0.7620, -0.9019,  ..., -0.7495, -0.4935,  0.5442],
                [-0.0832,  0.0061,  0.8233,  ..., -0.5176, -0.9140, -0.2434]])
        >>>
        >>> # Apply effects
        >>> waveform, sample_rate = apply_effects_tensor(
        ...     waveform, sample_rate, effects, channels_first=True)
        >>>
        >>> # Check the result
        >>> # The new waveform is sampling rate 8000, 1 second.
        >>> # normalization and channel order are preserved
        >>> waveform.shape
        torch.Size([2, 8000])
        >>> waveform
        tensor([[ 0.5054, -0.5518, -0.4800,  ..., -0.0076,  0.0096, -0.0110],
                [ 0.1331,  0.0436, -0.3783,  ..., -0.0035,  0.0012,  0.0008]])
        >>> sample_rate
        8000

    Example - Torchscript-able transform
        >>>
        >>> # Use `apply_effects_tensor` in `torch.nn.Module` and dump it to file,
        >>> # then run sox effect via Torchscript runtime.
        >>>
        >>> class SoxEffectTransform(torch.nn.Module):
        ...     effects: List[List[str]]
        ...
        ...     def __init__(self, effects: List[List[str]]):
        ...         super().__init__()
        ...         self.effects = effects
        ...
        ...     def forward(self, tensor: torch.Tensor, sample_rate: int):
        ...         return torchaudio.sox_effects.apply_effects_tensor(
        ...             tensor, sample_rate, self.effects)
        ...
        ...
        >>> # Create transform object
        >>> effects = [
        ...     ["lowpass", "-1", "300"],  # apply single-pole lowpass filter
        ...     ["rate", "8000"],  # change sample rate to 8000
        ... ]
        >>> transform = SoxEffectTransform(effects)
        >>>
        >>> # Dump it to file and load
        >>> path = 'sox_effect.zip'
        >>> torch.jit.script(transform).save(path)
        >>> transform = torch.jit.load(path)
        >>>
        >>> # Run transform
        >>> waveform, input_sample_rate = torchaudio.load("input.wav")
        >>> waveform, sample_rate = transform(waveform, input_sample_rate)
        >>> assert sample_rate == 8000
    """
    # Delegates to the operator registered by the compiled torchaudio extension;
    # all arguments are forwarded unchanged.
    return torch.ops.torchaudio.sox_effects_apply_effects_tensor(
        tensor, sample_rate, effects, channels_first)
moto's avatar
moto committed
154
155
156
157
158
159
160
161


@_mod_utils.requires_module('torchaudio._torchaudio')
def apply_effects_file(
        path: str,
        effects: List[List[str]],
        normalize: bool = True,
        channels_first: bool = True,
162
        format: Optional[str] = None,
moto's avatar
moto committed
163
164
165
) -> Tuple[torch.Tensor, int]:
    """Apply sox effects to the audio file and load the resulting data as Tensor

166
167
168
169
170
171
172
173
    Note:
        This function works in the way very similar to ``sox`` command, however there are slight
        differences. For example, ``sox`` commnad adds certain effects automatically (such as
        ``rate`` effect after ``speed``, ``pitch`` etc), but this function only applies the given
        effects. Therefore, to actually apply ``speed`` effect, you also need to give ``rate``
        effect with desired sampling rate, because internally, ``speed`` effects only alter sampling
        rate and leave samples untouched.

moto's avatar
moto committed
174
    Args:
175
176
177
178
179
180
181
182
183
184
        path (path-like object or file-like object):
            Source of audio data. When the function is not compiled by TorchScript,
            (e.g. ``torch.jit.script``), the following types are accepted;
                  * ``path-like``: file path
                  * ``file-like``: Object with ``read(size: int) -> bytes`` method,
                    which returns byte string of at most ``size`` length.
            When the function is compiled by TorchScript, only ``str`` type is allowed.
            Note:
                * This argument is intentionally annotated as ``str`` only for
                  TorchScript compiler compatibility.
moto's avatar
moto committed
185
        effects (List[List[str]]): List of effects.
186
187
188
189
190
        normalize (bool):
            When ``True``, this function always return ``float32``, and sample values are
            normalized to ``[-1.0, 1.0]``.
            If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
            integer type. This argument has no effect for formats other
moto's avatar
moto committed
191
192
193
            than integer WAV type.
        channels_first (bool): When True, the returned Tensor has dimension ``[channel, time]``.
            Otherwise, the returned Tensor's dimension is ``[time, channel]``.
194
195
196
197
        format (str, optional):
            Override the format detection with the given format.
            Providing the argument might help when libsox can not infer the format
            from header or extension,
moto's avatar
moto committed
198
199
200
201
202
203
204
205
206

    Returns:
        Tuple[torch.Tensor, int]: Resulting Tensor and sample rate.
        If ``normalize=True``, the resulting Tensor is always ``float32`` type.
        If ``normalize=False`` and the input audio file is of integer WAV file, then the
        resulting Tensor has corresponding integer type. (Note 24 bit integer type is not supported)
        If ``channels_first=True``, the resulting Tensor has dimension ``[channel, time]``,
        otherwise ``[time, channel]``.

207
208
    Example - Basic usage
        >>>
moto's avatar
moto committed
209
210
211
212
213
214
        >>> # Defines the effects to apply
        >>> effects = [
        ...     ['gain', '-n'],  # normalises to 0dB
        ...     ['pitch', '5'],  # 5 cent pitch shift
        ...     ['rate', '8000'],  # resample to 8000 Hz
        ... ]
215
        >>>
moto's avatar
moto committed
216
217
        >>> # Apply effects and load data with channels_first=True
        >>> waveform, sample_rate = apply_effects_file("data.wav", effects, channels_first=True)
218
219
        >>>
        >>> # Check the result
moto's avatar
moto committed
220
221
222
223
224
225
226
227
228
        >>> waveform.shape
        torch.Size([2, 8000])
        >>> waveform
        tensor([[ 5.1151e-03,  1.8073e-02,  2.2188e-02,  ...,  1.0431e-07,
                 -1.4761e-07,  1.8114e-07],
                [-2.6924e-03,  2.1860e-03,  1.0650e-02,  ...,  6.4122e-07,
                 -5.6159e-07,  4.8103e-07]])
        >>> sample_rate
        8000
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243

    Example - Apply random speed perturbation to dataset
        >>>
        >>> # Load data from file, apply random speed perturbation
        >>> class RandomPerturbationFile(torch.utils.data.Dataset):
        ...     \"\"\"Given flist, apply random speed perturbation
        ...
        ...     Suppose all the input files are at least one second long.
        ...     \"\"\"
        ...     def __init__(self, flist: List[str], sample_rate: int):
        ...         super().__init__()
        ...         self.flist = flist
        ...         self.sample_rate = sample_rate
        ...
        ...     def __getitem__(self, index):
244
        ...         speed = 0.5 + 1.5 * random.randn()
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
        ...         effects = [
        ...             ['gain', '-n', '-10'],  # apply 10 db attenuation
        ...             ['remix', '-'],  # merge all the channels
        ...             ['speed', f'{speed:.5f}'],  # duration is now 0.5 ~ 2.0 seconds.
        ...             ['rate', f'{self.sample_rate}'],
        ...             ['pad', '0', '1.5'],  # add 1.5 seconds silence at the end
        ...             ['trim', '0', '2'],  # get the first 2 seconds
        ...         ]
        ...         waveform, _ = torchaudio.sox_effects.apply_effects_file(
        ...             self.flist[index], effects)
        ...         return waveform
        ...
        ...     def __len__(self):
        ...         return len(self.flist)
        ...
        >>> dataset = RandomPerturbationFile(file_list, sample_rate=8000)
        >>> loader = torch.utils.data.DataLoader(dataset, batch_size=32)
        >>> for batch in loader:
        >>>     pass
moto's avatar
moto committed
264
    """
265
266
267
268
269
    if not torch.jit.is_scripting():
        if hasattr(path, 'read'):
            return torchaudio._torchaudio.apply_effects_fileobj(
                path, effects, normalize, channels_first, format)
        path = os.fspath(path)
270
    return torch.ops.torchaudio.sox_effects_apply_effects_file(
271
        path, effects, normalize, channels_first, format)