Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Torchaudio
Commits
301e2e98
Commit
301e2e98
authored
Sep 11, 2018
by
David Pollack
Committed by
Soumith Chintala
Dec 25, 2018
Browse files
sox effects and documentation
parent
db0da559
Changes
14
Show whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
373 additions
and
196 deletions
+373
-196
docs/source/index.rst
docs/source/index.rst
+2
-0
docs/source/legacy.rst
docs/source/legacy.rst
+7
-0
docs/source/sox_effects.rst
docs/source/sox_effects.rst
+12
-0
setup.py
setup.py
+1
-1
test/test.py
test/test.py
+25
-7
test/test_dataloader.py
test/test_dataloader.py
+4
-4
test/test_sox_effects.py
test/test_sox_effects.py
+148
-11
torchaudio/__init__.py
torchaudio/__init__.py
+63
-54
torchaudio/datasets/vctk.py
torchaudio/datasets/vctk.py
+17
-11
torchaudio/datasets/yesno.py
torchaudio/datasets/yesno.py
+2
-2
torchaudio/legacy.py
torchaudio/legacy.py
+12
-11
torchaudio/sox_effects.py
torchaudio/sox_effects.py
+48
-68
torchaudio/torch_sox.cpp
torchaudio/torch_sox.cpp
+31
-26
torchaudio/torch_sox.h
torchaudio/torch_sox.h
+1
-1
No files found.
docs/source/index.rst
View file @
301e2e98
...
...
@@ -7,8 +7,10 @@ The :mod:`torchaudio` package consists of I/O, popular datasets and common audio
:maxdepth: 2
:caption: Package Reference
sox_effects
datasets
transforms
legacy
.. automodule:: torchaudio
:members:
docs/source/legacy.rst
0 → 100644
View file @
301e2e98
torchaudio.legacy
======================
Legacy loading and save functions.
.. automodule:: torchaudio.legacy
:members:
docs/source/sox_effects.rst
0 → 100644
View file @
301e2e98
torchaudio.sox_effects
======================
Create SoX effects chain for preprocessing audio.
.. currentmodule:: torchaudio.sox_effects
.. autoclass:: SoxEffect
:members:
.. autoclass:: SoxEffectsChain
:members: append_effect_to_chain, sox_build_flow_effects, clear_chain, set_input_file
setup.py
View file @
301e2e98
...
...
@@ -5,7 +5,7 @@ from torch.utils.cpp_extension import BuildExtension, CppExtension
setup
(
name
=
"torchaudio"
,
version
=
"0.
1
"
,
version
=
"0.
2
"
,
description
=
"An audio package for PyTorch"
,
url
=
"https://github.com/pytorch/audio"
,
author
=
"Soumith Chintala, David Pollack, Sean Naren, Peter Goldsborough"
,
...
...
test/test.py
View file @
301e2e98
...
...
@@ -82,11 +82,29 @@ class Test_LoadSave(unittest.TestCase):
self
.
assertEqual
(
sr
,
44100
)
self
.
assertEqual
(
x
.
size
(),
(
2
,
278756
))
# check normalizing
x
,
sr
=
torchaudio
.
load
(
self
.
test_filepath
,
normalization
=
True
)
self
.
assertEqual
(
x
.
dtype
,
torch
.
float32
)
self
.
assertTrue
(
x
.
min
()
>=
-
1.0
)
self
.
assertTrue
(
x
.
max
()
<=
1.0
)
# check no normalizing
x
,
_
=
torchaudio
.
load
(
self
.
test_filepath
,
normalization
=
False
)
self
.
assertTrue
(
x
.
min
()
<=
-
1.0
)
self
.
assertTrue
(
x
.
max
()
>=
1.0
)
# check offset
offset
=
15
x
,
_
=
torchaudio
.
load
(
self
.
test_filepath
)
x_offset
,
_
=
torchaudio
.
load
(
self
.
test_filepath
,
offset
=
offset
)
self
.
assertTrue
(
x
[:,
offset
:].
allclose
(
x_offset
))
# check number of frames
n
=
201
x
,
_
=
torchaudio
.
load
(
self
.
test_filepath
,
num_frames
=
n
)
self
.
assertTrue
(
x
.
size
(),
(
2
,
n
))
# check channels first
x
,
_
=
torchaudio
.
load
(
self
.
test_filepath
,
channels_first
=
False
)
self
.
assertEqual
(
x
.
size
(),
(
278756
,
2
))
# check different input tensor type
x
,
_
=
torchaudio
.
load
(
self
.
test_filepath
,
torch
.
LongTensor
(),
normalization
=
False
)
self
.
assertTrue
(
isinstance
(
x
,
torch
.
LongTensor
))
# check raising errors
with
self
.
assertRaises
(
OSError
):
...
...
@@ -108,8 +126,8 @@ class Test_LoadSave(unittest.TestCase):
os
.
unlink
(
output_path
)
def
test_4_load_partial
(
self
):
num_frames
=
10
0
offset
=
20
0
num_frames
=
10
1
offset
=
20
1
# load entire mono sinewave wav file, load a partial copy and then compare
input_sine_path
=
os
.
path
.
join
(
self
.
test_dirpath
,
'assets'
,
'sinewave.wav'
)
x_sine_full
,
sr_sine
=
torchaudio
.
load
(
input_sine_path
)
...
...
test/test_dataloader.py
View file @
301e2e98
...
...
@@ -16,10 +16,10 @@ class TORCHAUDIODS(Dataset):
self
.
data
=
[
os
.
path
.
join
(
self
.
asset_dirpath
,
fn
)
for
fn
in
os
.
listdir
(
self
.
asset_dirpath
)]
self
.
si
,
self
.
ei
=
torchaudio
.
info
(
os
.
path
.
join
(
self
.
asset_dirpath
,
"sinewave.wav"
))
self
.
si
.
precision
=
16
self
.
E
=
torchaudio
.
sox_effects
.
SoxEffects
()
self
.
E
.
sox_
append_effect_to_chain
(
"rate"
,
[
self
.
si
.
rate
])
# resample to 16000hz
self
.
E
.
sox_
append_effect_to_chain
(
"channels"
,
[
self
.
si
.
channels
])
# mono signal
self
.
E
.
sox_
append_effect_to_chain
(
"trim"
,
[
0
,
1
])
# first
sec
of audio
self
.
E
=
torchaudio
.
sox_effects
.
SoxEffects
Chain
()
self
.
E
.
append_effect_to_chain
(
"rate"
,
[
self
.
si
.
rate
])
# resample to 16000hz
self
.
E
.
append_effect_to_chain
(
"channels"
,
[
self
.
si
.
channels
])
# mono signal
self
.
E
.
append_effect_to_chain
(
"trim"
,
[
0
,
"16000s"
])
# first
16000 samples
of audio
def
__getitem__
(
self
,
index
):
fn
=
self
.
data
[
index
]
...
...
test/test_sox_effects.py
View file @
301e2e98
...
...
@@ -5,31 +5,40 @@ import math
import
os
class
Test_SoxEffects
(
unittest
.
TestCase
):
class
Test_SoxEffects
Chain
(
unittest
.
TestCase
):
test_dirpath
=
os
.
path
.
dirname
(
os
.
path
.
realpath
(
__file__
))
test_filepath
=
os
.
path
.
join
(
test_dirpath
,
"assets"
,
"steam-train-whistle-daniel_simon.mp3"
)
def
test_single_channel
(
self
):
fn_sine
=
os
.
path
.
join
(
self
.
test_dirpath
,
"assets"
,
"sinewave.wav"
)
E
=
torchaudio
.
sox_effects
.
SoxEffectsChain
()
E
.
set_input_file
(
fn_sine
)
E
.
append_effect_to_chain
(
"echos"
,
[
0.8
,
0.7
,
40
,
0.25
,
63
,
0.3
])
x
,
sr
=
E
.
sox_build_flow_effects
()
# check if effects worked
#print(x.size())
def
test_rate_channels
(
self
):
target_rate
=
16000
target_channels
=
1
E
=
torchaudio
.
sox_effects
.
SoxEffects
()
E
=
torchaudio
.
sox_effects
.
SoxEffects
Chain
()
E
.
set_input_file
(
self
.
test_filepath
)
E
.
sox_
append_effect_to_chain
(
"rate"
,
[
target_rate
])
E
.
sox_
append_effect_to_chain
(
"channels"
,
[
target_channels
])
E
.
append_effect_to_chain
(
"rate"
,
[
target_rate
])
E
.
append_effect_to_chain
(
"channels"
,
[
target_channels
])
x
,
sr
=
E
.
sox_build_flow_effects
()
# check if effects worked
self
.
assertEqual
(
sr
,
target_rate
)
self
.
assertEqual
(
x
.
size
(
0
),
target_channels
)
def
test_
other
(
self
):
def
test_
lowpass_speed
(
self
):
speed
=
.
8
si
,
_
=
torchaudio
.
info
(
self
.
test_filepath
)
E
=
torchaudio
.
sox_effects
.
SoxEffects
()
E
=
torchaudio
.
sox_effects
.
SoxEffects
Chain
()
E
.
set_input_file
(
self
.
test_filepath
)
E
.
sox_
append_effect_to_chain
(
"lowpass"
,
100
)
E
.
sox_
append_effect_to_chain
(
"speed"
,
speed
)
E
.
sox_
append_effect_to_chain
(
"rate"
,
si
.
rate
)
E
.
append_effect_to_chain
(
"lowpass"
,
100
)
E
.
append_effect_to_chain
(
"speed"
,
speed
)
E
.
append_effect_to_chain
(
"rate"
,
si
.
rate
)
x
,
sr
=
E
.
sox_build_flow_effects
()
# check if effects worked
self
.
assertEqual
(
x
.
size
(
1
),
int
((
si
.
length
/
si
.
channels
)
/
speed
))
...
...
@@ -43,17 +52,145 @@ class Test_SoxEffects(unittest.TestCase):
ei_out
.
encoding
=
torchaudio
.
get_sox_encoding_t
(
9
)
ei_out
.
bits_per_sample
=
8
si_in
,
ei_in
=
torchaudio
.
info
(
self
.
test_filepath
)
E
=
torchaudio
.
sox_effects
.
SoxEffects
(
out_siginfo
=
si_out
,
out_encinfo
=
ei_out
)
E
=
torchaudio
.
sox_effects
.
SoxEffects
Chain
(
out_siginfo
=
si_out
,
out_encinfo
=
ei_out
)
E
.
set_input_file
(
self
.
test_filepath
)
x
,
sr
=
E
.
sox_build_flow_effects
()
# Note: the sample rate is reported as "changed", but no downsampling occurred
# also the number of channels has not changed. Run rate and channels effects
# to make those changes
# to make those changes. However, the output was encoded into ulaw because the
# number of unique values in the output is less than 256.
self
.
assertLess
(
x
.
unique
().
size
(
0
),
2
**
8
)
self
.
assertEqual
(
x
.
size
(
0
),
si_in
.
channels
)
self
.
assertEqual
(
sr
,
si_out
.
rate
)
self
.
assertEqual
(
x
.
numel
(),
si_in
.
length
)
def
test_band_chorus
(
self
):
si_in
,
ei_in
=
torchaudio
.
info
(
self
.
test_filepath
)
E
=
torchaudio
.
sox_effects
.
SoxEffectsChain
(
out_encinfo
=
ei_in
,
out_siginfo
=
si_in
)
E
.
set_input_file
(
self
.
test_filepath
)
E
.
append_effect_to_chain
(
"band"
,
[
"-n"
,
"10k"
,
"3.5k"
])
E
.
append_effect_to_chain
(
"chorus"
,
[.
5
,
.
7
,
55
,
0.4
,
.
25
,
2
,
'-s'
])
x
,
sr
=
E
.
sox_build_flow_effects
()
#print(x.size(), sr)
def
test_synth
(
self
):
si_in
,
ei_in
=
torchaudio
.
info
(
self
.
test_filepath
)
E
=
torchaudio
.
sox_effects
.
SoxEffectsChain
(
out_encinfo
=
ei_in
,
out_siginfo
=
si_in
)
E
.
set_input_file
(
self
.
test_filepath
)
E
.
append_effect_to_chain
(
"synth"
,
[
"1"
,
"pinknoise"
,
"mix"
])
E
.
append_effect_to_chain
(
"rate"
,
[
44100
])
E
.
append_effect_to_chain
(
"channels"
,
[
2
])
x
,
sr
=
E
.
sox_build_flow_effects
()
#print(x.size(), sr)
def
test_gain
(
self
):
E
=
torchaudio
.
sox_effects
.
SoxEffectsChain
()
E
.
set_input_file
(
self
.
test_filepath
)
E
.
append_effect_to_chain
(
"gain"
,
[
"5"
])
x
,
sr
=
E
.
sox_build_flow_effects
()
E
.
clear_chain
()
self
.
assertTrue
(
x
.
abs
().
max
().
item
(),
1.
)
E
.
set_input_file
(
self
.
test_filepath
)
E
.
append_effect_to_chain
(
"gain"
,
[
"-e"
,
"-5"
])
x
,
sr
=
E
.
sox_build_flow_effects
()
E
.
clear_chain
()
self
.
assertLess
(
x
.
abs
().
max
().
item
(),
1.
)
E
.
set_input_file
(
self
.
test_filepath
)
E
.
append_effect_to_chain
(
"gain"
,
[
"-b"
,
"8"
])
x
,
sr
=
E
.
sox_build_flow_effects
()
E
.
clear_chain
()
self
.
assertTrue
(
x
.
abs
().
max
().
item
(),
1.
)
E
.
set_input_file
(
self
.
test_filepath
)
E
.
append_effect_to_chain
(
"gain"
,
[
"-n"
,
"-10"
])
x
,
sr
=
E
.
sox_build_flow_effects
()
E
.
clear_chain
()
self
.
assertLess
(
x
.
abs
().
max
().
item
(),
1.
)
def
test_tempo
(
self
):
tempo
=
.
8
si
,
_
=
torchaudio
.
info
(
self
.
test_filepath
)
E
=
torchaudio
.
sox_effects
.
SoxEffectsChain
()
E
.
set_input_file
(
self
.
test_filepath
)
E
.
append_effect_to_chain
(
"tempo"
,
[
"-s"
,
tempo
])
x
,
sr
=
E
.
sox_build_flow_effects
()
# check if effect worked
self
.
assertEqual
(
x
.
size
(
1
),
int
((
si
.
length
/
si
.
channels
)
/
tempo
))
def
test_trim
(
self
):
x_orig
,
_
=
torchaudio
.
load
(
self
.
test_filepath
)
offset
=
"10000s"
offset_int
=
int
(
offset
[:
-
1
])
num_frames
=
"200s"
num_frames_int
=
int
(
num_frames
[:
-
1
])
E
=
torchaudio
.
sox_effects
.
SoxEffectsChain
()
E
.
set_input_file
(
self
.
test_filepath
)
E
.
append_effect_to_chain
(
"trim"
,
[
offset
,
num_frames
])
x
,
sr
=
E
.
sox_build_flow_effects
()
# check if effect worked
self
.
assertTrue
(
x
.
allclose
(
x_orig
[:,
offset_int
:(
offset_int
+
num_frames_int
)],
rtol
=
1e-4
,
atol
=
1e-4
))
def
test_silence_contrast
(
self
):
si
,
_
=
torchaudio
.
info
(
self
.
test_filepath
)
E
=
torchaudio
.
sox_effects
.
SoxEffectsChain
()
E
.
set_input_file
(
self
.
test_filepath
)
E
.
append_effect_to_chain
(
"silence"
,
[
1
,
100
,
1
])
E
.
append_effect_to_chain
(
"contrast"
,
[])
x
,
sr
=
E
.
sox_build_flow_effects
()
# check if effect worked
self
.
assertLess
(
x
.
numel
(),
si
.
length
)
def
test_reverse
(
self
):
x_orig
,
_
=
torchaudio
.
load
(
self
.
test_filepath
)
E
=
torchaudio
.
sox_effects
.
SoxEffectsChain
()
E
.
set_input_file
(
self
.
test_filepath
)
E
.
append_effect_to_chain
(
"reverse"
,
""
)
x_rev
,
_
=
E
.
sox_build_flow_effects
()
# check if effect worked
rev_idx
=
torch
.
LongTensor
(
range
(
x_orig
.
size
(
1
))[::
-
1
])
self
.
assertTrue
(
x_orig
.
allclose
(
x_rev
[:,
rev_idx
],
rtol
=
1e-5
,
atol
=
2e-5
))
def
test_compand_fade
(
self
):
E
=
torchaudio
.
sox_effects
.
SoxEffectsChain
()
E
.
set_input_file
(
self
.
test_filepath
)
E
.
append_effect_to_chain
(
"compand"
,
[
"0.3,1"
,
"6:-70,-60,-20"
,
"-5"
,
"-90"
,
"0.2"
])
E
.
append_effect_to_chain
(
"fade"
,
[
"q"
,
"0.25"
,
"0"
,
"0.33"
])
x
,
_
=
E
.
sox_build_flow_effects
()
# check if effect worked
#print(x.size())
def
test_biquad_delay
(
self
):
si
,
_
=
torchaudio
.
info
(
self
.
test_filepath
)
E
=
torchaudio
.
sox_effects
.
SoxEffectsChain
()
E
.
set_input_file
(
self
.
test_filepath
)
E
.
append_effect_to_chain
(
"biquad"
,
[
"0.25136437"
,
"0.50272873"
,
"0.25136437"
,
"1.0"
,
"-0.17123075"
,
"0.17668821"
])
E
.
append_effect_to_chain
(
"delay"
,
[
"15000s"
])
x
,
_
=
E
.
sox_build_flow_effects
()
# check if effect worked
self
.
assertTrue
(
x
.
size
(
1
)
==
(
si
.
length
/
si
.
channels
)
+
15000
)
def
test_invalid_effect_name
(
self
):
E
=
torchaudio
.
sox_effects
.
SoxEffectsChain
()
E
.
set_input_file
(
self
.
test_filepath
)
# there is no effect named "special"
with
self
.
assertRaises
(
LookupError
):
E
.
append_effect_to_chain
(
"special"
,
[
""
])
def
test_unimplemented_effect
(
self
):
E
=
torchaudio
.
sox_effects
.
SoxEffectsChain
()
E
.
set_input_file
(
self
.
test_filepath
)
# the sox spectrogram function is not implemented in torchaudio
with
self
.
assertRaises
(
NotImplementedError
):
E
.
append_effect_to_chain
(
"spectrogram"
,
[
""
])
def
test_invalid_effect_options
(
self
):
E
=
torchaudio
.
sox_effects
.
SoxEffectsChain
()
E
.
set_input_file
(
self
.
test_filepath
)
# first two options should be combined to "0.3,1"
E
.
append_effect_to_chain
(
"compand"
,
[
"0.3"
,
"1"
,
"6:-70,-60,-20"
,
"-5"
,
"-90"
,
"0.2"
])
with
self
.
assertRaises
(
RuntimeError
):
E
.
sox_build_flow_effects
()
if
__name__
==
'__main__'
:
torchaudio
.
initialize_sox
()
unittest
.
main
()
...
...
torchaudio/__init__.py
View file @
301e2e98
from
__future__
import
division
,
print_function
import
os.path
import
torch
import
_torch_sox
from
torchaudio
import
transforms
,
datasets
,
sox_effects
from
torchaudio
import
transforms
,
datasets
,
sox_effects
,
legacy
def
check_input
(
src
):
...
...
@@ -17,7 +18,7 @@ def load(filepath,
out
=
None
,
normalization
=
True
,
channels_first
=
True
,
num_frames
=
-
1
,
num_frames
=
0
,
offset
=
0
,
signalinfo
=
None
,
encodinginfo
=
None
,
...
...
@@ -27,13 +28,13 @@ def load(filepath,
Args:
filepath (string): path to audio file
out (Tensor, optional): an output Tensor to use instead of creating one
normalization (bool, number, or
function
, optional): If boolean `True`, then output is divided by `1 << 31`
(assumes
16-bit depth
audio, and normalizes to `[0, 1]`.
normalization (bool, number, or
callable
, optional): If boolean `True`, then output is divided by `1 << 31`
(assumes
signed 32-bit
audio
)
, and normalizes to `[-1, 1]`.
If `number`, then output is divided by that number
If `
function
`, then the output is passed as a parameter
If `
callable
`, then the output is passed as a parameter
to the given function, then the output is divided by
the result.
num_frames (int, optional): number of frames to load.
-1
to load everything after the offset.
num_frames (int, optional): number of frames to load.
0
to load everything after the offset.
offset (int, optional): number of frames from the start of the file to begin data loading.
signalinfo (sox_signalinfo_t, optional): a sox_signalinfo_t type, which could be helpful if the
audio type cannot be automatically determined
...
...
@@ -42,18 +43,18 @@ def load(filepath,
filetype (str, optional): a filetype or extension to be set if sox cannot determine it automatically
Returns: tuple(Tensor, int)
- Tensor: output Tensor of size `[L x C]` where L is the number of audio frames, C is the number of channels
- Tensor: output Tensor of size
`[C x L]` or
`[L x C]` where L is the number of audio frames, C is the number of channels
- int: the sample-rate of the audio (as listed in the metadata of the file)
Example::
>>> data, sample_rate = torchaudio.load('foo.mp3')
>>> print(data.size())
torch.Size([278756
, 2
])
torch.Size([
2,
278756])
>>> print(sample_rate)
44100
>>> data_vol
ume
_normalized, _ = torchaudio.load('foo.mp3', normalization=lambda x: torch.abs(x).max())
>>> print(data_vol
ume
_normalized.abs().max())
>>> data_vol_normalized, _ = torchaudio.load('foo.mp3', normalization=lambda x: torch.abs(x).max())
>>> print(data_vol_normalized.abs().max())
1.
"""
...
...
@@ -88,6 +89,9 @@ def load(filepath,
def
save
(
filepath
,
src
,
sample_rate
,
precision
=
16
,
channels_first
=
True
):
"""Convenience function for `save_encinfo`.
"""
si
=
sox_signalinfo_t
()
ch_idx
=
0
if
channels_first
else
1
si
.
rate
=
sample_rate
...
...
@@ -97,12 +101,17 @@ def save(filepath, src, sample_rate, precision=16, channels_first=True):
return
save_encinfo
(
filepath
,
src
,
channels_first
,
si
)
def
save_encinfo
(
filepath
,
src
,
channels_first
=
True
,
signalinfo
=
None
,
encodinginfo
=
None
,
filetype
=
None
):
"""Saves a Tensor with audio signal to disk as a standard format like mp3, wav, etc.
def
save_encinfo
(
filepath
,
src
,
channels_first
=
True
,
signalinfo
=
None
,
encodinginfo
=
None
,
filetype
=
None
):
"""Saves a Tensor of an audio signal to disk as a standard format like mp3, wav, etc.
Args:
filepath (string): path to audio file
src (Tensor): an input 2D Tensor of shape `[L x C]` where L is
src (Tensor): an input 2D Tensor of shape
`[C x L]` or
`[L x C]` where L is
the number of audio frames, C is the number of channels
signalinfo (sox_signalinfo_t): a sox_signalinfo_t type, which could be helpful if the
audio type cannot be automatically determined
...
...
@@ -129,10 +138,10 @@ def save_encinfo(filepath, src, channels_first=True, signalinfo=None, encodingin
if
src
.
dim
()
==
1
:
# 1d tensors are assumed to be mono signals
src
.
unsqueeze_
(
ch_idx
)
elif
src
.
dim
()
>
2
or
src
.
size
(
ch_idx
)
>
src
.
size
(
len_idx
)
:
# assumes
num_samples >
num_channels
elif
src
.
dim
()
>
2
or
src
.
size
(
ch_idx
)
>
16
:
# assumes num_channels
< 16
raise
ValueError
(
"Expected format
(L x C),
C <
L
, but found {}"
.
format
(
src
.
size
()))
"Expected format
where
C <
16
, but found {}"
.
format
(
src
.
size
()))
# sox stores the sample rate as a float, though practically sample rates are almost always integers
# convert integers to floats
if
not
isinstance
(
signalinfo
.
rate
,
float
):
...
...
@@ -178,29 +187,8 @@ def info(filepath):
return
_torch_sox
.
get_info
(
filepath
)
def
effect_names
():
"""Gets list of valid sox effect names
Returns: list[str]
Example::
>>> EFFECT_NAMES = torchaudio.effect_names()
"""
return
_torch_sox
.
get_effect_names
()
def
SoxEffect
():
"""Create a object to hold sox effect and options to pass between python and c++
Returns: SoxEffects(object)
- ename (str), name of effect
- eopts (list[str]), list of effect options
"""
return
_torch_sox
.
SoxEffect
()
def
sox_signalinfo_t
():
"""Create a sox_signalinfo_t object.
This object can be used to set the sample
r
"""Create a sox_signalinfo_t object. This object can be used to set the sample
rate, number of channels, length, bit precision and headroom multiplier
primarily for effects
...
...
@@ -210,6 +198,13 @@ def sox_signalinfo_t():
- precision (int), bit precision
- length (int), length of audio, 0 for unspecified and -1 for unknown
- mult (float, optional), headroom multiplier for effects and None for no multiplier
Example::
>>> si = torchaudio.sox_signalinfo_t()
>>> si.channels = 1
>>> si.rate = 16000.
>>> si.precision = 16
>>> si.length = 0
"""
return
_torch_sox
.
sox_signalinfo_t
()
...
...
@@ -231,6 +226,17 @@ def sox_encodinginfo_t():
- reverse_nibbles (sox_option_t), reverse nibbles, use sox_option_default
- reverse_bits (sox_option_t), reverse bytes, use sox_option_default
- opposite_endian (sox_bool), change endianness, use sox_false
Example::
>>> ei = torchaudio.sox_encodinginfo_t()
>>> ei.encoding = torchaudio.get_sox_encoding_t(1)
>>> ei.bits_per_sample = 16
>>> ei.compression = 0
>>> ei.reverse_bytes = torchaudio.get_sox_option_t(2)
>>> ei.reverse_nibbles = torchaudio.get_sox_option_t(2)
>>> ei.reverse_bits = torchaudio.get_sox_option_t(2)
>>> ei.opposite_endian = torchaudio.get_sox_bool(0)
"""
ei
=
_torch_sox
.
sox_encodinginfo_t
()
sdo
=
get_sox_option_t
(
2
)
# sox_default_option
...
...
@@ -245,7 +251,7 @@ def get_sox_encoding_t(i=None):
Args:
i (int, optional): choose type or get a dict with all possible options
use
.
__members__ to see all options when not specified
use
`
__members__
`
to see all options when not specified
Returns:
sox_encoding_t: a sox_encoding_t type for output encoding
"""
...
...
@@ -261,7 +267,7 @@ def get_sox_option_t(i=2):
Args:
i (int, optional): choose type or get a dict with all possible options
use
.
__members__ to see all options when not specified.
use
`
__members__
`
to see all options when not specified.
Defaults to sox_option_default.
Returns:
sox_option_t: a sox_option_t type
...
...
@@ -277,7 +283,7 @@ def get_sox_bool(i=0):
Args:
i (int, optional): choose type or get a dict with all possible options
use
.
__members__ to see all options when not specified.
use
`
__members__
`
to see all options when not specified.
Defaults to sox_false.
Returns:
sox_bool: a sox_bool type
...
...
@@ -289,22 +295,25 @@ def get_sox_bool(i=0):
def
initialize_sox
():
"""Initialize sox for effects chain.
N
ot required for simple
loading. Importantly,
only initialize
this
once and do not shutdown
until you have done effect chain
calls even when loading multiple file
s.
"""Initialize sox for
use with
effects chain
s
.
This is n
ot required for simple
loading. Importantly,
only
run `
initialize
_sox`
once and do not shutdown
after each effect chain, but rather once you are finished with all effects chain
s.
"""
return
_torch_sox
.
initialize_sox
()
def
shutdown_sox
():
"""Showdown sox for effects chain. Not required for simple loading. Importantly,
only call once. Attempting to re-initialize sox will result seg faults.
only call once. Attempting to re-initialize sox will result
in
seg faults.
"""
return
_torch_sox
.
shutdown_sox
()
def
_audio_normalization
(
signal
,
normalization
):
# assumes signed 32-bit depth, which is what sox uses internally
"""Audio normalization of a tensor in-place. The normalization can be a bool,
a number, or a callable that takes the audio tensor as an input. SoX uses
32-bit signed integers internally, thus bool normalizes based on that assumption.
"""
if
not
normalization
:
return
...
...
torchaudio/datasets/vctk.py
View file @
301e2e98
...
...
@@ -35,13 +35,18 @@ def make_manifest(dir):
def
read_audio
(
fp
,
downsample
=
True
):
sig
,
sr
=
torchaudio
.
load
(
fp
)
if
downsample
:
# 48khz -> 16 khz
if
sig
.
size
(
0
)
%
3
==
0
:
sig
=
sig
[::
3
].
contiguous
()
E
=
torchaudio
.
sox_effects
.
SoxEffects
()
E
.
set_input_file
(
fp
)
E
.
sox_append_effect_to_chain
(
"gain"
,
[
"-h"
])
E
.
sox_append_effect_to_chain
(
"channels"
,
[
1
])
E
.
sox_append_effect_to_chain
(
"rate"
,
[
16000
])
E
.
sox_append_effect_to_chain
(
"gain"
,
[
"-rh"
])
E
.
sox_append_effect_to_chain
(
"dither"
,
[
"-s"
])
sig
,
sr
=
E
.
sox_build_flow_effects
()
else
:
sig
=
sig
[:
-
(
sig
.
size
(
0
)
%
3
):
3
].
contiguous
()
sig
,
sr
=
torchaudio
.
load
(
fp
)
sig
=
sig
.
contiguous
()
return
sig
,
sr
...
...
@@ -168,8 +173,8 @@ class VCTK(data.Dataset):
# download files
try
:
os
.
makedirs
(
os
.
path
.
join
(
self
.
root
,
self
.
raw_folder
))
os
.
makedirs
(
os
.
path
.
join
(
self
.
root
,
self
.
processed_folder
))
os
.
makedirs
(
os
.
path
.
join
(
self
.
root
,
self
.
raw_folder
))
except
OSError
as
e
:
if
e
.
errno
==
errno
.
EEXIST
:
pass
...
...
@@ -191,6 +196,7 @@ class VCTK(data.Dataset):
os
.
unlink
(
file_path
)
# process and save as torch files
torchaudio
.
initialize_sox
()
print
(
'Processing...'
)
shutil
.
copyfile
(
os
.
path
.
join
(
dset_abs_path
,
"COPYING"
),
...
...
@@ -213,10 +219,10 @@ class VCTK(data.Dataset):
f_rel_no_ext
=
os
.
path
.
basename
(
f
).
rsplit
(
"."
,
1
)[
0
]
sig
=
read_audio
(
f
,
downsample
=
self
.
downsample
)[
0
]
tensors
.
append
(
sig
)
lengths
.
append
(
sig
.
size
(
0
))
lengths
.
append
(
sig
.
size
(
1
))
labels
.
append
(
utterences
[
f_rel_no_ext
])
self
.
max_len
=
sig
.
size
(
0
)
if
sig
.
size
(
0
)
>
self
.
max_len
else
self
.
max_len
self
.
max_len
=
sig
.
size
(
1
)
if
sig
.
size
(
1
)
>
self
.
max_len
else
self
.
max_len
# sort sigs/labels: longest -> shortest
tensors
,
labels
=
zip
(
*
[(
b
,
c
)
for
(
a
,
b
,
c
)
in
sorted
(
zip
(
lengths
,
tensors
,
labels
),
key
=
lambda
x
:
x
[
0
],
reverse
=
True
)])
...
...
@@ -232,5 +238,5 @@ class VCTK(data.Dataset):
self
.
_write_info
((
n
*
self
.
chunk_size
)
+
i
+
1
)
if
not
self
.
dev_mode
:
shutil
.
rmtree
(
raw_abs_dir
,
ignore_errors
=
True
)
torchaudio
.
shutdown_sox
()
print
(
'Done!'
)
torchaudio/datasets/yesno.py
View file @
301e2e98
...
...
@@ -128,12 +128,12 @@ class YESNO(data.Dataset):
full_path
=
os
.
path
.
join
(
dset_abs_path
,
f
)
sig
,
sr
=
torchaudio
.
load
(
full_path
)
tensors
.
append
(
sig
)
lengths
.
append
(
sig
.
size
(
0
))
lengths
.
append
(
sig
.
size
(
1
))
labels
.
append
(
os
.
path
.
basename
(
f
).
split
(
"."
,
1
)[
0
].
split
(
"_"
))
# sort sigs/labels: longest -> shortest
tensors
,
labels
=
zip
(
*
[(
b
,
c
)
for
(
a
,
b
,
c
)
in
sorted
(
zip
(
lengths
,
tensors
,
labels
),
key
=
lambda
x
:
x
[
0
],
reverse
=
True
)])
self
.
max_len
=
tensors
[
0
].
size
(
0
)
self
.
max_len
=
tensors
[
0
].
size
(
1
)
torch
.
save
(
(
tensors
,
labels
),
os
.
path
.
join
(
...
...
torchaudio/legacy.py
View file @
301e2e98
from
__future__
import
division
,
print_function
import
os.path
import
torch
import
_torch_sox
from
torchaudio
import
save
as
save_new
,
load
as
load_new
import
torchaudio
def
load
(
filepath
,
out
=
None
,
normalization
=
None
,
num_frames
=
-
1
,
offset
=
0
):
def
load
(
filepath
,
out
=
None
,
normalization
=
None
,
num_frames
=
0
,
offset
=
0
):
"""Loads an audio file from disk into a Tensor. The default options have
changed as of torchaudio 0.2 and this function maintains option defaults
from version 0.1.
...
...
@@ -26,14 +27,14 @@ def load(filepath, out=None, normalization=None, num_frames=-1, offset=0):
Example::
>>> data, sample_rate = torchaudio.load('foo.mp3')
>>> data, sample_rate = torchaudio.
legacy.
load('foo.mp3')
>>> print(data.size())
torch.Size([278756, 2])
>>> print(sample_rate)
44100
"""
return
load
_new
(
filepath
,
out
,
normalization
,
False
,
num_frames
,
offset
)
return
torchaudio
.
load
(
filepath
,
out
,
normalization
,
False
,
num_frames
,
offset
)
def
save
(
filepath
,
src
,
sample_rate
,
precision
=
32
):
...
...
@@ -50,8 +51,8 @@ def save(filepath, src, sample_rate, precision=32):
Example::
>>> data, sample_rate = torchaudio.load('foo.mp3')
>>> torchaudio.save('foo.wav', data, sample_rate)
>>> data, sample_rate = torchaudio.
legacy.
load('foo.mp3')
>>> torchaudio.
legacy.
save('foo.wav', data, sample_rate)
"""
save
_new
(
filepath
,
src
,
sample_rate
,
precision
,
False
)
torchaudio
.
save
(
filepath
,
src
,
sample_rate
,
precision
,
False
)
torchaudio/sox_effects.py
View file @
301e2e98
from
__future__
import
division
,
print_function
import
torch
import
_torch_sox
import
torchaudio
EFFECT_NAMES
=
set
(
_torch_sox
.
get_effect_names
())
"""
Notes:
sox_signalinfo_t {
sox_rate_t rate; /**< samples per second, 0 if unknown */
unsigned channels; /**< number of sound channels, 0 if unknown */
unsigned precision; /**< bits per sample, 0 if unknown */
sox_uint64_t length; /**< samples * chans in file, 0 if unspecified, -1 if unknown */
double * mult; /**< Effects headroom multiplier; may be null */
}
typedef struct sox_encodinginfo_t {
sox_encoding_t encoding; /**< format of sample numbers */
unsigned bits_per_sample; /**< 0 if unknown or variable; uncompressed value if lossless; compressed value if lossy */
double compression; /**< compression factor (where applicable) */
sox_option_t reverse_bytes; /** use sox_option_default */
sox_option_t reverse_nibbles; /** use sox_option_default */
sox_option_t reverse_bits; /** use sox_option_default */
sox_bool opposite_endian; /** use sox_false */
}
sox_encodings_t = {
"SOX_ENCODING_UNKNOWN",
"SOX_ENCODING_SIGN2",
"SOX_ENCODING_UNSIGNED",
"SOX_ENCODING_FLOAT",
"SOX_ENCODING_FLOAT_TEXT",
"SOX_ENCODING_FLAC",
"SOX_ENCODING_HCOM",
"SOX_ENCODING_WAVPACK",
"SOX_ENCODING_WAVPACKF",
"SOX_ENCODING_ULAW",
"SOX_ENCODING_ALAW",
"SOX_ENCODING_G721",
"SOX_ENCODING_G723",
"SOX_ENCODING_CL_ADPCM",
"SOX_ENCODING_CL_ADPCM16",
"SOX_ENCODING_MS_ADPCM",
"SOX_ENCODING_IMA_ADPCM",
"SOX_ENCODING_OKI_ADPCM",
"SOX_ENCODING_DPCM",
"SOX_ENCODING_DWVW",
"SOX_ENCODING_DWVWN",
"SOX_ENCODING_GSM",
"SOX_ENCODING_MP3",
"SOX_ENCODING_VORBIS",
"SOX_ENCODING_AMR_WB",
"SOX_ENCODING_AMR_NB",
"SOX_ENCODING_CVSD",
"SOX_ENCODING_LPC10",
"SOX_ENCODING_OPUS",
"SOX_ENCODINGS"
}
"""
class
SoxEffects
(
object
):
def
effect_names
():
"""Gets list of valid sox effect names
Returns: list[str]
Example::
>>> EFFECT_NAMES = torchaudio.sox_effects.effect_names()
"""
return
_torch_sox
.
get_effect_names
()
def
SoxEffect
():
"""Create an object for passing sox effect information between python and c++
Returns: SoxEffect(object)
- ename (str), name of effect
- eopts (list[str]), list of effect options
"""
return
_torch_sox
.
SoxEffect
()
class
SoxEffectsChain
(
object
):
"""SoX effects chain class.
"""
EFFECTS_AVAILABLE
=
set
(
effect_names
())
EFFECTS_UNIMPLEMENTED
=
set
([
"spectrogram"
,
"splice"
,
"noiseprof"
,
"fir"
])
def
__init__
(
self
,
normalization
=
True
,
channels_first
=
True
,
out_siginfo
=
None
,
out_encinfo
=
None
,
filetype
=
"raw"
):
self
.
input_file
=
None
...
...
@@ -73,15 +43,12 @@ class SoxEffects(object):
self
.
normalization
=
normalization
self
.
channels_first
=
channels_first
def
sox_check_effect
(
self
,
e
):
if
e
.
lower
()
not
in
EFFECT_NAMES
:
raise
LookupError
(
"Effect name, {}, not valid"
.
format
(
e
.
lower
()))
return
e
.
lower
()
def
sox_append_effect_to_chain
(
self
,
ename
,
eargs
=
None
):
e
=
torchaudio
.
SoxEffect
()
def
append_effect_to_chain
(
self
,
ename
,
eargs
=
None
):
"""Append effect to a sox effects chain.
"""
e
=
SoxEffect
()
# check if we have a valid effect
ename
=
self
.
sox
_check_effect
(
ename
)
ename
=
self
.
_check_effect
(
ename
)
if
eargs
is
None
or
eargs
==
[]:
eargs
=
[
""
]
elif
not
isinstance
(
eargs
,
list
):
...
...
@@ -96,13 +63,15 @@ class SoxEffects(object):
self
.
chain
.
append
(
e
)
def
sox_build_flow_effects
(
self
,
out
=
None
):
"""Build effects chain and flow effects from input file to output tensor
"""
# initialize output tensor
if
out
is
not
None
:
torchaudio
.
check_input
(
out
)
else
:
out
=
torch
.
FloatTensor
()
if
not
len
(
self
.
chain
):
e
=
torchaudio
.
SoxEffect
()
e
=
SoxEffect
()
e
.
ename
=
"no_effects"
e
.
eopts
=
[
""
]
self
.
chain
.
append
(
e
)
...
...
@@ -122,11 +91,22 @@ class SoxEffects(object):
return
out
,
sr
def
clear_chain
(
self
):
"""Clear effects chain in python
"""
self
.
chain
=
[]
def
set_input_file
(
self
,
input_file
):
"""Set input file for input of chain
"""
self
.
input_file
=
input_file
def
_check_effect
(
self
,
e
):
if
e
.
lower
()
in
self
.
EFFECTS_UNIMPLEMENTED
:
raise
NotImplementedError
(
"This effect ({}) is not implement in torchaudio"
.
format
(
e
))
elif
e
.
lower
()
not
in
self
.
EFFECTS_AVAILABLE
:
raise
LookupError
(
"Effect name, {}, not valid"
.
format
(
e
.
lower
()))
return
e
.
lower
()
# https://stackoverflow.com/questions/12472338/flattening-a-list-recursively
# convenience function to flatten list recursively
def
_flatten
(
self
,
x
):
...
...
torchaudio/torch_sox.cpp
View file @
301e2e98
...
...
@@ -109,18 +109,21 @@ int read_audio_file(
sox_encodinginfo_t
*
ei
,
const
char
*
ft
)
{
SoxDescriptor
fd
(
sox_open_read
(
file_name
.
c_str
(),
/*signal=*/
si
,
/*encoding=*/
ei
,
/*filetype=*/
ft
));
SoxDescriptor
fd
(
sox_open_read
(
file_name
.
c_str
(),
si
,
ei
,
ft
));
if
(
fd
.
get
()
==
nullptr
)
{
throw
std
::
runtime_error
(
"Error opening audio file"
);
}
// signal info
const
int
number_of_channels
=
fd
->
signal
.
channels
;
const
int
sample_rate
=
fd
->
signal
.
rate
;
const
int64_t
total_length
=
fd
->
signal
.
length
;
// multiply offset and number of frames by number of channels
offset
*=
number_of_channels
;
nframes
*=
number_of_channels
;
if
(
total_length
==
0
)
{
throw
std
::
runtime_error
(
"Error reading audio file: unknown length"
);
}
...
...
@@ -133,14 +136,10 @@ int read_audio_file(
if
(
offset
>
0
)
{
buffer_length
-=
offset
;
}
if
(
nframes
!=
-
1
&&
buffer_length
>
nframes
)
{
if
(
nframes
>
0
&&
buffer_length
>
nframes
)
{
buffer_length
=
nframes
;
}
// buffer length and offset need to be multipled by the number of channels
buffer_length
*=
number_of_channels
;
offset
*=
number_of_channels
;
// seek to offset point before reading data
if
(
sox_seek
(
fd
.
get
(),
offset
,
0
)
==
SOX_EOF
)
{
throw
std
::
runtime_error
(
"sox_seek reached EOF, try reducing offset or num_samples"
);
...
...
@@ -149,6 +148,7 @@ int read_audio_file(
// read data and fill output tensor
read_audio
(
fd
,
output
,
buffer_length
);
// L x C -> C x L, if desired
if
(
ch_first
)
{
output
.
transpose_
(
1
,
0
);
}
...
...
@@ -167,7 +167,6 @@ void write_audio_file(
"Error writing audio file: input tensor must be contiguous"
);
}
// remove ?
#if SOX_LIB_VERSION_CODE >= 918272 // >= 14.3.0
si
->
mult
=
nullptr
;
#endif
...
...
@@ -248,20 +247,12 @@ int build_flow_effects(const std::string& file_name,
target_encoding
->
opposite_endian
=
sox_false
;
// Reverse endianness
}
// set target precision / bits_per_sample if it's still 0
//if (target_signal->precision == 0)
// target_signal->precision = input->signal.precision;
//if (target_encoding->bits_per_sample == 0)
// target_encoding->bits_per_sample = input->signal.precision;
// check for rate or channels effect and change the output signalinfo accordingly
for
(
SoxEffect
se
:
pyeffs
)
{
if
(
se
.
ename
==
"rate"
)
{
target_signal
->
rate
=
std
::
stod
(
se
.
eopts
[
0
]);
//se.eopts[0] = "";
}
else
if
(
se
.
ename
==
"channels"
)
{
target_signal
->
channels
=
std
::
stoi
(
se
.
eopts
[
0
]);
//se.eopts[0] = "";
}
}
...
...
@@ -271,7 +262,6 @@ int build_flow_effects(const std::string& file_name,
// create buffer and buffer_size for output in memwrite
char
*
buffer
;
size_t
buffer_size
;
//const char* otype = (file_type.empty()) ? (const char*) "raw" : file_type.c_str();
#ifdef __APPLE__
// According to Mozilla Deepspeech sox_open_memstream_write doesn't work
// with OSX
...
...
@@ -287,7 +277,9 @@ int build_flow_effects(const std::string& file_name,
target_encoding
,
file_type
,
nullptr
);
#endif
assert
(
output
);
if
(
output
==
nullptr
)
{
throw
std
::
runtime_error
(
"Error opening output memstream/temporary file"
);
}
// Setup the effects chain to decode/resample
sox_effects_chain_t
*
chain
=
sox_create_effects_chain
(
&
input
->
encoding
,
&
output
->
encoding
);
...
...
@@ -307,11 +299,12 @@ int build_flow_effects(const std::string& file_name,
}
else
{
int
num_opts
=
tae
.
eopts
.
size
();
char
*
sox_args
[
max_num_eopts
];
//for(std::string s : tae.eopts) {
for
(
std
::
vector
<
std
::
string
>::
size_type
i
=
0
;
i
!=
tae
.
eopts
.
size
();
i
++
)
{
sox_args
[
i
]
=
(
char
*
)
tae
.
eopts
[
i
].
c_str
();
}
sox_effect_options
(
e
,
num_opts
,
sox_args
);
if
(
sox_effect_options
(
e
,
num_opts
,
sox_args
)
!=
SOX_SUCCESS
)
{
throw
std
::
runtime_error
(
"invalid effect options, see SoX docs for details"
);
}
}
sox_add_effect
(
chain
,
e
,
&
interm_signal
,
&
input
->
signal
);
free
(
e
);
...
...
@@ -331,9 +324,21 @@ int build_flow_effects(const std::string& file_name,
sox_close
(
output
);
sox_close
(
input
);
// Resize output tensor to desired dimensions
int
nc
=
interm_signal
.
channels
;
int
ns
=
interm_signal
.
length
;
// Resize output tensor to desired dimensions, different effects result in output->signal.length,
// interm_signal.length and buffer size being inconsistent with the result of the file output.
// We prioritize in the order: output->signal.length > interm_signal.length > buffer_size
int
nc
,
ns
;
if
(
output
->
signal
.
length
==
0
)
{
if
(
interm_signal
.
length
>
(
buffer_size
*
10
))
{
ns
=
buffer_size
/
2
;
}
else
{
ns
=
interm_signal
.
length
;
}
nc
=
interm_signal
.
channels
;
}
else
{
nc
=
output
->
signal
.
channels
;
ns
=
output
->
signal
.
length
;
}
otensor
.
resize_
({
ns
/
nc
,
nc
});
otensor
=
otensor
.
contiguous
();
...
...
torchaudio/torch_sox.h
View file @
301e2e98
...
...
@@ -27,7 +27,6 @@ int read_audio_file(
void
write_audio_file
(
const
std
::
string
&
file_name
,
at
::
Tensor
tensor
,
bool
ch_first
,
sox_signalinfo_t
*
si
,
sox_encodinginfo_t
*
ei
,
const
char
*
extension
)
...
...
@@ -55,6 +54,7 @@ int shutdown_sox();
/// and the sample rate of the output tensor.
int
build_flow_effects
(
const
std
::
string
&
file_name
,
at
::
Tensor
otensor
,
bool
ch_first
,
sox_signalinfo_t
*
target_signal
,
sox_encodinginfo_t
*
target_encoding
,
const
char
*
file_type
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment