Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Torchaudio
Commits
9538c65f
Commit
9538c65f
authored
Sep 03, 2017
by
Soumith Chintala
Committed by
GitHub
Sep 03, 2017
Browse files
Merge pull request #15 from dhpollack/MEL
add MEL spectrograms transform and fixed a few tests.
parents
697f4621
5bbc2ee2
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
110 additions
and
34 deletions
+110
-34
test/test.py
test/test.py
+14
-28
test/test_transforms.py
test/test_transforms.py
+20
-6
torchaudio/transforms.py
torchaudio/transforms.py
+76
-0
No files found.
test/test.py
View file @
9538c65f
import
unittest
import
unittest
import
torch
import
torchaudio
import
torchaudio
import
math
import
math
import
os
import
os
class
Test_LoadSave
(
unittest
.
TestCase
):
class
Test_LoadSave
(
unittest
.
TestCase
):
test_dirpath
=
os
.
path
.
dirname
(
os
.
path
.
realpath
(
__file__
))
test_dirpath
=
os
.
path
.
dirname
(
os
.
path
.
realpath
(
__file__
))
test_filepath
=
os
.
path
.
join
(
test_dirpath
,
"steam-train-whistle-daniel_simon.mp3"
)
test_filepath
=
os
.
path
.
join
(
test_dirpath
,
"assets"
,
"steam-train-whistle-daniel_simon.mp3"
)
def
test_load
(
self
):
def
test_load
(
self
):
# check normal loading
# check normal loading
x
,
sr
=
torchaudio
.
load
(
self
.
test_filepath
)
x
,
sr
=
torchaudio
.
load
(
self
.
test_filepath
)
...
@@ -76,33 +77,18 @@ class Test_LoadSave(unittest.TestCase):
...
@@ -76,33 +77,18 @@ class Test_LoadSave(unittest.TestCase):
new_filepath
=
os
.
path
.
join
(
self
.
test_dirpath
,
"no-path"
,
"test.wav"
)
new_filepath
=
os
.
path
.
join
(
self
.
test_dirpath
,
"no-path"
,
"test.wav"
)
torchaudio
.
save
(
new_filepath
,
x
,
sr
)
torchaudio
.
save
(
new_filepath
,
x
,
sr
)
steam_train
=
"assets/steam-train-whistle-daniel_simon.mp3"
# save created file
sinewave_filepath
=
os
.
path
.
join
(
self
.
test_dirpath
,
"assets"
,
"sinewave.wav"
)
x
,
sample_rate
=
torchaudio
.
load
(
steam_train
)
sr
=
16000
print
(
sample_rate
)
freq
=
440
print
(
x
.
size
())
volume
=
0.3
print
(
x
[
10000
])
print
(
x
.
min
(),
x
.
max
())
y
=
(
torch
.
cos
(
2
*
math
.
pi
*
torch
.
arange
(
0
,
4
*
sr
)
*
freq
/
sr
)).
float
()
print
(
x
.
mean
(),
x
.
std
())
y
.
unsqueeze_
(
1
)
# y is between -1 and 1, so must scale
x
,
sample_rate
=
torchaudio
.
load
(
steam_train
,
y
=
(
y
*
volume
*
2
**
31
).
long
()
out
=
torch
.
LongTensor
())
torchaudio
.
save
(
sinewave_filepath
,
y
,
sr
)
print
(
sample_rate
)
self
.
assertTrue
(
os
.
path
.
isfile
(
sinewave_filepath
))
print
(
x
.
size
())
print
(
x
[
10000
])
print
(
x
.
min
(),
x
.
max
())
sine_wave
=
"assets/sinewave.wav"
sr
=
16000
freq
=
440
volume
=
0.3
y
=
(
torch
.
cos
(
2
*
math
.
pi
*
torch
.
arange
(
0
,
4
*
sr
)
*
freq
/
sr
)).
float
()
y
.
unsqueeze_
(
1
)
# y is between -1 and 1, so must scale
y
=
(
y
*
volume
*
2
**
31
).
long
()
torchaudio
.
save
(
sine_wave
,
y
,
sr
)
print
(
y
.
min
(),
y
.
max
())
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
unittest
.
main
()
unittest
.
main
()
test/test_transforms.py
View file @
9538c65f
...
@@ -4,8 +4,6 @@ import torchaudio.transforms as transforms
...
@@ -4,8 +4,6 @@ import torchaudio.transforms as transforms
import
numpy
as
np
import
numpy
as
np
import
unittest
import
unittest
STEAM_TRAIN
=
"assets/steam-train-whistle-daniel_simon.mp3"
class
Tester
(
unittest
.
TestCase
):
class
Tester
(
unittest
.
TestCase
):
sr
=
16000
sr
=
16000
...
@@ -20,13 +18,13 @@ class Tester(unittest.TestCase):
...
@@ -20,13 +18,13 @@ class Tester(unittest.TestCase):
audio_orig
=
self
.
sig
.
clone
()
audio_orig
=
self
.
sig
.
clone
()
result
=
transforms
.
Scale
()(
audio_orig
)
result
=
transforms
.
Scale
()(
audio_orig
)
self
.
assertTrue
(
result
.
min
()
>=
-
1.
and
result
.
max
()
<=
1.
,
self
.
assertTrue
(
result
.
min
()
>=
-
1.
and
result
.
max
()
<=
1.
,
"min: {}, max: {}"
.
format
(
result
.
min
(),
result
.
max
()))
print
(
"min: {}, max: {}"
.
format
(
result
.
min
(),
result
.
max
()))
)
maxminmax
=
np
.
abs
([
audio_orig
.
min
(),
audio_orig
.
max
()]).
max
().
astype
(
np
.
float
)
maxminmax
=
np
.
abs
([
audio_orig
.
min
(),
audio_orig
.
max
()]).
max
().
astype
(
np
.
float
)
result
=
transforms
.
Scale
(
factor
=
maxminmax
)(
audio_orig
)
result
=
transforms
.
Scale
(
factor
=
maxminmax
)(
audio_orig
)
self
.
assertTrue
((
result
.
min
()
==
-
1.
or
result
.
max
()
==
1.
)
and
self
.
assertTrue
((
result
.
min
()
==
-
1.
or
result
.
max
()
==
1.
)
and
result
.
min
()
>=
-
1.
and
result
.
max
()
<=
1.
,
result
.
min
()
>=
-
1.
and
result
.
max
()
<=
1.
,
"min: {}, max: {}"
.
format
(
result
.
min
(),
result
.
max
()))
print
(
"min: {}, max: {}"
.
format
(
result
.
min
(),
result
.
max
()))
)
def
test_pad_trim
(
self
):
def
test_pad_trim
(
self
):
...
@@ -37,7 +35,7 @@ class Tester(unittest.TestCase):
...
@@ -37,7 +35,7 @@ class Tester(unittest.TestCase):
result
=
transforms
.
PadTrim
(
max_len
=
length_new
)(
audio_orig
)
result
=
transforms
.
PadTrim
(
max_len
=
length_new
)(
audio_orig
)
self
.
assertTrue
(
result
.
size
(
0
)
==
length_new
,
self
.
assertTrue
(
result
.
size
(
0
)
==
length_new
,
"old size: {}, new size: {}"
.
format
(
audio_orig
.
size
(
0
),
result
.
size
(
0
)))
print
(
"old size: {}, new size: {}"
.
format
(
audio_orig
.
size
(
0
),
result
.
size
(
0
)))
)
audio_orig
=
self
.
sig
.
clone
()
audio_orig
=
self
.
sig
.
clone
()
length_orig
=
audio_orig
.
size
(
0
)
length_orig
=
audio_orig
.
size
(
0
)
...
@@ -46,7 +44,7 @@ class Tester(unittest.TestCase):
...
@@ -46,7 +44,7 @@ class Tester(unittest.TestCase):
result
=
transforms
.
PadTrim
(
max_len
=
length_new
)(
audio_orig
)
result
=
transforms
.
PadTrim
(
max_len
=
length_new
)(
audio_orig
)
self
.
assertTrue
(
result
.
size
(
0
)
==
length_new
,
self
.
assertTrue
(
result
.
size
(
0
)
==
length_new
,
"old size: {}, new size: {}"
.
format
(
audio_orig
.
size
(
0
),
result
.
size
(
0
)))
print
(
"old size: {}, new size: {}"
.
format
(
audio_orig
.
size
(
0
),
result
.
size
(
0
)))
)
def
test_downmix_mono
(
self
):
def
test_downmix_mono
(
self
):
...
@@ -64,6 +62,22 @@ class Tester(unittest.TestCase):
...
@@ -64,6 +62,22 @@ class Tester(unittest.TestCase):
self
.
assertTrue
(
result
.
size
(
1
)
==
1
)
self
.
assertTrue
(
result
.
size
(
1
)
==
1
)
def test_lc2cl(self):
    """LC2CL must return a tensor whose shape is the input shape reversed."""
    # NOTE(review): self.sig is defined outside this view; presumably a 2-d
    # (samples x channels) tensor - confirm against the fixture.
    audio = self.sig.clone()
    result = transforms.LC2CL()(audio)
    # assertEqual prints both shapes on failure, whereas assertTrue(a == b)
    # would only report "False is not true".
    self.assertEqual(tuple(result.size())[::-1], tuple(audio.size()))
def test_mel(self):
    """MEL must turn 2-d audio into a 3-d spectrogram; BLC2CBL keeps it 3-d."""
    # transforms.MEL returns its input unchanged when librosa is missing, so
    # the 3-d assertions below would fail for a reason unrelated to the code
    # under test. Skip instead of reporting a spurious failure.
    if transforms.librosa is None:
        self.skipTest("librosa not installed, cannot create spectrograms")

    audio = self.sig.clone()
    audio = transforms.Scale()(audio)
    self.assertEqual(len(audio.size()), 2)

    result = transforms.MEL()(audio)
    self.assertEqual(len(result.size()), 3)

    result = transforms.BLC2CBL()(result)
    self.assertEqual(len(result.size()), 3)
    # MEL emits (n_mels, hops, channels) and BLC2CBL permutes with (2, 0, 1),
    # so the channel axis of the input must now come first.
    self.assertEqual(result.size(0), audio.size(1))
def
test_compose
(
self
):
def
test_compose
(
self
):
audio_orig
=
self
.
sig
.
clone
()
audio_orig
=
self
.
sig
.
clone
()
...
...
torchaudio/transforms.py
View file @
9538c65f
from
__future__
import
division
from
__future__
import
division
import
torch
import
torch
import
numpy
as
np
import
numpy
as
np
try
:
import
librosa
except
ImportError
:
librosa
=
None
class
Compose
(
object
):
class
Compose
(
object
):
"""Composes several transforms together.
"""Composes several transforms together.
...
@@ -105,3 +109,75 @@ class DownmixMono(object):
...
@@ -105,3 +109,75 @@ class DownmixMono(object):
if
tensor
.
size
(
1
)
>
1
:
if
tensor
.
size
(
1
)
>
1
:
tensor
=
torch
.
mean
(
tensor
.
float
(),
1
,
True
)
tensor
=
torch
.
mean
(
tensor
.
float
(),
1
,
True
)
return
tensor
return
tensor
class LC2CL(object):
    """Permute a 2d tensor from samples (Length) x Channels to Channels x
    samples (Length)
    """

    def __call__(self, tensor):
        """
        Args:
            tensor (Tensor): Tensor of audio signal with shape (LxC)

        Returns:
            tensor (Tensor): Tensor of audio signal with shape (CxL)
        """
        # transpose() swaps the first two axes; contiguous() then lays the
        # result out in memory in the new (CxL) order.
        return tensor.transpose(0, 1).contiguous()
class MEL(object):
    """Create MEL Spectrograms from a raw audio signal. Relatively slow: every
    channel is converted to a numpy array and processed by librosa in turn.

    Usage (see librosa.feature.melspectrogram docs):
        MEL(sr=16000, n_fft=1600, hop_length=800, n_mels=64)
    """

    def __init__(self, **kwargs):
        # Stored as-is and forwarded to librosa.feature.melspectrogram on
        # every call.
        self.kwargs = kwargs

    def __call__(self, tensor):
        """
        Args:
            tensor (Tensor): Tensor of audio of size (samples x channels)

        Returns:
            tensor (Tensor): n_mels x hops x channels (BxLxC), where n_mels is
                the number of mel bins, hops is the number of hops, and channels
                is unchanged. If librosa is not installed a message is printed
                and the input tensor is returned unchanged.
        """
        if librosa is None:
            print("librosa not installed, cannot create spectrograms")
            return tensor

        per_channel = []
        for i in range(tensor.size(1)):
            # A column slice of a 2-d tensor is strided; give librosa a
            # contiguous 1-d array.
            nparr = np.ascontiguousarray(tensor[:, i].numpy())  # (samples, )
            # The signal is passed by keyword because newer librosa releases
            # no longer accept it positionally.
            sgram = librosa.feature.melspectrogram(y=nparr, **self.kwargs)  # (n_mels, hops)
            per_channel.append(sgram)

        stacked = np.stack(per_channel, 2)  # (n_mels, hops, channels)
        # Cast back to the tensor type of the input.
        return torch.from_numpy(stacked).type_as(tensor)
class BLC2CBL(object):
    """Reorder a 3d spectrogram tensor so that channels come first: Bands x
    samples (Length) x Channels becomes Channels x Bands x samples (Length).
    """

    def __call__(self, tensor):
        """
        Args:
            tensor (Tensor): spectrogram laid out as (BxLxC)

        Returns:
            tensor (Tensor): the same data laid out as (CxBxL), contiguous
                in memory
        """
        channels_first = tensor.permute(2, 0, 1)
        return channels_first.contiguous()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment