Commit 9538c65f authored by Soumith Chintala's avatar Soumith Chintala Committed by GitHub
Browse files

Merge pull request #15 from dhpollack/MEL

add MEL spectrograms transform and fixed a few tests.
parents 697f4621 5bbc2ee2
import unittest import unittest
import torch
import torchaudio import torchaudio
import math import math
import os import os
class Test_LoadSave(unittest.TestCase): class Test_LoadSave(unittest.TestCase):
test_dirpath = os.path.dirname(os.path.realpath(__file__)) test_dirpath = os.path.dirname(os.path.realpath(__file__))
test_filepath = os.path.join(test_dirpath, "steam-train-whistle-daniel_simon.mp3") test_filepath = os.path.join(test_dirpath, "assets", "steam-train-whistle-daniel_simon.mp3")
def test_load(self): def test_load(self):
# check normal loading # check normal loading
x, sr = torchaudio.load(self.test_filepath) x, sr = torchaudio.load(self.test_filepath)
...@@ -76,33 +77,18 @@ class Test_LoadSave(unittest.TestCase): ...@@ -76,33 +77,18 @@ class Test_LoadSave(unittest.TestCase):
new_filepath = os.path.join(self.test_dirpath, "no-path", "test.wav") new_filepath = os.path.join(self.test_dirpath, "no-path", "test.wav")
torchaudio.save(new_filepath, x, sr) torchaudio.save(new_filepath, x, sr)
steam_train = "assets/steam-train-whistle-daniel_simon.mp3" # save created file
sinewave_filepath = os.path.join(self.test_dirpath, "assets", "sinewave.wav")
x, sample_rate = torchaudio.load(steam_train) sr = 16000
print(sample_rate) freq = 440
print(x.size()) volume = 0.3
print(x[10000])
print(x.min(), x.max()) y = (torch.cos(2*math.pi*torch.arange(0, 4*sr) * freq/sr)).float()
print(x.mean(), x.std()) y.unsqueeze_(1)
# y is between -1 and 1, so must scale
x, sample_rate = torchaudio.load(steam_train, y = (y*volume*2**31).long()
out=torch.LongTensor()) torchaudio.save(sinewave_filepath, y, sr)
print(sample_rate) self.assertTrue(os.path.isfile(sinewave_filepath))
print(x.size())
print(x[10000])
print(x.min(), x.max())
sine_wave = "assets/sinewave.wav"
sr = 16000
freq = 440
volume = 0.3
y = (torch.cos(2*math.pi*torch.arange(0, 4*sr) * freq/sr)).float()
y.unsqueeze_(1)
# y is between -1 and 1, so must scale
y = (y*volume*2**31).long()
torchaudio.save(sine_wave, y, sr)
print(y.min(), y.max())
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -4,8 +4,6 @@ import torchaudio.transforms as transforms ...@@ -4,8 +4,6 @@ import torchaudio.transforms as transforms
import numpy as np import numpy as np
import unittest import unittest
STEAM_TRAIN = "assets/steam-train-whistle-daniel_simon.mp3"
class Tester(unittest.TestCase): class Tester(unittest.TestCase):
sr = 16000 sr = 16000
...@@ -20,13 +18,13 @@ class Tester(unittest.TestCase): ...@@ -20,13 +18,13 @@ class Tester(unittest.TestCase):
audio_orig = self.sig.clone() audio_orig = self.sig.clone()
result = transforms.Scale()(audio_orig) result = transforms.Scale()(audio_orig)
self.assertTrue(result.min() >= -1. and result.max() <= 1., self.assertTrue(result.min() >= -1. and result.max() <= 1.,
"min: {}, max: {}".format(result.min(), result.max())) print("min: {}, max: {}".format(result.min(), result.max())))
maxminmax = np.abs([audio_orig.min(), audio_orig.max()]).max().astype(np.float) maxminmax = np.abs([audio_orig.min(), audio_orig.max()]).max().astype(np.float)
result = transforms.Scale(factor=maxminmax)(audio_orig) result = transforms.Scale(factor=maxminmax)(audio_orig)
self.assertTrue((result.min() == -1. or result.max() == 1.) and self.assertTrue((result.min() == -1. or result.max() == 1.) and
result.min() >= -1. and result.max() <= 1., result.min() >= -1. and result.max() <= 1.,
"min: {}, max: {}".format(result.min(), result.max())) print("min: {}, max: {}".format(result.min(), result.max())))
def test_pad_trim(self): def test_pad_trim(self):
...@@ -37,7 +35,7 @@ class Tester(unittest.TestCase): ...@@ -37,7 +35,7 @@ class Tester(unittest.TestCase):
result = transforms.PadTrim(max_len=length_new)(audio_orig) result = transforms.PadTrim(max_len=length_new)(audio_orig)
self.assertTrue(result.size(0) == length_new, self.assertTrue(result.size(0) == length_new,
"old size: {}, new size: {}".format(audio_orig.size(0), result.size(0))) print("old size: {}, new size: {}".format(audio_orig.size(0), result.size(0))))
audio_orig = self.sig.clone() audio_orig = self.sig.clone()
length_orig = audio_orig.size(0) length_orig = audio_orig.size(0)
...@@ -46,7 +44,7 @@ class Tester(unittest.TestCase): ...@@ -46,7 +44,7 @@ class Tester(unittest.TestCase):
result = transforms.PadTrim(max_len=length_new)(audio_orig) result = transforms.PadTrim(max_len=length_new)(audio_orig)
self.assertTrue(result.size(0) == length_new, self.assertTrue(result.size(0) == length_new,
"old size: {}, new size: {}".format(audio_orig.size(0), result.size(0))) print("old size: {}, new size: {}".format(audio_orig.size(0), result.size(0))))
def test_downmix_mono(self): def test_downmix_mono(self):
...@@ -64,6 +62,22 @@ class Tester(unittest.TestCase): ...@@ -64,6 +62,22 @@ class Tester(unittest.TestCase):
self.assertTrue(result.size(1) == 1) self.assertTrue(result.size(1) == 1)
def test_lc2cl(self):
    """LC2CL should swap the two axes of a (Length x Channels) signal."""
    signal = self.sig.clone()
    swapped = transforms.LC2CL()(signal)
    # reversing the permuted size must give back the original size
    self.assertTrue(swapped.size()[::-1] == signal.size())
def test_mel(self):
    """Scale the signal, build a MEL spectrogram, then permute its layout."""
    signal = transforms.Scale()(self.sig.clone())
    # raw audio is 2d: (samples x channels)
    self.assertTrue(len(signal.size()) == 2)
    spectrogram = transforms.MEL()(signal)
    # mel spectrogram is 3d: (n_mels x hops x channels)
    self.assertTrue(len(spectrogram.size()) == 3)
    permuted = transforms.BLC2CBL()(spectrogram)
    # permutation keeps the tensor 3d
    self.assertTrue(len(permuted.size()) == 3)
def test_compose(self): def test_compose(self):
audio_orig = self.sig.clone() audio_orig = self.sig.clone()
......
from __future__ import division from __future__ import division
import torch import torch
import numpy as np import numpy as np
try:
import librosa
except ImportError:
librosa = None
class Compose(object): class Compose(object):
"""Composes several transforms together. """Composes several transforms together.
...@@ -105,3 +109,75 @@ class DownmixMono(object): ...@@ -105,3 +109,75 @@ class DownmixMono(object):
if tensor.size(1) > 1: if tensor.size(1) > 1:
tensor = torch.mean(tensor.float(), 1, True) tensor = torch.mean(tensor.float(), 1, True)
return tensor return tensor
class LC2CL(object):
    """Permute a 2d tensor from samples (Length) x Channels to Channels x
    samples (Length)
    """

    def __call__(self, tensor):
        """
        Args:
            tensor (Tensor): Tensor of audio signal with shape (LxC)

        Returns:
            tensor (Tensor): Tensor of audio signal with shape (CxL)
        """
        # NOTE: docstring previously described 3d (BxLxC) spectrogram shapes,
        # copied from BLC2CBL; this transform operates on 2d audio tensors.
        # contiguous() so downstream ops that need flat storage (e.g. view) work.
        return tensor.transpose(0, 1).contiguous()
class MEL(object):
    """Create MEL Spectrograms from a raw audio signal. Relatively pretty slow.

    Usage (see librosa.feature.melspectrogram docs):
        MEL(sr=16000, n_fft=1600, hop_length=800, n_mels=64)
    """

    def __init__(self, **kwargs):
        # keyword arguments are forwarded verbatim to
        # librosa.feature.melspectrogram
        self.kwargs = kwargs

    def __call__(self, tensor):
        """
        Args:
            tensor (Tensor): Tensor of audio of size (samples x channels)

        Returns:
            tensor (Tensor): n_mels x hops x channels (BxLxC), where n_mels is
                the number of mel bins, hops is the number of hops, and channels
                is unchanged.
        """
        # librosa is an optional dependency; without it the input passes
        # through untouched.
        if librosa is None:
            print("librosa not installed, cannot create spectrograms")
            return tensor

        # one (n_mels, hops) spectrogram per channel
        per_channel = [
            librosa.feature.melspectrogram(tensor[:, ch].numpy(), **self.kwargs)
            for ch in range(tensor.size(1))
        ]
        stacked = np.stack(per_channel, 2)  # (n_mels, hops, channels)
        return torch.from_numpy(stacked).type_as(tensor)
class BLC2CBL(object):
    """Permute a 3d tensor from Bands x samples (Length) x Channels to
    Channels x Bands x samples (Length)
    """

    def __call__(self, tensor):
        """
        Args:
            tensor (Tensor): Tensor of spectrogram with shape (BxLxC)

        Returns:
            tensor (Tensor): Tensor of spectrogram with shape (CxBxL)
        """
        # move the channel axis to the front, then lay the result out
        # contiguously in memory
        permuted = tensor.permute(2, 0, 1)
        return permuted.contiguous()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment