Commit 9538c65f authored by Soumith Chintala's avatar Soumith Chintala Committed by GitHub
Browse files

Merge pull request #15 from dhpollack/MEL

add MEL spectrograms transform and fixed a few tests.
parents 697f4621 5bbc2ee2
import unittest import unittest
import torch
import torchaudio import torchaudio
import math import math
import os import os
class Test_LoadSave(unittest.TestCase): class Test_LoadSave(unittest.TestCase):
test_dirpath = os.path.dirname(os.path.realpath(__file__)) test_dirpath = os.path.dirname(os.path.realpath(__file__))
test_filepath = os.path.join(test_dirpath, "steam-train-whistle-daniel_simon.mp3") test_filepath = os.path.join(test_dirpath, "assets", "steam-train-whistle-daniel_simon.mp3")
def test_load(self): def test_load(self):
# check normal loading # check normal loading
x, sr = torchaudio.load(self.test_filepath) x, sr = torchaudio.load(self.test_filepath)
...@@ -76,33 +77,18 @@ class Test_LoadSave(unittest.TestCase): ...@@ -76,33 +77,18 @@ class Test_LoadSave(unittest.TestCase):
new_filepath = os.path.join(self.test_dirpath, "no-path", "test.wav") new_filepath = os.path.join(self.test_dirpath, "no-path", "test.wav")
torchaudio.save(new_filepath, x, sr) torchaudio.save(new_filepath, x, sr)
steam_train = "assets/steam-train-whistle-daniel_simon.mp3" # save created file
sinewave_filepath = os.path.join(self.test_dirpath, "assets", "sinewave.wav")
x, sample_rate = torchaudio.load(steam_train) sr = 16000
print(sample_rate) freq = 440
print(x.size()) volume = 0.3
print(x[10000])
print(x.min(), x.max()) y = (torch.cos(2*math.pi*torch.arange(0, 4*sr) * freq/sr)).float()
print(x.mean(), x.std()) y.unsqueeze_(1)
# y is between -1 and 1, so must scale
x, sample_rate = torchaudio.load(steam_train, y = (y*volume*2**31).long()
out=torch.LongTensor()) torchaudio.save(sinewave_filepath, y, sr)
print(sample_rate) self.assertTrue(os.path.isfile(sinewave_filepath))
print(x.size())
print(x[10000])
print(x.min(), x.max())
sine_wave = "assets/sinewave.wav"
sr = 16000
freq = 440
volume = 0.3
y = (torch.cos(2*math.pi*torch.arange(0, 4*sr) * freq/sr)).float()
y.unsqueeze_(1)
# y is between -1 and 1, so must scale
y = (y*volume*2**31).long()
torchaudio.save(sine_wave, y, sr)
print(y.min(), y.max())
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -4,8 +4,6 @@ import torchaudio.transforms as transforms ...@@ -4,8 +4,6 @@ import torchaudio.transforms as transforms
import numpy as np import numpy as np
import unittest import unittest
STEAM_TRAIN = "assets/steam-train-whistle-daniel_simon.mp3"
class Tester(unittest.TestCase): class Tester(unittest.TestCase):
sr = 16000 sr = 16000
...@@ -20,13 +18,13 @@ class Tester(unittest.TestCase): ...@@ -20,13 +18,13 @@ class Tester(unittest.TestCase):
audio_orig = self.sig.clone() audio_orig = self.sig.clone()
result = transforms.Scale()(audio_orig) result = transforms.Scale()(audio_orig)
self.assertTrue(result.min() >= -1. and result.max() <= 1., self.assertTrue(result.min() >= -1. and result.max() <= 1.,
"min: {}, max: {}".format(result.min(), result.max())) print("min: {}, max: {}".format(result.min(), result.max())))
maxminmax = np.abs([audio_orig.min(), audio_orig.max()]).max().astype(np.float) maxminmax = np.abs([audio_orig.min(), audio_orig.max()]).max().astype(np.float)
result = transforms.Scale(factor=maxminmax)(audio_orig) result = transforms.Scale(factor=maxminmax)(audio_orig)
self.assertTrue((result.min() == -1. or result.max() == 1.) and self.assertTrue((result.min() == -1. or result.max() == 1.) and
result.min() >= -1. and result.max() <= 1., result.min() >= -1. and result.max() <= 1.,
"min: {}, max: {}".format(result.min(), result.max())) print("min: {}, max: {}".format(result.min(), result.max())))
def test_pad_trim(self): def test_pad_trim(self):
...@@ -37,7 +35,7 @@ class Tester(unittest.TestCase): ...@@ -37,7 +35,7 @@ class Tester(unittest.TestCase):
result = transforms.PadTrim(max_len=length_new)(audio_orig) result = transforms.PadTrim(max_len=length_new)(audio_orig)
self.assertTrue(result.size(0) == length_new, self.assertTrue(result.size(0) == length_new,
"old size: {}, new size: {}".format(audio_orig.size(0), result.size(0))) print("old size: {}, new size: {}".format(audio_orig.size(0), result.size(0))))
audio_orig = self.sig.clone() audio_orig = self.sig.clone()
length_orig = audio_orig.size(0) length_orig = audio_orig.size(0)
...@@ -46,7 +44,7 @@ class Tester(unittest.TestCase): ...@@ -46,7 +44,7 @@ class Tester(unittest.TestCase):
result = transforms.PadTrim(max_len=length_new)(audio_orig) result = transforms.PadTrim(max_len=length_new)(audio_orig)
self.assertTrue(result.size(0) == length_new, self.assertTrue(result.size(0) == length_new,
"old size: {}, new size: {}".format(audio_orig.size(0), result.size(0))) print("old size: {}, new size: {}".format(audio_orig.size(0), result.size(0))))
def test_downmix_mono(self): def test_downmix_mono(self):
...@@ -64,6 +62,22 @@ class Tester(unittest.TestCase): ...@@ -64,6 +62,22 @@ class Tester(unittest.TestCase):
self.assertTrue(result.size(1) == 1) self.assertTrue(result.size(1) == 1)
def test_lc2cl(self):
    """LC2CL should swap the two axes of a (Length x Channels) signal."""
    signal = self.sig.clone()
    swapped = transforms.LC2CL()(signal)
    # reversing the permuted size must give back the original size
    self.assertTrue(swapped.size()[::-1] == signal.size())
def test_mel(self):
    """Scale the signal, build a MEL spectrogram, then permute its layout."""
    signal = transforms.Scale()(self.sig.clone())
    # raw audio is 2d: (samples x channels)
    self.assertTrue(len(signal.size()) == 2)
    spectrogram = transforms.MEL()(signal)
    # mel spectrogram is 3d: (n_mels x hops x channels)
    self.assertTrue(len(spectrogram.size()) == 3)
    permuted = transforms.BLC2CBL()(spectrogram)
    # permutation keeps the tensor 3d
    self.assertTrue(len(permuted.size()) == 3)
def test_compose(self): def test_compose(self):
audio_orig = self.sig.clone() audio_orig = self.sig.clone()
......
from __future__ import division from __future__ import division
import torch import torch
import numpy as np import numpy as np
try:
import librosa
except ImportError:
librosa = None
class Compose(object): class Compose(object):
"""Composes several transforms together. """Composes several transforms together.
...@@ -105,3 +109,75 @@ class DownmixMono(object): ...@@ -105,3 +109,75 @@ class DownmixMono(object):
if tensor.size(1) > 1: if tensor.size(1) > 1:
tensor = torch.mean(tensor.float(), 1, True) tensor = torch.mean(tensor.float(), 1, True)
return tensor return tensor
class LC2CL(object):
    """Permute a 2d tensor from samples (Length) x Channels to Channels x
    samples (Length)
    """

    def __call__(self, tensor):
        """
        Args:
            tensor (Tensor): Tensor of audio signal with shape (LxC)

        Returns:
            tensor (Tensor): Tensor of audio signal with shape (CxL)
        """
        # NOTE: docstring previously described 3d (BxLxC) spectrogram shapes,
        # copied from BLC2CBL; this transform operates on 2d audio tensors.
        # contiguous() so downstream ops that need flat storage (e.g. view) work.
        return tensor.transpose(0, 1).contiguous()
class MEL(object):
    """Create MEL Spectrograms from a raw audio signal. Relatively pretty slow.

    Usage (see librosa.feature.melspectrogram docs):
        MEL(sr=16000, n_fft=1600, hop_length=800, n_mels=64)
    """

    def __init__(self, **kwargs):
        # keyword arguments are forwarded verbatim to
        # librosa.feature.melspectrogram
        self.kwargs = kwargs

    def __call__(self, tensor):
        """
        Args:
            tensor (Tensor): Tensor of audio of size (samples x channels)

        Returns:
            tensor (Tensor): n_mels x hops x channels (BxLxC), where n_mels is
                the number of mel bins, hops is the number of hops, and channels
                is unchanged.
        """
        # librosa is an optional dependency; without it the input passes
        # through untouched.
        if librosa is None:
            print("librosa not installed, cannot create spectrograms")
            return tensor

        # one (n_mels, hops) spectrogram per channel
        per_channel = [
            librosa.feature.melspectrogram(tensor[:, ch].numpy(), **self.kwargs)
            for ch in range(tensor.size(1))
        ]
        stacked = np.stack(per_channel, 2)  # (n_mels, hops, channels)
        return torch.from_numpy(stacked).type_as(tensor)
class BLC2CBL(object):
    """Permute a 3d tensor from Bands x samples (Length) x Channels to
    Channels x Bands x samples (Length)
    """

    def __call__(self, tensor):
        """
        Args:
            tensor (Tensor): Tensor of spectrogram with shape (BxLxC)

        Returns:
            tensor (Tensor): Tensor of spectrogram with shape (CxBxL)
        """
        # move the channel axis to the front, then lay the result out
        # contiguously in memory
        permuted = tensor.permute(2, 0, 1)
        return permuted.contiguous()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment