Commit 101e0d5f authored by Jason Lian's avatar Jason Lian
Browse files

adding file

parent acdedc4a
import torch
def scale(tensor, factor):
    # type: (Tensor, int) -> Tensor
    """Scale an audio tensor by dividing it by ``factor``.

    Integer (PCM) input is promoted to ``torch.float32`` first so the
    division produces floating-point output.
    """
    if tensor.dtype.is_floating_point:
        return tensor / factor
    return tensor.to(torch.float32) / factor
def pad_trim(tensor, ch_dim, max_len, len_dim, fill_value):
    # type: (Tensor, int, int, int, float) -> Tensor
    """Pad (with ``fill_value``) or trim ``tensor`` along ``len_dim`` so its
    length is exactly ``max_len``.

    ``ch_dim`` is the channel dimension; a large size there usually means the
    channels_first flag was set incorrectly, hence the sanity assert.
    """
    n_channels = tensor.size(ch_dim)
    assert n_channels < 128, \
        "Too many channels ({}) detected, see channels_first param.".format(n_channels)
    cur_len = tensor.size(len_dim)
    if cur_len < max_len:
        # F.pad takes (left, right) pairs starting from the LAST dimension;
        # pad only on the right-hand side of len_dim, zero everywhere else.
        padding = [0, 0, 0, 0]
        for i in range(4):
            if i % 2 == 1 and i // 2 != len_dim:
                padding[i] = max_len - cur_len
        with torch.no_grad():
            tensor = torch.nn.functional.pad(tensor, padding, "constant", fill_value)
    elif cur_len > max_len:
        tensor = tensor.narrow(len_dim, 0, max_len)
    return tensor
def downmix_mono(tensor, ch_dim):
    # type: (Tensor, int) -> Tensor
    """Downmix to mono by averaging the channels along ``ch_dim``.

    The channel dimension is kept (size 1) in the output; integer input is
    promoted to ``torch.float32`` before averaging.
    """
    if not tensor.dtype.is_floating_point:
        tensor = tensor.to(torch.float32)
    return tensor.mean(ch_dim, keepdim=True)
def lc2cl(tensor):
    # type: (Tensor) -> Tensor
    """Swap the first two dimensions, turning a (len, channels) layout into a
    contiguous (channels, len) one."""
    swapped = tensor.transpose(0, 1)
    return swapped.contiguous()
def spectrogram(sig, pad, window, n_fft, hop, ws, power, normalize):
    # type: (Tensor, int, Tensor, int, int, int, int, bool) -> Tensor
    """Compute the spectrogram of a 2D (channel, time) audio signal.

    Args:
        sig: audio tensor of shape (c, n); asserted 2D
        pad: amount of two-sided constant padding applied to the signal
        window: window tensor (e.g. ``torch.hann_window(ws)``)
        n_fft: FFT size
        hop: hop length between STFT frames
        ws: window size
        power: exponent applied to the real and imaginary parts before the
            final sum (2 yields the power spectrogram)
        normalize: if True, divide by the window's L2 norm before summing

    Returns:
        Tensor of shape (c, frames, n_fft // 2 + 1).
    """
    assert sig.dim() == 2

    if pad > 0:
        with torch.no_grad():
            sig = torch.nn.functional.pad(sig, (pad, pad), "constant")
    # make sure the window lives on the same device as the signal
    window = window.to(sig.device)
    # default values are consistent with librosa.core.spectrum._spectrogram
    # FIX: current PyTorch requires return_complex for real input; request a
    # complex result and re-expose the legacy trailing (real, imag) dimension
    # with view_as_real so the output is byte-identical to the old API's.
    spec_f = torch.stft(sig, n_fft, hop, ws, window,
                        center=True, normalized=False, onesided=True,
                        pad_mode='reflect', return_complex=True)
    spec_f = torch.view_as_real(spec_f).transpose(1, 2)
    if normalize:
        spec_f /= window.pow(2).sum().sqrt()
    # get power of "complex" tensor (c, l, n_fft // 2 + 1)
    spec_f = spec_f.pow(power).sum(-1)
    return spec_f
......@@ -2,6 +2,7 @@ from __future__ import division, print_function
from warnings import warn
import torch
import numpy as np
import functional as F
class Compose(object):
......@@ -57,10 +58,7 @@ class Scale(object):
Tensor: Scaled by the scale factor. (default between -1.0 and 1.0)
"""
if not tensor.dtype.is_floating_point:
tensor = tensor.to(torch.float32)
return tensor / self.factor
return F.scale(tensor, factor)
def __repr__(self):
return self.__class__.__name__ + '()'
......@@ -88,18 +86,7 @@ class PadTrim(object):
Tensor: (c x n) or (n x c)
"""
assert tensor.size(self.ch_dim) < 128, \
"Too many channels ({}) detected, see channels_first param.".format(tensor.size(self.ch_dim))
if self.max_len > tensor.size(self.len_dim):
padding = [self.max_len - tensor.size(self.len_dim)
if (i % 2 == 1) and (i // 2 != self.len_dim)
else 0
for i in range(4)]
with torch.no_grad():
tensor = torch.nn.functional.pad(tensor, padding, "constant", self.fill_value)
elif self.max_len < tensor.size(self.len_dim):
tensor = tensor.narrow(self.len_dim, 0, self.max_len)
return tensor
return F.pad_trim(tensor, self.ch_dim, self.max_len, self.len_dim, self.fill_value)
def __repr__(self):
return self.__class__.__name__ + '(max_len={0})'.format(self.max_len)
......@@ -122,11 +109,7 @@ class DownmixMono(object):
self.ch_dim = int(not channels_first)
def __call__(self, tensor):
if not tensor.dtype.is_floating_point:
tensor = tensor.to(torch.float32)
tensor = torch.mean(tensor, self.ch_dim, True)
return tensor
return F.downmix_mono(tensor, self.ch_dim)
def __repr__(self):
return self.__class__.__name__ + '()'
......@@ -145,7 +128,7 @@ class LC2CL(object):
Returns:
tensor (Tensor): Tensor of audio signal with shape (CxL)
"""
return tensor.transpose(0, 1).contiguous()
return F.lc2cl(tensor)
def __repr__(self):
return self.__class__.__name__ + '()'
......@@ -196,22 +179,8 @@ class Spectrogram(object):
by 2 plus 1.
"""
assert sig.dim() == 2
if self.pad > 0:
with torch.no_grad():
sig = torch.nn.functional.pad(sig, (self.pad, self.pad), "constant")
self.window = self.window.to(sig.device)
# default values are consistent with librosa.core.spectrum._spectrogram
spec_f = torch.stft(sig, self.n_fft, self.hop, self.ws,
self.window, center=True,
normalized=False, onesided=True,
pad_mode='reflect').transpose(1, 2)
if self.normalize:
spec_f /= self.window.pow(2).sum().sqrt()
spec_f = spec_f.pow(self.power).sum(-1) # get power of "complex" tensor (c, l, n_fft)
return spec_f
return F.spectrogram(sig, self.pad, self.window, self.n_fft, self.hop,
self.ws, self.power, self.normalize)
def F2M(*args, **kwargs):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment