functional.py 11.8 KB
Newer Older
1
import math
Jason Lian's avatar
Jason Lian committed
2
3
import torch

Jason Lian's avatar
Jason Lian committed
4

Jason Lian's avatar
pre  
Jason Lian committed
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# Names exported via `from <this module> import *`.
__all__ = [
    'scale',
    'pad_trim',
    'downmix_mono',
    'LC2CL',
    'spectrogram',
    'create_fb_matrix',
    'mel_scale',
    'spectrogram_to_DB',
    'create_dct',
    'MFCC',
    'BLC2CBL',
    'mu_law_encoding',
    'mu_law_expanding'
]

Jason Lian's avatar
Jason Lian committed
21

Jason Lian's avatar
Jason Lian committed
22
23
def scale(tensor, factor):
    # type: (Tensor, int) -> Tensor
    """Scale audio tensor from a 16-bit integer (represented as a FloatTensor)
    to a floating point number between -1.0 and 1.0.  Note the 16-bit number is
    called the "bit depth" or "precision", not to be confused with "bit rate".

    Inputs:
        tensor (Tensor): Tensor of audio of size (Samples x Channels)
        factor (int): Maximum value of input tensor

    Outputs:
        Tensor: Scaled by the scale factor
    """
    # Promote integer inputs to float32 so the division below is not
    # integer division.
    if not tensor.dtype.is_floating_point:
        tensor = tensor.to(torch.float32)
    return torch.div(tensor, factor)

Jason Lian's avatar
more  
Jason Lian committed
40

Jason Lian's avatar
Jason Lian committed
41
42
def pad_trim(tensor, ch_dim, max_len, len_dim, fill_value):
    # type: (Tensor, int, int, int, float) -> Tensor
    """Pad/Trim a 2d-Tensor (Signal or Labels)

    Inputs:
        tensor (Tensor): Tensor of audio of size (n x c) or (c x n)
        ch_dim (int): Dimension of channel (not size)
        max_len (int): Length to which the tensor will be padded
        len_dim (int): Dimension of length (not size)
        fill_value (float): Value to fill in

    Outputs:
        Tensor: Padded/trimmed tensor
    """
    cur_len = tensor.size(len_dim)
    if cur_len < max_len:
        # F.pad takes (left, right, top, bottom) for a 2d input, where the
        # first pair pads the LAST dimension.  Append fill_value on the
        # right/bottom side of the length dimension only.
        padding = [0, 0, 0, 0]
        padding[2 * (1 - len_dim) + 1] = max_len - cur_len
        with torch.no_grad():
            tensor = torch.nn.functional.pad(tensor, padding, "constant", fill_value)
    elif cur_len > max_len:
        # keep the first max_len entries along the length dimension
        tensor = tensor.narrow(len_dim, 0, max_len)
    return tensor

Jason Lian's avatar
more  
Jason Lian committed
69

Jason Lian's avatar
Jason Lian committed
70
71
def downmix_mono(tensor, ch_dim):
    # type: (Tensor, int) -> Tensor
    """Downmix any stereo signals to mono.

    Inputs:
        tensor (Tensor): Tensor of audio of size (c x n) or (n x c)
        ch_dim (int): Dimension of channel (not size)

    Outputs:
        Tensor: Mono signal
    """
    # mean() over integer tensors is not supported, so promote first
    if not tensor.dtype.is_floating_point:
        tensor = tensor.to(torch.float32)
    # keepdim=True keeps the (now size-1) channel dimension in place
    return tensor.mean(dim=ch_dim, keepdim=True)

Jason Lian's avatar
more  
Jason Lian committed
87

Jason Lian's avatar
more  
Jason Lian committed
88
def LC2CL(tensor):
    # type: (Tensor) -> Tensor
    """Permute a 2d tensor from samples (n x c) to (c x n)

    Inputs:
        tensor (Tensor): Tensor of audio signal with shape (LxC)

    Outputs:
        Tensor: Tensor of audio signal with shape (CxL)
    """
    # .t() swaps the two dims; contiguous() materializes the new layout
    return tensor.t().contiguous()

Jason Lian's avatar
more  
Jason Lian committed
100

Jason Lian's avatar
Jason Lian committed
101
102
def spectrogram(sig, pad, window, n_fft, hop, ws, power, normalize):
    # type: (Tensor, int, Tensor, int, int, int, int, bool) -> Tensor
    """Create a spectrogram from a raw audio signal

    Inputs:
        sig (Tensor): Tensor of audio of size (c, n)
        pad (int): two sided padding of signal
        window (Tensor): window_tensor
        n_fft (int): size of fft
        hop (int): length of hop between STFT windows
        ws (int): window size
        power (int > 0 ) : Exponent for the magnitude spectrogram,
                        e.g., 1 for energy, 2 for power, etc.
        normalize (bool) : whether to normalize by magnitude after stft

    Outputs:
        Tensor: channels x hops x n_fft (c, l, f), where channels
            is unchanged, hops is the number of hops, and n_fft is the
            number of fourier bins, which should be the window size divided
            by 2 plus 1.
    """
    assert sig.dim() == 2

    if pad > 0:
        with torch.no_grad():
            sig = torch.nn.functional.pad(sig, (pad, pad), "constant")
    # the window must live on the same device as the signal
    window = window.to(sig.device)

    # default values are consistent with librosa.core.spectrum._spectrogram
    # return_complex=True is required by current PyTorch; the complex output
    # also lets us compute a true magnitude below.
    spec_f = torch.stft(sig, n_fft, hop, ws,
                        window, center=True,
                        normalized=False, onesided=True,
                        pad_mode='reflect',
                        return_complex=True).transpose(1, 2)
    if normalize:
        spec_f /= window.pow(2).sum().sqrt()
    # |stft| ** power.  The previous spec_f.pow(power).sum(-1) over the
    # (real, imag) pair equals the magnitude only for power == 2;
    # abs().pow(power) is correct for every power (e.g. power == 1).
    spec_f = spec_f.abs().pow(power)  # (c, l, n_fft)
    return spec_f
Jason Lian's avatar
more  
Jason Lian committed
139
140
141
142
143
144


def create_fb_matrix(n_stft, f_min, f_max, n_mels):
    # type: (int, float, float, int) -> Tensor
    """ Create a frequency bin conversion matrix.

    Inputs:
        n_stft (int): number of filter banks from spectrogram
        f_min (float): minimum frequency
        f_max (float): maximum frequency
        n_mels (int): number of mel bins

    Outputs:
        Tensor: triangular filter banks (fb matrix)
    """
    def _hz2mel(freq):
        # type: (float) -> Tensor
        return 2595. * torch.log10(torch.tensor(1.) + (freq / 700.))

    def _mel2hz(mels):
        # type: (Tensor) -> Tensor
        return 700. * (10**(mels / 2595.) - 1.)

    # frequencies covered by the spectrogram bins
    freqs = torch.linspace(f_min, f_max, n_stft)
    # equally spaced points on the mel scale, converted back to hertz;
    # the two extra points are the outer edges of the first/last triangle
    mel_lo = 0. if f_min == 0 else _hz2mel(f_min)
    mel_hi = _hz2mel(f_max)
    hz_pts = _mel2hz(torch.linspace(mel_lo, mel_hi, n_mels + 2))
    # width (in hertz) between consecutive mel points
    widths = hz_pts[1:] - hz_pts[:-1]  # (n_mels + 1)
    # signed distance from every stft bin to every mel point
    offsets = hz_pts.unsqueeze(0) - freqs.unsqueeze(1)  # (n_stft, n_mels + 2)
    # rising and falling edges of each triangular filter
    rising = (-1. * offsets[:, :-2]) / widths[:-1]  # (n_stft, n_mels)
    falling = offsets[:, 2:] / widths[1:]  # (n_stft, n_mels)
    # each filter is the overlap of its two edges, floored at zero
    return torch.min(rising, falling).clamp(min=0.)


def mel_scale(spec_f, f_min, f_max, n_mels, fb=None):
    # type: (Tensor, float, float, int, Optional[Tensor]) -> Tuple[Tensor, Tensor]
    """ This turns a normal STFT into a mel frequency STFT, using a conversion
    matrix.  This uses triangular filter banks.

    Inputs:
        spec_f (Tensor): normal STFT
        f_min (float): minimum frequency
        f_max (float): maximum frequency
        n_mels (int): number of mel bins
        fb (Optional[Tensor]): triangular filter banks (fb matrix)

    Outputs:
        Tuple[Tensor, Tensor]: triangular filter banks (fb matrix) and mel frequency STFT
    """
    if fb is None:
        # build the filter bank directly on the spectrogram's device
        fb = create_fb_matrix(spec_f.size(2), f_min, f_max, n_mels).to(spec_f.device)
    else:
        # matmul requires both operands on the same device
        fb = fb.to(spec_f.device)
    # (c, l, n_fft) dot (n_fft, n_mels) -> (c, l, n_mels)
    spec_m = spec_f.matmul(fb)
    return fb, spec_m


Jason Lian's avatar
Jason Lian committed
204
def spectrogram_to_DB(spec, multiplier, amin, db_multiplier, top_db=None):
    # type: (Tensor, float, float, float, Optional[float]) -> Tensor
    """Turns a spectrogram from the power/amplitude scale to the decibel scale.

    This output depends on the maximum value in the input spectrogram, and so
    may return different values for an audio clip split into snippets vs. a
    a full clip.

    Inputs:
        spec (Tensor): normal STFT
        multiplier (float): use 10. for power and 20. for amplitude
        amin (float): number to clamp spec
        db_multiplier (float): log10(max(reference value and amin))
        top_db (Optional[float]): minimum negative cut-off in decibels.  A reasonable number
            is 80.

    Outputs:
        Tensor: spectrogram in DB
    """
    # clamping avoids log10(0); subtracting the reference recenters the scale
    spec_db = multiplier * torch.log10(spec.clamp(min=amin)) - multiplier * db_multiplier

    if top_db is not None:
        # cut everything more than top_db below the loudest value
        floor = spec_db.max().item() - top_db
        spec_db = torch.max(spec_db, spec_db.new_full((1,), floor))
    return spec_db
Jason Lian's avatar
more  
Jason Lian committed
229
230
231
232
233
234
235


def create_dct(n_mfcc, n_mels, norm):
    # type: (int, int, str) -> Tensor
    """
    Creates a DCT transformation matrix with shape (num_mels, num_mfcc),
    normalized depending on norm

    Inputs:
        n_mfcc (int) : number of mfc coefficients to retain
        n_mels (int): number of MEL bins
        norm (str) : norm to use ('ortho' for an orthonormal matrix,
            anything else for the unnormalized DCT-II)

    Outputs:
        Tensor: The transformation matrix, to be right-multiplied to row-wise data.
    """
    outdim = n_mfcc
    dim = n_mels
    # http://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II
    n = torch.arange(dim, dtype=torch.float32)
    k = torch.arange(outdim, dtype=torch.float32)[:, None]
    dct = torch.cos(math.pi / dim * (n + 0.5) * k)
    if norm == 'ortho':
        # scale so the resulting matrix is orthonormal
        dct[0] *= 1.0 / math.sqrt(2.0)
        dct *= math.sqrt(2.0 / dim)
    else:
        # match the unnormalized DCT-II convention (factor of 2)
        dct *= 2
    # transpose so callers can right-multiply row-wise data: (n_mels, n_mfcc)
    return dct.t()
Jason Lian's avatar
more  
Jason Lian committed
257
258
259
260


def MFCC(sig, mel_spect, log_mels, s2db, dct_mat):
    # type: (Tensor, MelSpectrogram, bool, SpectrogramToDB, Tensor) -> Tensor
    """Create the Mel-frequency cepstrum coefficients from an audio signal

    By default, this calculates the MFCC on the DB-scaled Mel spectrogram.
    This is not the textbook implementation, but is implemented here to
    give consistency with librosa.

    This output depends on the maximum value in the input spectrogram, and so
    may return different values for an audio clip split into snippets vs. a
    a full clip.

    Inputs:
        sig (Tensor): Tensor of audio of size (channels [c], samples [n])
        mel_spect (MelSpectrogram): melspectrogram of sig
        log_mels (bool): whether to use log-mel spectrograms instead of db-scaled
        s2db (SpectrogramToDB): a SpectrogramToDB instance
        dct_mat (Tensor): The transformation matrix (dct matrix), to be
            right-multiplied to row-wise data
    Outputs:
        Tensor: Mel-frequency cepstrum coefficients
    """
    if log_mels:
        # the small offset guards against log(0)
        mel_spect = torch.log(mel_spect + 1e-6)
    else:
        mel_spect = s2db(mel_spect)
    # project onto the DCT basis, keeping everything on one device
    return mel_spect.matmul(dct_mat.to(mel_spect.device))


def BLC2CBL(tensor):
    # type: (Tensor) -> Tensor
    """Permute a 3d tensor from Bands x Sample length x Channels to Channels x
       Bands x Samples length

    Inputs:
        tensor (Tensor): Tensor of spectrogram with shape (BxLxC)

    Outputs:
        Tensor: Tensor of spectrogram with shape (CxBxL)
    """
    # (B, L, C) -> (C, L, B) -> (C, B, L); equivalent to permute(2, 0, 1)
    return tensor.transpose(0, 2).transpose(1, 2).contiguous()


def mu_law_encoding(x, qc):
    # type: (Tensor, int) -> Tensor
    """Encode signal based on mu-law companding.  For more info see the
    `Wikipedia Entry <https://en.wikipedia.org/wiki/%CE%9C-law_algorithm>`_

    This algorithm assumes the signal has been scaled to between -1 and 1 and
    returns a signal encoded with values from 0 to quantization_channels - 1

    Inputs:
        x (Tensor): Input tensor
        qc (int): Number of channels (i.e. quantization channels)

    Outputs:
        Tensor: Input after mu-law companding
    """
    assert(isinstance(x, torch.Tensor)), 'mu_law_encoding expects a Tensor'
    if not x.dtype.is_floating_point:
        x = x.to(torch.float)
    mu = torch.tensor(qc - 1., dtype=x.dtype)
    # logarithmic compression of the magnitude, sign preserved
    x_mu = x.sign() * torch.log1p(mu * x.abs()) / torch.log1p(mu)
    # map [-1, 1] onto the integer bins [0, qc - 1]
    return ((x_mu + 1) / 2 * mu + 0.5).long()


Jason Lian's avatar
pre  
Jason Lian committed
330
def mu_law_expanding(x_mu, qc):
    # type: (Tensor, int) -> Tensor
    """Decode mu-law encoded signal.  For more info see the
    `Wikipedia Entry <https://en.wikipedia.org/wiki/%CE%9C-law_algorithm>`_

    This expects an input with values between 0 and quantization_channels - 1
    and returns a signal scaled between -1 and 1.

    Inputs:
        x_mu (Tensor): Input tensor
        qc (int): Number of channels (i.e. quantization channels)

    Outputs:
        Tensor: Input after decoding
    """
    assert(isinstance(x_mu, torch.Tensor)), 'mu_law_expanding expects a Tensor'
    if not x_mu.dtype.is_floating_point:
        x_mu = x_mu.to(torch.float)
    mu = torch.tensor(qc - 1., dtype=x_mu.dtype)
    # map the integer bins back onto [-1, 1]
    x = (x_mu / mu) * 2 - 1.
    # invert the logarithmic compression, sign preserved
    return x.sign() * (torch.exp(x.abs() * torch.log1p(mu)) - 1.) / mu