Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Torchaudio
Commits
4f7886d1
Commit
4f7886d1
authored
Jul 02, 2019
by
jamarshon
Committed by
cpuhrsch
Jul 02, 2019
Browse files
Kaldi Fbank (#127)
parent
9bd633e3
Changes
102
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
375 additions
and
43 deletions
+375
-43
test/compliance/utils.py
test/compliance/utils.py
+27
-0
torchaudio/compliance/kaldi.py
torchaudio/compliance/kaldi.py
+348
-43
No files found.
test/compliance/utils.py
0 → 100644
View file @
4f7886d1
import
random
import
torchaudio
# Prefix tokens that `parse` hands back unchanged as strings instead of
# converting them to a bool or a number.
TEST_PREFIX = ['fbank', 'spec']
def generate_rand_boolean():
    """Return the string 'true' or 'false', chosen at random.

    The value is a string (not a bool) drawn with ``random.randint(0, 1)``:
    1 maps to 'true' and 0 maps to 'false'.
    """
    coin = random.randint(0, 1)
    if coin:
        return 'true'
    return 'false'
def generate_rand_window_type():
    """Return one entry of ``torchaudio.compliance.kaldi.WINDOWS`` picked at random.

    The index is drawn with ``random.randint`` over the full (inclusive) index
    range of the list.
    """
    window_types = torchaudio.compliance.kaldi.WINDOWS
    last_index = len(window_types) - 1
    return window_types[random.randint(0, last_index)]
def parse(token):
    """Convert an argument extracted from a filepath to its Python value.

    Conversion rules, applied in order:
      * 'true' / 'false'  -> True / False
      * a kaldi window name or a TEST_PREFIX entry -> returned unchanged
      * a token containing '.' -> float
      * anything else -> int
    """
    for literal, value in (('true', True), ('false', False)):
        if token == literal:
            return value

    is_known_name = token in torchaudio.compliance.kaldi.WINDOWS or token in TEST_PREFIX
    if is_known_name:
        return token

    # a decimal point distinguishes float arguments from integer ones
    convert = float if '.' in token else int
    return convert(token)
torchaudio/compliance/kaldi.py
View file @
4f7886d1
...
...
@@ -4,11 +4,19 @@ import torch
__all__
=
[
'spectrogram'
'fbank'
,
'get_mel_banks'
,
'inverse_mel_scale'
,
'inverse_mel_scale_scalar'
,
'mel_scale'
,
'mel_scale_scalar'
,
'spectrogram'
,
'vtln_warp_freq'
,
'vtln_warp_mel_freq'
,
]
# numeric_limits<float>::epsilon()
EPSILON
=
torch
.
tensor
(
1.19209290e-07
,
dtype
=
torch
.
get_default_dtype
())
# numeric_limits<float>::epsilon()
1.1920928955078125e-07
EPSILON
=
torch
.
tensor
(
torch
.
finfo
(
torch
.
float
).
eps
,
dtype
=
torch
.
get_default_dtype
())
# 1 milliseconds = 0.001 seconds
MILLISECONDS_TO_SECONDS
=
0.001
...
...
@@ -18,6 +26,7 @@ HANNING = 'hanning'
POVEY
=
'povey'
RECTANGULAR
=
'rectangular'
BLACKMAN
=
'blackman'
WINDOWS
=
[
HAMMING
,
HANNING
,
POVEY
,
RECTANGULAR
,
BLACKMAN
]
def
_next_power_of_2
(
x
):
...
...
@@ -101,43 +110,9 @@ def _get_log_energy(strided_input, epsilon, energy_floor):
torch
.
tensor
(
math
.
log
(
energy_floor
),
dtype
=
torch
.
get_default_dtype
()))
def
spectrogram
(
sig
,
blackman_coeff
=
0.42
,
channel
=-
1
,
dither
=
1.0
,
energy_floor
=
0.0
,
frame_length
=
25.0
,
frame_shift
=
10.0
,
min_duration
=
0.0
,
preemphasis_coefficient
=
0.97
,
raw_energy
=
True
,
remove_dc_offset
=
True
,
round_to_power_of_two
=
True
,
sample_frequency
=
16000.0
,
snip_edges
=
True
,
subtract_mean
=
False
,
window_type
=
POVEY
):
"""Create a spectrogram from a raw audio signal. This matches the input/output of Kaldi's
compute-spectrogram-feats.
Inputs:
sig (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2)
blackman_coeff (float): Constant coefficient for generalized Blackman window. (default = 0.42)
channel (int): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (default = -1)
dither (float): Dithering constant (0.0 means no dither). If you turn this off, you should set
the energy_floor option, e.g. to 1.0 or 0.1 (default = 1.0)
energy_floor (float): Floor on energy (absolute, not relative) in Spectrogram computation. Caution:
this floor is applied to the zeroth component, representing the total signal energy. The floor on the
individual spectrogram elements is fixed at std::numeric_limits<float>::epsilon(). (default = 0.0)
frame_length (float): Frame length in milliseconds (default = 25.0)
frame_shift (float): Frame shift in milliseconds (default = 10.0)
min_duration (float): Minimum duration of segments to process (in seconds). (default = 0.0)
preemphasis_coefficient (float): Coefficient for use in signal preemphasis (default = 0.97)
raw_energy (bool): If True, compute energy before preemphasis and windowing (default = True)
remove_dc_offset: Subtract mean from waveform on each frame (default = True)
round_to_power_of_two (bool): If True, round window size to power of two by zero-padding input
to FFT. (default = True)
sample_frequency (float): Waveform data sample frequency (must match the waveform file, if
specified there) (default = 16000.0)
snip_edges (bool): If True, end effects will be handled by outputting only frames that completely fit
in the file, and the number of frames depends on the frame_length. If False, the number of frames
depends only on the frame_shift, and we reflect the data at the ends. (default = true)
subtract_mean (bool): Subtract mean of each feature file [CMS]; not recommended to do
it this way. (default = False)
window_type (str): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman') (default = 'povey')
Outputs:
Tensor: a spectrogram identical to what Kaldi would output. The shape is (, `padded_window_size` // 2 + 1)
def
_get_waveform_and_window_properties
(
sig
,
channel
,
sample_frequency
,
frame_shift
,
frame_length
,
round_to_power_of_two
,
preemphasis_coefficient
):
"""Gets the waveform and window properties
"""
waveform
=
sig
[
max
(
channel
,
0
),
:]
# size (n)
window_shift
=
int
(
sample_frequency
*
frame_shift
*
MILLISECONDS_TO_SECONDS
)
...
...
@@ -150,11 +125,16 @@ def spectrogram(
'`window_size` must be divisible by two. use `round_to_power_of_two` or change `frame_length`'
assert
0.
<=
preemphasis_coefficient
<=
1.0
,
'`preemphasis_coefficient` must be between [0,1]'
assert
sample_frequency
>
0
,
'`sample_frequency` must be greater than zero'
return
waveform
,
window_shift
,
window_size
,
padded_window_size
if
len
(
waveform
)
<
min_duration
*
sample_frequency
:
# signal is too short
return
torch
.
empty
(
0
)
def
_get_window
(
waveform
,
padded_window_size
,
window_size
,
window_shift
,
window_type
,
blackman_coeff
,
snip_edges
,
raw_energy
,
energy_floor
,
dither
,
remove_dc_offset
,
preemphasis_coefficient
):
"""Gets a window and its log energy
Outputs:
strided_input (Tensor): size (m, padded_window_size)
signal_log_energy (Tensor): size (m)
"""
# size (m, window_size)
strided_input
=
_get_strided
(
waveform
,
window_size
,
window_shift
,
snip_edges
)
...
...
@@ -195,6 +175,59 @@ def spectrogram(
if
not
raw_energy
:
signal_log_energy
=
_get_log_energy
(
strided_input
,
EPSILON
,
energy_floor
)
# size (m)
return
strided_input
,
signal_log_energy
def
spectrogram
(
sig
,
blackman_coeff
=
0.42
,
channel
=-
1
,
dither
=
1.0
,
energy_floor
=
0.0
,
frame_length
=
25.0
,
frame_shift
=
10.0
,
min_duration
=
0.0
,
preemphasis_coefficient
=
0.97
,
raw_energy
=
True
,
remove_dc_offset
=
True
,
round_to_power_of_two
=
True
,
sample_frequency
=
16000.0
,
snip_edges
=
True
,
subtract_mean
=
False
,
window_type
=
POVEY
):
"""Create a spectrogram from a raw audio signal. This matches the input/output of Kaldi's
compute-spectrogram-feats.
Inputs:
sig (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2)
blackman_coeff (float): Constant coefficient for generalized Blackman window. (default = 0.42)
channel (int): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (default = -1)
dither (float): Dithering constant (0.0 means no dither). If you turn this off, you should set
the energy_floor option, e.g. to 1.0 or 0.1 (default = 1.0)
energy_floor (float): Floor on energy (absolute, not relative) in Spectrogram computation. Caution:
this floor is applied to the zeroth component, representing the total signal energy. The floor on the
individual spectrogram elements is fixed at std::numeric_limits<float>::epsilon(). (default = 0.0)
frame_length (float): Frame length in milliseconds (default = 25.0)
frame_shift (float): Frame shift in milliseconds (default = 10.0)
min_duration (float): Minimum duration of segments to process (in seconds). (default = 0.0)
preemphasis_coefficient (float): Coefficient for use in signal preemphasis (default = 0.97)
raw_energy (bool): If True, compute energy before preemphasis and windowing (default = True)
remove_dc_offset: Subtract mean from waveform on each frame (default = True)
round_to_power_of_two (bool): If True, round window size to power of two by zero-padding input
to FFT. (default = True)
sample_frequency (float): Waveform data sample frequency (must match the waveform file, if
specified there) (default = 16000.0)
snip_edges (bool): If True, end effects will be handled by outputting only frames that completely fit
in the file, and the number of frames depends on the frame_length. If False, the number of frames
depends only on the frame_shift, and we reflect the data at the ends. (default = True)
subtract_mean (bool): Subtract mean of each feature file [CMS]; not recommended to do
it this way. (default = False)
window_type (str): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman') (default = 'povey')
Outputs:
Tensor: a spectrogram identical to what Kaldi would output. The shape is (m, `padded_window_size` // 2 + 1)
where m is calculated in _get_strided
"""
waveform
,
window_shift
,
window_size
,
padded_window_size
=
_get_waveform_and_window_properties
(
sig
,
channel
,
sample_frequency
,
frame_shift
,
frame_length
,
round_to_power_of_two
,
preemphasis_coefficient
)
if
len
(
waveform
)
<
min_duration
*
sample_frequency
:
# signal is too short
return
torch
.
empty
(
0
)
strided_input
,
signal_log_energy
=
_get_window
(
waveform
,
padded_window_size
,
window_size
,
window_shift
,
window_type
,
blackman_coeff
,
snip_edges
,
raw_energy
,
energy_floor
,
dither
,
remove_dc_offset
,
preemphasis_coefficient
)
# size (m, padded_window_size // 2 + 1, 2)
fft
=
torch
.
rfft
(
strided_input
,
1
,
normalized
=
False
,
onesided
=
True
)
...
...
@@ -207,3 +240,275 @@ def spectrogram(
power_spectrum
=
power_spectrum
-
col_means
return
power_spectrum
def inverse_mel_scale_scalar(mel_freq):
    # type: (float) -> float
    """Map a single mel value to ``700 * (exp(mel_freq / 1127) - 1)``.

    This is the scalar inverse of ``mel_scale_scalar``.
    """
    exponent = mel_freq / 1127.0
    return 700.0 * (math.exp(exponent) - 1.0)
def inverse_mel_scale(mel_freq):
    """Elementwise ``700 * (exp(mel_freq / 1127) - 1)`` for a tensor of mel values.

    ``mel_freq`` must support ``.exp()`` (e.g. a torch Tensor); the result has
    the same shape as the input.
    """
    scaled = mel_freq / 1127.0
    return 700.0 * (scaled.exp() - 1.0)
def mel_scale_scalar(freq):
    # type: (float) -> float
    """Map a single frequency value to ``1127 * ln(1 + freq / 700)``.

    This is the scalar inverse of ``inverse_mel_scale_scalar``.
    """
    ratio = 1.0 + freq / 700.0
    return 1127.0 * math.log(ratio)
def mel_scale(freq):
    """Elementwise ``1127 * ln(1 + freq / 700)`` for a tensor of frequency values.

    ``freq`` must yield an object with ``.log()`` after the arithmetic (e.g. a
    torch Tensor); the result has the same shape as the input.
    """
    ratio = 1.0 + freq / 700.0
    return 1127.0 * ratio.log()
def vtln_warp_freq(vtln_low_cutoff, vtln_high_cutoff, low_freq, high_freq,
                   vtln_warp_factor, freq):
    """Apply a three-piece linear VTLN warp to a tensor of frequencies.

    The warp F is not the same as HTK's, but takes similar inputs and never
    produces empty bins. It is defined on [low_freq, high_freq] and satisfies

        F(low_freq) == low_freq
        F(high_freq) == high_freq

    F is continuous and piecewise linear with two inflection points, l and h
    (both measured in unwarped frequency). Between them, for l <= f <= h,
    F(f) = f / vtln_warp_factor.

    The upper inflection point h satisfies max(h, F(h)) == vtln_high_cutoff.
    Since F(h) == h / vtln_warp_factor, this gives
        h = vtln_high_cutoff / max(1, 1 / vtln_warp_factor)
          = vtln_high_cutoff * min(1, vtln_warp_factor).

    The lower inflection point l satisfies min(l, F(l)) == vtln_low_cutoff,
    which gives
        l = vtln_low_cutoff / min(1, 1 / vtln_warp_factor)
          = vtln_low_cutoff * max(1, vtln_warp_factor).

    Frequencies outside [low_freq, high_freq] are returned unchanged.

    Inputs:
        vtln_low_cutoff (float): lower frequency cutoff for VTLN
        vtln_high_cutoff (float): upper frequency cutoff for VTLN
        low_freq (float): lower frequency cutoff in mel computation
        high_freq (float): upper frequency cutoff in mel computation
        vtln_warp_factor (float): VTLN warp factor
        freq (Tensor): frequencies to warp

    Outputs:
        Tensor: freq after the VTLN warp, same shape as `freq`
    """
    assert vtln_low_cutoff > low_freq, 'be sure to set the vtln_low option higher than low_freq'
    assert vtln_high_cutoff < high_freq, 'be sure to set the vtln_high option lower than high_freq [or negative]'

    # inflection points of the warp, in unwarped frequency
    lower_knee = vtln_low_cutoff * max(1.0, vtln_warp_factor)
    upper_knee = vtln_high_cutoff * min(1.0, vtln_warp_factor)

    # slope of the centre segment, and the warped value at each knee
    center_slope = 1.0 / vtln_warp_factor
    warped_lower_knee = center_slope * lower_knee  # F(l)
    warped_upper_knee = center_slope * upper_knee  # F(h)
    assert lower_knee > low_freq and upper_knee < high_freq

    # slopes of the outer segments, which tie F to the identity at both ends
    left_slope = (warped_lower_knee - low_freq) / (lower_knee - low_freq)
    right_slope = (high_freq - warped_upper_knee) / (high_freq - upper_knee)

    below_lower_knee = freq < lower_knee
    below_upper_knee = freq < upper_knee
    at_or_above_upper_knee = freq >= upper_knee
    out_of_range = (freq < low_freq) | (freq > high_freq)

    warped = torch.empty_like(freq)
    # The masks overlap, so the write order matters: later writes override
    # earlier ones (right segment, centre, left segment, then pass-through).
    warped[at_or_above_upper_knee] = high_freq + right_slope * (freq[at_or_above_upper_knee] - high_freq)
    warped[below_upper_knee] = center_slope * freq[below_upper_knee]
    warped[below_lower_knee] = low_freq + left_slope * (freq[below_lower_knee] - low_freq)
    warped[out_of_range] = freq[out_of_range]
    return warped
def vtln_warp_mel_freq(vtln_low_cutoff, vtln_high_cutoff, low_freq, high_freq,
                       vtln_warp_factor, mel_freq):
    """Apply the VTLN warp to frequencies expressed on the mel scale.

    The mel values are converted with `inverse_mel_scale`, warped with
    `vtln_warp_freq`, and converted back with `mel_scale`.

    Inputs:
        vtln_low_cutoff (float): lower frequency cutoff for VTLN
        vtln_high_cutoff (float): upper frequency cutoff for VTLN
        low_freq (float): lower frequency cutoff in mel computation
        high_freq (float): upper frequency cutoff in mel computation
        vtln_warp_factor (float): VTLN warp factor
        mel_freq (Tensor): frequencies in mel

    Outputs:
        Tensor: mel_freq after the VTLN warp
    """
    unwarped = inverse_mel_scale(mel_freq)
    warped = vtln_warp_freq(vtln_low_cutoff, vtln_high_cutoff, low_freq, high_freq,
                            vtln_warp_factor, unwarped)
    return mel_scale(warped)
def get_mel_banks(num_bins, window_length_padded, sample_freq, low_freq, high_freq,
                  vtln_low, vtln_high, vtln_warp_factor):
    """Build the triangular mel filterbank, optionally VTLN-warped.

    Inputs:
        num_bins (int): number of triangular mel bins (must be at least 3)
        window_length_padded (int): padded window length; must be even. The
            filterbank covers ``window_length_padded // 2`` FFT bins.
        sample_freq (float): sample frequency; Nyquist is taken as half of it
        low_freq (float): low cutoff frequency for the mel bins
        high_freq (float): high cutoff frequency for the mel bins
            (if <= 0, treated as an offset from Nyquist)
        vtln_low (float): low inflection point of the VTLN warp
        vtln_high (float): high inflection point of the VTLN warp
            (if < 0, treated as an offset from Nyquist)
        vtln_warp_factor (float): VTLN warp factor; 1.0 disables warping

    Outputs:
        bins (Tensor): melbank of size (num_bins, num_fft_bins)
        center_freqs (Tensor): center frequencies of the bins, of size (num_bins, 1)

    Raises:
        AssertionError: if any of the option checks below fails.
    """
    # The message (and Kaldi's `num_bins < 3` check) allow exactly 3 bins.
    assert num_bins >= 3, 'Must have at least 3 mel bins'
    assert window_length_padded % 2 == 0
    # floor division keeps the bin count an integer on Python 3 as well
    num_fft_bins = window_length_padded // 2
    nyquist = 0.5 * sample_freq

    if high_freq <= 0.0:
        high_freq += nyquist

    assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), \
        ('Bad values in options: low-freq %f and high-freq %f vs. nyquist %f' % (low_freq, high_freq, nyquist))

    # fft-bin width [think of it as Nyquist-freq / half-window-length]
    fft_bin_width = sample_freq / window_length_padded
    mel_low_freq = mel_scale_scalar(low_freq)
    mel_high_freq = mel_scale_scalar(high_freq)

    # divide by num_bins+1 in next line because of end-effects where the bins
    # spread out to the sides.
    mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1)

    if vtln_high < 0.0:
        vtln_high += nyquist

    assert vtln_warp_factor == 1.0 or ((low_freq < vtln_low < high_freq) and
                                       (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)), \
        ('Bad values in options: vtln-low %f and vtln-high %f, versus low-freq %f and high-freq %f' %
            (vtln_low, vtln_high, low_freq, high_freq))

    # column vector of bin indices so the per-bin edges broadcast against `mel`
    bin_index = torch.arange(num_bins, dtype=torch.get_default_dtype()).unsqueeze(1)
    left_mel = mel_low_freq + bin_index * mel_freq_delta  # size (num_bins, 1)
    center_mel = mel_low_freq + (bin_index + 1.0) * mel_freq_delta  # size (num_bins, 1)
    right_mel = mel_low_freq + (bin_index + 2.0) * mel_freq_delta  # size (num_bins, 1)

    if vtln_warp_factor != 1.0:
        left_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, left_mel)
        center_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, center_mel)
        right_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, right_mel)

    center_freqs = inverse_mel_scale(center_mel)  # size (num_bins, 1)
    # mel value of every FFT bin, size (1, num_fft_bins)
    mel = mel_scale(fft_bin_width * torch.arange(num_fft_bins, dtype=torch.get_default_dtype())).unsqueeze(0)

    # size (num_bins, num_fft_bins)
    up_slope = (mel - left_mel) / (center_mel - left_mel)
    down_slope = (right_mel - mel) / (right_mel - center_mel)

    if vtln_warp_factor == 1.0:
        # left_mel < center_mel < right_mel so we can min the two slopes and clamp negative values
        bins = torch.min(up_slope, down_slope).clamp(min=0.0)
    else:
        # warping can move the order of left_mel, center_mel, right_mel anywhere
        bins = torch.zeros_like(up_slope)
        up_idx = torch.gt(mel, left_mel) & torch.le(mel, center_mel)  # left_mel < mel <= center_mel
        down_idx = torch.gt(mel, center_mel) & torch.lt(mel, right_mel)  # center_mel < mel < right_mel
        bins[up_idx] = up_slope[up_idx]
        bins[down_idx] = down_slope[down_idx]

    return bins, center_freqs
def fbank(sig, blackman_coeff=0.42, channel=-1, dither=1.0, energy_floor=0.0,
          frame_length=25.0, frame_shift=10.0, high_freq=0.0, htk_compat=False,
          low_freq=20.0, min_duration=0.0, num_mel_bins=23,
          preemphasis_coefficient=0.97, raw_energy=True, remove_dc_offset=True,
          round_to_power_of_two=True, sample_frequency=16000.0, snip_edges=True,
          subtract_mean=False, use_energy=False, use_log_fbank=True, use_power=True,
          vtln_high=-500.0, vtln_low=100.0, vtln_warp=1.0, window_type='povey'):
    """Create a fbank from a raw audio signal. This matches the input/output of Kaldi's
    compute-fbank-feats.

    Inputs:
        sig (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2)
        blackman_coeff (float): Constant coefficient for generalized Blackman window. (default = 0.42)
        channel (int): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (default = -1)
        dither (float): Dithering constant (0.0 means no dither). If you turn this off, you should set
            the energy_floor option, e.g. to 1.0 or 0.1 (default = 1.0)
        energy_floor (float): Floor on energy (absolute, not relative) in Spectrogram computation. Caution:
            this floor is applied to the zeroth component, representing the total signal energy. The floor on the
            individual spectrogram elements is fixed at std::numeric_limits<float>::epsilon(). (default = 0.0)
        frame_length (float): Frame length in milliseconds (default = 25.0)
        frame_shift (float): Frame shift in milliseconds (default = 10.0)
        high_freq (float): High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (default = 0.0)
        htk_compat (bool): If True, put energy last. Warning: not sufficient to get HTK compatible features
            (need to change other parameters). (default = False)
        low_freq (float): Low cutoff frequency for mel bins (default = 20.0)
        min_duration (float): Minimum duration of segments to process (in seconds). (default = 0.0)
        num_mel_bins (int): Number of triangular mel-frequency bins (default = 23)
        preemphasis_coefficient (float): Coefficient for use in signal preemphasis (default = 0.97)
        raw_energy (bool): If True, compute energy before preemphasis and windowing (default = True)
        remove_dc_offset (bool): Subtract mean from waveform on each frame (default = True)
        round_to_power_of_two (bool): If True, round window size to power of two by zero-padding input
            to FFT. (default = True)
        sample_frequency (float): Waveform data sample frequency (must match the waveform file, if
            specified there) (default = 16000.0)
        snip_edges (bool): If True, end effects will be handled by outputting only frames that completely fit
            in the file, and the number of frames depends on the frame_length. If False, the number of frames
            depends only on the frame_shift, and we reflect the data at the ends. (default = True)
        subtract_mean (bool): Subtract mean of each feature file [CMS]; not recommended to do
            it this way. (default = False)
        use_energy (bool): Add an extra dimension with energy to the FBANK output. (default = False)
        use_log_fbank (bool): If True, produce log-filterbank, else produce linear. (default = True)
        use_power (bool): If True, use power, else use magnitude. (default = True)
        vtln_high (float): High inflection point in piecewise linear VTLN warping function (if
            negative, offset from high-mel-freq) (default = -500.0)
        vtln_low (float): Low inflection point in piecewise linear VTLN warping function (default = 100.0)
        vtln_warp (float): Vtln warp factor (only applicable if vtln_map not specified) (default = 1.0)
        window_type (str): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman')
            (default = 'povey')

    Outputs:
        Tensor: a fbank identical to what Kaldi would output. The shape is (m, `num_mel_bins` + `use_energy`)
        where m is calculated in _get_strided. An empty tensor is returned when the signal is
        shorter than `min_duration`.
    """
    window_properties = _get_waveform_and_window_properties(
        sig, channel, sample_frequency, frame_shift, frame_length,
        round_to_power_of_two, preemphasis_coefficient)
    waveform, window_shift, window_size, padded_window_size = window_properties

    if len(waveform) < min_duration * sample_frequency:
        # signal is too short
        return torch.empty(0)

    # strided_input, size (m, padded_window_size) and signal_log_energy, size (m)
    strided_input, signal_log_energy = _get_window(
        waveform, padded_window_size, window_size, window_shift, window_type,
        blackman_coeff, snip_edges, raw_energy, energy_floor, dither,
        remove_dc_offset, preemphasis_coefficient)

    # real/imaginary pairs, size (m, padded_window_size // 2 + 1, 2)
    fft = torch.rfft(strided_input, 1, normalized=False, onesided=True)

    # squared magnitude per frequency bin, size (m, 1, padded_window_size // 2 + 1)
    spectrum = fft.pow(2).sum(2).unsqueeze(1)
    if not use_power:
        # magnitude instead of power
        spectrum = spectrum.pow(0.5)

    # triangular filters, size (num_mel_bins, padded_window_size // 2)
    mel_filters, _ = get_mel_banks(
        num_mel_bins, padded_window_size, sample_frequency,
        low_freq, high_freq, vtln_low, vtln_high, vtln_warp)

    # pad right column with zeros and add dimension, size (1, num_mel_bins, padded_window_size // 2 + 1)
    mel_filters = torch.nn.functional.pad(mel_filters, (0, 1), mode='constant', value=0)
    mel_filters = mel_filters.unsqueeze(0)

    # weight the spectrum by every mel filter and sum over frequency, size (m, num_mel_bins)
    features = (spectrum * mel_filters).sum(dim=2)

    if use_log_fbank:
        # avoid log of zero (which should be prevented anyway by dithering)
        features = torch.max(features, EPSILON).log()

    if use_energy:
        energy_column = signal_log_energy.unsqueeze(1)  # size (m, 1)
        # energy goes in the last column when htk_compat is set, otherwise in the first;
        # either way the result has size (m, num_mel_bins + 1)
        if htk_compat:
            ordered = (features, energy_column)
        else:
            ordered = (energy_column, features)
        features = torch.cat(ordered, dim=1)

    if subtract_mean:
        # per-column mean over all frames, size (1, num_mel_bins + use_energy)
        col_means = features.mean(dim=0).unsqueeze(0)
        features = features - col_means

    return features
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment