import numpy as np from scipy.fftpack import dct # ---------- feature-window ---------- def sliding_window(x, window_size, window_shift): shape = x.shape[:-1] + (x.shape[-1] - window_size + 1, window_size) strides = x.strides + (x.strides[-1],) return np.lib.stride_tricks.as_strided(x, shape=shape, strides=strides)[::window_shift] def func_num_frames(num_samples, window_size, window_shift, snip_edges): if snip_edges: if num_samples < window_size: return 0 else: return 1 + ((num_samples - window_size) // window_shift) else: return (num_samples + (window_shift // 2)) // window_shift def func_dither(waveform, dither_value): if dither_value == 0.0: return waveform waveform += np.random.normal(size=waveform.shape).astype(waveform.dtype) * dither_value return waveform def func_remove_dc_offset(waveform): return waveform - np.mean(waveform) def func_log_energy(waveform): return np.log(np.dot(waveform, waveform).clip(min=np.finfo(waveform.dtype).eps)) def func_preemphasis(waveform, preemph_coeff): if preemph_coeff == 0.0: return waveform assert 0 < preemph_coeff <= 1 waveform[1:] -= preemph_coeff * waveform[:-1] waveform[0] -= preemph_coeff * waveform[0] return waveform def sine(M): if M < 1: return np.array([]) if M == 1: return np.ones(1, float) n = np.arange(0, M) return np.sin(np.pi*n/(M-1)) def povey(M): if M < 1: return np.array([]) if M == 1: return np.ones(1, float) n = np.arange(0, M) return (0.5 - 0.5*np.cos(2.0*np.pi*n/(M-1)))**0.85 def feature_window_function(window_type, window_size, blackman_coeff): assert window_size > 0 if window_type == 'hanning': return np.hanning(window_size) elif window_type == 'sine': return sine(window_size) elif window_type == 'hamming': return np.hamming(window_size) elif window_type == 'povey': return povey(window_size) elif window_type == 'rectangular': return np.ones(window_size) elif window_type == 'blackman': window_func = np.blackman(window_size) if blackman_coeff == 0.42: return window_func else: return window_func - 0.42 + blackman_coeff else: raise ValueError('Invalid window type {}'.format(window_type)) def process_window(window, dither, remove_dc_offset, preemphasis_coefficient, window_function, raw_energy): if dither != 0.0: window = func_dither(window, dither) if remove_dc_offset: window = func_remove_dc_offset(window) if raw_energy: log_energy = func_log_energy(window) if preemphasis_coefficient != 0.0: window = func_preemphasis(window, preemphasis_coefficient) window *= window_function if not raw_energy: log_energy = func_log_energy(window) return window, log_energy def extract_window(waveform, blackman_coeff, dither, window_size, window_shift, preemphasis_coefficient, raw_energy, remove_dc_offset, snip_edges, window_type, dtype): num_samples = len(waveform) num_frames = func_num_frames(num_samples, window_size, window_shift, snip_edges) num_samples_ = (num_frames - 1) * window_shift + window_size if snip_edges: waveform = waveform[:num_samples_] else: offset = window_shift // 2 - window_size // 2 waveform = np.concatenate([ waveform[-offset - 1::-1], waveform, waveform[:-(offset + num_samples_ - num_samples + 1):-1] ]) frames = sliding_window(waveform, window_size=window_size, window_shift=window_shift) frames = frames.astype(dtype) log_enery = np.empty(frames.shape[0], dtype=dtype) for i in range(frames.shape[0]): frames[i], log_enery[i] = process_window( window=frames[i], dither=dither, remove_dc_offset=remove_dc_offset, preemphasis_coefficient=preemphasis_coefficient, window_function=feature_window_function( window_type=window_type, window_size=window_size, blackman_coeff=blackman_coeff ).astype(dtype), raw_energy=raw_energy ) return frames, log_enery # ---------- feature-window ---------- # ---------- feature-functions ---------- def compute_spectrum(frames, n): complex_spec = np.fft.rfft(frames, n) return np.absolute(complex_spec) def compute_power_spectrum(frames, n): return np.square(compute_spectrum(frames, n)) def apply_cmvn_sliding_internal(feat, center=False, window=600, min_window=100, norm_vars=False): num_frames, feat_dim = feat.shape std = 1 if center: if num_frames <= window: mean = feat.mean(axis=0, keepdims=True).repeat(num_frames, axis=0) if norm_vars: std = feat.std(axis=0, keepdims=True).repeat(num_frames, axis=0) else: feat1 = feat[:window] feat2 = sliding_window(feat.T, window, 1) feat3 = feat[-window:] mean1 = feat1.mean(axis=0, keepdims=True).repeat(window // 2, axis=0) mean2 = feat2.mean(axis=2).T mean3 = feat3.mean(axis=0, keepdims=True).repeat((window - 1) // 2, axis=0) mean = np.concatenate([mean1, mean2, mean3]) if norm_vars: std1 = feat1.std(axis=0, keepdims=True).repeat(window // 2, axis=0) std2 = feat2.std(axis=2).T std3 = feat3.mean(axis=0, keepdims=True).repeat((window - 1) // 2, axis=0) std = np.concatenate([std1, std2, std3]) else: if num_frames <= min_window: mean = feat.mean(axis=0, keepdims=True).repeat(num_frames, axis=0) if norm_vars: std = feat.std(axis=0, keepdims=True).repeat(num_frames, axis=0) else: feat1 = feat[:min_window] mean1 = feat1.mean(axis=0, keepdims=True).repeat(min_window, axis=0) feat2_cumsum = np.cumsum(feat[:window], axis=0)[min_window:] cumcnt = np.arange(min_window + 1, min(window, num_frames) + 1, dtype=feat.dtype)[:, np.newaxis] mean2 = feat2_cumsum / cumcnt mean = np.concatenate([mean1, mean2]) if norm_vars: std1 = feat1.std(axis=0, keepdims=True).repeat(min_window, axis=0) feat2_power_cumsum = np.cumsum(np.square(feat[:window]), axis=0)[min_window:] std2 = np.sqrt(feat2_power_cumsum / cumcnt - np.square(mean2)) std = np.concatenate([std1, std2]) if num_frames > window: feat3 = sliding_window(feat.T, window, 1) mean3 = feat3.mean(axis=2).T mean = np.concatenate([mean, mean3[1:]]) if norm_vars: std3 = feat3.std(axis=2).T std = np.concatenate([std, std3[1:]]) feat = (feat - mean) / std return feat # ---------- feature-functions ---------- # ---------- mel-computations ---------- def inverse_mel_scale(mel_freq): return 700.0 * (np.exp(mel_freq / 1127.0) - 1.0) def mel_scale(freq): return 1127.0 * np.log(1.0 + freq / 700.0) def compute_mel_banks(num_bins, sample_frequency, low_freq, high_freq, n): """ Compute Mel banks. :param num_bins: Number of triangular mel-frequency bins :param sample_frequency: Waveform data sample frequency :param low_freq: Low cutoff frequency for mel bins :param high_freq: High cutoff frequency for mel bins (if <= 0, offset from Nyquist) :param n: Window size :return: Mel banks. """ assert num_bins >= 3, 'Must have at least 3 mel bins' num_fft_bins = n // 2 nyquist = 0.5 * sample_frequency if high_freq <= 0: high_freq = nyquist + high_freq assert 0 <= low_freq < high_freq <= nyquist fft_bin_width = sample_frequency / n mel_low_freq = mel_scale(low_freq) mel_high_freq = mel_scale(high_freq) mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1) mel_banks = np.zeros([num_bins, num_fft_bins + 1]) for i in range(num_bins): left_mel = mel_low_freq + mel_freq_delta * i center_mel = left_mel + mel_freq_delta right_mel = center_mel + mel_freq_delta for j in range(num_fft_bins): mel = mel_scale(fft_bin_width * j) if left_mel < mel < right_mel: if mel <= center_mel: mel_banks[i, j] = (mel - left_mel) / (center_mel - left_mel) else: mel_banks[i, j] = (right_mel - mel) / (right_mel - center_mel) return mel_banks def compute_lifter_coeffs(q, M): """ Compute liftering coefficients (scaling on cepstral coeffs) the zeroth index is C0, which is not affected. :param q: Number of lifters :param M: Number of coefficients :return: Lifters. """ if M < 1: return np.array([]) if M == 1: return np.ones(1, float) n = np.arange(0, M) return 1 + 0.5*np.sin(np.pi*n/q)*q # ---------- mel-computations ---------- # ---------- compute-fbank-feats ---------- def compute_fbank_feats( waveform, blackman_coeff=0.42, dither=1.0, energy_floor=1.0, frame_length=25, frame_shift=10, high_freq=0, low_freq=20, num_mel_bins=23, preemphasis_coefficient=0.97, raw_energy=True, remove_dc_offset=True, round_to_power_of_two=True, sample_frequency=16000, snip_edges=True, use_energy=False, use_log_fbank=True, use_power=True, window_type='povey', dtype=np.float32): """ Compute (log) Mel filter bank energies :param waveform: Input waveform. :param blackman_coeff: Constant coefficient for generalized Blackman window. (float, default = 0.42) :param dither: Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1) :param energy_floor: Floor on energy (absolute, not relative) in FBANK computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0. Suggested values: 0.1 or 1.0 (float, default = 0) :param frame_length: Frame length in milliseconds (float, default = 25) :param frame_shift: Frame shift in milliseconds (float, default = 10) :param high_freq: High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0) :param low_freq: Low cutoff frequency for mel bins (float, default = 20) :param num_mel_bins: Number of triangular mel-frequency bins (int, default = 23) :param preemphasis_coefficient: Coefficient for use in signal preemphasis (float, default = 0.97) :param raw_energy: If true, compute energy before preemphasis and windowing (bool, default = true) :param remove_dc_offset: Subtract mean from waveform on each frame (bool, default = true) :param round_to_power_of_two: If true, round window size to power of two by zero-padding input to FFT. (bool, default = true) :param sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000) :param snip_edges: If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (bool, default = true) :param use_energy: Add an extra energy output. (bool, default = false) :param use_log_fbank: If true, produce log-filterbank, else produce linear. (bool, default = true) :param use_power: If true, use power, else use magnitude. (bool, default = true) :param window_type: Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"sine"|"blackmann") (string, default = "povey") :param dtype: Type of array (np.float32|np.float64) (dtype or string, default=np.float32) :return: (Log) Mel filter bank energies. """ window_size = int(frame_length * sample_frequency * 0.001) window_shift = int(frame_shift * sample_frequency * 0.001) frames, log_energy = extract_window( waveform=waveform, blackman_coeff=blackman_coeff, dither=dither, window_size=window_size, window_shift=window_shift, preemphasis_coefficient=preemphasis_coefficient, raw_energy=raw_energy, remove_dc_offset=remove_dc_offset, snip_edges=snip_edges, window_type=window_type, dtype=dtype ) if round_to_power_of_two: n = 1 while n < window_size: n *= 2 else: n = window_size if use_power: spectrum = compute_power_spectrum(frames, n) else: spectrum = compute_spectrum(frames, n) mel_banks = compute_mel_banks( num_bins=num_mel_bins, sample_frequency=sample_frequency, low_freq=low_freq, high_freq=high_freq, n=n ).astype(dtype) feat = np.dot(spectrum, mel_banks.T) if use_log_fbank: feat = np.log(feat.clip(min=np.finfo(dtype).eps)) if use_energy: if energy_floor > 0.0: log_energy.clip(min=np.math.log(energy_floor)) return feat, log_energy return feat # ---------- compute-fbank-feats ---------- # ---------- compute-mfcc-feats ---------- def compute_mfcc_feats( waveform, blackman_coeff=0.42, cepstral_lifter=22, dither=1.0, energy_floor=0.0, frame_length=25, frame_shift=10, high_freq=0, low_freq=20, num_ceps=13, num_mel_bins=23, preemphasis_coefficient=0.97, raw_energy=True, remove_dc_offset=True, round_to_power_of_two=True, sample_frequency=16000, snip_edges=True, use_energy=True, window_type='povey', dtype=np.float32): """ Compute mel-frequency cepstral coefficients :param waveform: Input waveform. :param blackman_coeff: Constant coefficient for generalized Blackman window. (float, default = 0.42) :param cepstral_lifter: Constant that controls scaling of MFCCs (float, default = 22) :param dither: Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1) :param energy_floor: Floor on energy (absolute, not relative) in MFCC computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0. Suggested values: 0.1 or 1.0 (float, default = 0) :param frame_length: Frame length in milliseconds (float, default = 25) :param frame_shift: Frame shift in milliseconds (float, default = 10) :param high_freq: High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0) :param low_freq: Low cutoff frequency for mel bins (float, default = 20) :param num_ceps: Number of cepstra in MFCC computation (including C0) (int, default = 13) :param num_mel_bins: Number of triangular mel-frequency bins (int, default = 23) :param preemphasis_coefficient: Coefficient for use in signal preemphasis (float, default = 0.97) :param raw_energy: If true, compute energy before preemphasis and windowing (bool, default = true) :param remove_dc_offset: Subtract mean from waveform on each frame (bool, default = true) :param round_to_power_of_two: If true, round window size to power of two by zero-padding input to FFT. (bool, default = true) :param sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000) :param snip_edges: If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (bool, default = true) :param use_energy: Use energy (not C0) in MFCC computation (bool, default = true) :param window_type: Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"sine"|"blackmann") (string, default = "povey") :param dtype: Type of array (np.float32|np.float64) (dtype or string, default=np.float32) :return: Mel-frequency cespstral coefficients. """ feat, log_energy = compute_fbank_feats( waveform=waveform, blackman_coeff=blackman_coeff, dither=dither, energy_floor=energy_floor, frame_length=frame_length, frame_shift=frame_shift, high_freq=high_freq, low_freq=low_freq, num_mel_bins=num_mel_bins, preemphasis_coefficient=preemphasis_coefficient, raw_energy=raw_energy, remove_dc_offset=remove_dc_offset, round_to_power_of_two=round_to_power_of_two, sample_frequency=sample_frequency, snip_edges=snip_edges, use_energy=use_energy, use_log_fbank=True, use_power=True, window_type=window_type, dtype=dtype ) feat = dct(feat, type=2, axis=1, norm='ortho')[:, :num_ceps] lifter_coeffs = compute_lifter_coeffs(cepstral_lifter, num_ceps).astype(dtype) feat = feat * lifter_coeffs if use_energy: feat[:, 0] = log_energy return feat # ---------- compute-mfcc-feats ---------- # ---------- apply-cmvn-sliding ---------- def apply_cmvn_sliding(feat, center=False, window=600, min_window=100, norm_vars=False): """ Apply sliding-window cepstral mean (and optionally variance) normalization :param feat: Cepstrum. :param center: If true, use a window centered on the current frame (to the extent possible, modulo end effects). If false, window is to the left. (bool, default = false) :param window: Window in frames for running average CMN computation (int, default = 600) :param min_window: Minimum CMN window used at start of decoding (adds latency only at start). Only applicable if center == false, ignored if center==true (int, default = 100) :param norm_vars: If true, normalize variance to one. (bool, default = false) :return: Normalized cepstrum. """ # double-precision feat = apply_cmvn_sliding_internal( feat=feat.astype(np.float64), center=center, window=window, min_window=min_window, norm_vars=norm_vars ).astype(feat.dtype) return feat # ---------- apply-cmvn-sliding ----------