# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the speech segment class."""
import numpy as np
from .audio import AudioSegment
class SpeechSegment(AudioSegment):
"""Speech Segment with Text
Args:
AudioSegment (AudioSegment): Audio Segment
"""
def __init__(self,
samples,
sample_rate,
transcript,
tokens=None,
token_ids=None):
"""Speech segment abstraction, a subclass of AudioSegment,
with an additional transcript.
Args:
samples (ndarray.float32): Audio samples [num_samples x num_channels].
sample_rate (int): Audio sample rate.
transcript (str): Transcript text for the speech.
tokens (List[str], optional): Transcript tokens for the speech.
token_ids (List[int], optional): Transcript token ids for the speech.
"""
AudioSegment.__init__(self, samples, sample_rate)
self._transcript = transcript
# must init `tokens` with `token_ids` at the same time
self._tokens = tokens
self._token_ids = token_ids
def __eq__(self, other):
"""Return whether two objects are equal.
Returns:
bool: True, when equal to other
"""
if not AudioSegment.__eq__(self, other):
return False
if self._transcript != other._transcript:
return False
if self.has_token and other.has_token:
if self._tokens != other._tokens:
return False
if self._token_ids != other._token_ids:
return False
return True
def __ne__(self, other):
"""Return whether two objects are unequal."""
return not self.__eq__(other)
@classmethod
def from_file(cls,
filepath,
transcript,
tokens=None,
token_ids=None,
infos=None):
"""Create speech segment from audio file and corresponding transcript.
Args:
filepath (str|file): Filepath or file object to audio file.
transcript (str): Transcript text for the speech.
tokens (List[str], optional): text tokens. Defaults to None.
token_ids (List[int], optional): text token ids. Defaults to None.
infos (TarLocalData, optional): tar2obj and tar2infos. Defaults to None.
Returns:
SpeechSegment: Speech segment instance.
"""
audio = AudioSegment.from_file(filepath, infos)
return cls(audio.samples, audio.sample_rate, transcript, tokens,
token_ids)
@classmethod
def from_bytes(cls, bytes, transcript, tokens=None, token_ids=None):
"""Create speech segment from a byte string and corresponding
Args:
filepath (str|file): Filepath or file object to audio file.
transcript (str): Transcript text for the speech.
tokens (List[str], optional): text tokens. Defaults to None.
token_ids (List[int], optional): text token ids. Defaults to None.
Returns:
SpeechSegment: Speech segment instance.
"""
audio = AudioSegment.from_bytes(bytes)
return cls(audio.samples, audio.sample_rate, transcript, tokens,
token_ids)
@classmethod
def from_pcm(cls, samples, sample_rate, transcript,
tokens=None, token_ids=None):
"""Create speech segment from pcm on online mode
Args:
samples (numpy.ndarray): Audio samples [num_samples x num_channels].
sample_rate (int): Audio sample rate.
transcript (str): Transcript text for the speech.
tokens (List[str], optional): text tokens. Defaults to None.
token_ids (List[int], optional): text token ids. Defaults to None.
Returns:
SpeechSegment: Speech segment instance.
"""
audio = AudioSegment.from_pcm(samples, sample_rate)
return cls(audio.samples, audio.sample_rate, transcript, tokens,
token_ids)
@classmethod
def concatenate(cls, *segments):
"""Concatenate an arbitrary number of speech segments together, both
audio and transcript will be concatenated.
:param *segments: Input speech segments to be concatenated.
:type *segments: tuple of SpeechSegment
:return: Speech segment instance.
:rtype: SpeechSegment
:raises ValueError: If the number of segments is zero, or if the
sample_rate of any two segments does not match.
:raises TypeError: If any segment is not SpeechSegment instance.
"""
if len(segments) == 0:
raise ValueError("No speech segments are given to concatenate.")
sample_rate = segments[0]._sample_rate
transcripts = ""
tokens = []
token_ids = []
for seg in segments:
if sample_rate != seg._sample_rate:
raise ValueError("Can't concatenate segments with "
"different sample rates")
if not isinstance(seg, cls):
raise TypeError("Only speech segments of the same type "
"instance can be concatenated.")
transcripts += seg._transcript
if seg.has_token:
tokens += seg._tokens
token_ids += seg._token_ids
samples = np.concatenate([seg.samples for seg in segments])
return cls(samples, sample_rate, transcripts, tokens, token_ids)
@classmethod
def slice_from_file(cls,
filepath,
transcript,
tokens=None,
token_ids=None,
start=None,
end=None):
"""Loads a small section of an speech without having to load
the entire file into the memory which can be incredibly wasteful.
:param filepath: Filepath or file object to audio file.
:type filepath: str|file
:param start: Start time in seconds. If start is negative, it wraps
around from the end. If not provided, this function
reads from the very beginning.
:type start: float
:param end: End time in seconds. If end is negative, it wraps around
from the end. If not provided, the default behavior is
to read to the end of the file.
:type end: float
:param transcript: Transcript text for the speech. If not provided,
the default is an empty string.
:type transcript: str
:return: SpeechSegment instance of the specified slice of the input
speech file.
:rtype: SpeechSegment
"""
audio = AudioSegment.slice_from_file(filepath, start, end)
return cls(audio.samples, audio.sample_rate, transcript, tokens,
token_ids)
@classmethod
def make_silence(cls, duration, sample_rate):
"""Creates a silent speech segment of the given duration and
sample rate, transcript will be an empty string.
Args:
duration (float): Length of silence in seconds.
sample_rate (float): Sample rate.
Returns:
SpeechSegment: Silence of the given duration.
"""
audio = AudioSegment.make_silence(duration, sample_rate)
return cls(audio.samples, audio.sample_rate, "")
@property
def has_token(self):
if self._tokens and self._token_ids:
return True
return False
@property
def transcript(self):
"""Return the transcript text.
Returns:
str: Transcript text for the speech.
"""
return self._transcript
@property
def tokens(self):
"""Return the transcript text tokens.
Returns:
List[str]: text tokens.
"""
return self._tokens
@property
def token_ids(self):
"""Return the transcript text token ids.
Returns:
List[int]: text token ids.
"""
return self._token_ids
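# Hedged usage sketch (not part of the original file): build two segments from
# synthetic PCM and concatenate them. Runnable only inside the package, since
# this module uses relative imports; the token values are illustrative.
if __name__ == "__main__":
    rate = 16000
    left = SpeechSegment.from_pcm(
        np.zeros(rate, dtype=np.float32), rate, "hello",
        tokens=["hello"], token_ids=[42])
    right = SpeechSegment.from_pcm(
        np.zeros(rate, dtype=np.float32), rate, " world",
        tokens=[" world"], token_ids=[43])
    both = SpeechSegment.concatenate(left, right)
    assert both.transcript == "hello world"
    assert both.token_ids == [42, 43]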
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains data helper functions."""
import json
import math
import sys
import tarfile
from collections import namedtuple
from typing import List, Optional, Text
import jsonlines
import numpy as np
__all__ = [
"load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs",
"max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS",
"EOS", "UNK", "BLANK", "MASKCTC", "SPACE", "convert_samples_to_float32",
"convert_samples_from_float32"
]
IGNORE_ID = -1
# `sos` and `eos` share the same token
SOS = "<eos>"
EOS = SOS
UNK = "<unk>"
BLANK = "<blank>"
MASKCTC = "<mask>"
SPACE = "<space>"
def load_dict(dict_path: Optional[Text],
maskctc=False) -> Optional[List[Text]]:
if dict_path is None:
return None
with open(dict_path, "r") as f:
dictionary = f.readlines()
# first token is `<blank>`
# multi line: `<blank> 0\n`
# one line: `<blank>`
# space is replaced with <space>
char_list = [entry[:-1].split(" ")[0] for entry in dictionary]
if BLANK not in char_list:
char_list.insert(0, BLANK)
if EOS not in char_list:
char_list.append(EOS)
# for non-autoregressive maskctc model
if maskctc and MASKCTC not in char_list:
char_list.append(MASKCTC)
return char_list
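# Illustrative vocab file accepted by load_dict (one token per line, with an
# optional id column that is ignored):
#     <blank> 0
#     <unk> 1
#     a 2
# load_dict("vocab.txt") then yields
#     ["<blank>", "<unk>", "a", "<eos>"]   (missing special tokens are added)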
def read_manifest(
manifest_path,
max_input_len=float('inf'),
min_input_len=0.0,
max_output_len=float('inf'),
min_output_len=0.0,
max_output_input_ratio=float('inf'),
min_output_input_ratio=0.0, ):
"""Load and parse manifest file.
Args:
manifest_path (str): Manifest file to load and parse.
max_input_len (float, optional): maximum input seq length,
in seconds for raw wav, in frame numbers for feature data.
Defaults to float('inf').
min_input_len (float, optional): minimum input seq length,
in seconds for raw wav, in frame numbers for feature data.
Defaults to 0.0.
max_output_len (float, optional): maximum output seq length,
in modeling units. Defaults to float('inf').
min_output_len (float, optional): minimum output seq length,
in modeling units. Defaults to 0.0.
max_output_input_ratio (float, optional): maximum output/input
seq length ratio. Defaults to float('inf').
min_output_input_ratio (float, optional): minimum output/input
seq length ratio. Defaults to 0.0.
Raises:
IOError: If failed to parse the manifest.
Returns:
List[dict]: Manifest parsing results.
"""
manifest = []
with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader:
feat_len = json_data["input"][0]["shape"][
0] if "input" in json_data and "shape" in json_data["input"][
0] else 1.0
token_len = json_data["output"][0]["shape"][
0] if "output" in json_data and "shape" in json_data["output"][
0] else 1.0
conditions = [
feat_len >= min_input_len,
feat_len <= max_input_len,
token_len >= min_output_len,
token_len <= max_output_len,
token_len / feat_len >= min_output_input_ratio,
token_len / feat_len <= max_output_input_ratio,
]
if all(conditions):
manifest.append(json_data)
return manifest
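# Illustrative manifest line (jsonlines), matching the fields read above;
# "shape" is [length, dim], so index 0 is the sequence length:
#     {"utt": "utt1",
#      "input":  [{"feat": "some.ark:123", "shape": [983, 80]}],
#      "output": [{"text": "hello", "shape": [5, 31]}]}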
# Tar File read
TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])
def parse_tar(file):
"""Parse a tar file to get a tarfile object
and a map containing tarinfoes
"""
result = {}
f = tarfile.open(file)
for tarinfo in f.getmembers():
result[tarinfo.name] = tarinfo
return f, result
def subfile_from_tar(file, local_data=None):
"""Get subfile object from tar.
tar:tarpath#filename
It will return a subfile object from tar file
and cached tar file info for next reading request.
"""
tarpath, filename = file.split(':', 1)[1].split('#', 1)
if local_data is None:
local_data = TarLocalData(tar2info={}, tar2object={})
assert isinstance(local_data, TarLocalData)
# TarLocalData is a namedtuple, so both cache fields are guaranteed to exist.
if tarpath not in local_data.tar2info:
fobj, infos = parse_tar(tarpath)
local_data.tar2info[tarpath] = infos
local_data.tar2object[tarpath] = fobj
else:
fobj = local_data.tar2object[tarpath]
infos = local_data.tar2info[tarpath]
return fobj.extractfile(infos[filename])
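# Addressing scheme (from the parsing above): "tar:<tarpath>#<filename>",
# e.g. subfile_from_tar("tar:/data/train.tar#utt1.wav") opens /data/train.tar
# once, caches its members in TarLocalData, and returns a file object for the
# member "utt1.wav" on this and later calls.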
def rms_to_db(rms: float):
"""Root Mean Square to dB.
Args:
rms ([float]): root mean square
Returns:
float: dB
"""
return 20.0 * math.log10(max(1e-16, rms))
def rms_to_dbfs(rms: float):
"""Root Mean Square to dBFS.
https://fireattack.wordpress.com/2017/02/06/replaygain-loudness-normalization-and-applications/
Audio is modeled as a mix of sine waves; a unit-amplitude sine wave has an
RMS of 0.7071 full scale, i.e. -3.0103 dB, so:
dB = dBFS + 3.0103
dBFS = dB - 3.0103
e.g. 0 dB = -3.0103 dBFS
Args:
rms ([float]): root mean square
Returns:
float: dBFS
"""
return rms_to_db(rms) - 3.0103
def max_dbfs(sample_data: np.ndarray):
"""Peak dBFS based on the maximum energy sample.
Args:
sample_data ([np.ndarray]): float array, [-1, 1].
Returns:
float: dBFS
"""
# Peak dBFS based on the maximum energy sample. Will prevent overdrive if
# used for normalization.
return rms_to_dbfs(max(abs(np.min(sample_data)), abs(np.max(sample_data))))
def mean_dbfs(sample_data):
"""Peak dBFS based on the RMS energy.
Args:
sample_data ([np.ndarray]): float array, [-1, 1].
Returns:
float: dBFS
"""
return rms_to_dbfs(
math.sqrt(np.mean(np.square(sample_data, dtype=np.float64))))
def gain_db_to_ratio(gain_db: float):
"""dB to ratio
Args:
gain_db (float): gain in dB
Returns:
float: scale in amp
"""
return math.pow(10.0, gain_db / 20.0)
def normalize_audio(sample_data: np.ndarray, dbfs: float = -3.0103):
"""Nomalize audio to dBFS.
Args:
sample_data (np.ndarray): input wave samples, [-1, 1].
dbfs (float, optional): target dBFS. Defaults to -3.0103.
Returns:
np.ndarray: normalized wave
"""
return np.maximum(
np.minimum(sample_data * gain_db_to_ratio(dbfs - max_dbfs(sample_data)),
1.0), -1.0)
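# Worked example: for a signal whose largest sample magnitude is 1.0,
# max_dbfs(x) == rms_to_dbfs(1.0) == 0 - 3.0103 == -3.0103 dBFS, so
# normalize_audio(x, dbfs=-3.0103) applies gain_db_to_ratio(0.0) == 1.0
# and leaves the signal unchanged.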
def _load_json_cmvn(json_cmvn_file):
""" Load the json format cmvn stats file and calculate cmvn
Args:
json_cmvn_file: cmvn stats file in json format
Returns:
a numpy array of [means, vars]
"""
with open(json_cmvn_file) as f:
cmvn_stats = json.load(f)
means = cmvn_stats['mean_stat']
variance = cmvn_stats['var_stat']
count = cmvn_stats['frame_num']
for i in range(len(means)):
means[i] /= count
variance[i] = variance[i] / count - means[i] * means[i]
if variance[i] < 1.0e-20:
variance[i] = 1.0e-20
variance[i] = 1.0 / math.sqrt(variance[i])
cmvn = np.array([means, variance])
return cmvn
def _load_kaldi_cmvn(kaldi_cmvn_file):
""" Load the kaldi format cmvn stats file and calculate cmvn
Args:
kaldi_cmvn_file: kaldi text style global cmvn file, which
is generated by:
compute-cmvn-stats --binary=false scp:feats.scp global_cmvn
Returns:
a numpy array of [means, vars]
"""
means = []
variance = []
with open(kaldi_cmvn_file, 'r') as fid:
# kaldi binary file start with '\0B'
if fid.read(2) == '\0B':
print('kaldi cmvn binary file is not supported, please '
'recompute it by: compute-cmvn-stats --binary=false '
' scp:feats.scp global_cmvn')
sys.exit(1)
fid.seek(0)
arr = fid.read().split()
assert (arr[0] == '[')
assert (arr[-2] == '0')
assert (arr[-1] == ']')
feat_dim = int((len(arr) - 2 - 2) / 2)
for i in range(1, feat_dim + 1):
means.append(float(arr[i]))
count = float(arr[feat_dim + 1])
for i in range(feat_dim + 2, 2 * feat_dim + 2):
variance.append(float(arr[i]))
for i in range(len(means)):
means[i] /= count
variance[i] = variance[i] / count - means[i] * means[i]
if variance[i] < 1.0e-20:
variance[i] = 1.0e-20
variance[i] = 1.0 / math.sqrt(variance[i])
cmvn = np.array([means, variance])
return cmvn
def load_cmvn(cmvn_file: str, filetype: str):
"""load cmvn from file.
Args:
cmvn_file (str): cmvn path.
filetype (str): file type, optional[npz, json, kaldi].
Raises:
ValueError: file type not support.
Returns:
Tuple[np.ndarray, np.ndarray]: mean, istd
"""
filetype = filetype.lower()
assert filetype in ['npz', 'json', 'kaldi'], filetype
if filetype == "json":
cmvn = _load_json_cmvn(cmvn_file)
elif filetype == "kaldi":
cmvn = _load_kaldi_cmvn(cmvn_file)
elif filetype == "npz":
eps = 1e-14
npzfile = np.load(cmvn_file)
mean = np.squeeze(npzfile["mean"])
std = np.squeeze(npzfile["std"])
istd = 1 / (std + eps)
cmvn = [mean, istd]
else:
raise ValueError(f"cmvn file type no support: {filetype}")
return cmvn[0], cmvn[1]
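# Typical use (illustrative): the second return value is the inverse stddev,
# so a [T, D] feature matrix is normalized as
#     mean, istd = load_cmvn("cmvn.json", filetype="json")
#     feat = (feat - mean) * istd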
def convert_samples_to_float32(samples):
"""Convert sample type to float32.
Audio sample type is usually integer or floating-point.
Integers will be scaled to [-1, 1] in float32.
PCM16 -> PCM32
"""
float32_samples = samples.astype('float32')
if samples.dtype in np.sctypes['int']:
bits = np.iinfo(samples.dtype).bits
float32_samples *= (1. / 2**(bits - 1))
elif samples.dtype in np.sctypes['float']:
pass
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
return float32_samples
def convert_samples_from_float32(samples, dtype):
"""Convert sample type from float32 to dtype.
Audio sample type is usually integer or floating-point. For integer
type, float32 will be rescaled from [-1, 1] to the maximum range
supported by the integer type.
PCM32 -> PCM16
"""
dtype = np.dtype(dtype)
output_samples = samples.copy()
if dtype in np.sctypes['int']:
bits = np.iinfo(dtype).bits
output_samples *= (2**(bits - 1) / 1.)
min_val = np.iinfo(dtype).min
max_val = np.iinfo(dtype).max
output_samples[output_samples > max_val] = max_val
output_samples[output_samples < min_val] = min_val
elif dtype in np.sctypes['float']:  # check the target dtype, as in the int branch
min_val = np.finfo(dtype).min
max_val = np.finfo(dtype).max
output_samples[output_samples > max_val] = max_val
output_samples[output_samples < min_val] = min_val
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
return output_samples.astype(dtype)
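# Round-trip sanity sketch (illustrative, not in the original file): int16
# samples scaled into float32 [-1, 1] and back without loss.
if __name__ == "__main__":
    pcm16 = np.array([-32768, 0, 16384, 32767], dtype=np.int16)
    f32 = convert_samples_to_float32(pcm16)      # [-1.0, 0.0, 0.5, ~1.0]
    back = convert_samples_from_float32(f32, 'int16')
    assert np.array_equal(pcm16, back)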
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import numpy as np
from ..frontend.augmentor.augmentation import AugmentationPipeline
from ..frontend.featurizer.speech_featurizer import SpeechFeaturizer
from ..frontend.normalizer import FeatureNormalizer
from ..frontend.speech import SpeechSegment
from ..frontend.utility import IGNORE_ID, TarLocalData
from .reader import LoadInputsAndTargets
from .utility import pad_list
__all__ = ["SpeechCollator"]
def _tokenids(text, keep_transcription_text):
# for training, text is already token ids
tokens = text # token ids
if keep_transcription_text:
# text is string, convert to unicode ord
assert isinstance(text, str), (type(text), text)
tokens = [ord(t) for t in text]
tokens = np.array(tokens, dtype=np.int64)
return tokens
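# e.g. _tokenids("hi", keep_transcription_text=True) -> array([104, 105])
# (unicode code points), while with False the token-id list is passed
# through unchanged as an int64 array.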
class SpeechCollatorBase():
def __init__(
self,
aug_file,
mean_std_filepath,
vocab_filepath,
spm_model_prefix,
random_seed=0,
unit_type="char",
spectrum_type='linear', # 'linear', 'mfcc', 'fbank'
feat_dim=0, # 'mfcc', 'fbank'
delta_delta=False, # 'mfcc', 'fbank'
stride_ms=10.0, # ms
window_ms=20.0, # ms
n_fft=None, # fft points
max_freq=None, # None for samplerate/2
target_sample_rate=16000, # target sample rate
use_dB_normalization=True,
target_dB=-20,
dither=1.0,
keep_transcription_text=True):
"""SpeechCollator Collator
Args:
unit_type(str): token unit type, e.g. char, word, spm
vocab_filepath (str): vocab file path.
mean_std_filepath (str): mean and std file path, which suffix is *.npy
spm_model_prefix (str): spm model prefix, need if `unit_type` is spm.
augmentation_config (str, optional): augmentation json str. Defaults to '{}'.
stride_ms (float, optional): stride size in ms. Defaults to 10.0.
window_ms (float, optional): window size in ms. Defaults to 20.0.
n_fft (int, optional): fft points for rfft. Defaults to None.
max_freq (int, optional): max cut freq. Defaults to None.
target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000.
spectrum_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None.
delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False.
use_dB_normalization (bool, optional): do dB normalization. Defaults to True.
target_dB (int, optional): target dB. Defaults to -20.
random_seed (int, optional): for random generator. Defaults to 0.
keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False.
if ``keep_transcription_text`` is False, text is token ids else is raw string.
Do augmentations
Padding audio features with zeros to make them have the same shape (or
a user-defined shape) within one batch.
"""
self.keep_transcription_text = keep_transcription_text
self.train_mode = not keep_transcription_text
self.stride_ms = stride_ms
self.window_ms = window_ms
self.feat_dim = feat_dim
self.loader = LoadInputsAndTargets()
# only for tar filetype
self._local_data = TarLocalData(tar2info={}, tar2object={})
self.augmentation = AugmentationPipeline(
preprocess_conf=aug_file.read(), random_seed=random_seed)
self._normalizer = FeatureNormalizer(
mean_std_filepath) if mean_std_filepath else None
self._speech_featurizer = SpeechFeaturizer(
unit_type=unit_type,
vocab_filepath=vocab_filepath,
spm_model_prefix=spm_model_prefix,
spectrum_type=spectrum_type,
feat_dim=feat_dim,
delta_delta=delta_delta,
stride_ms=stride_ms,
window_ms=window_ms,
n_fft=n_fft,
max_freq=max_freq,
target_sample_rate=target_sample_rate,
use_dB_normalization=use_dB_normalization,
target_dB=target_dB,
dither=dither)
self.feature_size = self._speech_featurizer.audio_feature.feature_size
self.text_feature = self._speech_featurizer.text_feature
self.vocab_dict = self.text_feature.vocab_dict
self.vocab_list = self.text_feature.vocab_list
self.vocab_size = self.text_feature.vocab_size
def process_utterance(self, audio_file, transcript):
"""Load, augment, featurize and normalize for speech data.
:param audio_file: Filepath or file object of audio file.
:type audio_file: str | file
:param transcript: Transcription text.
:type transcript: str
:return: Tuple of audio feature tensor and data of transcription part,
where transcription part could be token ids or text.
:rtype: tuple of (2darray, list)
"""
filetype = self.loader.file_type(audio_file)
if filetype != 'sound':
spectrum = self.loader._get_from_loader(audio_file, filetype)
feat_dim = spectrum.shape[1]
assert feat_dim == self.feat_dim, f"expect feat dim {self.feat_dim}, but got {feat_dim}"
if self.keep_transcription_text:
transcript_part = transcript
else:
text_ids = self.text_feature.featurize(transcript)
transcript_part = text_ids
else:
# read audio
speech_segment = SpeechSegment.from_file(
audio_file, transcript, infos=self._local_data)
# audio augment
self.augmentation.transform_audio(speech_segment)
# extract speech feature
spectrum, transcript_part = self._speech_featurizer.featurize(
speech_segment, self.keep_transcription_text)
# CMVN spectrum
if self._normalizer:
spectrum = self._normalizer.apply(spectrum)
# spectrum augment
spectrum = self.augmentation.transform_feature(spectrum)
return spectrum, transcript_part
def __call__(self, batch):
"""batch examples
Args:
batch (List[Dict]): batch is [dict(audio, text, ...)]
audio (np.ndarray) shape (T, D)
text (List[int] or str): shape (U,)
Returns:
tuple(utts, xs_pad, ilens, ys_pad, olens): batched data.
utts: (B,)
xs_pad : (B, Tmax, D)
ilens: (B,)
ys_pad : (B, Umax)
olens: (B,)
"""
audios = []
audio_lens = []
texts = []
text_lens = []
utts = []
tids = [] # tokenids
for idx, item in enumerate(batch):
utts.append(item['utt'])
audio = item['input'][0]['feat']
text = item['output'][0]['text']
audio, text = self.process_utterance(audio, text)
audios.append(audio) # [T, D]
audio_lens.append(audio.shape[0])
tokens = _tokenids(text, self.keep_transcription_text)
texts.append(tokens)
text_lens.append(tokens.shape[0])
#[B, T, D]
xs_pad = pad_list(audios, 0.0).astype(np.float32)
ilens = np.array(audio_lens).astype(np.int64)
ys_pad = pad_list(texts, IGNORE_ID).astype(np.int64)
olens = np.array(text_lens).astype(np.int64)
return utts, xs_pad, ilens, ys_pad, olens
class SpeechCollator(SpeechCollatorBase):
@classmethod
def from_config(cls, config):
"""Build a SpeechCollator object from a config.
Args:
config (yacs.config.CfgNode): configs object.
Returns:
SpeechCollator: collator object.
"""
assert 'augmentation_config' in config
assert 'keep_transcription_text' in config
assert 'mean_std_filepath' in config
assert 'vocab_filepath' in config
assert 'spectrum_type' in config
assert 'n_fft' in config
assert config
if isinstance(config.augmentation_config, (str, bytes)):
if config.augmentation_config:
aug_file = io.open(
config.augmentation_config, mode='r', encoding='utf8')
else:
aug_file = io.StringIO(initial_value='{}', newline='')
else:
aug_file = config.augmentation_config
assert isinstance(aug_file, io.StringIO)
speech_collator = cls(
aug_file=aug_file,
random_seed=0,
mean_std_filepath=config.mean_std_filepath,
unit_type=config.unit_type,
vocab_filepath=config.vocab_filepath,
spm_model_prefix=config.spm_model_prefix,
spectrum_type=config.spectrum_type,
feat_dim=config.feat_dim,
delta_delta=config.delta_delta,
stride_ms=config.stride_ms,
window_ms=config.window_ms,
n_fft=config.n_fft,
max_freq=config.max_freq,
target_sample_rate=config.target_sample_rate,
use_dB_normalization=config.use_dB_normalization,
target_dB=config.target_dB,
dither=config.dither,
keep_transcription_text=config.keep_transcription_text)
return speech_collator
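# Hedged construction sketch (field names taken from from_config above; the
# values are placeholders, not shipped defaults):
#     from yacs.config import CfgNode
#     cfg = CfgNode(dict(augmentation_config='', keep_transcription_text=True,
#                        mean_std_filepath='', unit_type='char',
#                        vocab_filepath='vocab.txt', spm_model_prefix='',
#                        spectrum_type='linear', feat_dim=0, delta_delta=False,
#                        stride_ms=10.0, window_ms=20.0, n_fft=None,
#                        max_freq=None, target_sample_rate=16000,
#                        use_dB_normalization=True, target_dB=-20, dither=1.0))
#     collator = SpeechCollator.from_config(cfg)
#     utts, xs_pad, ilens, ys_pad, olens = collator(batch)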
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
from collections import OrderedDict
from typing import Optional, Union
import paddle
import soundfile
from ..deepspeech2 import DeepSpeech2ModelOnline
from paddlespeech.s2t.io.collator import SpeechCollator
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.utils.utility import UpdateConfig
# MODEL_HOME (resource cache dir) is used below; the import location is assumed
from paddlespeech.cli.utils import MODEL_HOME
from yacs.config import CfgNode
__all__ = ['ASRExecutor']
model_alias = {
"deepspeech2offline":
"paddlespeech.s2t.models.ds2:DeepSpeech2Model",
"deepspeech2online":
"paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline",
"conformer":
"paddlespeech.s2t.models.u2:U2Model",
"transformer":
"paddlespeech.s2t.models.u2:U2Model",
"wenetspeech":
"paddlespeech.s2t.models.u2:U2Model",
}
class ASRExecutor(object):
def __init__(self,
model: str = 'conformer_wenetspeech',
lang: str = 'zh',
sample_rate: int = 16000,
config: os.PathLike = None,
onnx_path: os.PathLike = None,
decode_method: str = 'attention_rescoring',
language_model_dir=None):
# keep the model tag under a separate name so `hasattr(self, 'model')` in
# _init_from_path only becomes true once the network itself is built
self.model_type = model
self.lang = lang
self.sample_rate = sample_rate
self.config = config
self.onnx_path = onnx_path
self.decode_method = decode_method
self._inputs = OrderedDict()
self._outputs = OrderedDict()
self._init_from_path(self.model_type, self.lang, self.sample_rate,
                     self.config, self.decode_method,
                     self.onnx_path)
def __call__(self, audio_file: os.PathLike, force_yes: bool = False):
audio_file = os.path.abspath(audio_file)
if not self._check(audio_file, self.sample_rate, force_yes):
sys.exit(-1)
self.preprocess(audio_file)
self.infer()
res = self.postprocess() # Retrieve result of asr.
return res
def _init_from_path(self,
model_type: str = 'wenetspeech',
lang: str = 'zh',
sample_rate: int = 16000,
cfg_path: Optional[os.PathLike] = None,
decode_method: str = 'attention_rescoring',
onnx_path: Optional[os.PathLike] = None):
"""
Init model and other resources from a specific path.
"""
if hasattr(self, 'model'):
print('Model has already been initialized.')
return
self.cfg_path = os.path.abspath(cfg_path)
self.res_path = os.path.dirname(
os.path.dirname(os.path.abspath(self.cfg_path)))
# Init body.
self.config = CfgNode(new_allowed=True)
self.config.merge_from_file(self.cfg_path)
with UpdateConfig(self.config):
self.vocab = self.config.vocab_filepath
self.config.decode.lang_model_path = os.path.join(
MODEL_HOME, 'language_model',
self.config.decode.lang_model_path)
self.collate_fn_test = SpeechCollator.from_config(self.config)
self.text_feature = TextFeaturizer(unit_type=self.config.unit_type,
vocab=self.vocab)
# model_type: {model_name}_{dataset}
model_name = model_type[:model_type.rindex('_')]
model_class = DeepSpeech2ModelOnline  # from_config is called on the class itself
model_conf = self.config
model = model_class.from_config(model_conf)
self.model = model
def preprocess(self, input: Union[str, os.PathLike]):
"""
Input preprocess; the resulting paddle.Tensor is stored in self._inputs.
Input content can be text (tts), a file (asr, cls) or a stream (not supported yet).
"""
audio_file = input
if isinstance(audio_file, (str, os.PathLike)):
print("Preprocess audio_file:" + audio_file)
# Get the object for feature extraction
audio, _ = self.collate_fn_test.process_utterance(
audio_file=audio_file, transcript=" ")
audio_len = audio.shape[0]
audio = paddle.to_tensor(audio, dtype='float32')
audio_len = paddle.to_tensor(audio_len)
audio = paddle.unsqueeze(audio, axis=0)
# vocab_list = collate_fn_test.vocab_list
self._inputs["audio"] = audio
self._inputs["audio_len"] = audio_len
print(f"audio feat shape: {audio.shape}")
def infer(self):
"""
Model inference; the result is stored in self._outputs.
"""
cfg = self.config.decode
audio = self._inputs["audio"]
audio_len = self._inputs["audio_len"]
decode_batch_size = audio.shape[0]
self.model.decoder.init_decoder(
decode_batch_size, self.text_feature.vocab_list,
cfg.decoding_method, cfg.lang_model_path, cfg.alpha, cfg.beta,
cfg.beam_size, cfg.cutoff_prob, cfg.cutoff_top_n,
cfg.num_proc_bsearch)
result_transcripts = self.model.decode(audio, audio_len)
self.model.decoder.del_decoder()
self._outputs["result"] = result_transcripts[0]
def postprocess(self) -> Union[str, os.PathLike]:
"""
Output postprocess and return human-readable results such as texts and audio files.
"""
return self._outputs["result"]
def _check(self, audio_file: str, sample_rate: int, force_yes: bool):
self.sample_rate = sample_rate
if self.sample_rate != 16000 and self.sample_rate != 8000:
print(
"invalid sample rate, please input --sr 8000 or --sr 16000")
return False
if isinstance(audio_file, (str, os.PathLike)):
if not os.path.isfile(audio_file):
print("Please input the right audio file path")
return False
print("checking the audio file format......")
try:
audio, audio_sample_rate = soundfile.read(
audio_file, dtype="int16", always_2d=True)
except Exception as e:
print("can not open the audio file, please check that the audio file "
      "format is 'wav'.\n"
      "You can try sox to change the file format, for example:\n"
      "  sample rate 16k: sox input_audio.xx --rate 16k --bits 16 --channels 1 output_audio.wav\n"
      "  sample rate 8k:  sox input_audio.xx --rate 8k --bits 16 --channels 1 output_audio.wav")
return False
print("The sample rate is %d" % audio_sample_rate)
if audio_sample_rate != self.sample_rate:
print("The sample rate of the input file is not {}.\n \
The program will resample the wav file to {}.\n \
If the result does not meet your expectations,\n \
Please input the 16k 16 bit 1 channel wav file. \
".format(self.sample_rate, self.sample_rate))
if force_yes is False:
    while True:
        print("Whether to change the sample rate and the channel. "
              "Y: change the sample rate. N: exit the program.")
        content = input("Input(Y/N):").strip().lower()
        if content in ("y", "yes"):
            print("change the sample rate and channel to 16k and 1 channel")
            break
        elif content in ("n", "no"):
            print("Exit the program")
            exit(1)
        else:
            print("Invalid input, please input again")
self.change_format = True
else:
print("The audio file format is right")
self.change_format = False
return True
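# Hedged usage sketch (paths are placeholders; a config yaml matching the
# fields read in _init_from_path is assumed to exist):
#     asr = ASRExecutor(model='deepspeech2online_wenetspeech',
#                       config='conf/deepspeech2_online.yaml')
#     text = asr('input_audio.wav', force_yes=True)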
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
import io
import logging
import os
from collections import OrderedDict
import h5py
import kaldiio
import numpy as np
import soundfile
from ..transform.transformation import Transformation
from .utility import feat_type
logger = logging.getLogger(__name__)
__all__ = ["LoadInputsAndTargets"]
class LoadInputsAndTargets():
"""Create a mini-batch from a list of dicts
>>> batch = [('utt1',
...           dict(input=[dict(feat='some.ark:123',
...                            filetype='mat',
...                            name='input1',
...                            shape=[100, 80])],
...                output=[dict(tokenid='1 2 3 4',
...                             name='target1',
...                             shape=[4, 31])]))]
>>> l = LoadInputsAndTargets()
>>> feat, target = l(batch)
:param: str mode: Specify the task mode, only "asr" is supported
:param: str preprocess_conf: The path of a json file for pre-processing
:param: bool load_input: If False, not to load the input data
:param: bool load_output: If False, not to load the output data
:param: bool sort_in_input_length: Sort the mini-batch in descending order
of the input length
:param: dict preprocess_args: Set some optional arguments for preprocessing
:param: bool keep_all_data_on_mem: If True, cache loaded arrays in memory
"""
def __init__(
self,
mode="asr",
preprocess_conf=None,
load_input=True,
load_output=True,
sort_in_input_length=True,
preprocess_args=None,
keep_all_data_on_mem=False, ):
self._loaders = {}
if mode not in ["asr"]:
raise ValueError("Only asr are allowed: mode={}".format(mode))
if preprocess_conf:
self.preprocessing = Transformation(preprocess_conf)
logger.warning(
"[Experimental feature] Some preprocessing will be done "
"for the mini-batch creation using {}".format(
self.preprocessing))
else:
# If conf doesn't exist, this function doesn't touch anything.
self.preprocessing = None
self.mode = mode
self.load_output = load_output
self.load_input = load_input
self.sort_in_input_length = sort_in_input_length
if preprocess_args:
assert isinstance(preprocess_args, dict), type(preprocess_args)
self.preprocess_args = dict(preprocess_args)
else:
self.preprocess_args = {}
self.keep_all_data_on_mem = keep_all_data_on_mem
def __call__(self, batch, return_uttid=False):
"""Function to load inputs and targets from list of dicts
:param List[Tuple[str, dict]] batch: list of dict which is subset of
loaded data.json
:param bool return_uttid: return utterance ID information for visualization
:return: list of input token id sequences [(L_1), (L_2), ..., (L_B)]
:return: list of input feature sequences
[(T_1, D), (T_2, D), ..., (T_B, D)]
:rtype: list of float ndarray
:return: list of target token id sequences [(L_1), (L_2), ..., (L_B)]
:rtype: list of int ndarray
"""
x_feats_dict = OrderedDict() # OrderedDict[str, List[np.ndarray]]
y_feats_dict = OrderedDict() # OrderedDict[str, List[np.ndarray]]
uttid_list = [] # List[str]
for uttid, info in batch:
uttid_list.append(uttid)
if self.load_input:
# Note(kamo): This for-loop is for multiple inputs
for idx, inp in enumerate(info["input"]):
# {"input":
# [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
# "filetype": "hdf5",
# "name": "input1", ...}], ...}
x = self._get_from_loader(
filepath=inp["feat"],
filetype=inp.get("filetype", "mat"))
x_feats_dict.setdefault(inp["name"], []).append(x)
if self.load_output:
for idx, inp in enumerate(info["output"]):
if "tokenid" in inp:
# ======= Legacy format for output =======
# {"output": [{"tokenid": "1 2 3 4"}])
x = np.fromiter(
map(int, inp["tokenid"].split()), dtype=np.int64)
else:
# ======= New format =======
# {"input":
# [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
# "filetype": "hdf5",
# "name": "target1", ...}], ...}
x = self._get_from_loader(
filepath=inp["feat"],
filetype=inp.get("filetype", "mat"))
y_feats_dict.setdefault(inp["name"], []).append(x)
if self.mode == "asr":
return_batch, uttid_list = self._create_batch_asr(
x_feats_dict, y_feats_dict, uttid_list)
else:
raise NotImplementedError(self.mode)
if self.preprocessing is not None:
# Apply pre-processing all input features
for x_name in return_batch.keys():
if x_name.startswith("input"):
return_batch[x_name] = self.preprocessing(
return_batch[x_name], uttid_list,
**self.preprocess_args)
if return_uttid:
return tuple(return_batch.values()), uttid_list
# Doesn't return the names now.
return tuple(return_batch.values())
def _create_batch_asr(self, x_feats_dict, y_feats_dict, uttid_list):
"""Create a OrderedDict for the mini-batch
:param OrderedDict x_feats_dict:
e.g. {"input1": [ndarray, ndarray, ...],
"input2": [ndarray, ndarray, ...]}
:param OrderedDict y_feats_dict:
e.g. {"target1": [ndarray, ndarray, ...],
"target2": [ndarray, ndarray, ...]}
:param: List[str] uttid_list:
Give uttid_list to sort in the same order as the mini-batch
:return: batch, uttid_list
:rtype: Tuple[OrderedDict, List[str]]
"""
# handle single-input and multi-input (parallel) asr mode
xs = list(x_feats_dict.values())
if self.load_output:
ys = list(y_feats_dict.values())
assert len(xs[0]) == len(ys[0]), (len(xs[0]), len(ys[0]))
# get index of non-zero length samples
nonzero_idx = list(
filter(lambda i: len(ys[0][i]) > 0, range(len(ys[0]))))
for n in range(1, len(y_feats_dict)):
nonzero_idx = filter(lambda i: len(ys[n][i]) > 0, nonzero_idx)
else:
# Note(kamo): Be careful not to make nonzero_idx to a generator
nonzero_idx = list(range(len(xs[0])))
if self.sort_in_input_length:
# sort in input lengths based on the first input
nonzero_sorted_idx = sorted(
nonzero_idx, key=lambda i: -len(xs[0][i]))
else:
nonzero_sorted_idx = nonzero_idx
if len(nonzero_sorted_idx) != len(xs[0]):
logger.warning(
"Target sequences include empty tokenid (batch {} -> {}).".
format(len(xs[0]), len(nonzero_sorted_idx)))
# remove zero-length samples
xs = [[x[i] for i in nonzero_sorted_idx] for x in xs]
uttid_list = [uttid_list[i] for i in nonzero_sorted_idx]
x_names = list(x_feats_dict.keys())
if self.load_output:
ys = [[y[i] for i in nonzero_sorted_idx] for y in ys]
y_names = list(y_feats_dict.keys())
# Keeping x_name and y_name, e.g. input1, for future extension
return_batch = OrderedDict([
* [(x_name, x) for x_name, x in zip(x_names, xs)],
* [(y_name, y) for y_name, y in zip(y_names, ys)],
])
else:
return_batch = OrderedDict(
[(x_name, x) for x_name, x in zip(x_names, xs)])
return return_batch, uttid_list
def _get_from_loader(self, filepath, filetype):
"""Return ndarray
So that file descriptors are opened only on the first reference,
the loaders are cached in self._loaders
>>> ndarray = loader.get_from_loader(
... 'some/path.h5:F01_050C0101_PED_REAL', filetype='hdf5')
:param: str filepath:
:param: str filetype:
:return:
:rtype: np.ndarray
"""
if filetype == "hdf5":
# e.g.
# {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
# "filetype": "hdf5",
# -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
filepath, key = filepath.split(":", 1)
loader = self._loaders.get(filepath)
if loader is None:
# To avoid disk access, create loader only for the first time
loader = h5py.File(filepath, "r")
self._loaders[filepath] = loader
return loader[key][()]
elif filetype == "sound.hdf5":
# e.g.
# {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
# "filetype": "sound.hdf5",
# -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
filepath, key = filepath.split(":", 1)
loader = self._loaders.get(filepath)
if loader is None:
# To avoid disk access, create loader only for the first time
loader = SoundHDF5File(filepath, "r", dtype="int16")
self._loaders[filepath] = loader
array, rate = loader[key]
return array
elif filetype == "sound":
# e.g.
# {"input": [{"feat": "some/path.wav",
# "filetype": "sound"},
# Assume PCM16
if not self.keep_all_data_on_mem:
array, _ = soundfile.read(filepath, dtype="int16")
return array
if filepath not in self._loaders:
array, _ = soundfile.read(filepath, dtype="int16")
self._loaders[filepath] = array
return self._loaders[filepath]
elif filetype == "npz":
# e.g.
# {"input": [{"feat": "some/path.npz:F01_050C0101_PED_REAL",
# "filetype": "npz",
filepath, key = filepath.split(":", 1)
loader = self._loaders.get(filepath)
if loader is None:
# To avoid disk access, create loader only for the first time
loader = np.load(filepath)
self._loaders[filepath] = loader
return loader[key]
elif filetype == "npy":
# e.g.
# {"input": [{"feat": "some/path.npy",
# "filetype": "npy"},
if not self.keep_all_data_on_mem:
return np.load(filepath)
if filepath not in self._loaders:
self._loaders[filepath] = np.load(filepath)
return self._loaders[filepath]
elif filetype in ["mat", "vec"]:
# e.g.
# {"input": [{"feat": "some/path.ark:123",
# "filetype": "mat"}]},
# In this case, "123" indicates the starting points of the matrix
# load_mat can load both matrix and vector
if not self.keep_all_data_on_mem:
return kaldiio.load_mat(filepath)
if filepath not in self._loaders:
self._loaders[filepath] = kaldiio.load_mat(filepath)
return self._loaders[filepath]
elif filetype == "scp":
# e.g.
# {"input": [{"feat": "some/path.scp:F01_050C0101_PED_REAL",
# "filetype": "scp",
filepath, key = filepath.split(":", 1)
loader = self._loaders.get(filepath)
if loader is None:
# To avoid disk access, create loader only for the first time
loader = kaldiio.load_scp(filepath)
self._loaders[filepath] = loader
return loader[key]
else:
raise NotImplementedError(
"Not supported: loader_type={}".format(filetype))
def file_type(self, filepath):
return feat_type(filepath)
class SoundHDF5File():
"""Collecting sound files to a HDF5 file
>>> f = SoundHDF5File('a.flac.h5', mode='a')
>>> array = np.random.randint(0, 100, 100, dtype=np.int16)
>>> f['id'] = (array, 16000)
>>> array, rate = f['id']
:param: str filepath:
:param: str mode:
:param: str format: The type used when saving wav. flac, nist, htk, etc.
:param: str dtype:
"""
def __init__(self,
filepath,
mode="r+",
format=None,
dtype="int16",
**kwargs):
self.filepath = filepath
self.mode = mode
self.dtype = dtype
self.file = h5py.File(filepath, mode, **kwargs)
if format is None:
# filepath = a.flac.h5 -> format = flac
second_ext = os.path.splitext(os.path.splitext(filepath)[0])[1]
format = second_ext[1:]
if format.upper() not in soundfile.available_formats():
# If not found, flac is selected
format = "flac"
# This format affects only saving
self.format = format
def __repr__(self):
return '<SoundHDF5 file "{}" (mode {}, format {}, type {})>'.format(
self.filepath, self.mode, self.format, self.dtype)
def create_dataset(self, name, shape=None, data=None, **kwds):
f = io.BytesIO()
array, rate = data
soundfile.write(f, array, rate, format=self.format)
self.file.create_dataset(
name, shape=shape, data=np.void(f.getvalue()), **kwds)
def __setitem__(self, name, data):
self.create_dataset(name, data=data)
def __getitem__(self, key):
data = self.file[key][()]
f = io.BytesIO(data.tobytes())
array, rate = soundfile.read(f, dtype=self.dtype)
return array, rate
def keys(self):
return self.file.keys()
def values(self):
for k in self.file:
yield self[k]
def items(self):
for k in self.file:
yield k, self[k]
def __iter__(self):
return iter(self.file)
def __contains__(self, item):
return item in self.file
def __len__(self):
return len(self.file)
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.file.close()
def close(self):
self.file.close()
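# Runnable variant of the class docstring example above (writes ./a.flac.h5
# in the current directory; illustrative only).
if __name__ == "__main__":
    f = SoundHDF5File('a.flac.h5', mode='a')
    arr = np.random.randint(0, 100, 100, dtype=np.int16)
    f['id'] = (arr, 16000)
    array, rate = f['id']
    assert rate == 16000
    f.close()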
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from io import BytesIO
from pathlib import Path
from typing import List
import numpy as np
__all__ = ["pad_list", "pad_sequence", "feat_type"]
def pad_list(sequences: List[np.ndarray],
padding_value: float = 0.0) -> np.ndarray:
return pad_sequence(sequences, True, padding_value)
def pad_sequence(sequences: List[np.ndarray],
batch_first: bool = True,
padding_value: float = 0.0) -> np.ndarray:
r"""Pad a list of variable length Tensors with ``padding_value``
``pad_sequence`` stacks a list of Tensors along a new dimension,
and pads them to equal length. For example, if the input is list of
sequences with size ``L x *`` and if batch_first is False, and ``T x B x *``
otherwise.
`B` is batch size. It is equal to the number of elements in ``sequences``.
`T` is length of the longest sequence.
`L` is length of the sequence.
`*` is any number of trailing dimensions, including none.
Example:
>>> a = np.ones([25, 300])
>>> b = np.ones([22, 300])
>>> c = np.ones([15, 300])
>>> pad_sequence([a, b, c]).shape
(3, 25, 300)
Note:
This function returns a np.ndarray of size ``T x B x *`` or ``B x T x *``
where `T` is the length of the longest sequence. This function assumes
trailing dimensions and type of all the Tensors in sequences are same.
Args:
sequences (list[np.ndarray]): list of variable length sequences.
batch_first (bool, optional): output will be in ``B x T x *`` if True, or in
``T x B x *`` otherwise
padding_value (float, optional): value for padded elements. Default: 0.
Returns:
np.ndarray of size ``T x B x *`` if :attr:`batch_first` is ``False``.
np.ndarray of size ``B x T x *`` otherwise
"""
# assuming trailing dimensions and type of all the Tensors
# in sequences are same and fetching those from sequences[0]
max_size = sequences[0].shape
trailing_dims = max_size[1:]
max_len = max([s.shape[0] for s in sequences])
if batch_first:
out_dims = (len(sequences), max_len) + trailing_dims
else:
out_dims = (max_len, len(sequences)) + trailing_dims
out_tensor = np.full(out_dims, padding_value, dtype=sequences[0].dtype)
for i, tensor in enumerate(sequences):
length = tensor.shape[0]
# use index notation to prevent duplicate references to the tensor
if batch_first:
out_tensor[i, :length, ...] = tensor
else:
out_tensor[:length, i, ...] = tensor
return out_tensor
def feat_type(filepath):
# deal with Byteio type for paddlespeech server
if isinstance(filepath, BytesIO):
return 'sound'
suffix = Path(filepath).suffix[1:]
if suffix == 'ark':
return 'mat'
elif suffix == 'scp':
return 'scp'
elif suffix == 'npy':
return 'npy'
elif suffix == 'npz':
return 'npz'
elif suffix in ['wav', 'flac']:
# PCM16
return 'sound'
else:
raise ValueError(f"Not support filetype: {suffix}")
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
logger = logging.getLogger(__name__)
try:
    from ..decoders.ctcdecoder import CTCBeamSearchDecoder  # noqa: F401
    from ..decoders.ctcdecoder import Scorer  # noqa: F401
    from ..decoders.ctcdecoder import \
        ctc_beam_search_decoding_batch  # noqa: F401
    from ..decoders.ctcdecoder import ctc_greedy_decoding  # noqa: F401
except ImportError:
    print("paddlespeech_ctcdecoders not installed!")
class CTCDecoder(object):
def __init__(self):
# CTCDecoder LM Score handle
self._ext_scorer = None
self.beam_search_decoder = None
self.blank_id = 0
def _decode_batch_greedy_offline(self, probs_split, vocab_list):
"""This function will be deprecated in future.
Decode by best path for a batch of probs matrix input.
:param probs_split: List of 2-D probability matrix, and each consists
of prob vectors for one speech utterancce.
:param probs_split: List of matrix
:param vocab_list: List of tokens in the vocabulary, for decoding.
:type vocab_list: list
:return: List of transcription texts.
:rtype: List of str
"""
results = []
for i, probs in enumerate(probs_split):
output_transcription = ctc_greedy_decoding(
probs_seq=probs, vocabulary=vocab_list, blank_id=self.blank_id)
results.append(output_transcription)
return results
def _init_ext_scorer(self, beam_alpha, beam_beta, language_model_path,
vocab_list):
"""Initialize the external scorer.
:param beam_alpha: Parameter associated with language model.
:type beam_alpha: float
:param beam_beta: Parameter associated with word count.
:type beam_beta: float
:param language_model_path: Filepath for language model. If it is
empty, the external scorer will be set to
None, and the decoding method will be pure
beam search without scorer.
:type language_model_path: str|None
:param vocab_list: List of tokens in the vocabulary, for decoding.
:type vocab_list: list
"""
# init once
if self._ext_scorer is not None:
return
if language_model_path != '':
self._ext_scorer = Scorer(beam_alpha, beam_beta,
language_model_path, vocab_list)
else:
self._ext_scorer = None
def _decode_batch_beam_search_offline(
self, probs_split, beam_alpha, beam_beta, beam_size, cutoff_prob,
cutoff_top_n, vocab_list, num_processes):
"""
This function will be deprecated in the future.
Decode by beam search for a batch of probs matrix input.
:param probs_split: List of 2-D probability matrices, each consisting
of prob vectors for one speech utterance.
:type probs_split: List of matrix
:param beam_alpha: Parameter associated with language model.
:type beam_alpha: float
:param beam_beta: Parameter associated with word count.
:type beam_beta: float
:param beam_size: Width for Beam search.
:type beam_size: int
:param cutoff_prob: Cutoff probability in pruning,
default 1.0, no pruning.
:type cutoff_prob: float
:param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
characters with highest probs in vocabulary will be
used in beam search, default 40.
:type cutoff_top_n: int
:param vocab_list: List of tokens in the vocabulary, for decoding.
:type vocab_list: list
:param num_processes: Number of processes (CPU) for decoder.
:type num_processes: int
:return: List of transcription texts.
:rtype: List of str
"""
if self._ext_scorer is not None:
self._ext_scorer.reset_params(beam_alpha, beam_beta)
# beam search decode
num_processes = min(num_processes, len(probs_split))
beam_search_results = ctc_beam_search_decoding_batch(
probs_split=probs_split,
vocabulary=vocab_list,
beam_size=beam_size,
num_processes=num_processes,
ext_scoring_func=self._ext_scorer,
cutoff_prob=cutoff_prob,
cutoff_top_n=cutoff_top_n,
blank_id=self.blank_id)
results = [result[0][1] for result in beam_search_results]
return results
def init_decoder(self, batch_size, vocab_list, decoding_method,
lang_model_path, beam_alpha, beam_beta, beam_size,
cutoff_prob, cutoff_top_n, num_processes):
"""
init ctc decoders
Args:
batch_size(int): Batch size for input data
vocab_list (list): List of tokens in the vocabulary, for decoding
decoding_method (str): ctc_beam_search
lang_model_path (str): language model path
beam_alpha (float): beam_alpha
beam_beta (float): beam_beta
beam_size (int): beam_size
cutoff_prob (float): cutoff probability in beam search
cutoff_top_n (int): cutoff_top_n
num_processes (int): num_processes
Raises:
ValueError: when decoding_method not support.
Returns:
CTCBeamSearchDecoder
"""
self.batch_size = batch_size
self.vocab_list = vocab_list
self.decoding_method = decoding_method
self.beam_size = beam_size
self.cutoff_prob = cutoff_prob
self.cutoff_top_n = cutoff_top_n
self.num_processes = num_processes
if decoding_method == "ctc_beam_search":
self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path,
vocab_list)
if self.beam_search_decoder is None:
self.beam_search_decoder = self.get_decoder(
vocab_list, batch_size, beam_alpha, beam_beta, beam_size,
num_processes, cutoff_prob, cutoff_top_n)
return self.beam_search_decoder
elif decoding_method == "ctc_greedy":
self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path,
vocab_list)
else:
raise ValueError(f"Not support: {decoding_method}")
def decode_probs_offline(self, probs, logits_lens, vocab_list,
decoding_method, lang_model_path, beam_alpha,
beam_beta, beam_size, cutoff_prob, cutoff_top_n,
num_processes):
"""
This function will be deprecated in the future.
ctc decoding with probs.
Args:
probs (Tensor): activation after softmax
logits_lens (Tensor): audio output lens
vocab_list (list): List of tokens in the vocabulary, for decoding
decoding_method (str): ctc_beam_search
lang_model_path (str): language model path
beam_alpha (float): beam_alpha
beam_beta (float): beam_beta
beam_size (int): beam_size
cutoff_prob (float): cutoff probability in beam search
cutoff_top_n (int): cutoff_top_n
num_processes (int): num_processes
Raises:
ValueError: when decoding_method not support.
Returns:
List[str]: transcripts.
"""
logger.warning(
    "This function will be deprecated in the future: decode_probs_offline")
probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)]
if decoding_method == "ctc_greedy":
result_transcripts = self._decode_batch_greedy_offline(
probs_split=probs_split, vocab_list=vocab_list)
elif decoding_method == "ctc_beam_search":
result_transcripts = self._decode_batch_beam_search_offline(
probs_split=probs_split,
beam_alpha=beam_alpha,
beam_beta=beam_beta,
beam_size=beam_size,
cutoff_prob=cutoff_prob,
cutoff_top_n=cutoff_top_n,
vocab_list=vocab_list,
num_processes=num_processes)
else:
raise ValueError(f"Not support: {decoding_method}")
return result_transcripts
def get_decoder(self, vocab_list, batch_size, beam_alpha, beam_beta,
beam_size, num_processes, cutoff_prob, cutoff_top_n):
"""
Build and return a ctc beam search decoder.
Args:
vocab_list (list): List of tokens in the vocabulary, for decoding.
batch_size(int): Batch size for input data
beam_alpha (float): beam_alpha
beam_beta (float): beam_beta
beam_size (int): beam_size
num_processes (int): num_processes
cutoff_prob (float): cutoff probability in beam search
cutoff_top_n (int): cutoff_top_n
Raises:
ValueError: when decoding_method not support.
Returns:
CTCBeamSearchDecoder
"""
num_processes = min(num_processes, batch_size)
if self._ext_scorer is not None:
self._ext_scorer.reset_params(beam_alpha, beam_beta)
if self.decoding_method == "ctc_beam_search":
beam_search_decoder = CTCBeamSearchDecoder(
vocab_list, batch_size, beam_size, num_processes, cutoff_prob,
cutoff_top_n, self._ext_scorer, self.blank_id)
else:
raise ValueError(f"Not support: {self.decoding_method}")
return beam_search_decoder
def next(self, probs, logits_lens):
"""
Input probs into ctc decoder
Args:
probs (list(list(float))): probs for a batch of data
logits_lens (list(int)): logits lens for a batch of data
Raises:
Exception: when the ctc decoder is not initialized
            ValueError: when decoding_method is not supported.
"""
if self.beam_search_decoder is None:
raise Exception(
"You need to initialize the beam_search_decoder firstly")
beam_search_decoder = self.beam_search_decoder
        # The underlying decoder expects the per-utterance flags as the strings
        # "true"/"false" (apparently for the native decoder binding) rather
        # than Python booleans.
        has_value = [
            "true" if v else "false" for v in (logits_lens > 0).tolist()
        ]
        probs_split = [
            # Compare against "true": both flag strings are truthy, so a bare
            # `if has_value[i]` would always take the first branch.
            probs[i, :l, :].tolist() if has_value[i] == "true" else
            probs[i].tolist() for i, l in enumerate(logits_lens)
        ]
if self.decoding_method == "ctc_beam_search":
beam_search_decoder.next(probs_split, has_value)
else:
raise ValueError(f"Not support: {self.decoding_method}")
return
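
    # Streaming sketch (illustrative names): feed activations chunk by chunk
    # with `next`, then collect the transcripts with `decode` once the
    # utterance ends.
    #
    #   for chunk_probs, chunk_lens in chunks:
    #       self.next(chunk_probs, chunk_lens)
    #   results_best, results_beam = self.decode()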
def decode(self):
"""
Get the decoding result
Raises:
Exception: when the ctc decoder is not initialized
            ValueError: when decoding_method is not supported.
Returns:
results_best (list(str)): The best result for a batch of data
results_beam (list(list(str))): The beam search result for a batch of data
"""
if self.beam_search_decoder is None:
raise Exception(
"You need to initialize the beam_search_decoder firstly")
beam_search_decoder = self.beam_search_decoder
if self.decoding_method == "ctc_beam_search":
batch_beam_results = beam_search_decoder.decode()
batch_beam_results = [[(res[0], res[1]) for res in beam_results]
for beam_results in batch_beam_results]
results_best = [result[0][1] for result in batch_beam_results]
results_beam = [[trans[1] for trans in result]
for result in batch_beam_results]
else:
raise ValueError(f"Not support: {self.decoding_method}")
return results_best, results_beam
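
    # Return-value shapes, as the indexing above assumes: `results_best` holds
    # one transcript per utterance (the top beam), while `results_beam` holds
    # the full candidate list per utterance, best first.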
    def reset_decoder(self,
                      batch_size=-1,
                      beam_size=-1,
                      num_processes=-1,
                      cutoff_prob=-1.0,
                      cutoff_top_n=-1):
        """
        Reset the decoder state
        Args:
            batch_size (int): Batch size for input data
            beam_size (int): beam width
            num_processes (int): number of parallel decoding processes
            cutoff_prob (float): cumulative-probability cutoff used to prune
                candidate tokens in beam search
            cutoff_top_n (int): keep only the top-n candidate tokens when pruning
        Raises:
            Exception: when the ctc decoder is not initialized
        """
        # Non-positive arguments mean "keep the current setting".
        if batch_size > 0:
            self.batch_size = batch_size
        if beam_size > 0:
            self.beam_size = beam_size
        if num_processes > 0:
            self.num_processes = num_processes
        if cutoff_prob > 0:
            self.cutoff_prob = cutoff_prob
        if cutoff_top_n > 0:
            self.cutoff_top_n = cutoff_top_n
        if self.beam_search_decoder is None:
            raise Exception(
                "You need to initialize the beam_search_decoder first")
        self.beam_search_decoder.reset_state(
            self.batch_size, self.beam_size, self.num_processes,
            self.cutoff_prob, self.cutoff_top_n)
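
    # For example, to shrink the batch to a single utterance while keeping the
    # other settings unchanged (illustrative value):
    #
    #   self.reset_decoder(batch_size=1)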
def del_decoder(self):
"""
Delete the decoder
"""
if self.beam_search_decoder is not None:
del self.beam_search_decoder
self.beam_search_decoder = None
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet (https://github.com/espnet/espnet)
"""Transformation module."""
import copy
import io
import logging
from collections import OrderedDict
from collections.abc import Sequence
from inspect import signature
import yaml
from ..utils.dynamic_import import dynamic_import
import_alias = dict(
identity="paddlespeech.s2t.transform.transform_interface:Identity",
time_warp="paddlespeech.s2t.transform.spec_augment:TimeWarp",
time_mask="paddlespeech.s2t.transform.spec_augment:TimeMask",
freq_mask="paddlespeech.s2t.transform.spec_augment:FreqMask",
spec_augment="paddlespeech.s2t.transform.spec_augment:SpecAugment",
speed_perturbation="paddlespeech.s2t.transform.perturb:SpeedPerturbation",
speed_perturbation_sox="paddlespeech.s2t.transform.perturb:SpeedPerturbationSox",
volume_perturbation="paddlespeech.s2t.transform.perturb:VolumePerturbation",
noise_injection="paddlespeech.s2t.transform.perturb:NoiseInjection",
bandpass_perturbation="paddlespeech.s2t.transform.perturb:BandpassPerturbation",
rir_convolve="paddlespeech.s2t.transform.perturb:RIRConvolve",
delta="paddlespeech.s2t.transform.add_deltas:AddDeltas",
cmvn="paddlespeech.s2t.transform.cmvn:CMVN",
utterance_cmvn="paddlespeech.s2t.transform.cmvn:UtteranceCMVN",
fbank="paddlespeech.s2t.transform.spectrogram:LogMelSpectrogram",
spectrogram="paddlespeech.s2t.transform.spectrogram:Spectrogram",
stft="paddlespeech.s2t.transform.spectrogram:Stft",
istft="paddlespeech.s2t.transform.spectrogram:IStft",
stft2fbank="paddlespeech.s2t.transform.spectrogram:Stft2LogMelSpectrogram",
wpe="paddlespeech.s2t.transform.wpe:WPE",
channel_selector="paddlespeech.s2t.transform.channel_selector:ChannelSelector",
fbank_kaldi="paddlespeech.s2t.transform.spectrogram:LogMelSpectrogramKaldi",
cmvn_json="paddlespeech.s2t.transform.cmvn:GlobalCMVN")
class Transformation():
"""Apply some functions to the mini-batch
Examples:
>>> kwargs = {"process": [{"type": "fbank",
... "n_mels": 80,
... "fs": 16000},
... {"type": "cmvn",
... "stats": "data/train/cmvn.ark",
... "norm_vars": True},
... {"type": "delta", "window": 2, "order": 2}]}
>>> transform = Transformation(kwargs)
        >>> import numpy as np
        >>> bs = 10
>>> xs = [np.random.randn(100, 80).astype(np.float32)
... for _ in range(bs)]
>>> xs = transform(xs)
"""
def __init__(self, conffile=None):
if conffile is not None:
if isinstance(conffile, dict):
self.conf = copy.deepcopy(conffile)
else:
with io.open(conffile, encoding="utf-8") as f:
self.conf = yaml.safe_load(f)
assert isinstance(self.conf, dict), type(self.conf)
else:
self.conf = {"mode": "sequential", "process": []}
self.functions = OrderedDict()
if self.conf.get("mode", "sequential") == "sequential":
for idx, process in enumerate(self.conf["process"]):
assert isinstance(process, dict), type(process)
opts = dict(process)
process_type = opts.pop("type")
class_obj = dynamic_import(process_type, import_alias)
# TODO(karita): assert issubclass(class_obj,
# TransformInterface)
try:
self.functions[idx] = class_obj(**opts)
except TypeError:
try:
signa = signature(class_obj)
except ValueError:
                        # Some callables, e.g. built-in functions, expose no signature
pass
else:
logging.error("Expected signature: {}({})".format(
class_obj.__name__, signa))
raise
else:
            raise NotImplementedError(
                "Unsupported mode: {}".format(self.conf["mode"]))
def __repr__(self):
rep = "\n" + "\n".join(" {}: {}".format(k, v)
for k, v in self.functions.items())
return "{}({})".format(self.__class__.__name__, rep)
def __call__(self, xs, uttid_list=None, **kwargs):
"""Return new mini-batch
:param Union[Sequence[np.ndarray], np.ndarray] xs:
:param Union[Sequence[str], str] uttid_list:
:return: batch:
:rtype: List[np.ndarray]
"""
if not isinstance(xs, Sequence):
is_batch = False
xs = [xs]
else:
is_batch = True
if isinstance(uttid_list, str):
uttid_list = [uttid_list for _ in range(len(xs))]
if self.conf.get("mode", "sequential") == "sequential":
for idx in range(len(self.conf["process"])):
func = self.functions[idx]
# TODO(karita): use TrainingTrans and UttTrans to check __call__ args
# Derive only the args which the func has
try:
param = signature(func).parameters
except ValueError:
                    # Some callables, e.g. built-in functions, expose no signature
param = {}
_kwargs = {k: v for k, v in kwargs.items() if k in param}
try:
if uttid_list is not None and "uttid" in param:
xs = [
func(x, u, **_kwargs)
for x, u in zip(xs, uttid_list)
]
else:
xs = [func(x, **_kwargs) for x in xs]
except Exception:
logging.fatal("Catch a exception from {}th func: {}".format(
idx, func))
raise
else:
            raise NotImplementedError(
                "Unsupported mode: {}".format(self.conf["mode"]))
if is_batch:
return xs
else:
return xs[0]
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# @File: main.py
# @Author: SWHL
# @Contact: liekkaskono@163.com
from deepspeech2 import ASRExecutor
config_path = 'resources/model.yaml'
model_path = 'resources/models/asr0_deepspeech2_online_aishell_ckpt_0.2.0.onnx'
lan_model_path = 'resources/models/language_model/zh_giga.no_cna_cmn.prune01244.klm'
wav_path = 'test_wav/zh.wav'
asr_executor = ASRExecutor(sample_rate=16000,
config_path=config_path,
onnx_path=model_path,
lan_model_path=lan_model_path)
text = asr_executor(audio_file=wav_path)
print('ASR Result: \t{}'.format(text))
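
# Assuming the resource files exist at the relative paths above, running
# `python main.py` from the repository root prints the recognized transcript
# for test_wav/zh.wav.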