Commit 39ac40a9 authored by chenzk's avatar chenzk
Browse files

v1.0

parents
Pipeline #2747 failed with stages
in 0 seconds
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch
src_ckpt = "/checkpoint/wnhsu/w2v/archived/hubert_base_ls960_it2.pt"
ref_ckpt = "/checkpoint/wnhsu/w2v/hubert_icassp_oss_v3/iter2_km100-400k-grp-L6/oss.km500_p0_1_s334.pmw1_0.puw0_0.grpnorm.ml10.mp0_8.untie.mxsz250000.ufreq1.maxtok1400000.MU100k.s1337.ngpu32/checkpoint_last.pt"
new_ckpt = "/checkpoint/wnhsu/w2v/archived/hubert_base_ls960_it2_updated.pt"
def update_state(state):
state["model"]["label_embs_concat"] = state["model"].pop("label_embs")
state["args"].task = "hubert_pretraining"
state["args"].labels = f"['{state['args'].labels}']"
return state
src_state = torch.load(src_ckpt)
src_state = update_state(src_state)
torch.save(src_state, new_ckpt)
## Pre-training Representations for Speaker Verification
### Pre-trained models
| Model | Fix pre-train | Vox1-O | Vox1-E | Vox1-H |
| ------------------------------------------------------------ | ------------- | --------- | --------- | -------- |
| [ECAPA-TDNN](https://drive.google.com/file/d/1kWmLyTGkBExTdxtwmrXoP4DhWz_7ZAv3/view?usp=sharing) | - | 1.080 | 1.200 | 2.127 |
| [HuBERT large](https://drive.google.com/file/d/1cQAPIzg8DJASZyAYdaBN0gRa8piPQTMo/view?usp=sharing) | Yes | 0.888 | 0.912 | 1.853 |
| [Wav2Vec2.0 (XLSR)](https://drive.google.com/file/d/1FiGokGtF2d7rkD9OpqLiQxKSqppTSXbl/view?usp=sharing) | Yes | 0.915 | 0.945 | 1.895 |
| [**UniSpeech-SAT large**](https://drive.google.com/file/d/1W6KRt5Ci2T7_xPVdlE3JGdQG2KTrZ750/view?usp=sharing) | Yes | 0.771 | 0.781 | 1.669 |
| [HuBERT large](https://drive.google.com/file/d/1nit9Z6RyM8Sdb3n8ccaglOQVNnqsjnui/view?usp=sharing) | No | 0.585 | 0.654 | 1.342 |
| [Wav2Vec2.0 (XLSR)](https://drive.google.com/file/d/1TgKro9pp197TCgIF__IlE_rMVQOk50Eb/view?usp=sharing) | No | 0.564 | 0.605 | 1.23 |
| [**UniSpeech-SAT large**](https://drive.google.com/file/d/10o6NHZsPXJn2k8n57e8Z_FkKh3V4TC3g/view?usp=sharing) | No | **0.564** | **0.561** | **1.23** |
### How to use?
#### Environment Setup
1. `pip install -r requirements.txt`
2. Install fairseq code
- For HuBERT_Large and Wav2Vec2.0 (XLSR), we should install the official [fairseq](https://github.com/pytorch/fairseq).
- For UniSpeech-SAT large, we should install the [Unispeech-SAT](https://github.com/microsoft/UniSpeech/tree/main/UniSpeech-SAT) fairseq code.
#### Example
Take `unispeech_sat ` and `ecapa_tdnn` for example:
1. First, you should download the pre-trained model in the above table to `checkpoint_path`.
2. Then, run the following codes:
- The wav files are sampled from [voxceleb1](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html).
```bash
python verification.py --model_name unispeech_sat --wav1 vox1_data/David_Faustino/hn8GyCJIfLM_0000012.wav --wav2 vox1_data/Josh_Gad/HXUqYaOwrxA_0000015.wav --checkpoint $checkpoint_path
# output: The similarity score between two audios is 0.0317 (-1.0, 1.0).
python verification.py --model_name unispeech_sat --wav1 vox1_data/David_Faustino/hn8GyCJIfLM_0000012.wav --wav2 vox1_data/David_Faustino/xTOk1Jz-F_g_0000015.wav --checkpoint --checkpoint $checkpoint_path
# output: The similarity score between two audios is 0.5389 (-1.0, 1.0).
python verification.py --model_name ecapa_tdnn --wav1 vox1_data/David_Faustino/hn8GyCJIfLM_0000012.wav --wav2 vox1_data/Josh_Gad/HXUqYaOwrxA_0000015.wav --checkpoint $checkpoint_path
# output: The similarity score between two audios is 0.2053 (-1.0, 1.0).
python verification.py --model_name ecapa_tdnn --wav1 vox1_data/David_Faustino/hn8GyCJIfLM_0000012.wav --wav2 vox1_data/David_Faustino/xTOk1Jz-F_g_0000015.wav --checkpoint --checkpoint $checkpoint_path
# output: he similarity score between two audios is 0.5302 (-1.0, 1.0).
```
# part of the code is borrowed from https://github.com/lawlict/ECAPA-TDNN
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio.transforms as trans
from .utils import UpstreamExpert
''' Res2Conv1d + BatchNorm1d + ReLU
'''
class Res2Conv1dReluBn(nn.Module):
'''
in_channels == out_channels == channels
'''
def __init__(self, channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=True, scale=4):
super().__init__()
assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
self.scale = scale
self.width = channels // scale
self.nums = scale if scale == 1 else scale - 1
self.convs = []
self.bns = []
for i in range(self.nums):
self.convs.append(nn.Conv1d(self.width, self.width, kernel_size, stride, padding, dilation, bias=bias))
self.bns.append(nn.BatchNorm1d(self.width))
self.convs = nn.ModuleList(self.convs)
self.bns = nn.ModuleList(self.bns)
def forward(self, x):
out = []
spx = torch.split(x, self.width, 1)
for i in range(self.nums):
if i == 0:
sp = spx[i]
else:
sp = sp + spx[i]
# Order: conv -> relu -> bn
sp = self.convs[i](sp)
sp = self.bns[i](F.relu(sp))
out.append(sp)
if self.scale != 1:
out.append(spx[self.nums])
out = torch.cat(out, dim=1)
return out
''' Conv1d + BatchNorm1d + ReLU
'''
class Conv1dReluBn(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=True):
super().__init__()
self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias)
self.bn = nn.BatchNorm1d(out_channels)
def forward(self, x):
return self.bn(F.relu(self.conv(x)))
''' The SE connection of 1D case.
'''
class SE_Connect(nn.Module):
def __init__(self, channels, se_bottleneck_dim=128):
super().__init__()
self.linear1 = nn.Linear(channels, se_bottleneck_dim)
self.linear2 = nn.Linear(se_bottleneck_dim, channels)
def forward(self, x):
out = x.mean(dim=2)
out = F.relu(self.linear1(out))
out = torch.sigmoid(self.linear2(out))
out = x * out.unsqueeze(2)
return out
''' SE-Res2Block of the ECAPA-TDNN architecture.
'''
# def SE_Res2Block(channels, kernel_size, stride, padding, dilation, scale):
# return nn.Sequential(
# Conv1dReluBn(channels, 512, kernel_size=1, stride=1, padding=0),
# Res2Conv1dReluBn(512, kernel_size, stride, padding, dilation, scale=scale),
# Conv1dReluBn(512, channels, kernel_size=1, stride=1, padding=0),
# SE_Connect(channels)
# )
class SE_Res2Block(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, scale, se_bottleneck_dim):
super().__init__()
self.Conv1dReluBn1 = Conv1dReluBn(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
self.Res2Conv1dReluBn = Res2Conv1dReluBn(out_channels, kernel_size, stride, padding, dilation, scale=scale)
self.Conv1dReluBn2 = Conv1dReluBn(out_channels, out_channels, kernel_size=1, stride=1, padding=0)
self.SE_Connect = SE_Connect(out_channels, se_bottleneck_dim)
self.shortcut = None
if in_channels != out_channels:
self.shortcut = nn.Conv1d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
)
def forward(self, x):
residual = x
if self.shortcut:
residual = self.shortcut(x)
x = self.Conv1dReluBn1(x)
x = self.Res2Conv1dReluBn(x)
x = self.Conv1dReluBn2(x)
x = self.SE_Connect(x)
return x + residual
''' Attentive weighted mean and standard deviation pooling.
'''
class AttentiveStatsPool(nn.Module):
def __init__(self, in_dim, attention_channels=128, global_context_att=False):
super().__init__()
self.global_context_att = global_context_att
# Use Conv1d with stride == 1 rather than Linear, then we don't need to transpose inputs.
if global_context_att:
self.linear1 = nn.Conv1d(in_dim * 3, attention_channels, kernel_size=1) # equals W and b in the paper
else:
self.linear1 = nn.Conv1d(in_dim, attention_channels, kernel_size=1) # equals W and b in the paper
self.linear2 = nn.Conv1d(attention_channels, in_dim, kernel_size=1) # equals V and k in the paper
def forward(self, x):
if self.global_context_att:
context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
context_std = torch.sqrt(torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
x_in = torch.cat((x, context_mean, context_std), dim=1)
else:
x_in = x
# DON'T use ReLU here! In experiments, I find ReLU hard to converge.
alpha = torch.tanh(self.linear1(x_in))
# alpha = F.relu(self.linear1(x_in))
alpha = torch.softmax(self.linear2(alpha), dim=2)
mean = torch.sum(alpha * x, dim=2)
residuals = torch.sum(alpha * (x ** 2), dim=2) - mean ** 2
std = torch.sqrt(residuals.clamp(min=1e-9))
return torch.cat([mean, std], dim=1)
class ECAPA_TDNN(nn.Module):
def __init__(self, feat_dim=80, channels=512, emb_dim=192, global_context_att=False,
feat_type='fbank', sr=16000, feature_selection="hidden_states", update_extract=False, config_path=None):
super().__init__()
self.feat_type = feat_type
self.feature_selection = feature_selection
self.update_extract = update_extract
self.sr = sr
if feat_type == "fbank" or feat_type == "mfcc":
self.update_extract = False
win_len = int(sr * 0.025)
hop_len = int(sr * 0.01)
if feat_type == 'fbank':
self.feature_extract = trans.MelSpectrogram(sample_rate=sr, n_fft=512, win_length=win_len,
hop_length=hop_len, f_min=0.0, f_max=sr // 2,
pad=0, n_mels=feat_dim)
elif feat_type == 'mfcc':
melkwargs = {
'n_fft': 512,
'win_length': win_len,
'hop_length': hop_len,
'f_min': 0.0,
'f_max': sr // 2,
'pad': 0
}
self.feature_extract = trans.MFCC(sample_rate=sr, n_mfcc=feat_dim, log_mels=False,
melkwargs=melkwargs)
else:
if config_path is None:
self.feature_extract = torch.hub.load('s3prl/s3prl', feat_type)
else:
self.feature_extract = UpstreamExpert(config_path)
if len(self.feature_extract.model.encoder.layers) == 24 and hasattr(self.feature_extract.model.encoder.layers[23].self_attn, "fp32_attention"):
self.feature_extract.model.encoder.layers[23].self_attn.fp32_attention = False
if len(self.feature_extract.model.encoder.layers) == 24 and hasattr(self.feature_extract.model.encoder.layers[11].self_attn, "fp32_attention"):
self.feature_extract.model.encoder.layers[11].self_attn.fp32_attention = False
self.feat_num = self.get_feat_num()
self.feature_weight = nn.Parameter(torch.zeros(self.feat_num))
if feat_type != 'fbank' and feat_type != 'mfcc':
freeze_list = ['final_proj', 'label_embs_concat', 'mask_emb', 'project_q', 'quantizer']
for name, param in self.feature_extract.named_parameters():
for freeze_val in freeze_list:
if freeze_val in name:
param.requires_grad = False
break
if not self.update_extract:
for param in self.feature_extract.parameters():
param.requires_grad = False
self.instance_norm = nn.InstanceNorm1d(feat_dim)
# self.channels = [channels] * 4 + [channels * 3]
self.channels = [channels] * 4 + [1536]
self.layer1 = Conv1dReluBn(feat_dim, self.channels[0], kernel_size=5, padding=2)
self.layer2 = SE_Res2Block(self.channels[0], self.channels[1], kernel_size=3, stride=1, padding=2, dilation=2, scale=8, se_bottleneck_dim=128)
self.layer3 = SE_Res2Block(self.channels[1], self.channels[2], kernel_size=3, stride=1, padding=3, dilation=3, scale=8, se_bottleneck_dim=128)
self.layer4 = SE_Res2Block(self.channels[2], self.channels[3], kernel_size=3, stride=1, padding=4, dilation=4, scale=8, se_bottleneck_dim=128)
# self.conv = nn.Conv1d(self.channels[-1], self.channels[-1], kernel_size=1)
cat_channels = channels * 3
self.conv = nn.Conv1d(cat_channels, self.channels[-1], kernel_size=1)
self.pooling = AttentiveStatsPool(self.channels[-1], attention_channels=128, global_context_att=global_context_att)
self.bn = nn.BatchNorm1d(self.channels[-1] * 2)
self.linear = nn.Linear(self.channels[-1] * 2, emb_dim)
def get_feat_num(self):
self.feature_extract.eval()
wav = [torch.randn(self.sr).to(next(self.feature_extract.parameters()).device)]
with torch.no_grad():
features = self.feature_extract(wav)
select_feature = features[self.feature_selection]
if isinstance(select_feature, (list, tuple)):
return len(select_feature)
else:
return 1
def get_feat(self, x):
if self.update_extract:
x = self.feature_extract([sample for sample in x])
else:
with torch.no_grad():
if self.feat_type == 'fbank' or self.feat_type == 'mfcc':
x = self.feature_extract(x) + 1e-6 # B x feat_dim x time_len
else:
x = self.feature_extract([sample for sample in x])
if self.feat_type == 'fbank':
x = x.log()
if self.feat_type != "fbank" and self.feat_type != "mfcc":
x = x[self.feature_selection]
if isinstance(x, (list, tuple)):
x = torch.stack(x, dim=0)
else:
x = x.unsqueeze(0)
norm_weights = F.softmax(self.feature_weight, dim=-1).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
x = (norm_weights * x).sum(dim=0)
x = torch.transpose(x, 1, 2) + 1e-6
x = self.instance_norm(x)
return x
def forward(self, x):
x = self.get_feat(x)
out1 = self.layer1(x)
out2 = self.layer2(out1)
out3 = self.layer3(out2)
out4 = self.layer4(out3)
out = torch.cat([out2, out3, out4], dim=1)
out = F.relu(self.conv(out))
out = self.bn(self.pooling(out))
out = self.linear(out)
return out
def ECAPA_TDNN_SMALL(feat_dim, emb_dim=256, feat_type='fbank', sr=16000, feature_selection="hidden_states", update_extract=False, config_path=None):
return ECAPA_TDNN(feat_dim=feat_dim, channels=512, emb_dim=emb_dim,
feat_type=feat_type, sr=sr, feature_selection=feature_selection, update_extract=update_extract, config_path=config_path)
if __name__ == '__main__':
x = torch.zeros(2, 32000)
model = ECAPA_TDNN_SMALL(feat_dim=768, emb_dim=256, feat_type='hubert_base', feature_selection="hidden_states",
update_extract=False)
out = model(x)
# print(model)
print(out.shape)
import torch
import fairseq
from packaging import version
import torch.nn.functional as F
from fairseq import tasks
from fairseq.checkpoint_utils import load_checkpoint_to_cpu
from fairseq.dataclass.utils import convert_namespace_to_omegaconf
from omegaconf import OmegaConf
from s3prl.upstream.interfaces import UpstreamBase
from torch.nn.utils.rnn import pad_sequence
def load_model(filepath):
state = torch.load(filepath, map_location=lambda storage, loc: storage)
# state = load_checkpoint_to_cpu(filepath)
state["cfg"] = OmegaConf.create(state["cfg"])
if "args" in state and state["args"] is not None:
cfg = convert_namespace_to_omegaconf(state["args"])
elif "cfg" in state and state["cfg"] is not None:
cfg = state["cfg"]
else:
raise RuntimeError(
f"Neither args nor cfg exist in state keys = {state.keys()}"
)
task = tasks.setup_task(cfg.task)
if "task_state" in state:
task.load_state_dict(state["task_state"])
model = task.build_model(cfg.model)
return model, cfg, task
###################
# UPSTREAM EXPERT #
###################
class UpstreamExpert(UpstreamBase):
def __init__(self, ckpt, **kwargs):
super().__init__(**kwargs)
assert version.parse(fairseq.__version__) > version.parse(
"0.10.2"
), "Please install the fairseq master branch."
model, cfg, task = load_model(ckpt)
self.model = model
self.task = task
if len(self.hooks) == 0:
module_name = "self.model.encoder.layers"
for module_id in range(len(eval(module_name))):
self.add_hook(
f"{module_name}[{module_id}]",
lambda input, output: input[0].transpose(0, 1),
)
self.add_hook("self.model.encoder", lambda input, output: output[0])
def forward(self, wavs):
if self.task.cfg.normalize:
wavs = [F.layer_norm(wav, wav.shape) for wav in wavs]
device = wavs[0].device
wav_lengths = torch.LongTensor([len(wav) for wav in wavs]).to(device)
wav_padding_mask = ~torch.lt(
torch.arange(max(wav_lengths)).unsqueeze(0).to(device),
wav_lengths.unsqueeze(1),
)
padded_wav = pad_sequence(wavs, batch_first=True)
features, feat_padding_mask = self.model.extract_features(
padded_wav,
padding_mask=wav_padding_mask,
mask=None,
)
return {
"default": features,
}
import soundfile as sf
import torch
import fire
import torch.nn.functional as F
from torchaudio.functional import resample
from models.ecapa_tdnn import ECAPA_TDNN_SMALL
MODEL_LIST = ['ecapa_tdnn', 'hubert_large', 'wav2vec2_xlsr', 'unispeech_sat']
def init_model(model_name, checkpoint=None):
if model_name == 'unispeech_sat':
config_path = 'config/unispeech_sat.th'
model = ECAPA_TDNN_SMALL(feat_dim=1024, feat_type='unispeech_sat', config_path=config_path)
elif model_name == 'hubert_large':
config_path = None
model = ECAPA_TDNN_SMALL(feat_dim=1024, feat_type='hubert_large_ll60k', config_path=config_path)
elif model_name == 'wav2vec2_xlsr':
config_path = None
model = ECAPA_TDNN_SMALL(feat_dim=1024, feat_type='wav2vec2_xlsr', config_path=config_path)
else:
model = ECAPA_TDNN_SMALL(feat_dim=40, feat_type='fbank')
if checkpoint is not None:
state_dict = torch.load(checkpoint, map_location=lambda storage, loc: storage)
model.load_state_dict(state_dict['model'], strict=False)
return model
def verification(model_name, wav1, wav2, use_gpu=True, checkpoint=None):
assert model_name in MODEL_LIST, 'The model_name should be in {}'.format(MODEL_LIST)
model = init_model(model_name, checkpoint)
wav1, sr1 = sf.read(wav1)
wav2, sr2 = sf.read(wav2)
wav1 = torch.from_numpy(wav1).unsqueeze(0).float()
wav2 = torch.from_numpy(wav2).unsqueeze(0).float()
wav1 = resample(wav1, orig_freq=sr1, new_freq=16000)
wav2 = resample(wav2, orig_freq=sr2, new_freq=16000)
if use_gpu:
model = model.cuda()
wav1 = wav1.cuda()
wav2 = wav2.cuda()
model.eval()
with torch.no_grad():
emb1 = model(wav1)
emb2 = model(wav2)
sim = F.cosine_similarity(emb1, emb2)
print("The similarity score between two audios is {:.4f} (-1.0, 1.0).".format(sim[0].item()))
if __name__ == "__main__":
fire.Fire(verification)
# encoding: utf-8
import editdistance
import argparse
import re
def compute_wer_txt(hypo_f, ref_f):
f = open(hypo_f)
hypos = f.readlines()
f.close()
f = open(ref_f)
refs = f.readlines()
f.close()
err,cnt = 0,0
print("Start Computing WER")
for i in range(len(hypos)):
hypo = hypos[i].strip().split()[:-1]
ref = refs[i].strip().split()[:-1]
e = editdistance.eval(ref, hypo)
#print(ref_token,predict_token,e/float(len(ref_token)+0.001))
err += e
cnt += len(ref)
print(err,cnt,float(float(err+0.0) / float(cnt+0.01)*100))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--hypo', '--h', type=str, required=True,
help='Input file path')
parser.add_argument('--ref', '--r', type=str, required=True)
args = parser.parse_args()
compute_wer_txt(args.hypo, args.ref)
#!/usr/bin/env python3 -u
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Run inference for pre-processed data with a trained model.
"""
import ast
import logging
import math
import os
import sys
sys.path.append(os.getcwd())
import editdistance
import numpy as np
import torch
from fairseq import checkpoint_utils, options, progress_bar, tasks, utils
from fairseq.data.data_utils import post_process
from fairseq.dataclass.utils import convert_namespace_to_omegaconf
from fairseq.logging.meters import StopwatchMeter, TimeMeter
logging.basicConfig()
logging.root.setLevel(logging.INFO)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def add_asr_eval_argument(parser):
parser.add_argument("--kspmodel", default=None, help="sentence piece model")
parser.add_argument(
"--wfstlm", default=None, help="wfstlm on dictonary output units"
)
parser.add_argument(
"--rnnt_decoding_type",
default="greedy",
help="wfstlm on dictonary\
output units",
)
try:
parser.add_argument(
"--lm-weight",
"--lm_weight",
type=float,
default=0.2,
help="weight for lm while interpolating with neural score",
)
except:
pass
parser.add_argument(
"--rnnt_len_penalty", default=-0.5, help="rnnt length penalty on word level"
)
parser.add_argument(
"--w2l-decoder",
choices=["viterbi", "kenlm", "fairseqlm", "transducer"],
help="use a w2l decoder",
)
parser.add_argument("--lexicon", help="lexicon for w2l decoder")
parser.add_argument("--unit-lm", action="store_true", help="if using a unit lm")
parser.add_argument("--kenlm-model", "--lm-model", help="lm model for w2l decoder")
parser.add_argument("--beam-threshold", type=float, default=25.0)
parser.add_argument("--beam-size-token", type=float, default=100)
parser.add_argument("--word-score", type=float, default=1.0)
parser.add_argument("--unk-weight", type=float, default=-math.inf)
parser.add_argument("--sil-weight", type=float, default=0.0)
parser.add_argument(
"--dump-emissions",
type=str,
default=None,
help="if present, dumps emissions into this file and exits",
)
parser.add_argument(
"--dump-features",
type=str,
default=None,
help="if present, dumps features into this file and exits",
)
parser.add_argument(
"--load-emissions",
type=str,
default=None,
help="if present, loads emissions from this file",
)
return parser
def check_args(args):
# assert args.path is not None, "--path required for generation!"
# assert args.results_path is not None, "--results_path required for generation!"
assert (
not args.sampling or args.nbest == args.beam
), "--sampling requires --nbest to be equal to --beam"
assert (
args.replace_unk is None or args.raw_text
), "--replace-unk requires a raw text dataset (--raw-text)"
def get_dataset_itr(args, task, models):
return task.get_batch_iterator(
dataset=task.dataset(args.gen_subset),
max_tokens=args.max_tokens,
max_sentences=args.batch_size,
max_positions=(sys.maxsize, sys.maxsize),
ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
required_batch_size_multiple=args.required_batch_size_multiple,
num_shards=args.num_shards,
shard_id=args.shard_id,
num_workers=args.num_workers,
data_buffer_size=args.data_buffer_size,
).next_epoch_itr(shuffle=False)
def process_predictions(
args, hypos, sp, tgt_dict, target_tokens, res_files, speaker, id
):
for hypo in hypos[: min(len(hypos), args.nbest)]:
hyp_pieces = tgt_dict.string(hypo["tokens"])
if "words" in hypo:
hyp_words = " ".join(hypo["words"])
else:
hyp_words = post_process(hyp_pieces, args.post_process)
if res_files is not None:
print(
"{} ({}-{})".format(hyp_pieces, speaker, id),
file=res_files["hypo.units"],
)
print(
"{} ({}-{})".format(hyp_words, speaker, id),
file=res_files["hypo.words"],
)
tgt_pieces = tgt_dict.string(target_tokens)
tgt_words = post_process(tgt_pieces, args.post_process)
if res_files is not None:
print(
"{} ({}-{})".format(tgt_pieces, speaker, id),
file=res_files["ref.units"],
)
print(
"{} ({}-{})".format(tgt_words, speaker, id), file=res_files["ref.words"]
)
if not args.quiet:
logger.info("HYPO:" + hyp_words)
logger.info("TARGET:" + tgt_words)
logger.info("___________________")
hyp_words = hyp_words.split()
tgt_words = tgt_words.split()
return editdistance.eval(hyp_words, tgt_words), len(tgt_words)
def prepare_result_files(args):
def get_res_file(file_prefix):
if args.num_shards > 1:
file_prefix = f"{args.shard_id}_{file_prefix}"
path = os.path.join(
args.results_path,
"{}-{}-{}.txt".format(
file_prefix, os.path.basename(args.path), args.gen_subset
),
)
return open(path, "w", buffering=1)
if not args.results_path:
return None
return {
"hypo.words": get_res_file("hypo.word"),
"hypo.units": get_res_file("hypo.units"),
"ref.words": get_res_file("ref.word"),
"ref.units": get_res_file("ref.units"),
}
def load_models_and_criterions(
filenames, data_path, arg_overrides=None, task=None, model_state=None
):
models = []
criterions = []
if arg_overrides is None:
arg_overrides = {}
arg_overrides["wer_args"] = None
arg_overrides["data"] = data_path
if filenames is None:
assert model_state is not None
filenames = [0]
else:
filenames = filenames.split(":")
for filename in filenames:
if model_state is None:
if not os.path.exists(filename):
raise IOError("Model file not found: {}".format(filename))
state = checkpoint_utils.load_checkpoint_to_cpu(filename, arg_overrides)
else:
state = model_state
if "cfg" in state:
cfg = state["cfg"]
else:
cfg = convert_namespace_to_omegaconf(state["args"])
cfg.task._name = "audio_pretraining"
cfg.task.task = "audio_pretraining"
cfg.task.sample_rate = 16000
if task is None:
cfg.task.data = data_path
task = tasks.setup_task(cfg.task)
model = task.build_model(cfg.model)
model.load_state_dict(state["model"], strict=True)
models.append(model)
criterion = task.build_criterion(cfg.criterion)
if "criterion" in state:
criterion.load_state_dict(state["criterion"], strict=True)
criterions.append(criterion)
return models, criterions, task
def optimize_models(args, use_cuda, models):
"""Optimize ensemble for generation"""
for model in models:
model.make_generation_fast_(
beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
need_attn=args.print_alignment,
)
if args.fp16:
model.half()
if use_cuda:
model.cuda()
class ExistingEmissionsDecoder(object):
def __init__(self, decoder, emissions):
self.decoder = decoder
self.emissions = emissions
def generate(self, models, sample, **unused):
ids = sample["id"].cpu().numpy()
try:
emissions = np.stack(self.emissions[ids])
except:
print([x.shape for x in self.emissions[ids]])
raise Exception("invalid sizes")
emissions = torch.from_numpy(emissions)
return self.decoder.decode(emissions)
def main(args, task=None, model_state=None):
check_args(args)
if args.max_tokens is None and args.batch_size is None:
args.max_tokens = 4000000
logger.info(args)
use_cuda = torch.cuda.is_available() and not args.cpu
logger.info("| decoding with criterion {}".format(args.criterion))
# Load ensemble
if args.load_emissions:
models, criterions = [], []
task.load_dataset(args.gen_subset)
else:
logger.info("| loading model(s) from {}".format(args.path))
model_overrides = ast.literal_eval(args.model_overrides)
model_overrides["wer_args"] = None
model_overrides["data"] = args.data
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
utils.split_paths(args.path, separator="\\"),
arg_overrides=model_overrides,
task=task,
suffix=args.checkpoint_suffix,
strict=(args.checkpoint_shard_count == 1),
num_shards=args.checkpoint_shard_count,
state=model_state,
)
optimize_models(args, use_cuda, models)
task.load_dataset(args.gen_subset, task_cfg=saved_cfg.task)
# Load dataset splits
task.load_dataset(args.gen_subset)
# Set dictionary
tgt_dict = task.target_dictionary
logger.info(
"| {} {} {} examples".format(
args.data, args.gen_subset, len(task.dataset(args.gen_subset))
)
)
# hack to pass transitions to W2lDecoder
if args.criterion == "asg_loss":
raise NotImplementedError("asg_loss is currently not supported")
# trans = criterions[0].asg.trans.data
# args.asg_transitions = torch.flatten(trans).tolist()
# Load dataset (possibly sharded)
itr = get_dataset_itr(args, task, models)
# Initialize generator
gen_timer = StopwatchMeter()
def build_generator(args):
w2l_decoder = getattr(args, "w2l_decoder", None)
if w2l_decoder == "viterbi":
from examples.speech_recognition.w2l_decoder import W2lViterbiDecoder
return W2lViterbiDecoder(args, task.target_dictionary)
elif w2l_decoder == "transducer":
from examples.speech_recognition.w2l_decoder import TransducerDecoder
return TransducerDecoder(args, task.target_dictionary)
elif w2l_decoder == "kenlm":
from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder
return W2lKenLMDecoder(args, task.target_dictionary)
elif w2l_decoder == "fairseqlm":
from examples.speech_recognition.w2l_decoder import W2lFairseqLMDecoder
return W2lFairseqLMDecoder(args, task.target_dictionary)
else:
print(
"only wav2letter decoders with (viterbi, kenlm, fairseqlm) options are supported at the moment"
)
# please do not touch this unless you test both generate.py and infer.py with audio_pretraining task
generator = build_generator(args)
if args.load_emissions:
generator = ExistingEmissionsDecoder(
generator, np.load(args.load_emissions, allow_pickle=True)
)
logger.info("loaded emissions from " + args.load_emissions)
num_sentences = 0
if args.results_path is not None and not os.path.exists(args.results_path):
os.makedirs(args.results_path)
max_source_pos = (
utils.resolve_max_positions(
task.max_positions(), *[model.max_positions() for model in models]
),
)
if max_source_pos is not None:
max_source_pos = max_source_pos[0]
if max_source_pos is not None:
max_source_pos = max_source_pos[0] - 1
if args.dump_emissions:
emissions = {}
if args.dump_features:
features = {}
models[0].bert.proj = None
else:
res_files = prepare_result_files(args)
errs_t = 0
lengths_t = 0
with progress_bar.build_progress_bar(args, itr) as t:
wps_meter = TimeMeter()
for sample in t:
sample = utils.move_to_cuda(sample) if use_cuda else sample
if "net_input" not in sample:
continue
prefix_tokens = None
if args.prefix_size > 0:
prefix_tokens = sample["target"][:, : args.prefix_size]
gen_timer.start()
if args.dump_emissions:
with torch.no_grad():
encoder_out = models[0](**sample["net_input"])
emm = models[0].get_normalized_probs(encoder_out, log_probs=True)
emm = emm.transpose(0, 1).cpu().numpy()
for i, id in enumerate(sample["id"]):
emissions[id.item()] = emm[i]
continue
elif args.dump_features:
with torch.no_grad():
encoder_out = models[0](**sample["net_input"])
feat = encoder_out["encoder_out"].transpose(0, 1).cpu().numpy()
for i, id in enumerate(sample["id"]):
padding = (
encoder_out["encoder_padding_mask"][i].cpu().numpy()
if encoder_out["encoder_padding_mask"] is not None
else None
)
features[id.item()] = (feat[i], padding)
continue
hypos = task.inference_step(generator, models, sample, prefix_tokens)
num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos)
gen_timer.stop(num_generated_tokens)
for i, sample_id in enumerate(sample["id"].tolist()):
speaker = None
# id = task.dataset(args.gen_subset).ids[int(sample_id)]
id = sample_id
toks = (
sample["target"][i, :]
if "target_label" not in sample
else sample["target_label"][i, :]
)
target_tokens = utils.strip_pad(toks, tgt_dict.pad()).int().cpu()
# Process top predictions
errs, length = process_predictions(
args,
hypos[i],
None,
tgt_dict,
target_tokens,
res_files,
speaker,
id,
)
errs_t += errs
lengths_t += length
wps_meter.update(num_generated_tokens)
t.log({"wps": round(wps_meter.avg)})
num_sentences += (
sample["nsentences"] if "nsentences" in sample else sample["id"].numel()
)
wer = None
if args.dump_emissions:
emm_arr = []
for i in range(len(emissions)):
emm_arr.append(emissions[i])
np.save(args.dump_emissions, emm_arr)
logger.info(f"saved {len(emissions)} emissions to {args.dump_emissions}")
elif args.dump_features:
feat_arr = []
for i in range(len(features)):
feat_arr.append(features[i])
np.save(args.dump_features, feat_arr)
logger.info(f"saved {len(features)} emissions to {args.dump_features}")
else:
if lengths_t > 0:
wer = errs_t * 100.0 / lengths_t
logger.info(f"WER: {wer}")
logger.info(
"| Processed {} sentences ({} tokens) in {:.1f}s ({:.2f}"
"sentences/s, {:.2f} tokens/s)".format(
num_sentences,
gen_timer.n,
gen_timer.sum,
num_sentences / gen_timer.sum,
1.0 / gen_timer.avg,
)
)
logger.info("| Generate {} with beam={}".format(args.gen_subset, args.beam))
return task, wer
def make_parser():
parser = options.get_generation_parser()
parser = add_asr_eval_argument(parser)
return parser
def cli_main():
parser = make_parser()
args = options.parse_args_and_arch(parser)
main(args)
if __name__ == "__main__":
cli_main()
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Wav2letter decoders.
"""
import gc
import pdb
import itertools as it
import os.path as osp
import warnings
from collections import deque, namedtuple
import numpy as np
import torch
from fairseq import tasks
from fairseq.utils import apply_to_sample
from omegaconf import open_dict
from fairseq.dataclass.utils import convert_namespace_to_omegaconf
try:
from flashlight.lib.text.dictionary import create_word_dict, load_words
from flashlight.lib.sequence.criterion import CpuViterbiPath, get_data_ptr_as_bytes
from flashlight.lib.text.decoder import (
CriterionType,
LexiconDecoderOptions,
KenLM,
LM,
LMState,
SmearingMode,
Trie,
LexiconDecoder,
)
except:
warnings.warn(
"flashlight python bindings are required to use this functionality. Please install from https://github.com/facebookresearch/flashlight/tree/master/bindings/python"
)
LM = object
LMState = object
class W2lDecoder(object):
def __init__(self, args, tgt_dict):
self.tgt_dict = tgt_dict
self.vocab_size = len(tgt_dict)
self.nbest = args.nbest
# criterion-specific init
if args.criterion == "ctc" or args.criterion == "transducer":
self.criterion_type = CriterionType.CTC
self.blank = (
tgt_dict.index("<ctc_blank>")
if "<ctc_blank>" in tgt_dict.indices
else tgt_dict.bos()
)
if "<sep>" in tgt_dict.indices:
self.silence = tgt_dict.index("<sep>")
elif "|" in tgt_dict.indices:
self.silence = tgt_dict.index("|")
else:
self.silence = tgt_dict.eos()
self.asg_transitions = None
elif args.criterion == "asg_loss":
self.criterion_type = CriterionType.ASG
self.blank = -1
self.asg_transitions = args.asg_transitions
self.max_replabel = args.max_replabel
assert len(self.asg_transitions) == self.vocab_size ** 2
else:
raise RuntimeError(f"unknown criterion: {args.criterion}")
def generate(self, models, sample, **unused):
"""Generate a batch of inferences."""
# model.forward normally channels prev_output_tokens into the decoder
# separately, but SequenceGenerator directly calls model.encoder
encoder_input = {
k: v for k, v in sample["net_input"].items() if k != "prev_output_tokens"
}
emissions = self.get_emissions(models, encoder_input)
return self.decode(emissions)
def get_emissions(self, models, encoder_input):
"""Run encoder and normalize emissions"""
# encoder_out = models[0].encoder(**encoder_input)
encoder_out = models[0](**encoder_input)
if self.criterion_type == CriterionType.CTC:
emissions = models[0].get_normalized_probs(encoder_out, log_probs=True)
elif self.criterion_type == CriterionType.ASG:
emissions = encoder_out["encoder_out"]
return emissions.transpose(0, 1).float().cpu().contiguous()
def get_tokens(self, idxs):
"""Normalize tokens by handling CTC blank, ASG replabels, etc."""
idxs = (g[0] for g in it.groupby(idxs))
idxs = filter(lambda x: x != self.blank, idxs)
return torch.LongTensor(list(idxs))
class W2lViterbiDecoder(W2lDecoder):
def __init__(self, args, tgt_dict):
super().__init__(args, tgt_dict)
def decode(self, emissions):
"""
B, T, N = emissions.size()
results = []
for b in range(B):
am_scores = emissions[b]
tokens = am_scores.argmax(dim=-1)
ids = []
for i in range(T):
if tokens[i].item() != self.tgt_dict.bos():
if i == 0:
ids.append(tokens[i].item())
elif tokens[i].item() != tokens[i-1].item():
ids.append(tokens[i].item())
results.append([{"tokens": ids, "score": 0.0}])
return results
"""
B, T, N = emissions.size()
hypos = []
if self.asg_transitions is None:
transitions = torch.FloatTensor(N, N).zero_()
else:
transitions = torch.FloatTensor(self.asg_transitions).view(N, N)
viterbi_path = torch.IntTensor(B, T)
workspace = torch.ByteTensor(CpuViterbiPath.get_workspace_size(B, T, N))
CpuViterbiPath.compute(
B,
T,
N,
get_data_ptr_as_bytes(emissions),
get_data_ptr_as_bytes(transitions),
get_data_ptr_as_bytes(viterbi_path),
get_data_ptr_as_bytes(workspace),
)
return [
[{"tokens": self.get_tokens(viterbi_path[b].tolist()), "score": 0}]
for b in range(B)
]
class TransducerDecoder(W2lDecoder):
def __init__(self, args, tgt_dict):
super().__init__(args, tgt_dict)
def generate(self, models, sample, **unused):
encoder_input = {
k: v for k, v in sample["net_input"].items() if k != "prev_output_tokens"
}
encoder_out = models[0].encoder(**encoder_input, tbc=False)
ret = []
for i in range(len(encoder_out["encoder_out"])):
seq_len = ~encoder_out["padding_mask"][i].sum()
hs = encoder_out["encoder_out"][i][:seq_len]
ret.append(models[0].decoder.beam_search(hs, prefix=True, beam_size=5))
return ret
class W2lKenLMDecoder(W2lDecoder):
def __init__(self, args, tgt_dict):
super().__init__(args, tgt_dict)
self.unit_lm = getattr(args, "unit_lm", False)
if args.lexicon:
self.lexicon = load_words(args.lexicon)
self.word_dict = create_word_dict(self.lexicon)
self.unk_word = self.word_dict.get_index("<unk>")
self.lm = KenLM(args.kenlm_model, self.word_dict)
self.trie = Trie(self.vocab_size, self.silence)
start_state = self.lm.start(False)
for i, (word, spellings) in enumerate(self.lexicon.items()):
word_idx = self.word_dict.get_index(word)
_, score = self.lm.score(start_state, word_idx)
# spelling_idxs = [tgt_dict.index(w.upper()) for w in word]
# spelling_idxs.append(4)
# self.trie.insert(spelling_idxs, word_idx, score)
for spelling in spellings:
spelling_idxs = [tgt_dict.index(token) for token in spelling]
assert (
tgt_dict.unk() not in spelling_idxs
), f"{spelling} {spelling_idxs}"
self.trie.insert(spelling_idxs, word_idx, score)
self.trie.smear(SmearingMode.MAX)
self.decoder_opts = LexiconDecoderOptions(
beam_size=args.beam,
beam_size_token=int(getattr(args, "beam_size_token", len(tgt_dict))),
beam_threshold=args.beam_threshold,
lm_weight=args.lm_weight,
word_score=args.word_score,
unk_score=args.unk_weight,
sil_score=args.sil_weight,
log_add=False,
criterion_type=self.criterion_type,
)
if self.asg_transitions is None:
N = 768
# self.asg_transitions = torch.FloatTensor(N, N).zero_()
self.asg_transitions = []
self.decoder = LexiconDecoder(
self.decoder_opts,
self.trie,
self.lm,
self.silence,
self.blank,
self.unk_word,
self.asg_transitions,
self.unit_lm,
)
else:
assert args.unit_lm, "lexicon free decoding can only be done with a unit language model"
from flashlight.lib.text.decoder import LexiconFreeDecoder, LexiconFreeDecoderOptions
d = {w: [[w]] for w in tgt_dict.symbols}
self.word_dict = create_word_dict(d)
self.lm = KenLM(args.kenlm_model, self.word_dict)
self.decoder_opts = LexiconFreeDecoderOptions(
beam_size=args.beam,
beam_size_token=int(getattr(args, "beam_size_token", len(tgt_dict))),
beam_threshold=args.beam_threshold,
lm_weight=args.lm_weight,
sil_score=args.sil_weight,
log_add=False,
criterion_type=self.criterion_type,
)
self.decoder = LexiconFreeDecoder(
self.decoder_opts, self.lm, self.silence, self.blank, []
)
def get_timesteps(self, token_idxs):
"""Returns frame numbers corresponding to every non-blank token.
Parameters
----------
token_idxs : List[int]
IDs of decoded tokens.
Returns
-------
List[int]
Frame numbers corresponding to every non-blank token.
"""
timesteps = []
for i, token_idx in enumerate(token_idxs):
if token_idx == self.blank:
continue
if i == 0 or token_idx != token_idxs[i - 1]:
timesteps.append(i)
return timesteps
def decode(self, emissions):
B, T, N = emissions.size()
hypos = []
for b in range(B):
emissions_ptr = emissions.data_ptr() + 4 * b * emissions.stride(0)
results = self.decoder.decode(emissions_ptr, T, N)
nbest_results = results[: self.nbest]
hypos.append(
[
{
"tokens": self.get_tokens(result.tokens),
"score": result.score,
"timesteps": self.get_timesteps(result.tokens),
"words": [
self.word_dict.get_entry(x) for x in result.words if x >= 0
],
}
for result in nbest_results
]
)
return hypos
FairseqLMState = namedtuple("FairseqLMState", ["prefix", "incremental_state", "probs"])
class FairseqLM(LM):
def __init__(self, dictionary, model):
LM.__init__(self)
self.dictionary = dictionary
self.model = model
self.unk = self.dictionary.unk()
self.save_incremental = False # this currently does not work properly
self.max_cache = 20_000
model.cuda()
model.eval()
model.make_generation_fast_()
self.states = {}
self.stateq = deque()
def start(self, start_with_nothing):
state = LMState()
prefix = torch.LongTensor([[self.dictionary.eos()]])
incremental_state = {} if self.save_incremental else None
with torch.no_grad():
res = self.model(prefix.cuda(), incremental_state=incremental_state)
probs = self.model.get_normalized_probs(res, log_probs=True, sample=None)
if incremental_state is not None:
incremental_state = apply_to_sample(lambda x: x.cpu(), incremental_state)
self.states[state] = FairseqLMState(
prefix.numpy(), incremental_state, probs[0, -1].cpu().numpy()
)
self.stateq.append(state)
return state
def score(self, state: LMState, token_index: int, no_cache: bool = False):
"""
Evaluate language model based on the current lm state and new word
Parameters:
-----------
state: current lm state
token_index: index of the word
(can be lexicon index then you should store inside LM the
mapping between indices of lexicon and lm, or lm index of a word)
Returns:
--------
(LMState, float): pair of (new state, score for the current word)
"""
curr_state = self.states[state]
def trim_cache(targ_size):
while len(self.stateq) > targ_size:
rem_k = self.stateq.popleft()
rem_st = self.states[rem_k]
rem_st = FairseqLMState(rem_st.prefix, None, None)
self.states[rem_k] = rem_st
if curr_state.probs is None:
new_incremental_state = (
curr_state.incremental_state.copy()
if curr_state.incremental_state is not None
else None
)
with torch.no_grad():
if new_incremental_state is not None:
new_incremental_state = apply_to_sample(
lambda x: x.cuda(), new_incremental_state
)
elif self.save_incremental:
new_incremental_state = {}
res = self.model(
torch.from_numpy(curr_state.prefix).cuda(),
incremental_state=new_incremental_state,
)
probs = self.model.get_normalized_probs(
res, log_probs=True, sample=None
)
if new_incremental_state is not None:
new_incremental_state = apply_to_sample(
lambda x: x.cpu(), new_incremental_state
)
curr_state = FairseqLMState(
curr_state.prefix, new_incremental_state, probs[0, -1].cpu().numpy()
)
if not no_cache:
self.states[state] = curr_state
self.stateq.append(state)
score = curr_state.probs[token_index].item()
trim_cache(self.max_cache)
outstate = state.child(token_index)
if outstate not in self.states and not no_cache:
prefix = np.concatenate(
[curr_state.prefix, torch.LongTensor([[token_index]])], -1
)
incr_state = curr_state.incremental_state
self.states[outstate] = FairseqLMState(prefix, incr_state, None)
if token_index == self.unk:
score = float("-inf")
return outstate, score
def finish(self, state: LMState):
"""
Evaluate eos for language model based on the current lm state
Returns:
--------
(LMState, float): pair of (new state, score for the current word)
"""
return self.score(state, self.dictionary.eos())
def empty_cache(self):
self.states = {}
self.stateq = deque()
gc.collect()
class W2lFairseqLMDecoder(W2lDecoder):
def __init__(self, args, tgt_dict):
super().__init__(args, tgt_dict)
self.unit_lm = getattr(args, "unit_lm", False)
self.lexicon = load_words(args.lexicon) if args.lexicon else None
self.idx_to_wrd = {}
checkpoint = torch.load(args.kenlm_model, map_location="cpu")
if "cfg" in checkpoint and checkpoint["cfg"] is not None:
lm_args = checkpoint["cfg"]
else:
lm_args = convert_namespace_to_omegaconf(checkpoint["args"])
with open_dict(lm_args.task):
lm_args.task.data = osp.dirname(args.kenlm_model)
task = tasks.setup_task(lm_args.task)
model = task.build_model(lm_args.model)
model.load_state_dict(checkpoint["model"], strict=False)
self.trie = Trie(self.vocab_size, self.silence)
self.word_dict = task.dictionary
self.unk_word = self.word_dict.unk()
self.lm = FairseqLM(self.word_dict, model)
if self.lexicon:
start_state = self.lm.start(False)
for i, (word, spellings) in enumerate(self.lexicon.items()):
if self.unit_lm:
word_idx = i
self.idx_to_wrd[i] = word
score = 0
else:
word_idx = self.word_dict.index(word)
_, score = self.lm.score(start_state, word_idx, no_cache=True)
for spelling in spellings:
spelling_idxs = [tgt_dict.index(token.upper()) for token in spelling]
assert (
tgt_dict.unk() not in spelling_idxs
), f"{spelling} {spelling_idxs}"
self.trie.insert(spelling_idxs, word_idx, score)
self.trie.smear(SmearingMode.MAX)
self.decoder_opts = LexiconDecoderOptions(
beam_size=args.beam,
beam_size_token=int(getattr(args, "beam_size_token", len(tgt_dict))),
beam_threshold=args.beam_threshold,
lm_weight=args.lm_weight,
word_score=args.word_score,
unk_score=args.unk_weight,
sil_score=args.sil_weight,
log_add=False,
criterion_type=self.criterion_type,
)
self.decoder = LexiconDecoder(
self.decoder_opts,
self.trie,
self.lm,
self.silence,
self.blank,
self.unk_word,
[],
self.unit_lm,
)
else:
assert args.unit_lm, "lexicon free decoding can only be done with a unit language model"
from flashlight.lib.text.decoder import LexiconFreeDecoder, LexiconFreeDecoderOptions
d = {w: [[w]] for w in tgt_dict.symbols}
self.word_dict = create_word_dict(d)
self.lm = KenLM(args.kenlm_model, self.word_dict)
self.decoder_opts = LexiconFreeDecoderOptions(
beam_size=args.beam,
beam_size_token=int(getattr(args, "beam_size_token", len(tgt_dict))),
beam_threshold=args.beam_threshold,
lm_weight=args.lm_weight,
sil_score=args.sil_weight,
log_add=False,
criterion_type=self.criterion_type,
)
self.decoder = LexiconFreeDecoder(
self.decoder_opts, self.lm, self.silence, self.blank, []
)
def decode(self, emissions):
B, T, N = emissions.size()
hypos = []
def idx_to_word(idx):
if self.unit_lm:
return self.idx_to_wrd[idx]
else:
return self.word_dict[idx]
def make_hypo(result):
hypo = {"tokens": self.get_tokens(result.tokens), "score": result.score}
if self.lexicon:
hypo["words"] = [idx_to_word(x) for x in result.words if x >= 0]
return hypo
for b in range(B):
emissions_ptr = emissions.data_ptr() + 4 * b * emissions.stride(0)
results = self.decoder.decode(emissions_ptr, T, N)
nbest_results = results[: self.nbest]
hypos.append([make_hypo(result) for result in nbest_results])
self.lm.empty_cache()
return hypos
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import argparse
import os
import librosa
import soundfile as sf
from pydub import AudioSegment
def get_parser():
parser = argparse.ArgumentParser()
parser.add_argument(
'--wav-path', type=str
)
parser.add_argument(
'--dest-path', type=str
)
parser.add_argument(
'--input', type=str
)
parser.add_argument(
'--output', type=str
)
return parser
def main(args):
os.makedirs(args.dest_path, exist_ok=True)
f = open(args.input)
data = f.readlines()
f.close()
wf = open(args.output, 'w')
wf.write(args.dest_path+"\n")
count = len(data)
for line in data:
wav_name = line.strip()
wav_file = os.path.join(args.wav_path, wav_name)
base_wav_name = os.path.splitext(wav_name)[0]
output_file = os.path.join(args.dest_path, base_wav_name+".wav")
if os.path.exists(wav_file) and not os.path.exists(output_file):
sound = AudioSegment.from_mp3(wav_file)
sound.export(os.path.join(args.dest_path, 'tmp.wav'), format='wav')
y, sr = librosa.load(os.path.join(args.dest_path, 'tmp.wav'), sr=16000)
sf.write(output_file, y, sr)
infos = sf.info(output_file)
frames = infos.frames
sr = infos.samplerate
wf.write("{}\t{}\t{}\n".format(base_wav_name+".wav", frames, sr))
count += 1
if count % 100 == 0:
print('process {} done'.format(count))
wf.close()
if __name__ == "__main__":
parser = get_parser()
args = parser.parse_args()
main(args)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment