Commit 719799a2 authored by lidc

Add test code for the PyTorch audio-processing models FastSpeech and ECAPA-TDNN

parent 13a50bfe
'''
This part is used to train the speaker model and evaluate the performances
'''
import torch, sys, os, tqdm, numpy, soundfile, time, pickle
import torch.nn as nn
import torch.nn.functional as F  # F.normalize is used in eval_network
from tools import *
from loss import AAMsoftmax
from model import ECAPA_TDNN
from tqdm import tqdm
class ECAPAModel(nn.Module):
def __init__(self, lr, lr_decay, C , n_class, m, s, test_step, **kwargs):
super(ECAPAModel, self).__init__()
## ECAPA-TDNN
self.speaker_encoder = ECAPA_TDNN(C = C).cuda()
## Classifier
self.speaker_loss = AAMsoftmax(n_class = n_class, m = m, s = s).cuda()
self.optim = torch.optim.Adam(self.parameters(), lr = lr, weight_decay = 2e-5)
self.scheduler = torch.optim.lr_scheduler.StepLR(self.optim, step_size = test_step, gamma=lr_decay)
print(time.strftime("%m-%d %H:%M:%S") + " Model para number = %.2f"%(sum(param.numel() for param in self.speaker_encoder.parameters()) / 1024 / 1024))
def train_network(self, epoch, loader):
self.train()
## Update the learning rate based on the current epoch
self.scheduler.step(epoch - 1)
index, top1, loss = 0, 0, 0
lr = self.optim.param_groups[0]['lr']
for num, (data, labels) in tqdm(enumerate(loader, start = 1),total=len(loader)):
self.zero_grad()
labels = torch.LongTensor(labels).cuda()
speaker_embedding = self.speaker_encoder.forward(data.cuda(), aug = True)
nloss, prec = self.speaker_loss.forward(speaker_embedding, labels)
nloss.backward()
self.optim.step()
index += len(labels)
top1 += prec
loss += nloss.detach().cpu().numpy()
sys.stderr.write(time.strftime("%m-%d %H:%M:%S") + \
" [%2d] Lr: %5f, Training: %.2f%%, " %(epoch, lr, 100 * (num / loader.__len__())) + \
" Loss: %.5f, ACC: %2.2f%% \r" %(loss/(num), top1/index*len(labels)))
sys.stderr.flush()
sys.stdout.write("\n")
return loss/num, lr, top1/index*len(labels)
def eval_network(self, eval_list, eval_path):
self.eval()
files = []
embeddings = {}
lines = open(eval_list).read().splitlines()
for line in lines:
files.append(line.split()[1])
files.append(line.split()[2])
setfiles = list(set(files))
setfiles.sort()
#print(files)
#print(eval_list)
# print(setfiles)
for idx, file in tqdm(enumerate(setfiles), total = len(setfiles)):
# print('begin for and tqdm')
audio, _ = soundfile.read(os.path.join(eval_path, file))
# print(file)
# print(audio.shape)
# print('read end&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
# Full utterance
data_1 = torch.FloatTensor(numpy.stack([audio],axis=0)).cuda()
# print('data_1 end')
# Split utterance matrix: five fixed-length crops of 300 frames * 160-sample hop + 240 samples (about 3 s at 16 kHz)
max_audio = 300 * 160 + 240
if audio.shape[0] <= max_audio:
shortage = max_audio - audio.shape[0]
audio = numpy.pad(audio, (0, shortage), 'wrap')
feats = []
startframe = numpy.linspace(0, audio.shape[0]-max_audio, num=5)
for asf in startframe:
feats.append(audio[int(asf):int(asf)+max_audio])
feats = numpy.stack(feats, axis = 0).astype(float)  # numpy.float alias was removed in newer NumPy
data_2 = torch.FloatTensor(feats).cuda()
# print('begin eval********************')
# Speaker embeddings
with torch.no_grad():
embedding_1 = self.speaker_encoder.forward(data_1, aug = False)
# print('forward1 end')
embedding_1 = F.normalize(embedding_1, p=2, dim=1)
# print('normalize end')
embedding_2 = self.speaker_encoder.forward(data_2, aug = False)
# print('forward2 end')
embedding_2 = F.normalize(embedding_2, p=2, dim=1)
# print('normalize2 end')
embeddings[file] = [embedding_1, embedding_2]
# print('embeddings end')
#print(num7 + 1)
#num7=num7 + 1
scores, labels = [], []
for line in lines:
embedding_11, embedding_12 = embeddings[line.split()[1]]
embedding_21, embedding_22 = embeddings[line.split()[2]]
# Compute the scores
score_1 = torch.mean(torch.matmul(embedding_11, embedding_21.T)) # higher is positive
score_2 = torch.mean(torch.matmul(embedding_12, embedding_22.T))
score = (score_1 + score_2) / 2
score = score.detach().cpu().numpy()
scores.append(score)
labels.append(int(line.split()[0]))
# Compute EER and minDCF
EER = tuneThresholdfromScore(scores, labels, [1, 0.1])[1]
fnrs, fprs, thresholds = ComputeErrorRates(scores, labels)
minDCF, _ = ComputeMinDcf(fnrs, fprs, thresholds, 0.05, 1, 1)
return EER, minDCF
def save_parameters(self, path):
torch.save(self.state_dict(), path)
def load_parameters(self, path):
self_state = self.state_dict()
loaded_state = torch.load(path)
for name, param in loaded_state.items():
origname = name
if name not in self_state:
name = name.replace("module.", "")
if name not in self_state:
print("%s is not in the model."%origname)
continue
if self_state[name].size() != loaded_state[origname].size():
print("Wrong parameter length: %s, model: %s, loaded: %s"%(origname, self_state[name].size(), loaded_state[origname].size()))
continue
self_state[name].copy_(param)
MIT License
Copyright (c) 2022 Tao Ruijie
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# ECAPA-TDNN
This directory contains an unofficial re-implementation of ECAPA-TDNN for speaker recognition on the VoxCeleb2 dataset.
The code is adapted from [voxceleb_trainer](https://github.com/clovaai/voxceleb_trainer).
For details of the system architecture, see [Chapter 3, ECAPA-SYSTEM](https://arxiv.org/pdf/2111.06671.pdf).
## Environment and dependencies
The code was developed with PyTorch 1.10 on DTK 22.04.1, using Python 3.7.
### Setting up the environment
```
conda create --name ECAPA-TDNN python=3.7
conda activate ECAPA-TDNN
```
### Installing dependencies
Install the torch packages built for DTK 22.04.1:
```
pip3 install torch-1.10.0a0+git450cdd1.dtk22.4-cp37-cp37m-linux_x86_64.whl
pip3 install torchvision-0.10.0a0_dtk22.04_e17f5ea-cp37-cp37m-linux_x86_64.whl
pip3 install torchaudio-0.10.0-cp37-cp37m-linux_x86_64.whl
```
Install the remaining dependencies:
```
pip3 install numpy
pip3 install scipy
pip3 install scikit-learn
pip3 install tqdm
pip3 install torchvision
pip3 install soundfile
```
## Data preparation
### Data lists and pretrained model
Download ECAPA-TDNN_data.zip from the link below and extract it into the project root. After extraction, the root directory should contain the exps folder and the files train_cut.txt, train_list.txt and veri_test2.txt.
Link: https://pan.baidu.com/s/11TWAUSwROOjvKOcx36bqFQ
Access code: uaia
### Downloading the datasets
**VoxCeleb1**
**VoxCeleb2**
(VoxCeleb1:
Link: https://pan.baidu.com/s/1iASZ01mUny7udnjChJIbOg
Access code: k7v0
VoxCeleb2:
Link: https://pan.baidu.com/s/1tBbdq2tm5KX7znM89o9LHg
Access code: 6hzs)
**MUSAN** (https://www.openslr.org/resources/17/musan.tar.gz)
**RIR** (https://openslr.org/resources/28/rirs_noises.zip)
### Preprocessing
Two datasets need preprocessing: VoxCeleb2 and MUSAN. The VoxCeleb2 audio must be converted from m4a to wav, and MUSAN must be split into musan_split, i.e. chopped into short segments that can be read randomly much faster. See the "Data preparation" section of [voxceleb_trainer](https://github.com/clovaai/voxceleb_trainer) for details.
#### VoxCeleb2
The VoxCeleb2 audio has to be converted, which requires ffmpeg to be installed in advance; otherwise the conversion fails. Run the following command (note: the conversion takes at least 24 hours):
```
python ./dataprep.py --save_path data --convert
```
#### MUSAN
For the MUSAN dataset, call the split_musan function in dataprep.py manually to perform the split, as sketched below.
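A minimal sketch of that call, assuming dataprep.py follows the voxceleb_trainer layout in which split_musan(args) reads the extracted files under args.save_path/musan and writes the split segments to args.save_path/musan_split:
```
# sketch only: dataprep.py is assumed to match the voxceleb_trainer layout
from argparse import Namespace
from dataprep import split_musan

# save_path is the directory that contains the extracted musan/ folder
split_musan(Namespace(save_path='data'))
```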
## Training
Create a train_single.sh script:
```
export HIP_VISIBLE_DEVICES=0
python3 trainECAPAModel.py --save_path exps/exp1
```
Run the script to start training:
```
bash train_single.sh
```
![图片1](./imgs/图片1.png)
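After training, a saved checkpoint can be scored on the VoxCeleb1 trial list with the --eval flag of trainECAPAModel.py; the checkpoint name and data paths below are placeholders:
```
python3 trainECAPAModel.py --eval \
    --initial_model exps/exp1/model/model_0080.model \
    --eval_list veri_test2.txt \
    --eval_path /path/to/voxceleb1/wav
```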
## References
https://github.com/TaoRuijie/ECAPA-TDNN
\ No newline at end of file
'''
DataLoader for training
'''
import glob, numpy, os, random, soundfile, torch
from scipy import signal
import tqdm
class train_loader(object):
def __init__(self, train_list, train_path, musan_path, rir_path, num_frames, **kwargs):
self.train_path = train_path
self.num_frames = num_frames
# Load and configure augmentation files
self.noisetypes = ['noise','speech','music']
self.noisesnr = {'noise':[0,15],'speech':[13,20],'music':[5,15]}
self.numnoise = {'noise':[1,1], 'speech':[3,8], 'music':[1,1]}
self.noiselist = {}
augment_files = glob.glob(os.path.join(musan_path,'*/*/*/*.wav'))
for file in augment_files:
if file.split('/')[-4] not in self.noiselist:
self.noiselist[file.split('/')[-4]] = []
self.noiselist[file.split('/')[-4]].append(file)
self.rir_files = glob.glob(os.path.join(rir_path,'*/*/*.wav'))
#print(self.rir_files)
# Load data & labels
self.data_list = []
self.data_label = []
lines = open(train_list).read().splitlines()
dictkeys = list(set([x.split()[0] for x in lines]))
dictkeys.sort()
dictkeys = { key : ii for ii, key in enumerate(dictkeys) }
for index, line in enumerate(lines):
speaker_label = dictkeys[line.split()[0]]
# print('speaker_label:',speaker_label)
file_name = os.path.join(train_path, line.split()[1])
self.data_label.append(speaker_label)
self.data_list.append(file_name)
#print('data_label:',self.data_label)
def __getitem__(self, index):
# Read the utterance and randomly select the segment
audio, sr = soundfile.read(self.data_list[index])
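# Segment length in samples: num_frames hops of 160 samples plus 240 extra (200 frames is roughly 2 s at 16 kHz)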
length = self.num_frames * 160 + 240
if audio.shape[0] <= length:
shortage = length - audio.shape[0]
audio = numpy.pad(audio, (0, shortage), 'wrap')
start_frame = numpy.int64(random.random()*(audio.shape[0]-length))
audio = audio[start_frame:start_frame + length]
audio = numpy.stack([audio],axis=0)
# Data Augmentation
augtype = random.randint(0,5)
if augtype == 0: # Original
audio = audio
elif augtype == 1: # Reverberation
audio = self.add_rev(audio)
elif augtype == 2: # Babble
audio = self.add_noise(audio, 'speech')
elif augtype == 3: # Music
audio = self.add_noise(audio, 'music')
elif augtype == 4: # Noise
audio = self.add_noise(audio, 'noise')
elif augtype == 5: # Television noise
audio = self.add_noise(audio, 'speech')
audio = self.add_noise(audio, 'music')
return torch.FloatTensor(audio[0]), self.data_label[index]
def __len__(self):
return len(self.data_list)
def add_rev(self, audio):
#print('rir_files:',self.rir_files)
rir_file = random.choice(self.rir_files)
rir, sr = soundfile.read(rir_file)
rir = numpy.expand_dims(rir.astype(float),0)
rir = rir / numpy.sqrt(numpy.sum(rir**2))
return signal.convolve(audio, rir, mode='full')[:,:self.num_frames * 160 + 240]
def add_noise(self, audio, noisecat):
clean_db = 10 * numpy.log10(numpy.mean(audio ** 2)+1e-4)
numnoise = self.numnoise[noisecat]
noiselist = random.sample(self.noiselist[noisecat], random.randint(numnoise[0],numnoise[1]))
noises = []
for noise in noiselist:
noiseaudio, sr = soundfile.read(noise)
length = self.num_frames * 160 + 240
if noiseaudio.shape[0] <= length:
shortage = length - noiseaudio.shape[0]
noiseaudio = numpy.pad(noiseaudio, (0, shortage), 'wrap')
start_frame = numpy.int64(random.random()*(noiseaudio.shape[0]-length))
noiseaudio = noiseaudio[start_frame:start_frame + length]
noiseaudio = numpy.stack([noiseaudio],axis=0)
noise_db = 10 * numpy.log10(numpy.mean(noiseaudio ** 2)+1e-4)
noisesnr = random.uniform(self.noisesnr[noisecat][0],self.noisesnr[noisecat][1])
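# Scale the noise so that the clean-to-noise power ratio matches the sampled SNR before mixing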
noises.append(numpy.sqrt(10 ** ((clean_db - noise_db - noisesnr) / 10)) * noiseaudio)
noise = numpy.sum(numpy.concatenate(noises,axis=0),axis=0,keepdims=True)
return noise + audio
'''
AAMsoftmax loss function copied from voxceleb_trainer: https://github.com/clovaai/voxceleb_trainer/blob/master/loss/aamsoftmax.py
'''
import torch, math
import torch.nn as nn
import torch.nn.functional as F
from tools import *
class AAMsoftmax(nn.Module):
def __init__(self, n_class, m, s):
super(AAMsoftmax, self).__init__()
self.m = m
self.s = s
self.weight = torch.nn.Parameter(torch.FloatTensor(n_class, 192), requires_grad=True)
self.ce = nn.CrossEntropyLoss()
nn.init.xavier_normal_(self.weight, gain=1)
self.cos_m = math.cos(self.m)
self.sin_m = math.sin(self.m)
self.th = math.cos(math.pi - self.m)
self.mm = math.sin(math.pi - self.m) * self.m
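# Precomputed constants for the additive angular margin:
# phi = cos(theta + m) = cos(theta)*cos(m) - sin(theta)*sin(m);
# th and mm give the linear fallback applied when theta + m would exceed pi.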
def forward(self, x, label=None):
cosine = F.linear(F.normalize(x), F.normalize(self.weight))
sine = torch.sqrt((1.0 - torch.mul(cosine, cosine)).clamp(0, 1))
phi = cosine * self.cos_m - sine * self.sin_m
phi = torch.where((cosine - self.th) > 0, phi, cosine - self.mm)
one_hot = torch.zeros_like(cosine)
one_hot.scatter_(1, label.view(-1, 1), 1)
output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
output = output * self.s
loss = self.ce(output, label)
prec1 = accuracy(output.detach(), label.detach(), topk=(1,))[0]
return loss, prec1
\ No newline at end of file
'''
This is the ECAPA-TDNN model.
This model is modified and combined based on the following three projects:
1. https://github.com/clovaai/voxceleb_trainer/issues/86
2. https://github.com/lawlict/ECAPA-TDNN/blob/master/ecapa_tdnn.py
3. https://github.com/speechbrain/speechbrain/blob/96077e9a1afff89d3f5ff47cab4bca0202770e4f/speechbrain/lobes/models/ECAPA_TDNN.py
'''
import math, torch, torchaudio
import torch.nn as nn
import torch.nn.functional as F
class SEModule(nn.Module):
def __init__(self, channels, bottleneck=128):
super(SEModule, self).__init__()
self.se = nn.Sequential(
nn.AdaptiveAvgPool1d(1),
nn.Conv1d(channels, bottleneck, kernel_size=1, padding=0),
nn.ReLU(),
# nn.BatchNorm1d(bottleneck), # I remove this layer
nn.Conv1d(bottleneck, channels, kernel_size=1, padding=0),
nn.Sigmoid(),
)
def forward(self, input):
x = self.se(input)
return input * x
class Bottle2neck(nn.Module):
def __init__(self, inplanes, planes, kernel_size=None, dilation=None, scale = 8):
super(Bottle2neck, self).__init__()
width = int(math.floor(planes / scale))
self.conv1 = nn.Conv1d(inplanes, width*scale, kernel_size=1)
self.bn1 = nn.BatchNorm1d(width*scale)
self.nums = scale -1
convs = []
bns = []
num_pad = math.floor(kernel_size/2)*dilation
for i in range(self.nums):
convs.append(nn.Conv1d(width, width, kernel_size=kernel_size, dilation=dilation, padding=num_pad))
bns.append(nn.BatchNorm1d(width))
self.convs = nn.ModuleList(convs)
self.bns = nn.ModuleList(bns)
self.conv3 = nn.Conv1d(width*scale, planes, kernel_size=1)
self.bn3 = nn.BatchNorm1d(planes)
self.relu = nn.ReLU()
self.width = width
self.se = SEModule(planes)
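# Res2Net-style forward: the channels are split into `scale` groups; each group after the first is summed with the previous group's convolved output before its own convolution, the last group passes through unchanged, and the concatenated result goes through a 1x1 conv, SE re-weighting and a residual connection.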
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.relu(out)
out = self.bn1(out)
spx = torch.split(out, self.width, 1)
for i in range(self.nums):
if i==0:
sp = spx[i]
else:
sp = sp + spx[i]
sp = self.convs[i](sp)
sp = self.relu(sp)
sp = self.bns[i](sp)
if i==0:
out = sp
else:
out = torch.cat((out, sp), 1)
out = torch.cat((out, spx[self.nums]),1)
out = self.conv3(out)
out = self.relu(out)
out = self.bn3(out)
out = self.se(out)
out += residual
return out
class PreEmphasis(torch.nn.Module):
def __init__(self, coef: float = 0.97):
super().__init__()
self.coef = coef
self.register_buffer(
'flipped_filter', torch.FloatTensor([-self.coef, 1.]).unsqueeze(0).unsqueeze(0)
)
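# Pre-emphasis filter y[n] = x[n] - coef * x[n-1], implemented as a 1-D convolution with the flipped kernel [-coef, 1] after reflect-padding one sample on the left.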
def forward(self, input: torch.Tensor) -> torch.Tensor:
input = input.unsqueeze(1)
input = F.pad(input, (1, 0), 'reflect')
return F.conv1d(input, self.flipped_filter).squeeze(1)
class FbankAug(nn.Module):
def __init__(self, freq_mask_width = (0, 8), time_mask_width = (0, 10)):
self.time_mask_width = time_mask_width
self.freq_mask_width = freq_mask_width
super().__init__()
def mask_along_axis(self, x, dim):
original_size = x.shape
batch, fea, time = x.shape
if dim == 1:
D = fea
width_range = self.freq_mask_width
else:
D = time
width_range = self.time_mask_width
mask_len = torch.randint(width_range[0], width_range[1], (batch, 1), device=x.device).unsqueeze(2)
mask_pos = torch.randint(0, max(1, D - mask_len.max()), (batch, 1), device=x.device).unsqueeze(2)
arange = torch.arange(D, device=x.device).view(1, 1, -1)
mask = (mask_pos <= arange) * (arange < (mask_pos + mask_len))
mask = mask.any(dim=1)
if dim == 1:
mask = mask.unsqueeze(2)
else:
mask = mask.unsqueeze(1)
x = x.masked_fill_(mask, 0.0)
return x.view(*original_size)
def forward(self, x):
x = self.mask_along_axis(x, dim=2)
x = self.mask_along_axis(x, dim=1)
return x
class ECAPA_TDNN(nn.Module):
def __init__(self, C):
super(ECAPA_TDNN, self).__init__()
self.torchfbank = torch.nn.Sequential(
PreEmphasis(),
torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, \
f_min = 20, f_max = 7600, window_fn=torch.hamming_window, n_mels=80),
)
self.specaug = FbankAug() # Spec augmentation
self.conv1 = nn.Conv1d(80, C, kernel_size=5, stride=1, padding=2)
self.relu = nn.ReLU()
self.bn1 = nn.BatchNorm1d(C)
self.layer1 = Bottle2neck(C, C, kernel_size=3, dilation=2, scale=8)
self.layer2 = Bottle2neck(C, C, kernel_size=3, dilation=3, scale=8)
self.layer3 = Bottle2neck(C, C, kernel_size=3, dilation=4, scale=8)
# I fixed the shape of the output from MFA layer, that is close to the setting from ECAPA paper.
self.layer4 = nn.Conv1d(3*C, 1536, kernel_size=1)
self.attention = nn.Sequential(
nn.Conv1d(4608, 256, kernel_size=1),
nn.ReLU(),
nn.BatchNorm1d(256),
nn.Tanh(), # I add this layer
nn.Conv1d(256, 1536, kernel_size=1),
nn.Softmax(dim=2),
)
self.bn5 = nn.BatchNorm1d(3072)
self.fc6 = nn.Linear(3072, 192)
self.bn6 = nn.BatchNorm1d(192)
def forward(self, x, aug):
with torch.no_grad():
x = self.torchfbank(x)+1e-6
x = x.log()
x = x - torch.mean(x, dim=-1, keepdim=True)
if aug == True:
x = self.specaug(x)
x = self.conv1(x)
x = self.relu(x)
x = self.bn1(x)
x1 = self.layer1(x)
x2 = self.layer2(x+x1)
x3 = self.layer3(x+x1+x2)
x = self.layer4(torch.cat((x1,x2,x3),dim=1))
x = self.relu(x)
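# Attentive statistics pooling: append the utterance-level mean and std of the frame features as global context, predict attention weights w over time, then pool to the weighted mean mu and weighted std sg.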
t = x.size()[-1]
global_x = torch.cat((x,torch.mean(x,dim=2,keepdim=True).repeat(1,1,t), torch.sqrt(torch.var(x,dim=2,keepdim=True).clamp(min=1e-4)).repeat(1,1,t)), dim=1)
w = self.attention(global_x)
mu = torch.sum(x * w, dim=2)
sg = torch.sqrt( ( torch.sum((x**2) * w, dim=2) - mu**2 ).clamp(min=1e-4) )
x = torch.cat((mu,sg),1)
x = self.bn5(x)
x = self.fc6(x)
x = self.bn6(x)
return x
\ No newline at end of file
numpy
scipy
scikit-learn
tqdm
soundfile
\ No newline at end of file
'''
Some utilized functions
These functions are all copied from voxceleb_trainer: https://github.com/clovaai/voxceleb_trainer/blob/master/tuneThreshold.py
'''
import os, numpy, torch
from sklearn import metrics
from operator import itemgetter
import torch.nn.functional as F
def init_args(args):
args.score_save_path = os.path.join(args.save_path, 'score.txt')
args.model_save_path = os.path.join(args.save_path, 'model')
os.makedirs(args.model_save_path, exist_ok = True)
return args
def tuneThresholdfromScore(scores, labels, target_fa, target_fr = None):
fpr, tpr, thresholds = metrics.roc_curve(labels, scores, pos_label=1)
fnr = 1 - tpr
tunedThreshold = [];
if target_fr:
for tfr in target_fr:
idx = numpy.nanargmin(numpy.absolute((tfr - fnr)))
tunedThreshold.append([thresholds[idx], fpr[idx], fnr[idx]])
for tfa in target_fa:
idx = numpy.nanargmin(numpy.absolute((tfa - fpr))) # numpy.where(fpr<=tfa)[0][-1]
tunedThreshold.append([thresholds[idx], fpr[idx], fnr[idx]])
idxE = numpy.nanargmin(numpy.absolute((fnr - fpr)))
eer = max(fpr[idxE],fnr[idxE])*100
return tunedThreshold, eer, fpr, fnr
# Creates a list of false-negative rates, a list of false-positive rates
# and a list of decision thresholds that give those error-rates.
def ComputeErrorRates(scores, labels):
# Sort the scores from smallest to largest, and also get the corresponding
# indexes of the sorted scores. We will treat the sorted scores as the
# thresholds at which the error-rates are evaluated.
sorted_indexes, thresholds = zip(*sorted(
[(index, threshold) for index, threshold in enumerate(scores)],
key=itemgetter(1)))
sorted_labels = []
labels = [labels[i] for i in sorted_indexes]
fnrs = []
fprs = []
# At the end of this loop, fnrs[i] is the number of errors made by
# incorrectly rejecting scores less than thresholds[i]. And, fprs[i]
# is the total number of times that we have correctly accepted scores
# greater than thresholds[i].
for i in range(0, len(labels)):
if i == 0:
fnrs.append(labels[i])
fprs.append(1 - labels[i])
else:
fnrs.append(fnrs[i-1] + labels[i])
fprs.append(fprs[i-1] + 1 - labels[i])
fnrs_norm = sum(labels)
fprs_norm = len(labels) - fnrs_norm
# Now divide by the total number of false negative errors to
# obtain the false positive rates across all thresholds
fnrs = [x / float(fnrs_norm) for x in fnrs]
# Divide by the total number of correct positives to get the
# true positive rate. Subtract these quantities from 1 to
# get the false positive rates.
fprs = [1 - x / float(fprs_norm) for x in fprs]
return fnrs, fprs, thresholds
# Computes the minimum of the detection cost function. The comments refer to
# equations in Section 3 of the NIST 2016 Speaker Recognition Evaluation Plan.
def ComputeMinDcf(fnrs, fprs, thresholds, p_target, c_miss, c_fa):
min_c_det = float("inf")
min_c_det_threshold = thresholds[0]
for i in range(0, len(fnrs)):
# See Equation (2). it is a weighted sum of false negative
# and false positive errors.
c_det = c_miss * fnrs[i] * p_target + c_fa * fprs[i] * (1 - p_target)
if c_det < min_c_det:
min_c_det = c_det
min_c_det_threshold = thresholds[i]
# See Equations (3) and (4). Now we normalize the cost.
c_def = min(c_miss * p_target, c_fa * (1 - p_target))
min_dcf = min_c_det / c_def
return min_dcf, min_c_det_threshold
def accuracy(output, target, topk=(1,)):
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = []
for k in topk:
correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
res.append(correct_k.mul_(100.0 / batch_size))
return res
\ No newline at end of file
'''
This is the main code of the ECAPATDNN project, to define the parameters and build the construction
'''
import argparse, glob, os, torch, warnings, time
from tools import *
from dataLoader import train_loader
from ECAPAModel import ECAPAModel
import tqdm
parser = argparse.ArgumentParser(description = "ECAPA_trainer")
## Training Settings
parser.add_argument('--num_frames', type=int, default=200, help='Duration of the input segments in frames, e.g. 200 for a 2-second segment')
parser.add_argument('--max_epoch', type=int, default=80, help='Maximum number of epochs')
parser.add_argument('--batch_size', type=int, default=400, help='Batch size')
parser.add_argument('--n_cpu', type=int, default=0, help='Number of loader threads')
parser.add_argument('--test_step', type=int, default=1, help='Test and save every [test_step] epochs')
parser.add_argument('--lr', type=float, default=0.001, help='Learning rate')
parser.add_argument("--lr_decay", type=float, default=0.97, help='Learning rate decay every [test_step] epochs')
## Training and evaluation path/lists, save path
parser.add_argument('--train_list', type=str, default="/datasets/projects/ECAPA-TDNN-main/train_cut.txt", help='The path of the training list, https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/train_list.txt')
parser.add_argument('--train_path', type=str, default="/datasets/datasets/voxceleb2/", help='The path of the training data, eg:"/data08/VoxCeleb2/train/wav" in my case')
parser.add_argument('--eval_list', type=str, default="/datasets/projects/ECAPA-TDNN-main/veri_test2.txt", help='The path of the evaluation list, veri_test2.txt comes from https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt')
parser.add_argument('--eval_path', type=str, default="/datasets/datasets/voxceleb1/wav/", help='The path of the evaluation data, eg:"/data08/VoxCeleb1/test/wav" in my case')
parser.add_argument('--musan_path', type=str, default="/datasets/datasets/musan_split/", help='The path to the MUSAN set, eg:"/data08/Others/musan_split" in my case')
parser.add_argument('--rir_path', type=str, default="/datasets/datasets/RIRS_NOISES/simulated_rirs/", help='The path to the RIR set, eg:"/data08/Others/RIRS_NOISES/simulated_rirs" in my case');
parser.add_argument('--save_path', type=str, default="exps/exp1", help='Path to save the score.txt and models')
parser.add_argument('--initial_model', type=str, default="", help='Path of the initial_model')
## Model and Loss settings
parser.add_argument('--C', type=int, default=1024, help='Channel size for the speaker encoder')
parser.add_argument('--m', type=float, default=0.2, help='Loss margin in AAM softmax')
parser.add_argument('--s', type=float, default=30, help='Loss scale in AAM softmax')
parser.add_argument('--n_class', type=int, default=5994, help='Number of speakers')
## Command
parser.add_argument('--eval', dest='eval', action='store_true', help='Only do evaluation')
## Initialization
warnings.simplefilter("ignore")
torch.multiprocessing.set_sharing_strategy('file_system')
args = parser.parse_args()
args = init_args(args)
## Define the data loader
trainloader = train_loader(**vars(args))
trainLoader = torch.utils.data.DataLoader(trainloader, batch_size = args.batch_size, shuffle = True, num_workers = args.n_cpu, drop_last = True)
#trainLoader_num = tqdm.tqdm(enumerate(trainLoader),total=len(trainLoader))
## Search for existing models
modelfiles = glob.glob('%s/model_0*.model'%args.model_save_path)
modelfiles.sort()
## Evaluation only; an initial_model is required
if args.eval == True:
s = ECAPAModel(**vars(args))
print("Model %s loaded from previous state!"%args.initial_model)
s.load_parameters(args.initial_model)
EER, minDCF = s.eval_network(eval_list = args.eval_list, eval_path = args.eval_path)
print("EER %2.2f%%, minDCF %.4f%%"%(EER, minDCF))
quit()
## If an initial_model exists, the system will train from the initial_model
if args.initial_model != "":
print("Model %s loaded from previous state!"%args.initial_model)
s = ECAPAModel(**vars(args))
s.load_parameters(args.initial_model)
epoch = 1
## Otherwise, the system will try to resume from the most recently saved model and epoch
elif len(modelfiles) >= 1:
print("Model %s loaded from previous state!"%modelfiles[-1])
epoch = int(os.path.splitext(os.path.basename(modelfiles[-1]))[0][6:]) + 1
s = ECAPAModel(**vars(args))
s.load_parameters(modelfiles[-1])
## Otherwise, the system will train from scratch
else:
epoch = 1
s = ECAPAModel(**vars(args))
EERs = []
score_file = open(args.score_save_path, "a+")
while(1):
## Training for one epoch
loss, lr, acc = s.train_network(epoch = epoch, loader = trainLoader)
## Evaluation every [test_step] epochs
if epoch % args.test_step == 0:
s.save_parameters(args.model_save_path + "/model_%04d.model"%epoch)
EERs.append(s.eval_network(eval_list = args.eval_list, eval_path = args.eval_path)[0])
print(time.strftime("%Y-%m-%d %H:%M:%S"), "%d epoch, ACC %2.2f%%, EER %2.2f%%, bestEER %2.2f%%"%(epoch, acc, EERs[-1], min(EERs)))
score_file.write("%d epoch, LR %f, LOSS %f, ACC %2.2f%%, EER %2.2f%%, bestEER %2.2f%%\n"%(epoch, lr, loss, acc, EERs[-1], min(EERs)))
score_file.flush()
if epoch >= args.max_epoch:
quit()
epoch += 1
#!/bin/bash
export HIP_VISIBLE_DEVICES=0
python3 trainECAPAModel.py --save_path exps/exp1
MIT License
Copyright (c) 2020 Chung-Ming Chien
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# FastSpeech 2 - Fast and High-Quality End-to-End Text to Speech
This is a PyTorch implementation of Microsoft's text-to-speech system [**FastSpeech 2: Fast and High-Quality End-to-End Text to Speech**](https://arxiv.org/abs/2006.04558v1). The project is based on [FastSpeech](https://github.com/xcmyz/FastSpeech).
There are several versions of FastSpeech 2; this implementation is closer to [version 1](https://arxiv.org/abs/2006.04558v1), which uses F0 values as the pitch feature.
[Later versions](https://arxiv.org/abs/2006.04558), by contrast, use a pitch spectrogram extracted with the continuous wavelet transform as the pitch feature.
![](./img/model.png)
## Environment and dependencies
### Creating the environment
```
# create the conda virtual environment
conda create --name FastSpeech2 python=3.7
# check that the environment was created
conda env list
# activate the environment
conda activate FastSpeech2
```
### Installing dependencies
```
pip3 install torch-1.10.0a0+git450cdd1.dtk22.4-cp37-cp37m-linux_x86_64.whl
pip3 install torchvision-0.10.0a0_dtk22.04_300a8a4-cp37-cp37m-linux_x86_64.whl
pip3 install -r requirements.txt
```
## Data preprocessing
The dataset used is [LJSpeech](https://keithito.com/LJ-Speech-Dataset/): a single-speaker English dataset of 13,100 short audio clips of a female speaker reading passages from 7 non-fiction books, about 24 hours in total.
```
wget https://drive.google.com/drive/folders/1DBRkALpPd6FL9gjHMmMEdHODmkgNIIK4?usp=sharing
```
This model uses the Montreal Forced Aligner (MFA) to obtain alignments between the speech and the phoneme sequences.
Alignments for the supported datasets are provided [here](https://drive.google.com/drive/folders/1DBRkALpPd6FL9gjHMmMEdHODmkgNIIK4?usp=sharing).
Download preprocessed_data.zip and pretrain_model.zip from the link below, extract preprocessed_data.zip into the project root, and extract pretrain_model.zip into the hifigan folder. Note that the two pretrained models inside pretrain_model.zip, generator_LJSpeech and generator_universal, must themselves be extracted again; otherwise training fails with a file-not-found error.
Link: https://pan.baidu.com/s/1kDAAyXYClgS8U-703DA-nA
Access code: ujn6
Then run the following commands to preprocess the LJSpeech dataset:
```
python3 prepare_align.py config/LJSpeech/preprocess.yaml
python3 preprocess.py config/LJSpeech/preprocess.yaml
```
Note that the paths in preprocess.yaml must be adjusted to where the dataset was downloaded on the current machine. As shown in the screenshot below, only corpus_path needs to point to the downloaded dataset, and raw_path must point to an existing empty folder, otherwise an error occurs; a reference excerpt of the file follows the screenshot.
![pretrain](./img/pretrain.png)
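For reference, the path section of config/LJSpeech/preprocess.yaml looks roughly like the excerpt below; the values shown are illustrative and the remaining keys are left as shipped:
```
path:
  corpus_path: "/path/to/LJSpeech-1.1"   # where the downloaded dataset lives
  raw_path: "./raw_data/LJSpeech"        # must point to an existing (empty) folder
```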
## Training
Single-GPU training
Create the following script and run it to train the model:
```
train_single.sh:
export HIP_VISIBLE_DEVICES=0
python3 train.py -p config/LJSpeech/preprocess.yaml -m config/LJSpeech/model.yaml -t config/LJSpeech/train.yaml
```
```
bash train_single.sh
```
Multi-GPU training
Create the following script and run it to train the model:
```
train_ddp.sh:
export HIP_VISIBLE_DEVICES=0,1,2,3
export NGPUS=4
python3 -m torch.distributed.launch --nproc_per_node ${NGPUS} train_ddp.py -p config/LJSpeech/preprocess.yaml -m config/LJSpeech/model.yaml -t config/LJSpeech/train.yaml
```
After training, the logs and models are saved in the output folder. The logs can be visualized with TensorBoard to inspect the loss curves, mel-spectrograms and synthesized audio.
The following command starts TensorBoard for monitoring the losses and listening to the audio samples:
```
tensorboard --logdir output/log/LJSpeech
```
![](./img/tensorboard_loss.png)
![](./img/tensorboard_spec.png)
![](./img/tensorboard_audio.png)
## Performance
![图片2](./img/图片2.png)
## FAQ
**a. AttributeError: module 'distutils' has no attribute 'version'**
This is caused by the setuptools version; downgrading it resolves the issue:
```
pip3 uninstall setuptools
pip3 install setuptools==59.5.0
```
**b. If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.**
Pinning protobuf to 3.20.x resolves the issue:
```
pip3 install --upgrade protobuf==3.20.1
```
**c. RuntimeError: Numpy is not available**
Reinstalling numpy resolves the issue:
```
pip3 uninstall numpy
pip3 install numpy
```
## References
https://github.com/ming024/FastSpeech2
\ No newline at end of file
import audio.tools
import audio.stft
import audio.audio_processing