Commit 719799a2 authored by lidc

Add test code for the PyTorch audio-processing models FastSpeech and ECAPA-TDNN

parent 13a50bfe
'''
This part is used to train the speaker model and evaluate the performances
'''
import torch, sys, os, tqdm, numpy, soundfile, time, pickle
import torch.nn as nn
import torch.nn.functional as F  # F.normalize is used in eval_network
from tools import *
from loss import AAMsoftmax
from model import ECAPA_TDNN
from tqdm import tqdm
class ECAPAModel(nn.Module):
def __init__(self, lr, lr_decay, C , n_class, m, s, test_step, **kwargs):
super(ECAPAModel, self).__init__()
## ECAPA-TDNN
self.speaker_encoder = ECAPA_TDNN(C = C).cuda()
## Classifier
self.speaker_loss = AAMsoftmax(n_class = n_class, m = m, s = s).cuda()
self.optim = torch.optim.Adam(self.parameters(), lr = lr, weight_decay = 2e-5)
self.scheduler = torch.optim.lr_scheduler.StepLR(self.optim, step_size = test_step, gamma=lr_decay)
print(time.strftime("%m-%d %H:%M:%S") + " Model para number = %.2f"%(sum(param.numel() for param in self.speaker_encoder.parameters()) / 1024 / 1024))
def train_network(self, epoch, loader):
self.train()
## Update the learning rate based on the current epoch
self.scheduler.step(epoch - 1)
index, top1, loss = 0, 0, 0
lr = self.optim.param_groups[0]['lr']
for num, (data, labels) in tqdm(enumerate(loader, start = 1),total=len(loader)):
self.zero_grad()
labels = torch.LongTensor(labels).cuda()
speaker_embedding = self.speaker_encoder.forward(data.cuda(), aug = True)
nloss, prec = self.speaker_loss.forward(speaker_embedding, labels)
nloss.backward()
self.optim.step()
index += len(labels)
top1 += prec
loss += nloss.detach().cpu().numpy()
sys.stderr.write(time.strftime("%m-%d %H:%M:%S") + \
" [%2d] Lr: %5f, Training: %.2f%%, " %(epoch, lr, 100 * (num / loader.__len__())) + \
" Loss: %.5f, ACC: %2.2f%% \r" %(loss/(num), top1/index*len(labels)))
sys.stderr.flush()
sys.stdout.write("\n")
return loss/num, lr, top1/index*len(labels)
def eval_network(self, eval_list, eval_path):
self.eval()
files = []
embeddings = {}
lines = open(eval_list).read().splitlines()
for line in lines:
files.append(line.split()[1])
files.append(line.split()[2])
setfiles = list(set(files))
setfiles.sort()
#print(files)
#print(eval_list)
# print(setfiles)
for idx, file in tqdm(enumerate(setfiles), total = len(setfiles)):
# print('begin for and tqdm')
audio, _ = soundfile.read(os.path.join(eval_path, file))
# print(file)
# print(audio.shape)
# print('read end&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
# Full utterance
data_1 = torch.FloatTensor(numpy.stack([audio],axis=0)).cuda()
# print('data_1 end')
# Split utterance matrix: five fixed-length crops of 300 frames * 160-sample hop + 240 samples (about 3 s at 16 kHz)
max_audio = 300 * 160 + 240
if audio.shape[0] <= max_audio:
shortage = max_audio - audio.shape[0]
audio = numpy.pad(audio, (0, shortage), 'wrap')
feats = []
startframe = numpy.linspace(0, audio.shape[0]-max_audio, num=5)
for asf in startframe:
feats.append(audio[int(asf):int(asf)+max_audio])
feats = numpy.stack(feats, axis = 0).astype(float)  # numpy.float alias was removed in newer NumPy
data_2 = torch.FloatTensor(feats).cuda()
# print('begin eval********************')
# Speaker embeddings
with torch.no_grad():
embedding_1 = self.speaker_encoder.forward(data_1, aug = False)
# print('forward1 end')
embedding_1 = F.normalize(embedding_1, p=2, dim=1)
# print('normalize end')
embedding_2 = self.speaker_encoder.forward(data_2, aug = False)
# print('forward2 end')
embedding_2 = F.normalize(embedding_2, p=2, dim=1)
# print('normalize2 end')
embeddings[file] = [embedding_1, embedding_2]
# print('embeddings end')
#print(num7 + 1)
#num7=num7 + 1
scores, labels = [], []
for line in lines:
embedding_11, embedding_12 = embeddings[line.split()[1]]
embedding_21, embedding_22 = embeddings[line.split()[2]]
# Compute the scores
score_1 = torch.mean(torch.matmul(embedding_11, embedding_21.T)) # higher is positive
score_2 = torch.mean(torch.matmul(embedding_12, embedding_22.T))
score = (score_1 + score_2) / 2
score = score.detach().cpu().numpy()
scores.append(score)
labels.append(int(line.split()[0]))
# Compute EER and minDCF
EER = tuneThresholdfromScore(scores, labels, [1, 0.1])[1]
fnrs, fprs, thresholds = ComputeErrorRates(scores, labels)
minDCF, _ = ComputeMinDcf(fnrs, fprs, thresholds, 0.05, 1, 1)
return EER, minDCF
def save_parameters(self, path):
torch.save(self.state_dict(), path)
def load_parameters(self, path):
self_state = self.state_dict()
loaded_state = torch.load(path)
for name, param in loaded_state.items():
origname = name
if name not in self_state:
name = name.replace("module.", "")
if name not in self_state:
print("%s is not in the model."%origname)
continue
if self_state[name].size() != loaded_state[origname].size():
print("Wrong parameter length: %s, model: %s, loaded: %s"%(origname, self_state[name].size(), loaded_state[origname].size()))
continue
self_state[name].copy_(param)
MIT License
Copyright (c) 2022 Tao Ruijie
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# ECAPA-TDNN
This directory contains an unofficial re-implementation of ECAPA-TDNN for speaker recognition on the VoxCeleb2 dataset.
The code is adapted from [voxceleb_trainer](https://github.com/clovaai/voxceleb_trainer).
For details of the system architecture, see [Chapter 3, ECAPA-SYSTEM](https://arxiv.org/pdf/2111.06671.pdf).
## Environment and dependencies
The code was developed with PyTorch 1.10 on DTK 22.04.1, using Python 3.7.
### Setting up the environment
```
conda create --name ECAPA-TDNN python=3.7
conda activate ECAPA-TDNN
```
### Installing dependencies
Install the torch packages built for DTK 22.04.1:
```
pip3 install torch-1.10.0a0+git450cdd1.dtk22.4-cp37-cp37m-linux_x86_64.whl
pip3 install torchvision-0.10.0a0_dtk22.04_e17f5ea-cp37-cp37m-linux_x86_64.whl
pip3 install torchaudio-0.10.0-cp37-cp37m-linux_x86_64.whl
```
Install the remaining dependencies:
```
pip3 install numpy
pip3 install scipy
pip3 install scikit-learn
pip3 install tqdm
pip3 install torchvision
pip3 install soundfile
```
## Data preparation
### Data lists and pretrained model
Download ECAPA-TDNN_data.zip from the link below and extract it into the project root. After extraction, the root directory should contain the exps folder and the files train_cut.txt, train_list.txt and veri_test2.txt.
Link: https://pan.baidu.com/s/11TWAUSwROOjvKOcx36bqFQ
Access code: uaia
### Downloading the datasets
**VoxCeleb1**
**VoxCeleb2**
(VoxCeleb1:
Link: https://pan.baidu.com/s/1iASZ01mUny7udnjChJIbOg
Access code: k7v0
VoxCeleb2:
Link: https://pan.baidu.com/s/1tBbdq2tm5KX7znM89o9LHg
Access code: 6hzs)
**MUSAN** (https://www.openslr.org/resources/17/musan.tar.gz)
**RIR** (https://openslr.org/resources/28/rirs_noises.zip)
### Preprocessing
Two datasets need preprocessing: VoxCeleb2 and MUSAN. The VoxCeleb2 audio must be converted from m4a to wav, and MUSAN must be split into musan_split, i.e. chopped into short segments that can be read randomly much faster. See the "Data preparation" section of [voxceleb_trainer](https://github.com/clovaai/voxceleb_trainer) for details.
#### VoxCeleb2
The VoxCeleb2 audio has to be converted, which requires ffmpeg to be installed in advance; otherwise the conversion fails. Run the following command (note: the conversion takes at least 24 hours):
```
python ./dataprep.py --save_path data --convert
```
#### MUSAN
For the MUSAN dataset, call the split_musan function in dataprep.py manually to perform the split, as sketched below.
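A minimal sketch of that call, assuming dataprep.py follows the voxceleb_trainer layout in which split_musan(args) reads the extracted files under args.save_path/musan and writes the split segments to args.save_path/musan_split:
```
# sketch only: dataprep.py is assumed to match the voxceleb_trainer layout
from argparse import Namespace
from dataprep import split_musan

# save_path is the directory that contains the extracted musan/ folder
split_musan(Namespace(save_path='data'))
```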
## Training
Create a train_single.sh script:
```
export HIP_VISIBLE_DEVICES=0
python3 trainECAPAModel.py --save_path exps/exp1
```
Run the script to start training:
```
bash train_single.sh
```
![图片1](./imgs/图片1.png)
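After training, a saved checkpoint can be scored on the VoxCeleb1 trial list with the --eval flag of trainECAPAModel.py; the checkpoint name and data paths below are placeholders:
```
python3 trainECAPAModel.py --eval \
    --initial_model exps/exp1/model/model_0080.model \
    --eval_list veri_test2.txt \
    --eval_path /path/to/voxceleb1/wav
```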
## References
https://github.com/TaoRuijie/ECAPA-TDNN
\ No newline at end of file
'''
DataLoader for training
'''
import glob, numpy, os, random, soundfile, torch
from scipy import signal
import tqdm
class train_loader(object):
def __init__(self, train_list, train_path, musan_path, rir_path, num_frames, **kwargs):
self.train_path = train_path
self.num_frames = num_frames
# Load and configure augmentation files
self.noisetypes = ['noise','speech','music']
self.noisesnr = {'noise':[0,15],'speech':[13,20],'music':[5,15]}
self.numnoise = {'noise':[1,1], 'speech':[3,8], 'music':[1,1]}
self.noiselist = {}
augment_files = glob.glob(os.path.join(musan_path,'*/*/*/*.wav'))
for file in augment_files:
if file.split('/')[-4] not in self.noiselist:
self.noiselist[file.split('/')[-4]] = []
self.noiselist[file.split('/')[-4]].append(file)
self.rir_files = glob.glob(os.path.join(rir_path,'*/*/*.wav'))
#print(self.rir_files)
# Load data & labels
self.data_list = []
self.data_label = []
lines = open(train_list).read().splitlines()
dictkeys = list(set([x.split()[0] for x in lines]))
dictkeys.sort()
dictkeys = { key : ii for ii, key in enumerate(dictkeys) }
for index, line in enumerate(lines):
speaker_label = dictkeys[line.split()[0]]
# print('speaker_label:',speaker_label)
file_name = os.path.join(train_path, line.split()[1])
self.data_label.append(speaker_label)
self.data_list.append(file_name)
#print('data_label:',self.data_label)
def __getitem__(self, index):
# Read the utterance and randomly select the segment
audio, sr = soundfile.read(self.data_list[index])
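# Segment length in samples: num_frames hops of 160 samples plus 240 extra (200 frames is roughly 2 s at 16 kHz)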
length = self.num_frames * 160 + 240
if audio.shape[0] <= length:
shortage = length - audio.shape[0]
audio = numpy.pad(audio, (0, shortage), 'wrap')
start_frame = numpy.int64(random.random()*(audio.shape[0]-length))
audio = audio[start_frame:start_frame + length]
audio = numpy.stack([audio],axis=0)
# Data Augmentation
augtype = random.randint(0,5)
if augtype == 0: # Original
audio = audio
elif augtype == 1: # Reverberation
audio = self.add_rev(audio)
elif augtype == 2: # Babble
audio = self.add_noise(audio, 'speech')
elif augtype == 3: # Music
audio = self.add_noise(audio, 'music')
elif augtype == 4: # Noise
audio = self.add_noise(audio, 'noise')
elif augtype == 5: # Television noise
audio = self.add_noise(audio, 'speech')
audio = self.add_noise(audio, 'music')
return torch.FloatTensor(audio[0]), self.data_label[index]
def __len__(self):
return len(self.data_list)
def add_rev(self, audio):
#print('rir_files:',self.rir_files)
rir_file = random.choice(self.rir_files)
rir, sr = soundfile.read(rir_file)
rir = numpy.expand_dims(rir.astype(float),0)
rir = rir / numpy.sqrt(numpy.sum(rir**2))
return signal.convolve(audio, rir, mode='full')[:,:self.num_frames * 160 + 240]
def add_noise(self, audio, noisecat):
clean_db = 10 * numpy.log10(numpy.mean(audio ** 2)+1e-4)
numnoise = self.numnoise[noisecat]
noiselist = random.sample(self.noiselist[noisecat], random.randint(numnoise[0],numnoise[1]))
noises = []
for noise in noiselist:
noiseaudio, sr = soundfile.read(noise)
length = self.num_frames * 160 + 240
if noiseaudio.shape[0] <= length:
shortage = length - noiseaudio.shape[0]
noiseaudio = numpy.pad(noiseaudio, (0, shortage), 'wrap')
start_frame = numpy.int64(random.random()*(noiseaudio.shape[0]-length))
noiseaudio = noiseaudio[start_frame:start_frame + length]
noiseaudio = numpy.stack([noiseaudio],axis=0)
noise_db = 10 * numpy.log10(numpy.mean(noiseaudio ** 2)+1e-4)
noisesnr = random.uniform(self.noisesnr[noisecat][0],self.noisesnr[noisecat][1])
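# Scale the noise so that the clean-to-noise power ratio matches the sampled SNR before mixing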
noises.append(numpy.sqrt(10 ** ((clean_db - noise_db - noisesnr) / 10)) * noiseaudio)
noise = numpy.sum(numpy.concatenate(noises,axis=0),axis=0,keepdims=True)
return noise + audio
'''
AAMsoftmax loss function copied from voxceleb_trainer: https://github.com/clovaai/voxceleb_trainer/blob/master/loss/aamsoftmax.py
'''
import torch, math
import torch.nn as nn
import torch.nn.functional as F
from tools import *
class AAMsoftmax(nn.Module):
def __init__(self, n_class, m, s):
super(AAMsoftmax, self).__init__()
self.m = m
self.s = s
self.weight = torch.nn.Parameter(torch.FloatTensor(n_class, 192), requires_grad=True)
self.ce = nn.CrossEntropyLoss()
nn.init.xavier_normal_(self.weight, gain=1)
self.cos_m = math.cos(self.m)
self.sin_m = math.sin(self.m)
self.th = math.cos(math.pi - self.m)
self.mm = math.sin(math.pi - self.m) * self.m
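# Precomputed constants for the additive angular margin:
# phi = cos(theta + m) = cos(theta)*cos(m) - sin(theta)*sin(m);
# th and mm give the linear fallback applied when theta + m would exceed pi.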
def forward(self, x, label=None):
cosine = F.linear(F.normalize(x), F.normalize(self.weight))
sine = torch.sqrt((1.0 - torch.mul(cosine, cosine)).clamp(0, 1))
phi = cosine * self.cos_m - sine * self.sin_m
phi = torch.where((cosine - self.th) > 0, phi, cosine - self.mm)
one_hot = torch.zeros_like(cosine)
one_hot.scatter_(1, label.view(-1, 1), 1)
output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
output = output * self.s
loss = self.ce(output, label)
prec1 = accuracy(output.detach(), label.detach(), topk=(1,))[0]
return loss, prec1
\ No newline at end of file
'''
This is the ECAPA-TDNN model.
This model is modified and combined based on the following three projects:
1. https://github.com/clovaai/voxceleb_trainer/issues/86
2. https://github.com/lawlict/ECAPA-TDNN/blob/master/ecapa_tdnn.py
3. https://github.com/speechbrain/speechbrain/blob/96077e9a1afff89d3f5ff47cab4bca0202770e4f/speechbrain/lobes/models/ECAPA_TDNN.py
'''
import math, torch, torchaudio
import torch.nn as nn
import torch.nn.functional as F
class SEModule(nn.Module):
def __init__(self, channels, bottleneck=128):
super(SEModule, self).__init__()
self.se = nn.Sequential(
nn.AdaptiveAvgPool1d(1),
nn.Conv1d(channels, bottleneck, kernel_size=1, padding=0),
nn.ReLU(),
# nn.BatchNorm1d(bottleneck), # I remove this layer
nn.Conv1d(bottleneck, channels, kernel_size=1, padding=0),
nn.Sigmoid(),
)
def forward(self, input):
x = self.se(input)
return input * x
class Bottle2neck(nn.Module):
def __init__(self, inplanes, planes, kernel_size=None, dilation=None, scale = 8):
super(Bottle2neck, self).__init__()
width = int(math.floor(planes / scale))
self.conv1 = nn.Conv1d(inplanes, width*scale, kernel_size=1)
self.bn1 = nn.BatchNorm1d(width*scale)
self.nums = scale -1
convs = []
bns = []
num_pad = math.floor(kernel_size/2)*dilation
for i in range(self.nums):
convs.append(nn.Conv1d(width, width, kernel_size=kernel_size, dilation=dilation, padding=num_pad))
bns.append(nn.BatchNorm1d(width))
self.convs = nn.ModuleList(convs)
self.bns = nn.ModuleList(bns)
self.conv3 = nn.Conv1d(width*scale, planes, kernel_size=1)
self.bn3 = nn.BatchNorm1d(planes)
self.relu = nn.ReLU()
self.width = width
self.se = SEModule(planes)
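# Res2Net-style forward: the channels are split into `scale` groups; each group after the first is summed with the previous group's convolved output before its own convolution, the last group passes through unchanged, and the concatenated result goes through a 1x1 conv, SE re-weighting and a residual connection.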
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.relu(out)
out = self.bn1(out)
spx = torch.split(out, self.width, 1)
for i in range(self.nums):
if i==0:
sp = spx[i]
else:
sp = sp + spx[i]
sp = self.convs[i](sp)
sp = self.relu(sp)
sp = self.bns[i](sp)
if i==0:
out = sp
else:
out = torch.cat((out, sp), 1)
out = torch.cat((out, spx[self.nums]),1)
out = self.conv3(out)
out = self.relu(out)
out = self.bn3(out)
out = self.se(out)
out += residual
return out
class PreEmphasis(torch.nn.Module):
def __init__(self, coef: float = 0.97):
super().__init__()
self.coef = coef
self.register_buffer(
'flipped_filter', torch.FloatTensor([-self.coef, 1.]).unsqueeze(0).unsqueeze(0)
)
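# Pre-emphasis filter y[n] = x[n] - coef * x[n-1], implemented as a 1-D convolution with the flipped kernel [-coef, 1] after reflect-padding one sample on the left.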
def forward(self, input: torch.Tensor) -> torch.Tensor:
input = input.unsqueeze(1)
input = F.pad(input, (1, 0), 'reflect')
return F.conv1d(input, self.flipped_filter).squeeze(1)
class FbankAug(nn.Module):
def __init__(self, freq_mask_width = (0, 8), time_mask_width = (0, 10)):
self.time_mask_width = time_mask_width
self.freq_mask_width = freq_mask_width
super().__init__()
def mask_along_axis(self, x, dim):
original_size = x.shape
batch, fea, time = x.shape
if dim == 1:
D = fea
width_range = self.freq_mask_width
else:
D = time
width_range = self.time_mask_width
mask_len = torch.randint(width_range[0], width_range[1], (batch, 1), device=x.device).unsqueeze(2)
mask_pos = torch.randint(0, max(1, D - mask_len.max()), (batch, 1), device=x.device).unsqueeze(2)
arange = torch.arange(D, device=x.device).view(1, 1, -1)
mask = (mask_pos <= arange) * (arange < (mask_pos + mask_len))
mask = mask.any(dim=1)
if dim == 1:
mask = mask.unsqueeze(2)
else:
mask = mask.unsqueeze(1)
x = x.masked_fill_(mask, 0.0)
return x.view(*original_size)
def forward(self, x):
x = self.mask_along_axis(x, dim=2)
x = self.mask_along_axis(x, dim=1)
return x
class ECAPA_TDNN(nn.Module):
def __init__(self, C):
super(ECAPA_TDNN, self).__init__()
self.torchfbank = torch.nn.Sequential(
PreEmphasis(),
torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, \
f_min = 20, f_max = 7600, window_fn=torch.hamming_window, n_mels=80),
)
self.specaug = FbankAug() # Spec augmentation
self.conv1 = nn.Conv1d(80, C, kernel_size=5, stride=1, padding=2)
self.relu = nn.ReLU()
self.bn1 = nn.BatchNorm1d(C)
self.layer1 = Bottle2neck(C, C, kernel_size=3, dilation=2, scale=8)
self.layer2 = Bottle2neck(C, C, kernel_size=3, dilation=3, scale=8)
self.layer3 = Bottle2neck(C, C, kernel_size=3, dilation=4, scale=8)
# I fixed the shape of the output from MFA layer, that is close to the setting from ECAPA paper.
self.layer4 = nn.Conv1d(3*C, 1536, kernel_size=1)
self.attention = nn.Sequential(
nn.Conv1d(4608, 256, kernel_size=1),
nn.ReLU(),
nn.BatchNorm1d(256),
nn.Tanh(), # I add this layer
nn.Conv1d(256, 1536, kernel_size=1),
nn.Softmax(dim=2),
)
self.bn5 = nn.BatchNorm1d(3072)
self.fc6 = nn.Linear(3072, 192)
self.bn6 = nn.BatchNorm1d(192)
def forward(self, x, aug):
with torch.no_grad():
x = self.torchfbank(x)+1e-6
x = x.log()
x = x - torch.mean(x, dim=-1, keepdim=True)
if aug == True:
x = self.specaug(x)
x = self.conv1(x)
x = self.relu(x)
x = self.bn1(x)
x1 = self.layer1(x)
x2 = self.layer2(x+x1)
x3 = self.layer3(x+x1+x2)
x = self.layer4(torch.cat((x1,x2,x3),dim=1))
x = self.relu(x)
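# Attentive statistics pooling: append the utterance-level mean and std of the frame features as global context, predict attention weights w over time, then pool to the weighted mean mu and weighted std sg.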
t = x.size()[-1]
global_x = torch.cat((x,torch.mean(x,dim=2,keepdim=True).repeat(1,1,t), torch.sqrt(torch.var(x,dim=2,keepdim=True).clamp(min=1e-4)).repeat(1,1,t)), dim=1)
w = self.attention(global_x)
mu = torch.sum(x * w, dim=2)
sg = torch.sqrt( ( torch.sum((x**2) * w, dim=2) - mu**2 ).clamp(min=1e-4) )
x = torch.cat((mu,sg),1)
x = self.bn5(x)
x = self.fc6(x)
x = self.bn6(x)
return x
\ No newline at end of file
numpy
scipy
scikit-learn
tqdm
soundfile
\ No newline at end of file
'''
Some utilized functions
These functions are all copied from voxceleb_trainer: https://github.com/clovaai/voxceleb_trainer/blob/master/tuneThreshold.py
'''
import os, numpy, torch
from sklearn import metrics
from operator import itemgetter
import torch.nn.functional as F
def init_args(args):
args.score_save_path = os.path.join(args.save_path, 'score.txt')
args.model_save_path = os.path.join(args.save_path, 'model')
os.makedirs(args.model_save_path, exist_ok = True)
return args
def tuneThresholdfromScore(scores, labels, target_fa, target_fr = None):
fpr, tpr, thresholds = metrics.roc_curve(labels, scores, pos_label=1)
fnr = 1 - tpr
tunedThreshold = [];
if target_fr:
for tfr in target_fr:
idx = numpy.nanargmin(numpy.absolute((tfr - fnr)))
tunedThreshold.append([thresholds[idx], fpr[idx], fnr[idx]])
for tfa in target_fa:
idx = numpy.nanargmin(numpy.absolute((tfa - fpr))) # numpy.where(fpr<=tfa)[0][-1]
tunedThreshold.append([thresholds[idx], fpr[idx], fnr[idx]])
idxE = numpy.nanargmin(numpy.absolute((fnr - fpr)))
eer = max(fpr[idxE],fnr[idxE])*100
return tunedThreshold, eer, fpr, fnr
# Creates a list of false-negative rates, a list of false-positive rates
# and a list of decision thresholds that give those error-rates.
def ComputeErrorRates(scores, labels):
# Sort the scores from smallest to largest, and also get the corresponding
# indexes of the sorted scores. We will treat the sorted scores as the
# thresholds at which the error-rates are evaluated.
sorted_indexes, thresholds = zip(*sorted(
[(index, threshold) for index, threshold in enumerate(scores)],
key=itemgetter(1)))
sorted_labels = []
labels = [labels[i] for i in sorted_indexes]
fnrs = []
fprs = []
# At the end of this loop, fnrs[i] is the number of errors made by
# incorrectly rejecting scores less than thresholds[i]. And, fprs[i]
# is the total number of times that we have correctly accepted scores
# greater than thresholds[i].
for i in range(0, len(labels)):
if i == 0:
fnrs.append(labels[i])
fprs.append(1 - labels[i])
else:
fnrs.append(fnrs[i-1] + labels[i])
fprs.append(fprs[i-1] + 1 - labels[i])
fnrs_norm = sum(labels)
fprs_norm = len(labels) - fnrs_norm
# Now divide by the total number of false negative errors to
# obtain the false positive rates across all thresholds
fnrs = [x / float(fnrs_norm) for x in fnrs]
# Divide by the total number of correct positives to get the
# true positive rate. Subtract these quantities from 1 to
# get the false positive rates.
fprs = [1 - x / float(fprs_norm) for x in fprs]
return fnrs, fprs, thresholds
# Computes the minimum of the detection cost function. The comments refer to
# equations in Section 3 of the NIST 2016 Speaker Recognition Evaluation Plan.
def ComputeMinDcf(fnrs, fprs, thresholds, p_target, c_miss, c_fa):
min_c_det = float("inf")
min_c_det_threshold = thresholds[0]
for i in range(0, len(fnrs)):
# See Equation (2). it is a weighted sum of false negative
# and false positive errors.
c_det = c_miss * fnrs[i] * p_target + c_fa * fprs[i] * (1 - p_target)
if c_det < min_c_det:
min_c_det = c_det
min_c_det_threshold = thresholds[i]
# See Equations (3) and (4). Now we normalize the cost.
c_def = min(c_miss * p_target, c_fa * (1 - p_target))
min_dcf = min_c_det / c_def
return min_dcf, min_c_det_threshold
def accuracy(output, target, topk=(1,)):
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = []
for k in topk:
correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
res.append(correct_k.mul_(100.0 / batch_size))
return res
\ No newline at end of file
'''
This is the main code of the ECAPATDNN project, to define the parameters and build the construction
'''
import argparse, glob, os, torch, warnings, time
from tools import *
from dataLoader import train_loader
from ECAPAModel import ECAPAModel
import tqdm
parser = argparse.ArgumentParser(description = "ECAPA_trainer")
## Training Settings
parser.add_argument('--num_frames', type=int, default=200, help='Duration of the input segments in frames, e.g. 200 for a 2-second segment')
parser.add_argument('--max_epoch', type=int, default=80, help='Maximum number of epochs')
parser.add_argument('--batch_size', type=int, default=400, help='Batch size')
parser.add_argument('--n_cpu', type=int, default=0, help='Number of loader threads')
parser.add_argument('--test_step', type=int, default=1, help='Test and save every [test_step] epochs')
parser.add_argument('--lr', type=float, default=0.001, help='Learning rate')
parser.add_argument("--lr_decay", type=float, default=0.97, help='Learning rate decay every [test_step] epochs')
## Training and evaluation path/lists, save path
parser.add_argument('--train_list', type=str, default="/datasets/projects/ECAPA-TDNN-main/train_cut.txt", help='The path of the training list, https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/train_list.txt')
parser.add_argument('--train_path', type=str, default="/datasets/datasets/voxceleb2/", help='The path of the training data, eg:"/data08/VoxCeleb2/train/wav" in my case')
parser.add_argument('--eval_list', type=str, default="/datasets/projects/ECAPA-TDNN-main/veri_test2.txt", help='The path of the evaluation list, veri_test2.txt comes from https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt')
parser.add_argument('--eval_path', type=str, default="/datasets/datasets/voxceleb1/wav/", help='The path of the evaluation data, eg:"/data08/VoxCeleb1/test/wav" in my case')
parser.add_argument('--musan_path', type=str, default="/datasets/datasets/musan_split/", help='The path to the MUSAN set, eg:"/data08/Others/musan_split" in my case')
parser.add_argument('--rir_path', type=str, default="/datasets/datasets/RIRS_NOISES/simulated_rirs/", help='The path to the RIR set, eg:"/data08/Others/RIRS_NOISES/simulated_rirs" in my case');
parser.add_argument('--save_path', type=str, default="exps/exp1", help='Path to save the score.txt and models')
parser.add_argument('--initial_model', type=str, default="", help='Path of the initial_model')
## Model and Loss settings
parser.add_argument('--C', type=int, default=1024, help='Channel size for the speaker encoder')
parser.add_argument('--m', type=float, default=0.2, help='Loss margin in AAM softmax')
parser.add_argument('--s', type=float, default=30, help='Loss scale in AAM softmax')
parser.add_argument('--n_class', type=int, default=5994, help='Number of speakers')
## Command
parser.add_argument('--eval', dest='eval', action='store_true', help='Only do evaluation')
## Initialization
warnings.simplefilter("ignore")
torch.multiprocessing.set_sharing_strategy('file_system')
args = parser.parse_args()
args = init_args(args)
## Define the data loader
trainloader = train_loader(**vars(args))
trainLoader = torch.utils.data.DataLoader(trainloader, batch_size = args.batch_size, shuffle = True, num_workers = args.n_cpu, drop_last = True)
#trainLoader_num = tqdm.tqdm(enumerate(trainLoader),total=len(trainLoader))
## Search for existing models
modelfiles = glob.glob('%s/model_0*.model'%args.model_save_path)
modelfiles.sort()
## Evaluation only; an initial_model is required
if args.eval == True:
s = ECAPAModel(**vars(args))
print("Model %s loaded from previous state!"%args.initial_model)
s.load_parameters(args.initial_model)
EER, minDCF = s.eval_network(eval_list = args.eval_list, eval_path = args.eval_path)
print("EER %2.2f%%, minDCF %.4f%%"%(EER, minDCF))
quit()
## If an initial_model exists, the system will train from the initial_model
if args.initial_model != "":
print("Model %s loaded from previous state!"%args.initial_model)
s = ECAPAModel(**vars(args))
s.load_parameters(args.initial_model)
epoch = 1
## Otherwise, the system will try to resume from the most recently saved model and epoch
elif len(modelfiles) >= 1:
print("Model %s loaded from previous state!"%modelfiles[-1])
epoch = int(os.path.splitext(os.path.basename(modelfiles[-1]))[0][6:]) + 1
s = ECAPAModel(**vars(args))
s.load_parameters(modelfiles[-1])
## Otherwise, the system will train from scratch
else:
epoch = 1
s = ECAPAModel(**vars(args))
EERs = []
score_file = open(args.score_save_path, "a+")
while(1):
## Training for one epoch
loss, lr, acc = s.train_network(epoch = epoch, loader = trainLoader)
## Evaluation every [test_step] epochs
if epoch % args.test_step == 0:
s.save_parameters(args.model_save_path + "/model_%04d.model"%epoch)
EERs.append(s.eval_network(eval_list = args.eval_list, eval_path = args.eval_path)[0])
print(time.strftime("%Y-%m-%d %H:%M:%S"), "%d epoch, ACC %2.2f%%, EER %2.2f%%, bestEER %2.2f%%"%(epoch, acc, EERs[-1], min(EERs)))
score_file.write("%d epoch, LR %f, LOSS %f, ACC %2.2f%%, EER %2.2f%%, bestEER %2.2f%%\n"%(epoch, lr, loss, acc, EERs[-1], min(EERs)))
score_file.flush()
if epoch >= args.max_epoch:
quit()
epoch += 1
#!/bin/bash
export HIP_VISIBLE_DEVICES=0
python3 trainECAPAModel.py --save_path exps/exp1
MIT License
Copyright (c) 2020 Chung-Ming Chien
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# FastSpeech 2 - Fast and High-Quality End-to-End Text to Speech
This is a PyTorch implementation of Microsoft's text-to-speech system [**FastSpeech 2: Fast and High-Quality End-to-End Text to Speech**](https://arxiv.org/abs/2006.04558v1). The project is based on [FastSpeech](https://github.com/xcmyz/FastSpeech).
There are several versions of FastSpeech 2; this implementation is closer to [version 1](https://arxiv.org/abs/2006.04558v1), which uses F0 values as the pitch feature.
[Later versions](https://arxiv.org/abs/2006.04558), by contrast, use a pitch spectrogram extracted with the continuous wavelet transform as the pitch feature.
![](./img/model.png)
## Environment and dependencies
### Creating the environment
```
# create the conda virtual environment
conda create --name FastSpeech2 python=3.7
# check that the environment was created
conda env list
# activate the environment
conda activate FastSpeech2
```
### Installing dependencies
```
pip3 install torch-1.10.0a0+git450cdd1.dtk22.4-cp37-cp37m-linux_x86_64.whl
pip3 install torchvision-0.10.0a0_dtk22.04_300a8a4-cp37-cp37m-linux_x86_64.whl
pip3 install -r requirements.txt
```
## Data preprocessing
The dataset used is [LJSpeech](https://keithito.com/LJ-Speech-Dataset/): a single-speaker English dataset of 13,100 short audio clips of a female speaker reading passages from 7 non-fiction books, about 24 hours in total.
```
wget https://drive.google.com/drive/folders/1DBRkALpPd6FL9gjHMmMEdHODmkgNIIK4?usp=sharing
```
This model uses the Montreal Forced Aligner (MFA) to obtain alignments between the speech and the phoneme sequences.
Alignments for the supported datasets are provided [here](https://drive.google.com/drive/folders/1DBRkALpPd6FL9gjHMmMEdHODmkgNIIK4?usp=sharing).
Download preprocessed_data.zip and pretrain_model.zip from the link below, extract preprocessed_data.zip into the project root, and extract pretrain_model.zip into the hifigan folder. Note that the two pretrained models inside pretrain_model.zip, generator_LJSpeech and generator_universal, must themselves be extracted again; otherwise training fails with a file-not-found error.
Link: https://pan.baidu.com/s/1kDAAyXYClgS8U-703DA-nA
Access code: ujn6
Then run the following commands to preprocess the LJSpeech dataset:
```
python3 prepare_align.py config/LJSpeech/preprocess.yaml
python3 preprocess.py config/LJSpeech/preprocess.yaml
```
Note that the paths in preprocess.yaml must be adjusted to where the dataset was downloaded on the current machine. As shown in the screenshot below, only corpus_path needs to point to the downloaded dataset, and raw_path must point to an existing empty folder, otherwise an error occurs; a reference excerpt of the file follows the screenshot.
![pretrain](./img/pretrain.png)
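For reference, the path section of config/LJSpeech/preprocess.yaml looks roughly like the excerpt below; the values shown are illustrative and the remaining keys are left as shipped:
```
path:
  corpus_path: "/path/to/LJSpeech-1.1"   # where the downloaded dataset lives
  raw_path: "./raw_data/LJSpeech"        # must point to an existing (empty) folder
```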
## Training
Single-GPU training
Create the following script and run it to train the model:
```
train_single.sh:
export HIP_VISIBLE_DEVICES=0
python3 train.py -p config/LJSpeech/preprocess.yaml -m config/LJSpeech/model.yaml -t config/LJSpeech/train.yaml
```
```
bash train_single.sh
```
Multi-GPU training
Create the following script and run it to train the model:
```
train_ddp.sh:
export HIP_VISIBLE_DEVICES=0,1,2,3
export NGPUS=4
python3 -m torch.distributed.launch --nproc_per_node ${NGPUS} train_ddp.py -p config/LJSpeech/preprocess.yaml -m config/LJSpeech/model.yaml -t config/LJSpeech/train.yaml
```
After training, the logs and models are saved in the output folder. The logs can be visualized with TensorBoard to inspect the loss curves, mel-spectrograms and synthesized audio.
The following command starts TensorBoard for monitoring the losses and listening to the audio samples:
```
tensorboard --logdir output/log/LJSpeech
```
![](./img/tensorboard_loss.png)
![](./img/tensorboard_spec.png)
![](./img/tensorboard_audio.png)
## Performance
![图片2](./img/图片2.png)
## FAQ
**a. AttributeError: module 'distutils' has no attribute 'version'**
This is caused by the setuptools version; downgrading it resolves the issue:
```
pip3 uninstall setuptools
pip3 install setuptools==59.5.0
```
**b. If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.**
Pinning protobuf to 3.20.x resolves the issue:
```
pip3 install --upgrade protobuf==3.20.1
```
**c. RuntimeError: Numpy is not available**
Reinstalling numpy resolves the issue:
```
pip3 uninstall numpy
pip3 install numpy
```
## References
https://github.com/ming024/FastSpeech2
\ No newline at end of file
import audio.tools
import audio.stft
import audio.audio_processing