from enum import Enum
class Tone(Enum):
UnAssigned = -1
NoneTone = 0
YinPing = 1 # ZhHK: YinPingYinRu EnUS: primary stress
YangPing = 2 # ZhHK: YinShang EnUS: secondary stress
ShangSheng = 3 # ZhHK: YinQuZhongRu
QuSheng = 4 # ZhHK: YangPing
QingSheng = 5 # ZhHK: YangShang
YangQuYangRu = 6 # ZhHK: YangQuYangRu
@classmethod
def parse(cls, in_str):
if not isinstance(in_str, str):
return super(Tone, cls).__new__(cls, in_str)
if in_str in ["UnAssigned", "-1"]:
return Tone.UnAssigned
elif in_str in ["NoneTone", "0"]:
return Tone.NoneTone
elif in_str in ["YinPing", "1"]:
return Tone.YinPing
elif in_str in ["YangPing", "2"]:
return Tone.YangPing
elif in_str in ["ShangSheng", "3"]:
return Tone.ShangSheng
elif in_str in ["QuSheng", "4"]:
return Tone.QuSheng
elif in_str in ["QingSheng", "5"]:
return Tone.QingSheng
elif in_str in ["YangQuYangRu", "6"]:
return Tone.YangQuYangRu
else:
return Tone.NoneTone
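# Usage sketch (assumed): parse() accepts either the member name or its numeric
# value as a string and falls back to NoneTone for anything it cannot map, e.g.
#   Tone.parse("YinPing") -> Tone.YinPing
#   Tone.parse("1")       -> Tone.YinPing
#   Tone.parse("foo")     -> Tone.NoneTone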
class BreakLevel(Enum):
UnAssigned = -1
L0 = 0
L1 = 1
L2 = 2
L3 = 3
L4 = 4
@classmethod
def parse(cls, in_str):
if not isinstance(in_str, str):
return super(BreakLevel, cls).__new__(cls, in_str)
if in_str in ["UnAssigned", "-1"]:
return BreakLevel.UnAssigned
elif in_str in ["L0", "0"]:
return BreakLevel.L0
elif in_str in ["L1", "1"]:
return BreakLevel.L1
elif in_str in ["L2", "2"]:
return BreakLevel.L2
elif in_str in ["L3", "3"]:
return BreakLevel.L3
elif in_str in ["L4", "4"]:
return BreakLevel.L4
else:
return BreakLevel.UnAssigned
class SentencePurpose(Enum):
Declarative = 0
Interrogative = 1
Exclamatory = 2
Imperative = 3
class Language(Enum):
Neutral = 0
EnUS = 1033
EnGB = 2057
ZhCN = 2052
PinYin = 2053
WuuShanghai = 2054
Sichuan = 2055
ZhHK = 3076
ZhEn = ZhCN | EnUS
@classmethod
def parse(cls, in_str):
if not isinstance(in_str, str):
return super(Language, cls).__new__(cls, in_str)
if in_str in ["Neutral", "0"]:
return Language.Neutral
elif in_str in ["EnUS", "1033"]:
return Language.EnUS
elif in_str in ["EnGB", "2057"]:
return Language.EnGB
elif in_str in ["ZhCN", "2052"]:
return Language.ZhCN
elif in_str in ["PinYin", "2053"]:
return Language.PinYin
elif in_str in ["WuuShanghai", "2054"]:
return Language.WuuShanghai
elif in_str in ["Sichuan", "2055"]:
return Language.Sichuan
elif in_str in ["ZhHK", "3076"]:
return Language.ZhHK
elif in_str in ["ZhEn", "2052|1033"]:
return Language.ZhEn
else:
return Language.Neutral
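# Note: inside the class body ZhCN and EnUS are still plain ints, so
# ZhEn = 2052 | 1033 evaluates to 3085. Language.parse() maps the literal
# string "2052|1033" (not "3085") back to Language.ZhEn and returns
# Language.Neutral for anything unrecognised.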
"""
Phone Types
"""
class PhoneCVType(Enum):
NULL = -1
Consonant = 1
Vowel = 2
@classmethod
def parse(cls, in_str):
if not isinstance(in_str, str):
return super(PhoneCVType, cls).__new__(cls, in_str)
if in_str in ["consonant", "Consonant"]:
return PhoneCVType.Consonant
elif in_str in ["vowel", "Vowel"]:
return PhoneCVType.Vowel
else:
return PhoneCVType.NULL
class PhoneIFType(Enum):
NULL = -1
Initial = 1
Final = 2
@classmethod
def parse(cls, in_str):
if not isinstance(in_str, str):
return super(PhoneIFType, cls).__new__(cls, in_str)
if in_str in ["initial", "Initial"]:
return PhoneIFType.Initial
elif in_str in ["final", "Final"]:
return PhoneIFType.Final
else:
return PhoneIFType.NULL
class PhoneUVType(Enum):
NULL = -1
Voiced = 1
UnVoiced = 2
@classmethod
def parse(cls, in_str):
if not isinstance(in_str, str):
return super(PhoneUVType, cls).__new__(cls, in_str)
if in_str in ["voiced", "Voiced"]:
return PhoneUVType.Voiced
elif in_str in ["unvoiced", "UnVoiced"]:
return PhoneUVType.UnVoiced
else:
return PhoneUVType.NULL
class PhoneAPType(Enum):
NULL = -1
DoubleLips = 1
LipTooth = 2
FrontTongue = 3
CentralTongue = 4
BackTongue = 5
Dorsal = 6
Velar = 7
Low = 8
Middle = 9
High = 10
@classmethod
def parse(cls, in_str):
if not isinstance(in_str, str):
return super(PhoneAPType, cls).__new__(cls, in_str)
if in_str in ["doublelips", "DoubleLips"]:
return PhoneAPType.DoubleLips
elif in_str in ["liptooth", "LipTooth"]:
return PhoneAPType.LipTooth
elif in_str in ["fronttongue", "FrontTongue"]:
return PhoneAPType.FrontTongue
elif in_str in ["centraltongue", "CentralTongue"]:
return PhoneAPType.CentralTongue
elif in_str in ["backtongue", "BackTongue"]:
return PhoneAPType.BackTongue
elif in_str in ["dorsal", "Dorsal"]:
return PhoneAPType.Dorsal
elif in_str in ["velar", "Velar"]:
return PhoneAPType.Velar
elif in_str in ["low", "Low"]:
return PhoneAPType.Low
elif in_str in ["middle", "Middle"]:
return PhoneAPType.Middle
elif in_str in ["high", "High"]:
return PhoneAPType.High
else:
return PhoneAPType.NULL
class PhoneAMType(Enum):
NULL = -1
Stop = 1
Affricate = 2
Fricative = 3
Nasal = 4
Lateral = 5
Open = 6
Close = 7
@classmethod
def parse(cls, in_str):
if not isinstance(in_str, str):
return super(PhoneAMType, cls).__new__(cls, in_str)
if in_str in ["stop", "Stop"]:
return PhoneAMType.Stop
elif in_str in ["affricate", "Affricate"]:
return PhoneAMType.Affricate
elif in_str in ["fricative", "Fricative"]:
return PhoneAMType.Fricative
elif in_str in ["nasal", "Nasal"]:
return PhoneAMType.Nasal
elif in_str in ["lateral", "Lateral"]:
return PhoneAMType.Lateral
elif in_str in ["open", "Open"]:
return PhoneAMType.Open
elif in_str in ["close", "Close"]:
return PhoneAMType.Close
else:
return PhoneAMType.NULL
import re
import unicodedata
import codecs
WordPattern = r"((?P<Word>\w+)(\(\w+\))?)"
BreakPattern = r"(?P<Break>(\*?#(?P<BreakLevel>[0-4])))"
MarkPattern = r"(?P<Mark>[、,。!?:“”《》·])"
POSPattern = r"(?P<POS>(\*?\|(?P<POSClass>[1-9])))"
PhraseTonePattern = r"(?P<PhraseTone>(\*?%[LH]))"
NgBreakPattern = r"^ng(?P<break>\d)"
RegexWord = re.compile(WordPattern + r"\s*")
RegexBreak = re.compile(BreakPattern + r"\s*")
RegexID = re.compile(r"^(?P<ID>.*?)\s")
RegexSentence = re.compile(
r"({}|{}|{}|{}|{})\s*".format(
WordPattern, BreakPattern, MarkPattern, POSPattern, PhraseTonePattern
)
)
RegexForeignLang = re.compile(r"[A-Z@]")
RegexSpace = re.compile(r"^\s*")
RegexNeutralTone = re.compile(r"[1-5]5")
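# Illustrative matches (sketch): RegexBreak finds prosodic break marks such as
# "#2", POSPattern matches part-of-speech tags such as "|3", PhraseTonePattern
# matches "%L"/"%H", and RegexSentence tokenises a labelled line into words,
# breaks, punctuation marks, POS tags and phrase tones in one pass.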
def do_character_normalization(line):
return unicodedata.normalize("NFKC", line)
def do_prosody_text_normalization(line):
tokens = line.split("\t")
text = tokens[1]
# Remove punctuations
text = text.replace(u"。", " ")
text = text.replace(u"、", " ")
text = text.replace(u"“", " ")
text = text.replace(u"”", " ")
text = text.replace(u"‘", " ")
text = text.replace(u"’", " ")
text = text.replace(u"|", " ")
text = text.replace(u"《", " ")
text = text.replace(u"》", " ")
text = text.replace(u"【", " ")
text = text.replace(u"】", " ")
text = text.replace(u"—", " ")
text = text.replace(u"―", " ")
text = text.replace(".", " ")
text = text.replace("!", " ")
text = text.replace("?", " ")
text = text.replace("(", " ")
text = text.replace(")", " ")
text = text.replace("[", " ")
text = text.replace("]", " ")
text = text.replace("{", " ")
text = text.replace("}", " ")
text = text.replace("~", " ")
text = text.replace(":", " ")
text = text.replace(";", " ")
text = text.replace("+", " ")
text = text.replace(",", " ")
# text = text.replace('·', ' ')
text = text.replace('"', " ")
text = text.replace(
"-", ""
)  # don't replace with a space: keep compound words like two-year-old intact
text = text.replace(
"'", ""
)  # don't replace with a space: keep English contractions like that's intact
# Replace break
text = text.replace("/", "#2")
text = text.replace("%", "#3")
# Remove useless spaces surround #2 #3 #4
text = re.sub(r"(#\d)[ ]+", r"\1", text)
text = re.sub(r"[ ]+(#\d)", r"\1", text)
# Replace space by #1
text = re.sub("[ ]+", "#1", text)
# Remove break at the end of the text
text = re.sub(r"#\d$", "", text)
# Add #1 between target language and foreign language
text = re.sub(r"([a-zA-Z])([^a-zA-Z\d\#\s\'\%\/\-])", r"\1#1\2", text)
text = re.sub(r"([^a-zA-Z\d\#\s\'\%\/\-])([a-zA-Z])", r"\1#1\2", text)
return tokens[0] + "\t" + text
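# Example with an assumed "<id>\t<text>" input line:
#   do_prosody_text_normalization("0001\t今天 天气/不错 。")
#   -> "0001\t今天#1天气#2不错"
# "/" and "%" become #2/#3 breaks, remaining spaces become #1, and a trailing
# break is dropped.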
def is_fp_line(line):
fp_category_list = ["FP", "I", "N", "Q"]
elements = line.strip().split(" ")
res = True
for ele in elements:
if ele not in fp_category_list:
res = False
break
return res
def format_prosody(src_prosody):
formatted_lines = []
with codecs.open(src_prosody, "r", "utf-8") as f:
lines = f.readlines()
idx = 0
while idx < len(lines):
line = do_character_normalization(lines[idx])
if len(line.strip().split("\t")) == 2:
line = do_prosody_text_normalization(line)
else:
fp_enable = is_fp_line(line)
if fp_enable:
idx += 3
continue
formatted_lines.append(line)
idx += 1
# with codecs.open(tgt_prosody, 'w', 'utf-8') as f:
# f.writelines(formatted_lines)
return formatted_lines
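# Usage sketch: format_prosody("xxx.prosody") returns the normalised lines of a
# prosody label file. A line whose space-separated tokens are all in
# {"FP", "I", "N", "Q"} (assumed filler-pause markup) starts a three-line block
# that is skipped entirely.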
from collections import OrderedDict
import torch
from torch import nn
import torch.nn.functional as F
from .layers import (DenseLayer, DenseTDNNBlock, StatsPool, TDNNLayer, SEDenseTDNNBlock,
TransitLayer)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, in_planes, planes, stride=1):
super(BasicBlock, self).__init__()
self.conv1 = nn.Conv2d(in_planes,
planes,
kernel_size=3,
stride=(stride, 1),
padding=1,
bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes,
planes,
kernel_size=3,
stride=1,
padding=1,
bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion * planes:
self.shortcut = nn.Sequential(
nn.Conv2d(in_planes,
self.expansion * planes,
kernel_size=1,
stride=(stride, 1),
bias=False),
nn.BatchNorm2d(self.expansion * planes))
def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)))
out = self.bn2(self.conv2(out))
out += self.shortcut(x)
out = F.relu(out)
return out
class CNN_Head(nn.Module):
def __init__(self,
block=BasicBlock,
num_blocks=[2, 2],
m_channels=32,
feat_dim=80):
super(CNN_Head, self).__init__()
self.in_planes = m_channels
self.conv1 = nn.Conv2d(1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(m_channels)
self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=2)
self.layer2 = self._make_layer(block, m_channels, num_blocks[1], stride=2)
self.conv2 = nn.Conv2d(m_channels, m_channels, kernel_size=3, stride=(2, 1), padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(m_channels)
self.out_channels = m_channels * (feat_dim // 8)
def _make_layer(self, block, planes, num_blocks, stride):
strides = [stride] + [1] * (num_blocks - 1)
layers = []
for stride in strides:
layers.append(block(self.in_planes, planes, stride))
self.in_planes = planes * block.expansion
return nn.Sequential(*layers)
def forward(self, x):
x = x.unsqueeze(1)
out = F.relu(self.bn1(self.conv1(x)))
out = self.layer1(out)
out = self.layer2(out)
out = F.relu(self.bn2(self.conv2(out)))
out = out.reshape(out.shape[0], out.shape[1]*out.shape[2], out.shape[3])
return out
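# Shape sketch for the default CNN_Head (feat_dim=80, m_channels=32):
#   (B, 1, 80, T) --conv1--> (B, 32, 80, T)
#   --layer1, layer2 (frequency stride 2 each)--> (B, 32, 20, T)
#   --conv2 (frequency stride 2)--> (B, 32, 10, T)
#   --reshape--> (B, 320, T), i.e. out_channels = m_channels * (feat_dim // 8).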
class DTDNN(nn.Module):
def __init__(self,
feat_dim=80,
embedding_size=192,
growth_rate=32,
bn_size=4,
init_channels=128,
config_str='batchnorm-relu',
memory_efficient=True):
super(DTDNN, self).__init__()
self.head = CNN_Head()
feat_dim = self.head.out_channels
self.xvector = nn.Sequential(
OrderedDict([
('tdnn',
TDNNLayer(feat_dim,
init_channels,
5,
stride=2,
dilation=1,
padding=-1,
config_str=config_str)),
]))
channels = init_channels
for i, (num_layers, kernel_size,
dilation) in enumerate(zip((12, 24, 16), (3, 3, 3), (1, 2, 3))):
block = SEDenseTDNNBlock(num_layers=num_layers,
in_channels=channels,
out_channels=growth_rate,
bn_channels=bn_size * growth_rate,
kernel_size=kernel_size,
dilation=dilation,
config_str=config_str,
memory_efficient=memory_efficient)
self.xvector.add_module('block%d' % (i + 1), block)
channels = channels + num_layers * growth_rate
self.xvector.add_module(
'transit%d' % (i + 1),
TransitLayer(channels,
channels // 2,
bias=False,
config_str=config_str))
channels //= 2
self.bn = nn.BatchNorm1d(channels)
self.relu = nn.ReLU(inplace=True)
self.xvector.add_module('stats', StatsPool())
self.xvector.add_module(
'dense',
DenseLayer(channels * 2, embedding_size, config_str='batchnorm_'))
for m in self.modules():
if isinstance(m, (nn.Conv1d, nn.Linear)):
nn.init.kaiming_normal_(m.weight.data)
if m.bias is not None:
nn.init.zeros_(m.bias)
def forward(self, x):
x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
x = self.head(x)
x = self.xvector.tdnn(x)
x = self.xvector.block1(x)
x = self.xvector.transit1(x)
x = self.xvector.block2(x)
x = self.xvector.transit2(x)
x = self.xvector.block3(x)
x = self.xvector.transit3(x)
x = self.relu(self.bn(x))
x = self.xvector.stats(x)
x = self.xvector.dense(x)
return x
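# Forward sketch: (B, T, 80) fbank features -> CNN_Head -> TDNN stem (time
# stride 2) -> three SE-Dense-TDNN blocks with transit layers -> statistics
# pooling over time -> (B, embedding_size) speaker embedding (192-dim by
# default).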
import torch
import torch.nn.functional as F
import torch.utils.checkpoint as cp
from torch import nn
def get_nonlinear(config_str, channels):
nonlinear = nn.Sequential()
for name in config_str.split('-'):
if name == 'relu':
nonlinear.add_module('relu', nn.ReLU(inplace=True))
elif name == 'prelu':
nonlinear.add_module('prelu', nn.PReLU(channels))
elif name == 'batchnorm':
nonlinear.add_module('batchnorm', nn.BatchNorm1d(channels))
elif name == 'batchnorm_':
nonlinear.add_module('batchnorm',
nn.BatchNorm1d(channels, affine=False))
else:
raise ValueError('Unexpected module ({}).'.format(name))
return nonlinear
def statistics_pooling(x, dim=-1, keepdim=False, unbiased=True, eps=1e-2):
mean = x.mean(dim=dim)
std = x.std(dim=dim, unbiased=unbiased)
stats = torch.cat([mean, std], dim=-1)
if keepdim:
stats = stats.unsqueeze(dim=dim)
return stats
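# statistics_pooling concatenates per-channel mean and standard deviation along
# the channel axis, so a (B, C, T) input becomes (B, 2C); the eps argument is
# accepted but unused here. The high-order variant below adds skewness and
# kurtosis for a (B, 4C) output.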
def high_order_statistics_pooling(x,
dim=-1,
keepdim=False,
unbiased=True,
eps=1e-2):
mean = x.mean(dim=dim)
std = x.std(dim=dim, unbiased=unbiased)
norm = (x - mean.unsqueeze(dim=dim)) \
/ std.clamp(min=eps).unsqueeze(dim=dim)
skewness = norm.pow(3).mean(dim=dim)
kurtosis = norm.pow(4).mean(dim=dim)
stats = torch.cat([mean, std, skewness, kurtosis], dim=-1)
if keepdim:
stats = stats.unsqueeze(dim=dim)
return stats
class StatsPool(nn.Module):
def forward(self, x):
return statistics_pooling(x)
class HighOrderStatsPool(nn.Module):
def forward(self, x):
return high_order_statistics_pooling(x)
class TDNNLayer(nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
bias=False,
config_str='batchnorm-relu'):
super(TDNNLayer, self).__init__()
if padding < 0:
assert kernel_size % 2 == 1, 'Expect equal paddings, but got even kernel size ({})'.format(
kernel_size)
padding = (kernel_size - 1) // 2 * dilation
self.linear = nn.Conv1d(in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
self.nonlinear = get_nonlinear(config_str, out_channels)
def forward(self, x):
x = self.linear(x)
x = self.nonlinear(x)
return x
class DenseTDNNLayer(nn.Module):
def __init__(self,
in_channels,
out_channels,
bn_channels,
kernel_size,
stride=1,
dilation=1,
bias=False,
config_str='batchnorm-relu',
memory_efficient=False):
super(DenseTDNNLayer, self).__init__()
assert kernel_size % 2 == 1, 'Expect equal paddings, but got even kernel size ({})'.format(
kernel_size)
padding = (kernel_size - 1) // 2 * dilation
self.memory_efficient = memory_efficient
self.nonlinear1 = get_nonlinear(config_str, in_channels)
self.linear1 = nn.Conv1d(in_channels, bn_channels, 1, bias=False)
self.nonlinear2 = get_nonlinear(config_str, bn_channels)
self.linear2 = nn.Conv1d(bn_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
def bn_function(self, x):
return self.linear1(self.nonlinear1(x))
def forward(self, x):
if self.training and self.memory_efficient:
x = cp.checkpoint(self.bn_function, x)
else:
x = self.bn_function(x)
x = self.linear2(self.nonlinear2(x))
return x
class DenseTDNNBlock(nn.ModuleList):
def __init__(self,
num_layers,
in_channels,
out_channels,
bn_channels,
kernel_size,
stride=1,
dilation=1,
bias=False,
config_str='batchnorm-relu',
memory_efficient=False):
super(DenseTDNNBlock, self).__init__()
for i in range(num_layers):
layer = DenseTDNNLayer(in_channels=in_channels + i * out_channels,
out_channels=out_channels,
bn_channels=bn_channels,
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
bias=bias,
config_str=config_str,
memory_efficient=memory_efficient)
self.add_module('tdnnd%d' % (i + 1), layer)
def forward(self, x):
for layer in self:
x = torch.cat([x, layer(x)], dim=1)
return x
class StatsSelect(nn.Module):
def __init__(self, channels, branches, null=False, reduction=1):
super(StatsSelect, self).__init__()
self.gather = HighOrderStatsPool()
self.linear1 = nn.Conv1d(channels * 4, channels // reduction, 1)
self.linear2 = nn.ModuleList()
if null:
branches += 1
for _ in range(branches):
self.linear2.append(nn.Conv1d(channels // reduction, channels, 1))
self.channels = channels
self.branches = branches
self.null = null
self.reduction = reduction
def forward(self, x):
f = torch.cat([_x.unsqueeze(dim=1) for _x in x], dim=1)
x = torch.sum(f, dim=1)
x = self.linear1(self.gather(x).unsqueeze(dim=-1))
s = []
for linear in self.linear2:
s.append(linear(x).view(-1, 1, self.channels))
s = torch.cat(s, dim=1)
s = F.softmax(s, dim=1).unsqueeze(dim=-1)
if self.null:
s = s[:, :-1, :, :]
return torch.sum(f * s, dim=1)
def extra_repr(self):
return 'channels={}, branches={}, reduction={}'.format(
self.channels, self.branches, self.reduction)
class SqueezeExcitation(nn.Module):
def __init__(self, channels, reduction=1):
super(SqueezeExcitation, self).__init__()
self.linear1 = nn.Conv1d(channels, channels // reduction, 1)
self.relu = nn.ReLU(inplace=True)
self.linear2 = nn.Conv1d(channels // reduction, channels, 1)
self.sigmoid = nn.Sigmoid()
def forward(self, x):
s = self.linear1(x.mean(-1, keepdim=True)+self.seg_pooling(x))
s = self.relu(s)
s = self.sigmoid(self.linear2(s))
return x*s
def seg_pooling(self, x, seg_len=100):
s_x = F.max_pool1d(x, kernel_size=seg_len, stride=seg_len, ceil_mode=True)
out = s_x.unsqueeze(-1).expand(-1, -1, -1, seg_len).reshape(*x.shape[:-1], -1)
out = out[:, :, :x.shape[-1]]
return out
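# seg_pooling max-pools over fixed 100-frame segments and repeats each segment
# statistic back to frame rate, so the sigmoid gate sees both the global mean
# and local segment-level context at every frame.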
class PoolingBlock(nn.Module):
def __init__(self, bn_channels, out_channels, kernel_size, stride, padding, dilation, bias, reduction=2):
super(PoolingBlock, self).__init__()
self.linear_stem = nn.Conv1d(bn_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
self.linear1 = nn.Conv1d(bn_channels, bn_channels // reduction, 1)
self.relu = nn.ReLU(inplace=True)
# self.bn = nn.BatchNorm1d(out_channels)
self.linear2 = nn.Conv1d(bn_channels // reduction, out_channels, 1)
self.sigmoid = nn.Sigmoid()
# self.linear3 = nn.Conv1d(out_channels, out_channels, 1)
def forward(self, x):
y = self.linear_stem(x)
s = self.linear1(x.mean(-1, keepdim=True)+self.seg_pooling(x))
s = self.relu(s)
s = self.sigmoid(self.linear2(s))
return y*s
def seg_pooling(self, x, seg_len=100):
s_x = F.max_pool1d(x, kernel_size=seg_len, stride=seg_len, ceil_mode=True)
out = s_x.unsqueeze(-1).expand(-1, -1, -1, seg_len).reshape(*x.shape[:-1], -1)
out = out[:, :, :x.shape[-1]]
return out
class MultiBranchDenseTDNNLayer(DenseTDNNLayer):
def __init__(self,
in_channels,
out_channels,
bn_channels,
kernel_size,
stride=1,
dilation=(1, ),
bias=False,
null=False,
reduction=1,
config_str='batchnorm-relu',
memory_efficient=False):
super(DenseTDNNLayer, self).__init__()
assert kernel_size % 2 == 1, 'Expect equal paddings, but got even kernel size ({})'.format(
kernel_size)
padding = (kernel_size - 1) // 2
if not isinstance(dilation, (tuple, list)):
dilation = (dilation, )
self.memory_efficient = memory_efficient
self.nonlinear1 = get_nonlinear(config_str, in_channels)
self.linear1 = nn.Conv1d(in_channels, bn_channels, 1, bias=False)
self.nonlinear2 = get_nonlinear(config_str, bn_channels)
self.linear2 = nn.ModuleList()
for _dilation in dilation:
self.linear2.append(
nn.Conv1d(bn_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding * _dilation,
dilation=_dilation,
bias=bias))
self.select = StatsSelect(out_channels,
len(dilation),
null=null,
reduction=reduction)
def forward(self, x):
if self.training and self.memory_efficient:
x = cp.checkpoint(self.bn_function, x)
else:
x = self.bn_function(x)
x = self.nonlinear2(x)
x = self.select([linear(x) for linear in self.linear2])
return x
class SEDenseTDNNLayer(nn.Module):
def __init__(self,
in_channels,
out_channels,
bn_channels,
kernel_size,
stride=1,
dilation=1,
bias=False,
config_str='batchnorm-relu',
memory_efficient=False):
super(SEDenseTDNNLayer, self).__init__()
assert kernel_size % 2 == 1, 'Expect equal paddings, but got even kernel size ({})'.format(
kernel_size)
padding = (kernel_size - 1) // 2 * dilation
self.memory_efficient = memory_efficient
self.nonlinear1 = get_nonlinear(config_str, in_channels)
self.linear1 = nn.Conv1d(in_channels, bn_channels, 1, bias=False)
self.nonlinear2 = get_nonlinear(config_str, bn_channels)
# self.linear2 = nn.Conv1d(bn_channels,
# out_channels,
# kernel_size,
# stride=stride,
# padding=padding,
# dilation=dilation,
# bias=bias)
# self.se = SqueezeExcitation(out_channels)
self.se = PoolingBlock(bn_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
def bn_function(self, x):
return self.linear1(self.nonlinear1(x))
def forward(self, x):
if self.training and self.memory_efficient:
x = cp.checkpoint(self.bn_function, x)
else:
x = self.bn_function(x)
# x = self.linear2(self.nonlinear2(x))
x = self.se(self.nonlinear2(x))
return x
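# PoolingBlock above fuses the 1-D convolution and the squeeze-excitation style
# gating that the commented-out linear2 / SqueezeExcitation pair performed as
# two separate steps.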
class SEDenseTDNNBlock(nn.ModuleList):
def __init__(self,
num_layers,
in_channels,
out_channels,
bn_channels,
kernel_size,
stride=1,
dilation=1,
bias=False,
config_str='batchnorm-relu',
memory_efficient=False):
super(SEDenseTDNNBlock, self).__init__()
for i in range(num_layers):
layer = SEDenseTDNNLayer(in_channels=in_channels + i * out_channels,
out_channels=out_channels,
bn_channels=bn_channels,
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
bias=bias,
config_str=config_str,
memory_efficient=memory_efficient)
self.add_module('tdnnd%d' % (i + 1), layer)
def forward(self, x):
for layer in self:
x = torch.cat([x, layer(x)], dim=1)
return x
class MultiBranchDenseTDNNBlock(DenseTDNNBlock):
def __init__(self,
num_layers,
in_channels,
out_channels,
bn_channels,
kernel_size,
stride=1,
dilation=1,
bias=False,
null=False,
reduction=1,
config_str='batchnorm-relu',
memory_efficient=False):
super(DenseTDNNBlock, self).__init__()
for i in range(num_layers):
layer = MultiBranchDenseTDNNLayer(
in_channels=in_channels + i * out_channels,
out_channels=out_channels,
bn_channels=bn_channels,
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
bias=bias,
null=null,
reduction=reduction,
config_str=config_str,
memory_efficient=memory_efficient)
self.add_module('tdnnd%d' % (i + 1), layer)
class TransitLayer(nn.Module):
def __init__(self,
in_channels,
out_channels,
bias=True,
config_str='batchnorm-relu'):
super(TransitLayer, self).__init__()
self.nonlinear = get_nonlinear(config_str, in_channels)
self.linear = nn.Conv1d(in_channels, out_channels, 1, bias=bias)
def forward(self, x):
x = self.nonlinear(x)
x = self.linear(x)
return x
class DenseLayer(nn.Module):
def __init__(self,
in_channels,
out_channels,
bias=False,
config_str='batchnorm-relu'):
super(DenseLayer, self).__init__()
self.linear = nn.Conv1d(in_channels, out_channels, 1, bias=bias)
self.nonlinear = get_nonlinear(config_str, out_channels)
def forward(self, x):
if len(x.shape) == 2:
x = self.linear(x.unsqueeze(dim=-1)).squeeze(dim=-1)
else:
x = self.linear(x)
x = self.nonlinear(x)
return x
if __name__ == '__main__':
model = SqueezeExcitation(channels=32)
model.eval()
x = torch.randn(1, 32, 298)
y = model(x)
print(y.size())
from thop import profile
macs, num_params = profile(model, inputs=(x, ))
# num_params = sum(p.numel() for p in model.parameters())
print("MACs: {} G".format(macs / 1e9))
print("Params: {} M".format(num_params / 1e6))
import torch
import torchaudio
import numpy as np
import os
import sys
import torchaudio.compliance.kaldi as Kaldi
from .D_TDNN import DTDNN
import logging
import argparse
from glob import glob
logging.basicConfig(
format="%(asctime)s %(levelname)-4s [%(filename)s:%(lineno)d] %(message)s",
datefmt="%Y-%m-%d:%H:%M:%S",
level=logging.DEBUG,
)
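# Extracts D-TDNN speaker embeddings (192-dim by default) from 16 kHz mono wavs
# under <src_voice_dir>/wav, saving one .npy per utterance plus an averaged
# se.npy under <src_voice_dir>/se. Clips shorter than min_wav_length
# (300 ms at 16 kHz) are skipped.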
class SpeakerEmbeddingProcessor:
def __init__(self, sample_rate=16000):
self.sample_rate = sample_rate
self.min_wav_length = self.sample_rate * 30 * 10 / 1000
self.pcm_dict = {}
self.mfcc_dict = {}
self.se_list = []
def process(self, src_voice_dir, se_model):
logging.info("[SpeakerEmbeddingProcessor] Speaker embedding extractor started")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DTDNN()
try:
if os.path.basename(se_model) == "se.model":
model.load_state_dict(torch.load(se_model, map_location=device))
else:
raise Exception("[SpeakerEmbeddingProcessor] se model loading error!!!")
except Exception as e:
logging.info(e)
if os.path.basename(se_model) == 'se.onnx':
logging.info("[SpeakerEmbeddingProcessor] please update your se model to ensure that the version is greater than or equal to 1.0.5")
sys.exit()
model.eval()
model.to(device)
wav_dir = os.path.join(src_voice_dir, "wav")
se_dir = os.path.join(src_voice_dir, "se")
se_average_file = os.path.join(se_dir, "se.npy")
os.makedirs(se_dir, exist_ok=True)
wav_files = glob(os.path.join(wav_dir, '*.wav'))
for wav_file in wav_files:
basename = os.path.splitext(os.path.basename(wav_file))[0]
se_file = os.path.join(se_dir, basename + '.npy')
wav, fs = torchaudio.load(wav_file)
assert wav.shape[0] == 1
assert fs == 16000
if wav.shape[1] < self.min_wav_length:
continue
fbank_feat = Kaldi.fbank(wav, num_mel_bins=80)
feat = fbank_feat - fbank_feat.mean(dim=0, keepdim=True)
feat = feat.unsqueeze(0).to(device)
speaker_embedding = model(feat)
speaker_embedding = speaker_embedding.squeeze().cpu().detach().numpy()
speaker_embedding = np.expand_dims(speaker_embedding, axis=0)
np.save(se_file, speaker_embedding)
self.se_list.append(speaker_embedding)
self.se_average = np.expand_dims(
np.mean(
np.concatenate(self.se_list, axis=0),
axis=0
),
axis=0
)
np.save(se_average_file, self.se_average)
logging.info("[SpeakerEmbeddingProcessor] Speaker embedding extracted successfully!")
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Speaker Embedding Processor")
parser.add_argument("--src_voice_dir", type=str, required=True)
parser.add_argument('--se_model', required=True)
args = parser.parse_args()
sep = SpeakerEmbeddingProcessor()
sep.process(args.src_voice_dir, args.se_model)