from enum import Enum
class Tone(Enum):
UnAssigned = -1
NoneTone = 0
YinPing = 1 # ZhHK: YinPingYinRu EnUS: primary stress
YangPing = 2 # ZhHK: YinShang EnUS: secondary stress
ShangSheng = 3 # ZhHK: YinQuZhongRu
QuSheng = 4 # ZhHK: YangPing
QingSheng = 5 # ZhHK: YangShang
YangQuYangRu = 6 # ZhHK: YangQuYangRu
@classmethod
def parse(cls, in_str):
if not isinstance(in_str, str):
return super(Tone, cls).__new__(cls, in_str)
if in_str in ["UnAssigned", "-1"]:
return Tone.UnAssigned
elif in_str in ["NoneTone", "0"]:
return Tone.NoneTone
elif in_str in ["YinPing", "1"]:
return Tone.YinPing
elif in_str in ["YangPing", "2"]:
return Tone.YangPing
elif in_str in ["ShangSheng", "3"]:
return Tone.ShangSheng
elif in_str in ["QuSheng", "4"]:
return Tone.QuSheng
elif in_str in ["QingSheng", "5"]:
return Tone.QingSheng
elif in_str in ["YangQuYangRu", "6"]:
return Tone.YangQuYangRu
else:
return Tone.NoneTone
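# Usage sketch (assumed): parse() accepts either the member name or its numeric
# value as a string and falls back to NoneTone for anything it cannot map, e.g.
#   Tone.parse("YinPing") -> Tone.YinPing
#   Tone.parse("1")       -> Tone.YinPing
#   Tone.parse("foo")     -> Tone.NoneTone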
class BreakLevel(Enum):
UnAssigned = -1
L0 = 0
L1 = 1
L2 = 2
L3 = 3
L4 = 4
@classmethod
def parse(cls, in_str):
if not isinstance(in_str, str):
return super(BreakLevel, cls).__new__(cls, in_str)
if in_str in ["UnAssigned", "-1"]:
return BreakLevel.UnAssigned
elif in_str in ["L0", "0"]:
return BreakLevel.L0
elif in_str in ["L1", "1"]:
return BreakLevel.L1
elif in_str in ["L2", "2"]:
return BreakLevel.L2
elif in_str in ["L3", "3"]:
return BreakLevel.L3
elif in_str in ["L4", "4"]:
return BreakLevel.L4
else:
return BreakLevel.UnAssigned
class SentencePurpose(Enum):
Declarative = 0
Interrogative = 1
Exclamatory = 2
Imperative = 3
class Language(Enum):
Neutral = 0
EnUS = 1033
EnGB = 2057
ZhCN = 2052
PinYin = 2053
WuuShanghai = 2054
Sichuan = 2055
ZhHK = 3076
ZhEn = ZhCN | EnUS
@classmethod
def parse(cls, in_str):
if not isinstance(in_str, str):
return super(Language, cls).__new__(cls, in_str)
if in_str in ["Neutral", "0"]:
return Language.Neutral
elif in_str in ["EnUS", "1033"]:
return Language.EnUS
elif in_str in ["EnGB", "2057"]:
return Language.EnGB
elif in_str in ["ZhCN", "2052"]:
return Language.ZhCN
elif in_str in ["PinYin", "2053"]:
return Language.PinYin
elif in_str in ["WuuShanghai", "2054"]:
return Language.WuuShanghai
elif in_str in ["Sichuan", "2055"]:
return Language.Sichuan
elif in_str in ["ZhHK", "3076"]:
return Language.ZhHK
elif in_str in ["ZhEn", "2052|1033"]:
return Language.ZhEn
else:
return Language.Neutral
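# Note: inside the class body ZhCN and EnUS are still plain ints, so
# ZhEn = 2052 | 1033 evaluates to 3085. Language.parse() maps the literal
# string "2052|1033" (not "3085") back to Language.ZhEn and returns
# Language.Neutral for anything unrecognised.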
"""
Phone Types
"""
class PhoneCVType(Enum):
NULL = -1
Consonant = 1
Vowel = 2
@classmethod
def parse(cls, in_str):
if not isinstance(in_str, str):
return super(PhoneCVType, cls).__new__(cls, in_str)
if in_str in ["consonant", "Consonant"]:
return PhoneCVType.Consonant
elif in_str in ["vowel", "Vowel"]:
return PhoneCVType.Vowel
else:
return PhoneCVType.NULL
class PhoneIFType(Enum):
NULL = -1
Initial = 1
Final = 2
@classmethod
def parse(cls, in_str):
if not isinstance(in_str, str):
return super(PhoneIFType, cls).__new__(cls, in_str)
if in_str in ["initial", "Initial"]:
return PhoneIFType.Initial
elif in_str in ["final", "Final"]:
return PhoneIFType.Final
else:
return PhoneIFType.NULL
class PhoneUVType(Enum):
NULL = -1
Voiced = 1
UnVoiced = 2
@classmethod
def parse(cls, in_str):
if not isinstance(in_str, str):
return super(PhoneUVType, cls).__new__(cls, in_str)
if in_str in ["voiced", "Voiced"]:
return PhoneUVType.Voiced
elif in_str in ["unvoiced", "UnVoiced"]:
return PhoneUVType.UnVoiced
else:
return PhoneUVType.NULL
class PhoneAPType(Enum):
NULL = -1
DoubleLips = 1
LipTooth = 2
FrontTongue = 3
CentralTongue = 4
BackTongue = 5
Dorsal = 6
Velar = 7
Low = 8
Middle = 9
High = 10
@classmethod
def parse(cls, in_str):
if not isinstance(in_str, str):
return super(PhoneAPType, cls).__new__(cls, in_str)
if in_str in ["doublelips", "DoubleLips"]:
return PhoneAPType.DoubleLips
elif in_str in ["liptooth", "LipTooth"]:
return PhoneAPType.LipTooth
elif in_str in ["fronttongue", "FrontTongue"]:
return PhoneAPType.FrontTongue
elif in_str in ["centraltongue", "CentralTongue"]:
return PhoneAPType.CentralTongue
elif in_str in ["backtongue", "BackTongue"]:
return PhoneAPType.BackTongue
elif in_str in ["dorsal", "Dorsal"]:
return PhoneAPType.Dorsal
elif in_str in ["velar", "Velar"]:
return PhoneAPType.Velar
elif in_str in ["low", "Low"]:
return PhoneAPType.Low
elif in_str in ["middle", "Middle"]:
return PhoneAPType.Middle
elif in_str in ["high", "High"]:
return PhoneAPType.High
else:
return PhoneAPType.NULL
class PhoneAMType(Enum):
NULL = -1
Stop = 1
Affricate = 2
Fricative = 3
Nasal = 4
Lateral = 5
Open = 6
Close = 7
@classmethod
def parse(cls, in_str):
if not isinstance(in_str, str):
return super(PhoneAMType, cls).__new__(cls, in_str)
if in_str in ["stop", "Stop"]:
return PhoneAMType.Stop
elif in_str in ["affricate", "Affricate"]:
return PhoneAMType.Affricate
elif in_str in ["fricative", "Fricative"]:
return PhoneAMType.Fricative
elif in_str in ["nasal", "Nasal"]:
return PhoneAMType.Nasal
elif in_str in ["lateral", "Lateral"]:
return PhoneAMType.Lateral
elif in_str in ["open", "Open"]:
return PhoneAMType.Open
elif in_str in ["close", "Close"]:
return PhoneAMType.Close
else:
return PhoneAMType.NULL
import re
import unicodedata
import codecs
WordPattern = r"((?P<Word>\w+)(\(\w+\))?)"
BreakPattern = r"(?P<Break>(\*?#(?P<BreakLevel>[0-4])))"
MarkPattern = r"(?P<Mark>[、,。!?:“”《》·])"
POSPattern = r"(?P<POS>(\*?\|(?P<POSClass>[1-9])))"
PhraseTonePattern = r"(?P<PhraseTone>(\*?%[LH]))"
NgBreakPattern = r"^ng(?P<break>\d)"
RegexWord = re.compile(WordPattern + r"\s*")
RegexBreak = re.compile(BreakPattern + r"\s*")
RegexID = re.compile(r"^(?P<ID>.*?)\s")
RegexSentence = re.compile(
r"({}|{}|{}|{}|{})\s*".format(
WordPattern, BreakPattern, MarkPattern, POSPattern, PhraseTonePattern
)
)
RegexForeignLang = re.compile(r"[A-Z@]")
RegexSpace = re.compile(r"^\s*")
RegexNeutralTone = re.compile(r"[1-5]5")
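# Illustrative matches (sketch): RegexBreak finds prosodic break marks such as
# "#2", POSPattern matches part-of-speech tags such as "|3", PhraseTonePattern
# matches "%L"/"%H", and RegexSentence tokenises a labelled line into words,
# breaks, punctuation marks, POS tags and phrase tones in one pass.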
def do_character_normalization(line):
return unicodedata.normalize("NFKC", line)
def do_prosody_text_normalization(line):
tokens = line.split("\t")
text = tokens[1]
# Remove punctuations
text = text.replace(u"。", " ")
text = text.replace(u"、", " ")
text = text.replace(u"“", " ")
text = text.replace(u"”", " ")
text = text.replace(u"‘", " ")
text = text.replace(u"’", " ")
text = text.replace(u"|", " ")
text = text.replace(u"《", " ")
text = text.replace(u"》", " ")
text = text.replace(u"【", " ")
text = text.replace(u"】", " ")
text = text.replace(u"—", " ")
text = text.replace(u"―", " ")
text = text.replace(".", " ")
text = text.replace("!", " ")
text = text.replace("?", " ")
text = text.replace("(", " ")
text = text.replace(")", " ")
text = text.replace("[", " ")
text = text.replace("]", " ")
text = text.replace("{", " ")
text = text.replace("}", " ")
text = text.replace("~", " ")
text = text.replace(":", " ")
text = text.replace(";", " ")
text = text.replace("+", " ")
text = text.replace(",", " ")
# text = text.replace('·', ' ')
text = text.replace('"', " ")
text = text.replace(
"-", ""
)  # don't replace with a space: keep compound words like two-year-old intact
text = text.replace(
"'", ""
)  # don't replace with a space: keep English contractions like that's intact
# Replace break
text = text.replace("/", "#2")
text = text.replace("%", "#3")
# Remove useless spaces surround #2 #3 #4
text = re.sub(r"(#\d)[ ]+", r"\1", text)
text = re.sub(r"[ ]+(#\d)", r"\1", text)
# Replace space by #1
text = re.sub("[ ]+", "#1", text)
# Remove break at the end of the text
text = re.sub(r"#\d$", "", text)
# Add #1 between target language and foreign language
text = re.sub(r"([a-zA-Z])([^a-zA-Z\d\#\s\'\%\/\-])", r"\1#1\2", text)
text = re.sub(r"([^a-zA-Z\d\#\s\'\%\/\-])([a-zA-Z])", r"\1#1\2", text)
return tokens[0] + "\t" + text
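# Example with an assumed "<id>\t<text>" input line:
#   do_prosody_text_normalization("0001\t今天 天气/不错 。")
#   -> "0001\t今天#1天气#2不错"
# "/" and "%" become #2/#3 breaks, remaining spaces become #1, and a trailing
# break is dropped.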
def is_fp_line(line):
fp_category_list = ["FP", "I", "N", "Q"]
elements = line.strip().split(" ")
res = True
for ele in elements:
if ele not in fp_category_list:
res = False
break
return res
def format_prosody(src_prosody):
formatted_lines = []
with codecs.open(src_prosody, "r", "utf-8") as f:
lines = f.readlines()
idx = 0
while idx < len(lines):
line = do_character_normalization(lines[idx])
if len(line.strip().split("\t")) == 2:
line = do_prosody_text_normalization(line)
else:
fp_enable = is_fp_line(line)
if fp_enable:
idx += 3
continue
formatted_lines.append(line)
idx += 1
# with codecs.open(tgt_prosody, 'w', 'utf-8') as f:
# f.writelines(formatted_lines)
return formatted_lines
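# Usage sketch: format_prosody("xxx.prosody") returns the normalised lines of a
# prosody label file. A line whose space-separated tokens are all in
# {"FP", "I", "N", "Q"} (assumed filler-pause markup) starts a three-line block
# that is skipped entirely.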
from collections import OrderedDict
import torch
from torch import nn
import torch.nn.functional as F
from .layers import (DenseLayer, DenseTDNNBlock, StatsPool, TDNNLayer, SEDenseTDNNBlock,
TransitLayer)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, in_planes, planes, stride=1):
super(BasicBlock, self).__init__()
self.conv1 = nn.Conv2d(in_planes,
planes,
kernel_size=3,
stride=(stride, 1),
padding=1,
bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes,
planes,
kernel_size=3,
stride=1,
padding=1,
bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion * planes:
self.shortcut = nn.Sequential(
nn.Conv2d(in_planes,
self.expansion * planes,
kernel_size=1,
stride=(stride, 1),
bias=False),
nn.BatchNorm2d(self.expansion * planes))
def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)))
out = self.bn2(self.conv2(out))
out += self.shortcut(x)
out = F.relu(out)
return out
class CNN_Head(nn.Module):
def __init__(self,
block=BasicBlock,
num_blocks=[2, 2],
m_channels=32,
feat_dim=80):
super(CNN_Head, self).__init__()
self.in_planes = m_channels
self.conv1 = nn.Conv2d(1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(m_channels)
self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=2)
self.layer2 = self._make_layer(block, m_channels, num_blocks[1], stride=2)
self.conv2 = nn.Conv2d(m_channels, m_channels, kernel_size=3, stride=(2, 1), padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(m_channels)
self.out_channels = m_channels * (feat_dim // 8)
def _make_layer(self, block, planes, num_blocks, stride):
strides = [stride] + [1] * (num_blocks - 1)
layers = []
for stride in strides:
layers.append(block(self.in_planes, planes, stride))
self.in_planes = planes * block.expansion
return nn.Sequential(*layers)
def forward(self, x):
x = x.unsqueeze(1)
out = F.relu(self.bn1(self.conv1(x)))
out = self.layer1(out)
out = self.layer2(out)
out = F.relu(self.bn2(self.conv2(out)))
out = out.reshape(out.shape[0], out.shape[1]*out.shape[2], out.shape[3])
return out
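# Shape sketch for the default CNN_Head (feat_dim=80, m_channels=32):
#   (B, 1, 80, T) --conv1--> (B, 32, 80, T)
#   --layer1, layer2 (frequency stride 2 each)--> (B, 32, 20, T)
#   --conv2 (frequency stride 2)--> (B, 32, 10, T)
#   --reshape--> (B, 320, T), i.e. out_channels = m_channels * (feat_dim // 8).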
class DTDNN(nn.Module):
def __init__(self,
feat_dim=80,
embedding_size=192,
growth_rate=32,
bn_size=4,
init_channels=128,
config_str='batchnorm-relu',
memory_efficient=True):
super(DTDNN, self).__init__()
self.head = CNN_Head()
feat_dim = self.head.out_channels
self.xvector = nn.Sequential(
OrderedDict([
('tdnn',
TDNNLayer(feat_dim,
init_channels,
5,
stride=2,
dilation=1,
padding=-1,
config_str=config_str)),
]))
channels = init_channels
for i, (num_layers, kernel_size,
dilation) in enumerate(zip((12, 24, 16), (3, 3, 3), (1, 2, 3))):
block = SEDenseTDNNBlock(num_layers=num_layers,
in_channels=channels,
out_channels=growth_rate,
bn_channels=bn_size * growth_rate,
kernel_size=kernel_size,
dilation=dilation,
config_str=config_str,
memory_efficient=memory_efficient)
self.xvector.add_module('block%d' % (i + 1), block)
channels = channels + num_layers * growth_rate
self.xvector.add_module(
'transit%d' % (i + 1),
TransitLayer(channels,
channels // 2,
bias=False,
config_str=config_str))
channels //= 2
self.bn = nn.BatchNorm1d(channels)
self.relu = nn.ReLU(inplace=True)
self.xvector.add_module('stats', StatsPool())
self.xvector.add_module(
'dense',
DenseLayer(channels * 2, embedding_size, config_str='batchnorm_'))
for m in self.modules():
if isinstance(m, (nn.Conv1d, nn.Linear)):
nn.init.kaiming_normal_(m.weight.data)
if m.bias is not None:
nn.init.zeros_(m.bias)
def forward(self, x):
x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
x = self.head(x)
x = self.xvector.tdnn(x)
x = self.xvector.block1(x)
x = self.xvector.transit1(x)
x = self.xvector.block2(x)
x = self.xvector.transit2(x)
x = self.xvector.block3(x)
x = self.xvector.transit3(x)
x = self.relu(self.bn(x))
x = self.xvector.stats(x)
x = self.xvector.dense(x)
return x
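# Forward sketch: (B, T, 80) fbank features -> CNN_Head -> TDNN stem (time
# stride 2) -> three SE-Dense-TDNN blocks with transit layers -> statistics
# pooling over time -> (B, embedding_size) speaker embedding (192-dim by
# default).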
import torch
import torch.nn.functional as F
import torch.utils.checkpoint as cp
from torch import nn
def get_nonlinear(config_str, channels):
nonlinear = nn.Sequential()
for name in config_str.split('-'):
if name == 'relu':
nonlinear.add_module('relu', nn.ReLU(inplace=True))
elif name == 'prelu':
nonlinear.add_module('prelu', nn.PReLU(channels))
elif name == 'batchnorm':
nonlinear.add_module('batchnorm', nn.BatchNorm1d(channels))
elif name == 'batchnorm_':
nonlinear.add_module('batchnorm',
nn.BatchNorm1d(channels, affine=False))
else:
raise ValueError('Unexpected module ({}).'.format(name))
return nonlinear
def statistics_pooling(x, dim=-1, keepdim=False, unbiased=True, eps=1e-2):
mean = x.mean(dim=dim)
std = x.std(dim=dim, unbiased=unbiased)
stats = torch.cat([mean, std], dim=-1)
if keepdim:
stats = stats.unsqueeze(dim=dim)
return stats
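# statistics_pooling concatenates per-channel mean and standard deviation along
# the channel axis, so a (B, C, T) input becomes (B, 2C); the eps argument is
# accepted but unused here. The high-order variant below adds skewness and
# kurtosis for a (B, 4C) output.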
def high_order_statistics_pooling(x,
dim=-1,
keepdim=False,
unbiased=True,
eps=1e-2):
mean = x.mean(dim=dim)
std = x.std(dim=dim, unbiased=unbiased)
norm = (x - mean.unsqueeze(dim=dim)) \
/ std.clamp(min=eps).unsqueeze(dim=dim)
skewness = norm.pow(3).mean(dim=dim)
kurtosis = norm.pow(4).mean(dim=dim)
stats = torch.cat([mean, std, skewness, kurtosis], dim=-1)
if keepdim:
stats = stats.unsqueeze(dim=dim)
return stats
class StatsPool(nn.Module):
def forward(self, x):
return statistics_pooling(x)
class HighOrderStatsPool(nn.Module):
def forward(self, x):
return high_order_statistics_pooling(x)
class TDNNLayer(nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
bias=False,
config_str='batchnorm-relu'):
super(TDNNLayer, self).__init__()
if padding < 0:
assert kernel_size % 2 == 1, 'Expect equal paddings, but got even kernel size ({})'.format(
kernel_size)
padding = (kernel_size - 1) // 2 * dilation
self.linear = nn.Conv1d(in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
self.nonlinear = get_nonlinear(config_str, out_channels)
def forward(self, x):
x = self.linear(x)
x = self.nonlinear(x)
return x
class DenseTDNNLayer(nn.Module):
def __init__(self,
in_channels,
out_channels,
bn_channels,
kernel_size,
stride=1,
dilation=1,
bias=False,
config_str='batchnorm-relu',
memory_efficient=False):
super(DenseTDNNLayer, self).__init__()
assert kernel_size % 2 == 1, 'Expect equal paddings, but got even kernel size ({})'.format(
kernel_size)
padding = (kernel_size - 1) // 2 * dilation
self.memory_efficient = memory_efficient
self.nonlinear1 = get_nonlinear(config_str, in_channels)
self.linear1 = nn.Conv1d(in_channels, bn_channels, 1, bias=False)
self.nonlinear2 = get_nonlinear(config_str, bn_channels)
self.linear2 = nn.Conv1d(bn_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
def bn_function(self, x):
return self.linear1(self.nonlinear1(x))
def forward(self, x):
if self.training and self.memory_efficient:
x = cp.checkpoint(self.bn_function, x)
else:
x = self.bn_function(x)
x = self.linear2(self.nonlinear2(x))
return x
class DenseTDNNBlock(nn.ModuleList):
def __init__(self,
num_layers,
in_channels,
out_channels,
bn_channels,
kernel_size,
stride=1,
dilation=1,
bias=False,
config_str='batchnorm-relu',
memory_efficient=False):
super(DenseTDNNBlock, self).__init__()
for i in range(num_layers):
layer = DenseTDNNLayer(in_channels=in_channels + i * out_channels,
out_channels=out_channels,
bn_channels=bn_channels,
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
bias=bias,
config_str=config_str,
memory_efficient=memory_efficient)
self.add_module('tdnnd%d' % (i + 1), layer)
def forward(self, x):
for layer in self:
x = torch.cat([x, layer(x)], dim=1)
return x
class StatsSelect(nn.Module):
def __init__(self, channels, branches, null=False, reduction=1):
super(StatsSelect, self).__init__()
self.gather = HighOrderStatsPool()
self.linear1 = nn.Conv1d(channels * 4, channels // reduction, 1)
self.linear2 = nn.ModuleList()
if null:
branches += 1
for _ in range(branches):
self.linear2.append(nn.Conv1d(channels // reduction, channels, 1))
self.channels = channels
self.branches = branches
self.null = null
self.reduction = reduction
def forward(self, x):
f = torch.cat([_x.unsqueeze(dim=1) for _x in x], dim=1)
x = torch.sum(f, dim=1)
x = self.linear1(self.gather(x).unsqueeze(dim=-1))
s = []
for linear in self.linear2:
s.append(linear(x).view(-1, 1, self.channels))
s = torch.cat(s, dim=1)
s = F.softmax(s, dim=1).unsqueeze(dim=-1)
if self.null:
s = s[:, :-1, :, :]
return torch.sum(f * s, dim=1)
def extra_repr(self):
return 'channels={}, branches={}, reduction={}'.format(
self.channels, self.branches, self.reduction)
class SqueezeExcitation(nn.Module):
def __init__(self, channels, reduction=1):
super(SqueezeExcitation, self).__init__()
self.linear1 = nn.Conv1d(channels, channels // reduction, 1)
self.relu = nn.ReLU(inplace=True)
self.linear2 = nn.Conv1d(channels // reduction, channels, 1)
self.sigmoid = nn.Sigmoid()
def forward(self, x):
s = self.linear1(x.mean(-1, keepdim=True)+self.seg_pooling(x))
s = self.relu(s)
s = self.sigmoid(self.linear2(s))
return x*s
def seg_pooling(self, x, seg_len=100):
s_x = F.max_pool1d(x, kernel_size=seg_len, stride=seg_len, ceil_mode=True)
out = s_x.unsqueeze(-1).expand(-1, -1, -1, seg_len).reshape(*x.shape[:-1], -1)
out = out[:, :, :x.shape[-1]]
return out
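# seg_pooling max-pools over fixed 100-frame segments and repeats each segment
# statistic back to frame rate, so the sigmoid gate sees both the global mean
# and local segment-level context at every frame.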
class PoolingBlock(nn.Module):
def __init__(self, bn_channels, out_channels, kernel_size, stride, padding, dilation, bias, reduction=2):
super(PoolingBlock, self).__init__()
self.linear_stem = nn.Conv1d(bn_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
self.linear1 = nn.Conv1d(bn_channels, bn_channels // reduction, 1)
self.relu = nn.ReLU(inplace=True)
# self.bn = nn.BatchNorm1d(out_channels)
self.linear2 = nn.Conv1d(bn_channels // reduction, out_channels, 1)
self.sigmoid = nn.Sigmoid()
# self.linear3 = nn.Conv1d(out_channels, out_channels, 1)
def forward(self, x):
y = self.linear_stem(x)
s = self.linear1(x.mean(-1, keepdim=True)+self.seg_pooling(x))
s = self.relu(s)
s = self.sigmoid(self.linear2(s))
return y*s
def seg_pooling(self, x, seg_len=100):
s_x = F.max_pool1d(x, kernel_size=seg_len, stride=seg_len, ceil_mode=True)
out = s_x.unsqueeze(-1).expand(-1, -1, -1, seg_len).reshape(*x.shape[:-1], -1)
out = out[:, :, :x.shape[-1]]
return out
class MultiBranchDenseTDNNLayer(DenseTDNNLayer):
def __init__(self,
in_channels,
out_channels,
bn_channels,
kernel_size,
stride=1,
dilation=(1, ),
bias=False,
null=False,
reduction=1,
config_str='batchnorm-relu',
memory_efficient=False):
super(DenseTDNNLayer, self).__init__()
assert kernel_size % 2 == 1, 'Expect equal paddings, but got even kernel size ({})'.format(
kernel_size)
padding = (kernel_size - 1) // 2
if not isinstance(dilation, (tuple, list)):
dilation = (dilation, )
self.memory_efficient = memory_efficient
self.nonlinear1 = get_nonlinear(config_str, in_channels)
self.linear1 = nn.Conv1d(in_channels, bn_channels, 1, bias=False)
self.nonlinear2 = get_nonlinear(config_str, bn_channels)
self.linear2 = nn.ModuleList()
for _dilation in dilation:
self.linear2.append(
nn.Conv1d(bn_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding * _dilation,
dilation=_dilation,
bias=bias))
self.select = StatsSelect(out_channels,
len(dilation),
null=null,
reduction=reduction)
def forward(self, x):
if self.training and self.memory_efficient:
x = cp.checkpoint(self.bn_function, x)
else:
x = self.bn_function(x)
x = self.nonlinear2(x)
x = self.select([linear(x) for linear in self.linear2])
return x
class SEDenseTDNNLayer(nn.Module):
def __init__(self,
in_channels,
out_channels,
bn_channels,
kernel_size,
stride=1,
dilation=1,
bias=False,
config_str='batchnorm-relu',
memory_efficient=False):
super(SEDenseTDNNLayer, self).__init__()
assert kernel_size % 2 == 1, 'Expect equal paddings, but got even kernel size ({})'.format(
kernel_size)
padding = (kernel_size - 1) // 2 * dilation
self.memory_efficient = memory_efficient
self.nonlinear1 = get_nonlinear(config_str, in_channels)
self.linear1 = nn.Conv1d(in_channels, bn_channels, 1, bias=False)
self.nonlinear2 = get_nonlinear(config_str, bn_channels)
# self.linear2 = nn.Conv1d(bn_channels,
# out_channels,
# kernel_size,
# stride=stride,
# padding=padding,
# dilation=dilation,
# bias=bias)
# self.se = SqueezeExcitation(out_channels)
self.se = PoolingBlock(bn_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
def bn_function(self, x):
return self.linear1(self.nonlinear1(x))
def forward(self, x):
if self.training and self.memory_efficient:
x = cp.checkpoint(self.bn_function, x)
else:
x = self.bn_function(x)
# x = self.linear2(self.nonlinear2(x))
x = self.se(self.nonlinear2(x))
return x
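# PoolingBlock above fuses the 1-D convolution and the squeeze-excitation style
# gating that the commented-out linear2 / SqueezeExcitation pair performed as
# two separate steps.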
class SEDenseTDNNBlock(nn.ModuleList):
def __init__(self,
num_layers,
in_channels,
out_channels,
bn_channels,
kernel_size,
stride=1,
dilation=1,
bias=False,
config_str='batchnorm-relu',
memory_efficient=False):
super(SEDenseTDNNBlock, self).__init__()
for i in range(num_layers):
layer = SEDenseTDNNLayer(in_channels=in_channels + i * out_channels,
out_channels=out_channels,
bn_channels=bn_channels,
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
bias=bias,
config_str=config_str,
memory_efficient=memory_efficient)
self.add_module('tdnnd%d' % (i + 1), layer)
def forward(self, x):
for layer in self:
x = torch.cat([x, layer(x)], dim=1)
return x
class MultiBranchDenseTDNNBlock(DenseTDNNBlock):
def __init__(self,
num_layers,
in_channels,
out_channels,
bn_channels,
kernel_size,
stride=1,
dilation=1,
bias=False,
null=False,
reduction=1,
config_str='batchnorm-relu',
memory_efficient=False):
super(DenseTDNNBlock, self).__init__()
for i in range(num_layers):
layer = MultiBranchDenseTDNNLayer(
in_channels=in_channels + i * out_channels,
out_channels=out_channels,
bn_channels=bn_channels,
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
bias=bias,
null=null,
reduction=reduction,
config_str=config_str,
memory_efficient=memory_efficient)
self.add_module('tdnnd%d' % (i + 1), layer)
class TransitLayer(nn.Module):
def __init__(self,
in_channels,
out_channels,
bias=True,
config_str='batchnorm-relu'):
super(TransitLayer, self).__init__()
self.nonlinear = get_nonlinear(config_str, in_channels)
self.linear = nn.Conv1d(in_channels, out_channels, 1, bias=bias)
def forward(self, x):
x = self.nonlinear(x)
x = self.linear(x)
return x
class DenseLayer(nn.Module):
def __init__(self,
in_channels,
out_channels,
bias=False,
config_str='batchnorm-relu'):
super(DenseLayer, self).__init__()
self.linear = nn.Conv1d(in_channels, out_channels, 1, bias=bias)
self.nonlinear = get_nonlinear(config_str, out_channels)
def forward(self, x):
if len(x.shape) == 2:
x = self.linear(x.unsqueeze(dim=-1)).squeeze(dim=-1)
else:
x = self.linear(x)
x = self.nonlinear(x)
return x
if __name__ == '__main__':
model = SqueezeExcitation(channels=32)
model.eval()
x = torch.randn(1, 32, 298)
y = model(x)
print(y.size())
from thop import profile
macs, num_params = profile(model, inputs=(x, ))
# num_params = sum(p.numel() for p in model.parameters())
print("MACs: {} G".format(macs / 1e9))
print("Params: {} M".format(num_params / 1e6))
import torch
import torchaudio
import numpy as np
import os
import sys
import torchaudio.compliance.kaldi as Kaldi
from .D_TDNN import DTDNN
import logging
import argparse
from glob import glob
logging.basicConfig(
format="%(asctime)s %(levelname)-4s [%(filename)s:%(lineno)d] %(message)s",
datefmt="%Y-%m-%d:%H:%M:%S",
level=logging.DEBUG,
)
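# Extracts D-TDNN speaker embeddings (192-dim by default) from 16 kHz mono wavs
# under <src_voice_dir>/wav, saving one .npy per utterance plus an averaged
# se.npy under <src_voice_dir>/se. Clips shorter than min_wav_length
# (300 ms at 16 kHz) are skipped.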
class SpeakerEmbeddingProcessor:
def __init__(self, sample_rate=16000):
self.sample_rate = sample_rate
self.min_wav_length = self.sample_rate * 30 * 10 / 1000
self.pcm_dict = {}
self.mfcc_dict = {}
self.se_list = []
def process(self, src_voice_dir, se_model):
logging.info("[SpeakerEmbeddingProcessor] Speaker embedding extractor started")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DTDNN()
try:
if os.path.basename(se_model) == "se.model":
model.load_state_dict(torch.load(se_model, map_location=device))
else:
raise Exception("[SpeakerEmbeddingProcessor] se model loading error!!!")
except Exception as e:
logging.info(e)
if os.path.basename(se_model) == 'se.onnx':
logging.info("[SpeakerEmbeddingProcessor] please update your se model to ensure that the version is greater than or equal to 1.0.5")
sys.exit()
model.eval()
model.to(device)
wav_dir = os.path.join(src_voice_dir, "wav")
se_dir = os.path.join(src_voice_dir, "se")
se_average_file = os.path.join(se_dir, "se.npy")
os.makedirs(se_dir, exist_ok=True)
wav_files = glob(os.path.join(wav_dir, '*.wav'))
for wav_file in wav_files:
basename = os.path.splitext(os.path.basename(wav_file))[0]
se_file = os.path.join(se_dir, basename + '.npy')
wav, fs = torchaudio.load(wav_file)
assert wav.shape[0] == 1
assert fs == 16000
if wav.shape[1] < self.min_wav_length:
continue
fbank_feat = Kaldi.fbank(wav, num_mel_bins=80)
feat = fbank_feat - fbank_feat.mean(dim=0, keepdim=True)
feat = feat.unsqueeze(0).to(device)
speaker_embedding = model(feat)
speaker_embedding = speaker_embedding.squeeze().cpu().detach().numpy()
speaker_embedding = np.expand_dims(speaker_embedding, axis=0)
np.save(se_file, speaker_embedding)
self.se_list.append(speaker_embedding)
self.se_average = np.expand_dims(
np.mean(
np.concatenate(self.se_list, axis=0),
axis=0
),
axis=0
)
np.save(se_average_file, self.se_average)
logging.info("[SpeakerEmbeddingProcessor] Speaker embedding extracted successfully!")
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Speaker Embedding Processor")
parser.add_argument("--src_voice_dir", type=str, required=True)
parser.add_argument('--se_model', required=True)
args = parser.parse_args()
sep = SpeakerEmbeddingProcessor()
sep.process(args.src_voice_dir, args.se_model)