from io import BytesIO
import os
import sys
import traceback
from infer.lib import jit
from infer.lib.jit.get_synthesizer import get_synthesizer
from time import time as ttime
import fairseq
import faiss
import numpy as np
import parselmouth
import pyworld
import scipy.signal as signal
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchcrepe
from torchaudio.transforms import Resample
now_dir = os.getcwd()
sys.path.append(now_dir)
from multiprocessing import Manager as M
from configs.config import Config
# config = Config()
mm = M()
def printt(strr, *args):
if len(args) == 0:
print(strr)
else:
print(strr % args)
# config.device = torch.device("cpu")  # force CPU for testing
# config.is_half = False  # force CPU for testing
class RVC:
def __init__(
self,
key,
formant,
pth_path,
index_path,
index_rate,
n_cpu,
inp_q,
opt_q,
config: Config,
last_rvc=None,
) -> None:
"""
初始化
"""
try:
            if config.dml:
def forward_dml(ctx, x, scale):
ctx.scale = scale
res = x.clone().detach()
return res
fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml
# global config
self.config = config
self.inp_q = inp_q
self.opt_q = opt_q
# device="cpu"########强制cpu测试
self.device = config.device
self.f0_up_key = key
self.formant_shift = formant
self.f0_min = 50
self.f0_max = 1100
self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
self.n_cpu = n_cpu
self.use_jit = self.config.use_jit
self.is_half = config.is_half
if index_rate != 0:
self.index = faiss.read_index(index_path)
self.big_npy = self.index.reconstruct_n(0, self.index.ntotal)
printt("Index search enabled")
self.pth_path: str = pth_path
self.index_path = index_path
self.index_rate = index_rate
self.cache_pitch: torch.Tensor = torch.zeros(
1024, device=self.device, dtype=torch.long
)
self.cache_pitchf = torch.zeros(
1024, device=self.device, dtype=torch.float32
)
self.resample_kernel = {}
if last_rvc is None:
models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
["assets/hubert/hubert_base.pt"],
suffix="",
)
hubert_model = models[0]
hubert_model = hubert_model.to(self.device)
if self.is_half:
hubert_model = hubert_model.half()
else:
hubert_model = hubert_model.float()
hubert_model.eval()
self.model = hubert_model
else:
self.model = last_rvc.model
self.net_g: nn.Module = None
def set_default_model():
self.net_g, cpt = get_synthesizer(self.pth_path, self.device)
self.tgt_sr = cpt["config"][-1]
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
self.if_f0 = cpt.get("f0", 1)
self.version = cpt.get("version", "v1")
if self.is_half:
self.net_g = self.net_g.half()
else:
self.net_g = self.net_g.float()
def set_jit_model():
                # str.rstrip strips a trailing character set, not a suffix, so slice off ".pth" explicitly
                jit_pth_path = (
                    self.pth_path[: -len(".pth")]
                    if self.pth_path.endswith(".pth")
                    else self.pth_path
                )
jit_pth_path += ".half.jit" if self.is_half else ".jit"
reload = False
if str(self.device) == "cuda":
self.device = torch.device("cuda:0")
if os.path.exists(jit_pth_path):
cpt = jit.load(jit_pth_path)
model_device = cpt["device"]
if model_device != str(self.device):
reload = True
else:
reload = True
if reload:
cpt = jit.synthesizer_jit_export(
self.pth_path,
"script",
None,
device=self.device,
is_half=self.is_half,
)
self.tgt_sr = cpt["config"][-1]
self.if_f0 = cpt.get("f0", 1)
self.version = cpt.get("version", "v1")
self.net_g = torch.jit.load(
BytesIO(cpt["model"]), map_location=self.device
)
self.net_g.infer = self.net_g.forward
self.net_g.eval().to(self.device)
def set_synthesizer():
if self.use_jit and not config.dml:
if self.is_half and "cpu" in str(self.device):
                        printt(
                            "Using the default Synthesizer model; "
                            "JIT is not supported on CPU with half precision"
                        )
set_default_model()
else:
set_jit_model()
else:
set_default_model()
if last_rvc is None or last_rvc.pth_path != self.pth_path:
set_synthesizer()
else:
self.tgt_sr = last_rvc.tgt_sr
self.if_f0 = last_rvc.if_f0
self.version = last_rvc.version
self.is_half = last_rvc.is_half
if last_rvc.use_jit != self.use_jit:
set_synthesizer()
else:
self.net_g = last_rvc.net_g
if last_rvc is not None and hasattr(last_rvc, "model_rmvpe"):
self.model_rmvpe = last_rvc.model_rmvpe
if last_rvc is not None and hasattr(last_rvc, "model_fcpe"):
self.device_fcpe = last_rvc.device_fcpe
self.model_fcpe = last_rvc.model_fcpe
except:
printt(traceback.format_exc())
def change_key(self, new_key):
self.f0_up_key = new_key
def change_formant(self, new_formant):
self.formant_shift = new_formant
def change_index_rate(self, new_index_rate):
if new_index_rate != 0 and self.index_rate == 0:
self.index = faiss.read_index(self.index_path)
self.big_npy = self.index.reconstruct_n(0, self.index.ntotal)
printt("Index search enabled")
self.index_rate = new_index_rate
def get_f0_post(self, f0):
if not torch.is_tensor(f0):
f0 = torch.from_numpy(f0)
f0 = f0.float().to(self.device).squeeze()
f0_mel = 1127 * torch.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (
self.f0_mel_max - self.f0_mel_min
) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
f0_coarse = torch.round(f0_mel).long()
return f0_coarse, f0
def get_f0(self, x, f0_up_key, n_cpu, method="harvest"):
n_cpu = int(n_cpu)
if method == "crepe":
return self.get_f0_crepe(x, f0_up_key)
if method == "rmvpe":
return self.get_f0_rmvpe(x, f0_up_key)
if method == "fcpe":
return self.get_f0_fcpe(x, f0_up_key)
x = x.cpu().numpy()
if method == "pm":
p_len = x.shape[0] // 160 + 1
f0_min = 65
l_pad = int(np.ceil(1.5 / f0_min * 16000))
r_pad = l_pad + 1
s = parselmouth.Sound(np.pad(x, (l_pad, r_pad)), 16000).to_pitch_ac(
time_step=0.01,
voicing_threshold=0.6,
pitch_floor=f0_min,
pitch_ceiling=1100,
)
assert np.abs(s.t1 - 1.5 / f0_min) < 0.001
f0 = s.selected_array["frequency"]
if len(f0) < p_len:
f0 = np.pad(f0, (0, p_len - len(f0)))
f0 = f0[:p_len]
f0 *= pow(2, f0_up_key / 12)
return self.get_f0_post(f0)
if n_cpu == 1:
f0, t = pyworld.harvest(
x.astype(np.double),
fs=16000,
f0_ceil=1100,
f0_floor=50,
frame_period=10,
)
f0 = signal.medfilt(f0, 3)
f0 *= pow(2, f0_up_key / 12)
return self.get_f0_post(f0)
f0bak = np.zeros(x.shape[0] // 160 + 1, dtype=np.float64)
length = len(x)
part_length = 160 * ((length // 160 - 1) // n_cpu + 1)
n_cpu = (length // 160 - 1) // (part_length // 160) + 1
ts = ttime()
res_f0 = mm.dict()
for idx in range(n_cpu):
tail = part_length * (idx + 1) + 320
if idx == 0:
self.inp_q.put((idx, x[:tail], res_f0, n_cpu, ts))
else:
self.inp_q.put(
(idx, x[part_length * idx - 320 : tail], res_f0, n_cpu, ts)
)
        while True:
res_ts = self.opt_q.get()
if res_ts == ts:
break
f0s = [i[1] for i in sorted(res_f0.items(), key=lambda x: x[0])]
for idx, f0 in enumerate(f0s):
if idx == 0:
f0 = f0[:-3]
elif idx != n_cpu - 1:
f0 = f0[2:-3]
else:
f0 = f0[2:]
f0bak[part_length * idx // 160 : part_length * idx // 160 + f0.shape[0]] = (
f0
)
f0bak = signal.medfilt(f0bak, 3)
f0bak *= pow(2, f0_up_key / 12)
return self.get_f0_post(f0bak)
def get_f0_crepe(self, x, f0_up_key):
if "privateuseone" in str(
self.device
        ):  # DML is not supported, and the CPU is too slow to be usable, so fall back to fcpe
return self.get_f0(x, f0_up_key, 1, "fcpe")
# printt("using crepe,device:%s"%self.device)
f0, pd = torchcrepe.predict(
x.unsqueeze(0).float(),
16000,
160,
self.f0_min,
self.f0_max,
"full",
batch_size=512,
            # device=self.device if self.device.type != "privateuseone" else "cpu",  # crepe runs entirely in full precision, so half precision is not a concern; CPU latency is too high to be usable
device=self.device,
return_periodicity=True,
)
pd = torchcrepe.filter.median(pd, 3)
f0 = torchcrepe.filter.mean(f0, 3)
f0[pd < 0.1] = 0
f0 *= pow(2, f0_up_key / 12)
return self.get_f0_post(f0)
def get_f0_rmvpe(self, x, f0_up_key):
if hasattr(self, "model_rmvpe") == False:
from infer.lib.rmvpe import RMVPE
printt("Loading rmvpe model")
self.model_rmvpe = RMVPE(
"assets/rmvpe/rmvpe.pt",
is_half=self.is_half,
device=self.device,
use_jit=self.config.use_jit,
)
f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
f0 *= pow(2, f0_up_key / 12)
return self.get_f0_post(f0)
def get_f0_fcpe(self, x, f0_up_key):
if hasattr(self, "model_fcpe") == False:
from torchfcpe import spawn_bundled_infer_model
printt("Loading fcpe model")
if "privateuseone" in str(self.device):
self.device_fcpe = "cpu"
else:
self.device_fcpe = self.device
self.model_fcpe = spawn_bundled_infer_model(self.device_fcpe)
f0 = self.model_fcpe.infer(
x.to(self.device_fcpe).unsqueeze(0).float(),
sr=16000,
decoder_mode="local_argmax",
threshold=0.006,
)
f0 *= pow(2, f0_up_key / 12)
return self.get_f0_post(f0)
def infer(
self,
input_wav: torch.Tensor,
block_frame_16k,
skip_head,
return_length,
f0method,
) -> np.ndarray:
t1 = ttime()
with torch.no_grad():
if self.config.is_half:
feats = input_wav.half().view(1, -1)
else:
feats = input_wav.float().view(1, -1)
padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
inputs = {
"source": feats,
"padding_mask": padding_mask,
"output_layer": 9 if self.version == "v1" else 12,
}
logits = self.model.extract_features(**inputs)
feats = (
self.model.final_proj(logits[0]) if self.version == "v1" else logits[0]
)
feats = torch.cat((feats, feats[:, -1:, :]), 1)
t2 = ttime()
try:
if hasattr(self, "index") and self.index_rate != 0:
npy = feats[0][skip_head // 2 :].cpu().numpy().astype("float32")
score, ix = self.index.search(npy, k=8)
if (ix >= 0).all():
weight = np.square(1 / score)
weight /= weight.sum(axis=1, keepdims=True)
npy = np.sum(
self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1
)
if self.config.is_half:
npy = npy.astype("float16")
feats[0][skip_head // 2 :] = (
torch.from_numpy(npy).unsqueeze(0).to(self.device)
* self.index_rate
+ (1 - self.index_rate) * feats[0][skip_head // 2 :]
)
else:
printt(
"Invalid index. You MUST use added_xxxx.index but not trained_xxxx.index!"
)
else:
printt("Index search FAILED or disabled")
except:
traceback.print_exc()
printt("Index search FAILED")
t3 = ttime()
p_len = input_wav.shape[0] // 160
factor = pow(2, self.formant_shift / 12)
return_length2 = int(np.ceil(return_length * factor))
if self.if_f0 == 1:
f0_extractor_frame = block_frame_16k + 800
if f0method == "rmvpe":
f0_extractor_frame = 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160
pitch, pitchf = self.get_f0(
input_wav[-f0_extractor_frame:], self.f0_up_key - self.formant_shift, self.n_cpu, f0method
)
shift = block_frame_16k // 160
self.cache_pitch[:-shift] = self.cache_pitch[shift:].clone()
self.cache_pitchf[:-shift] = self.cache_pitchf[shift:].clone()
self.cache_pitch[4 - pitch.shape[0] :] = pitch[3:-1]
self.cache_pitchf[4 - pitch.shape[0] :] = pitchf[3:-1]
cache_pitch = self.cache_pitch[None, -p_len:]
cache_pitchf = self.cache_pitchf[None, -p_len:] * return_length2 / return_length
t4 = ttime()
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
feats = feats[:, :p_len, :]
p_len = torch.LongTensor([p_len]).to(self.device)
sid = torch.LongTensor([0]).to(self.device)
skip_head = torch.LongTensor([skip_head])
return_length2 = torch.LongTensor([return_length2])
return_length = torch.LongTensor([return_length])
with torch.no_grad():
if self.if_f0 == 1:
infered_audio, _, _ = self.net_g.infer(
feats,
p_len,
cache_pitch,
cache_pitchf,
sid,
skip_head,
return_length,
return_length2,
)
else:
infered_audio, _, _ = self.net_g.infer(
feats, p_len, sid, skip_head, return_length, return_length2
)
infered_audio = infered_audio.squeeze(1).float()
upp_res = int(np.floor(factor * self.tgt_sr // 100))
if upp_res != self.tgt_sr // 100:
if upp_res not in self.resample_kernel:
self.resample_kernel[upp_res] = Resample(
orig_freq=upp_res,
new_freq=self.tgt_sr // 100,
dtype=torch.float32,
).to(self.device)
infered_audio = self.resample_kernel[upp_res](
infered_audio[:, : return_length * upp_res]
)
t5 = ttime()
printt(
"Spent time: fea = %.3fs, index = %.3fs, f0 = %.3fs, model = %.3fs",
t2 - t1,
t3 - t2,
t4 - t3,
t5 - t4,
)
return infered_audio.squeeze()
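# Hedged sketch (not part of the original file): the mel-scale coarse-pitch
# mapping used by RVC.get_f0_post above, reproduced with plain numpy for
# clarity. The constants mirror RVC.__init__ (f0 range 50..1100 Hz, 255 bins).
def _coarse_f0_sketch(f0_hz):
    f0_min, f0_max = 50.0, 1100.0
    mel_min = 1127 * np.log(1 + f0_min / 700)
    mel_max = 1127 * np.log(1 + f0_max / 700)
    mel = 1127 * np.log(1 + np.asarray(f0_hz, dtype=np.float64) / 700)
    mel[mel > 0] = (mel[mel > 0] - mel_min) * 254 / (mel_max - mel_min) + 1
    mel = np.clip(mel, 1, 255)  # unvoiced frames (f0 == 0) map to bin 1
    return np.rint(mel).astype(np.int64)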
import numpy as np
# This function is obtained from librosa.
def get_rms(
y,
frame_length=2048,
hop_length=512,
pad_mode="constant",
):
padding = (int(frame_length // 2), int(frame_length // 2))
y = np.pad(y, padding, mode=pad_mode)
axis = -1
# put our new within-frame axis at the end for now
out_strides = y.strides + tuple([y.strides[axis]])
# Reduce the shape on the framing axis
x_shape_trimmed = list(y.shape)
x_shape_trimmed[axis] -= frame_length - 1
out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
if axis < 0:
target_axis = axis - 1
else:
target_axis = axis + 1
xw = np.moveaxis(xw, -1, target_axis)
# Downsample along the target axis
slices = [slice(None)] * xw.ndim
slices[axis] = slice(0, None, hop_length)
x = xw[tuple(slices)]
# Calculate power
power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
return np.sqrt(power)
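# Hedged usage sketch (not part of the original file): get_rms frames the
# padded signal with stride tricks and returns sqrt(mean(|x|^2)) per frame,
# shaped (1, n_frames) for 1-D input. A full-scale sine has RMS ~ 1/sqrt(2)
# away from the padded edges; the parameters below are illustrative.
def _demo_get_rms():
    t = np.arange(16000) / 16000.0
    y = np.sin(2 * np.pi * 440.0 * t).astype(np.float32)
    rms = get_rms(y, frame_length=2048, hop_length=512).squeeze(0)
    assert abs(float(rms[len(rms) // 2]) - 1 / np.sqrt(2)) < 1e-2
    return rms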
class Slicer:
def __init__(
self,
sr: int,
threshold: float = -40.0,
min_length: int = 5000,
min_interval: int = 300,
hop_size: int = 20,
max_sil_kept: int = 5000,
):
if not min_length >= min_interval >= hop_size:
raise ValueError(
"The following condition must be satisfied: min_length >= min_interval >= hop_size"
)
if not max_sil_kept >= hop_size:
raise ValueError(
"The following condition must be satisfied: max_sil_kept >= hop_size"
)
min_interval = sr * min_interval / 1000
self.threshold = 10 ** (threshold / 20.0)
self.hop_size = round(sr * hop_size / 1000)
self.win_size = min(round(min_interval), 4 * self.hop_size)
self.min_length = round(sr * min_length / 1000 / self.hop_size)
self.min_interval = round(min_interval / self.hop_size)
self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
def _apply_slice(self, waveform, begin, end):
if len(waveform.shape) > 1:
return waveform[
:, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)
]
else:
return waveform[
begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)
]
# @timeit
def slice(self, waveform):
if len(waveform.shape) > 1:
samples = waveform.mean(axis=0)
else:
samples = waveform
if samples.shape[0] <= self.min_length:
return [waveform]
rms_list = get_rms(
y=samples, frame_length=self.win_size, hop_length=self.hop_size
).squeeze(0)
sil_tags = []
silence_start = None
clip_start = 0
for i, rms in enumerate(rms_list):
# Keep looping while frame is silent.
if rms < self.threshold:
# Record start of silent frames.
if silence_start is None:
silence_start = i
continue
# Keep looping while frame is not silent and silence start has not been recorded.
if silence_start is None:
continue
# Clear recorded silence start if interval is not enough or clip is too short
is_leading_silence = silence_start == 0 and i > self.max_sil_kept
need_slice_middle = (
i - silence_start >= self.min_interval
and i - clip_start >= self.min_length
)
if not is_leading_silence and not need_slice_middle:
silence_start = None
continue
# Need slicing. Record the range of silent frames to be removed.
if i - silence_start <= self.max_sil_kept:
pos = rms_list[silence_start : i + 1].argmin() + silence_start
if silence_start == 0:
sil_tags.append((0, pos))
else:
sil_tags.append((pos, pos))
clip_start = pos
elif i - silence_start <= self.max_sil_kept * 2:
pos = rms_list[
i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
].argmin()
pos += i - self.max_sil_kept
pos_l = (
rms_list[
silence_start : silence_start + self.max_sil_kept + 1
].argmin()
+ silence_start
)
pos_r = (
rms_list[i - self.max_sil_kept : i + 1].argmin()
+ i
- self.max_sil_kept
)
if silence_start == 0:
sil_tags.append((0, pos_r))
clip_start = pos_r
else:
sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
clip_start = max(pos_r, pos)
else:
pos_l = (
rms_list[
silence_start : silence_start + self.max_sil_kept + 1
].argmin()
+ silence_start
)
pos_r = (
rms_list[i - self.max_sil_kept : i + 1].argmin()
+ i
- self.max_sil_kept
)
if silence_start == 0:
sil_tags.append((0, pos_r))
else:
sil_tags.append((pos_l, pos_r))
clip_start = pos_r
silence_start = None
# Deal with trailing silence.
total_frames = rms_list.shape[0]
if (
silence_start is not None
and total_frames - silence_start >= self.min_interval
):
silence_end = min(total_frames, silence_start + self.max_sil_kept)
pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
sil_tags.append((pos, total_frames + 1))
# Apply and return slices.
if len(sil_tags) == 0:
return [waveform]
else:
chunks = []
if sil_tags[0][0] > 0:
chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
for i in range(len(sil_tags) - 1):
chunks.append(
self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0])
)
if sil_tags[-1][1] < total_frames:
chunks.append(
self._apply_slice(waveform, sil_tags[-1][1], total_frames)
)
return chunks
def main():
import os.path
from argparse import ArgumentParser
import librosa
import soundfile
parser = ArgumentParser()
parser.add_argument("audio", type=str, help="The audio to be sliced")
parser.add_argument(
"--out", type=str, help="Output directory of the sliced audio clips"
)
parser.add_argument(
"--db_thresh",
type=float,
required=False,
default=-40,
help="The dB threshold for silence detection",
)
parser.add_argument(
"--min_length",
type=int,
required=False,
default=5000,
help="The minimum milliseconds required for each sliced audio clip",
)
parser.add_argument(
"--min_interval",
type=int,
required=False,
default=300,
help="The minimum milliseconds for a silence part to be sliced",
)
parser.add_argument(
"--hop_size",
type=int,
required=False,
default=10,
help="Frame length in milliseconds",
)
parser.add_argument(
"--max_sil_kept",
type=int,
required=False,
default=500,
help="The maximum silence length kept around the sliced clip, presented in milliseconds",
)
args = parser.parse_args()
out = args.out
if out is None:
out = os.path.dirname(os.path.abspath(args.audio))
audio, sr = librosa.load(args.audio, sr=None, mono=False)
slicer = Slicer(
sr=sr,
threshold=args.db_thresh,
min_length=args.min_length,
min_interval=args.min_interval,
hop_size=args.hop_size,
max_sil_kept=args.max_sil_kept,
)
chunks = slicer.slice(audio)
if not os.path.exists(out):
os.makedirs(out)
for i, chunk in enumerate(chunks):
if len(chunk.shape) > 1:
chunk = chunk.T
soundfile.write(
os.path.join(
out,
f"%s_%d.wav"
% (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i),
),
chunk,
sr,
)
if __name__ == "__main__":
main()
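# Hedged note (not part of the original file): Slicer converts its dB
# threshold to linear amplitude via 10 ** (db / 20); the default -40 dB
# corresponds to an amplitude of 0.01.
def _demo_db_threshold(db=-40.0):
    threshold = 10 ** (db / 20.0)
    assert abs(threshold - 0.01) < 1e-12
    return threshold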
import os
import traceback
import logging
logger = logging.getLogger(__name__)
import numpy as np
import torch
import torch.utils.data
from infer.lib.train.mel_processing import spectrogram_torch
from infer.lib.train.utils import load_filepaths_and_text, load_wav_to_torch
class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
"""
1) loads audio, text pairs
2) normalizes text and converts them to sequences of integers
3) computes spectrograms from audio files.
"""
def __init__(self, audiopaths_and_text, hparams):
self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
self.max_wav_value = hparams.max_wav_value
self.sampling_rate = hparams.sampling_rate
self.filter_length = hparams.filter_length
self.hop_length = hparams.hop_length
self.win_length = hparams.win_length
self.sampling_rate = hparams.sampling_rate
self.min_text_len = getattr(hparams, "min_text_len", 1)
self.max_text_len = getattr(hparams, "max_text_len", 5000)
self._filter()
def _filter(self):
"""
Filter text & store spec lengths
"""
# Store spectrogram lengths for Bucketing
# wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
# spec_length = wav_length // hop_length
audiopaths_and_text_new = []
lengths = []
for audiopath, text, pitch, pitchf, dv in self.audiopaths_and_text:
if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
audiopaths_and_text_new.append([audiopath, text, pitch, pitchf, dv])
lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length))
self.audiopaths_and_text = audiopaths_and_text_new
self.lengths = lengths
def get_sid(self, sid):
sid = torch.LongTensor([int(sid)])
return sid
def get_audio_text_pair(self, audiopath_and_text):
# separate filename and text
file = audiopath_and_text[0]
phone = audiopath_and_text[1]
pitch = audiopath_and_text[2]
pitchf = audiopath_and_text[3]
dv = audiopath_and_text[4]
phone, pitch, pitchf = self.get_labels(phone, pitch, pitchf)
spec, wav = self.get_audio(file)
dv = self.get_sid(dv)
len_phone = phone.size()[0]
len_spec = spec.size()[-1]
# print(123,phone.shape,pitch.shape,spec.shape)
if len_phone != len_spec:
len_min = min(len_phone, len_spec)
# amor
len_wav = len_min * self.hop_length
spec = spec[:, :len_min]
wav = wav[:, :len_wav]
phone = phone[:len_min, :]
pitch = pitch[:len_min]
pitchf = pitchf[:len_min]
return (spec, wav, phone, pitch, pitchf, dv)
def get_labels(self, phone, pitch, pitchf):
phone = np.load(phone)
phone = np.repeat(phone, 2, axis=0)
pitch = np.load(pitch)
pitchf = np.load(pitchf)
n_num = min(phone.shape[0], 900) # DistributedBucketSampler
# print(234,phone.shape,pitch.shape)
phone = phone[:n_num, :]
pitch = pitch[:n_num]
pitchf = pitchf[:n_num]
phone = torch.FloatTensor(phone)
pitch = torch.LongTensor(pitch)
pitchf = torch.FloatTensor(pitchf)
return phone, pitch, pitchf
def get_audio(self, filename):
audio, sampling_rate = load_wav_to_torch(filename)
if sampling_rate != self.sampling_rate:
raise ValueError(
"{} SR doesn't match target {} SR".format(
sampling_rate, self.sampling_rate
)
)
audio_norm = audio
# audio_norm = audio / self.max_wav_value
# audio_norm = audio / np.abs(audio).max()
audio_norm = audio_norm.unsqueeze(0)
spec_filename = filename.replace(".wav", ".spec.pt")
if os.path.exists(spec_filename):
try:
spec = torch.load(spec_filename)
except:
logger.warning("%s %s", spec_filename, traceback.format_exc())
spec = spectrogram_torch(
audio_norm,
self.filter_length,
self.sampling_rate,
self.hop_length,
self.win_length,
center=False,
)
spec = torch.squeeze(spec, 0)
torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
else:
spec = spectrogram_torch(
audio_norm,
self.filter_length,
self.sampling_rate,
self.hop_length,
self.win_length,
center=False,
)
spec = torch.squeeze(spec, 0)
torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
return spec, audio_norm
def __getitem__(self, index):
return self.get_audio_text_pair(self.audiopaths_and_text[index])
def __len__(self):
return len(self.audiopaths_and_text)
class TextAudioCollateMultiNSFsid:
"""Zero-pads model inputs and targets"""
def __init__(self, return_ids=False):
self.return_ids = return_ids
def __call__(self, batch):
"""Collate's training batch from normalized text and aduio
PARAMS
------
batch: [text_normalized, spec_normalized, wav_normalized]
"""
# Right zero-pad all one-hot text sequences to max input length
_, ids_sorted_decreasing = torch.sort(
torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True
)
max_spec_len = max([x[0].size(1) for x in batch])
max_wave_len = max([x[1].size(1) for x in batch])
spec_lengths = torch.LongTensor(len(batch))
wave_lengths = torch.LongTensor(len(batch))
spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len)
wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len)
spec_padded.zero_()
wave_padded.zero_()
max_phone_len = max([x[2].size(0) for x in batch])
phone_lengths = torch.LongTensor(len(batch))
phone_padded = torch.FloatTensor(
len(batch), max_phone_len, batch[0][2].shape[1]
) # (spec, wav, phone, pitch)
pitch_padded = torch.LongTensor(len(batch), max_phone_len)
pitchf_padded = torch.FloatTensor(len(batch), max_phone_len)
phone_padded.zero_()
pitch_padded.zero_()
pitchf_padded.zero_()
# dv = torch.FloatTensor(len(batch), 256)#gin=256
sid = torch.LongTensor(len(batch))
for i in range(len(ids_sorted_decreasing)):
row = batch[ids_sorted_decreasing[i]]
spec = row[0]
spec_padded[i, :, : spec.size(1)] = spec
spec_lengths[i] = spec.size(1)
wave = row[1]
wave_padded[i, :, : wave.size(1)] = wave
wave_lengths[i] = wave.size(1)
phone = row[2]
phone_padded[i, : phone.size(0), :] = phone
phone_lengths[i] = phone.size(0)
pitch = row[3]
pitch_padded[i, : pitch.size(0)] = pitch
pitchf = row[4]
pitchf_padded[i, : pitchf.size(0)] = pitchf
# dv[i] = row[5]
sid[i] = row[5]
return (
phone_padded,
phone_lengths,
pitch_padded,
pitchf_padded,
spec_padded,
spec_lengths,
wave_padded,
wave_lengths,
# dv
sid,
)
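# Hedged sketch (not part of the original file): a minimal fake batch for
# TextAudioCollateMultiNSFsid. Each row mimics get_audio_text_pair() output,
# (spec, wav, phone, pitch, pitchf, sid); the feature dims (513, 256) and the
# 160-sample hop are illustrative assumptions, not values from this file.
def _demo_collate_multi_nsfsid():
    def _row(frames, sid):
        spec = torch.randn(513, frames)
        wav = torch.randn(1, frames * 160)
        phone = torch.randn(frames, 256)
        pitch = torch.zeros(frames, dtype=torch.long)
        pitchf = torch.zeros(frames)
        return (spec, wav, phone, pitch, pitchf, torch.LongTensor([sid]))

    batch = [_row(80, 0), _row(100, 1)]
    out = TextAudioCollateMultiNSFsid()(batch)
    phone_padded, phone_lengths = out[0], out[1]
    assert phone_padded.shape == (2, 100, 256)  # zero-padded to the longest row
    assert phone_lengths.tolist() == [100, 80]  # sorted by spec length, descending
    return out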
class TextAudioLoader(torch.utils.data.Dataset):
"""
1) loads audio, text pairs
2) normalizes text and converts them to sequences of integers
3) computes spectrograms from audio files.
"""
def __init__(self, audiopaths_and_text, hparams):
self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
self.max_wav_value = hparams.max_wav_value
self.sampling_rate = hparams.sampling_rate
self.filter_length = hparams.filter_length
self.hop_length = hparams.hop_length
self.win_length = hparams.win_length
self.sampling_rate = hparams.sampling_rate
self.min_text_len = getattr(hparams, "min_text_len", 1)
self.max_text_len = getattr(hparams, "max_text_len", 5000)
self._filter()
def _filter(self):
"""
Filter text & store spec lengths
"""
# Store spectrogram lengths for Bucketing
# wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
# spec_length = wav_length // hop_length
audiopaths_and_text_new = []
lengths = []
for audiopath, text, dv in self.audiopaths_and_text:
if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
audiopaths_and_text_new.append([audiopath, text, dv])
lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length))
self.audiopaths_and_text = audiopaths_and_text_new
self.lengths = lengths
def get_sid(self, sid):
sid = torch.LongTensor([int(sid)])
return sid
def get_audio_text_pair(self, audiopath_and_text):
# separate filename and text
file = audiopath_and_text[0]
phone = audiopath_and_text[1]
dv = audiopath_and_text[2]
phone = self.get_labels(phone)
spec, wav = self.get_audio(file)
dv = self.get_sid(dv)
len_phone = phone.size()[0]
len_spec = spec.size()[-1]
if len_phone != len_spec:
len_min = min(len_phone, len_spec)
len_wav = len_min * self.hop_length
spec = spec[:, :len_min]
wav = wav[:, :len_wav]
phone = phone[:len_min, :]
return (spec, wav, phone, dv)
def get_labels(self, phone):
phone = np.load(phone)
phone = np.repeat(phone, 2, axis=0)
n_num = min(phone.shape[0], 900) # DistributedBucketSampler
phone = phone[:n_num, :]
phone = torch.FloatTensor(phone)
return phone
def get_audio(self, filename):
audio, sampling_rate = load_wav_to_torch(filename)
if sampling_rate != self.sampling_rate:
raise ValueError(
"{} SR doesn't match target {} SR".format(
sampling_rate, self.sampling_rate
)
)
audio_norm = audio
# audio_norm = audio / self.max_wav_value
# audio_norm = audio / np.abs(audio).max()
audio_norm = audio_norm.unsqueeze(0)
spec_filename = filename.replace(".wav", ".spec.pt")
if os.path.exists(spec_filename):
try:
spec = torch.load(spec_filename)
except:
logger.warning("%s %s", spec_filename, traceback.format_exc())
spec = spectrogram_torch(
audio_norm,
self.filter_length,
self.sampling_rate,
self.hop_length,
self.win_length,
center=False,
)
spec = torch.squeeze(spec, 0)
torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
else:
spec = spectrogram_torch(
audio_norm,
self.filter_length,
self.sampling_rate,
self.hop_length,
self.win_length,
center=False,
)
spec = torch.squeeze(spec, 0)
torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
return spec, audio_norm
def __getitem__(self, index):
return self.get_audio_text_pair(self.audiopaths_and_text[index])
def __len__(self):
return len(self.audiopaths_and_text)
class TextAudioCollate:
"""Zero-pads model inputs and targets"""
def __init__(self, return_ids=False):
self.return_ids = return_ids
def __call__(self, batch):
"""Collate's training batch from normalized text and aduio
PARAMS
------
batch: [text_normalized, spec_normalized, wav_normalized]
"""
# Right zero-pad all one-hot text sequences to max input length
_, ids_sorted_decreasing = torch.sort(
torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True
)
max_spec_len = max([x[0].size(1) for x in batch])
max_wave_len = max([x[1].size(1) for x in batch])
spec_lengths = torch.LongTensor(len(batch))
wave_lengths = torch.LongTensor(len(batch))
spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len)
wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len)
spec_padded.zero_()
wave_padded.zero_()
max_phone_len = max([x[2].size(0) for x in batch])
phone_lengths = torch.LongTensor(len(batch))
phone_padded = torch.FloatTensor(
len(batch), max_phone_len, batch[0][2].shape[1]
)
phone_padded.zero_()
sid = torch.LongTensor(len(batch))
for i in range(len(ids_sorted_decreasing)):
row = batch[ids_sorted_decreasing[i]]
spec = row[0]
spec_padded[i, :, : spec.size(1)] = spec
spec_lengths[i] = spec.size(1)
wave = row[1]
wave_padded[i, :, : wave.size(1)] = wave
wave_lengths[i] = wave.size(1)
phone = row[2]
phone_padded[i, : phone.size(0), :] = phone
phone_lengths[i] = phone.size(0)
sid[i] = row[3]
return (
phone_padded,
phone_lengths,
spec_padded,
spec_lengths,
wave_padded,
wave_lengths,
sid,
)
class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
"""
Maintain similar input lengths in a batch.
Length groups are specified by boundaries.
    Ex) boundaries = [b1, b2, b3] -> every batch lies in either {x | b1 < length(x) <= b2} or {x | b2 < length(x) <= b3}.
    Samples whose lengths fall outside the boundaries are removed.
    Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 is discarded.
"""
def __init__(
self,
dataset,
batch_size,
boundaries,
num_replicas=None,
rank=None,
shuffle=True,
):
super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
self.lengths = dataset.lengths
self.batch_size = batch_size
self.boundaries = boundaries
self.buckets, self.num_samples_per_bucket = self._create_buckets()
self.total_size = sum(self.num_samples_per_bucket)
self.num_samples = self.total_size // self.num_replicas
def _create_buckets(self):
buckets = [[] for _ in range(len(self.boundaries) - 1)]
for i in range(len(self.lengths)):
length = self.lengths[i]
idx_bucket = self._bisect(length)
if idx_bucket != -1:
buckets[idx_bucket].append(i)
        for i in range(len(buckets) - 1, -1, -1):
if len(buckets[i]) == 0:
buckets.pop(i)
self.boundaries.pop(i + 1)
num_samples_per_bucket = []
for i in range(len(buckets)):
len_bucket = len(buckets[i])
total_batch_size = self.num_replicas * self.batch_size
rem = (
total_batch_size - (len_bucket % total_batch_size)
) % total_batch_size
num_samples_per_bucket.append(len_bucket + rem)
return buckets, num_samples_per_bucket
def __iter__(self):
# deterministically shuffle based on epoch
g = torch.Generator()
g.manual_seed(self.epoch)
indices = []
if self.shuffle:
for bucket in self.buckets:
indices.append(torch.randperm(len(bucket), generator=g).tolist())
else:
for bucket in self.buckets:
indices.append(list(range(len(bucket))))
batches = []
for i in range(len(self.buckets)):
bucket = self.buckets[i]
len_bucket = len(bucket)
ids_bucket = indices[i]
num_samples_bucket = self.num_samples_per_bucket[i]
# add extra samples to make it evenly divisible
rem = num_samples_bucket - len_bucket
ids_bucket = (
ids_bucket
+ ids_bucket * (rem // len_bucket)
+ ids_bucket[: (rem % len_bucket)]
)
# subsample
ids_bucket = ids_bucket[self.rank :: self.num_replicas]
# batching
for j in range(len(ids_bucket) // self.batch_size):
batch = [
bucket[idx]
for idx in ids_bucket[
j * self.batch_size : (j + 1) * self.batch_size
]
]
batches.append(batch)
if self.shuffle:
batch_ids = torch.randperm(len(batches), generator=g).tolist()
batches = [batches[i] for i in batch_ids]
self.batches = batches
assert len(self.batches) * self.batch_size == self.num_samples
return iter(self.batches)
def _bisect(self, x, lo=0, hi=None):
if hi is None:
hi = len(self.boundaries) - 1
if hi > lo:
mid = (hi + lo) // 2
if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
return mid
elif x <= self.boundaries[mid]:
return self._bisect(x, lo, mid)
else:
return self._bisect(x, mid + 1, hi)
else:
return -1
def __len__(self):
return self.num_samples // self.batch_size
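# Hedged illustration (not part of the original file): how a sample length
# maps to a bucket under the boundary semantics of _bisect above. boundaries
# [32, 64, 128] give bucket 0 = (32, 64] and bucket 1 = (64, 128]; anything
# outside (32, 128] is discarded (returns -1).
def _demo_bucket_of(length, boundaries=(32, 64, 128)):
    for i in range(len(boundaries) - 1):
        if boundaries[i] < length <= boundaries[i + 1]:
            return i
    return -1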
import torch
def feature_loss(fmap_r, fmap_g):
loss = 0
for dr, dg in zip(fmap_r, fmap_g):
for rl, gl in zip(dr, dg):
rl = rl.float().detach()
gl = gl.float()
loss += torch.mean(torch.abs(rl - gl))
return loss * 2
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
loss = 0
r_losses = []
g_losses = []
for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
dr = dr.float()
dg = dg.float()
r_loss = torch.mean((1 - dr) ** 2)
g_loss = torch.mean(dg**2)
loss += r_loss + g_loss
r_losses.append(r_loss.item())
g_losses.append(g_loss.item())
return loss, r_losses, g_losses
def generator_loss(disc_outputs):
loss = 0
gen_losses = []
for dg in disc_outputs:
dg = dg.float()
l = torch.mean((1 - dg) ** 2)
gen_losses.append(l)
loss += l
return loss, gen_losses
def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
"""
z_p, logs_q: [b, h, t_t]
m_p, logs_p: [b, h, t_t]
"""
z_p = z_p.float()
logs_q = logs_q.float()
m_p = m_p.float()
logs_p = logs_p.float()
z_mask = z_mask.float()
kl = logs_p - logs_q - 0.5
kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
kl = torch.sum(kl * z_mask)
l = kl / torch.sum(z_mask)
return l
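# Hedged sanity check (not part of the original file): feature_loss is an L1
# feature-matching term, and the discriminator/generator losses above are the
# least-squares GAN objectives. With a perfect discriminator (real -> 1,
# fake -> 0) discriminator_loss is 0; generator_loss is 0 when fake -> 1.
def _demo_lsgan_losses():
    real_outs = [torch.ones(4)]
    fake_outs = [torch.zeros(4)]
    d_loss, r_losses, g_losses = discriminator_loss(real_outs, fake_outs)
    assert float(d_loss) == 0.0
    g_loss, gen_losses = generator_loss([torch.ones(4)])
    assert float(g_loss) == 0.0
    return d_loss, g_loss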
import torch
import torch.utils.data
from librosa.filters import mel as librosa_mel_fn
import logging
logger = logging.getLogger(__name__)
MAX_WAV_VALUE = 32768.0
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
"""
PARAMS
------
C: compression factor
"""
return torch.log(torch.clamp(x, min=clip_val) * C)
def dynamic_range_decompression_torch(x, C=1):
"""
PARAMS
------
C: compression factor used to compress
"""
return torch.exp(x) / C
def spectral_normalize_torch(magnitudes):
return dynamic_range_compression_torch(magnitudes)
def spectral_de_normalize_torch(magnitudes):
return dynamic_range_decompression_torch(magnitudes)
# Reusable banks
mel_basis = {}
hann_window = {}
def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
"""Convert waveform into Linear-frequency Linear-amplitude spectrogram.
Args:
y :: (B, T) - Audio waveforms
n_fft
sampling_rate
hop_size
win_size
center
Returns:
:: (B, Freq, Frame) - Linear-frequency Linear-amplitude spectrogram
"""
# Window - Cache if needed
global hann_window
dtype_device = str(y.dtype) + "_" + str(y.device)
wnsize_dtype_device = str(win_size) + "_" + dtype_device
if wnsize_dtype_device not in hann_window:
hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
dtype=y.dtype, device=y.device
)
# Padding
y = torch.nn.functional.pad(
y.unsqueeze(1),
(int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
mode="reflect",
)
y = y.squeeze(1)
# Complex Spectrogram :: (B, T) -> (B, Freq, Frame, RealComplex=2)
spec = torch.stft(
y,
n_fft,
hop_length=hop_size,
win_length=win_size,
window=hann_window[wnsize_dtype_device],
center=center,
pad_mode="reflect",
normalized=False,
onesided=True,
return_complex=True,
)
# Linear-frequency Linear-amplitude spectrogram :: (B, Freq, Frame, RealComplex=2) -> (B, Freq, Frame)
spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6)
return spec
def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
# MelBasis - Cache if needed
global mel_basis
dtype_device = str(spec.dtype) + "_" + str(spec.device)
fmax_dtype_device = str(fmax) + "_" + dtype_device
if fmax_dtype_device not in mel_basis:
mel = librosa_mel_fn(
sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
)
mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
dtype=spec.dtype, device=spec.device
)
# Mel-frequency Log-amplitude spectrogram :: (B, Freq=num_mels, Frame)
melspec = torch.matmul(mel_basis[fmax_dtype_device], spec)
melspec = spectral_normalize_torch(melspec)
return melspec
def mel_spectrogram_torch(
y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
):
"""Convert waveform into Mel-frequency Log-amplitude spectrogram.
Args:
y :: (B, T) - Waveforms
Returns:
melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram
"""
# Linear-frequency Linear-amplitude spectrogram :: (B, T) -> (B, Freq, Frame)
spec = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center)
# Mel-frequency Log-amplitude spectrogram :: (B, Freq, Frame) -> (B, Freq=num_mels, Frame)
melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax)
return melspec
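# Hedged usage sketch (not part of the original file): expected shapes for the
# helpers above. The STFT parameters are illustrative. With center=False and
# the (n_fft - hop_size) / 2 padding applied in spectrogram_torch, a
# 16000-sample input at hop_size=160 yields exactly 16000 / 160 = 100 frames.
def _demo_mel_shapes():
    y = torch.randn(1, 16000)
    melspec = mel_spectrogram_torch(
        y,
        n_fft=1024,
        num_mels=80,
        sampling_rate=16000,
        hop_size=160,
        win_size=1024,
        fmin=0,
        fmax=8000,
    )
    assert melspec.shape == (1, 80, 100)  # (B, Freq=num_mels, Frame)
    return melspec.shape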
import os
import sys
import traceback
from collections import OrderedDict
import torch
from i18n.i18n import I18nAuto
i18n = I18nAuto()
def savee(ckpt, sr, if_f0, name, epoch, version, hps):
try:
opt = OrderedDict()
opt["weight"] = {}
for key in ckpt.keys():
if "enc_q" in key:
continue
opt["weight"][key] = ckpt[key].half()
opt["config"] = [
hps.data.filter_length // 2 + 1,
32,
hps.model.inter_channels,
hps.model.hidden_channels,
hps.model.filter_channels,
hps.model.n_heads,
hps.model.n_layers,
hps.model.kernel_size,
hps.model.p_dropout,
hps.model.resblock,
hps.model.resblock_kernel_sizes,
hps.model.resblock_dilation_sizes,
hps.model.upsample_rates,
hps.model.upsample_initial_channel,
hps.model.upsample_kernel_sizes,
hps.model.spk_embed_dim,
hps.model.gin_channels,
hps.data.sampling_rate,
]
opt["info"] = "%sepoch" % epoch
opt["sr"] = sr
opt["f0"] = if_f0
opt["version"] = version
torch.save(opt, "assets/weights/%s.pth" % name)
return "Success."
except:
return traceback.format_exc()
def show_info(path):
try:
a = torch.load(path, map_location="cpu")
return "模型信息:%s\n采样率:%s\n模型是否输入音高引导:%s\n版本:%s" % (
a.get("info", "None"),
a.get("sr", "None"),
a.get("f0", "None"),
a.get("version", "None"),
)
except:
return traceback.format_exc()
def extract_small_model(path, name, sr, if_f0, info, version):
try:
ckpt = torch.load(path, map_location="cpu")
if "model" in ckpt:
ckpt = ckpt["model"]
opt = OrderedDict()
opt["weight"] = {}
for key in ckpt.keys():
if "enc_q" in key:
continue
opt["weight"][key] = ckpt[key].half()
if sr == "40k":
opt["config"] = [
1025,
32,
192,
192,
768,
2,
6,
3,
0,
"1",
[3, 7, 11],
[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
[10, 10, 2, 2],
512,
[16, 16, 4, 4],
109,
256,
40000,
]
elif sr == "48k":
if version == "v1":
opt["config"] = [
1025,
32,
192,
192,
768,
2,
6,
3,
0,
"1",
[3, 7, 11],
[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
[10, 6, 2, 2, 2],
512,
[16, 16, 4, 4, 4],
109,
256,
48000,
]
else:
opt["config"] = [
1025,
32,
192,
192,
768,
2,
6,
3,
0,
"1",
[3, 7, 11],
[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
[12, 10, 2, 2],
512,
[24, 20, 4, 4],
109,
256,
48000,
]
elif sr == "32k":
if version == "v1":
opt["config"] = [
513,
32,
192,
192,
768,
2,
6,
3,
0,
"1",
[3, 7, 11],
[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
[10, 4, 2, 2, 2],
512,
[16, 16, 4, 4, 4],
109,
256,
32000,
]
else:
opt["config"] = [
513,
32,
192,
192,
768,
2,
6,
3,
0,
"1",
[3, 7, 11],
[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
[10, 8, 2, 2],
512,
[20, 16, 4, 4],
109,
256,
32000,
]
if info == "":
info = "Extracted model."
opt["info"] = info
opt["version"] = version
opt["sr"] = sr
opt["f0"] = int(if_f0)
torch.save(opt, "assets/weights/%s.pth" % name)
return "Success."
except:
return traceback.format_exc()
def change_info(path, info, name):
try:
ckpt = torch.load(path, map_location="cpu")
ckpt["info"] = info
if name == "":
name = os.path.basename(path)
torch.save(ckpt, "assets/weights/%s" % name)
return "Success."
except:
return traceback.format_exc()
def merge(path1, path2, alpha1, sr, f0, info, name, version):
try:
def extract(ckpt):
a = ckpt["model"]
opt = OrderedDict()
opt["weight"] = {}
for key in a.keys():
if "enc_q" in key:
continue
opt["weight"][key] = a[key]
return opt
ckpt1 = torch.load(path1, map_location="cpu")
ckpt2 = torch.load(path2, map_location="cpu")
cfg = ckpt1["config"]
if "model" in ckpt1:
ckpt1 = extract(ckpt1)
else:
ckpt1 = ckpt1["weight"]
if "model" in ckpt2:
ckpt2 = extract(ckpt2)
else:
ckpt2 = ckpt2["weight"]
if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())):
return "Fail to merge the models. The model architectures are not the same."
opt = OrderedDict()
opt["weight"] = {}
for key in ckpt1.keys():
# try:
if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape:
min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0])
opt["weight"][key] = (
alpha1 * (ckpt1[key][:min_shape0].float())
+ (1 - alpha1) * (ckpt2[key][:min_shape0].float())
).half()
else:
opt["weight"][key] = (
alpha1 * (ckpt1[key].float()) + (1 - alpha1) * (ckpt2[key].float())
).half()
# except:
# pdb.set_trace()
opt["config"] = cfg
"""
if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 40000]
elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4], 109, 256, 48000]
elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000]
"""
opt["sr"] = sr
opt["f0"] = 1 if f0 == i18n("是") else 0
opt["version"] = version
opt["info"] = info
torch.save(opt, "assets/weights/%s.pth" % name)
return "Success."
except:
return traceback.format_exc()
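# Hedged sketch (not part of the original file): the heart of merge() is a
# per-tensor linear interpolation between two checkpoints, saved back in half
# precision. alpha1 = 1.0 reproduces the first model exactly.
def _demo_merge_tensor(w1, w2, alpha1):
    return (alpha1 * w1.float() + (1 - alpha1) * w2.float()).half()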
import argparse
import glob
import json
import logging
import os
import subprocess
import sys
import shutil
import numpy as np
import torch
from scipy.io.wavfile import read
MATPLOTLIB_FLAG = False
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logger = logging
def load_checkpoint_d(checkpoint_path, combd, sbd, optimizer=None, load_opt=1):
assert os.path.isfile(checkpoint_path)
checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
##################
def go(model, bkey):
saved_state_dict = checkpoint_dict[bkey]
if hasattr(model, "module"):
state_dict = model.module.state_dict()
else:
state_dict = model.state_dict()
new_state_dict = {}
        for k, v in state_dict.items():  # shapes the model expects
            try:
                new_state_dict[k] = saved_state_dict[k]
                if saved_state_dict[k].shape != state_dict[k].shape:
                    logger.warning(
                        "shape-%s-mismatch. need: %s, get: %s",
                        k,
                        state_dict[k].shape,
                        saved_state_dict[k].shape,
                    )
                    raise KeyError
            except:
                # logger.info(traceback.format_exc())
                logger.info("%s is not in the checkpoint", k)  # missing from the pretrained checkpoint
                new_state_dict[k] = v  # keep the model's own randomly initialized value
if hasattr(model, "module"):
model.module.load_state_dict(new_state_dict, strict=False)
else:
model.load_state_dict(new_state_dict, strict=False)
return model
go(combd, "combd")
model = go(sbd, "sbd")
#############
logger.info("Loaded model weights")
iteration = checkpoint_dict["iteration"]
learning_rate = checkpoint_dict["learning_rate"]
    if (
        optimizer is not None and load_opt == 1
    ):  # if the optimizer state cannot be loaded (e.g. it is empty), it is reinitialized; that may also break the lr scheduler update, so this is caught at the outermost level of the train script
# try:
optimizer.load_state_dict(checkpoint_dict["optimizer"])
# except:
# traceback.print_exc()
logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration))
return model, optimizer, learning_rate, iteration
# def load_checkpoint(checkpoint_path, model, optimizer=None):
# assert os.path.isfile(checkpoint_path)
# checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
# iteration = checkpoint_dict['iteration']
# learning_rate = checkpoint_dict['learning_rate']
# if optimizer is not None:
# optimizer.load_state_dict(checkpoint_dict['optimizer'])
# # print(1111)
# saved_state_dict = checkpoint_dict['model']
# # print(1111)
#
# if hasattr(model, 'module'):
# state_dict = model.module.state_dict()
# else:
# state_dict = model.state_dict()
# new_state_dict= {}
# for k, v in state_dict.items():
# try:
# new_state_dict[k] = saved_state_dict[k]
# except:
# logger.info("%s is not in the checkpoint" % k)
# new_state_dict[k] = v
# if hasattr(model, 'module'):
# model.module.load_state_dict(new_state_dict)
# else:
# model.load_state_dict(new_state_dict)
# logger.info("Loaded checkpoint '{}' (epoch {})" .format(
# checkpoint_path, iteration))
# return model, optimizer, learning_rate, iteration
def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1):
assert os.path.isfile(checkpoint_path)
checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
saved_state_dict = checkpoint_dict["model"]
if hasattr(model, "module"):
state_dict = model.module.state_dict()
else:
state_dict = model.state_dict()
new_state_dict = {}
    for k, v in state_dict.items():  # shapes the model expects
        try:
            new_state_dict[k] = saved_state_dict[k]
            if saved_state_dict[k].shape != state_dict[k].shape:
                logger.warning(
                    "shape-%s-mismatch|need-%s|get-%s",
                    k,
                    state_dict[k].shape,
                    saved_state_dict[k].shape,
                )
                raise KeyError
        except:
            # logger.info(traceback.format_exc())
            logger.info("%s is not in the checkpoint", k)  # missing from the pretrained checkpoint
            new_state_dict[k] = v  # keep the model's own randomly initialized value
if hasattr(model, "module"):
model.module.load_state_dict(new_state_dict, strict=False)
else:
model.load_state_dict(new_state_dict, strict=False)
logger.info("Loaded model weights")
iteration = checkpoint_dict["iteration"]
learning_rate = checkpoint_dict["learning_rate"]
    if (
        optimizer is not None and load_opt == 1
    ):  # if the optimizer state cannot be loaded (e.g. it is empty), it is reinitialized; that may also break the lr scheduler update, so this is caught at the outermost level of the train script
# try:
optimizer.load_state_dict(checkpoint_dict["optimizer"])
# except:
# traceback.print_exc()
logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration))
return model, optimizer, learning_rate, iteration
def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
logger.info(
"Saving model and optimizer state at epoch {} to {}".format(
iteration, checkpoint_path
)
)
if hasattr(model, "module"):
state_dict = model.module.state_dict()
else:
state_dict = model.state_dict()
torch.save(
{
"model": state_dict,
"iteration": iteration,
"optimizer": optimizer.state_dict(),
"learning_rate": learning_rate,
},
checkpoint_path,
)
def save_checkpoint_d(combd, sbd, optimizer, learning_rate, iteration, checkpoint_path):
logger.info(
"Saving model and optimizer state at epoch {} to {}".format(
iteration, checkpoint_path
)
)
if hasattr(combd, "module"):
state_dict_combd = combd.module.state_dict()
else:
state_dict_combd = combd.state_dict()
if hasattr(sbd, "module"):
state_dict_sbd = sbd.module.state_dict()
else:
state_dict_sbd = sbd.state_dict()
torch.save(
{
"combd": state_dict_combd,
"sbd": state_dict_sbd,
"iteration": iteration,
"optimizer": optimizer.state_dict(),
"learning_rate": learning_rate,
},
checkpoint_path,
)
def summarize(
writer,
global_step,
scalars={},
histograms={},
images={},
audios={},
audio_sampling_rate=22050,
):
for k, v in scalars.items():
writer.add_scalar(k, v, global_step)
for k, v in histograms.items():
writer.add_histogram(k, v, global_step)
for k, v in images.items():
writer.add_image(k, v, global_step, dataformats="HWC")
for k, v in audios.items():
writer.add_audio(k, v, global_step, audio_sampling_rate)
def latest_checkpoint_path(dir_path, regex="G_*.pth"):
f_list = glob.glob(os.path.join(dir_path, regex))
f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
x = f_list[-1]
logger.debug(x)
return x
def plot_spectrogram_to_numpy(spectrogram):
global MATPLOTLIB_FLAG
if not MATPLOTLIB_FLAG:
import matplotlib
matplotlib.use("Agg")
MATPLOTLIB_FLAG = True
mpl_logger = logging.getLogger("matplotlib")
mpl_logger.setLevel(logging.WARNING)
import matplotlib.pylab as plt
import numpy as np
fig, ax = plt.subplots(figsize=(10, 2))
im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
plt.colorbar(im, ax=ax)
plt.xlabel("Frames")
plt.ylabel("Channels")
plt.tight_layout()
fig.canvas.draw()
    # np.fromstring is removed in modern numpy; np.frombuffer is the drop-in for raw bytes
    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
plt.close()
return data
def plot_alignment_to_numpy(alignment, info=None):
global MATPLOTLIB_FLAG
if not MATPLOTLIB_FLAG:
import matplotlib
matplotlib.use("Agg")
MATPLOTLIB_FLAG = True
mpl_logger = logging.getLogger("matplotlib")
mpl_logger.setLevel(logging.WARNING)
import matplotlib.pylab as plt
import numpy as np
fig, ax = plt.subplots(figsize=(6, 4))
im = ax.imshow(
alignment.transpose(), aspect="auto", origin="lower", interpolation="none"
)
fig.colorbar(im, ax=ax)
xlabel = "Decoder timestep"
if info is not None:
xlabel += "\n\n" + info
plt.xlabel(xlabel)
plt.ylabel("Encoder timestep")
plt.tight_layout()
fig.canvas.draw()
    # np.fromstring is removed in modern numpy; np.frombuffer is the drop-in for raw bytes
    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
plt.close()
return data
def load_wav_to_torch(full_path):
sampling_rate, data = read(full_path)
return torch.FloatTensor(data.astype(np.float32)), sampling_rate
def load_filepaths_and_text(filename, split="|"):
try:
with open(filename, encoding="utf-8") as f:
filepaths_and_text = [line.strip().split(split) for line in f]
except UnicodeDecodeError:
with open(filename) as f:
filepaths_and_text = [line.strip().split(split) for line in f]
return filepaths_and_text
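# Hedged note (not part of the original file): filelists are parsed as
# "|"-separated fields per line. The example line below is illustrative; the
# f0 training filelist uses five fields (wav, phone, pitch, pitchf, sid), as
# consumed by TextAudioLoaderMultiNSFsid above.
def _demo_parse_filelist_line():
    line = "a.wav|a_phone.npy|a_pitch.npy|a_pitchf.npy|0"
    return line.strip().split("|")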
def get_hparams(init=True):
"""
todo:
结尾七人组:
保存频率、总epoch done
bs done
pretrainG、pretrainD done
卡号:os.en["CUDA_VISIBLE_DEVICES"] done
if_latest done
模型:if_f0 done
采样率:自动选择config done
是否缓存数据集进GPU:if_cache_data_in_gpu done
-m:
自动决定training_files路径,改掉train_nsf_load_pretrain.py里的hps.data.training_files done
-c不要了
"""
parser = argparse.ArgumentParser()
parser.add_argument(
"-se",
"--save_every_epoch",
type=int,
required=True,
help="checkpoint save frequency (epoch)",
)
parser.add_argument(
"-te", "--total_epoch", type=int, required=True, help="total_epoch"
)
parser.add_argument(
"-pg", "--pretrainG", type=str, default="", help="Pretrained Generator path"
)
parser.add_argument(
"-pd", "--pretrainD", type=str, default="", help="Pretrained Discriminator path"
)
parser.add_argument("-g", "--gpus", type=str, default="0", help="split by -")
parser.add_argument(
"-bs", "--batch_size", type=int, required=True, help="batch size"
)
parser.add_argument(
"-e", "--experiment_dir", type=str, required=True, help="experiment dir"
) # -m
parser.add_argument(
"-sr", "--sample_rate", type=str, required=True, help="sample rate, 32k/40k/48k"
)
parser.add_argument(
"-sw",
"--save_every_weights",
type=str,
default="0",
help="save the extracted model in weights directory when saving checkpoints",
)
parser.add_argument(
"-v", "--version", type=str, required=True, help="model version"
)
parser.add_argument(
"-f0",
"--if_f0",
type=int,
required=True,
help="use f0 as one of the inputs of the model, 1 or 0",
)
parser.add_argument(
"-l",
"--if_latest",
type=int,
required=True,
help="if only save the latest G/D pth file, 1 or 0",
)
parser.add_argument(
"-c",
"--if_cache_data_in_gpu",
type=int,
required=True,
help="if caching the dataset in GPU memory, 1 or 0",
)
args = parser.parse_args()
name = args.experiment_dir
experiment_dir = os.path.join("./logs", args.experiment_dir)
config_save_path = os.path.join(experiment_dir, "config.json")
with open(config_save_path, "r") as f:
config = json.load(f)
hparams = HParams(**config)
hparams.model_dir = hparams.experiment_dir = experiment_dir
hparams.save_every_epoch = args.save_every_epoch
hparams.name = name
hparams.total_epoch = args.total_epoch
hparams.pretrainG = args.pretrainG
hparams.pretrainD = args.pretrainD
hparams.version = args.version
hparams.gpus = args.gpus
hparams.train.batch_size = args.batch_size
hparams.sample_rate = args.sample_rate
hparams.if_f0 = args.if_f0
hparams.if_latest = args.if_latest
hparams.save_every_weights = args.save_every_weights
hparams.if_cache_data_in_gpu = args.if_cache_data_in_gpu
hparams.data.training_files = "%s/filelist.txt" % experiment_dir
return hparams
def get_hparams_from_dir(model_dir):
config_save_path = os.path.join(model_dir, "config.json")
with open(config_save_path, "r") as f:
data = f.read()
config = json.loads(data)
hparams = HParams(**config)
hparams.model_dir = model_dir
return hparams
def get_hparams_from_file(config_path):
with open(config_path, "r") as f:
data = f.read()
config = json.loads(data)
hparams = HParams(**config)
return hparams
def check_git_hash(model_dir):
source_dir = os.path.dirname(os.path.realpath(__file__))
if not os.path.exists(os.path.join(source_dir, ".git")):
logger.warning(
"{} is not a git repository, therefore hash value comparison will be ignored.".format(
source_dir
)
)
return
cur_hash = subprocess.getoutput("git rev-parse HEAD")
path = os.path.join(model_dir, "githash")
if os.path.exists(path):
saved_hash = open(path).read()
if saved_hash != cur_hash:
logger.warning(
"git hash values are different. {}(saved) != {}(current)".format(
saved_hash[:8], cur_hash[:8]
)
)
else:
open(path, "w").write(cur_hash)
def get_logger(model_dir, filename="train.log"):
global logger
logger = logging.getLogger(os.path.basename(model_dir))
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
if not os.path.exists(model_dir):
os.makedirs(model_dir)
h = logging.FileHandler(os.path.join(model_dir, filename))
h.setLevel(logging.DEBUG)
h.setFormatter(formatter)
logger.addHandler(h)
return logger
class HParams:
def __init__(self, **kwargs):
for k, v in kwargs.items():
            if isinstance(v, dict):
v = HParams(**v)
self[k] = v
def keys(self):
return self.__dict__.keys()
def items(self):
return self.__dict__.items()
def values(self):
return self.__dict__.values()
def __len__(self):
return len(self.__dict__)
def __getitem__(self, key):
return getattr(self, key)
def __setitem__(self, key, value):
return setattr(self, key, value)
def __contains__(self, key):
return key in self.__dict__
def __repr__(self):
return self.__dict__.__repr__()
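# Hedged usage sketch (not part of the original file): HParams recursively
# wraps nested dicts, so config values support both attribute and item access.
# The keys below are illustrative.
def _demo_hparams():
    hps = HParams(train={"batch_size": 4}, data={"sampling_rate": 40000})
    assert hps.train.batch_size == hps["train"]["batch_size"] == 4
    return hps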
import os
import random
import numpy as np
import torch
import torch.utils.data
from tqdm import tqdm
from . import spec_utils
class VocalRemoverValidationSet(torch.utils.data.Dataset):
def __init__(self, patch_list):
self.patch_list = patch_list
def __len__(self):
return len(self.patch_list)
def __getitem__(self, idx):
path = self.patch_list[idx]
data = np.load(path)
X, y = data["X"], data["y"]
X_mag = np.abs(X)
y_mag = np.abs(y)
return X_mag, y_mag
def make_pair(mix_dir, inst_dir):
input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"]
X_list = sorted(
[
os.path.join(mix_dir, fname)
for fname in os.listdir(mix_dir)
if os.path.splitext(fname)[1] in input_exts
]
)
y_list = sorted(
[
os.path.join(inst_dir, fname)
for fname in os.listdir(inst_dir)
if os.path.splitext(fname)[1] in input_exts
]
)
filelist = list(zip(X_list, y_list))
return filelist
def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
if split_mode == "random":
filelist = make_pair(
os.path.join(dataset_dir, "mixtures"),
os.path.join(dataset_dir, "instruments"),
)
random.shuffle(filelist)
if len(val_filelist) == 0:
val_size = int(len(filelist) * val_rate)
train_filelist = filelist[:-val_size]
val_filelist = filelist[-val_size:]
else:
train_filelist = [
pair for pair in filelist if list(pair) not in val_filelist
]
elif split_mode == "subdirs":
if len(val_filelist) != 0:
raise ValueError(
"The `val_filelist` option is not available in `subdirs` mode"
)
train_filelist = make_pair(
os.path.join(dataset_dir, "training/mixtures"),
os.path.join(dataset_dir, "training/instruments"),
)
val_filelist = make_pair(
os.path.join(dataset_dir, "validation/mixtures"),
os.path.join(dataset_dir, "validation/instruments"),
)
return train_filelist, val_filelist
def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha):
perm = np.random.permutation(len(X))
for i, idx in enumerate(tqdm(perm)):
if np.random.uniform() < reduction_rate:
y[idx] = spec_utils.reduce_vocal_aggressively(
X[idx], y[idx], reduction_mask
)
if np.random.uniform() < 0.5:
# swap channel
X[idx] = X[idx, ::-1]
y[idx] = y[idx, ::-1]
if np.random.uniform() < 0.02:
# mono
X[idx] = X[idx].mean(axis=0, keepdims=True)
y[idx] = y[idx].mean(axis=0, keepdims=True)
if np.random.uniform() < 0.02:
# inst
X[idx] = y[idx]
if np.random.uniform() < mixup_rate and i < len(perm) - 1:
lam = np.random.beta(mixup_alpha, mixup_alpha)
X[idx] = lam * X[idx] + (1 - lam) * X[perm[i + 1]]
y[idx] = lam * y[idx] + (1 - lam) * y[perm[i + 1]]
return X, y
def make_padding(width, cropsize, offset):
left = offset
roi_size = cropsize - left * 2
if roi_size == 0:
roi_size = cropsize
right = roi_size - (width % roi_size) + left
return left, right, roi_size
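# Hedged illustration (not part of the original file): make_padding sizes the
# left/right pads so a width-W spectrogram tiles into roi_size windows with
# `offset` columns of context on each side. The numbers are illustrative.
def _demo_make_padding():
    left, right, roi_size = make_padding(100, 64, 8)
    # roi_size = 64 - 2 * 8 = 48; right = 48 - (100 % 48) + 8 = 52
    assert (left, right, roi_size) == (8, 52, 48)
    return left, right, roi_size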
def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset):
len_dataset = patches * len(filelist)
X_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
y_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
for i, (X_path, y_path) in enumerate(tqdm(filelist)):
X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
coef = np.max([np.abs(X).max(), np.abs(y).max()])
X, y = X / coef, y / coef
l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")
        # Guard the upper bound: when the padded width equals cropsize the
        # only valid start is 0, and randint(0, 0) would raise.
        starts = np.random.randint(0, max(X_pad.shape[2] - cropsize, 1), patches)
        ends = starts + cropsize
for j in range(patches):
idx = i * patches + j
X_dataset[idx] = X_pad[:, :, starts[j] : ends[j]]
y_dataset[idx] = y_pad[:, :, starts[j] : ends[j]]
return X_dataset, y_dataset
def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
patch_list = []
patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(
cropsize, sr, hop_length, n_fft, offset
)
os.makedirs(patch_dir, exist_ok=True)
for i, (X_path, y_path) in enumerate(tqdm(filelist)):
basename = os.path.splitext(os.path.basename(X_path))[0]
X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
coef = np.max([np.abs(X).max(), np.abs(y).max()])
X, y = X / coef, y / coef
l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")
len_dataset = int(np.ceil(X.shape[2] / roi_size))
for j in range(len_dataset):
outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j))
start = j * roi_size
if not os.path.exists(outpath):
np.savez(
outpath,
X=X_pad[:, :, start : start + cropsize],
y=y_pad[:, :, start : start + cropsize],
)
patch_list.append(outpath)
return VocalRemoverValidationSet(patch_list)
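# Minimal end-to-end sketch of this dataset pipeline, assuming a local
# "dataset" directory with "mixtures/" and "instruments/" subfolders of
# paired audio files that spec_utils.cache_or_load can read; the
# sr/hop_length/n_fft values mirror the 1-band 44.1 kHz preset further below.
if __name__ == "__main__":
    train_list, val_list = train_val_split(
        dataset_dir="dataset", split_mode="random", val_rate=0.1, val_filelist=[]
    )
    X_train, y_train = make_training_set(
        train_list, cropsize=256, patches=16, sr=44100, hop_length=1024,
        n_fft=2048, offset=32,
    )
    val_set = make_validation_set(
        val_list, cropsize=256, sr=44100, hop_length=1024, n_fft=2048, offset=32
    )
    val_loader = torch.utils.data.DataLoader(val_set, batch_size=4, shuffle=False)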
import torch
import torch.nn.functional as F
from torch import nn
from . import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False,
),
nn.BatchNorm2d(nout),
activ(),
)
def __call__(self, x):
return self.conv(x)
class SeperableConv2DBNActiv(nn.Module):
    # Depthwise-separable convolution: a grouped (depthwise) conv followed by
    # a 1x1 pointwise conv, then BatchNorm and activation.
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False,
),
nn.Conv2d(nin, nout, kernel_size=1, bias=False),
nn.BatchNorm2d(nout),
activ(),
)
def __call__(self, x):
return self.conv(x)
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
def __call__(self, x):
skip = self.conv1(x)
h = self.conv2(skip)
return h, skip
class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
h = self.conv(x)
if self.dropout is not None:
h = self.dropout(h)
return h
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)
feat5 = self.conv5(x)
out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
bottle = self.bottleneck(out)
return bottle
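# Shape sketch for the blocks above, assuming a stereo spectrogram batch of
# (N, channels, bins, frames). Encoder keeps the pre-stride activation as the
# skip connection, ASPPModule preserves spatial size, and Decoder upsamples
# 2x before fusing the skip.
if __name__ == "__main__":
    x = torch.randn(1, 2, 64, 64)
    enc = Encoder(2, 16, ksize=3, stride=2, pad=1)
    h, skip = enc(x)  # h: (1, 16, 32, 32); skip: (1, 16, 64, 64)
    h = ASPPModule(16, 16)(h)  # spatial size preserved: (1, 16, 32, 32)
    out = Decoder(16 + 16, 8)(h, skip)  # upsample + skip fuse: (1, 8, 64, 64)
    assert out.shape == (1, 8, 64, 64)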
import torch
import torch.nn.functional as F
from torch import nn
from . import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False,
),
nn.BatchNorm2d(nout),
activ(),
)
def __call__(self, x):
return self.conv(x)
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False,
),
nn.Conv2d(nin, nout, kernel_size=1, bias=False),
nn.BatchNorm2d(nout),
activ(),
)
def __call__(self, x):
return self.conv(x)
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
def __call__(self, x):
skip = self.conv1(x)
h = self.conv2(skip)
return h, skip
class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
h = self.conv(x)
if self.dropout is not None:
h = self.dropout(h)
return h
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
        # Note: conv6 and conv7 reuse dilations[2]; dilations[3] and
        # dilations[4] from the default (4, 8, 16, 32, 64) are never used
        # as written.
        self.conv6 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
        )
        self.conv7 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
        )
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)
feat5 = self.conv5(x)
feat6 = self.conv6(x)
feat7 = self.conv7(x)
out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
bottle = self.bottleneck(out)
return bottle
import torch
import torch.nn.functional as F
from torch import nn
from . import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin,
nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False,
),
nn.BatchNorm2d(nout),
activ(),
)
def __call__(self, x):
return self.conv(x)
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)
self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
def __call__(self, x):
h = self.conv1(x)
h = self.conv2(h)
return h
class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
# self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
h = self.conv1(x)
# h = self.conv2(h)
if self.dropout is not None:
h = self.dropout(h)
return h
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)
self.conv3 = Conv2DBNActiv(
nin, nout, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = Conv2DBNActiv(
nin, nout, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = Conv2DBNActiv(
nin, nout, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)
feat5 = self.conv5(x)
out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
out = self.bottleneck(out)
if self.dropout is not None:
out = self.dropout(out)
return out
class LSTMModule(nn.Module):
def __init__(self, nin_conv, nin_lstm, nout_lstm):
super(LSTMModule, self).__init__()
self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
self.lstm = nn.LSTM(
input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True
)
self.dense = nn.Sequential(
nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU()
)
def forward(self, x):
N, _, nbins, nframes = x.size()
h = self.conv(x)[:, 0] # N, nbins, nframes
h = h.permute(2, 0, 1) # nframes, N, nbins
h, _ = self.lstm(h)
h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins
h = h.reshape(nframes, N, 1, nbins)
h = h.permute(1, 2, 3, 0)
return h
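# Shape sketch for LSTMModule, assuming input (N, nin_conv, nbins, nframes)
# with nbins == nin_lstm. The 1x1 conv collapses channels to one, the BiLSTM
# runs along the frame axis, and the dense head projects back to nbins.
if __name__ == "__main__":
    m = LSTMModule(nin_conv=16, nin_lstm=64, nout_lstm=128).eval()
    with torch.no_grad():
        h = m(torch.randn(1, 16, 64, 50))
    assert h.shape == (1, 1, 64, 50)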
import json
import os
import pathlib
default_param = {}
default_param["bins"] = 768
default_param["unstable_bins"] = 9 # training only
default_param["reduction_bins"] = 762 # training only
default_param["sr"] = 44100
default_param["pre_filter_start"] = 757
default_param["pre_filter_stop"] = 768
default_param["band"] = {}
default_param["band"][1] = {
"sr": 11025,
"hl": 128,
"n_fft": 960,
"crop_start": 0,
"crop_stop": 245,
"lpf_start": 61, # inference only
"res_type": "polyphase",
}
default_param["band"][2] = {
"sr": 44100,
"hl": 512,
"n_fft": 1536,
"crop_start": 24,
"crop_stop": 547,
"hpf_start": 81, # inference only
"res_type": "sinc_best",
}
def int_keys(d):
    # Used as the json `object_pairs_hook`, so `d` arrives as a list of
    # (key, value) pairs; digit-only string keys (the band indices) become ints.
    r = {}
    for k, v in d:
        if k.isdigit():
            k = int(k)
        r[k] = v
    return r
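# Quick check of int_keys on a pairs list, as json supplies it through
# object_pairs_hook:
if __name__ == "__main__":
    assert int_keys([("1", "a"), ("sr", 44100)]) == {1: "a", "sr": 44100}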
class ModelParameters(object):
def __init__(self, config_path=""):
if ".pth" == pathlib.Path(config_path).suffix:
import zipfile
with zipfile.ZipFile(config_path, "r") as zip:
self.param = json.loads(
zip.read("param.json"), object_pairs_hook=int_keys
)
elif ".json" == pathlib.Path(config_path).suffix:
with open(config_path, "r") as f:
self.param = json.loads(f.read(), object_pairs_hook=int_keys)
else:
self.param = default_param
for k in [
"mid_side",
"mid_side_b",
"mid_side_b2",
"stereo_w",
"stereo_n",
"reverse",
]:
            if k not in self.param:
self.param[k] = False
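# Minimal usage sketch (the JSON path is illustrative; the presets below show
# the expected schema):
if __name__ == "__main__":
    mp = ModelParameters("modelparams/1band_sr44100_hl1024.json")
    band1 = mp.param["band"][1]  # int_keys turned the "1" key into int 1
    print(mp.param["sr"], band1["n_fft"], band1["hl"])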
{
"bins": 1024,
"unstable_bins": 0,
"reduction_bins": 0,
"band": {
"1": {
"sr": 16000,
"hl": 512,
"n_fft": 2048,
"crop_start": 0,
"crop_stop": 1024,
"hpf_start": -1,
"res_type": "sinc_best"
}
},
"sr": 16000,
"pre_filter_start": 1023,
"pre_filter_stop": 1024
}
{
"bins": 1024,
"unstable_bins": 0,
"reduction_bins": 0,
"band": {
"1": {
"sr": 32000,
"hl": 512,
"n_fft": 2048,
"crop_start": 0,
"crop_stop": 1024,
"hpf_start": -1,
"res_type": "kaiser_fast"
}
},
"sr": 32000,
"pre_filter_start": 1000,
"pre_filter_stop": 1021
}
{
"bins": 1024,
"unstable_bins": 0,
"reduction_bins": 0,
"band": {
"1": {
"sr": 33075,
"hl": 384,
"n_fft": 2048,
"crop_start": 0,
"crop_stop": 1024,
"hpf_start": -1,
"res_type": "sinc_best"
}
},
"sr": 33075,
"pre_filter_start": 1000,
"pre_filter_stop": 1021
}
{
"bins": 1024,
"unstable_bins": 0,
"reduction_bins": 0,
"band": {
"1": {
"sr": 44100,
"hl": 1024,
"n_fft": 2048,
"crop_start": 0,
"crop_stop": 1024,
"hpf_start": -1,
"res_type": "sinc_best"
}
},
"sr": 44100,
"pre_filter_start": 1023,
"pre_filter_stop": 1024
}