import os
import sys
import logging
logger = logging.getLogger(__name__)
now_dir = os.getcwd()
sys.path.append(now_dir)
import datetime
from infer.lib.train import utils
hps = utils.get_hparams()
os.environ["CUDA_VISIBLE_DEVICES"] = hps.gpus.replace("-", ",")
n_gpus = len(hps.gpus.split("-"))
from random import randint, shuffle
import torch
try:
import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
if torch.xpu.is_available():
from infer.modules.ipex import ipex_init
from infer.modules.ipex.gradscaler import gradscaler_init
from torch.xpu.amp import autocast
GradScaler = gradscaler_init()
ipex_init()
else:
from torch.cuda.amp import GradScaler, autocast
except Exception:
from torch.cuda.amp import GradScaler, autocast
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False
from time import sleep
from time import time as ttime
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn import functional as F
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from infer.lib.infer_pack import commons
from infer.lib.train.data_utils import (
DistributedBucketSampler,
TextAudioCollate,
TextAudioCollateMultiNSFsid,
TextAudioLoader,
TextAudioLoaderMultiNSFsid,
)
if hps.version == "v1":
from infer.lib.infer_pack.models import MultiPeriodDiscriminator
from infer.lib.infer_pack.models import SynthesizerTrnMs256NSFsid as RVC_Model_f0
from infer.lib.infer_pack.models import (
SynthesizerTrnMs256NSFsid_nono as RVC_Model_nof0,
)
else:
from infer.lib.infer_pack.models import (
SynthesizerTrnMs768NSFsid as RVC_Model_f0,
SynthesizerTrnMs768NSFsid_nono as RVC_Model_nof0,
MultiPeriodDiscriminatorV2 as MultiPeriodDiscriminator,
)
from infer.lib.train.losses import (
discriminator_loss,
feature_loss,
generator_loss,
kl_loss,
)
from infer.lib.train.mel_processing import mel_spectrogram_torch, spec_to_mel_torch
from infer.lib.train.process_ckpt import savee
global_step = 0
class EpochRecorder:
def __init__(self):
self.last_time = ttime()
def record(self):
now_time = ttime()
elapsed_time = now_time - self.last_time
self.last_time = now_time
elapsed_time_str = str(datetime.timedelta(seconds=elapsed_time))
current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
return f"[{current_time}] | ({elapsed_time_str})"
def main():
n_gpus = torch.cuda.device_count()
    if not torch.cuda.is_available() and torch.backends.mps.is_available():
        n_gpus = 1
if n_gpus < 1:
# patch to unblock people without gpus. there is probably a better way.
print("NO GPU DETECTED: falling back to CPU - this may take a while")
n_gpus = 1
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = str(randint(20000, 55555))
children = []
logger = utils.get_logger(hps.model_dir)
for i in range(n_gpus):
subproc = mp.Process(
target=run,
args=(i, n_gpus, hps, logger),
)
children.append(subproc)
subproc.start()
for i in range(n_gpus):
children[i].join()
def run(rank, n_gpus, hps, logger: logging.Logger):
global global_step
if rank == 0:
# logger = utils.get_logger(hps.model_dir)
logger.info(hps)
# utils.check_git_hash(hps.model_dir)
writer = SummaryWriter(log_dir=hps.model_dir)
writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
dist.init_process_group(
backend="gloo", init_method="env://", world_size=n_gpus, rank=rank
)
torch.manual_seed(hps.train.seed)
if torch.cuda.is_available():
torch.cuda.set_device(rank)
if hps.if_f0 == 1:
train_dataset = TextAudioLoaderMultiNSFsid(hps.data.training_files, hps.data)
else:
train_dataset = TextAudioLoader(hps.data.training_files, hps.data)
train_sampler = DistributedBucketSampler(
train_dataset,
hps.train.batch_size * n_gpus,
# [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1200,1400], # 16s
[100, 200, 300, 400, 500, 600, 700, 800, 900], # 16s
num_replicas=n_gpus,
rank=rank,
shuffle=True,
)
# It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit.
# num_workers=8 -> num_workers=4
if hps.if_f0 == 1:
collate_fn = TextAudioCollateMultiNSFsid()
else:
collate_fn = TextAudioCollate()
train_loader = DataLoader(
train_dataset,
num_workers=4,
shuffle=False,
pin_memory=True,
collate_fn=collate_fn,
batch_sampler=train_sampler,
persistent_workers=True,
prefetch_factor=8,
)
if hps.if_f0 == 1:
net_g = RVC_Model_f0(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
**hps.model,
is_half=hps.train.fp16_run,
sr=hps.sample_rate,
)
else:
net_g = RVC_Model_nof0(
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
**hps.model,
is_half=hps.train.fp16_run,
)
if torch.cuda.is_available():
net_g = net_g.cuda(rank)
net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm)
if torch.cuda.is_available():
net_d = net_d.cuda(rank)
optim_g = torch.optim.AdamW(
net_g.parameters(),
hps.train.learning_rate,
betas=hps.train.betas,
eps=hps.train.eps,
)
optim_d = torch.optim.AdamW(
net_d.parameters(),
hps.train.learning_rate,
betas=hps.train.betas,
eps=hps.train.eps,
)
# net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
# net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
if hasattr(torch, "xpu") and torch.xpu.is_available():
pass
elif torch.cuda.is_available():
net_g = DDP(net_g, device_ids=[rank])
net_d = DDP(net_d, device_ids=[rank])
else:
net_g = DDP(net_g)
net_d = DDP(net_d)
    try:  # attempt to auto-resume from the latest checkpoint
        _, _, _, epoch_str = utils.load_checkpoint(
            utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d
        )  # loading D usually succeeds
if rank == 0:
logger.info("loaded D")
# _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0)
_, _, _, epoch_str = utils.load_checkpoint(
utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g
)
global_step = (epoch_str - 1) * len(train_loader)
# epoch_str = 1
# global_step = 0
    except:  # nothing to resume on a first run; fall back to the pretrained weights
# traceback.print_exc()
epoch_str = 1
global_step = 0
if hps.pretrainG != "":
if rank == 0:
logger.info("loaded pretrained %s" % (hps.pretrainG))
if hasattr(net_g, "module"):
logger.info(
net_g.module.load_state_dict(
torch.load(hps.pretrainG, map_location="cpu")["model"]
)
                )  # deliberately not loading the optimizer state
else:
logger.info(
net_g.load_state_dict(
torch.load(hps.pretrainG, map_location="cpu")["model"]
)
                )  # deliberately not loading the optimizer state
if hps.pretrainD != "":
if rank == 0:
logger.info("loaded pretrained %s" % (hps.pretrainD))
if hasattr(net_d, "module"):
logger.info(
net_d.module.load_state_dict(
torch.load(hps.pretrainD, map_location="cpu")["model"]
)
)
else:
logger.info(
net_d.load_state_dict(
torch.load(hps.pretrainD, map_location="cpu")["model"]
)
)
scheduler_g = torch.optim.lr_scheduler.ExponentialLR(
optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
)
scheduler_d = torch.optim.lr_scheduler.ExponentialLR(
optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
)
scaler = GradScaler(enabled=hps.train.fp16_run)
cache = []
for epoch in range(epoch_str, hps.train.epochs + 1):
if rank == 0:
train_and_evaluate(
rank,
epoch,
hps,
[net_g, net_d],
[optim_g, optim_d],
[scheduler_g, scheduler_d],
scaler,
[train_loader, None],
logger,
[writer, writer_eval],
cache,
)
else:
train_and_evaluate(
rank,
epoch,
hps,
[net_g, net_d],
[optim_g, optim_d],
[scheduler_g, scheduler_d],
scaler,
[train_loader, None],
None,
None,
cache,
)
scheduler_g.step()
scheduler_d.step()
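# One epoch of adversarial training. Per batch the discriminator is updated
# first on (real wave, detached generated wave), then the generator is updated
# with adversarial + feature-matching + mel-L1 + KL losses, all under AMP
# autocast with a shared GradScaler. With if_cache_data_in_gpu set, batches
# stay on the GPU after the first epoch and are merely reshuffled afterwards.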
def train_and_evaluate(
rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers, cache
):
net_g, net_d = nets
optim_g, optim_d = optims
train_loader, eval_loader = loaders
if writers is not None:
writer, writer_eval = writers
train_loader.batch_sampler.set_epoch(epoch)
global global_step
net_g.train()
net_d.train()
# Prepare data iterator
    if hps.if_cache_data_in_gpu:
# Use Cache
data_iterator = cache
if cache == []:
# Make new cache
for batch_idx, info in enumerate(train_loader):
# Unpack
if hps.if_f0 == 1:
(
phone,
phone_lengths,
pitch,
pitchf,
spec,
spec_lengths,
wave,
wave_lengths,
sid,
) = info
else:
(
phone,
phone_lengths,
spec,
spec_lengths,
wave,
wave_lengths,
sid,
) = info
# Load on CUDA
if torch.cuda.is_available():
phone = phone.cuda(rank, non_blocking=True)
phone_lengths = phone_lengths.cuda(rank, non_blocking=True)
if hps.if_f0 == 1:
pitch = pitch.cuda(rank, non_blocking=True)
pitchf = pitchf.cuda(rank, non_blocking=True)
sid = sid.cuda(rank, non_blocking=True)
spec = spec.cuda(rank, non_blocking=True)
spec_lengths = spec_lengths.cuda(rank, non_blocking=True)
wave = wave.cuda(rank, non_blocking=True)
wave_lengths = wave_lengths.cuda(rank, non_blocking=True)
# Cache on list
if hps.if_f0 == 1:
cache.append(
(
batch_idx,
(
phone,
phone_lengths,
pitch,
pitchf,
spec,
spec_lengths,
wave,
wave_lengths,
sid,
),
)
)
else:
cache.append(
(
batch_idx,
(
phone,
phone_lengths,
spec,
spec_lengths,
wave,
wave_lengths,
sid,
),
)
)
else:
# Load shuffled cache
shuffle(cache)
else:
# Loader
data_iterator = enumerate(train_loader)
# Run steps
epoch_recorder = EpochRecorder()
for batch_idx, info in data_iterator:
# Data
## Unpack
if hps.if_f0 == 1:
(
phone,
phone_lengths,
pitch,
pitchf,
spec,
spec_lengths,
wave,
wave_lengths,
sid,
) = info
else:
phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info
## Load on CUDA
        if not hps.if_cache_data_in_gpu and torch.cuda.is_available():
phone = phone.cuda(rank, non_blocking=True)
phone_lengths = phone_lengths.cuda(rank, non_blocking=True)
if hps.if_f0 == 1:
pitch = pitch.cuda(rank, non_blocking=True)
pitchf = pitchf.cuda(rank, non_blocking=True)
sid = sid.cuda(rank, non_blocking=True)
spec = spec.cuda(rank, non_blocking=True)
spec_lengths = spec_lengths.cuda(rank, non_blocking=True)
wave = wave.cuda(rank, non_blocking=True)
# wave_lengths = wave_lengths.cuda(rank, non_blocking=True)
# Calculate
with autocast(enabled=hps.train.fp16_run):
if hps.if_f0 == 1:
(
y_hat,
ids_slice,
x_mask,
z_mask,
(z, z_p, m_p, logs_p, m_q, logs_q),
) = net_g(phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid)
else:
(
y_hat,
ids_slice,
x_mask,
z_mask,
(z, z_p, m_p, logs_p, m_q, logs_q),
) = net_g(phone, phone_lengths, spec, spec_lengths, sid)
mel = spec_to_mel_torch(
spec,
hps.data.filter_length,
hps.data.n_mel_channels,
hps.data.sampling_rate,
hps.data.mel_fmin,
hps.data.mel_fmax,
)
y_mel = commons.slice_segments(
mel, ids_slice, hps.train.segment_size // hps.data.hop_length
)
with autocast(enabled=False):
y_hat_mel = mel_spectrogram_torch(
y_hat.float().squeeze(1),
hps.data.filter_length,
hps.data.n_mel_channels,
hps.data.sampling_rate,
hps.data.hop_length,
hps.data.win_length,
hps.data.mel_fmin,
hps.data.mel_fmax,
)
            if hps.train.fp16_run:
y_hat_mel = y_hat_mel.half()
wave = commons.slice_segments(
wave, ids_slice * hps.data.hop_length, hps.train.segment_size
) # slice
# Discriminator
y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach())
with autocast(enabled=False):
loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(
y_d_hat_r, y_d_hat_g
)
optim_d.zero_grad()
scaler.scale(loss_disc).backward()
scaler.unscale_(optim_d)
grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
scaler.step(optim_d)
with autocast(enabled=hps.train.fp16_run):
# Generator
y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat)
with autocast(enabled=False):
loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
loss_fm = feature_loss(fmap_r, fmap_g)
loss_gen, losses_gen = generator_loss(y_d_hat_g)
loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
optim_g.zero_grad()
scaler.scale(loss_gen_all).backward()
scaler.unscale_(optim_g)
grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
scaler.step(optim_g)
scaler.update()
if rank == 0:
if global_step % hps.train.log_interval == 0:
lr = optim_g.param_groups[0]["lr"]
logger.info(
"Train Epoch: {} [{:.0f}%]".format(
epoch, 100.0 * batch_idx / len(train_loader)
)
)
# Amor For Tensorboard display
if loss_mel > 75:
loss_mel = 75
if loss_kl > 9:
loss_kl = 9
logger.info([global_step, lr])
                logger.info(
                    f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f}, "
                    f"loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}"
                )
scalar_dict = {
"loss/g/total": loss_gen_all,
"loss/d/total": loss_disc,
"learning_rate": lr,
"grad_norm_d": grad_norm_d,
"grad_norm_g": grad_norm_g,
}
scalar_dict.update(
{
"loss/g/fm": loss_fm,
"loss/g/mel": loss_mel,
"loss/g/kl": loss_kl,
}
)
scalar_dict.update(
{"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}
)
scalar_dict.update(
{"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}
)
scalar_dict.update(
{"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}
)
image_dict = {
"slice/mel_org": utils.plot_spectrogram_to_numpy(
y_mel[0].data.cpu().numpy()
),
"slice/mel_gen": utils.plot_spectrogram_to_numpy(
y_hat_mel[0].data.cpu().numpy()
),
"all/mel": utils.plot_spectrogram_to_numpy(
mel[0].data.cpu().numpy()
),
}
utils.summarize(
writer=writer,
global_step=global_step,
images=image_dict,
scalars=scalar_dict,
)
global_step += 1
# /Run steps
if epoch % hps.save_every_epoch == 0 and rank == 0:
if hps.if_latest == 0:
utils.save_checkpoint(
net_g,
optim_g,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "G_{}.pth".format(global_step)),
)
utils.save_checkpoint(
net_d,
optim_d,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "D_{}.pth".format(global_step)),
)
else:
utils.save_checkpoint(
net_g,
optim_g,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "G_{}.pth".format(2333333)),
)
utils.save_checkpoint(
net_d,
optim_d,
hps.train.learning_rate,
epoch,
os.path.join(hps.model_dir, "D_{}.pth".format(2333333)),
)
if rank == 0 and hps.save_every_weights == "1":
if hasattr(net_g, "module"):
ckpt = net_g.module.state_dict()
else:
ckpt = net_g.state_dict()
logger.info(
"saving ckpt %s_e%s:%s"
% (
hps.name,
epoch,
savee(
ckpt,
hps.sample_rate,
hps.if_f0,
hps.name + "_e%s_s%s" % (epoch, global_step),
epoch,
hps.version,
hps,
),
)
)
if rank == 0:
logger.info("====> Epoch: {} {}".format(epoch, epoch_recorder.record()))
if epoch >= hps.total_epoch and rank == 0:
logger.info("Training is done. The program is closed.")
if hasattr(net_g, "module"):
ckpt = net_g.module.state_dict()
else:
ckpt = net_g.state_dict()
logger.info(
"saving final ckpt:%s"
% (
savee(
ckpt, hps.sample_rate, hps.if_f0, hps.name, epoch, hps.version, hps
)
)
)
sleep(1)
os._exit(2333333)
if __name__ == "__main__":
torch.multiprocessing.set_start_method("spawn")
main()
import os
import logging
logger = logging.getLogger(__name__)
import librosa
import numpy as np
import soundfile as sf
import torch
from tqdm import tqdm
cpu = torch.device("cpu")
class ConvTDFNetTrim:
def __init__(
self, device, model_name, target_name, L, dim_f, dim_t, n_fft, hop=1024
):
super(ConvTDFNetTrim, self).__init__()
self.dim_f = dim_f
self.dim_t = 2**dim_t
self.n_fft = n_fft
self.hop = hop
self.n_bins = self.n_fft // 2 + 1
self.chunk_size = hop * (self.dim_t - 1)
self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to(
device
)
self.target_name = target_name
self.blender = "blender" in model_name
self.dim_c = 4
out_c = self.dim_c * 4 if target_name == "*" else self.dim_c
self.freq_pad = torch.zeros(
[1, out_c, self.n_bins - self.dim_f, self.dim_t]
).to(device)
self.n = L // 2
def stft(self, x):
x = x.reshape([-1, self.chunk_size])
x = torch.stft(
x,
n_fft=self.n_fft,
hop_length=self.hop,
window=self.window,
center=True,
return_complex=True,
)
x = torch.view_as_real(x)
x = x.permute([0, 3, 1, 2])
x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape(
[-1, self.dim_c, self.n_bins, self.dim_t]
)
return x[:, :, : self.dim_f]
def istft(self, x, freq_pad=None):
freq_pad = (
self.freq_pad.repeat([x.shape[0], 1, 1, 1])
if freq_pad is None
else freq_pad
)
x = torch.cat([x, freq_pad], -2)
c = 4 * 2 if self.target_name == "*" else 2
x = x.reshape([-1, c, 2, self.n_bins, self.dim_t]).reshape(
[-1, 2, self.n_bins, self.dim_t]
)
x = x.permute([0, 2, 3, 1])
x = x.contiguous()
x = torch.view_as_complex(x)
x = torch.istft(
x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True
)
return x.reshape([-1, c, self.chunk_size])
def get_models(device, dim_f, dim_t, n_fft):
return ConvTDFNetTrim(
device=device,
model_name="Conv-TDF",
target_name="vocals",
L=11,
dim_f=dim_f,
dim_t=dim_t,
n_fft=n_fft,
)
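# Thin ONNX Runtime wrapper around the separation model. Providers are listed
# in order of preference; onnxruntime picks the first one that is available
# (CUDA -> DirectML -> CPU).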
class Predictor:
def __init__(self, args):
import onnxruntime as ort
logger.info(ort.get_available_providers())
self.args = args
self.model_ = get_models(
device=cpu, dim_f=args.dim_f, dim_t=args.dim_t, n_fft=args.n_fft
)
self.model = ort.InferenceSession(
os.path.join(args.onnx, self.model_.target_name + ".onnx"),
providers=[
"CUDAExecutionProvider",
"DmlExecutionProvider",
"CPUExecutionProvider",
],
)
logger.info("ONNX load done")
def demix(self, mix):
samples = mix.shape[-1]
margin = self.args.margin
chunk_size = self.args.chunks * 44100
        assert margin != 0, "margin cannot be zero!"
if margin > chunk_size:
margin = chunk_size
segmented_mix = {}
if self.args.chunks == 0 or samples < chunk_size:
chunk_size = samples
counter = -1
for skip in range(0, samples, chunk_size):
counter += 1
s_margin = 0 if counter == 0 else margin
end = min(skip + chunk_size + margin, samples)
start = skip - s_margin
segmented_mix[skip] = mix[:, start:end].copy()
if end == samples:
break
        sources = self.demix_base(segmented_mix, margin_size=margin)
        # shapes: mix is (2, big_sample); segmented_mix maps offset -> (2, small_sample);
        # sources is (1, 2, big_sample)
        return sources
def demix_base(self, mixes, margin_size):
chunked_sources = []
progress_bar = tqdm(total=len(mixes))
progress_bar.set_description("Processing")
for mix in mixes:
cmix = mixes[mix]
sources = []
n_sample = cmix.shape[1]
model = self.model_
trim = model.n_fft // 2
gen_size = model.chunk_size - 2 * trim
pad = gen_size - n_sample % gen_size
mix_p = np.concatenate(
(np.zeros((2, trim)), cmix, np.zeros((2, pad)), np.zeros((2, trim))), 1
)
mix_waves = []
i = 0
while i < n_sample + pad:
waves = np.array(mix_p[:, i : i + model.chunk_size])
mix_waves.append(waves)
i += gen_size
mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(cpu)
with torch.no_grad():
_ort = self.model
spek = model.stft(mix_waves)
if self.args.denoise:
spec_pred = (
-_ort.run(None, {"input": -spek.cpu().numpy()})[0] * 0.5
+ _ort.run(None, {"input": spek.cpu().numpy()})[0] * 0.5
)
tar_waves = model.istft(torch.tensor(spec_pred))
else:
tar_waves = model.istft(
torch.tensor(_ort.run(None, {"input": spek.cpu().numpy()})[0])
)
tar_signal = (
tar_waves[:, :, trim:-trim]
.transpose(0, 1)
.reshape(2, -1)
.numpy()[:, :-pad]
)
start = 0 if mix == 0 else margin_size
end = None if mix == list(mixes.keys())[::-1][0] else -margin_size
if margin_size == 0:
end = None
sources.append(tar_signal[:, start:end])
progress_bar.update(1)
chunked_sources.append(sources)
_sources = np.concatenate(chunked_sources, axis=-1)
# del self.model
progress_bar.close()
return _sources
def prediction(self, m, vocal_root, others_root, format):
os.makedirs(vocal_root, exist_ok=True)
os.makedirs(others_root, exist_ok=True)
basename = os.path.basename(m)
mix, rate = librosa.load(m, mono=False, sr=44100)
if mix.ndim == 1:
mix = np.asfortranarray([mix, mix])
mix = mix.T
sources = self.demix(mix.T)
opt = sources[0].T
if format in ["wav", "flac"]:
sf.write(
"%s/%s_main_vocal.%s" % (vocal_root, basename, format), mix - opt, rate
)
sf.write("%s/%s_others.%s" % (others_root, basename, format), opt, rate)
else:
path_vocal = "%s/%s_main_vocal.wav" % (vocal_root, basename)
path_other = "%s/%s_others.wav" % (others_root, basename)
sf.write(path_vocal, mix - opt, rate)
sf.write(path_other, opt, rate)
opt_path_vocal = path_vocal[:-4] + ".%s" % format
opt_path_other = path_other[:-4] + ".%s" % format
if os.path.exists(path_vocal):
                os.system(
                    'ffmpeg -i "%s" -vn -q:a 2 -y "%s"' % (path_vocal, opt_path_vocal)
                )
if os.path.exists(opt_path_vocal):
try:
os.remove(path_vocal)
except:
pass
if os.path.exists(path_other):
                os.system(
                    'ffmpeg -i "%s" -vn -q:a 2 -y "%s"' % (path_other, opt_path_other)
                )
if os.path.exists(opt_path_other):
try:
os.remove(path_other)
except:
pass
class MDXNetDereverb:
def __init__(self, chunks, device):
self.onnx = "assets/uvr5_weights/onnx_dereverb_By_FoxJoy"
self.shifts = 10 # 'Predict with randomised equivariant stabilisation'
self.mixing = "min_mag" # ['default','min_mag','max_mag']
self.chunks = chunks
self.margin = 44100
self.dim_t = 9
self.dim_f = 3072
self.n_fft = 6144
self.denoise = True
self.pred = Predictor(self)
self.device = device
def _path_audio_(self, input, vocal_root, others_root, format, is_hp3=False):
self.pred.prediction(input, vocal_root, others_root, format)
import os
import traceback
import logging
logger = logging.getLogger(__name__)
import ffmpeg
import torch
from configs.config import Config
from infer.modules.uvr5.mdxnet import MDXNetDereverb
from infer.modules.uvr5.vr import AudioPre, AudioPreDeEcho
config = Config()
def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0):
infos = []
try:
inp_root = inp_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
save_root_vocal = (
save_root_vocal.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
)
save_root_ins = (
save_root_ins.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
)
if model_name == "onnx_dereverb_By_FoxJoy":
pre_fun = MDXNetDereverb(15, config.device)
else:
func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho
pre_fun = func(
agg=int(agg),
model_path=os.path.join(
os.getenv("weight_uvr5_root"), model_name + ".pth"
),
device=config.device,
is_half=config.is_half,
)
is_hp3 = "HP3" in model_name
if inp_root != "":
paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)]
else:
paths = [path.name for path in paths]
for path in paths:
inp_path = os.path.join(inp_root, path)
need_reformat = 1
done = 0
try:
info = ffmpeg.probe(inp_path, cmd="ffprobe")
if (
info["streams"][0]["channels"] == 2
and info["streams"][0]["sample_rate"] == "44100"
):
need_reformat = 0
pre_fun._path_audio_(
inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3
)
done = 1
except:
need_reformat = 1
traceback.print_exc()
if need_reformat == 1:
tmp_path = "%s/%s.reformatted.wav" % (
os.path.join(os.environ["TEMP"]),
os.path.basename(inp_path),
)
                os.system(
                    'ffmpeg -i "%s" -vn -acodec pcm_s16le -ac 2 -ar 44100 -y "%s"'
                    % (inp_path, tmp_path)
                )
inp_path = tmp_path
try:
if done == 0:
pre_fun._path_audio_(
inp_path, save_root_ins, save_root_vocal, format0
)
infos.append("%s->Success" % (os.path.basename(inp_path)))
yield "\n".join(infos)
except:
try:
if done == 0:
pre_fun._path_audio_(
inp_path, save_root_ins, save_root_vocal, format0
)
infos.append("%s->Success" % (os.path.basename(inp_path)))
yield "\n".join(infos)
except:
infos.append(
"%s->%s" % (os.path.basename(inp_path), traceback.format_exc())
)
yield "\n".join(infos)
except:
infos.append(traceback.format_exc())
yield "\n".join(infos)
finally:
try:
if model_name == "onnx_dereverb_By_FoxJoy":
del pre_fun.pred.model
del pre_fun.pred.model_
else:
del pre_fun.model
del pre_fun
except:
traceback.print_exc()
if torch.cuda.is_available():
torch.cuda.empty_cache()
logger.info("Executed torch.cuda.empty_cache()")
yield "\n".join(infos)
import os
import logging
logger = logging.getLogger(__name__)
import librosa
import numpy as np
import soundfile as sf
import torch
from infer.lib.uvr5_pack.lib_v5 import nets_61968KB as Nets
from infer.lib.uvr5_pack.lib_v5 import spec_utils
from infer.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters
from infer.lib.uvr5_pack.lib_v5.nets_new import CascadedNet
from infer.lib.uvr5_pack.utils import inference
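# VR-architecture separators (CascadedASPPNet / CascadedNet). The input is
# split into the bands described by the model-parameter JSON, each band is
# resampled and turned into a spectrogram, the bands are combined, and the
# network predicts the instrumental spectrogram; vocals are recovered as the
# spectral residual v_spec_m = X_spec_m - y_spec_m.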
class AudioPre:
def __init__(self, agg, model_path, device, is_half, tta=False):
self.model_path = model_path
self.device = device
self.data = {
# Processing Options
"postprocess": False,
"tta": tta,
# Constants
"window_size": 512,
"agg": agg,
"high_end_process": "mirroring",
}
mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json")
model = Nets.CascadedASPPNet(mp.param["bins"] * 2)
cpk = torch.load(model_path, map_location="cpu")
model.load_state_dict(cpk)
model.eval()
if is_half:
model = model.half().to(device)
else:
model = model.to(device)
self.mp = mp
self.model = model
def _path_audio_(
self, music_file, ins_root=None, vocal_root=None, format="flac", is_hp3=False
):
if ins_root is None and vocal_root is None:
return "No save root."
name = os.path.basename(music_file)
if ins_root is not None:
os.makedirs(ins_root, exist_ok=True)
if vocal_root is not None:
os.makedirs(vocal_root, exist_ok=True)
X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
bands_n = len(self.mp.param["band"])
# print(bands_n)
for d in range(bands_n, 0, -1):
bp = self.mp.param["band"][d]
if d == bands_n: # high-end band
(
X_wave[d],
_,
                ) = librosa.load(  # librosa may mis-read some audio; ffmpeg would be the safer reader but was judged too much hassle
music_file,
sr=bp["sr"],
mono=False,
dtype=np.float32,
res_type=bp["res_type"],
)
if X_wave[d].ndim == 1:
X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
else: # lower bands
X_wave[d] = librosa.resample(
X_wave[d + 1],
orig_sr=self.mp.param["band"][d + 1]["sr"],
target_sr=bp["sr"],
res_type=bp["res_type"],
)
# Stft of wave source
X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
X_wave[d],
bp["hl"],
bp["n_fft"],
self.mp.param["mid_side"],
self.mp.param["mid_side_b2"],
self.mp.param["reverse"],
)
# pdb.set_trace()
if d == bands_n and self.data["high_end_process"] != "none":
input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
)
input_high_end = X_spec_s[d][
:, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
]
X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
aggresive_set = float(self.data["agg"] / 100)
aggressiveness = {
"value": aggresive_set,
"split_bin": self.mp.param["band"][1]["crop_stop"],
}
with torch.no_grad():
pred, X_mag, X_phase = inference(
X_spec_m, self.device, self.model, aggressiveness, self.data
)
# Postprocess
if self.data["postprocess"]:
pred_inv = np.clip(X_mag - pred, 0, np.inf)
pred = spec_utils.mask_silence(pred, pred_inv)
y_spec_m = pred * X_phase
v_spec_m = X_spec_m - y_spec_m
if ins_root is not None:
if self.data["high_end_process"].startswith("mirroring"):
input_high_end_ = spec_utils.mirroring(
self.data["high_end_process"], y_spec_m, input_high_end, self.mp
)
wav_instrument = spec_utils.cmb_spectrogram_to_wave(
y_spec_m, self.mp, input_high_end_h, input_high_end_
)
else:
wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
logger.info("%s instruments done" % name)
            if is_hp3:
head = "vocal_"
else:
head = "instrument_"
if format in ["wav", "flac"]:
sf.write(
os.path.join(
ins_root,
head + "{}_{}.{}".format(name, self.data["agg"], format),
),
(np.array(wav_instrument) * 32768).astype("int16"),
self.mp.param["sr"],
) #
else:
path = os.path.join(
ins_root, head + "{}_{}.wav".format(name, self.data["agg"])
)
sf.write(
path,
(np.array(wav_instrument) * 32768).astype("int16"),
self.mp.param["sr"],
)
if os.path.exists(path):
opt_format_path = path[:-4] + ".%s" % format
os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
if os.path.exists(opt_format_path):
try:
os.remove(path)
except:
pass
if vocal_root is not None:
            if is_hp3:
head = "instrument_"
else:
head = "vocal_"
if self.data["high_end_process"].startswith("mirroring"):
input_high_end_ = spec_utils.mirroring(
self.data["high_end_process"], v_spec_m, input_high_end, self.mp
)
wav_vocals = spec_utils.cmb_spectrogram_to_wave(
v_spec_m, self.mp, input_high_end_h, input_high_end_
)
else:
wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
logger.info("%s vocals done" % name)
if format in ["wav", "flac"]:
sf.write(
os.path.join(
vocal_root,
head + "{}_{}.{}".format(name, self.data["agg"], format),
),
(np.array(wav_vocals) * 32768).astype("int16"),
self.mp.param["sr"],
)
else:
path = os.path.join(
vocal_root, head + "{}_{}.wav".format(name, self.data["agg"])
)
sf.write(
path,
(np.array(wav_vocals) * 32768).astype("int16"),
self.mp.param["sr"],
)
if os.path.exists(path):
opt_format_path = path[:-4] + ".%s" % format
os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
if os.path.exists(opt_format_path):
try:
os.remove(path)
except:
pass
class AudioPreDeEcho:
def __init__(self, agg, model_path, device, is_half, tta=False):
self.model_path = model_path
self.device = device
self.data = {
# Processing Options
"postprocess": False,
"tta": tta,
# Constants
"window_size": 512,
"agg": agg,
"high_end_process": "mirroring",
}
mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json")
nout = 64 if "DeReverb" in model_path else 48
model = CascadedNet(mp.param["bins"] * 2, nout)
cpk = torch.load(model_path, map_location="cpu")
model.load_state_dict(cpk)
model.eval()
if is_half:
model = model.half().to(device)
else:
model = model.to(device)
self.mp = mp
self.model = model
def _path_audio_(
self, music_file, vocal_root=None, ins_root=None, format="flac", is_hp3=False
    ):  # note: for the three VR models, the vocal and instrument outputs are swapped
if ins_root is None and vocal_root is None:
return "No save root."
name = os.path.basename(music_file)
if ins_root is not None:
os.makedirs(ins_root, exist_ok=True)
if vocal_root is not None:
os.makedirs(vocal_root, exist_ok=True)
X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
bands_n = len(self.mp.param["band"])
# print(bands_n)
for d in range(bands_n, 0, -1):
bp = self.mp.param["band"][d]
if d == bands_n: # high-end band
(
X_wave[d],
_,
                ) = librosa.load(  # librosa may mis-read some audio; ffmpeg would be the safer reader but was judged too much hassle
music_file,
sr=bp["sr"],
mono=False,
dtype=np.float32,
res_type=bp["res_type"],
)
if X_wave[d].ndim == 1:
X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
else: # lower bands
X_wave[d] = librosa.resample(
X_wave[d + 1],
orig_sr=self.mp.param["band"][d + 1]["sr"],
target_sr=bp["sr"],
res_type=bp["res_type"],
)
# Stft of wave source
X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
X_wave[d],
bp["hl"],
bp["n_fft"],
self.mp.param["mid_side"],
self.mp.param["mid_side_b2"],
self.mp.param["reverse"],
)
# pdb.set_trace()
if d == bands_n and self.data["high_end_process"] != "none":
input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
)
input_high_end = X_spec_s[d][
:, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
]
X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
        aggressive_set = float(self.data["agg"] / 100)
        aggressiveness = {
            "value": aggressive_set,
"split_bin": self.mp.param["band"][1]["crop_stop"],
}
with torch.no_grad():
pred, X_mag, X_phase = inference(
X_spec_m, self.device, self.model, aggressiveness, self.data
)
# Postprocess
if self.data["postprocess"]:
pred_inv = np.clip(X_mag - pred, 0, np.inf)
pred = spec_utils.mask_silence(pred, pred_inv)
y_spec_m = pred * X_phase
v_spec_m = X_spec_m - y_spec_m
if ins_root is not None:
if self.data["high_end_process"].startswith("mirroring"):
input_high_end_ = spec_utils.mirroring(
self.data["high_end_process"], y_spec_m, input_high_end, self.mp
)
wav_instrument = spec_utils.cmb_spectrogram_to_wave(
y_spec_m, self.mp, input_high_end_h, input_high_end_
)
else:
wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
logger.info("%s instruments done" % name)
if format in ["wav", "flac"]:
sf.write(
os.path.join(
ins_root,
"vocal_{}_{}.{}".format(name, self.data["agg"], format),
),
(np.array(wav_instrument) * 32768).astype("int16"),
self.mp.param["sr"],
) #
else:
path = os.path.join(
ins_root, "vocal_{}_{}.wav".format(name, self.data["agg"])
)
sf.write(
path,
(np.array(wav_instrument) * 32768).astype("int16"),
self.mp.param["sr"],
)
if os.path.exists(path):
opt_format_path = path[:-4] + ".%s" % format
os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
if os.path.exists(opt_format_path):
try:
os.remove(path)
except:
pass
if vocal_root is not None:
if self.data["high_end_process"].startswith("mirroring"):
input_high_end_ = spec_utils.mirroring(
self.data["high_end_process"], v_spec_m, input_high_end, self.mp
)
wav_vocals = spec_utils.cmb_spectrogram_to_wave(
v_spec_m, self.mp, input_high_end_h, input_high_end_
)
else:
wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
logger.info("%s vocals done" % name)
if format in ["wav", "flac"]:
sf.write(
os.path.join(
vocal_root,
"instrument_{}_{}.{}".format(name, self.data["agg"], format),
),
(np.array(wav_vocals) * 32768).astype("int16"),
self.mp.param["sr"],
)
else:
path = os.path.join(
vocal_root, "instrument_{}_{}.wav".format(name, self.data["agg"])
)
sf.write(
path,
(np.array(wav_vocals) * 32768).astype("int16"),
self.mp.param["sr"],
)
if os.path.exists(path):
opt_format_path = path[:-4] + ".%s" % format
os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path))
if os.path.exists(opt_format_path):
try:
os.remove(path)
except:
pass
import traceback
import logging
logger = logging.getLogger(__name__)
import numpy as np
import soundfile as sf
import torch
from io import BytesIO
from infer.lib.audio import load_audio, wav2
from infer.lib.infer_pack.models import (
SynthesizerTrnMs256NSFsid,
SynthesizerTrnMs256NSFsid_nono,
SynthesizerTrnMs768NSFsid,
SynthesizerTrnMs768NSFsid_nono,
)
from infer.modules.vc.pipeline import Pipeline
from infer.modules.vc.utils import *
class VC:
def __init__(self, config):
self.n_spk = None
self.tgt_sr = None
self.net_g = None
self.pipeline = None
self.cpt = None
        self.version = None
        self.if_f0 = None
self.hubert_model = None
self.config = config
def get_vc(self, sid, *to_return_protect):
logger.info("Get sid: " + sid)
to_return_protect0 = {
"visible": self.if_f0 != 0,
"value": (
to_return_protect[0] if self.if_f0 != 0 and to_return_protect else 0.5
),
"__type__": "update",
}
to_return_protect1 = {
"visible": self.if_f0 != 0,
"value": (
to_return_protect[1] if self.if_f0 != 0 and to_return_protect else 0.33
),
"__type__": "update",
}
if sid == "" or sid == []:
            if (
                self.hubert_model is not None
            ):  # with polling, sid can switch from a loaded model to none, so check for that case
                logger.info("Clean model cache")
                del (self.net_g, self.n_spk, self.hubert_model, self.tgt_sr)  # ,cpt
                self.hubert_model = self.net_g = self.n_spk = self.tgt_sr = None
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                ### the reload below is needed, otherwise the memory is not fully released
self.if_f0 = self.cpt.get("f0", 1)
self.version = self.cpt.get("version", "v1")
if self.version == "v1":
if self.if_f0 == 1:
self.net_g = SynthesizerTrnMs256NSFsid(
*self.cpt["config"], is_half=self.config.is_half
)
else:
self.net_g = SynthesizerTrnMs256NSFsid_nono(*self.cpt["config"])
elif self.version == "v2":
if self.if_f0 == 1:
self.net_g = SynthesizerTrnMs768NSFsid(
*self.cpt["config"], is_half=self.config.is_half
)
else:
self.net_g = SynthesizerTrnMs768NSFsid_nono(*self.cpt["config"])
del self.net_g, self.cpt
if torch.cuda.is_available():
torch.cuda.empty_cache()
return (
{"visible": False, "__type__": "update"},
{
"visible": True,
"value": to_return_protect0,
"__type__": "update",
},
{
"visible": True,
"value": to_return_protect1,
"__type__": "update",
},
"",
"",
)
person = f'{os.getenv("weight_root")}/{sid}'
logger.info(f"Loading: {person}")
self.cpt = torch.load(person, map_location="cpu")
self.tgt_sr = self.cpt["config"][-1]
self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0] # n_spk
self.if_f0 = self.cpt.get("f0", 1)
self.version = self.cpt.get("version", "v1")
synthesizer_class = {
("v1", 1): SynthesizerTrnMs256NSFsid,
("v1", 0): SynthesizerTrnMs256NSFsid_nono,
("v2", 1): SynthesizerTrnMs768NSFsid,
("v2", 0): SynthesizerTrnMs768NSFsid_nono,
}
self.net_g = synthesizer_class.get(
(self.version, self.if_f0), SynthesizerTrnMs256NSFsid
)(*self.cpt["config"], is_half=self.config.is_half)
del self.net_g.enc_q
self.net_g.load_state_dict(self.cpt["weight"], strict=False)
self.net_g.eval().to(self.config.device)
if self.config.is_half:
self.net_g = self.net_g.half()
else:
self.net_g = self.net_g.float()
self.pipeline = Pipeline(self.tgt_sr, self.config)
n_spk = self.cpt["config"][-3]
index = {"value": get_index_path_from_model(sid), "__type__": "update"}
logger.info("Select index: " + index["value"])
return (
(
{"visible": True, "maximum": n_spk, "__type__": "update"},
to_return_protect0,
to_return_protect1,
index,
index,
)
if to_return_protect
else {"visible": True, "maximum": n_spk, "__type__": "update"}
)
def vc_single(
self,
sid,
input_audio_path,
f0_up_key,
f0_file,
f0_method,
file_index,
file_index2,
index_rate,
filter_radius,
resample_sr,
rms_mix_rate,
protect,
):
if input_audio_path is None:
return "You need to upload an audio", None
f0_up_key = int(f0_up_key)
try:
audio = load_audio(input_audio_path, 16000)
audio_max = np.abs(audio).max() / 0.95
if audio_max > 1:
audio /= audio_max
times = [0, 0, 0]
if self.hubert_model is None:
self.hubert_model = load_hubert(self.config)
if file_index:
file_index = (
file_index.strip(" ")
.strip('"')
.strip("\n")
.strip('"')
.strip(" ")
.replace("trained", "added")
)
elif file_index2:
file_index = file_index2
else:
file_index = "" # 防止小白写错,自动帮他替换掉
audio_opt = self.pipeline.pipeline(
self.hubert_model,
self.net_g,
sid,
audio,
input_audio_path,
times,
f0_up_key,
f0_method,
file_index,
index_rate,
self.if_f0,
filter_radius,
self.tgt_sr,
resample_sr,
rms_mix_rate,
self.version,
protect,
f0_file,
)
            if resample_sr >= 16000 and self.tgt_sr != resample_sr:
                tgt_sr = resample_sr
else:
tgt_sr = self.tgt_sr
index_info = (
"Index:\n%s." % file_index
if os.path.exists(file_index)
else "Index not used."
)
return (
"Success.\n%s\nTime:\nnpy: %.2fs, f0: %.2fs, infer: %.2fs."
% (index_info, *times),
(tgt_sr, audio_opt),
)
except:
info = traceback.format_exc()
logger.warning(info)
return info, (None, None)
def vc_multi(
self,
sid,
dir_path,
opt_root,
paths,
f0_up_key,
f0_method,
file_index,
file_index2,
index_rate,
filter_radius,
resample_sr,
rms_mix_rate,
protect,
format1,
):
try:
            dir_path = (
                dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
            )  # guard against users pasting paths with surrounding spaces, quotes, or newlines
opt_root = opt_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
os.makedirs(opt_root, exist_ok=True)
try:
if dir_path != "":
paths = [
os.path.join(dir_path, name) for name in os.listdir(dir_path)
]
else:
paths = [path.name for path in paths]
except:
traceback.print_exc()
paths = [path.name for path in paths]
infos = []
for path in paths:
info, opt = self.vc_single(
sid,
path,
f0_up_key,
None,
f0_method,
file_index,
file_index2,
# file_big_npy,
index_rate,
filter_radius,
resample_sr,
rms_mix_rate,
protect,
)
if "Success" in info:
try:
tgt_sr, audio_opt = opt
if format1 in ["wav", "flac"]:
sf.write(
"%s/%s.%s"
% (opt_root, os.path.basename(path), format1),
audio_opt,
tgt_sr,
)
else:
path = "%s/%s.%s" % (
opt_root,
os.path.basename(path),
format1,
)
with BytesIO() as wavf:
sf.write(wavf, audio_opt, tgt_sr, format="wav")
wavf.seek(0, 0)
with open(path, "wb") as outf:
wav2(wavf, outf, format1)
except:
info += traceback.format_exc()
infos.append("%s->%s" % (os.path.basename(path), info))
yield "\n".join(infos)
yield "\n".join(infos)
except:
yield traceback.format_exc()
import os
import sys
import traceback
import logging
logger = logging.getLogger(__name__)
from functools import lru_cache
from time import time as ttime
import faiss
import librosa
import numpy as np
import parselmouth
import pyworld
import torch
import torch.nn.functional as F
import torchcrepe
from scipy import signal
now_dir = os.getcwd()
sys.path.append(now_dir)
bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
input_audio_path2wav = {}
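# functools.lru_cache cannot hash numpy arrays, so the waveform is passed
# out-of-band via input_audio_path2wav and the cache key is built from the
# remaining hashable arguments (path, sample rate, f0 bounds, frame period).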
@lru_cache
def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
audio = input_audio_path2wav[input_audio_path]
f0, t = pyworld.harvest(
audio,
fs=fs,
f0_ceil=f0max,
f0_floor=f0min,
frame_period=frame_period,
)
f0 = pyworld.stonemask(audio, f0, t, fs)
return f0
def change_rms(data1, sr1, data2, sr2, rate):  # 1 is the input audio, 2 the output audio; rate is the weight given to 2
    # print(data1.max(),data2.max())
    rms1 = librosa.feature.rms(
        y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2
    )  # one RMS point every half second
rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
rms1 = torch.from_numpy(rms1)
rms1 = F.interpolate(
rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
).squeeze()
rms2 = torch.from_numpy(rms2)
rms2 = F.interpolate(
rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
).squeeze()
rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
data2 *= (
torch.pow(rms1, torch.tensor(1 - rate))
* torch.pow(rms2, torch.tensor(rate - 1))
).numpy()
return data2
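# Conversion pipeline: high-pass the input (5th-order Butterworth at 48 Hz,
# defined above), find low-energy cut points for long audio, extract f0 with
# the selected method, encode with HuBERT, optionally blend the features with
# faiss-retrieved training features, synthesize with net_g, then apply RMS
# envelope mixing and optional resampling.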
class Pipeline(object):
def __init__(self, tgt_sr, config):
self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
config.x_pad,
config.x_query,
config.x_center,
config.x_max,
config.is_half,
)
        self.sr = 16000  # hubert input sample rate
        self.window = 160  # samples per frame
        self.t_pad = self.sr * self.x_pad  # padding added before/after each segment
        self.t_pad_tgt = tgt_sr * self.x_pad
        self.t_pad2 = self.t_pad * 2
        self.t_query = self.sr * self.x_query  # search radius around each cut point
        self.t_center = self.sr * self.x_center  # spacing between candidate cut points
        self.t_max = self.sr * self.x_max  # skip the cut-point search below this duration
self.device = config.device
def get_f0(
self,
input_audio_path,
x,
p_len,
f0_up_key,
f0_method,
filter_radius,
inp_f0=None,
):
global input_audio_path2wav
time_step = self.window / self.sr * 1000
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
if f0_method == "pm":
f0 = (
parselmouth.Sound(x, self.sr)
.to_pitch_ac(
time_step=time_step / 1000,
voicing_threshold=0.6,
pitch_floor=f0_min,
pitch_ceiling=f0_max,
)
.selected_array["frequency"]
)
pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
f0 = np.pad(
f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
)
elif f0_method == "harvest":
input_audio_path2wav[input_audio_path] = x.astype(np.double)
f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
if filter_radius > 2:
f0 = signal.medfilt(f0, 3)
elif f0_method == "crepe":
model = "full"
# Pick a batch size that doesn't cause memory errors on your gpu
batch_size = 512
            # Compute pitch on the configured device
audio = torch.tensor(np.copy(x))[None].float()
f0, pd = torchcrepe.predict(
audio,
self.sr,
self.window,
f0_min,
f0_max,
model,
batch_size=batch_size,
device=self.device,
return_periodicity=True,
)
pd = torchcrepe.filter.median(pd, 3)
f0 = torchcrepe.filter.mean(f0, 3)
f0[pd < 0.1] = 0
f0 = f0[0].cpu().numpy()
elif f0_method == "rmvpe":
if not hasattr(self, "model_rmvpe"):
from infer.lib.rmvpe import RMVPE
                logger.info(
                    "Loading rmvpe model %s/rmvpe.pt" % os.environ["rmvpe_root"]
                )
self.model_rmvpe = RMVPE(
"%s/rmvpe.pt" % os.environ["rmvpe_root"],
is_half=self.is_half,
device=self.device,
)
f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
if "privateuseone" in str(self.device): # clean ortruntime memory
del self.model_rmvpe.model
del self.model_rmvpe
logger.info("Cleaning ortruntime memory")
f0 *= pow(2, f0_up_key / 12)
# with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
        tf0 = self.sr // self.window  # number of f0 points per second
if inp_f0 is not None:
delta_t = np.round(
(inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
).astype("int16")
replace_f0 = np.interp(
list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
)
shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
:shape
]
# with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
        f0bak = f0.copy()
        # coarse quantization: mel = 1127 * ln(1 + f0 / 700), then scale
        # [f0_mel_min, f0_mel_max] onto the 255 bins [1, 255]; bin 1 means unvoiced
        f0_mel = 1127 * np.log(1 + f0 / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
            f0_mel_max - f0_mel_min
        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        f0_coarse = np.rint(f0_mel).astype(np.int32)
return f0_coarse, f0bak # 1-0
def vc(
self,
model,
net_g,
sid,
audio0,
pitch,
pitchf,
times,
index,
big_npy,
index_rate,
version,
protect,
): # ,file_index,file_big_npy
feats = torch.from_numpy(audio0)
if self.is_half:
feats = feats.half()
else:
feats = feats.float()
if feats.dim() == 2: # double channels
feats = feats.mean(-1)
assert feats.dim() == 1, feats.dim()
feats = feats.view(1, -1)
padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
inputs = {
"source": feats.to(self.device),
"padding_mask": padding_mask,
"output_layer": 9 if version == "v1" else 12,
}
t0 = ttime()
with torch.no_grad():
logits = model.extract_features(**inputs)
feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
if protect < 0.5 and pitch is not None and pitchf is not None:
feats0 = feats.clone()
if (
not isinstance(index, type(None))
and not isinstance(big_npy, type(None))
and index_rate != 0
):
npy = feats[0].cpu().numpy()
if self.is_half:
npy = npy.astype("float32")
# _, I = index.search(npy, 1)
# npy = big_npy[I.squeeze()]
            # retrieve the 8 nearest training features and blend them,
            # weighted by inverse squared distance
            score, ix = index.search(npy, k=8)
            weight = np.square(1 / score)
            weight /= weight.sum(axis=1, keepdims=True)
            npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
if self.is_half:
npy = npy.astype("float16")
feats = (
torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+ (1 - index_rate) * feats
)
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
if protect < 0.5 and pitch is not None and pitchf is not None:
feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
0, 2, 1
)
t1 = ttime()
p_len = audio0.shape[0] // self.window
if feats.shape[1] < p_len:
p_len = feats.shape[1]
if pitch is not None and pitchf is not None:
pitch = pitch[:, :p_len]
pitchf = pitchf[:, :p_len]
if protect < 0.5 and pitch is not None and pitchf is not None:
pitchff = pitchf.clone()
pitchff[pitchf > 0] = 1
pitchff[pitchf < 1] = protect
pitchff = pitchff.unsqueeze(-1)
feats = feats * pitchff + feats0 * (1 - pitchff)
feats = feats.to(feats0.dtype)
p_len = torch.tensor([p_len], device=self.device).long()
with torch.no_grad():
hasp = pitch is not None and pitchf is not None
arg = (feats, p_len, pitch, pitchf, sid) if hasp else (feats, p_len, sid)
audio1 = (net_g.infer(*arg)[0][0, 0]).data.cpu().float().numpy()
del hasp, arg
del feats, p_len, padding_mask
if torch.cuda.is_available():
torch.cuda.empty_cache()
t2 = ttime()
times[0] += t1 - t0
times[2] += t2 - t1
return audio1
def pipeline(
self,
model,
net_g,
sid,
audio,
input_audio_path,
times,
f0_up_key,
f0_method,
file_index,
index_rate,
if_f0,
filter_radius,
tgt_sr,
resample_sr,
rms_mix_rate,
version,
protect,
f0_file=None,
):
if (
file_index != ""
# and file_big_npy != ""
# and os.path.exists(file_big_npy) == True
and os.path.exists(file_index)
and index_rate != 0
):
try:
index = faiss.read_index(file_index)
# big_npy = np.load(file_big_npy)
big_npy = index.reconstruct_n(0, index.ntotal)
except:
traceback.print_exc()
index = big_npy = None
else:
index = big_npy = None
audio = signal.filtfilt(bh, ah, audio)
audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
opt_ts = []
if audio_pad.shape[0] > self.t_max:
audio_sum = np.zeros_like(audio)
for i in range(self.window):
audio_sum += np.abs(audio_pad[i : i - self.window])
for t in range(self.t_center, audio.shape[0], self.t_center):
opt_ts.append(
t
- self.t_query
+ np.where(
audio_sum[t - self.t_query : t + self.t_query]
== audio_sum[t - self.t_query : t + self.t_query].min()
)[0][0]
)
s = 0
audio_opt = []
t = None
t1 = ttime()
audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
p_len = audio_pad.shape[0] // self.window
inp_f0 = None
if hasattr(f0_file, "name"):
try:
with open(f0_file.name, "r") as f:
lines = f.read().strip("\n").split("\n")
inp_f0 = []
for line in lines:
inp_f0.append([float(i) for i in line.split(",")])
inp_f0 = np.array(inp_f0, dtype="float32")
except:
traceback.print_exc()
sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
pitch, pitchf = None, None
if if_f0 == 1:
pitch, pitchf = self.get_f0(
input_audio_path,
audio_pad,
p_len,
f0_up_key,
f0_method,
filter_radius,
inp_f0,
)
pitch = pitch[:p_len]
pitchf = pitchf[:p_len]
if "mps" not in str(self.device) or "xpu" not in str(self.device):
pitchf = pitchf.astype(np.float32)
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
t2 = ttime()
times[1] += t2 - t1
for t in opt_ts:
t = t // self.window * self.window
if if_f0 == 1:
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[s : t + self.t_pad2 + self.window],
pitch[:, s // self.window : (t + self.t_pad2) // self.window],
pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
times,
index,
big_npy,
index_rate,
version,
protect,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
else:
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[s : t + self.t_pad2 + self.window],
None,
None,
times,
index,
big_npy,
index_rate,
version,
protect,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
s = t
if if_f0 == 1:
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[t:],
pitch[:, t // self.window :] if t is not None else pitch,
pitchf[:, t // self.window :] if t is not None else pitchf,
times,
index,
big_npy,
index_rate,
version,
protect,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
else:
audio_opt.append(
self.vc(
model,
net_g,
sid,
audio_pad[t:],
None,
None,
times,
index,
big_npy,
index_rate,
version,
protect,
)[self.t_pad_tgt : -self.t_pad_tgt]
)
audio_opt = np.concatenate(audio_opt)
if rms_mix_rate != 1:
audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
        if resample_sr >= 16000 and tgt_sr != resample_sr:
audio_opt = librosa.resample(
audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
)
        audio_max = np.abs(audio_opt).max() / 0.99
        max_int16 = 32768
        if audio_max > 1:
            max_int16 /= audio_max  # scale down so the peak fits in int16 without clipping
        audio_opt = (audio_opt * max_int16).astype(np.int16)
del pitch, pitchf, sid
if torch.cuda.is_available():
torch.cuda.empty_cache()
return audio_opt
import os
from fairseq import checkpoint_utils
def get_index_path_from_model(sid):
return next(
(
f
for f in [
os.path.join(root, name)
for root, _, files in os.walk(os.getenv("index_root"), topdown=False)
for name in files
if name.endswith(".index") and "trained" not in name
]
if sid.split(".")[0] in f
),
"",
)
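# Loads the HuBERT content encoder shared by all voices; the checkpoint path
# assets/hubert/hubert_base.pt is fixed by convention.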
def load_hubert(config):
models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
["assets/hubert/hubert_base.pt"],
suffix="",
)
hubert_model = models[0]
hubert_model = hubert_model.to(config.device)
if config.is_half:
hubert_model = hubert_model.half()
else:
hubert_model = hubert_model.float()
return hubert_model.eval()
# model code
modelCode=820
# model name
modelName=retrieval-based-voice-conversion-webui_pytorch
# model description
modelDescription=An easy-to-use voice conversion framework based on VITS that achieves good results even when trained on small amounts of data, convenient for live-streaming entertainment.
# application scenarios
appScenario=training, inference, speech synthesis, live streaming, film/TV, e-commerce
# framework type
frameType=pytorch
[tool.poetry]
name = "rvc-beta"
version = "0.1.0"
description = ""
authors = ["lj1995"]
license = "MIT"
[tool.poetry.dependencies]
python = "^3.8"
torch = "^2.0.0"
torchaudio = "^2.0.1"
Cython = "^0.29.34"
gradio = "^4.11.0"
future = "^0.18.3"
pydub = "^0.25.1"
soundfile = "^0.12.1"
ffmpeg-python = "^0.2.0"
tensorboardX = "^2.6"
functorch = "^2.0.0"
fairseq = "^0.12.2"
faiss-cpu = "^1.7.2"
Jinja2 = "^3.1.2"
json5 = "^0.9.11"
librosa = "0.9.1"
llvmlite = "0.39.0"
Markdown = "^3.4.3"
matplotlib = "^3.7.1"
matplotlib-inline = "^0.1.6"
numba = "0.56.4"
numpy = "1.23.5"
scipy = "1.9.3"
praat-parselmouth = "^0.4.3"
Pillow = "9.3.0"
pyworld = "^0.3.2"
resampy = "^0.4.2"
scikit-learn = "^1.2.2"
starlette = "^0.27.0"
tensorboard = "^2.12.1"
tensorboard-data-server = "^0.7.0"
tensorboard-plugin-wit = "^1.8.1"
torchgen = "^0.0.1"
tqdm = "^4.65.0"
tornado = "^6.3"
Werkzeug = "^2.2.3"
uc-micro-py = "^1.0.1"
sympy = "^1.11.1"
tabulate = "^0.9.0"
PyYAML = "^6.0"
pyasn1 = "^0.4.8"
pyasn1-modules = "^0.2.8"
fsspec = "^2023.3.0"
absl-py = "^1.4.0"
audioread = "^3.0.0"
uvicorn = "^0.21.1"
colorama = "^0.4.6"
torchcrepe = "0.0.20"
python-dotenv = "^1.0.0"
av = "^10.0.0"
[tool.poetry.dev-dependencies]
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
tensorflow-rocm
joblib>=1.1.0
numba==0.56.4
numpy==1.23.5
scipy
librosa==0.10.2
llvmlite==0.39.0
fairseq==0.12.2
faiss-cpu==1.7.3
gradio==3.34.0
Cython
pydub>=0.25.1
soundfile>=0.12.1
ffmpeg-python>=0.2.0
tensorboardX
Jinja2>=3.1.2
json5
Markdown
matplotlib>=3.7.0
matplotlib-inline>=0.1.3
praat-parselmouth>=0.4.2
Pillow>=9.1.1
resampy>=0.4.2
scikit-learn
tensorboard
tqdm>=4.63.1
tornado>=6.1
Werkzeug>=2.2.3
uc-micro-py>=1.0.1
sympy>=1.11.1
tabulate>=0.8.10
PyYAML>=6.0
pyasn1>=0.4.8
pyasn1-modules>=0.2.8
fsspec>=2022.11.0
absl-py>=1.2.0
audioread
uvicorn>=0.21.1
colorama>=0.4.5
pyworld==0.3.2
httpx
onnxruntime
onnxruntime-gpu
torchcrepe==0.0.23
fastapi==0.88
ffmpy==0.3.1
python-dotenv>=1.0.0
av
torchfcpe
joblib>=1.1.0
numba==0.56.4
numpy==1.23.5
scipy
librosa==0.10.2
llvmlite==0.39.0
fairseq==0.12.2
faiss-cpu==1.7.3
gradio==3.34.0
Cython
pydub>=0.25.1
soundfile>=0.12.1
ffmpeg-python>=0.2.0
tensorboardX
Jinja2>=3.1.2
json5
Markdown
matplotlib>=3.7.0
matplotlib-inline>=0.1.3
praat-parselmouth>=0.4.2
Pillow>=9.1.1
resampy>=0.4.2
scikit-learn
tensorboard
tqdm>=4.63.1
tornado>=6.1
Werkzeug>=2.2.3
uc-micro-py>=1.0.1
sympy>=1.11.1
tabulate>=0.8.10
PyYAML>=6.0
pyasn1>=0.4.8
pyasn1-modules>=0.2.8
fsspec>=2022.11.0
absl-py>=1.2.0
audioread
uvicorn>=0.21.1
colorama>=0.4.5
pyworld==0.3.2
httpx
onnxruntime-directml
torchcrepe==0.0.23
fastapi==0.88
ffmpy==0.3.1
python-dotenv>=1.0.0
av
torchfcpe
torch==2.0.1a0
intel_extension_for_pytorch==2.0.110+xpu
torchvision==0.15.2a0
https://github.com/Disty0/Retrieval-based-Voice-Conversion-WebUI/releases/download/torchaudio_wheels_for_ipex/torchaudio-2.0.2+31de77d-cp310-cp310-linux_x86_64.whl
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
joblib>=1.1.0
numba==0.56.4
numpy==1.23.5
scipy
librosa==0.10.2
llvmlite==0.39.0
fairseq==0.12.2
faiss-cpu==1.7.3
gradio==3.34.0
Cython
pydub>=0.25.1
soundfile>=0.12.1
ffmpeg-python>=0.2.0
tensorboardX
Jinja2>=3.1.2
json5
Markdown
matplotlib>=3.7.0
matplotlib-inline>=0.1.3
praat-parselmouth>=0.4.2
Pillow>=9.1.1
resampy>=0.4.2
scikit-learn
tensorboard
tqdm>=4.63.1
tornado>=6.1
Werkzeug>=2.2.3
uc-micro-py>=1.0.1
sympy>=1.11.1
tabulate>=0.8.10
PyYAML>=6.0
pyasn1>=0.4.8
pyasn1-modules>=0.2.8
fsspec>=2022.11.0
absl-py>=1.2.0
audioread
uvicorn>=0.21.1
colorama>=0.4.5
pyworld==0.3.2
httpx
onnxruntime; sys_platform == 'darwin'
onnxruntime-gpu; sys_platform != 'darwin'
torchcrepe==0.0.23
fastapi==0.88
ffmpy==0.3.1
python-dotenv>=1.0.0
av
PySimpleGUI
sounddevice
torchfcpe
joblib>=1.1.0
numba
numpy
scipy
librosa==0.10.2
llvmlite
fairseq @ git+https://github.com/One-sixth/fairseq.git
faiss-cpu
gradio==3.34.0
Cython
pydub>=0.25.1
soundfile>=0.12.1
ffmpeg-python>=0.2.0
tensorboardX
Jinja2>=3.1.2
json5
Markdown
matplotlib>=3.7.0
matplotlib-inline>=0.1.3
praat-parselmouth>=0.4.2
Pillow>=9.1.1
resampy>=0.4.2
scikit-learn
tensorboard
tqdm>=4.63.1
tornado>=6.1
Werkzeug>=2.2.3
uc-micro-py>=1.0.1
sympy>=1.11.1
tabulate>=0.8.10
PyYAML>=6.0
pyasn1>=0.4.8
pyasn1-modules>=0.2.8
fsspec>=2022.11.0
absl-py>=1.2.0
audioread
uvicorn>=0.21.1
colorama>=0.4.5
pyworld==0.3.2
httpx
onnxruntime; sys_platform == 'darwin'
onnxruntime-gpu; sys_platform != 'darwin'
torchcrepe==0.0.23
fastapi==0.88
torchfcpe
ffmpy==0.3.1
python-dotenv>=1.0.0
av
#1.Install torch from pytorch.org:
#torch 2.0 with cuda 11.8
#pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
#torch 1.11.0 with cuda 11.3
#pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113
einops
fairseq
flask
flask_cors
gin
gin_config
librosa
local_attention
matplotlib
praat-parselmouth
pyworld
PyYAML
resampy
scikit_learn
scipy
SoundFile
tensorboard
tqdm
wave
PySimpleGUI
sounddevice
gradio
noisereduce
onnxruntime-directml
torchfcpe
#1.Install torch from pytorch.org:
#torch 2.0 with cuda 11.8
#pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
#torch 1.11.0 with cuda 11.3
#pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113
einops
fairseq
flask
flask_cors
gin
gin_config
librosa
local_attention
matplotlib
praat-parselmouth
pyworld
PyYAML
resampy
scikit_learn
scipy
SoundFile
tensorboard
tqdm
wave
PySimpleGUI
sounddevice
gradio
noisereduce
torchfcpe
joblib>=1.1.0
numba==0.56.4
numpy==1.23.5
scipy
librosa==0.9.1
llvmlite==0.39.0
fairseq==0.12.2
faiss-cpu==1.7.3
gradio==3.34.0
Cython
pydub>=0.25.1
soundfile>=0.12.1
#ffmpeg-python>=0.2.0
tensorboardX
Jinja2>=3.1.2
json5
Markdown
matplotlib>=3.7.0
matplotlib-inline>=0.1.3
praat-parselmouth>=0.4.2
Pillow>=9.1.1
resampy>=0.4.2
scikit-learn
tensorboard
tqdm>=4.63.1
tornado>=6.1
Werkzeug>=2.2.3
uc-micro-py>=1.0.1
sympy>=1.11.1
tabulate>=0.8.10
PyYAML>=6.0
pyasn1>=0.4.8
pyasn1-modules>=0.2.8
fsspec>=2022.11.0
absl-py>=1.2.0
audioread
uvicorn>=0.21.1
colorama>=0.4.5
pyworld==0.3.2
httpx
#onnxruntime; sys_platform == 'darwin'
#onnxruntime-gpu; sys_platform != 'darwin'
torchcrepe==0.0.20
fastapi==0.88
torchfcpe
ffmpy==0.3.1
python-dotenv>=1.0.0
av
#!/bin/sh
if [ "$(uname)" = "Darwin" ]; then
# macOS specific env:
export PYTORCH_ENABLE_MPS_FALLBACK=1
export PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0
elif [ "$(uname)" != "Linux" ]; then
echo "Unsupported operating system."
exit 1
fi
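# Reuse the local virtualenv if it exists; otherwise create one with Python 3.8
# and install any requirements.txt entry that "pip freeze" does not yet report.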
if [ -d ".venv" ]; then
echo "Activate venv..."
. .venv/bin/activate
else
echo "Create venv..."
requirements_file="requirements.txt"
# Check if Python 3.8 is installed
    if ! command -v python3.8 >/dev/null 2>&1 && ! pyenv versions --bare 2>/dev/null | grep -q "3.8"; then
        echo "Python 3.8 not found. Attempting to install..."
if [ "$(uname)" = "Darwin" ] && command -v brew >/dev/null 2>&1; then
brew install python@3.8
elif [ "$(uname)" = "Linux" ] && command -v apt-get >/dev/null 2>&1; then
sudo apt-get update
sudo apt-get install python3.8
else
echo "Please install Python 3.8 manually."
exit 1
fi
fi
python3.8 -m venv .venv
. .venv/bin/activate
# Check if required packages are installed and install them if not
if [ -f "${requirements_file}" ]; then
installed_packages=$(python3.8 -m pip freeze)
while IFS= read -r package; do
expr "${package}" : "^#.*" > /dev/null && continue
package_name=$(echo "${package}" | sed 's/[<>=!].*//')
if ! echo "${installed_packages}" | grep -q "${package_name}"; then
echo "${package_name} not found. Attempting to install..."
python3.8 -m pip install --upgrade "${package}"
fi
done < "${requirements_file}"
else
echo "${requirements_file} not found. Please ensure the requirements file with required packages exists."
exit 1
fi
fi
# Download models
chmod +x tools/dlmodels.sh
./tools/dlmodels.sh
if [ $? -ne 0 ]; then
exit 1
fi
# Run the main script
python3.8 infer-web.py --pycmd python3.8
import logging
import os
# os.system("wget -P cvec/ https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt")
import gradio as gr
from dotenv import load_dotenv
from configs.config import Config
from i18n.i18n import I18nAuto
from infer.modules.vc.modules import VC
logging.getLogger("numba").setLevel(logging.WARNING)
logging.getLogger("markdown_it").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("matplotlib").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
i18n = I18nAuto()
logger.info(i18n)
load_dotenv()
config = Config()
vc = VC(config)
weight_root = os.getenv("weight_root")
weight_uvr5_root = os.getenv("weight_uvr5_root")
index_root = os.getenv("index_root")
names = []
hubert_model = None
for name in os.listdir(weight_root):
if name.endswith(".pth"):
names.append(name)
index_paths = []
for root, dirs, files in os.walk(index_root, topdown=False):
for name in files:
if name.endswith(".index") and "trained" not in name:
index_paths.append("%s/%s" % (root, name))
app = gr.Blocks()
with app:
with gr.Tabs():
with gr.TabItem("在线demo"):
gr.Markdown(
value="""
RVC 在线demo
"""
)
sid = gr.Dropdown(label=i18n("推理音色"), choices=sorted(names))
with gr.Column():
spk_item = gr.Slider(
minimum=0,
maximum=2333,
step=1,
label=i18n("请选择说话人id"),
value=0,
visible=False,
interactive=True,
)
sid.change(fn=vc.get_vc, inputs=[sid], outputs=[spk_item])
gr.Markdown(
value=i18n(
"男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. "
)
)
vc_input3 = gr.Audio(label="上传音频(长度小于90秒)")
vc_transform0 = gr.Number(
label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0
)
f0method0 = gr.Radio(
label=i18n(
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU"
),
choices=["pm", "harvest", "crepe", "rmvpe"],
value="pm",
interactive=True,
)
filter_radius0 = gr.Slider(
minimum=0,
maximum=7,
label=i18n(
">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"
),
value=3,
step=1,
interactive=True,
)
with gr.Column():
file_index1 = gr.Textbox(
label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"),
value="",
interactive=False,
visible=False,
)
file_index2 = gr.Dropdown(
label=i18n("自动检测index路径,下拉式选择(dropdown)"),
choices=sorted(index_paths),
interactive=True,
)
index_rate1 = gr.Slider(
minimum=0,
maximum=1,
label=i18n("检索特征占比"),
value=0.88,
interactive=True,
)
resample_sr0 = gr.Slider(
minimum=0,
maximum=48000,
label=i18n("后处理重采样至最终采样率,0为不进行重采样"),
value=0,
step=1,
interactive=True,
)
rms_mix_rate0 = gr.Slider(
minimum=0,
maximum=1,
label=i18n(
"输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"
),
value=1,
interactive=True,
)
protect0 = gr.Slider(
minimum=0,
maximum=0.5,
label=i18n(
"保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"
),
value=0.33,
step=0.01,
interactive=True,
)
f0_file = gr.File(
label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调")
)
but0 = gr.Button(i18n("转换"), variant="primary")
vc_output1 = gr.Textbox(label=i18n("输出信息"))
vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)"))
but0.click(
vc.vc_single,
[
spk_item,
vc_input3,
vc_transform0,
f0_file,
f0method0,
file_index1,
file_index2,
# file_big_npy1,
index_rate1,
filter_radius0,
resample_sr0,
rms_mix_rate0,
protect0,
],
[vc_output1, vc_output2],
)
app.launch()